{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantage_max": 1.112048827111721, "advantage_mean": -2.2972623692218974e-08, "advantage_min": -1.0226236283779144, "advantage_std": 0.809523094445467, "completion_length": 2571.2083587646484, "epoch": 0.001142857142857143, "grad_norm": 0.14223624765872955, "kl": 0.0, "lambda_div_used": 0.9000000000000001, "learning_rate": 2e-08, "loss": 0.0762, "reward": 0.383966077119112, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.383966077119112, "reward_after_std": 0.8095231093466282, "reward_before_mean": 0.4897647276520729, "reward_before_std": 0.8290339298546314, "reward_change_max": 0.000140361487865448, "reward_change_mean": -0.10579865705221891, "reward_change_min": -0.2073100507259369, "reward_change_std": 0.08411919022910297, "reward_std": 0.8095231391489506, "rewards/cosine_scaled_reward": -0.015534311532974243, "rewards/format_reward": 0.5208333488553762, "step": 1 }, { "advantage_max": 0.5256818607449532, "advantage_mean": -1.7384688744126464e-08, "advantage_min": -0.5523970872163773, "advantage_std": 0.42011943086981773, "completion_length": 2804.395881652832, "epoch": 0.002285714285714286, "grad_norm": 0.0626918151974678, "kl": 0.0, "lambda_div_used": 0.9000000000000001, "learning_rate": 4e-08, "loss": 0.0254, "reward": 0.17750850692391396, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.17750850692391396, "reward_after_std": 0.42011942341923714, "reward_before_mean": 0.27539755403995514, "reward_before_std": 0.42092561535537243, "reward_change_max": 0.0003265589475631714, "reward_change_mean": -0.09788906387984753, "reward_change_min": -0.1594111192971468, "reward_change_std": 0.06503142253495753, "reward_std": 0.42011944204568863, "rewards/cosine_scaled_reward": -0.04980122856795788, "rewards/format_reward": 0.37500000558793545, "step": 2 }, { "advantage_max": 0.855322539806366, "advantage_mean": 3.725290520506519e-09, "advantage_min": -0.6352410092949867, "advantage_std": 0.5779101513326168, "completion_length": 3328.5416717529297, "epoch": 0.0034285714285714284, "grad_norm": 0.09340520948171616, "kl": 4.819035530090332e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 6e-08, "loss": -0.0049, "reward": -0.1236064094118774, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1236064094118774, "reward_after_std": 0.5779101625084877, "reward_before_mean": -0.05990193039178848, "reward_before_std": 0.5883516669273376, "reward_change_max": 0.0, "reward_change_mean": -0.06370447622612119, "reward_change_min": -0.1256130812689662, "reward_change_std": 0.05005356459878385, "reward_std": 0.5779101811349392, "rewards/cosine_scaled_reward": -0.13411763461772352, "rewards/format_reward": 0.2083333358168602, "step": 3 }, { "advantage_max": 1.494198601692915, "advantage_mean": -2.1730860888524717e-08, "advantage_min": -0.9187480844557285, "advantage_std": 0.9113606847822666, "completion_length": 2199.1667098999023, "epoch": 0.004571428571428572, "grad_norm": 0.1944754272699356, "kl": 3.168359398841858e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 8e-08, "loss": 0.0598, "reward": 0.5572676844894886, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5572676844894886, "reward_after_std": 0.9113606996834278, "reward_before_mean": 0.672414306551218, "reward_before_std": 0.9033316560089588, "reward_change_max": 7.709860801696777e-05, "reward_change_mean": -0.11514659691601992, "reward_change_min": -0.20749736204743385, "reward_change_std": 0.07951459102332592, "reward_std": 0.9113607443869114, "rewards/cosine_scaled_reward": -0.017959539778530598, "rewards/format_reward": 0.7083333432674408, "step": 4 }, { "advantage_max": 1.2024615556001663, "advantage_mean": 7.140139812733537e-09, "advantage_min": -0.6320518404245377, "advantage_std": 0.6965500302612782, "completion_length": 3183.3958740234375, "epoch": 0.005714285714285714, "grad_norm": 0.12876158952713013, "kl": 4.059821367263794e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 1e-07, "loss": 0.04, "reward": -0.1184095498174429, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1184095498174429, "reward_after_std": 0.6965500302612782, "reward_before_mean": -0.06059689383255318, "reward_before_std": 0.6977124996483326, "reward_change_max": 0.00010875612497329712, "reward_change_mean": -0.05781265441328287, "reward_change_min": -0.12108294386416674, "reward_change_std": 0.04674133914522827, "reward_std": 0.6965500451624393, "rewards/cosine_scaled_reward": -0.21779845468699932, "rewards/format_reward": 0.37500000558793545, "step": 5 }, { "advantage_max": 1.084119837731123, "advantage_mean": -1.2417634476236117e-08, "advantage_min": -0.7321569994091988, "advantage_std": 0.7003737837076187, "completion_length": 3075.5833892822266, "epoch": 0.006857142857142857, "grad_norm": 0.1684904843568802, "kl": 4.678219556808472e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.2e-07, "loss": 0.0172, "reward": 0.0019676077645272017, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0019676077645272017, "reward_after_std": 0.700373774394393, "reward_before_mean": 0.07271714322268963, "reward_before_std": 0.708720869384706, "reward_change_max": 0.00048591941595077515, "reward_change_mean": -0.07074952917173505, "reward_change_min": -0.14616843592375517, "reward_change_std": 0.06034345901571214, "reward_std": 0.7003737911581993, "rewards/cosine_scaled_reward": -0.13030810561031103, "rewards/format_reward": 0.3333333395421505, "step": 6 }, { "advantage_max": 1.569422885775566, "advantage_mean": -1.2417634698280722e-08, "advantage_min": -1.1949431672692299, "advantage_std": 1.095862664282322, "completion_length": 3141.2084197998047, "epoch": 0.008, "grad_norm": 0.21224753558635712, "kl": 3.159046173095703e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.4e-07, "loss": 0.0914, "reward": 0.26081612706184387, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.26081612706184387, "reward_after_std": 1.0958626568317413, "reward_before_mean": 0.3473575795069337, "reward_before_std": 1.130651280283928, "reward_change_max": 9.550899267196655e-05, "reward_change_mean": -0.08654144563479349, "reward_change_min": -0.18773514311760664, "reward_change_std": 0.08604789082892239, "reward_std": 1.0958627052605152, "rewards/cosine_scaled_reward": -0.045071213971823454, "rewards/format_reward": 0.43750000558793545, "step": 7 }, { "advantage_max": 1.1458450332283974, "advantage_mean": -3.849466767569254e-08, "advantage_min": -0.843430656939745, "advantage_std": 0.7730266377329826, "completion_length": 2811.3333740234375, "epoch": 0.009142857142857144, "grad_norm": 0.13289576768875122, "kl": 1.934915781021118e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.6e-07, "loss": 0.0725, "reward": 0.418302733451128, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.418302733451128, "reward_after_std": 0.7730266749858856, "reward_before_mean": 0.5264986455440521, "reward_before_std": 0.7791682276874781, "reward_change_max": 6.0439109802246094e-05, "reward_change_mean": -0.10819594142958522, "reward_change_min": -0.2019729232415557, "reward_change_std": 0.08076721499674022, "reward_std": 0.7730266973376274, "rewards/cosine_scaled_reward": 0.06533264694735408, "rewards/format_reward": 0.39583333767950535, "step": 8 }, { "advantage_max": 0.9698346555233002, "advantage_mean": 9.002785295031401e-09, "advantage_min": -0.8632857594639063, "advantage_std": 0.7153309807181358, "completion_length": 3139.7709045410156, "epoch": 0.010285714285714285, "grad_norm": 0.1402571052312851, "kl": 4.3511390686035156e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.8e-07, "loss": 0.0534, "reward": 0.06348151830025017, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06348151830025017, "reward_after_std": 0.715330995619297, "reward_before_mean": 0.14234677236527205, "reward_before_std": 0.7345745638012886, "reward_change_max": 0.0003286302089691162, "reward_change_mean": -0.07886525010690093, "reward_change_min": -0.1620482699945569, "reward_change_std": 0.06977574108168483, "reward_std": 0.7153310105204582, "rewards/cosine_scaled_reward": -0.13715994078665972, "rewards/format_reward": 0.416666679084301, "step": 9 }, { "advantage_max": 1.0015601068735123, "advantage_mean": -1.2417630257388623e-09, "advantage_min": -0.5780720449984074, "advantage_std": 0.6456005871295929, "completion_length": 2626.4791831970215, "epoch": 0.011428571428571429, "grad_norm": 0.07943809777498245, "kl": 3.2432377338409424e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 2e-07, "loss": 0.0169, "reward": -0.04100732016377151, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.04100732016377151, "reward_after_std": 0.6456005834043026, "reward_before_mean": 0.02712206542491913, "reward_before_std": 0.6530085504055023, "reward_change_max": 0.00024922192096710205, "reward_change_mean": -0.06812940072268248, "reward_change_min": -0.14841055870056152, "reward_change_std": 0.05704877176322043, "reward_std": 0.6456005908548832, "rewards/cosine_scaled_reward": -0.18435563705861568, "rewards/format_reward": 0.39583333767950535, "step": 10 }, { "advantage_max": 1.211370985955, "advantage_mean": -6.829699028543246e-09, "advantage_min": -0.7788897380232811, "advantage_std": 0.8170069120824337, "completion_length": 3313.9791870117188, "epoch": 0.012571428571428572, "grad_norm": 0.16173361241817474, "kl": 3.5762786865234375e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.1999999999999998e-07, "loss": 0.1018, "reward": -0.009695451706647873, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.009695451706647873, "reward_after_std": 0.8170069074258208, "reward_before_mean": 0.057828957214951515, "reward_before_std": 0.8377649770118296, "reward_change_max": 7.179379463195801e-05, "reward_change_mean": -0.06752440868876874, "reward_change_min": -0.17309920210391283, "reward_change_std": 0.06957878917455673, "reward_std": 0.8170069148764014, "rewards/cosine_scaled_reward": -0.09608553373254836, "rewards/format_reward": 0.25000000558793545, "step": 11 }, { "advantage_max": 1.1693915463984013, "advantage_mean": -2.6697914712325854e-08, "advantage_min": -0.9561029076576233, "advantage_std": 0.811566423624754, "completion_length": 2493.3959197998047, "epoch": 0.013714285714285714, "grad_norm": 0.11758553236722946, "kl": 3.711692988872528e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.4e-07, "loss": 0.0303, "reward": 0.4016623003408313, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4016623003408313, "reward_after_std": 0.8115664124488831, "reward_before_mean": 0.5080796424299479, "reward_before_std": 0.8227832280099392, "reward_change_max": 0.0005908533930778503, "reward_change_mean": -0.10641733650118113, "reward_change_min": -0.20470311492681503, "reward_change_std": 0.08011856814846396, "reward_std": 0.811566423624754, "rewards/cosine_scaled_reward": -0.06887686066329479, "rewards/format_reward": 0.6458333469927311, "step": 12 }, { "advantage_max": 0.9242342077195644, "advantage_mean": 1.024454859832602e-08, "advantage_min": -0.8469453305006027, "advantage_std": 0.6706404201686382, "completion_length": 2971.375030517578, "epoch": 0.014857142857142857, "grad_norm": 0.11529818177223206, "kl": 3.364682197570801e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.6e-07, "loss": 0.0336, "reward": 0.17752011120319366, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17752011120319366, "reward_after_std": 0.6706404276192188, "reward_before_mean": 0.267868370981887, "reward_before_std": 0.6849480085074902, "reward_change_max": 7.892400026321411e-05, "reward_change_mean": -0.09034825977869332, "reward_change_min": -0.1780348438769579, "reward_change_std": 0.07077532494440675, "reward_std": 0.6706404536962509, "rewards/cosine_scaled_reward": -0.07439915277063847, "rewards/format_reward": 0.41666667722165585, "step": 13 }, { "advantage_max": 0.9090108498930931, "advantage_mean": -2.23517424569053e-08, "advantage_min": -0.8783823177218437, "advantage_std": 0.6800604276359081, "completion_length": 2952.2708892822266, "epoch": 0.016, "grad_norm": 0.13060258328914642, "kl": 2.4463282898068428e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.8e-07, "loss": 0.0531, "reward": 0.250303965061903, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.250303965061903, "reward_after_std": 0.6800604052841663, "reward_before_mean": 0.34783143922686577, "reward_before_std": 0.6966002192348242, "reward_change_max": 0.0002243444323539734, "reward_change_mean": -0.09752750615007244, "reward_change_min": -0.18189744092524052, "reward_change_std": 0.07319995859870687, "reward_std": 0.6800604090094566, "rewards/cosine_scaled_reward": -0.024000946432352066, "rewards/format_reward": 0.39583334513008595, "step": 14 }, { "advantage_max": 0.8086681701242924, "advantage_mean": -1.0554989271494009e-08, "advantage_min": -0.8142276704311371, "advantage_std": 0.6346138492226601, "completion_length": 2722.041702270508, "epoch": 0.017142857142857144, "grad_norm": 0.06529524177312851, "kl": 1.6065314412117004e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 3e-07, "loss": 0.0207, "reward": 0.2515428556362167, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2515428556362167, "reward_after_std": 0.6346138343214989, "reward_before_mean": 0.35106278862804174, "reward_before_std": 0.6507196612656116, "reward_change_max": 0.00019127130508422852, "reward_change_mean": -0.09951994521543384, "reward_change_min": -0.1813431465998292, "reward_change_std": 0.0730478495825082, "reward_std": 0.6346138529479504, "rewards/cosine_scaled_reward": -0.02238526940345764, "rewards/format_reward": 0.39583333767950535, "step": 15 }, { "advantage_max": 1.0669633708894253, "advantage_mean": 2.2972624136308184e-08, "advantage_min": -0.6060903668403625, "advantage_std": 0.672724723815918, "completion_length": 3512.1666870117188, "epoch": 0.018285714285714287, "grad_norm": 0.12345141172409058, "kl": 4.026293754577637e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.2e-07, "loss": 0.0155, "reward": -0.3499115873128176, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3499115873128176, "reward_after_std": 0.6727247424423695, "reward_before_mean": -0.31081429310142994, "reward_before_std": 0.68895673006773, "reward_change_max": 0.00038511306047439575, "reward_change_mean": -0.03909728996222839, "reward_change_min": -0.10860877297818661, "reward_change_std": 0.04502260871231556, "reward_std": 0.6727247759699821, "rewards/cosine_scaled_reward": -0.20749048702418804, "rewards/format_reward": 0.10416666977107525, "step": 16 }, { "advantage_max": 0.855644017457962, "advantage_mean": -1.4280279181377864e-08, "advantage_min": -0.8395511396229267, "advantage_std": 0.6393131166696548, "completion_length": 2412.354179382324, "epoch": 0.019428571428571427, "grad_norm": 0.112917959690094, "kl": 3.923475742340088e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.4000000000000003e-07, "loss": 0.0264, "reward": 0.4075395166873932, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4075395166873932, "reward_after_std": 0.6393131166696548, "reward_before_mean": 0.5204492211341858, "reward_before_std": 0.6488973777741194, "reward_change_max": 0.00010866671800613403, "reward_change_mean": -0.11290970025584102, "reward_change_min": -0.21267955005168915, "reward_change_std": 0.07953456928953528, "reward_std": 0.639313155785203, "rewards/cosine_scaled_reward": -0.02102540386840701, "rewards/format_reward": 0.5625000074505806, "step": 17 }, { "advantage_max": 1.1588939391076565, "advantage_mean": -1.3038515989105548e-08, "advantage_min": -0.6808314770460129, "advantage_std": 0.7133111041039228, "completion_length": 2964.750030517578, "epoch": 0.02057142857142857, "grad_norm": 0.18582671880722046, "kl": 2.3312866687774658e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.6e-07, "loss": 0.0591, "reward": 0.21607510233297944, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.21607510233297944, "reward_after_std": 0.713311119005084, "reward_before_mean": 0.3058173172175884, "reward_before_std": 0.7081319317221642, "reward_change_max": 0.0002251937985420227, "reward_change_mean": -0.08974221721291542, "reward_change_min": -0.17165155429393053, "reward_change_std": 0.06901370617561042, "reward_std": 0.7133111339062452, "rewards/cosine_scaled_reward": -0.04500802047550678, "rewards/format_reward": 0.39583333767950535, "step": 18 }, { "advantage_max": 1.3655128255486488, "advantage_mean": 4.3461718668424965e-09, "advantage_min": -0.9774691089987755, "advantage_std": 0.9219000674784184, "completion_length": 2742.479232788086, "epoch": 0.021714285714285714, "grad_norm": 0.16909454762935638, "kl": 2.4370849132537842e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.7999999999999996e-07, "loss": 0.0466, "reward": 0.8763818801380694, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8763818801380694, "reward_after_std": 0.921900074928999, "reward_before_mean": 1.0231477841734886, "reward_before_std": 0.921125877648592, "reward_change_max": 0.0003154948353767395, "reward_change_mean": -0.1467658975161612, "reward_change_min": -0.25718063302338123, "reward_change_std": 0.10302350029814988, "reward_std": 0.9219001047313213, "rewards/cosine_scaled_reward": 0.23032389022409916, "rewards/format_reward": 0.562500013038516, "step": 19 }, { "advantage_max": 1.1179804876446724, "advantage_mean": -2.9181441429937394e-08, "advantage_min": -0.9255211316049099, "advantage_std": 0.8370347116142511, "completion_length": 2553.62508392334, "epoch": 0.022857142857142857, "grad_norm": 0.15487366914749146, "kl": 1.2110918760299683e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 4e-07, "loss": 0.1127, "reward": 0.6232295744121075, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6232295744121075, "reward_after_std": 0.8370346948504448, "reward_before_mean": 0.7511535082012415, "reward_before_std": 0.8457692014053464, "reward_change_max": 0.00010447204113006592, "reward_change_mean": -0.12792393937706947, "reward_change_min": -0.24280552193522453, "reward_change_std": 0.10140588087961078, "reward_std": 0.8370347060263157, "rewards/cosine_scaled_reward": 0.05266007035970688, "rewards/format_reward": 0.6458333395421505, "step": 20 }, { "advantage_max": 0.8383369371294975, "advantage_mean": -6.208816238917336e-10, "advantage_min": -0.7214223481714725, "advantage_std": 0.5726866647601128, "completion_length": 2885.1250610351562, "epoch": 0.024, "grad_norm": 0.09243390709161758, "kl": 3.4226104617118835e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.1999999999999995e-07, "loss": 0.0314, "reward": 0.13582478649914265, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.13582478649914265, "reward_after_std": 0.5726866982877254, "reward_before_mean": 0.22373342886567116, "reward_before_std": 0.5761883333325386, "reward_change_max": 0.0, "reward_change_mean": -0.08790865843184292, "reward_change_min": -0.14482644200325012, "reward_change_std": 0.06016037776134908, "reward_std": 0.5726867038756609, "rewards/cosine_scaled_reward": -0.08604994229972363, "rewards/format_reward": 0.39583334513008595, "step": 21 }, { "advantage_max": 0.9808385744690895, "advantage_mean": -2.6077032422300306e-08, "advantage_min": -0.7824203744530678, "advantage_std": 0.6632657460868359, "completion_length": 1853.5833587646484, "epoch": 0.025142857142857144, "grad_norm": 0.15798239409923553, "kl": 3.5993754863739014e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.3999999999999997e-07, "loss": 0.0177, "reward": 0.6410952005535364, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6410952005535364, "reward_after_std": 0.6632657647132874, "reward_before_mean": 0.7731111478060484, "reward_before_std": 0.6540880165994167, "reward_change_max": 0.00019576400518417358, "reward_change_mean": -0.1320159137248993, "reward_change_min": -0.22317629400640726, "reward_change_std": 0.0877161487005651, "reward_std": 0.6632657833397388, "rewards/cosine_scaled_reward": 0.021972209215164185, "rewards/format_reward": 0.7291666753590107, "step": 22 }, { "advantage_max": 0.7594370692968369, "advantage_mean": 6.829699139565548e-09, "advantage_min": -0.5250892378389835, "advantage_std": 0.4808049462735653, "completion_length": 2642.6458740234375, "epoch": 0.026285714285714287, "grad_norm": 0.0673590898513794, "kl": 3.293342888355255e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.6e-07, "loss": 0.047, "reward": -0.059821839444339275, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.059821839444339275, "reward_after_std": 0.4808049499988556, "reward_before_mean": 0.011269157752394676, "reward_before_std": 0.47694574669003487, "reward_change_max": 0.0, "reward_change_mean": -0.07109098765067756, "reward_change_min": -0.13114846032112837, "reward_change_std": 0.048369639087468386, "reward_std": 0.4808049723505974, "rewards/cosine_scaled_reward": -0.19228209322318435, "rewards/format_reward": 0.3958333358168602, "step": 23 }, { "advantage_max": 1.3633764162659645, "advantage_mean": -7.450580596923828e-09, "advantage_min": -1.2567225024104118, "advantage_std": 1.080951388925314, "completion_length": 3095.5833740234375, "epoch": 0.027428571428571427, "grad_norm": 0.18176376819610596, "kl": 2.3409724235534668e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.8e-07, "loss": 0.085, "reward": 0.42115747928619385, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.42115747928619385, "reward_after_std": 1.0809513852000237, "reward_before_mean": 0.5261210352182388, "reward_before_std": 1.12699656188488, "reward_change_max": 9.399652481079102e-05, "reward_change_mean": -0.10496355732902884, "reward_change_min": -0.24463928490877151, "reward_change_std": 0.10649053333327174, "reward_std": 1.0809514187276363, "rewards/cosine_scaled_reward": 0.03389384981710464, "rewards/format_reward": 0.4583333432674408, "step": 24 }, { "advantage_max": 0.9184271581470966, "advantage_mean": -2.2351742401394148e-08, "advantage_min": -0.8210932016372681, "advantage_std": 0.6771118529140949, "completion_length": 2621.937530517578, "epoch": 0.02857142857142857, "grad_norm": 0.0775315910577774, "kl": 3.3681513741612434e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 5e-07, "loss": 0.0084, "reward": 0.45451467111706734, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.45451467111706734, "reward_after_std": 0.6771118529140949, "reward_before_mean": 0.5703998990356922, "reward_before_std": 0.6853658109903336, "reward_change_max": 0.0002382621169090271, "reward_change_mean": -0.11588521907106042, "reward_change_min": -0.2103752288967371, "reward_change_std": 0.0814046454615891, "reward_std": 0.6771118678152561, "rewards/cosine_scaled_reward": 0.024783269211184233, "rewards/format_reward": 0.520833333954215, "step": 25 }, { "advantage_max": 0.646996196359396, "advantage_mean": -1.6453366169510986e-08, "advantage_min": -0.7164939921349287, "advantage_std": 0.5295879691839218, "completion_length": 3004.125030517578, "epoch": 0.029714285714285714, "grad_norm": 0.08456138521432877, "kl": 2.5331974029541016e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.2e-07, "loss": 0.0187, "reward": 0.19068976771086454, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19068976771086454, "reward_after_std": 0.5295879682525992, "reward_before_mean": 0.28746249340474606, "reward_before_std": 0.541738043539226, "reward_change_max": 0.0001690015196800232, "reward_change_mean": -0.09677271894179285, "reward_change_min": -0.17185449041426182, "reward_change_std": 0.06900891847908497, "reward_std": 0.5295879831537604, "rewards/cosine_scaled_reward": -0.06460210494697094, "rewards/format_reward": 0.416666679084301, "step": 26 }, { "advantage_max": 1.1920481473207474, "advantage_mean": 1.8626450937198058e-09, "advantage_min": -0.8214569389820099, "advantage_std": 0.7810939662158489, "completion_length": 2973.958396911621, "epoch": 0.030857142857142857, "grad_norm": 0.17237314581871033, "kl": 2.349168062210083e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.4e-07, "loss": 0.0396, "reward": 0.12759432382881641, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.12759432382881641, "reward_after_std": 0.7810939662158489, "reward_before_mean": 0.20839167991653085, "reward_before_std": 0.7917383573949337, "reward_change_max": 0.0, "reward_change_mean": -0.08079732768237591, "reward_change_min": -0.15045349579304457, "reward_change_std": 0.06384936673566699, "reward_std": 0.781093992292881, "rewards/cosine_scaled_reward": -0.09372084098868072, "rewards/format_reward": 0.39583333767950535, "step": 27 }, { "advantage_max": 0.9704475104808807, "advantage_mean": 1.6142925107764938e-08, "advantage_min": -0.9775747060775757, "advantage_std": 0.707570880651474, "completion_length": 2929.395896911621, "epoch": 0.032, "grad_norm": 0.10196535289287567, "kl": 1.9277911633253098e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.6e-07, "loss": 0.0309, "reward": 0.24632756784558296, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.24632756784558296, "reward_after_std": 0.7075709030032158, "reward_before_mean": 0.34219497814774513, "reward_before_std": 0.7241467647254467, "reward_change_max": 0.0002382919192314148, "reward_change_mean": -0.09586740791564807, "reward_change_min": -0.16923209372907877, "reward_change_std": 0.0729997109156102, "reward_std": 0.7075709067285061, "rewards/cosine_scaled_reward": -0.02681917743757367, "rewards/format_reward": 0.39583334140479565, "step": 28 }, { "advantage_max": 0.8871082998812199, "advantage_mean": 2.2817403591557373e-08, "advantage_min": -0.593153104186058, "advantage_std": 0.561270035803318, "completion_length": 3434.416717529297, "epoch": 0.03314285714285714, "grad_norm": 0.11913178116083145, "kl": 1.4252960681915283e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.8e-07, "loss": 0.0478, "reward": -0.35079627111554146, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.35079627111554146, "reward_after_std": 0.5612700264900923, "reward_before_mean": -0.3084347862750292, "reward_before_std": 0.5713800620287657, "reward_change_max": 0.00014052540063858032, "reward_change_mean": -0.042361461790278554, "reward_change_min": -0.10619675740599632, "reward_change_std": 0.04429285158403218, "reward_std": 0.5612700562924147, "rewards/cosine_scaled_reward": -0.2375507289543748, "rewards/format_reward": 0.16666666977107525, "step": 29 }, { "advantage_max": 1.0437628850340843, "advantage_mean": -1.0554989215982857e-08, "advantage_min": -0.8979457393288612, "advantage_std": 0.6987891010940075, "completion_length": 2934.1250610351562, "epoch": 0.03428571428571429, "grad_norm": 0.1300898790359497, "kl": 2.6658177375793457e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 6e-07, "loss": 0.0668, "reward": 0.4072896996513009, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4072896996513009, "reward_after_std": 0.6987890899181366, "reward_before_mean": 0.516874086111784, "reward_before_std": 0.7041825018823147, "reward_change_max": 0.0003464892506599426, "reward_change_mean": -0.1095843828516081, "reward_change_min": -0.17533477023243904, "reward_change_std": 0.07200261077377945, "reward_std": 0.6987891085445881, "rewards/cosine_scaled_reward": 0.008437026292085648, "rewards/format_reward": 0.5000000167638063, "step": 30 }, { "advantage_max": 1.0937358774244785, "advantage_mean": -1.2417634420724966e-08, "advantage_min": -0.7162820585072041, "advantage_std": 0.7303737886250019, "completion_length": 3102.354202270508, "epoch": 0.03542857142857143, "grad_norm": 0.12131313979625702, "kl": 3.250502049922943e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.2e-07, "loss": 0.0399, "reward": -0.049326577223837376, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.049326577223837376, "reward_after_std": 0.7303738072514534, "reward_before_mean": 0.016891899227630347, "reward_before_std": 0.7446071989834309, "reward_change_max": 0.0001913905143737793, "reward_change_mean": -0.06621849071234465, "reward_change_min": -0.14599811471998692, "reward_change_std": 0.060666448436677456, "reward_std": 0.7303738184273243, "rewards/cosine_scaled_reward": -0.12697071363800205, "rewards/format_reward": 0.2708333432674408, "step": 31 }, { "advantage_max": 1.0439314730465412, "advantage_mean": -2.220446049250313e-16, "advantage_min": -0.8557010069489479, "advantage_std": 0.717094523832202, "completion_length": 3120.104217529297, "epoch": 0.036571428571428574, "grad_norm": 0.13845178484916687, "kl": 3.4399330615997314e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.4e-07, "loss": 0.034, "reward": 0.10101676848717034, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10101676848717034, "reward_after_std": 0.717094536870718, "reward_before_mean": 0.18201410118490458, "reward_before_std": 0.7300637681037188, "reward_change_max": 0.00033176690340042114, "reward_change_mean": -0.0809973543509841, "reward_change_min": -0.15452067088335752, "reward_change_std": 0.06580975430551916, "reward_std": 0.7170945592224598, "rewards/cosine_scaled_reward": -0.0964929424226284, "rewards/format_reward": 0.3750000111758709, "step": 32 }, { "advantage_max": 1.3146421052515507, "advantage_mean": 4.346171533775589e-09, "advantage_min": -0.9072512723505497, "advantage_std": 0.9102408867329359, "completion_length": 3378.291717529297, "epoch": 0.037714285714285714, "grad_norm": 0.13238871097564697, "kl": 3.647804260253906e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.6e-07, "loss": 0.0247, "reward": -0.030413204804062843, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.030413204804062843, "reward_after_std": 0.9102408867329359, "reward_before_mean": 0.03352950140833855, "reward_before_std": 0.9408820159733295, "reward_change_max": 0.0003842562437057495, "reward_change_mean": -0.06394269852899015, "reward_change_min": -0.1856099097058177, "reward_change_std": 0.07536959834396839, "reward_std": 0.910240899771452, "rewards/cosine_scaled_reward": -0.10823525360319763, "rewards/format_reward": 0.25000000931322575, "step": 33 }, { "advantage_max": 1.2045657709240913, "advantage_mean": -1.3038516377683607e-08, "advantage_min": -1.0522035360336304, "advantage_std": 0.8648568205535412, "completion_length": 2520.2708587646484, "epoch": 0.038857142857142854, "grad_norm": 0.17876236140727997, "kl": 0.00010145828127861023, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.800000000000001e-07, "loss": 0.0492, "reward": 0.6656378395855427, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6656378395855427, "reward_after_std": 0.8648568540811539, "reward_before_mean": 0.7960071973502636, "reward_before_std": 0.8760812990367413, "reward_change_max": 9.147077798843384e-05, "reward_change_mean": -0.13036933494731784, "reward_change_min": -0.2507061818614602, "reward_change_std": 0.09630749723874032, "reward_std": 0.8648568838834763, "rewards/cosine_scaled_reward": 0.1271702533122152, "rewards/format_reward": 0.5416666772216558, "step": 34 }, { "advantage_max": 1.3862405456602573, "advantage_mean": -7.450580596923828e-09, "advantage_min": -0.8664616197347641, "advantage_std": 0.918156361207366, "completion_length": 2984.1666870117188, "epoch": 0.04, "grad_norm": 0.13064010441303253, "kl": 8.083879947662354e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 7e-07, "loss": 0.0494, "reward": 0.1301069421460852, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1301069421460852, "reward_after_std": 0.918156361207366, "reward_before_mean": 0.20769228972494602, "reward_before_std": 0.9387262333184481, "reward_change_max": 0.0002269744873046875, "reward_change_mean": -0.077585359220393, "reward_change_min": -0.19662514980882406, "reward_change_std": 0.07557157322298735, "reward_std": 0.9181563761085272, "rewards/cosine_scaled_reward": -0.07323719232226722, "rewards/format_reward": 0.3541666716337204, "step": 35 }, { "advantage_max": 0.5974312610924244, "advantage_mean": 0.0, "advantage_min": -0.4941564276814461, "advantage_std": 0.41885973513126373, "completion_length": 3372.6666870117188, "epoch": 0.04114285714285714, "grad_norm": 0.07860232889652252, "kl": 0.00011217966675758362, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.2e-07, "loss": 0.037, "reward": -0.4468262065201998, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4468262065201998, "reward_after_std": 0.41885972395539284, "reward_before_mean": -0.40820746310055256, "reward_before_std": 0.4311361648142338, "reward_change_max": 0.0005601868033409119, "reward_change_mean": -0.03861874993890524, "reward_change_min": -0.09212729427963495, "reward_change_std": 0.03797971783205867, "reward_std": 0.41885973140597343, "rewards/cosine_scaled_reward": -0.29785373993217945, "rewards/format_reward": 0.18750000558793545, "step": 36 }, { "advantage_max": 0.7455697171390057, "advantage_mean": -1.2417633032946185e-09, "advantage_min": -0.5482769273221493, "advantage_std": 0.47954942658543587, "completion_length": 3295.4166870117188, "epoch": 0.04228571428571429, "grad_norm": 0.09077224880456924, "kl": 6.353668868541718e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.4e-07, "loss": 0.0224, "reward": -0.30808172933757305, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.30808172933757305, "reward_after_std": 0.47954943403601646, "reward_before_mean": -0.259306862950325, "reward_before_std": 0.4846712723374367, "reward_change_max": 6.873160600662231e-05, "reward_change_mean": -0.048774888389743865, "reward_change_min": -0.09794063959270716, "reward_change_std": 0.038542215479537845, "reward_std": 0.47954943776130676, "rewards/cosine_scaled_reward": -0.2338200956583023, "rewards/format_reward": 0.2083333432674408, "step": 37 }, { "advantage_max": 0.9543525539338589, "advantage_mean": -2.483526828633842e-09, "advantage_min": -0.537376195192337, "advantage_std": 0.5838014744222164, "completion_length": 3174.2083435058594, "epoch": 0.04342857142857143, "grad_norm": 0.10276984423398972, "kl": 0.00011193100363016129, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.599999999999999e-07, "loss": 0.0019, "reward": 0.018349453806877136, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.018349453806877136, "reward_after_std": 0.5838014855980873, "reward_before_mean": 0.09325479716062546, "reward_before_std": 0.5771887041628361, "reward_change_max": 0.00024618208408355713, "reward_change_mean": -0.0749053421895951, "reward_change_min": -0.15158047154545784, "reward_change_std": 0.05743449926376343, "reward_std": 0.5838015079498291, "rewards/cosine_scaled_reward": -0.07837261259555817, "rewards/format_reward": 0.2500000037252903, "step": 38 }, { "advantage_max": 0.6583370007574558, "advantage_mean": 2.1730860333413204e-08, "advantage_min": -0.6974942088127136, "advantage_std": 0.5173010732978582, "completion_length": 2943.625030517578, "epoch": 0.044571428571428574, "grad_norm": 0.0883026197552681, "kl": 8.260644972324371e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.799999999999999e-07, "loss": 0.0277, "reward": 0.2838073279708624, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2838073279708624, "reward_after_std": 0.5173010714352131, "reward_before_mean": 0.38903780886903405, "reward_before_std": 0.519108316861093, "reward_change_max": 0.000254705548286438, "reward_change_mean": -0.10523045598529279, "reward_change_min": -0.18225091230124235, "reward_change_std": 0.07373343408107758, "reward_std": 0.5173010863363743, "rewards/cosine_scaled_reward": -0.04506444372236729, "rewards/format_reward": 0.479166679084301, "step": 39 }, { "advantage_max": 0.8892773240804672, "advantage_mean": -1.3659398168108794e-08, "advantage_min": -0.722268171608448, "advantage_std": 0.6039473228156567, "completion_length": 2825.562545776367, "epoch": 0.045714285714285714, "grad_norm": 0.10289324820041656, "kl": 0.0005248039960861206, "lambda_div_used": 0.9000000000000001, "learning_rate": 8e-07, "loss": 0.0336, "reward": 0.20918016554787755, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.20918016554787755, "reward_after_std": 0.6039473339915276, "reward_before_mean": 0.30307046277448535, "reward_before_std": 0.6087586358189583, "reward_change_max": 0.0, "reward_change_mean": -0.09389030793681741, "reward_change_min": -0.1630670754238963, "reward_change_std": 0.06309279799461365, "reward_std": 0.6039473377168179, "rewards/cosine_scaled_reward": -0.06721477210521698, "rewards/format_reward": 0.4375000037252903, "step": 40 }, { "advantage_max": 1.5364714190363884, "advantage_mean": -5.587935503204022e-09, "advantage_min": -0.7693024277687073, "advantage_std": 0.8801210299134254, "completion_length": 3060.625030517578, "epoch": 0.046857142857142854, "grad_norm": 0.17200098931789398, "kl": 0.00013019144535064697, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.199999999999999e-07, "loss": 0.0641, "reward": -0.11428980063647032, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11428980063647032, "reward_after_std": 0.8801210466772318, "reward_before_mean": -0.06253744126297534, "reward_before_std": 0.8866547737270594, "reward_change_max": 0.0007996335625648499, "reward_change_mean": -0.051752346043940634, "reward_change_min": -0.13621089048683643, "reward_change_std": 0.052859612624160945, "reward_std": 0.8801210653036833, "rewards/cosine_scaled_reward": -0.19793538935482502, "rewards/format_reward": 0.33333334513008595, "step": 41 }, { "advantage_max": 0.47750604152679443, "advantage_mean": 1.614292477469803e-08, "advantage_min": -0.4910551328212023, "advantage_std": 0.38328032568097115, "completion_length": 2901.8541774749756, "epoch": 0.048, "grad_norm": 0.057079702615737915, "kl": 8.524954319000244e-05, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.399999999999999e-07, "loss": -0.0038, "reward": -0.3438019538298249, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3438019538298249, "reward_after_std": 0.38328034430742264, "reward_before_mean": -0.2935850992798805, "reward_before_std": 0.3953288784250617, "reward_change_max": 0.0003351941704750061, "reward_change_mean": -0.05021684942767024, "reward_change_min": -0.10241542104631662, "reward_change_std": 0.04318908369168639, "reward_std": 0.38328035920858383, "rewards/cosine_scaled_reward": -0.2613758873194456, "rewards/format_reward": 0.2291666716337204, "step": 42 }, { "advantage_max": 0.915099672973156, "advantage_mean": 6.829699028543246e-09, "advantage_min": -0.6373906396329403, "advantage_std": 0.6149002127349377, "completion_length": 3080.7500228881836, "epoch": 0.04914285714285714, "grad_norm": 0.10517507046461105, "kl": 0.00013168156147003174, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.599999999999999e-07, "loss": 0.0442, "reward": -0.11758936569094658, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11758936569094658, "reward_after_std": 0.6149002015590668, "reward_before_mean": -0.054422732442617416, "reward_before_std": 0.626342048868537, "reward_change_max": 0.0004749596118927002, "reward_change_mean": -0.06316663301549852, "reward_change_min": -0.13307728618383408, "reward_change_std": 0.05423153773881495, "reward_std": 0.6149002015590668, "rewards/cosine_scaled_reward": -0.1417947057634592, "rewards/format_reward": 0.22916666977107525, "step": 43 }, { "advantage_max": 0.6789183877408504, "advantage_mean": -2.3593506592867186e-08, "advantage_min": -0.6343220472335815, "advantage_std": 0.5155519731342793, "completion_length": 2674.687530517578, "epoch": 0.05028571428571429, "grad_norm": 0.07515233010053635, "kl": 0.00036903470754623413, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.799999999999999e-07, "loss": -0.0067, "reward": 0.3380686726886779, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3380686726886779, "reward_after_std": 0.5155519805848598, "reward_before_mean": 0.4477055589668453, "reward_before_std": 0.5174354799091816, "reward_change_max": 0.00022473931312561035, "reward_change_mean": -0.10963688185438514, "reward_change_min": -0.1835013646632433, "reward_change_std": 0.07373402267694473, "reward_std": 0.5155520141124725, "rewards/cosine_scaled_reward": -0.02614724636077881, "rewards/format_reward": 0.5000000037252903, "step": 44 }, { "advantage_max": 0.9863204881548882, "advantage_mean": -4.967054212379196e-09, "advantage_min": -0.8636655509471893, "advantage_std": 0.7028949670493603, "completion_length": 3406.4791870117188, "epoch": 0.05142857142857143, "grad_norm": 0.10877612978219986, "kl": 0.00013073720037937164, "lambda_div_used": 0.9000000000000001, "learning_rate": 9e-07, "loss": 0.0269, "reward": -0.03182817902415991, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.03182817902415991, "reward_after_std": 0.70289496332407, "reward_before_mean": 0.03812003450002521, "reward_before_std": 0.7237787656486034, "reward_change_max": 0.00034902244806289673, "reward_change_mean": -0.06994821969419718, "reward_change_min": -0.15288027469068766, "reward_change_std": 0.06384195922873914, "reward_std": 0.7028949670493603, "rewards/cosine_scaled_reward": -0.11635664664208889, "rewards/format_reward": 0.27083334140479565, "step": 45 }, { "advantage_max": 0.7321793995797634, "advantage_mean": 4.346172088887101e-09, "advantage_min": -0.5258349291980267, "advantage_std": 0.4733831323683262, "completion_length": 3226.1875, "epoch": 0.052571428571428575, "grad_norm": 0.0635356530547142, "kl": 0.0003596842288970947, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.2e-07, "loss": 0.0042, "reward": -0.3120793215930462, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.3120793215930462, "reward_after_std": 0.4733831509947777, "reward_before_mean": -0.26340748369693756, "reward_before_std": 0.4777502194046974, "reward_change_max": 0.00038643181324005127, "reward_change_mean": -0.0486718516331166, "reward_change_min": -0.09745941311120987, "reward_change_std": 0.04000273603014648, "reward_std": 0.4733831658959389, "rewards/cosine_scaled_reward": -0.20462040696293116, "rewards/format_reward": 0.14583333395421505, "step": 46 }, { "advantage_max": 1.245551098138094, "advantage_mean": 1.3038516488705909e-08, "advantage_min": -1.1025776639580727, "advantage_std": 0.9889654777944088, "completion_length": 2921.208396911621, "epoch": 0.053714285714285714, "grad_norm": 0.13856422901153564, "kl": 0.0002876073122024536, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.399999999999999e-07, "loss": 0.0558, "reward": 0.4111839346587658, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4111839346587658, "reward_after_std": 0.9889654964208603, "reward_before_mean": 0.5171041414141655, "reward_before_std": 1.0297090746462345, "reward_change_max": 0.0004226118326187134, "reward_change_mean": -0.10592022282071412, "reward_change_min": -0.24152968171983957, "reward_change_std": 0.10095622227527201, "reward_std": 0.9889655113220215, "rewards/cosine_scaled_reward": 0.029385413974523544, "rewards/format_reward": 0.4583333395421505, "step": 47 }, { "advantage_max": 1.1130247823894024, "advantage_mean": -2.5456151464542387e-08, "advantage_min": -0.6210677027702332, "advantage_std": 0.6353725697845221, "completion_length": 2762.416717529297, "epoch": 0.054857142857142854, "grad_norm": 0.13798625767230988, "kl": 0.000944383442401886, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.6e-07, "loss": 0.0428, "reward": 0.0739603266119957, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0739603266119957, "reward_after_std": 0.6353725772351027, "reward_before_mean": 0.15098485595081002, "reward_before_std": 0.6252976432442665, "reward_change_max": 0.0002612695097923279, "reward_change_mean": -0.0770245383027941, "reward_change_min": -0.13992419466376305, "reward_change_std": 0.05226586083881557, "reward_std": 0.6353725884109735, "rewards/cosine_scaled_reward": -0.11200757790356874, "rewards/format_reward": 0.3750000037252903, "step": 48 }, { "advantage_max": 1.1139013655483723, "advantage_mean": -2.483527011820641e-08, "advantage_min": -0.7361991293728352, "advantage_std": 0.7340975552797318, "completion_length": 2435.8125610351562, "epoch": 0.056, "grad_norm": 0.11226435005664825, "kl": 0.0004952177405357361, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.8e-07, "loss": 0.0351, "reward": 0.22510142996907234, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.22510142996907234, "reward_after_std": 0.7340975552797318, "reward_before_mean": 0.31644671969115734, "reward_before_std": 0.7392865046858788, "reward_change_max": 7.228553295135498e-05, "reward_change_mean": -0.09134529763832688, "reward_change_min": -0.18282661493867636, "reward_change_std": 0.06951131741516292, "reward_std": 0.7340975776314735, "rewards/cosine_scaled_reward": -0.1021933276206255, "rewards/format_reward": 0.5208333432674408, "step": 49 }, { "advantage_max": 0.8116575442254543, "advantage_mean": -1.9868215128671096e-08, "advantage_min": -0.7131141312420368, "advantage_std": 0.5905804056674242, "completion_length": 2969.750030517578, "epoch": 0.05714285714285714, "grad_norm": 0.11635003983974457, "kl": 0.0006887298077344894, "lambda_div_used": 0.9000000000000001, "learning_rate": 1e-06, "loss": 0.0295, "reward": 0.14436959475278854, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.14436959475278854, "reward_after_std": 0.5905804093927145, "reward_before_mean": 0.23427366465330124, "reward_before_std": 0.5951451063156128, "reward_change_max": 0.00026381760835647583, "reward_change_mean": -0.08990406547673047, "reward_change_min": -0.17287831474095583, "reward_change_std": 0.07043572561815381, "reward_std": 0.5905804317444563, "rewards/cosine_scaled_reward": -0.039113187696784735, "rewards/format_reward": 0.31250000558793545, "step": 50 }, { "advantage_max": 1.1374978050589561, "advantage_mean": -2.0333876360467684e-08, "advantage_min": -0.7362420186400414, "advantage_std": 0.7058875262737274, "completion_length": 2325.437557220459, "epoch": 0.05828571428571429, "grad_norm": 0.10930530726909637, "kl": 0.00194627046585083, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.999890338174275e-07, "loss": 0.033, "reward": 0.18171239644289017, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.18171239644289017, "reward_after_std": 0.7058875374495983, "reward_before_mean": 0.2686680965125561, "reward_before_std": 0.7034656405448914, "reward_change_max": 0.0003391280770301819, "reward_change_mean": -0.08695570658892393, "reward_change_min": -0.17610237654298544, "reward_change_std": 0.06442677089944482, "reward_std": 0.7058875598013401, "rewards/cosine_scaled_reward": -0.12608262081630528, "rewards/format_reward": 0.5208333376795053, "step": 51 }, { "advantage_max": 1.2928729727864265, "advantage_mean": -1.9868215517249155e-08, "advantage_min": -0.9549924209713936, "advantage_std": 0.9508500788360834, "completion_length": 2870.7500534057617, "epoch": 0.05942857142857143, "grad_norm": 0.14845716953277588, "kl": 0.0017962455749511719, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.999561358041868e-07, "loss": 0.0508, "reward": 0.34414372593164444, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.34414372593164444, "reward_after_std": 0.9508501011878252, "reward_before_mean": 0.44306235015392303, "reward_before_std": 0.98055231384933, "reward_change_max": 0.00012401491403579712, "reward_change_mean": -0.09891863886150531, "reward_change_min": -0.2104501435533166, "reward_change_std": 0.08957649956573732, "reward_std": 0.950850136578083, "rewards/cosine_scaled_reward": 0.023614494362846017, "rewards/format_reward": 0.3958333358168602, "step": 52 }, { "advantage_max": 1.2351016141474247, "advantage_mean": 6.208818459363386e-10, "advantage_min": -1.186292514204979, "advantage_std": 0.9753887392580509, "completion_length": 2831.3333587646484, "epoch": 0.060571428571428575, "grad_norm": 0.16659170389175415, "kl": 0.003074079751968384, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.999013075636804e-07, "loss": 0.0531, "reward": 0.5398093909025192, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5398093909025192, "reward_after_std": 0.9753887094557285, "reward_before_mean": 0.6579989977180958, "reward_before_std": 1.008890239521861, "reward_change_max": 0.00019691884517669678, "reward_change_mean": -0.11818960297387093, "reward_change_min": -0.25947624258697033, "reward_change_std": 0.10222019837237895, "reward_std": 0.9753887467086315, "rewards/cosine_scaled_reward": 0.047749497927725315, "rewards/format_reward": 0.5625000111758709, "step": 53 }, { "advantage_max": 1.079960823059082, "advantage_mean": -2.3593505593666464e-08, "advantage_min": -1.5959435552358627, "advantage_std": 0.9909502565860748, "completion_length": 2885.0000610351562, "epoch": 0.061714285714285715, "grad_norm": 0.19337964057922363, "kl": 0.0005113296210765839, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.998245517681593e-07, "loss": 0.1239, "reward": 0.781525231897831, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.781525231897831, "reward_after_std": 0.9909502565860748, "reward_before_mean": 0.9249188676476479, "reward_before_std": 1.039504911750555, "reward_change_max": 0.0005872845649719238, "reward_change_mean": -0.14339364634361118, "reward_change_min": -0.26627498492598534, "reward_change_std": 0.11663470219355077, "reward_std": 0.9909502938389778, "rewards/cosine_scaled_reward": 0.19162609428167343, "rewards/format_reward": 0.5416666865348816, "step": 54 }, { "advantage_max": 1.3312099613249302, "advantage_mean": 1.8626449826975033e-09, "advantage_min": -0.5829783342778683, "advantage_std": 0.7693915814161301, "completion_length": 3077.0208587646484, "epoch": 0.06285714285714286, "grad_norm": 0.13275492191314697, "kl": 0.0009839534759521484, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.997258721585931e-07, "loss": 0.0549, "reward": 0.10769246646668762, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.10769246646668762, "reward_after_std": 0.7693915888667107, "reward_before_mean": 0.1841644076630473, "reward_before_std": 0.7638834398239851, "reward_change_max": 5.056709051132202e-05, "reward_change_mean": -0.07647194608580321, "reward_change_min": -0.15568101592361927, "reward_change_std": 0.060355184017680585, "reward_std": 0.7693916037678719, "rewards/cosine_scaled_reward": -0.07458446500822902, "rewards/format_reward": 0.3333333358168602, "step": 55 }, { "advantage_max": 0.6472447663545609, "advantage_mean": -9.93410786964688e-09, "advantage_min": -0.5673340857028961, "advantage_std": 0.4818668272346258, "completion_length": 2857.8542098999023, "epoch": 0.064, "grad_norm": 0.07451209425926208, "kl": 0.000834345817565918, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.996052735444862e-07, "loss": 0.004, "reward": 0.1308621042408049, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1308621042408049, "reward_after_std": 0.48186682537198067, "reward_before_mean": 0.2224470037035644, "reward_before_std": 0.4868644941598177, "reward_change_max": 0.0003842562437057495, "reward_change_mean": -0.09158491797279567, "reward_change_min": -0.1648717550560832, "reward_change_std": 0.06347237096633762, "reward_std": 0.48186685517430305, "rewards/cosine_scaled_reward": -0.0971098318696022, "rewards/format_reward": 0.416666679084301, "step": 56 }, { "advantage_max": 0.9662680625915527, "advantage_mean": 3.104408563547878e-09, "advantage_min": -0.8334208503365517, "advantage_std": 0.7065017186105251, "completion_length": 3336.7708740234375, "epoch": 0.06514285714285714, "grad_norm": 0.25560063123703003, "kl": 0.026805490255355835, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.994627618036452e-07, "loss": 0.0182, "reward": -0.058415647596120834, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.058415647596120834, "reward_after_std": 0.7065017186105251, "reward_before_mean": 0.009561575949192047, "reward_before_std": 0.7297605946660042, "reward_change_max": 0.00029649585485458374, "reward_change_mean": -0.06797723285853863, "reward_change_min": -0.15673517063260078, "reward_change_std": 0.06704360526055098, "reward_std": 0.7065017279237509, "rewards/cosine_scaled_reward": -0.14105254039168358, "rewards/format_reward": 0.29166667722165585, "step": 57 }, { "advantage_max": 1.276023730635643, "advantage_mean": -2.110997909809953e-08, "advantage_min": -0.9838252775371075, "advantage_std": 0.9304085336625576, "completion_length": 2217.479217529297, "epoch": 0.06628571428571428, "grad_norm": 0.19546259939670563, "kl": 0.003041982650756836, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.992983438818915e-07, "loss": 0.0658, "reward": 0.7234169393777847, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7234169393777847, "reward_after_std": 0.9304085709154606, "reward_before_mean": 0.8578736670315266, "reward_before_std": 0.9398499056696892, "reward_change_max": 0.0002982392907142639, "reward_change_mean": -0.134456732776016, "reward_change_min": -0.2525649508461356, "reward_change_std": 0.10482977563515306, "reward_std": 0.9304086118936539, "rewards/cosine_scaled_reward": 0.06435349676758051, "rewards/format_reward": 0.7291666697710752, "step": 58 }, { "advantage_max": 0.8459695763885975, "advantage_mean": -7.45058070794613e-09, "advantage_min": -0.7203714326024055, "advantage_std": 0.5940009132027626, "completion_length": 3036.3125076293945, "epoch": 0.06742857142857143, "grad_norm": 0.10515865683555603, "kl": 0.0018346011638641357, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.991120277927223e-07, "loss": 0.0007, "reward": 0.016265645623207092, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.016265645623207092, "reward_after_std": 0.5940009374171495, "reward_before_mean": 0.09327167272567749, "reward_before_std": 0.6062601394951344, "reward_change_max": 0.00019018352031707764, "reward_change_mean": -0.07700600824318826, "reward_change_min": -0.1557639017701149, "reward_change_std": 0.06312599789816886, "reward_std": 0.594000943005085, "rewards/cosine_scaled_reward": -0.0783641804009676, "rewards/format_reward": 0.2500000037252903, "step": 59 }, { "advantage_max": 1.0619999282062054, "advantage_mean": 6.208817238118058e-09, "advantage_min": -0.6794852465391159, "advantage_std": 0.6504358239471912, "completion_length": 2965.6458740234375, "epoch": 0.06857142857142857, "grad_norm": 0.10484857112169266, "kl": 0.0010578781366348267, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.989038226169207e-07, "loss": 0.0099, "reward": -0.017510805279016495, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.017510805279016495, "reward_after_std": 0.6504357941448689, "reward_before_mean": 0.05231862887740135, "reward_before_std": 0.6509438417851925, "reward_change_max": 0.0002585947513580322, "reward_change_mean": -0.06982943858020008, "reward_change_min": -0.14922125078737736, "reward_change_std": 0.05565405311062932, "reward_std": 0.6504357997328043, "rewards/cosine_scaled_reward": -0.1613406909746118, "rewards/format_reward": 0.37500000186264515, "step": 60 }, { "advantage_max": 0.9533044211566448, "advantage_mean": 1.241763691872677e-09, "advantage_min": -0.7701556235551834, "advantage_std": 0.6510935872793198, "completion_length": 3141.937530517578, "epoch": 0.06971428571428571, "grad_norm": 0.12680523097515106, "kl": 0.001262512058019638, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.98673738502114e-07, "loss": 0.0681, "reward": 0.3617987995967269, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3617987995967269, "reward_after_std": 0.6510935984551907, "reward_before_mean": 0.46881416253745556, "reward_before_std": 0.6530401557683945, "reward_change_max": 0.00017626583576202393, "reward_change_mean": -0.10701534803956747, "reward_change_min": -0.1809440078213811, "reward_change_std": 0.0730011141858995, "reward_std": 0.6510936245322227, "rewards/cosine_scaled_reward": -0.015592940151691437, "rewards/format_reward": 0.5000000074505806, "step": 61 }, { "advantage_max": 1.4476277567446232, "advantage_mean": 1.2417638028949796e-09, "advantage_min": -1.1485518887639046, "advantage_std": 1.023965161293745, "completion_length": 2572.458427429199, "epoch": 0.07085714285714285, "grad_norm": 0.12858064472675323, "kl": 0.0027109384536743164, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.98421786662277e-07, "loss": 0.032, "reward": 0.4915749344509095, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4915749344509095, "reward_after_std": 1.0239651463925838, "reward_before_mean": 0.6014097668230534, "reward_before_std": 1.0463957451283932, "reward_change_max": 1.6786158084869385e-05, "reward_change_mean": -0.10983482981100678, "reward_change_min": -0.23927107453346252, "reward_change_std": 0.09747252892702818, "reward_std": 1.0239651799201965, "rewards/cosine_scaled_reward": -0.022211784962564707, "rewards/format_reward": 0.6458333469927311, "step": 62 }, { "advantage_max": 1.250469371676445, "advantage_mean": -1.1175871006408045e-08, "advantage_min": -1.0716526806354523, "advantage_std": 0.8724186569452286, "completion_length": 2255.791717529297, "epoch": 0.072, "grad_norm": 0.138187974691391, "kl": 0.0027687549591064453, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.981479793771866e-07, "loss": 0.0289, "reward": 0.7566146403551102, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7566146403551102, "reward_after_std": 0.8724186569452286, "reward_before_mean": 0.8947709426283836, "reward_before_std": 0.8801536336541176, "reward_change_max": 0.0, "reward_change_mean": -0.1381562864407897, "reward_change_min": -0.2546289935708046, "reward_change_std": 0.09683123417198658, "reward_std": 0.8724186718463898, "rewards/cosine_scaled_reward": 0.08280211873352528, "rewards/format_reward": 0.7291666828095913, "step": 63 }, { "advantage_max": 1.065049335360527, "advantage_mean": -1.676380662063437e-08, "advantage_min": -0.9578208141028881, "advantage_std": 0.8730638399720192, "completion_length": 2977.041732788086, "epoch": 0.07314285714285715, "grad_norm": 0.16038207709789276, "kl": 0.0029485225677490234, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.97852329991824e-07, "loss": 0.1249, "reward": 0.17387355864048004, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17387355864048004, "reward_after_std": 0.8730638436973095, "reward_before_mean": 0.26105861994437873, "reward_before_std": 0.9116242602467537, "reward_change_max": 0.00032076984643936157, "reward_change_mean": -0.08718509506434202, "reward_change_min": -0.205215728841722, "reward_change_std": 0.08837446081452072, "reward_std": 0.873063862323761, "rewards/cosine_scaled_reward": -0.056970683857798576, "rewards/format_reward": 0.3750000111758709, "step": 64 }, { "advantage_max": 1.068474367260933, "advantage_mean": -7.450581263057643e-09, "advantage_min": -0.6501489132642746, "advantage_std": 0.6808883212506771, "completion_length": 2789.729221343994, "epoch": 0.07428571428571429, "grad_norm": 0.14595411717891693, "kl": 0.005219936370849609, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.975348529157229e-07, "loss": -0.0022, "reward": 0.16831020638346672, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.16831020638346672, "reward_after_std": 0.6808883063495159, "reward_before_mean": 0.25501670874655247, "reward_before_std": 0.6813295837491751, "reward_change_max": 4.532933235168457e-05, "reward_change_mean": -0.08670650841668248, "reward_change_min": -0.17894641030579805, "reward_change_std": 0.06918191979639232, "reward_std": 0.6808883361518383, "rewards/cosine_scaled_reward": -0.10165832610800862, "rewards/format_reward": 0.45833333767950535, "step": 65 }, { "advantage_max": 0.7757134735584259, "advantage_mean": -1.6763806787167823e-08, "advantage_min": -0.6671213656663895, "advantage_std": 0.5757374875247478, "completion_length": 2300.583351135254, "epoch": 0.07542857142857143, "grad_norm": 0.06580257415771484, "kl": 0.0022726058959960938, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.971955636222684e-07, "loss": -0.0312, "reward": 0.38516049087047577, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.38516049087047577, "reward_after_std": 0.5757375098764896, "reward_before_mean": 0.49741994962096214, "reward_before_std": 0.5758095439523458, "reward_change_max": 0.00032460689544677734, "reward_change_mean": -0.11225945455953479, "reward_change_min": -0.19416429102420807, "reward_change_std": 0.0764637878164649, "reward_std": 0.5757375229150057, "rewards/cosine_scaled_reward": -0.0012900326400995255, "rewards/format_reward": 0.5, "step": 66 }, { "advantage_max": 0.7150251120328903, "advantage_mean": 1.4280279625467074e-08, "advantage_min": -0.47850729525089264, "advantage_std": 0.4646703340113163, "completion_length": 3501.041717529297, "epoch": 0.07657142857142857, "grad_norm": 0.06360700726509094, "kl": 0.0019628703594207764, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.968344786479415e-07, "loss": 0.0201, "reward": -0.5012560524046421, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5012560524046421, "reward_after_std": 0.46467033214867115, "reward_before_mean": -0.46999809332191944, "reward_before_std": 0.47615547478199005, "reward_change_max": 0.0004980117082595825, "reward_change_mean": -0.03125794976949692, "reward_change_min": -0.08866909518837929, "reward_change_std": 0.03602162795141339, "reward_std": 0.46467035450041294, "rewards/cosine_scaled_reward": -0.29749905318021774, "rewards/format_reward": 0.1250000037252903, "step": 67 }, { "advantage_max": 1.1670538075268269, "advantage_mean": -1.6763806898190126e-08, "advantage_min": -0.8921488225460052, "advantage_std": 0.8203412033617496, "completion_length": 2412.750045776367, "epoch": 0.07771428571428571, "grad_norm": 0.13555291295051575, "kl": 0.007524013519287109, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.964516155915151e-07, "loss": 0.0309, "reward": 0.35750674456357956, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.35750674456357956, "reward_after_std": 0.8203412406146526, "reward_before_mean": 0.4597671739757061, "reward_before_std": 0.8346320576965809, "reward_change_max": 8.17328691482544e-05, "reward_change_mean": -0.10226041614077985, "reward_change_min": -0.20731043443083763, "reward_change_std": 0.08281638938933611, "reward_std": 0.8203412480652332, "rewards/cosine_scaled_reward": -0.051366430008783937, "rewards/format_reward": 0.5625000037252903, "step": 68 }, { "advantage_max": 1.0627647154033184, "advantage_mean": 1.2417633588057697e-09, "advantage_min": -0.5945528820157051, "advantage_std": 0.658388938754797, "completion_length": 2798.000045776367, "epoch": 0.07885714285714286, "grad_norm": 0.14615222811698914, "kl": 0.0053558349609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.960469931131936e-07, "loss": 0.0488, "reward": -0.11247947625815868, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.11247947625815868, "reward_after_std": 0.6583889536559582, "reward_before_mean": -0.052137549966573715, "reward_before_std": 0.6629227660596371, "reward_change_max": 0.00019734352827072144, "reward_change_mean": -0.06034192198421806, "reward_change_min": -0.1471708407625556, "reward_change_std": 0.05584263487253338, "reward_std": 0.6583889685571194, "rewards/cosine_scaled_reward": -0.20315211405977607, "rewards/format_reward": 0.35416666977107525, "step": 69 }, { "advantage_max": 1.1941916979849339, "advantage_mean": -9.93410786964688e-09, "advantage_min": -0.6053403690457344, "advantage_std": 0.710853785276413, "completion_length": 3077.541702270508, "epoch": 0.08, "grad_norm": 0.10297655314207077, "kl": 0.009349465370178223, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.956206309337066e-07, "loss": 0.016, "reward": -0.00988800823688507, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.00988800823688507, "reward_after_std": 0.7108537815511227, "reward_before_mean": 0.05815399810671806, "reward_before_std": 0.7120511084794998, "reward_change_max": 0.0005014985799789429, "reward_change_mean": -0.06804199749603868, "reward_change_min": -0.13841038011014462, "reward_change_std": 0.05216056061908603, "reward_std": 0.710853811353445, "rewards/cosine_scaled_reward": -0.12717300606891513, "rewards/format_reward": 0.31250000186264515, "step": 70 }, { "advantage_max": 0.8809920065104961, "advantage_mean": -1.6142924941231485e-08, "advantage_min": -0.8779931887984276, "advantage_std": 0.6846279054880142, "completion_length": 2682.1875228881836, "epoch": 0.08114285714285714, "grad_norm": 0.1809944361448288, "kl": 0.011813968420028687, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.951725498333448e-07, "loss": 0.0711, "reward": 0.2543748412281275, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2543748412281275, "reward_after_std": 0.6846279166638851, "reward_before_mean": 0.35289028100669384, "reward_before_std": 0.7065125461667776, "reward_change_max": 0.0002439543604850769, "reward_change_mean": -0.09851543279364705, "reward_change_min": -0.19406738318502903, "reward_change_std": 0.0774632137035951, "reward_std": 0.684627927839756, "rewards/cosine_scaled_reward": -0.04230487486347556, "rewards/format_reward": 0.43750000558793545, "step": 71 }, { "advantage_max": 0.9895018897950649, "advantage_mean": 7.450580818968433e-09, "advantage_min": -0.6588705629110336, "advantage_std": 0.6476662866771221, "completion_length": 3254.2291870117188, "epoch": 0.08228571428571428, "grad_norm": 0.13209903240203857, "kl": 0.002953052520751953, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.947027716509488e-07, "loss": 0.0271, "reward": -0.14486753195524216, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": -0.14486753195524216, "reward_after_std": 0.6476662829518318, "reward_before_mean": -0.08536670729517937, "reward_before_std": 0.6584763266146183, "reward_change_max": 0.00038442760705947876, "reward_change_mean": -0.05950081581249833, "reward_change_min": -0.1290422910824418, "reward_change_std": 0.051864347769878805, "reward_std": 0.6476662904024124, "rewards/cosine_scaled_reward": -0.17810002015903592, "rewards/format_reward": 0.2708333395421505, "step": 72 }, { "advantage_max": 1.1704095806926489, "advantage_mean": -1.0244548209747961e-08, "advantage_min": -0.8206432610750198, "advantage_std": 0.8143216799944639, "completion_length": 3489.2291870117188, "epoch": 0.08342857142857144, "grad_norm": 0.13383091986179352, "kl": 0.0011025667190551758, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.942113192828444e-07, "loss": 0.0327, "reward": -0.05213266983628273, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05213266983628273, "reward_after_std": 0.8143216762691736, "reward_before_mean": 0.01244833879172802, "reward_before_std": 0.8403085879981518, "reward_change_max": 0.00028943270444869995, "reward_change_mean": -0.06458103121258318, "reward_change_min": -0.18091739807277918, "reward_change_std": 0.07164288056083024, "reward_std": 0.8143216967582703, "rewards/cosine_scaled_reward": -0.09794248826801777, "rewards/format_reward": 0.2083333395421505, "step": 73 }, { "advantage_max": 0.8129484131932259, "advantage_mean": -4.346172199909404e-09, "advantage_min": -0.45292046666145325, "advantage_std": 0.48586198315024376, "completion_length": 3319.2083435058594, "epoch": 0.08457142857142858, "grad_norm": 0.07312816381454468, "kl": 0.002932727336883545, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.93698216681727e-07, "loss": 0.0273, "reward": -0.18033896386623383, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.18033896386623383, "reward_after_std": 0.4858619924634695, "reward_before_mean": -0.12051686272025108, "reward_before_std": 0.48246172443032265, "reward_change_max": 0.0002520233392715454, "reward_change_mean": -0.05982208307250403, "reward_change_min": -0.1109585091471672, "reward_change_std": 0.044366022309986874, "reward_std": 0.48586202412843704, "rewards/cosine_scaled_reward": -0.1540084406733513, "rewards/format_reward": 0.18750000186264515, "step": 74 }, { "advantage_max": 1.102239165455103, "advantage_mean": -9.313226134732844e-09, "advantage_min": -0.6440135687589645, "advantage_std": 0.6835919357836246, "completion_length": 3212.562515258789, "epoch": 0.08571428571428572, "grad_norm": 0.12819837033748627, "kl": 0.0043468475341796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.931634888554935e-07, "loss": 0.0585, "reward": 0.17928399704396725, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.17928399704396725, "reward_after_std": 0.6835919404402375, "reward_before_mean": 0.2666355334222317, "reward_before_std": 0.6772829368710518, "reward_change_max": 0.00011337548494338989, "reward_change_mean": -0.08735153428278863, "reward_change_min": -0.17496745940297842, "reward_change_std": 0.07009288971312344, "reward_std": 0.6835919748991728, "rewards/cosine_scaled_reward": 0.008317755535244942, "rewards/format_reward": 0.25000000186264515, "step": 75 }, { "advantage_max": 1.0620285347104073, "advantage_mean": 5.587935503204022e-09, "advantage_min": -0.6922560930252075, "advantage_std": 0.6782711632549763, "completion_length": 2798.0625762939453, "epoch": 0.08685714285714285, "grad_norm": 0.12901480495929718, "kl": 0.0019115209579467773, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.926071618660237e-07, "loss": 0.0855, "reward": -0.03348325379192829, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.03348325379192829, "reward_after_std": 0.6782711483538151, "reward_before_mean": 0.034606458619236946, "reward_before_std": 0.6863775365054607, "reward_change_max": 0.00021073222160339355, "reward_change_mean": -0.06808969052508473, "reward_change_min": -0.1497401585802436, "reward_change_std": 0.0586240931879729, "reward_std": 0.6782711558043957, "rewards/cosine_scaled_reward": -0.19103011582046747, "rewards/format_reward": 0.41666667722165585, "step": 76 }, { "advantage_max": 0.7430855147540569, "advantage_mean": 6.829699139565548e-09, "advantage_min": -0.6691855564713478, "advantage_std": 0.5303375329822302, "completion_length": 3146.0833587646484, "epoch": 0.088, "grad_norm": 0.09785456210374832, "kl": 0.001885056495666504, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.9202926282791e-07, "loss": 0.0195, "reward": -0.005782470107078552, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.005782470107078552, "reward_after_std": 0.5303375329822302, "reward_before_mean": 0.07121062092483044, "reward_before_std": 0.5396563746035099, "reward_change_max": 0.0004342496395111084, "reward_change_mean": -0.07699309976305813, "reward_change_min": -0.1357054617255926, "reward_change_std": 0.05527487176004797, "reward_std": 0.5303375385701656, "rewards/cosine_scaled_reward": -0.12064469419419765, "rewards/format_reward": 0.3125000111758709, "step": 77 }, { "advantage_max": 1.2232607677578926, "advantage_mean": 1.5522040319737584e-09, "advantage_min": -1.0113331899046898, "advantage_std": 0.8812691904604435, "completion_length": 3311.5208740234375, "epoch": 0.08914285714285715, "grad_norm": 0.13331255316734314, "kl": 0.002320528030395508, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.91429819907136e-07, "loss": 0.048, "reward": 0.2586913288978394, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2586913288978394, "reward_after_std": 0.8812691792845726, "reward_before_mean": 0.35136597882956266, "reward_before_std": 0.9065258577466011, "reward_change_max": 0.00028318166732788086, "reward_change_mean": -0.09267466515302658, "reward_change_min": -0.20396779384464025, "reward_change_std": 0.08194569800980389, "reward_std": 0.8812692165374756, "rewards/cosine_scaled_reward": -0.0014003375545144081, "rewards/format_reward": 0.35416667722165585, "step": 78 }, { "advantage_max": 0.9030539467930794, "advantage_mean": 9.313226190243995e-09, "advantage_min": -0.963399812579155, "advantage_std": 0.7022213935852051, "completion_length": 2346.0833587646484, "epoch": 0.09028571428571429, "grad_norm": 0.10168980807065964, "kl": 0.0042231082916259766, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.908088623197048e-07, "loss": 0.038, "reward": 0.38052323646843433, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.38052323646843433, "reward_after_std": 0.7022214084863663, "reward_before_mean": 0.48965731263160706, "reward_before_std": 0.7216234467923641, "reward_change_max": 0.000313684344291687, "reward_change_mean": -0.10913406126201153, "reward_change_min": -0.2034011334180832, "reward_change_std": 0.08371774014085531, "reward_std": 0.7022214457392693, "rewards/cosine_scaled_reward": -0.03642135614063591, "rewards/format_reward": 0.5625000074505806, "step": 79 }, { "advantage_max": 0.9310047589242458, "advantage_mean": 1.1175870895385742e-08, "advantage_min": -0.7848172262310982, "advantage_std": 0.7024698052555323, "completion_length": 3314.250045776367, "epoch": 0.09142857142857143, "grad_norm": 0.1139289066195488, "kl": 0.003377556800842285, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.901664203302124e-07, "loss": 0.0271, "reward": 0.0665772594511509, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0665772594511509, "reward_after_std": 0.7024698089808226, "reward_before_mean": 0.14694275334477425, "reward_before_std": 0.7254483439028263, "reward_change_max": 0.0002681538462638855, "reward_change_mean": -0.08036548434756696, "reward_change_min": -0.18215056881308556, "reward_change_std": 0.07275031437166035, "reward_std": 0.7024698238819838, "rewards/cosine_scaled_reward": -0.09319528564810753, "rewards/format_reward": 0.3333333395421505, "step": 80 }, { "advantage_max": 1.0726732574403286, "advantage_mean": 1.0554989549049765e-08, "advantage_min": -0.6589022949337959, "advantage_std": 0.7009899169206619, "completion_length": 3062.2708892822266, "epoch": 0.09257142857142857, "grad_norm": 0.15159904956817627, "kl": 0.009969711303710938, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.895025252503755e-07, "loss": 0.0548, "reward": -0.11807727441191673, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11807727441191673, "reward_after_std": 0.7009899355471134, "reward_before_mean": -0.05808864161372185, "reward_before_std": 0.7156584821641445, "reward_change_max": 8.110702037811279e-05, "reward_change_mean": -0.059988636523485184, "reward_change_min": -0.14878776855766773, "reward_change_std": 0.05785088497214019, "reward_std": 0.7009899355471134, "rewards/cosine_scaled_reward": -0.17487765941768885, "rewards/format_reward": 0.2916666716337204, "step": 81 }, { "advantage_max": 1.000360194593668, "advantage_mean": -1.6763806509612067e-08, "advantage_min": -0.7447901144623756, "advantage_std": 0.676895584911108, "completion_length": 2876.7083740234375, "epoch": 0.09371428571428571, "grad_norm": 0.1397034078836441, "kl": 0.0031021833419799805, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.888172094375033e-07, "loss": 0.0471, "reward": 0.17887724190950394, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.17887724190950394, "reward_after_std": 0.6768955774605274, "reward_before_mean": 0.26815129816532135, "reward_before_std": 0.6837855763733387, "reward_change_max": 7.466226816177368e-05, "reward_change_mean": -0.08927406487055123, "reward_change_min": -0.18324447609484196, "reward_change_std": 0.07135608559474349, "reward_std": 0.6768956035375595, "rewards/cosine_scaled_reward": -0.07425769325345755, "rewards/format_reward": 0.41666667722165585, "step": 82 }, { "advantage_max": 0.5430999919772148, "advantage_mean": 1.0554989271494009e-08, "advantage_min": -0.5749870277941227, "advantage_std": 0.4309024289250374, "completion_length": 2885.1458435058594, "epoch": 0.09485714285714286, "grad_norm": 0.07022266834974289, "kl": 0.004255771636962891, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.881105062929221e-07, "loss": 0.0056, "reward": -0.10269813984632492, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10269813984632492, "reward_after_std": 0.4309024289250374, "reward_before_mean": -0.03149991109967232, "reward_before_std": 0.44176167342811823, "reward_change_max": 0.0, "reward_change_mean": -0.07119821500964463, "reward_change_min": -0.12623351998627186, "reward_change_std": 0.052647512522526085, "reward_std": 0.4309024512767792, "rewards/cosine_scaled_reward": -0.1615832932293415, "rewards/format_reward": 0.2916666679084301, "step": 83 }, { "advantage_max": 1.1471601538360119, "advantage_mean": -3.352761324126874e-08, "advantage_min": -0.9769178181886673, "advantage_std": 0.8521611541509628, "completion_length": 3055.7083740234375, "epoch": 0.096, "grad_norm": 0.13345955312252045, "kl": 0.0014927387237548828, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.873824502603459e-07, "loss": 0.0476, "reward": 0.539270993322134, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.539270993322134, "reward_after_std": 0.8521611448377371, "reward_before_mean": 0.6591531746089458, "reward_before_std": 0.87022246979177, "reward_change_max": 0.0005726292729377747, "reward_change_mean": -0.11988217453472316, "reward_change_min": -0.22343150340020657, "reward_change_std": 0.09291968471370637, "reward_std": 0.8521611541509628, "rewards/cosine_scaled_reward": 0.11082656681537628, "rewards/format_reward": 0.43750000558793545, "step": 84 }, { "advantage_max": 1.1944213286042213, "advantage_mean": -2.4214387883692012e-08, "advantage_min": -0.78194659948349, "advantage_std": 0.7695669606328011, "completion_length": 3144.687545776367, "epoch": 0.09714285714285714, "grad_norm": 0.11960656940937042, "kl": 0.0030364990234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.866330768241983e-07, "loss": 0.0324, "reward": 0.35563176590949297, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.35563176590949297, "reward_after_std": 0.7695669494569302, "reward_before_mean": 0.4578101532533765, "reward_before_std": 0.7720634564757347, "reward_change_max": 0.0002278536558151245, "reward_change_mean": -0.10217837616801262, "reward_change_min": -0.17799327243119478, "reward_change_std": 0.07303832424804568, "reward_std": 0.7695669531822205, "rewards/cosine_scaled_reward": -0.00026161037385463715, "rewards/format_reward": 0.4583333432674408, "step": 85 }, { "advantage_max": 0.8763966374099255, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -0.4972258657217026, "advantage_std": 0.5113778002560139, "completion_length": 3030.8125762939453, "epoch": 0.09828571428571428, "grad_norm": 0.10097847878932953, "kl": 0.0046710968017578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.85862422507884e-07, "loss": 0.032, "reward": 0.045074569061398506, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.045074569061398506, "reward_after_std": 0.5113777853548527, "reward_before_mean": 0.12412225641310215, "reward_before_std": 0.49582573771476746, "reward_change_max": 0.0003281235694885254, "reward_change_mean": -0.07904768036678433, "reward_change_min": -0.13752009812742472, "reward_change_std": 0.053194016218185425, "reward_std": 0.511377789080143, "rewards/cosine_scaled_reward": -0.12543888704385608, "rewards/format_reward": 0.37500000186264515, "step": 86 }, { "advantage_max": 1.0924755409359932, "advantage_mean": -5.277494885547185e-09, "advantage_min": -1.026725873351097, "advantage_std": 0.8062815628945827, "completion_length": 2812.083366394043, "epoch": 0.09942857142857142, "grad_norm": 0.14117565751075745, "kl": 0.007817387580871582, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.850705248720068e-07, "loss": 0.0421, "reward": 0.33314487524330616, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.33314487524330616, "reward_after_std": 0.8062815628945827, "reward_before_mean": 0.4346958175301552, "reward_before_std": 0.8250590972602367, "reward_change_max": 0.0003003552556037903, "reward_change_mean": -0.10155095206573606, "reward_change_min": -0.20147591084241867, "reward_change_std": 0.08329685684293509, "reward_std": 0.8062815852463245, "rewards/cosine_scaled_reward": -0.06390209496021271, "rewards/format_reward": 0.5625000093132257, "step": 87 }, { "advantage_max": 1.5729316174983978, "advantage_mean": 1.9868215073159945e-08, "advantage_min": -1.1915039494633675, "advantage_std": 1.0524433217942715, "completion_length": 3050.3125610351562, "epoch": 0.10057142857142858, "grad_norm": 0.224959596991539, "kl": 0.009051322937011719, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.8425742251254e-07, "loss": 0.0922, "reward": 0.32696538232266903, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.32696538232266903, "reward_after_std": 1.0524433366954327, "reward_before_mean": 0.41934662498533726, "reward_before_std": 1.0758509896695614, "reward_change_max": 0.00041228532791137695, "reward_change_mean": -0.09238121099770069, "reward_change_min": -0.19882145337760448, "reward_change_std": 0.08465232839807868, "reward_std": 1.0524433702230453, "rewards/cosine_scaled_reward": 0.011756634339690208, "rewards/format_reward": 0.39583334513008595, "step": 88 }, { "advantage_max": 0.6932465061545372, "advantage_mean": 4.346172310931706e-09, "advantage_min": -0.7988357171416283, "advantage_std": 0.5768093653023243, "completion_length": 3378.166717529297, "epoch": 0.10171428571428572, "grad_norm": 0.14413271844387054, "kl": 0.004611492156982422, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.83423155058946e-07, "loss": 0.0403, "reward": -0.06333770230412483, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.06333770230412483, "reward_after_std": 0.57680937461555, "reward_before_mean": 0.009203372988849878, "reward_before_std": 0.6004473958164454, "reward_change_max": 0.00012886524200439453, "reward_change_mean": -0.07254105247557163, "reward_change_min": -0.14528132881969213, "reward_change_std": 0.06309301522560418, "reward_std": 0.5768093969672918, "rewards/cosine_scaled_reward": -0.1203983323648572, "rewards/format_reward": 0.2500000074505806, "step": 89 }, { "advantage_max": 0.7317495383322239, "advantage_mean": -2.483526828633842e-09, "advantage_min": -0.6038379520177841, "advantage_std": 0.502998935058713, "completion_length": 2579.770851135254, "epoch": 0.10285714285714286, "grad_norm": 0.09057570993900299, "kl": 0.012523651123046875, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.825677631722435e-07, "loss": 0.0084, "reward": 0.053218359127640724, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.053218359127640724, "reward_after_std": 0.5029989331960678, "reward_before_mean": 0.13523414358496666, "reward_before_std": 0.5063311252743006, "reward_change_max": 0.0001977384090423584, "reward_change_mean": -0.08201578538864851, "reward_change_min": -0.13647860940545797, "reward_change_std": 0.053851797711104155, "reward_std": 0.5029989369213581, "rewards/cosine_scaled_reward": -0.17196626088116318, "rewards/format_reward": 0.47916666977107525, "step": 90 }, { "advantage_max": 0.8002164922654629, "advantage_mean": 2.1109978487476866e-08, "advantage_min": -0.663936335593462, "advantage_std": 0.598348455503583, "completion_length": 3102.229202270508, "epoch": 0.104, "grad_norm": 0.1666155755519867, "kl": 0.007025480270385742, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.816912885430258e-07, "loss": 0.0472, "reward": -0.064543966203928, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.064543966203928, "reward_after_std": 0.5983484406024218, "reward_before_mean": 0.005597323179244995, "reward_before_std": 0.616460170596838, "reward_change_max": 0.0004862844944000244, "reward_change_mean": -0.07014129433082417, "reward_change_min": -0.15747894253581762, "reward_change_std": 0.060970506398007274, "reward_std": 0.5983484499156475, "rewards/cosine_scaled_reward": -0.1847013352671638, "rewards/format_reward": 0.3750000037252903, "step": 91 }, { "advantage_max": 0.9697154983878136, "advantage_mean": 6.829698917520943e-09, "advantage_min": -0.8269370794296265, "advantage_std": 0.7136020623147488, "completion_length": 2876.666717529297, "epoch": 0.10514285714285715, "grad_norm": 0.11581981927156448, "kl": 0.008868694305419922, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.807937738894303e-07, "loss": 0.0375, "reward": 0.10207435674965382, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.10207435674965382, "reward_after_std": 0.7136020660400391, "reward_before_mean": 0.18458046624436975, "reward_before_std": 0.7316001541912556, "reward_change_max": 0.00030371546745300293, "reward_change_mean": -0.0825060837669298, "reward_change_min": -0.16981272120028734, "reward_change_std": 0.072384690400213, "reward_std": 0.7136020846664906, "rewards/cosine_scaled_reward": -0.1577097848057747, "rewards/format_reward": 0.5000000111758709, "step": 92 }, { "advantage_max": 0.5377812571823597, "advantage_mean": 9.623667057701013e-09, "advantage_min": -0.44808217138051987, "advantage_std": 0.37609364092350006, "completion_length": 3464.3958435058594, "epoch": 0.10628571428571429, "grad_norm": 0.09607281535863876, "kl": 0.0065135955810546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.798752629550546e-07, "loss": 0.0231, "reward": -0.473403861746192, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.473403861746192, "reward_after_std": 0.37609364092350006, "reward_before_mean": -0.43631402403116226, "reward_before_std": 0.38595324009656906, "reward_change_max": 0.00021993368864059448, "reward_change_mean": -0.03708983049727976, "reward_change_min": -0.08134532067924738, "reward_change_std": 0.0337000098079443, "reward_std": 0.37609364464879036, "rewards/cosine_scaled_reward": -0.23899034783244133, "rewards/format_reward": 0.0416666679084301, "step": 93 }, { "advantage_max": 0.7897338904440403, "advantage_mean": 1.614292521878724e-08, "advantage_min": -0.6556259952485561, "advantage_std": 0.53734415397048, "completion_length": 3259.875030517578, "epoch": 0.10742857142857143, "grad_norm": 0.08970505744218826, "kl": 0.008943557739257812, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.78935800506826e-07, "loss": 0.013, "reward": -0.11383907869458199, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.11383907869458199, "reward_after_std": 0.5373441409319639, "reward_before_mean": -0.04790402948856354, "reward_before_std": 0.5454462002962828, "reward_change_max": 0.0002677515149116516, "reward_change_mean": -0.06593502941541374, "reward_change_min": -0.12173356115818024, "reward_change_std": 0.04828887898474932, "reward_std": 0.5373441465198994, "rewards/cosine_scaled_reward": -0.10728535335510969, "rewards/format_reward": 0.1666666679084301, "step": 94 }, { "advantage_max": 0.7336770445108414, "advantage_mean": 1.7384688466570708e-08, "advantage_min": -0.5926066339015961, "advantage_std": 0.5384083883836865, "completion_length": 3437.1250610351562, "epoch": 0.10857142857142857, "grad_norm": 0.09171445667743683, "kl": 0.0028939247131347656, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.779754323328192e-07, "loss": 0.0225, "reward": -0.22875665500760078, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.22875665500760078, "reward_after_std": 0.538408393971622, "reward_before_mean": -0.17224636115133762, "reward_before_std": 0.5542042218148708, "reward_change_max": 0.00020701438188552856, "reward_change_mean": -0.056510292924940586, "reward_change_min": -0.12093199416995049, "reward_change_std": 0.0512371362419799, "reward_std": 0.5384083949029446, "rewards/cosine_scaled_reward": -0.20070651546120644, "rewards/format_reward": 0.2291666753590107, "step": 95 }, { "advantage_max": 1.1119077168405056, "advantage_mean": -3.1044084525255755e-09, "advantage_min": -0.8286112174391747, "advantage_std": 0.8079782947897911, "completion_length": 3218.187545776367, "epoch": 0.10971428571428571, "grad_norm": 0.1471080780029297, "kl": 0.008174657821655273, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.769942052400235e-07, "loss": 0.0564, "reward": 0.08850634610280395, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08850634610280395, "reward_after_std": 0.8079782761633396, "reward_before_mean": 0.16710020136088133, "reward_before_std": 0.8330794721841812, "reward_change_max": 0.00022210925817489624, "reward_change_mean": -0.07859386596828699, "reward_change_min": -0.19585560448467731, "reward_change_std": 0.07616083696484566, "reward_std": 0.8079783134162426, "rewards/cosine_scaled_reward": -0.0726998969912529, "rewards/format_reward": 0.31250000558793545, "step": 96 }, { "advantage_max": 0.774888951331377, "advantage_mean": 8.07146216530441e-09, "advantage_min": -0.7759076952934265, "advantage_std": 0.6380616500973701, "completion_length": 3350.541717529297, "epoch": 0.11085714285714286, "grad_norm": 0.13446907699108124, "kl": 0.0053157806396484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.759921670520634e-07, "loss": 0.0727, "reward": -0.1549377404153347, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.1549377404153347, "reward_after_std": 0.638061661273241, "reward_before_mean": -0.09221046045422554, "reward_before_std": 0.6677076295018196, "reward_change_max": 0.00012560933828353882, "reward_change_mean": -0.06272726762108505, "reward_change_min": -0.14727217331528664, "reward_change_std": 0.06415313272736967, "reward_std": 0.6380616910755634, "rewards/cosine_scaled_reward": -0.13985523395240307, "rewards/format_reward": 0.1875, "step": 97 }, { "advantage_max": 0.544066796079278, "advantage_mean": 9.934107592091124e-09, "advantage_min": -0.6370603069663048, "advantage_std": 0.4388393219560385, "completion_length": 2994.9583892822266, "epoch": 0.112, "grad_norm": 0.08746904134750366, "kl": 0.004663944244384766, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.749693666068663e-07, "loss": 0.0244, "reward": -0.046972109004855156, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.046972109004855156, "reward_after_std": 0.4388393219560385, "reward_before_mean": 0.029481630073860288, "reward_before_std": 0.45071091689169407, "reward_change_max": 8.702278137207031e-06, "reward_change_mean": -0.07645371626131237, "reward_change_min": -0.13023042678833008, "reward_change_std": 0.05339970113709569, "reward_std": 0.4388393349945545, "rewards/cosine_scaled_reward": -0.183175852522254, "rewards/format_reward": 0.3958333507180214, "step": 98 }, { "advantage_max": 0.8721281476318836, "advantage_mean": 3.725290520506519e-09, "advantage_min": -0.6215038821101189, "advantage_std": 0.5652411505579948, "completion_length": 2965.500015258789, "epoch": 0.11314285714285714, "grad_norm": 0.10346469283103943, "kl": 0.007491111755371094, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.739258537542835e-07, "loss": 0.0261, "reward": 0.052828481420874596, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.052828481420874596, "reward_after_std": 0.5652411617338657, "reward_before_mean": 0.1324861806933768, "reward_before_std": 0.5641120858490467, "reward_change_max": 5.237758159637451e-05, "reward_change_mean": -0.07965772063471377, "reward_change_min": -0.1496228538453579, "reward_change_std": 0.06120690796524286, "reward_std": 0.5652411766350269, "rewards/cosine_scaled_reward": -0.06917357502970845, "rewards/format_reward": 0.2708333395421505, "step": 99 }, { "advantage_max": 1.177578266710043, "advantage_mean": -6.20881640545079e-09, "advantage_min": -1.0268895998597145, "advantage_std": 0.9141651093959808, "completion_length": 2879.3958740234375, "epoch": 0.11428571428571428, "grad_norm": 0.16082407534122467, "kl": 0.011318206787109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.728616793536587e-07, "loss": 0.0754, "reward": 0.26343181263655424, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.26343181263655424, "reward_after_std": 0.914165124297142, "reward_before_mean": 0.3574203960597515, "reward_before_std": 0.9503813628107309, "reward_change_max": 0.0003396347165107727, "reward_change_mean": -0.09398859040811658, "reward_change_min": -0.20327434316277504, "reward_change_std": 0.08825246221385896, "reward_std": 0.9141651410609484, "rewards/cosine_scaled_reward": -0.02962313499301672, "rewards/format_reward": 0.41666667722165585, "step": 100 }, { "advantage_max": 0.7537824623286724, "advantage_mean": 3.290673178391046e-08, "advantage_min": -0.5371817983686924, "advantage_std": 0.48059073835611343, "completion_length": 2831.1041870117188, "epoch": 0.11542857142857142, "grad_norm": 0.10156463831663132, "kl": 0.005992412567138672, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.717768952713511e-07, "loss": 0.0389, "reward": 0.06839887425303459, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.06839887425303459, "reward_after_std": 0.4805907569825649, "reward_before_mean": 0.15205013938248158, "reward_before_std": 0.47612071223556995, "reward_change_max": 0.0001445487141609192, "reward_change_mean": -0.08365123742260039, "reward_change_min": -0.146190470084548, "reward_change_std": 0.05711901048198342, "reward_std": 0.4805907681584358, "rewards/cosine_scaled_reward": -0.09064160846173763, "rewards/format_reward": 0.33333334140479565, "step": 101 }, { "advantage_max": 1.2887723445892334, "advantage_mean": 1.862645426786713e-09, "advantage_min": -0.8732288219034672, "advantage_std": 0.7983098104596138, "completion_length": 3045.4584350585938, "epoch": 0.11657142857142858, "grad_norm": 0.17374233901500702, "kl": 0.013805389404296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.706715543782064e-07, "loss": 0.0493, "reward": 0.11814034357666969, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11814034357666969, "reward_after_std": 0.7983098365366459, "reward_before_mean": 0.1961025595664978, "reward_before_std": 0.8020947612822056, "reward_change_max": 0.0001891404390335083, "reward_change_mean": -0.07796221878379583, "reward_change_min": -0.15777896530926228, "reward_change_std": 0.06380243599414825, "reward_std": 0.7983098588883877, "rewards/cosine_scaled_reward": -0.1311153913848102, "rewards/format_reward": 0.45833334140479565, "step": 102 }, { "advantage_max": 1.012158952653408, "advantage_mean": 4.346171922353648e-09, "advantage_min": -0.9280484542250633, "advantage_std": 0.806487213820219, "completion_length": 3334.104248046875, "epoch": 0.11771428571428572, "grad_norm": 0.16222251951694489, "kl": 0.010880470275878906, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.695457105469804e-07, "loss": 0.0803, "reward": -0.02617565030232072, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.02617565030232072, "reward_after_std": 0.8064871914684772, "reward_before_mean": 0.043522678315639496, "reward_before_std": 0.8436622954905033, "reward_change_max": 0.0005996227264404297, "reward_change_mean": -0.06969833420589566, "reward_change_min": -0.17376555316150188, "reward_change_std": 0.07702636765316129, "reward_std": 0.8064872398972511, "rewards/cosine_scaled_reward": -0.1240720006171614, "rewards/format_reward": 0.29166666977107525, "step": 103 }, { "advantage_max": 0.4506031647324562, "advantage_mean": 1.98682153507157e-08, "advantage_min": -0.49186787754297256, "advantage_std": 0.3530017454177141, "completion_length": 2766.1041870117188, "epoch": 0.11885714285714286, "grad_norm": 0.06478388607501984, "kl": 0.013357162475585938, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.683994186497132e-07, "loss": 0.0074, "reward": 0.146314088255167, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.146314088255167, "reward_after_std": 0.3530017528682947, "reward_before_mean": 0.2424488589167595, "reward_before_std": 0.35200561955571175, "reward_change_max": 0.00045055896043777466, "reward_change_mean": -0.09613475622609258, "reward_change_min": -0.14989042840898037, "reward_change_std": 0.059963473584502935, "reward_std": 0.35300176963210106, "rewards/cosine_scaled_reward": -0.06627557054162025, "rewards/format_reward": 0.375, "step": 104 }, { "advantage_max": 1.1343233175575733, "advantage_mean": -1.92473328941567e-08, "advantage_min": -0.9812361150979996, "advantage_std": 0.786447387188673, "completion_length": 2954.8333740234375, "epoch": 0.12, "grad_norm": 0.1910223364830017, "kl": 0.008061408996582031, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.672327345550543e-07, "loss": 0.0804, "reward": 0.12759940326213837, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.12759940326213837, "reward_after_std": 0.7864474020898342, "reward_before_mean": 0.20970958936959505, "reward_before_std": 0.804718591272831, "reward_change_max": 0.00041823089122772217, "reward_change_mean": -0.08211018680594862, "reward_change_min": -0.166708511300385, "reward_change_std": 0.07057785498909652, "reward_std": 0.7864474169909954, "rewards/cosine_scaled_reward": -0.0826452155597508, "rewards/format_reward": 0.3750000149011612, "step": 105 }, { "advantage_max": 1.1130489706993103, "advantage_mean": -3.228584932735146e-08, "advantage_min": -1.018290713429451, "advantage_std": 0.8191616833209991, "completion_length": 2387.500057220459, "epoch": 0.12114285714285715, "grad_norm": 0.11806398630142212, "kl": 0.013179779052734375, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.66045715125541e-07, "loss": 0.0162, "reward": 0.8837066609412432, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8837066609412432, "reward_after_std": 0.8191616646945477, "reward_before_mean": 1.036157036665827, "reward_before_std": 0.8276788517832756, "reward_change_max": 0.0, "reward_change_mean": -0.15245038806460798, "reward_change_min": -0.2699962416663766, "reward_change_std": 0.10578735335730016, "reward_std": 0.8191616907715797, "rewards/cosine_scaled_reward": 0.18474517948925495, "rewards/format_reward": 0.666666679084301, "step": 106 }, { "advantage_max": 0.8276206701993942, "advantage_mean": -1.9247333005179e-08, "advantage_min": -0.7223168984055519, "advantage_std": 0.5651622377336025, "completion_length": 2848.854232788086, "epoch": 0.12228571428571429, "grad_norm": 0.08836143463850021, "kl": 0.00736236572265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.648384182148252e-07, "loss": 0.0252, "reward": 0.30740308575332165, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.30740308575332165, "reward_after_std": 0.5651622079312801, "reward_before_mean": 0.41169095039367676, "reward_before_std": 0.5657424293458462, "reward_change_max": 7.306039333343506e-05, "reward_change_mean": -0.1042878954904154, "reward_change_min": -0.16713330894708633, "reward_change_std": 0.06733074027579278, "reward_std": 0.5651622265577316, "rewards/cosine_scaled_reward": -0.044154517352581024, "rewards/format_reward": 0.5000000111758709, "step": 107 }, { "advantage_max": 1.0464369431138039, "advantage_mean": 4.967053879312289e-09, "advantage_min": -0.920787513256073, "advantage_std": 0.8237064126878977, "completion_length": 3022.750030517578, "epoch": 0.12342857142857143, "grad_norm": 5.710745811462402, "kl": 0.679987907409668, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.636109026648554e-07, "loss": 0.1282, "reward": -0.0018061436712741852, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.0018061436712741852, "reward_after_std": 0.8237064089626074, "reward_before_mean": 0.06948780082166195, "reward_before_std": 0.8604318238794804, "reward_change_max": 0.0007350221276283264, "reward_change_mean": -0.07129394076764584, "reward_change_min": -0.18075018282979727, "reward_change_std": 0.07853512931615114, "reward_std": 0.8237064350396395, "rewards/cosine_scaled_reward": -0.12150610354728997, "rewards/format_reward": 0.3125000074505806, "step": 108 }, { "advantage_max": 0.8055699914693832, "advantage_mean": -6.2088179597630244e-09, "advantage_min": -0.5043129064142704, "advantage_std": 0.495711550116539, "completion_length": 3027.312530517578, "epoch": 0.12457142857142857, "grad_norm": 0.0767723098397255, "kl": 0.006561279296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.623632283030077e-07, "loss": 0.0116, "reward": 0.04255384439602494, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04255384439602494, "reward_after_std": 0.495711550116539, "reward_before_mean": 0.12285317666828632, "reward_before_std": 0.49004461243748665, "reward_change_max": 0.00023395568132400513, "reward_change_mean": -0.08029932924546301, "reward_change_min": -0.13384640123695135, "reward_change_std": 0.053254172671586275, "reward_std": 0.4957115687429905, "rewards/cosine_scaled_reward": -0.10524008236825466, "rewards/format_reward": 0.33333334140479565, "step": 109 }, { "advantage_max": 0.9220863282680511, "advantage_mean": -8.692344399818808e-09, "advantage_min": -0.5900181010365486, "advantage_std": 0.6116988677531481, "completion_length": 3102.7916717529297, "epoch": 0.12571428571428572, "grad_norm": 0.10041435062885284, "kl": 0.008144378662109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.610954559391704e-07, "loss": 0.0172, "reward": -0.07845552056096494, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07845552056096494, "reward_after_std": 0.6116988696157932, "reward_before_mean": -0.011700557777658105, "reward_before_std": 0.6220829505473375, "reward_change_max": 0.0005408599972724915, "reward_change_mean": -0.06675496569368988, "reward_change_min": -0.15565920621156693, "reward_change_std": 0.05837497836910188, "reward_std": 0.6116988770663738, "rewards/cosine_scaled_reward": -0.17251694481819868, "rewards/format_reward": 0.33333333767950535, "step": 110 }, { "advantage_max": 0.8496062196791172, "advantage_mean": -1.2107193692045826e-08, "advantage_min": -0.7639882974326611, "advantage_std": 0.6249940134584904, "completion_length": 3443.8541870117188, "epoch": 0.12685714285714286, "grad_norm": 0.11901643127202988, "kl": 0.010395050048828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.598076473627796e-07, "loss": 0.0361, "reward": -0.019697923213243484, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.019697923213243484, "reward_after_std": 0.6249940060079098, "reward_before_mean": 0.05418802239000797, "reward_before_std": 0.64259023219347, "reward_change_max": 9.847432374954224e-05, "reward_change_mean": -0.07388598122633994, "reward_change_min": -0.14417694509029388, "reward_change_std": 0.060518967104144394, "reward_std": 0.6249940097332001, "rewards/cosine_scaled_reward": -0.05623931344598532, "rewards/format_reward": 0.1666666716337204, "step": 111 }, { "advantage_max": 1.38174469769001, "advantage_mean": 7.761021492136422e-09, "advantage_min": -0.975238636136055, "advantage_std": 0.8997639156877995, "completion_length": 3331.937530517578, "epoch": 0.128, "grad_norm": 0.16729916632175446, "kl": 0.0065631866455078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.58499865339809e-07, "loss": 0.0344, "reward": 0.40944708324968815, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.40944708324968815, "reward_after_std": 0.8997639268636703, "reward_before_mean": 0.5130116026848555, "reward_before_std": 0.9069201238453388, "reward_change_max": 0.0003801584243774414, "reward_change_mean": -0.10356450360268354, "reward_change_min": -0.21430773753672838, "reward_change_std": 0.08684224355965853, "reward_std": 0.899763960391283, "rewards/cosine_scaled_reward": 0.06900579854846, "rewards/format_reward": 0.37500000558793545, "step": 112 }, { "advantage_max": 1.1800456158816814, "advantage_mean": 1.8626449826975033e-09, "advantage_min": -0.7997603341937065, "advantage_std": 0.7590612880885601, "completion_length": 2949.9583740234375, "epoch": 0.12914285714285714, "grad_norm": 0.21318159997463226, "kl": 0.01378631591796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.571721736097088e-07, "loss": 0.0792, "reward": -0.0245030396617949, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0245030396617949, "reward_after_std": 0.7590613029897213, "reward_before_mean": 0.04212821088731289, "reward_before_std": 0.7715008705854416, "reward_change_max": 0.00035149604082107544, "reward_change_mean": -0.06663124216720462, "reward_change_min": -0.15565018076449633, "reward_change_std": 0.0617706241318956, "reward_std": 0.7590613178908825, "rewards/cosine_scaled_reward": -0.17685257643461227, "rewards/format_reward": 0.3958333395421505, "step": 113 }, { "advantage_max": 0.80317697301507, "advantage_mean": -2.4835269951672956e-09, "advantage_min": -0.6111781224608421, "advantage_std": 0.5317833982408047, "completion_length": 2686.229217529297, "epoch": 0.13028571428571428, "grad_norm": 0.08997328579425812, "kl": 0.0075283050537109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.55824636882301e-07, "loss": 0.0055, "reward": 0.14073886536061764, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.14073886536061764, "reward_after_std": 0.5317834094166756, "reward_before_mean": 0.2300937306135893, "reward_before_std": 0.5275977291166782, "reward_change_max": 0.0, "reward_change_mean": -0.08935487153939903, "reward_change_min": -0.16146521922200918, "reward_change_std": 0.059935636119917035, "reward_std": 0.5317834354937077, "rewards/cosine_scaled_reward": -0.17661980912089348, "rewards/format_reward": 0.5833333488553762, "step": 114 }, { "advantage_max": 1.2508711740374565, "advantage_mean": 1.4280279958533981e-08, "advantage_min": -0.6822191178798676, "advantage_std": 0.8110570348799229, "completion_length": 2879.437530517578, "epoch": 0.13142857142857142, "grad_norm": 0.1294396072626114, "kl": 0.0072422027587890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.54457320834625e-07, "loss": 0.0491, "reward": -0.02063230169005692, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.02063230169005692, "reward_after_std": 0.8110570386052132, "reward_before_mean": 0.04543160554021597, "reward_before_std": 0.8268074579536915, "reward_change_max": 6.017833948135376e-05, "reward_change_mean": -0.06606390746310353, "reward_change_min": -0.18289340753108263, "reward_change_std": 0.06786925066262484, "reward_std": 0.8110570646822453, "rewards/cosine_scaled_reward": -0.16478420107159764, "rewards/format_reward": 0.3750000074505806, "step": 115 }, { "advantage_max": 0.9620575942099094, "advantage_mean": 7.450580929990736e-09, "advantage_min": -0.7253379821777344, "advantage_std": 0.6947087794542313, "completion_length": 3391.4166870117188, "epoch": 0.13257142857142856, "grad_norm": 0.1599225401878357, "kl": 0.00789642333984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.530702921077358e-07, "loss": 0.0384, "reward": -0.1465127021074295, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1465127021074295, "reward_after_std": 0.6947087645530701, "reward_before_mean": -0.086684700101614, "reward_before_std": 0.7185013070702553, "reward_change_max": 0.0002536848187446594, "reward_change_mean": -0.05982800526544452, "reward_change_min": -0.14509317371994257, "reward_change_std": 0.0610161469085142, "reward_std": 0.694708775728941, "rewards/cosine_scaled_reward": -0.1266756821423769, "rewards/format_reward": 0.16666666977107525, "step": 116 }, { "advantage_max": 0.9573287703096867, "advantage_mean": -6.829699028543246e-09, "advantage_min": -0.5708078518509865, "advantage_std": 0.608824010938406, "completion_length": 3233.687530517578, "epoch": 0.1337142857142857, "grad_norm": 0.11774991452693939, "kl": 0.01103973388671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.516636183034564e-07, "loss": 0.0571, "reward": -0.3432406187057495, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3432406187057495, "reward_after_std": 0.6088240072131157, "reward_before_mean": -0.3020863775163889, "reward_before_std": 0.62282644957304, "reward_change_max": 0.00045599788427352905, "reward_change_mean": -0.04115423874463886, "reward_change_min": -0.10824007168412209, "reward_change_std": 0.04387360031250864, "reward_std": 0.6088240295648575, "rewards/cosine_scaled_reward": -0.23437653545988724, "rewards/format_reward": 0.16666666977107525, "step": 117 }, { "advantage_max": 1.032807346433401, "advantage_mean": -4.967053324200776e-09, "advantage_min": -0.780588660389185, "advantage_std": 0.7265257742255926, "completion_length": 2903.208366394043, "epoch": 0.13485714285714287, "grad_norm": 0.11558439582586288, "kl": 0.00542449951171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.502373679810839e-07, "loss": 0.0019, "reward": 0.364423219114542, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.364423219114542, "reward_after_std": 0.7265257630497217, "reward_before_mean": 0.4703831188380718, "reward_before_std": 0.7357173040509224, "reward_change_max": 0.0, "reward_change_mean": -0.10595988691784441, "reward_change_min": -0.21627100184559822, "reward_change_std": 0.0819319833535701, "reward_std": 0.7265257872641087, "rewards/cosine_scaled_reward": 0.006024882197380066, "rewards/format_reward": 0.4583333395421505, "step": 118 }, { "advantage_max": 0.9601045697927475, "advantage_mean": -2.2972624136308184e-08, "advantage_min": -0.8546690195798874, "advantage_std": 0.7206116262823343, "completion_length": 2613.458351135254, "epoch": 0.136, "grad_norm": 0.28752654790878296, "kl": 0.11657333374023438, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.487916106540465e-07, "loss": 0.0707, "reward": 0.3269413001835346, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3269413001835346, "reward_after_std": 0.7206116206943989, "reward_before_mean": 0.4298500046133995, "reward_before_std": 0.7387393917888403, "reward_change_max": 0.0004165545105934143, "reward_change_mean": -0.1029087018687278, "reward_change_min": -0.19583096075803041, "reward_change_std": 0.07978490833193064, "reward_std": 0.7206116504967213, "rewards/cosine_scaled_reward": -0.02465835213661194, "rewards/format_reward": 0.4791666716337204, "step": 119 }, { "advantage_max": 1.0327376946806908, "advantage_mean": -2.4835267176115394e-09, "advantage_min": -0.8866618499159813, "advantage_std": 0.7398336865007877, "completion_length": 2519.7708892822266, "epoch": 0.13714285714285715, "grad_norm": 0.16047891974449158, "kl": 0.01155853271484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.473264167865171e-07, "loss": 0.0541, "reward": 0.4021491319872439, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4021491319872439, "reward_after_std": 0.7398336827754974, "reward_before_mean": 0.5111243925057352, "reward_before_std": 0.7496637143194675, "reward_change_max": 0.00031457841396331787, "reward_change_mean": -0.10897527076303959, "reward_change_min": -0.21183301974087954, "reward_change_std": 0.08238738542422652, "reward_std": 0.7398337163031101, "rewards/cosine_scaled_reward": 0.005562208592891693, "rewards/format_reward": 0.5000000093132257, "step": 120 }, { "advantage_max": 1.0117009952664375, "advantage_mean": 1.5522044760629683e-09, "advantage_min": -0.8701439537107944, "advantage_std": 0.7287258952856064, "completion_length": 2291.4166870117188, "epoch": 0.1382857142857143, "grad_norm": 0.15173783898353577, "kl": 0.01496124267578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.458418577899774e-07, "loss": 0.0628, "reward": 0.2898542070761323, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2898542070761323, "reward_after_std": 0.7287258952856064, "reward_before_mean": 0.38868426950648427, "reward_before_std": 0.7429370507597923, "reward_change_max": 0.0002674087882041931, "reward_change_mean": -0.0988300465978682, "reward_change_min": -0.1901166085153818, "reward_change_std": 0.07626715092919767, "reward_std": 0.7287259213626385, "rewards/cosine_scaled_reward": -0.1181578729301691, "rewards/format_reward": 0.6250000093132257, "step": 121 }, { "advantage_max": 0.6881704181432724, "advantage_mean": -6.829698862009792e-09, "advantage_min": -0.5806205496191978, "advantage_std": 0.5137464459985495, "completion_length": 2758.06258392334, "epoch": 0.13942857142857143, "grad_norm": 0.12575501203536987, "kl": 0.008263587951660156, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.443380060197385e-07, "loss": 0.049, "reward": 0.34448036178946495, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.34448036178946495, "reward_after_std": 0.5137464534491301, "reward_before_mean": 0.454755075275898, "reward_before_std": 0.5069625999312848, "reward_change_max": 0.0003061145544052124, "reward_change_mean": -0.11027471465058625, "reward_change_min": -0.18773560784757137, "reward_change_std": 0.07475250354036689, "reward_std": 0.5137464664876461, "rewards/cosine_scaled_reward": 0.008627532981336117, "rewards/format_reward": 0.4375000037252903, "step": 122 }, { "advantage_max": 0.8377508036792278, "advantage_mean": -2.79396769609086e-09, "advantage_min": -0.789555948227644, "advantage_std": 0.6173290908336639, "completion_length": 2958.666717529297, "epoch": 0.14057142857142857, "grad_norm": 0.1022125631570816, "kl": 0.008504867553710938, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.428149347714143e-07, "loss": 0.0552, "reward": 0.09822776913642883, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.09822776913642883, "reward_after_std": 0.6173290759325027, "reward_before_mean": 0.18268409557640553, "reward_before_std": 0.6303858272731304, "reward_change_max": 0.0, "reward_change_mean": -0.08445633691735566, "reward_change_min": -0.16947042290121317, "reward_change_std": 0.06656381417997181, "reward_std": 0.6173290759325027, "rewards/cosine_scaled_reward": -0.10657462244853377, "rewards/format_reward": 0.3958333395421505, "step": 123 }, { "advantage_max": 1.3881800100207329, "advantage_mean": -2.3903946710923663e-08, "advantage_min": -0.8908319100737572, "advantage_std": 0.8987450338900089, "completion_length": 2499.6458892822266, "epoch": 0.1417142857142857, "grad_norm": 0.12640397250652313, "kl": 0.01064300537109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.412727182773486e-07, "loss": 0.0231, "reward": 0.2822089372202754, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2822089372202754, "reward_after_std": 0.8987450078129768, "reward_before_mean": 0.37344337720423937, "reward_before_std": 0.9098445884883404, "reward_change_max": 0.0, "reward_change_mean": -0.09123445488512516, "reward_change_min": -0.18366647511720657, "reward_change_std": 0.07298110821284354, "reward_std": 0.8987450487911701, "rewards/cosine_scaled_reward": -0.1049449909478426, "rewards/format_reward": 0.583333345130086, "step": 124 }, { "advantage_max": 0.9330607615411282, "advantage_mean": -1.986821573929376e-08, "advantage_min": -0.5653700307011604, "advantage_std": 0.5595428682863712, "completion_length": 2889.250030517578, "epoch": 0.14285714285714285, "grad_norm": 0.07559899240732193, "kl": 0.008258819580078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.397114317029974e-07, "loss": 0.005, "reward": 0.06734631024301052, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06734631024301052, "reward_after_std": 0.5595428682863712, "reward_before_mean": 0.14727872982621193, "reward_before_std": 0.549531988799572, "reward_change_max": 0.0004915222525596619, "reward_change_mean": -0.07993244659155607, "reward_change_min": -0.14166234154254198, "reward_change_std": 0.054563989629969, "reward_std": 0.5595428794622421, "rewards/cosine_scaled_reward": -0.07219396787695587, "rewards/format_reward": 0.2916666679084301, "step": 125 }, { "advantage_max": 1.0017745271325111, "advantage_mean": -1.800557009046244e-08, "advantage_min": -0.9506868049502373, "advantage_std": 0.7525536194443703, "completion_length": 2789.166717529297, "epoch": 0.144, "grad_norm": 0.18889941275119781, "kl": 0.006542205810546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.381311511432658e-07, "loss": 0.0565, "reward": 0.3237967677414417, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3237967677414417, "reward_after_std": 0.7525536343455315, "reward_before_mean": 0.42645698226988316, "reward_before_std": 0.7722005806863308, "reward_change_max": 9.433180093765259e-05, "reward_change_mean": -0.10266020614653826, "reward_change_min": -0.20530739519745111, "reward_change_std": 0.08366158325225115, "reward_std": 0.7525536641478539, "rewards/cosine_scaled_reward": -0.03677151817828417, "rewards/format_reward": 0.5000000037252903, "step": 126 }, { "advantage_max": 1.0069448873400688, "advantage_mean": -2.7755575615628914e-16, "advantage_min": -0.7727540284395218, "advantage_std": 0.6543919891119003, "completion_length": 3197.250015258789, "epoch": 0.14514285714285713, "grad_norm": 0.10907334089279175, "kl": 0.01126861572265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.36531953618799e-07, "loss": 0.0366, "reward": -0.1559242196381092, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1559242196381092, "reward_after_std": 0.6543919928371906, "reward_before_mean": -0.0979102123528719, "reward_before_std": 0.6656555905938148, "reward_change_max": 0.0006112977862358093, "reward_change_mean": -0.058013999834656715, "reward_change_min": -0.12248460203409195, "reward_change_std": 0.05088945245370269, "reward_std": 0.654392022639513, "rewards/cosine_scaled_reward": -0.18437177990563214, "rewards/format_reward": 0.27083334140479565, "step": 127 }, { "advantage_max": 0.896282397210598, "advantage_mean": 8.692343733684993e-09, "advantage_min": -1.1638800874352455, "advantage_std": 0.8231517635285854, "completion_length": 2812.562530517578, "epoch": 0.1462857142857143, "grad_norm": 0.13105569779872894, "kl": 0.01023101806640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.34913917072228e-07, "loss": 0.0184, "reward": 0.4567085765302181, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4567085765302181, "reward_after_std": 0.8231517747044563, "reward_before_mean": 0.5736916027963161, "reward_before_std": 0.8603745512664318, "reward_change_max": 0.0002776235342025757, "reward_change_mean": -0.11698298552073538, "reward_change_min": -0.2090506199747324, "reward_change_std": 0.0941194579936564, "reward_std": 0.8231517784297466, "rewards/cosine_scaled_reward": 0.05767912045121193, "rewards/format_reward": 0.4583333432674408, "step": 128 }, { "advantage_max": 1.215379349887371, "advantage_mean": -2.23517425679276e-08, "advantage_min": -0.631638377904892, "advantage_std": 0.6916131414473057, "completion_length": 3391.3958435058594, "epoch": 0.14742857142857144, "grad_norm": 0.13277558982372284, "kl": 0.011829376220703125, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.332771203643714e-07, "loss": 0.0186, "reward": -0.27772839553654194, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.27772839553654194, "reward_after_std": 0.6916131395846605, "reward_before_mean": -0.23465027287602425, "reward_before_std": 0.6941855438053608, "reward_change_max": 0.0004948228597640991, "reward_change_mean": -0.04307813826017082, "reward_change_min": -0.10449518729001284, "reward_change_std": 0.04153990955092013, "reward_std": 0.6916131433099508, "rewards/cosine_scaled_reward": -0.20065847536170622, "rewards/format_reward": 0.1666666716337204, "step": 129 }, { "advantage_max": 0.8671197295188904, "advantage_mean": 6.829699028543246e-09, "advantage_min": -0.6465287543833256, "advantage_std": 0.5738713406026363, "completion_length": 3142.7291717529297, "epoch": 0.14857142857142858, "grad_norm": 0.10791204124689102, "kl": 0.01093292236328125, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.316216432703916e-07, "loss": 0.0106, "reward": -0.05407215282320976, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.05407215282320976, "reward_after_std": 0.5738713443279266, "reward_before_mean": 0.01597772934474051, "reward_before_std": 0.5806875862181187, "reward_change_max": 0.0, "reward_change_mean": -0.07004988705739379, "reward_change_min": -0.13436190504580736, "reward_change_std": 0.05401943693868816, "reward_std": 0.5738713517785072, "rewards/cosine_scaled_reward": -0.10659446427598596, "rewards/format_reward": 0.22916666977107525, "step": 130 }, { "advantage_max": 1.1662424430251122, "advantage_mean": -1.2417634920325327e-08, "advantage_min": -0.8795076087117195, "advantage_std": 0.807879064232111, "completion_length": 2856.1667404174805, "epoch": 0.14971428571428572, "grad_norm": 0.15617187321186066, "kl": 0.013149261474609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.299475664759068e-07, "loss": 0.083, "reward": 0.32507515139877796, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.32507515139877796, "reward_after_std": 0.8078790456056595, "reward_before_mean": 0.42472299188375473, "reward_before_std": 0.8255997374653816, "reward_change_max": 0.00011242181062698364, "reward_change_mean": -0.09964784735348076, "reward_change_min": -0.20078612212091684, "reward_change_std": 0.08023856161162257, "reward_std": 0.8078790530562401, "rewards/cosine_scaled_reward": -0.006388511508703232, "rewards/format_reward": 0.4375000074505806, "step": 131 }, { "advantage_max": 0.7666987664997578, "advantage_mean": 2.4835269507583746e-08, "advantage_min": -0.7063778378069401, "advantage_std": 0.5695809759199619, "completion_length": 2699.0625228881836, "epoch": 0.15085714285714286, "grad_norm": 0.08800819516181946, "kl": 0.0089569091796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.282549715730579e-07, "loss": -0.0231, "reward": 0.20854278281331062, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.20854278281331062, "reward_after_std": 0.5695809768512845, "reward_before_mean": 0.3043249621987343, "reward_before_std": 0.577835189178586, "reward_change_max": 9.389221668243408e-05, "reward_change_mean": -0.09578215330839157, "reward_change_min": -0.17351566720753908, "reward_change_std": 0.06650701339822263, "reward_std": 0.5695809926837683, "rewards/cosine_scaled_reward": -0.03533751145005226, "rewards/format_reward": 0.375, "step": 132 }, { "advantage_max": 0.8124303817749023, "advantage_mean": 3.104408841103634e-09, "advantage_min": -0.5613624155521393, "advantage_std": 0.5233605541288853, "completion_length": 3215.000030517578, "epoch": 0.152, "grad_norm": 0.11630722880363464, "kl": 0.013019561767578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.265439410565328e-07, "loss": 0.0422, "reward": -0.26990117644891143, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.26990117644891143, "reward_after_std": 0.5233605578541756, "reward_before_mean": -0.21911684470251203, "reward_before_std": 0.5289643295109272, "reward_change_max": 0.00056438148021698, "reward_change_mean": -0.05078433989547193, "reward_change_min": -0.10732162557542324, "reward_change_std": 0.0434577246196568, "reward_std": 0.5233605690300465, "rewards/cosine_scaled_reward": -0.23455842956900597, "rewards/format_reward": 0.25000000931322575, "step": 133 }, { "advantage_max": 1.3054607100784779, "advantage_mean": 4.96705393482344e-09, "advantage_min": -0.8411310873925686, "advantage_std": 0.8696665279567242, "completion_length": 2491.3333587646484, "epoch": 0.15314285714285714, "grad_norm": 0.17809806764125824, "kl": 0.013080596923828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.248145583195447e-07, "loss": 0.0184, "reward": 0.2889123857021332, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2889123857021332, "reward_after_std": 0.8696665279567242, "reward_before_mean": 0.38274707458913326, "reward_before_std": 0.8822485581040382, "reward_change_max": 0.0, "reward_change_mean": -0.09383466956205666, "reward_change_min": -0.18727249279618263, "reward_change_std": 0.07527722232043743, "reward_std": 0.869666550308466, "rewards/cosine_scaled_reward": -0.08987647667527199, "rewards/format_reward": 0.5625000055879354, "step": 134 }, { "advantage_max": 1.1486023664474487, "advantage_mean": -2.0489097529718947e-08, "advantage_min": -1.1527246832847595, "advantage_std": 0.9214292168617249, "completion_length": 2233.895851135254, "epoch": 0.15428571428571428, "grad_norm": 0.4533507227897644, "kl": 0.012312889099121094, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.230669076497687e-07, "loss": 0.086, "reward": 0.7777070086449385, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7777070086449385, "reward_after_std": 0.9214292392134666, "reward_before_mean": 0.9196147546172142, "reward_before_std": 0.9462558915838599, "reward_change_max": 0.00037839263677597046, "reward_change_mean": -0.14190778695046902, "reward_change_min": -0.27010027691721916, "reward_change_std": 0.11189589183777571, "reward_std": 0.9214292727410793, "rewards/cosine_scaled_reward": 0.16814071801491082, "rewards/format_reward": 0.5833333395421505, "step": 135 }, { "advantage_max": 1.1181330271065235, "advantage_mean": -4.996003610813204e-16, "advantage_min": -1.023313906043768, "advantage_std": 0.7946756221354008, "completion_length": 2813.791748046875, "epoch": 0.15542857142857142, "grad_norm": 0.17351149022579193, "kl": 0.015716552734375, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.213010742252327e-07, "loss": 0.0675, "reward": 0.4721411466598511, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4721411466598511, "reward_after_std": 0.7946756184101105, "reward_before_mean": 0.5856084409169853, "reward_before_std": 0.80930364318192, "reward_change_max": 2.7738511562347412e-05, "reward_change_mean": -0.11346728634089231, "reward_change_min": -0.1980165708810091, "reward_change_std": 0.08056956913787872, "reward_std": 0.7946756482124329, "rewards/cosine_scaled_reward": 0.042804209515452385, "rewards/format_reward": 0.5000000149011612, "step": 136 }, { "advantage_max": 1.137401022017002, "advantage_mean": 1.3969838702498905e-08, "advantage_min": -0.7347718961536884, "advantage_std": 0.7254753410816193, "completion_length": 3117.041717529297, "epoch": 0.15657142857142858, "grad_norm": 0.14943963289260864, "kl": 0.013874053955078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.195171441101668e-07, "loss": 0.059, "reward": -0.13048943784087896, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13048943784087896, "reward_after_std": 0.7254753559827805, "reward_before_mean": -0.07333034556359053, "reward_before_std": 0.7382145449519157, "reward_change_max": 0.0002055242657661438, "reward_change_mean": -0.0571591054322198, "reward_change_min": -0.13340783957391977, "reward_change_std": 0.0530292927287519, "reward_std": 0.7254753783345222, "rewards/cosine_scaled_reward": -0.18249849835410714, "rewards/format_reward": 0.2916666753590107, "step": 137 }, { "advantage_max": 1.3317741975188255, "advantage_mean": 1.676380706472358e-08, "advantage_min": -0.9980174265801907, "advantage_std": 0.9026450999081135, "completion_length": 2635.625045776367, "epoch": 0.15771428571428572, "grad_norm": 0.17180199921131134, "kl": 0.014453887939453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.177152042508077e-07, "loss": 0.0665, "reward": 0.4164327224716544, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4164327224716544, "reward_after_std": 0.9026450850069523, "reward_before_mean": 0.5212587993592024, "reward_before_std": 0.916192002594471, "reward_change_max": 0.00020529329776763916, "reward_change_mean": -0.10482604894787073, "reward_change_min": -0.20750916376709938, "reward_change_std": 0.08401510119438171, "reward_std": 0.9026451222598553, "rewards/cosine_scaled_reward": 0.00021273503080010414, "rewards/format_reward": 0.5208333376795053, "step": 138 }, { "advantage_max": 1.2564072161912918, "advantage_mean": -1.2417634254191512e-08, "advantage_min": -0.9992383420467377, "advantage_std": 0.918564435094595, "completion_length": 3073.0834045410156, "epoch": 0.15885714285714286, "grad_norm": 0.1684163361787796, "kl": 0.01435089111328125, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.158953424711624e-07, "loss": 0.0436, "reward": 0.3841327941045165, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3841327941045165, "reward_after_std": 0.9185644574463367, "reward_before_mean": 0.4876405708491802, "reward_before_std": 0.9425558559596539, "reward_change_max": 0.00016095489263534546, "reward_change_mean": -0.10350779164582491, "reward_change_min": -0.2199224689975381, "reward_change_std": 0.09149896074086428, "reward_std": 0.9185644909739494, "rewards/cosine_scaled_reward": -0.04784638062119484, "rewards/format_reward": 0.5833333414047956, "step": 139 }, { "advantage_max": 1.2031887620687485, "advantage_mean": 6.208817349140361e-09, "advantage_min": -0.7054705396294594, "advantage_std": 0.7336429953575134, "completion_length": 2998.2084045410156, "epoch": 0.16, "grad_norm": 0.6675270795822144, "kl": 0.020298004150390625, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.140576474687263e-07, "loss": 0.12, "reward": 0.07706247363239527, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07706247363239527, "reward_after_std": 0.7336429879069328, "reward_before_mean": 0.15163643937557936, "reward_before_std": 0.7301466800272465, "reward_change_max": 0.0005590468645095825, "reward_change_mean": -0.07457395037636161, "reward_change_min": -0.1526154913008213, "reward_change_std": 0.06238272855989635, "reward_std": 0.7336430214345455, "rewards/cosine_scaled_reward": -0.07001511752605438, "rewards/format_reward": 0.29166667349636555, "step": 140 }, { "advantage_max": 1.0287379138171673, "advantage_mean": -1.3659398501175701e-08, "advantage_min": -0.7826720699667931, "advantage_std": 0.7179284952580929, "completion_length": 2630.1459045410156, "epoch": 0.16114285714285714, "grad_norm": 0.13093984127044678, "kl": 0.01914215087890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.122022088101613e-07, "loss": 0.0512, "reward": 0.1532113216817379, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1532113216817379, "reward_after_std": 0.717928521335125, "reward_before_mean": 0.23904765769839287, "reward_before_std": 0.7325572483241558, "reward_change_max": 0.00014876574277877808, "reward_change_mean": -0.085836345795542, "reward_change_min": -0.18532127793878317, "reward_change_std": 0.07249903492629528, "reward_std": 0.7179285399615765, "rewards/cosine_scaled_reward": -0.15130950883030891, "rewards/format_reward": 0.5416666753590107, "step": 141 }, { "advantage_max": 0.9296289533376694, "advantage_mean": -1.862645149230957e-09, "advantage_min": -0.812540490180254, "advantage_std": 0.6487125307321548, "completion_length": 2815.1875610351562, "epoch": 0.16228571428571428, "grad_norm": 0.13400672376155853, "kl": 0.01612091064453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.103291169269299e-07, "loss": 0.0648, "reward": 0.33853449299931526, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.33853449299931526, "reward_after_std": 0.6487125344574451, "reward_before_mean": 0.4438725169748068, "reward_before_std": 0.6510545462369919, "reward_change_max": 0.00021427124738693237, "reward_change_mean": -0.10533801536075771, "reward_change_min": -0.18465242069214582, "reward_change_std": 0.07665713713504374, "reward_std": 0.6487125381827354, "rewards/cosine_scaled_reward": -0.05931374244391918, "rewards/format_reward": 0.5625000111758709, "step": 142 }, { "advantage_max": 1.0350090842694044, "advantage_mean": -2.949188115941581e-09, "advantage_min": -0.6123997960239649, "advantage_std": 0.6286649722605944, "completion_length": 2656.666732788086, "epoch": 0.16342857142857142, "grad_norm": 0.1824156790971756, "kl": 0.0194549560546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.084384631108882e-07, "loss": 0.0744, "reward": 0.09283385192975402, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.09283385192975402, "reward_after_std": 0.6286649648100138, "reward_before_mean": 0.17288653645664454, "reward_before_std": 0.6227559931576252, "reward_change_max": 0.0002013370394706726, "reward_change_mean": -0.08005267137195915, "reward_change_min": -0.1376419337466359, "reward_change_std": 0.05598044104408473, "reward_std": 0.6286649834364653, "rewards/cosine_scaled_reward": -0.16355674155056477, "rewards/format_reward": 0.5000000074505806, "step": 143 }, { "advantage_max": 1.0671398043632507, "advantage_mean": 2.359350581571107e-08, "advantage_min": -0.9071869850158691, "advantage_std": 0.8145530968904495, "completion_length": 2874.708396911621, "epoch": 0.16457142857142856, "grad_norm": 0.14423750340938568, "kl": 0.016429901123046875, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.065303395098358e-07, "loss": 0.0199, "reward": 0.361312011256814, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.361312011256814, "reward_after_std": 0.8145530857145786, "reward_before_mean": 0.4656823016703129, "reward_before_std": 0.8409868739545345, "reward_change_max": 0.0002870485186576843, "reward_change_mean": -0.10437025898136199, "reward_change_min": -0.2307471977546811, "reward_change_std": 0.08893731469288468, "reward_std": 0.8145531080663204, "rewards/cosine_scaled_reward": 0.02450781175866723, "rewards/format_reward": 0.4166666679084301, "step": 144 }, { "advantage_max": 1.0821139886975288, "advantage_mean": 3.725291630729544e-09, "advantage_min": -0.6549829691648483, "advantage_std": 0.6488157249987125, "completion_length": 2207.770866394043, "epoch": 0.1657142857142857, "grad_norm": 0.11618901789188385, "kl": 0.01737213134765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.046048391230247e-07, "loss": 0.0195, "reward": 0.41563169774599373, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.41563169774599373, "reward_after_std": 0.648815743625164, "reward_before_mean": 0.5244897492229939, "reward_before_std": 0.6296542976051569, "reward_change_max": 0.0001682192087173462, "reward_change_mean": -0.10885802179109305, "reward_change_min": -0.17047070525586605, "reward_change_std": 0.0684051540447399, "reward_std": 0.6488157473504543, "rewards/cosine_scaled_reward": -0.039838479831814766, "rewards/format_reward": 0.6041666697710752, "step": 145 }, { "advantage_max": 1.0146339759230614, "advantage_mean": -3.414849431004896e-09, "advantage_min": -0.8650781996548176, "advantage_std": 0.7028192803263664, "completion_length": 2288.000045776367, "epoch": 0.16685714285714287, "grad_norm": 0.20492036640644073, "kl": 0.01233673095703125, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.026620557966279e-07, "loss": 0.1043, "reward": 0.12842521956190467, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12842521956190467, "reward_after_std": 0.7028192915022373, "reward_before_mean": 0.21201793756335974, "reward_before_std": 0.7153302617371082, "reward_change_max": 0.0005949661135673523, "reward_change_mean": -0.0835927234729752, "reward_change_min": -0.17898117750883102, "reward_change_std": 0.06886330037377775, "reward_std": 0.7028193026781082, "rewards/cosine_scaled_reward": -0.2169077042490244, "rewards/format_reward": 0.6458333432674408, "step": 146 }, { "advantage_max": 1.0399056002497673, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.6083156391978264, "advantage_std": 0.6332942806184292, "completion_length": 2729.125015258789, "epoch": 0.168, "grad_norm": 0.1052529439330101, "kl": 0.021240234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 9.007020842191634e-07, "loss": 0.0038, "reward": -0.017400827258825302, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.017400827258825302, "reward_after_std": 0.6332942768931389, "reward_before_mean": 0.052338654175400734, "reward_before_std": 0.6333304084837437, "reward_change_max": 0.00016516447067260742, "reward_change_mean": -0.06973948935046792, "reward_change_min": -0.1433351282030344, "reward_change_std": 0.052607121178880334, "reward_std": 0.633294302970171, "rewards/cosine_scaled_reward": -0.18216401617974043, "rewards/format_reward": 0.41666667349636555, "step": 147 }, { "advantage_max": 0.816329799592495, "advantage_mean": -3.787378699549038e-08, "advantage_min": -0.718300499022007, "advantage_std": 0.5935985185205936, "completion_length": 2441.3750534057617, "epoch": 0.16914285714285715, "grad_norm": 0.14333200454711914, "kl": 0.019672393798828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.987250199168808e-07, "loss": 0.0355, "reward": 0.3582087382674217, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3582087382674217, "reward_after_std": 0.5935985222458839, "reward_before_mean": 0.46716008335351944, "reward_before_std": 0.598968580365181, "reward_change_max": 0.0, "reward_change_mean": -0.10895138035994023, "reward_change_min": -0.1871479470282793, "reward_change_std": 0.07555282616522163, "reward_std": 0.5935985259711742, "rewards/cosine_scaled_reward": -0.05808662064373493, "rewards/format_reward": 0.5833333432674408, "step": 148 }, { "advantage_max": 1.1604133136570454, "advantage_mean": -2.110997909809953e-08, "advantage_min": -0.7638447806239128, "advantage_std": 0.7076913714408875, "completion_length": 2790.479232788086, "epoch": 0.1702857142857143, "grad_norm": 0.18803834915161133, "kl": 0.016819000244140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.967309592491052e-07, "loss": 0.0947, "reward": 0.3587049674242735, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3587049674242735, "reward_after_std": 0.7076913937926292, "reward_before_mean": 0.46114486269652843, "reward_before_std": 0.6992642693221569, "reward_change_max": 0.0007874518632888794, "reward_change_mean": -0.10243987792637199, "reward_change_min": -0.16466401610523462, "reward_change_std": 0.0671718236990273, "reward_std": 0.7076914049685001, "rewards/cosine_scaled_reward": -0.02984423842281103, "rewards/format_reward": 0.5208333395421505, "step": 149 }, { "advantage_max": 1.2045219093561172, "advantage_mean": 2.2972623692218974e-08, "advantage_min": -1.143689103424549, "advantage_std": 0.9179808422923088, "completion_length": 2979.7083740234375, "epoch": 0.17142857142857143, "grad_norm": 0.3000147342681885, "kl": 0.02593994140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.9471999940354e-07, "loss": 0.079, "reward": 0.2739081159234047, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2739081159234047, "reward_after_std": 0.9179808460175991, "reward_before_mean": 0.3676854632794857, "reward_before_std": 0.9514680132269859, "reward_change_max": 0.0006517991423606873, "reward_change_mean": -0.09377730148844421, "reward_change_min": -0.2144545027986169, "reward_change_std": 0.08906646957620978, "reward_std": 0.9179808907210827, "rewards/cosine_scaled_reward": -0.02449061779771, "rewards/format_reward": 0.41666668467223644, "step": 150 }, { "advantage_max": 1.5817717239260674, "advantage_mean": -2.2506963848201167e-08, "advantage_min": -0.9173250868916512, "advantage_std": 0.9457920156419277, "completion_length": 2821.354232788086, "epoch": 0.17257142857142857, "grad_norm": 0.23507796227931976, "kl": 0.02570343017578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.926922383915315e-07, "loss": 0.0797, "reward": 0.46917635947465897, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.46917635947465897, "reward_after_std": 0.9457920081913471, "reward_before_mean": 0.5744377570226789, "reward_before_std": 0.941125662997365, "reward_change_max": 0.00025819987058639526, "reward_change_mean": -0.1052614112268202, "reward_change_min": -0.1981993718072772, "reward_change_std": 0.07828537304885685, "reward_std": 0.9457920119166374, "rewards/cosine_scaled_reward": 0.047635551696657785, "rewards/format_reward": 0.47916668094694614, "step": 151 }, { "advantage_max": 0.9127373062074184, "advantage_mean": -3.1044122827950105e-10, "advantage_min": -0.7578173317015171, "advantage_std": 0.6449372805655003, "completion_length": 2599.1041870117188, "epoch": 0.1737142857142857, "grad_norm": 0.1737074851989746, "kl": 0.02773284912109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.906477750432903e-07, "loss": 0.0573, "reward": 0.0322701595723629, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0322701595723629, "reward_after_std": 0.6449372880160809, "reward_before_mean": 0.10861435905098915, "reward_before_std": 0.6586843468248844, "reward_change_max": 0.0002729371190071106, "reward_change_mean": -0.07634419621899724, "reward_change_min": -0.1607343116775155, "reward_change_std": 0.06475975178182125, "reward_std": 0.6449373215436935, "rewards/cosine_scaled_reward": -0.12277615629136562, "rewards/format_reward": 0.3541666679084301, "step": 152 }, { "advantage_max": 1.0964186489582062, "advantage_mean": 1.3038516599728212e-08, "advantage_min": -0.8900428488850594, "advantage_std": 0.767373725771904, "completion_length": 2993.458366394043, "epoch": 0.17485714285714285, "grad_norm": 0.20991554856300354, "kl": 0.039215087890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.88586709003076e-07, "loss": 0.0699, "reward": -0.014602228999137878, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.014602228999137878, "reward_after_std": 0.7673737443983555, "reward_before_mean": 0.05484032817184925, "reward_before_std": 0.7900453805923462, "reward_change_max": 0.0001817941665649414, "reward_change_mean": -0.06944253481924534, "reward_change_min": -0.16902944073081017, "reward_change_std": 0.06724012573249638, "reward_std": 0.7673737592995167, "rewards/cosine_scaled_reward": -0.118413170799613, "rewards/format_reward": 0.29166667722165585, "step": 153 }, { "advantage_max": 1.2021274827420712, "advantage_mean": 3.476937660007451e-08, "advantage_min": -1.179232008755207, "advantage_std": 0.9691634774208069, "completion_length": 3311.541748046875, "epoch": 0.176, "grad_norm": 0.3098473846912384, "kl": 0.02216339111328125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.865091407243394e-07, "loss": 0.0688, "reward": 0.3431315952911973, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3431315952911973, "reward_after_std": 0.9691634848713875, "reward_before_mean": 0.44397585839033127, "reward_before_std": 1.010753821581602, "reward_change_max": 0.00024543702602386475, "reward_change_mean": -0.10084426286630332, "reward_change_min": -0.22217622213065624, "reward_change_std": 0.09855047892779112, "reward_std": 0.969163540750742, "rewards/cosine_scaled_reward": 0.04490460641682148, "rewards/format_reward": 0.35416667349636555, "step": 154 }, { "advantage_max": 1.155366025865078, "advantage_mean": -3.601113995888028e-08, "advantage_min": -0.9142054095864296, "advantage_std": 0.8050542362034321, "completion_length": 2577.333366394043, "epoch": 0.17714285714285713, "grad_norm": 0.15611477196216583, "kl": 0.0316009521484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.844151714648274e-07, "loss": 0.0187, "reward": 0.43145788833498955, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.43145788833498955, "reward_after_std": 0.8050542436540127, "reward_before_mean": 0.5406967643648386, "reward_before_std": 0.8168010376393795, "reward_change_max": 0.00125044584274292, "reward_change_mean": -0.10923890233971179, "reward_change_min": -0.224298395216465, "reward_change_std": 0.08746692817658186, "reward_std": 0.8050542436540127, "rewards/cosine_scaled_reward": -0.0004849610850214958, "rewards/format_reward": 0.5416666716337204, "step": 155 }, { "advantage_max": 1.3044027090072632, "advantage_mean": 7.45058109652419e-09, "advantage_min": -0.7066097818315029, "advantage_std": 0.7796169742941856, "completion_length": 3190.8333587646484, "epoch": 0.1782857142857143, "grad_norm": 0.21351027488708496, "kl": 0.02843475341796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.823049032816478e-07, "loss": 0.0818, "reward": -0.1954272324219346, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1954272324219346, "reward_after_std": 0.7796169780194759, "reward_before_mean": -0.14665334671735764, "reward_before_std": 0.7897529415786266, "reward_change_max": 0.0005297735333442688, "reward_change_mean": -0.048773885355331004, "reward_change_min": -0.13390359189361334, "reward_change_std": 0.05359873874112964, "reward_std": 0.7796170227229595, "rewards/cosine_scaled_reward": -0.18791001569479704, "rewards/format_reward": 0.2291666716337204, "step": 156 }, { "advantage_max": 0.6623891666531563, "advantage_mean": -1.4280279514444771e-08, "advantage_min": -0.7862861528992653, "advantage_std": 0.5497554168105125, "completion_length": 3102.7083740234375, "epoch": 0.17942857142857144, "grad_norm": 0.10205285996198654, "kl": 0.03250885009765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.801784390262943e-07, "loss": 0.0245, "reward": -0.04365145298652351, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.04365145298652351, "reward_after_std": 0.5497554168105125, "reward_before_mean": 0.03132460406050086, "reward_before_std": 0.5714639872312546, "reward_change_max": 0.00042669475078582764, "reward_change_mean": -0.07497606868855655, "reward_change_min": -0.14072032365947962, "reward_change_std": 0.061189956264570355, "reward_std": 0.549755435436964, "rewards/cosine_scaled_reward": -0.1405876912176609, "rewards/format_reward": 0.31250001303851604, "step": 157 }, { "advantage_max": 1.007291927933693, "advantage_mean": -1.6142925052253787e-08, "advantage_min": -0.8780169375240803, "advantage_std": 0.7204512059688568, "completion_length": 2872.750045776367, "epoch": 0.18057142857142858, "grad_norm": 0.1799384355545044, "kl": 0.0306549072265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.780358823396352e-07, "loss": 0.0455, "reward": 0.6302909525111318, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6302909525111318, "reward_after_std": 0.7204512022435665, "reward_before_mean": 0.7603835063055158, "reward_before_std": 0.7301176488399506, "reward_change_max": 0.0009801387786865234, "reward_change_mean": -0.13009255612269044, "reward_change_min": -0.22687509935349226, "reward_change_std": 0.09314224496483803, "reward_std": 0.7204512171447277, "rewards/cosine_scaled_reward": 0.1301917377859354, "rewards/format_reward": 0.5000000111758709, "step": 158 }, { "advantage_max": 1.2010982930660248, "advantage_mean": 1.614292521878724e-08, "advantage_min": -0.7741870544850826, "advantage_std": 0.760103264823556, "completion_length": 3368.625, "epoch": 0.18171428571428572, "grad_norm": 0.16853055357933044, "kl": 0.0409698486328125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.758773376468604e-07, "loss": 0.0166, "reward": -0.1673340299166739, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1673340299166739, "reward_after_std": 0.7601032964885235, "reward_before_mean": -0.11435879208147526, "reward_before_std": 0.7753438986837864, "reward_change_max": 0.0008056163787841797, "reward_change_mean": -0.05297523224726319, "reward_change_min": -0.14327362179756165, "reward_change_std": 0.05677987774834037, "reward_std": 0.760103328153491, "rewards/cosine_scaled_reward": -0.16134606953710318, "rewards/format_reward": 0.2083333395421505, "step": 159 }, { "advantage_max": 1.3550818711519241, "advantage_mean": -2.173086155465853e-09, "advantage_min": -0.8745719566941261, "advantage_std": 0.8815088830888271, "completion_length": 2607.3333740234375, "epoch": 0.18285714285714286, "grad_norm": 0.17933402955532074, "kl": 0.04034423828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.737029101523929e-07, "loss": 0.0235, "reward": 0.2858357895165682, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2858357895165682, "reward_after_std": 0.8815088346600533, "reward_before_mean": 0.37776060961186886, "reward_before_std": 0.8969038352370262, "reward_change_max": 0.00037732720375061035, "reward_change_mean": -0.09192477163742296, "reward_change_min": -0.19541947543621063, "reward_change_std": 0.07548659306485206, "reward_std": 0.8815088532865047, "rewards/cosine_scaled_reward": 0.053463623858988285, "rewards/format_reward": 0.27083334140479565, "step": 160 }, { "advantage_max": 0.843145627528429, "advantage_mean": -4.0357312713901194e-08, "advantage_min": -0.7640031352639198, "advantage_std": 0.6416959650814533, "completion_length": 3008.541732788086, "epoch": 0.184, "grad_norm": 0.17466787993907928, "kl": 0.0492401123046875, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.715127058347614e-07, "loss": 0.0098, "reward": 0.4757814444601536, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4757814444601536, "reward_after_std": 0.6416959837079048, "reward_before_mean": 0.5949108861386776, "reward_before_std": 0.6474121138453484, "reward_change_max": 7.144361734390259e-05, "reward_change_mean": -0.11912947986274958, "reward_change_min": -0.21004556119441986, "reward_change_std": 0.0840856155846268, "reward_std": 0.6416960023343563, "rewards/cosine_scaled_reward": 0.07870544213801622, "rewards/format_reward": 0.4375000074505806, "step": 161 }, { "advantage_max": 1.1495609432458878, "advantage_mean": -1.862645426786713e-09, "advantage_min": -0.6219765916466713, "advantage_std": 0.679344154894352, "completion_length": 3318.291702270508, "epoch": 0.18514285714285714, "grad_norm": 0.19393867254257202, "kl": 0.05615234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.693068314414344e-07, "loss": 0.0367, "reward": -0.14834206318482757, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.14834206318482757, "reward_after_std": 0.679344154894352, "reward_before_mean": -0.09237463027238846, "reward_before_std": 0.6833413615822792, "reward_change_max": 0.0, "reward_change_mean": -0.055967450607568026, "reward_change_min": -0.10517737921327353, "reward_change_std": 0.042807565070688725, "reward_std": 0.6793441660702229, "rewards/cosine_scaled_reward": -0.1607706407085061, "rewards/format_reward": 0.22916666977107525, "step": 162 }, { "advantage_max": 0.818055797368288, "advantage_mean": -1.6653345369377348e-16, "advantage_min": -0.8302821703255177, "advantage_std": 0.6327940300107002, "completion_length": 2528.3541717529297, "epoch": 0.18628571428571428, "grad_norm": 0.20737354457378387, "kl": 0.0474853515625, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.670853944836176e-07, "loss": -0.0322, "reward": 0.3084873203188181, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3084873203188181, "reward_after_std": 0.6327940560877323, "reward_before_mean": 0.4131042119115591, "reward_before_std": 0.6455821953713894, "reward_change_max": 0.0003267824649810791, "reward_change_mean": -0.1046168792527169, "reward_change_min": -0.18456690851598978, "reward_change_std": 0.07689245650544763, "reward_std": 0.6327940858900547, "rewards/cosine_scaled_reward": 0.01905210316181183, "rewards/format_reward": 0.37500000931322575, "step": 163 }, { "advantage_max": 0.904144998639822, "advantage_mean": -3.476937709967487e-08, "advantage_min": -0.9024236872792244, "advantage_std": 0.693291537463665, "completion_length": 2415.916717529297, "epoch": 0.18742857142857142, "grad_norm": 0.1441878080368042, "kl": 0.0462646484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.648485032310144e-07, "loss": -0.0149, "reward": 0.39764015562832355, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.39764015562832355, "reward_after_std": 0.693291537463665, "reward_before_mean": 0.508028662065044, "reward_before_std": 0.7092392519116402, "reward_change_max": 0.0002870708703994751, "reward_change_mean": -0.11038849456235766, "reward_change_min": -0.19456447195261717, "reward_change_std": 0.08194146119058132, "reward_std": 0.6932915560901165, "rewards/cosine_scaled_reward": 0.07693097367882729, "rewards/format_reward": 0.35416666977107525, "step": 164 }, { "advantage_max": 1.1269632391631603, "advantage_mean": 8.692344732885715e-09, "advantage_min": -0.7582625076174736, "advantage_std": 0.7576115392148495, "completion_length": 2974.5000610351562, "epoch": 0.18857142857142858, "grad_norm": 0.2055661678314209, "kl": 0.053497314453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.625962667065487e-07, "loss": 0.0408, "reward": -0.031486984342336655, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.031486984342336655, "reward_after_std": 0.7576115392148495, "reward_before_mean": 0.0358324833214283, "reward_before_std": 0.776401586830616, "reward_change_max": 0.00020839273929595947, "reward_change_mean": -0.06731945066712797, "reward_change_min": -0.1540833543986082, "reward_change_std": 0.0621999790892005, "reward_std": 0.7576115429401398, "rewards/cosine_scaled_reward": -0.07583376299589872, "rewards/format_reward": 0.1875000037252903, "step": 165 }, { "advantage_max": 1.1014218032360077, "advantage_mean": 1.2417634920325327e-08, "advantage_min": -0.7755156680941582, "advantage_std": 0.7053066603839397, "completion_length": 3034.6041717529297, "epoch": 0.18971428571428572, "grad_norm": 0.18134304881095886, "kl": 0.047393798828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.603287946810513e-07, "loss": 0.0598, "reward": 0.07770548108965158, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07770548108965158, "reward_after_std": 0.70530666410923, "reward_before_mean": 0.15491134487092495, "reward_before_std": 0.7133532389998436, "reward_change_max": 0.0005127936601638794, "reward_change_mean": -0.0772058351431042, "reward_change_min": -0.15264319721609354, "reward_change_std": 0.05929103307425976, "reward_std": 0.7053066939115524, "rewards/cosine_scaled_reward": -0.037127673625946045, "rewards/format_reward": 0.2291666753590107, "step": 166 }, { "advantage_max": 1.3006695583462715, "advantage_mean": -6.829699028543246e-09, "advantage_min": -0.8689005374908447, "advantage_std": 0.8276662826538086, "completion_length": 2107.291748046875, "epoch": 0.19085714285714286, "grad_norm": 0.17068101465702057, "kl": 0.047607421875, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.580461976679099e-07, "loss": 0.0175, "reward": 0.23405415192246437, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.23405415192246437, "reward_after_std": 0.827666288241744, "reward_before_mean": 0.32163101993501186, "reward_before_std": 0.8379855081439018, "reward_change_max": 0.0003604292869567871, "reward_change_mean": -0.08757684961892664, "reward_change_min": -0.18768702819943428, "reward_change_std": 0.07284934795461595, "reward_std": 0.8276663199067116, "rewards/cosine_scaled_reward": -0.057934501208364964, "rewards/format_reward": 0.4375000111758709, "step": 167 }, { "advantage_max": 0.8111299090087414, "advantage_mean": 1.3969838424943148e-08, "advantage_min": -1.0020024217665195, "advantage_std": 0.6965993195772171, "completion_length": 2974.041717529297, "epoch": 0.192, "grad_norm": 0.20416678488254547, "kl": 0.050445556640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.557485869176825e-07, "loss": 0.0191, "reward": 0.3391971345990896, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3391971345990896, "reward_after_std": 0.6965993344783783, "reward_before_mean": 0.4466320239007473, "reward_before_std": 0.722343236207962, "reward_change_max": 0.0, "reward_change_mean": -0.10743483621627092, "reward_change_min": -0.19180598575621843, "reward_change_std": 0.0832341960631311, "reward_std": 0.6965993642807007, "rewards/cosine_scaled_reward": 0.014982646331191063, "rewards/format_reward": 0.41666668094694614, "step": 168 }, { "advantage_max": 1.1438056454062462, "advantage_mean": -3.04232063430554e-08, "advantage_min": -1.1217550933361053, "advantage_std": 0.8912684917449951, "completion_length": 2452.6250762939453, "epoch": 0.19314285714285714, "grad_norm": 0.25557219982147217, "kl": 0.0565338134765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.534360744126753e-07, "loss": 0.0333, "reward": 1.0176847303519025, "reward_advantage_correlation": 1.0, "reward_after_mean": 1.0176847303519025, "reward_after_std": 0.8912684805691242, "reward_before_mean": 1.1816631648689508, "reward_before_std": 0.9058678522706032, "reward_change_max": 0.0, "reward_change_mean": -0.16397839970886707, "reward_change_min": -0.2810376714915037, "reward_change_std": 0.11516173789277673, "reward_std": 0.8912685364484787, "rewards/cosine_scaled_reward": 0.28874821588397026, "rewards/format_reward": 0.6041666697710752, "step": 169 }, { "advantage_max": 0.8939481191337109, "advantage_mean": -2.545615107596433e-08, "advantage_min": -0.827484168112278, "advantage_std": 0.6734241135418415, "completion_length": 2655.2500762939453, "epoch": 0.19428571428571428, "grad_norm": 0.1779620349407196, "kl": 0.061553955078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.511087728614862e-07, "loss": 0.0345, "reward": 0.4281429387629032, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4281429387629032, "reward_after_std": 0.6734241284430027, "reward_before_mean": 0.5414133286103606, "reward_before_std": 0.6823275052011013, "reward_change_max": 0.00027061253786087036, "reward_change_mean": -0.11327041126787663, "reward_change_min": -0.20588709693402052, "reward_change_std": 0.0820755553431809, "reward_std": 0.6734241358935833, "rewards/cosine_scaled_reward": 0.0936233289539814, "rewards/format_reward": 0.3541666753590107, "step": 170 }, { "advantage_max": 0.9380842223763466, "advantage_mean": 9.313225579621331e-09, "advantage_min": -0.9550211504101753, "advantage_std": 0.7192481569945812, "completion_length": 2867.312530517578, "epoch": 0.19542857142857142, "grad_norm": 0.2206490933895111, "kl": 0.05462646484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.487667956935087e-07, "loss": 0.051, "reward": 0.11909876018762589, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11909876018762589, "reward_after_std": 0.7192481234669685, "reward_before_mean": 0.20382621884346008, "reward_before_std": 0.7419612966477871, "reward_change_max": 0.0008624270558357239, "reward_change_mean": -0.0847274367697537, "reward_change_min": -0.17237501591444016, "reward_change_std": 0.07184617791790515, "reward_std": 0.7192481495440006, "rewards/cosine_scaled_reward": -0.05433690547943115, "rewards/format_reward": 0.31250001303851604, "step": 171 }, { "advantage_max": 1.2377153486013412, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -1.0486519858241081, "advantage_std": 0.9159692004323006, "completion_length": 2520.37508392334, "epoch": 0.19657142857142856, "grad_norm": 0.36089661717414856, "kl": 0.061431884765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.464102570534061e-07, "loss": 0.0775, "reward": 0.4834242947399616, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4834242947399616, "reward_after_std": 0.9159692041575909, "reward_before_mean": 0.5955887548625469, "reward_before_std": 0.9399407245218754, "reward_change_max": 6.854534149169922e-05, "reward_change_mean": -0.11216441867873073, "reward_change_min": -0.23346925619989634, "reward_change_std": 0.09769316925667226, "reward_std": 0.9159692190587521, "rewards/cosine_scaled_reward": 0.16237769648432732, "rewards/format_reward": 0.27083333767950535, "step": 172 }, { "advantage_max": 1.0018565282225609, "advantage_mean": 1.0554989493538613e-08, "advantage_min": -0.613800659775734, "advantage_std": 0.6544171050190926, "completion_length": 1890.8750457763672, "epoch": 0.1977142857142857, "grad_norm": 0.2708069980144501, "kl": 0.058868408203125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.440392717955475e-07, "loss": 0.0501, "reward": -0.31603203853592277, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.31603203853592277, "reward_after_std": 0.6544171161949635, "reward_before_mean": -0.27385718561708927, "reward_before_std": 0.6710513979196548, "reward_change_max": 0.0004888400435447693, "reward_change_mean": -0.042174856178462505, "reward_change_min": -0.1296614371240139, "reward_change_std": 0.051378472126089036, "reward_std": 0.6544171385467052, "rewards/cosine_scaled_reward": -0.2306785937398672, "rewards/format_reward": 0.18750000558793545, "step": 173 }, { "advantage_max": 0.8003799468278885, "advantage_mean": -1.2417634698280722e-08, "advantage_min": -0.7577654309570789, "advantage_std": 0.5632036030292511, "completion_length": 2550.3334350585938, "epoch": 0.19885714285714284, "grad_norm": 0.18074588477611542, "kl": 0.07696533203125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.416539554784089e-07, "loss": -0.0175, "reward": 0.2051242869347334, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2051242869347334, "reward_after_std": 0.5632036104798317, "reward_before_mean": 0.3000453729182482, "reward_before_std": 0.5662722326815128, "reward_change_max": 0.0, "reward_change_mean": -0.09492109343409538, "reward_change_min": -0.16723101679235697, "reward_change_std": 0.06537155085243285, "reward_std": 0.5632036216557026, "rewards/cosine_scaled_reward": -0.07914398796856403, "rewards/format_reward": 0.45833333767950535, "step": 174 }, { "advantage_max": 0.8833422213792801, "advantage_mean": 1.0554988993938252e-08, "advantage_min": -0.8566731512546539, "advantage_std": 0.6538542471826077, "completion_length": 2570.6666870117188, "epoch": 0.2, "grad_norm": 0.23281656205654144, "kl": 0.0728302001953125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.392544243589427e-07, "loss": 0.0225, "reward": 0.17386609059758484, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.17386609059758484, "reward_after_std": 0.6538542248308659, "reward_before_mean": 0.2645402289927006, "reward_before_std": 0.6693241335451603, "reward_change_max": 4.032999277114868e-05, "reward_change_mean": -0.09067412395961583, "reward_change_min": -0.1672326810657978, "reward_change_std": 0.07087655737996101, "reward_std": 0.6538542471826077, "rewards/cosine_scaled_reward": -0.034396563190966845, "rewards/format_reward": 0.33333333767950535, "step": 175 }, { "advantage_max": 1.658276230096817, "advantage_mean": -1.1796752852344383e-08, "advantage_min": -1.1352946013212204, "advantage_std": 1.1043973043560982, "completion_length": 2494.0208892822266, "epoch": 0.20114285714285715, "grad_norm": 0.35480260848999023, "kl": 0.069122314453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.368407953869103e-07, "loss": -0.0014, "reward": 0.17692266777157784, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.17692266777157784, "reward_after_std": 1.1043973118066788, "reward_before_mean": 0.2529789046384394, "reward_before_std": 1.135535355657339, "reward_change_max": 0.00035287439823150635, "reward_change_mean": -0.07605624804273248, "reward_change_min": -0.19880263041704893, "reward_change_std": 0.08487494708970189, "reward_std": 1.1043973341584206, "rewards/cosine_scaled_reward": -0.008927222341299057, "rewards/format_reward": 0.2708333358168602, "step": 176 }, { "advantage_max": 1.333267793059349, "advantage_mean": -5.898376537194494e-09, "advantage_min": -1.057743813842535, "advantage_std": 0.9380174241960049, "completion_length": 2683.3126068115234, "epoch": 0.2022857142857143, "grad_norm": 0.3615868389606476, "kl": 0.081451416015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.344131861991828e-07, "loss": -0.0352, "reward": 0.2688114494085312, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2688114494085312, "reward_after_std": 0.9380174838006496, "reward_before_mean": 0.35989896580576897, "reward_before_std": 0.9653622359037399, "reward_change_max": 0.0009453520178794861, "reward_change_mean": -0.0910875026602298, "reward_change_min": -0.19741658121347427, "reward_change_std": 0.080465252045542, "reward_std": 0.9380174949765205, "rewards/cosine_scaled_reward": -0.028383860364556313, "rewards/format_reward": 0.4166666828095913, "step": 177 }, { "advantage_max": 0.7999058216810226, "advantage_mean": 1.8471231655325937e-08, "advantage_min": -0.5875363126397133, "advantage_std": 0.525285542011261, "completion_length": 2208.791721343994, "epoch": 0.20342857142857143, "grad_norm": 0.24783383309841156, "kl": 0.094207763671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.319717151140072e-07, "loss": -0.0245, "reward": -0.32124644331634045, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.32124644331634045, "reward_after_std": 0.5252855531871319, "reward_before_mean": -0.2753131858771667, "reward_before_std": 0.5358833186328411, "reward_change_max": 0.0009209141135215759, "reward_change_mean": -0.045933238812722266, "reward_change_min": -0.1145911905914545, "reward_change_std": 0.04406271001789719, "reward_std": 0.5252855755388737, "rewards/cosine_scaled_reward": -0.18973992549581453, "rewards/format_reward": 0.10416666977107525, "step": 178 }, { "advantage_max": 0.9616053961217403, "advantage_mean": -6.208815128694312e-10, "advantage_min": -0.5495030656456947, "advantage_std": 0.5631580613553524, "completion_length": 2417.0833740234375, "epoch": 0.20457142857142857, "grad_norm": 0.2671527862548828, "kl": 0.072052001953125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.295165011252396e-07, "loss": 0.0493, "reward": -0.20262894342886284, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.20262894342886284, "reward_after_std": 0.563158068805933, "reward_before_mean": -0.14828139916062355, "reward_before_std": 0.5615837536752224, "reward_change_max": 0.0001543089747428894, "reward_change_mean": -0.054347540717571974, "reward_change_min": -0.10567384958267212, "reward_change_std": 0.041040402837097645, "reward_std": 0.5631580837070942, "rewards/cosine_scaled_reward": -0.18872403725981712, "rewards/format_reward": 0.22916666977107525, "step": 179 }, { "advantage_max": 1.3213228471577168, "advantage_mean": 3.1044087300813317e-09, "advantage_min": -0.9479734636843204, "advantage_std": 0.8566804938018322, "completion_length": 2123.750011444092, "epoch": 0.2057142857142857, "grad_norm": 0.42133620381355286, "kl": 0.0872802734375, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.270476638965461e-07, "loss": 0.0471, "reward": 0.38085563108325005, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.38085563108325005, "reward_after_std": 0.8566805012524128, "reward_before_mean": 0.48239467665553093, "reward_before_std": 0.8608980290591717, "reward_change_max": 0.00010352581739425659, "reward_change_mean": -0.10153904324397445, "reward_change_min": -0.20027936436235905, "reward_change_std": 0.08030586317181587, "reward_std": 0.8566805496811867, "rewards/cosine_scaled_reward": 0.13703067554160953, "rewards/format_reward": 0.2083333358168602, "step": 180 }, { "advantage_max": 0.93392089381814, "advantage_mean": 9.313225857177088e-09, "advantage_min": -0.6315050572156906, "advantage_std": 0.6295647770166397, "completion_length": 2717.8958892822266, "epoch": 0.20685714285714285, "grad_norm": 0.26544320583343506, "kl": 0.094696044921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.245653237555705e-07, "loss": 0.0447, "reward": -0.0804877057671547, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0804877057671547, "reward_after_std": 0.6295647732913494, "reward_before_mean": -0.014551796950399876, "reward_before_std": 0.6384981628507376, "reward_change_max": 0.00037204474210739136, "reward_change_mean": -0.06593590078409761, "reward_change_min": -0.15629394073039293, "reward_change_std": 0.06047847680747509, "reward_std": 0.6295647993683815, "rewards/cosine_scaled_reward": -0.08019256498664618, "rewards/format_reward": 0.1458333358168602, "step": 181 }, { "advantage_max": 1.3371423408389091, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -0.9195161461830139, "advantage_std": 0.9004073217511177, "completion_length": 2558.1667098999023, "epoch": 0.208, "grad_norm": 0.335359662771225, "kl": 0.093170166015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.220696016880687e-07, "loss": 0.0294, "reward": 0.4829629212617874, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4829629212617874, "reward_after_std": 0.9004073329269886, "reward_before_mean": 0.5930858813226223, "reward_before_std": 0.9114826694130898, "reward_change_max": 0.0, "reward_change_mean": -0.1101229446940124, "reward_change_min": -0.2271582130342722, "reward_change_std": 0.08647168381139636, "reward_std": 0.9004073478281498, "rewards/cosine_scaled_reward": 0.10904293693602085, "rewards/format_reward": 0.3750000111758709, "step": 182 }, { "advantage_max": 1.254299134016037, "advantage_mean": -1.490116174895917e-08, "advantage_min": -0.9943030625581741, "advantage_std": 0.8474318087100983, "completion_length": 2121.4375915527344, "epoch": 0.20914285714285713, "grad_norm": 0.3153325021266937, "kl": 0.08837890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.195606193320136e-07, "loss": 0.0504, "reward": 0.1891594659537077, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1891594659537077, "reward_after_std": 0.8474318385124207, "reward_before_mean": 0.2741537671536207, "reward_before_std": 0.8667086064815521, "reward_change_max": 0.000354960560798645, "reward_change_mean": -0.08499433984979987, "reward_change_min": -0.18232434801757336, "reward_change_std": 0.07570598181337118, "reward_std": 0.8474318720400333, "rewards/cosine_scaled_reward": 0.0016602184623479843, "rewards/format_reward": 0.27083333767950535, "step": 183 }, { "advantage_max": 1.1184355579316616, "advantage_mean": 4.346172532976311e-09, "advantage_min": -0.5413721092045307, "advantage_std": 0.6433587558567524, "completion_length": 2671.7291984558105, "epoch": 0.2102857142857143, "grad_norm": 0.22072502970695496, "kl": 0.11517333984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.170384989716657e-07, "loss": 0.0022, "reward": -0.31343852914869785, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.31343852914869785, "reward_after_std": 0.6433587558567524, "reward_before_mean": -0.27286421274766326, "reward_before_std": 0.6472878158092499, "reward_change_max": 0.0005966871976852417, "reward_change_mean": -0.04057432198897004, "reward_change_min": -0.10668485425412655, "reward_change_std": 0.04195940599311143, "reward_std": 0.6433587614446878, "rewards/cosine_scaled_reward": -0.19893210940063, "rewards/format_reward": 0.1250000037252903, "step": 184 }, { "advantage_max": 1.1658845506608486, "advantage_mean": -1.4280279847511679e-08, "advantage_min": -0.6802770979702473, "advantage_std": 0.744084857404232, "completion_length": 2459.6875534057617, "epoch": 0.21142857142857144, "grad_norm": 0.35991156101226807, "kl": 0.111419677734375, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.145033635316128e-07, "loss": 0.0663, "reward": -0.05943065322935581, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05943065322935581, "reward_after_std": 0.7440848480910063, "reward_before_mean": 0.004029544070363045, "reward_before_std": 0.7564304284751415, "reward_change_max": 0.0, "reward_change_mean": -0.06346021476201713, "reward_change_min": -0.1444684062153101, "reward_change_std": 0.05615873821079731, "reward_std": 0.7440848704427481, "rewards/cosine_scaled_reward": -0.11256856098771095, "rewards/format_reward": 0.22916666977107525, "step": 185 }, { "advantage_max": 0.811072587966919, "advantage_mean": 2.17308601113686e-08, "advantage_min": -0.6112252026796341, "advantage_std": 0.6088230907917023, "completion_length": 2535.125045776367, "epoch": 0.21257142857142858, "grad_norm": 0.3562098443508148, "kl": 0.129058837890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.119553365707802e-07, "loss": -0.011, "reward": 0.07544249296188354, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07544249296188354, "reward_after_std": 0.608823087066412, "reward_before_mean": 0.15811622887849808, "reward_before_std": 0.6218390204012394, "reward_change_max": 0.00039686262607574463, "reward_change_mean": -0.08267371519468725, "reward_change_min": -0.16685685515403748, "reward_change_std": 0.06896232924191281, "reward_std": 0.6088231094181538, "rewards/cosine_scaled_reward": -0.01469188928604126, "rewards/format_reward": 0.1875000037252903, "step": 186 }, { "advantage_max": 1.175539281219244, "advantage_mean": 1.5522043539384356e-08, "advantage_min": -0.8305178135633469, "advantage_std": 0.783175390213728, "completion_length": 2573.791732788086, "epoch": 0.21371428571428572, "grad_norm": 0.33450013399124146, "kl": 0.130859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.093945422764069e-07, "loss": -0.005, "reward": -0.05765972752124071, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.05765972752124071, "reward_after_std": 0.783175390213728, "reward_before_mean": 0.0057212114334106445, "reward_before_std": 0.8013257309794426, "reward_change_max": 0.00036757439374923706, "reward_change_mean": -0.06338093103840947, "reward_change_min": -0.15380204655230045, "reward_change_std": 0.061657859245315194, "reward_std": 0.7831753939390182, "rewards/cosine_scaled_reward": -0.10130605194717646, "rewards/format_reward": 0.2083333395421505, "step": 187 }, { "advantage_max": 0.6556166559457779, "advantage_mean": 5.587935225648266e-09, "advantage_min": -0.505927637219429, "advantage_std": 0.4643473494797945, "completion_length": 3036.4583435058594, "epoch": 0.21485714285714286, "grad_norm": 0.2525724470615387, "kl": 0.1484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.068211054579943e-07, "loss": 0.0062, "reward": -0.31691102124750614, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.31691102124750614, "reward_after_std": 0.46434734389185905, "reward_before_mean": -0.26725206710398197, "reward_before_std": 0.47667811438441277, "reward_change_max": 0.0005693808197975159, "reward_change_mean": -0.04965895973145962, "reward_change_min": -0.11332780588418245, "reward_change_std": 0.04475319117773324, "reward_std": 0.4643473718315363, "rewards/cosine_scaled_reward": -0.15445936284959316, "rewards/format_reward": 0.0416666679084301, "step": 188 }, { "advantage_max": 0.8341766893863678, "advantage_mean": -9.313224635931761e-10, "advantage_min": -0.7918973080813885, "advantage_std": 0.60749601572752, "completion_length": 2524.3958587646484, "epoch": 0.216, "grad_norm": 0.2288750410079956, "kl": 0.1602783203125, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.04235151541222e-07, "loss": 0.0357, "reward": 0.26920123770833015, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.26920123770833015, "reward_after_std": 0.6074960138648748, "reward_before_mean": 0.36930515244603157, "reward_before_std": 0.6160230301320553, "reward_change_max": 0.00011741369962692261, "reward_change_mean": -0.10010391753166914, "reward_change_min": -0.17794792912900448, "reward_change_std": 0.06970303924754262, "reward_std": 0.6074960231781006, "rewards/cosine_scaled_reward": -0.013264082372188568, "rewards/format_reward": 0.3958333507180214, "step": 189 }, { "advantage_max": 0.8611217215657234, "advantage_mean": 4.967053768289986e-09, "advantage_min": -0.8677000142633915, "advantage_std": 0.6190670412033796, "completion_length": 2455.5833587646484, "epoch": 0.21714285714285714, "grad_norm": 0.1976422667503357, "kl": 0.13385009765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 8.01636806561836e-07, "loss": 0.0189, "reward": 0.12271896377205849, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.12271896377205849, "reward_after_std": 0.6190670412033796, "reward_before_mean": 0.2086525820195675, "reward_before_std": 0.6303278524428606, "reward_change_max": 0.0020866915583610535, "reward_change_mean": -0.08593360241502523, "reward_change_min": -0.15425695106387138, "reward_change_std": 0.0642830905271694, "reward_std": 0.6190670430660248, "rewards/cosine_scaled_reward": -0.04150705365464091, "rewards/format_reward": 0.29166667722165585, "step": 190 }, { "advantage_max": 0.8121273927390575, "advantage_mean": -1.3659397779530735e-08, "advantage_min": -0.6374476179480553, "advantage_std": 0.5626640729606152, "completion_length": 2784.687545776367, "epoch": 0.21828571428571428, "grad_norm": 0.2620941698551178, "kl": 0.16229248046875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.990261971595048e-07, "loss": 0.0151, "reward": 0.05612104572355747, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.05612104572355747, "reward_after_std": 0.5626640692353249, "reward_before_mean": 0.13682471262291074, "reward_before_std": 0.56803297996521, "reward_change_max": 0.00027626752853393555, "reward_change_mean": -0.08070366689935327, "reward_change_min": -0.15503592789173126, "reward_change_std": 0.0627133441157639, "reward_std": 0.5626640915870667, "rewards/cosine_scaled_reward": -0.05658765323460102, "rewards/format_reward": 0.25000000186264515, "step": 191 }, { "advantage_max": 0.9729629419744015, "advantage_mean": 1.1796752907855534e-08, "advantage_min": -0.7198412269353867, "advantage_std": 0.6481819711625576, "completion_length": 3039.5208740234375, "epoch": 0.21942857142857142, "grad_norm": 0.27418825030326843, "kl": 0.13934326171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.964034505716476e-07, "loss": 0.0349, "reward": -0.16575850173830986, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.16575850173830986, "reward_after_std": 0.648181963711977, "reward_before_mean": -0.10841276589781046, "reward_before_std": 0.6632645688951015, "reward_change_max": 0.00047516077756881714, "reward_change_mean": -0.057345751440152526, "reward_change_min": -0.13206746894866228, "reward_change_std": 0.05364243150688708, "reward_std": 0.6481819748878479, "rewards/cosine_scaled_reward": -0.1479563768953085, "rewards/format_reward": 0.18750000558793545, "step": 192 }, { "advantage_max": 0.9263895452022552, "advantage_mean": 2.9181441985048906e-08, "advantage_min": -0.5460419282317162, "advantage_std": 0.5525044910609722, "completion_length": 3144.125045776367, "epoch": 0.22057142857142858, "grad_norm": 0.27585750818252563, "kl": 0.16455078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.93768694627233e-07, "loss": 0.0501, "reward": -0.3938119038939476, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3938119038939476, "reward_after_std": 0.5525044947862625, "reward_before_mean": -0.3569503426551819, "reward_before_std": 0.5581550113856792, "reward_change_max": 0.0011642500758171082, "reward_change_mean": -0.0368615499464795, "reward_change_min": -0.07977760676294565, "reward_change_std": 0.03481626545544714, "reward_std": 0.5525045059621334, "rewards/cosine_scaled_reward": -0.2618085015565157, "rewards/format_reward": 0.1666666716337204, "step": 193 }, { "advantage_max": 1.1692192666232586, "advantage_mean": -1.9868215517249155e-08, "advantage_min": -0.9652174562215805, "advantage_std": 0.8269610479474068, "completion_length": 2549.2708740234375, "epoch": 0.22171428571428572, "grad_norm": 0.5415308475494385, "kl": 0.110260009765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.911220577405484e-07, "loss": 0.0546, "reward": 0.4768530046567321, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4768530046567321, "reward_after_std": 0.8269610404968262, "reward_before_mean": 0.5903498325496912, "reward_before_std": 0.8395131379365921, "reward_change_max": 0.0006142705678939819, "reward_change_mean": -0.11349680949933827, "reward_change_min": -0.21761109866201878, "reward_change_std": 0.09163685445673764, "reward_std": 0.826961062848568, "rewards/cosine_scaled_reward": 0.07642490416765213, "rewards/format_reward": 0.4375, "step": 194 }, { "advantage_max": 1.4493412896990776, "advantage_mean": -2.1730860888524717e-08, "advantage_min": -1.0076495185494423, "advantage_std": 0.9671072959899902, "completion_length": 2750.6250610351562, "epoch": 0.22285714285714286, "grad_norm": 1.0602513551712036, "kl": 0.14349365234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.884636689049422e-07, "loss": 0.1005, "reward": 0.08260958641767502, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08260958641767502, "reward_after_std": 0.9671072959899902, "reward_before_mean": 0.15350980064249597, "reward_before_std": 0.9919899739325047, "reward_change_max": 0.00022814422845840454, "reward_change_mean": -0.07090024533681571, "reward_change_min": -0.17355335224419832, "reward_change_std": 0.0735201274510473, "reward_std": 0.9671073220670223, "rewards/cosine_scaled_reward": -0.05866176821291447, "rewards/format_reward": 0.2708333395421505, "step": 195 }, { "advantage_max": 1.2421837113797665, "advantage_mean": -3.1664969646350016e-08, "advantage_min": -0.8666118606925011, "advantage_std": 0.7809932995587587, "completion_length": 2964.3333740234375, "epoch": 0.224, "grad_norm": 0.564656674861908, "kl": 0.175201416015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.857936576865356e-07, "loss": 0.0615, "reward": 0.04462842829525471, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04462842829525471, "reward_after_std": 0.7809933163225651, "reward_before_mean": 0.11695368587970734, "reward_before_std": 0.7898044027388096, "reward_change_max": 0.0002730339765548706, "reward_change_mean": -0.07232528855092824, "reward_change_min": -0.14141991455107927, "reward_change_std": 0.05830873898230493, "reward_std": 0.780993327498436, "rewards/cosine_scaled_reward": -0.0873564900830388, "rewards/format_reward": 0.29166667722165585, "step": 196 }, { "advantage_max": 1.8417157232761383, "advantage_mean": -2.359350548264416e-08, "advantage_min": -1.2466395795345306, "advantage_std": 1.2031425386667252, "completion_length": 2360.041702270508, "epoch": 0.22514285714285714, "grad_norm": 0.5715523958206177, "kl": 0.1849365234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.831121542179086e-07, "loss": 0.0325, "reward": 0.5395329678431153, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5395329678431153, "reward_after_std": 1.2031425312161446, "reward_before_mean": 0.6457790993154049, "reward_before_std": 1.2249048128724098, "reward_change_max": 0.00048523396253585815, "reward_change_mean": -0.10624613519757986, "reward_change_min": -0.24607283994555473, "reward_change_std": 0.0999097554013133, "reward_std": 1.2031425833702087, "rewards/cosine_scaled_reward": 0.1249728761613369, "rewards/format_reward": 0.3958333395421505, "step": 197 }, { "advantage_max": 0.6936135701835155, "advantage_mean": 6.208814573582799e-10, "advantage_min": -0.8528610952198505, "advantage_std": 0.556768286973238, "completion_length": 2614.0625762939453, "epoch": 0.22628571428571428, "grad_norm": 0.4623723328113556, "kl": 0.2156982421875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.804192891917571e-07, "loss": -0.0034, "reward": 0.06352788954973221, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06352788954973221, "reward_after_std": 0.5567682851105928, "reward_before_mean": 0.1473788940347731, "reward_before_std": 0.5750712193548679, "reward_change_max": 0.0010342374444007874, "reward_change_mean": -0.0838509879540652, "reward_change_min": -0.14691872242838144, "reward_change_std": 0.06345773441717029, "reward_std": 0.556768300011754, "rewards/cosine_scaled_reward": -0.08256056532263756, "rewards/format_reward": 0.31250000558793545, "step": 198 }, { "advantage_max": 1.3256859108805656, "advantage_mean": 8.07146260939362e-09, "advantage_min": -0.6804698929190636, "advantage_std": 0.7932882234454155, "completion_length": 2771.416717529297, "epoch": 0.22742857142857142, "grad_norm": 0.39074909687042236, "kl": 0.2138671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.777151938545235e-07, "loss": 0.0087, "reward": -0.22427453845739365, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.22427453845739365, "reward_after_std": 0.7932882159948349, "reward_before_mean": -0.1793069913983345, "reward_before_std": 0.8059480749070644, "reward_change_max": 0.002735838294029236, "reward_change_mean": -0.044967556837946177, "reward_change_min": -0.13585915230214596, "reward_change_std": 0.053579527186229825, "reward_std": 0.7932882308959961, "rewards/cosine_scaled_reward": -0.1625701580196619, "rewards/format_reward": 0.14583333767950535, "step": 199 }, { "advantage_max": 1.051039144396782, "advantage_mean": -1.5522042928761692e-08, "advantage_min": -0.7579713463783264, "advantage_std": 0.6761558391153812, "completion_length": 2306.9583740234375, "epoch": 0.22857142857142856, "grad_norm": 0.4172566831111908, "kl": 0.211669921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.75e-07, "loss": -0.008, "reward": 0.3839730089530349, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3839730089530349, "reward_after_std": 0.6761558391153812, "reward_before_mean": 0.4907239656895399, "reward_before_std": 0.671320891007781, "reward_change_max": 0.000662676990032196, "reward_change_mean": -0.10675097140483558, "reward_change_min": -0.18421044945716858, "reward_change_std": 0.07500778371468186, "reward_std": 0.6761558465659618, "rewards/cosine_scaled_reward": 0.01619531214237213, "rewards/format_reward": 0.4583333432674408, "step": 200 }, { "advantage_max": 1.002537775784731, "advantage_mean": -4.594524793954946e-08, "advantage_min": -0.9626957811415195, "advantage_std": 0.7547497116029263, "completion_length": 2121.229179382324, "epoch": 0.2297142857142857, "grad_norm": 0.3952004015445709, "kl": 0.17352294921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.72273839962904e-07, "loss": -0.0059, "reward": 0.8897911142557859, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8897911142557859, "reward_after_std": 0.7547497153282166, "reward_before_mean": 1.044318351894617, "reward_before_std": 0.7571042329072952, "reward_change_max": 0.0001737847924232483, "reward_change_mean": -0.15452725533396006, "reward_change_min": -0.2652826514095068, "reward_change_std": 0.1033217788208276, "reward_std": 0.7547497302293777, "rewards/cosine_scaled_reward": 0.25132583547383547, "rewards/format_reward": 0.5416666697710752, "step": 201 }, { "advantage_max": 0.7856088727712631, "advantage_mean": -4.5945247717504856e-08, "advantage_min": -0.6831889897584915, "advantage_std": 0.5087563134729862, "completion_length": 2350.479202270508, "epoch": 0.23085714285714284, "grad_norm": 0.30211445689201355, "kl": 0.24383544921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.695368466124296e-07, "loss": 0.0061, "reward": 0.8790129721164703, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8790129721164703, "reward_after_std": 0.5087563041597605, "reward_before_mean": 1.0367812784388661, "reward_before_std": 0.4771445747464895, "reward_change_max": 0.00018440932035446167, "reward_change_mean": -0.1577683356590569, "reward_change_min": -0.23149394802749157, "reward_change_std": 0.08965065004304051, "reward_std": 0.5087563060224056, "rewards/cosine_scaled_reward": 0.24755729362368584, "rewards/format_reward": 0.5416666734963655, "step": 202 }, { "advantage_max": 1.0101400669664145, "advantage_mean": 7.45058115203534e-09, "advantage_min": -0.6545524224638939, "advantage_std": 0.6288202814757824, "completion_length": 2820.7291870117188, "epoch": 0.232, "grad_norm": 0.4497270882129669, "kl": 0.27264404296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.667891533457718e-07, "loss": 0.0651, "reward": 0.19614388910122216, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19614388910122216, "reward_after_std": 0.6288202852010727, "reward_before_mean": 0.28668341506272554, "reward_before_std": 0.6246930155903101, "reward_change_max": 0.0, "reward_change_mean": -0.09053952293470502, "reward_change_min": -0.16773551888763905, "reward_change_std": 0.06451876182109118, "reward_std": 0.6288202926516533, "rewards/cosine_scaled_reward": 0.04959171311929822, "rewards/format_reward": 0.18750000186264515, "step": 203 }, { "advantage_max": 0.8654856495559216, "advantage_mean": -1.0554989493538613e-08, "advantage_min": -0.9278236515820026, "advantage_std": 0.6752170845866203, "completion_length": 2224.7083892822266, "epoch": 0.23314285714285715, "grad_norm": 0.5278440713882446, "kl": 0.205322265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.640308940816239e-07, "loss": 0.0108, "reward": 0.5590728987008333, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5590728987008333, "reward_after_std": 0.6752170845866203, "reward_before_mean": 0.6843808209523559, "reward_before_std": 0.6862643286585808, "reward_change_max": 0.001576930284500122, "reward_change_mean": -0.1253079129382968, "reward_change_min": -0.2180102663114667, "reward_change_std": 0.08850083267316222, "reward_std": 0.6752171032130718, "rewards/cosine_scaled_reward": 0.008857070934027433, "rewards/format_reward": 0.666666679084301, "step": 204 }, { "advantage_max": 1.3751740790903568, "advantage_mean": -8.071461887748654e-09, "advantage_min": -1.3816149085760117, "advantage_std": 1.0784570425748825, "completion_length": 2667.6459197998047, "epoch": 0.2342857142857143, "grad_norm": 0.509384036064148, "kl": 0.229400634765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.612622032536507e-07, "loss": 0.032, "reward": 0.587292967364192, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.587292967364192, "reward_after_std": 1.078457035124302, "reward_before_mean": 0.7071726471185684, "reward_before_std": 1.1184897869825363, "reward_change_max": 8.384138345718384e-05, "reward_change_mean": -0.11987964902073145, "reward_change_min": -0.24095893651247025, "reward_change_std": 0.10655779344961047, "reward_std": 1.0784570574760437, "rewards/cosine_scaled_reward": 0.176502981223166, "rewards/format_reward": 0.35416667722165585, "step": 205 }, { "advantage_max": 1.5005393326282501, "advantage_mean": 6.208817127095756e-09, "advantage_min": -0.8025929555296898, "advantage_std": 0.8781850822269917, "completion_length": 3067.854248046875, "epoch": 0.23542857142857143, "grad_norm": 0.41775602102279663, "kl": 0.2396240234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.584832158039378e-07, "loss": 0.0312, "reward": 0.10771947354078293, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10771947354078293, "reward_after_std": 0.8781850971281528, "reward_before_mean": 0.18033906631171703, "reward_before_std": 0.8780070766806602, "reward_change_max": 2.4966895580291748e-05, "reward_change_mean": -0.07261957018636167, "reward_change_min": -0.1609745966270566, "reward_change_std": 0.061680351849645376, "reward_std": 0.8781851306557655, "rewards/cosine_scaled_reward": -0.07649714685976505, "rewards/format_reward": 0.33333333767950535, "step": 206 }, { "advantage_max": 0.859190009534359, "advantage_mean": 3.725290353973065e-09, "advantage_min": -0.7211577333509922, "advantage_std": 0.5771542005240917, "completion_length": 3049.291717529297, "epoch": 0.23657142857142857, "grad_norm": 0.35315126180648804, "kl": 0.32470703125, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.556940671764124e-07, "loss": 0.0438, "reward": -0.008801928721368313, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.008801928721368313, "reward_after_std": 0.5771542023867369, "reward_before_mean": 0.06464581179898232, "reward_before_std": 0.5813978333026171, "reward_change_max": 0.0005838200449943542, "reward_change_mean": -0.07344775041565299, "reward_change_min": -0.14188366383314133, "reward_change_std": 0.05655882274731994, "reward_std": 0.5771542396396399, "rewards/cosine_scaled_reward": -0.13434376008808613, "rewards/format_reward": 0.3333333432674408, "step": 207 }, { "advantage_max": 1.1458739191293716, "advantage_mean": -1.8005571034152013e-08, "advantage_min": -0.638024490326643, "advantage_std": 0.6989194564521313, "completion_length": 2599.7500610351562, "epoch": 0.2377142857142857, "grad_norm": 0.6301652789115906, "kl": 0.255859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.528948933102438e-07, "loss": 0.0427, "reward": 0.1499016471207142, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1499016471207142, "reward_after_std": 0.6989194788038731, "reward_before_mean": 0.23315771110355854, "reward_before_std": 0.6953848674893379, "reward_change_max": 0.0002986416220664978, "reward_change_mean": -0.08325609937310219, "reward_change_min": -0.16083112079650164, "reward_change_std": 0.06415085797198117, "reward_std": 0.6989195011556149, "rewards/cosine_scaled_reward": -0.0917544849216938, "rewards/format_reward": 0.4166666716337204, "step": 208 }, { "advantage_max": 0.9560815170407295, "advantage_mean": -1.3038516155639002e-08, "advantage_min": -0.669091060757637, "advantage_std": 0.6263339519500732, "completion_length": 2470.625045776367, "epoch": 0.23885714285714285, "grad_norm": 0.47854506969451904, "kl": 0.2359619140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.500858306332172e-07, "loss": -0.0121, "reward": 0.574387613683939, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.574387613683939, "reward_after_std": 0.6263339295983315, "reward_before_mean": 0.7000777684152126, "reward_before_std": 0.612229947000742, "reward_change_max": 0.0, "reward_change_mean": -0.12569017568603158, "reward_change_min": -0.2115925382822752, "reward_change_std": 0.08066713158041239, "reward_std": 0.6263339556753635, "rewards/cosine_scaled_reward": 0.07920554932206869, "rewards/format_reward": 0.5416666828095913, "step": 209 }, { "advantage_max": 0.6764157526195049, "advantage_mean": 2.5456150853919723e-08, "advantage_min": -0.6963140219449997, "advantage_std": 0.5000756457448006, "completion_length": 2879.4583740234375, "epoch": 0.24, "grad_norm": 0.2873426675796509, "kl": 0.24578857421875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.472670160550848e-07, "loss": 0.027, "reward": 0.2614448294043541, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2614448294043541, "reward_after_std": 0.50007563829422, "reward_before_mean": 0.364066656678915, "reward_before_std": 0.4979663249105215, "reward_change_max": 0.00027251243591308594, "reward_change_mean": -0.10262180212885141, "reward_change_min": -0.1705446420237422, "reward_change_std": 0.06758826458826661, "reward_std": 0.5000756569206715, "rewards/cosine_scaled_reward": 0.00494999997317791, "rewards/format_reward": 0.3541666753590107, "step": 210 }, { "advantage_max": 1.1394590884447098, "advantage_mean": -8.6923440667519e-09, "advantage_min": -0.8035576418042183, "advantage_std": 0.7197157442569733, "completion_length": 2390.9791870117188, "epoch": 0.24114285714285713, "grad_norm": 0.29365745186805725, "kl": 0.21905517578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.444385869608921e-07, "loss": 0.001, "reward": 0.21905515156686306, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.21905515156686306, "reward_after_std": 0.719715740531683, "reward_before_mean": 0.3086434584110975, "reward_before_std": 0.723445076495409, "reward_change_max": 0.00035256147384643555, "reward_change_mean": -0.08958828420145437, "reward_change_min": -0.16625287476927042, "reward_change_std": 0.06521394196897745, "reward_std": 0.7197157703340054, "rewards/cosine_scaled_reward": -0.043594954535365105, "rewards/format_reward": 0.3958333507180214, "step": 211 }, { "advantage_max": 1.094607064500451, "advantage_mean": -1.117587122845265e-08, "advantage_min": -0.6028978629037738, "advantage_std": 0.6431901277974248, "completion_length": 2397.0833892822266, "epoch": 0.2422857142857143, "grad_norm": 0.32407593727111816, "kl": 0.22100830078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.416006812042827e-07, "loss": 0.0192, "reward": 0.5966612412594259, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5966612412594259, "reward_after_std": 0.6431901091709733, "reward_before_mean": 0.7225769180804491, "reward_before_std": 0.6182768186554313, "reward_change_max": 0.0001141279935836792, "reward_change_mean": -0.12591569544747472, "reward_change_min": -0.20083122700452805, "reward_change_std": 0.07545896037481725, "reward_std": 0.6431901175528765, "rewards/cosine_scaled_reward": 0.06962178461253643, "rewards/format_reward": 0.5833333395421505, "step": 212 }, { "advantage_max": 1.282714981585741, "advantage_mean": -9.313226023710541e-10, "advantage_min": -1.1995286270976067, "advantage_std": 0.9880655072629452, "completion_length": 2257.3750610351562, "epoch": 0.24342857142857144, "grad_norm": 0.42579054832458496, "kl": 0.225311279296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.387534371007797e-07, "loss": 0.0334, "reward": 0.43806671584025025, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.43806671584025025, "reward_after_std": 0.9880655221641064, "reward_before_mean": 0.5443241535685956, "reward_before_std": 1.0230080299079418, "reward_change_max": 0.0003249123692512512, "reward_change_mean": -0.1062574377283454, "reward_change_min": -0.23165474273264408, "reward_change_std": 0.09968985430896282, "reward_std": 0.9880655538290739, "rewards/cosine_scaled_reward": 0.0013287439942359924, "rewards/format_reward": 0.5416666734963655, "step": 213 }, { "advantage_max": 1.1717126108705997, "advantage_mean": 6.053596915411852e-09, "advantage_min": -1.053213369101286, "advantage_std": 0.842260368168354, "completion_length": 2521.250045776367, "epoch": 0.24457142857142858, "grad_norm": 0.41738176345825195, "kl": 0.2025146484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.358969934210438e-07, "loss": 0.0276, "reward": 0.4556270924513228, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4556270924513228, "reward_after_std": 0.8422603793442249, "reward_before_mean": 0.5666961893439293, "reward_before_std": 0.8589375950396061, "reward_change_max": 0.0005218684673309326, "reward_change_mean": -0.11106909299269319, "reward_change_min": -0.21090741362422705, "reward_change_std": 0.08776850858703256, "reward_std": 0.8422603905200958, "rewards/cosine_scaled_reward": 0.0020980946719646454, "rewards/format_reward": 0.5625000149011612, "step": 214 }, { "advantage_max": 0.7589559145271778, "advantage_mean": 2.048909669705168e-08, "advantage_min": -0.7794839665293694, "advantage_std": 0.5524455606937408, "completion_length": 2386.7708854675293, "epoch": 0.24571428571428572, "grad_norm": 0.28002220392227173, "kl": 0.220703125, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.330314893841101e-07, "loss": 0.0386, "reward": 0.11122296750545502, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11122296750545502, "reward_after_std": 0.5524455606937408, "reward_before_mean": 0.19726988906040788, "reward_before_std": 0.5620923303067684, "reward_change_max": 0.00015363842248916626, "reward_change_mean": -0.08604689175263047, "reward_change_min": -0.15219917241483927, "reward_change_std": 0.062479326501488686, "reward_std": 0.5524455681443214, "rewards/cosine_scaled_reward": -0.0992817347869277, "rewards/format_reward": 0.39583334140479565, "step": 215 }, { "advantage_max": 1.3616463989019394, "advantage_mean": -1.862645188088763e-08, "advantage_min": -0.9499138370156288, "advantage_std": 0.8744436055421829, "completion_length": 2387.5625762939453, "epoch": 0.24685714285714286, "grad_norm": 0.42455509305000305, "kl": 0.2247314453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.301570646506027e-07, "loss": 0.0104, "reward": 0.495633814483881, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.495633814483881, "reward_after_std": 0.8744436204433441, "reward_before_mean": 0.6068383371457458, "reward_before_std": 0.8772630579769611, "reward_change_max": 0.0, "reward_change_mean": -0.11120450543239713, "reward_change_min": -0.20330642815679312, "reward_change_std": 0.07988221733830869, "reward_std": 0.8744436576962471, "rewards/cosine_scaled_reward": -0.009080850519239902, "rewards/format_reward": 0.6250000093132257, "step": 216 }, { "advantage_max": 1.5420643910765648, "advantage_mean": 1.2417634476236117e-08, "advantage_min": -1.0352942943572998, "advantage_std": 1.060675971210003, "completion_length": 2722.8125610351562, "epoch": 0.248, "grad_norm": 0.7412684559822083, "kl": 0.2513427734375, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.27273859315928e-07, "loss": 0.0535, "reward": 0.6235943953506649, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6235943953506649, "reward_after_std": 1.0606759525835514, "reward_before_mean": 0.7431438378989697, "reward_before_std": 1.0822565481066704, "reward_change_max": 2.171844244003296e-05, "reward_change_mean": -0.11954941879957914, "reward_change_min": -0.25604639016091824, "reward_change_std": 0.09751309640705585, "reward_std": 1.0606759898364544, "rewards/cosine_scaled_reward": 0.09032191522419453, "rewards/format_reward": 0.5625000055879354, "step": 217 }, { "advantage_max": 1.2730883322656155, "advantage_mean": -2.4214387217558198e-08, "advantage_min": -0.7694863900542259, "advantage_std": 0.7907967679202557, "completion_length": 2718.5833587646484, "epoch": 0.24914285714285714, "grad_norm": 0.3669975996017456, "kl": 0.41796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.243820139034464e-07, "loss": 0.0147, "reward": 0.1338215246796608, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1338215246796608, "reward_after_std": 0.7907967641949654, "reward_before_mean": 0.21257536113262177, "reward_before_std": 0.7941794954240322, "reward_change_max": 6.996095180511475e-06, "reward_change_mean": -0.07875384530052543, "reward_change_min": -0.1696187872439623, "reward_change_std": 0.06339895771816373, "reward_std": 0.7907967828214169, "rewards/cosine_scaled_reward": -0.09162899415241554, "rewards/format_reward": 0.39583334513008595, "step": 218 }, { "advantage_max": 1.155998595058918, "advantage_mean": -5.2154065066645217e-08, "advantage_min": -1.1375319361686707, "advantage_std": 0.8279825672507286, "completion_length": 2491.791763305664, "epoch": 0.2502857142857143, "grad_norm": 0.8123669624328613, "kl": 0.228179931640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.214816693576234e-07, "loss": 0.0692, "reward": 0.6602697218768299, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6602697218768299, "reward_after_std": 0.8279825747013092, "reward_before_mean": 0.7905087154358625, "reward_before_std": 0.8417788743972778, "reward_change_max": 0.0013681799173355103, "reward_change_mean": -0.13023906177841127, "reward_change_min": -0.2316719852387905, "reward_change_std": 0.09396075457334518, "reward_std": 0.8279825821518898, "rewards/cosine_scaled_reward": 0.030671026557683945, "rewards/format_reward": 0.7291666846722364, "step": 219 }, { "advantage_max": 0.7060927897691727, "advantage_mean": 8.07146260939362e-09, "advantage_min": -0.6109671518206596, "advantage_std": 0.47745291143655777, "completion_length": 2906.229217529297, "epoch": 0.25142857142857145, "grad_norm": 0.3627898395061493, "kl": 0.3309326171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.185729670371604e-07, "loss": 0.0493, "reward": -0.19565529376268387, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.19565529376268387, "reward_after_std": 0.47745292633771896, "reward_before_mean": -0.13670195546001196, "reward_before_std": 0.48440743423998356, "reward_change_max": 0.00014778226613998413, "reward_change_mean": -0.05895334016531706, "reward_change_min": -0.1121221724897623, "reward_change_std": 0.04572835494764149, "reward_std": 0.477452939376235, "rewards/cosine_scaled_reward": -0.24543431401252747, "rewards/format_reward": 0.35416667349636555, "step": 220 }, { "advantage_max": 1.122801061719656, "advantage_mean": -3.7873785246889113e-08, "advantage_min": -0.9494878984987736, "advantage_std": 0.7546558827161789, "completion_length": 2318.7083740234375, "epoch": 0.25257142857142856, "grad_norm": 0.3165279030799866, "kl": 0.25885009765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.156560487081051e-07, "loss": 0.042, "reward": 0.46347523806616664, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.46347523806616664, "reward_after_std": 0.7546558603644371, "reward_before_mean": 0.5765351159498096, "reward_before_std": 0.7582212314009666, "reward_change_max": 0.0001964569091796875, "reward_change_mean": -0.11305988254025578, "reward_change_min": -0.189128577709198, "reward_change_std": 0.07670311396941543, "reward_std": 0.7546558678150177, "rewards/cosine_scaled_reward": 0.038267549593001604, "rewards/format_reward": 0.5000000055879354, "step": 221 }, { "advantage_max": 1.036661870777607, "advantage_mean": 5.58793539218172e-09, "advantage_min": -1.0472398027777672, "advantage_std": 0.75783945992589, "completion_length": 2533.145881652832, "epoch": 0.2537142857142857, "grad_norm": 0.44233042001724243, "kl": 0.31976318359375, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.127310565369415e-07, "loss": 0.0246, "reward": 0.4872835408896208, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4872835408896208, "reward_after_std": 0.7578394636511803, "reward_before_mean": 0.60423843562603, "reward_before_std": 0.7705888003110886, "reward_change_max": 0.00047094374895095825, "reward_change_mean": -0.11695488588884473, "reward_change_min": -0.21127595845609903, "reward_change_std": 0.08577916398644447, "reward_std": 0.7578394673764706, "rewards/cosine_scaled_reward": -0.03121412917971611, "rewards/format_reward": 0.6666666772216558, "step": 222 }, { "advantage_max": 0.7724505327641964, "advantage_mean": -5.587935669737476e-09, "advantage_min": -0.7214891128242016, "advantage_std": 0.60935864970088, "completion_length": 2594.416717529297, "epoch": 0.25485714285714284, "grad_norm": 0.3700931966304779, "kl": 0.373779296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.097981330836616e-07, "loss": 0.0301, "reward": 0.4279163293540478, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4279163293540478, "reward_after_std": 0.60935864970088, "reward_before_mean": 0.5440852921456099, "reward_before_std": 0.6187305934727192, "reward_change_max": 0.00026647746562957764, "reward_change_mean": -0.11616896372288465, "reward_change_min": -0.20987431332468987, "reward_change_std": 0.08221541903913021, "reward_std": 0.60935864970088, "rewards/cosine_scaled_reward": -0.04045736789703369, "rewards/format_reward": 0.6250000111758709, "step": 223 }, { "advantage_max": 0.9514374248683453, "advantage_mean": -2.1730860055857448e-08, "advantage_min": -0.6713517233729362, "advantage_std": 0.6334418430924416, "completion_length": 3012.479232788086, "epoch": 0.256, "grad_norm": 0.5133031010627747, "kl": 0.428955078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.068574212948169e-07, "loss": 0.0498, "reward": 0.3854692354798317, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3854692354798317, "reward_after_std": 0.6334418393671513, "reward_before_mean": 0.4944905759766698, "reward_before_std": 0.6296762898564339, "reward_change_max": 0.0, "reward_change_mean": -0.10902136843651533, "reward_change_min": -0.19977211859077215, "reward_change_std": 0.07383694127202034, "reward_std": 0.633441861718893, "rewards/cosine_scaled_reward": 0.03891195636242628, "rewards/format_reward": 0.41666667722165585, "step": 224 }, { "advantage_max": 0.8476278707385063, "advantage_mean": -1.0554989937627823e-08, "advantage_min": -0.9130581766366959, "advantage_std": 0.6610419899225235, "completion_length": 2864.3333740234375, "epoch": 0.2571428571428571, "grad_norm": 0.6178968548774719, "kl": 0.413818359375, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.039090644965509e-07, "loss": 0.0233, "reward": 0.24999480694532394, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.24999480694532394, "reward_after_std": 0.6610419973731041, "reward_before_mean": 0.3482043823460117, "reward_before_std": 0.6777895800769329, "reward_change_max": 0.0002451390027999878, "reward_change_mean": -0.09820958506315947, "reward_change_min": -0.17957793455570936, "reward_change_std": 0.0756394388154149, "reward_std": 0.6610420010983944, "rewards/cosine_scaled_reward": -0.07589781284332275, "rewards/format_reward": 0.5000000055879354, "step": 225 }, { "advantage_max": 0.9860688522458076, "advantage_mean": -1.3659397779530735e-08, "advantage_min": -0.8861872833222151, "advantage_std": 0.7225923500955105, "completion_length": 2687.875045776367, "epoch": 0.2582857142857143, "grad_norm": 0.42785775661468506, "kl": 0.431304931640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 7.009532063876148e-07, "loss": 0.0251, "reward": 0.3786590788513422, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3786590788513422, "reward_after_std": 0.7225923649966717, "reward_before_mean": 0.48628670489415526, "reward_before_std": 0.7363221980631351, "reward_change_max": 0.00018220394849777222, "reward_change_mean": -0.10762762138620019, "reward_change_min": -0.20438862685114145, "reward_change_std": 0.08181980717927217, "reward_std": 0.7225923947989941, "rewards/cosine_scaled_reward": 0.034810012206435204, "rewards/format_reward": 0.41666667722165585, "step": 226 }, { "advantage_max": 1.3538540825247765, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -1.0006621703505516, "advantage_std": 0.8676325753331184, "completion_length": 2686.437530517578, "epoch": 0.25942857142857145, "grad_norm": 0.5147333741188049, "kl": 0.4132080078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.979899910323624e-07, "loss": 0.0256, "reward": 0.4257585988380015, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4257585988380015, "reward_after_std": 0.8676326051354408, "reward_before_mean": 0.5303960796445608, "reward_before_std": 0.8729932010173798, "reward_change_max": 0.0, "reward_change_mean": -0.10463747382164001, "reward_change_min": -0.2075261939316988, "reward_change_std": 0.08033762080594897, "reward_std": 0.8676326274871826, "rewards/cosine_scaled_reward": -0.047301971819251776, "rewards/format_reward": 0.6250000204890966, "step": 227 }, { "advantage_max": 0.8109984621405602, "advantage_mean": -3.91155482448724e-08, "advantage_min": -0.8960036411881447, "advantage_std": 0.6084200330078602, "completion_length": 2570.0000610351562, "epoch": 0.26057142857142856, "grad_norm": 0.45605042576789856, "kl": 0.366058349609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.950195628537299e-07, "loss": 0.0226, "reward": 0.5789503455162048, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5789503455162048, "reward_after_std": 0.6084200479090214, "reward_before_mean": 0.7085773483850062, "reward_before_std": 0.6109695844352245, "reward_change_max": 0.00010479241609573364, "reward_change_mean": -0.12962702754884958, "reward_change_min": -0.21464707050472498, "reward_change_std": 0.08285582205280662, "reward_std": 0.6084200702607632, "rewards/cosine_scaled_reward": 0.08345534279942513, "rewards/format_reward": 0.5416666753590107, "step": 228 }, { "advantage_max": 1.1666472516953945, "advantage_mean": 4.967053657267684e-09, "advantage_min": -0.8106258250772953, "advantage_std": 0.7726855352520943, "completion_length": 2959.2084045410156, "epoch": 0.26171428571428573, "grad_norm": 0.5730072259902954, "kl": 0.5205078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.920420666261961e-07, "loss": 0.0382, "reward": 0.43685874436050653, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.43685874436050653, "reward_after_std": 0.7726855203509331, "reward_before_mean": 0.5464709792286158, "reward_before_std": 0.7791089043021202, "reward_change_max": 4.570186138153076e-05, "reward_change_mean": -0.10961220692843199, "reward_change_min": -0.19915625173598528, "reward_change_std": 0.08033318631350994, "reward_std": 0.7726855352520943, "rewards/cosine_scaled_reward": 0.03365214308723807, "rewards/format_reward": 0.47916667349636555, "step": 229 }, { "advantage_max": 0.9596210494637489, "advantage_mean": 1.30385160446167e-08, "advantage_min": -0.6449937969446182, "advantage_std": 0.6544265672564507, "completion_length": 3251.0209045410156, "epoch": 0.26285714285714284, "grad_norm": 0.5122941732406616, "kl": 0.48828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.890576474687263e-07, "loss": 0.0303, "reward": -0.04866902204230428, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.04866902204230428, "reward_after_std": 0.6544265672564507, "reward_before_mean": 0.01986833941191435, "reward_before_std": 0.6688491478562355, "reward_change_max": 0.0005970969796180725, "reward_change_mean": -0.06853736680932343, "reward_change_min": -0.15719961281865835, "reward_change_std": 0.061842745169997215, "reward_std": 0.6544266007840633, "rewards/cosine_scaled_reward": -0.20881584659218788, "rewards/format_reward": 0.4375000037252903, "step": 230 }, { "advantage_max": 0.9483978226780891, "advantage_mean": -1.8936892887122525e-08, "advantage_min": -0.8996751829981804, "advantage_std": 0.6520739421248436, "completion_length": 2641.604202270508, "epoch": 0.264, "grad_norm": 0.38522300124168396, "kl": 0.384124755859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.860664508377001e-07, "loss": 0.0357, "reward": 0.3406216111034155, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3406216111034155, "reward_after_std": 0.6520739495754242, "reward_before_mean": 0.4451865218579769, "reward_before_std": 0.6562475748360157, "reward_change_max": 0.00021515041589736938, "reward_change_mean": -0.104564911685884, "reward_change_min": -0.18150869477540255, "reward_change_std": 0.07289927126839757, "reward_std": 0.6520739682018757, "rewards/cosine_scaled_reward": -0.08990675210952759, "rewards/format_reward": 0.6250000149011612, "step": 231 }, { "advantage_max": 0.7513163685798645, "advantage_mean": 6.208817238118058e-09, "advantage_min": -0.4088404346257448, "advantage_std": 0.4395688883960247, "completion_length": 3061.7083740234375, "epoch": 0.2651428571428571, "grad_norm": 0.5501198768615723, "kl": 0.492919921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.83068622519821e-07, "loss": 0.0351, "reward": -0.35827588848769665, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.35827588848769665, "reward_after_std": 0.4395688883960247, "reward_before_mean": -0.3145838063210249, "reward_before_std": 0.4367716293781996, "reward_change_max": 0.0004380419850349426, "reward_change_mean": -0.043692084145732224, "reward_change_min": -0.09585795551538467, "reward_change_std": 0.03534274536650628, "reward_std": 0.4395688995718956, "rewards/cosine_scaled_reward": -0.29270857013761997, "rewards/format_reward": 0.2708333358168602, "step": 232 }, { "advantage_max": 1.0172811076045036, "advantage_mean": 6.829698917520943e-09, "advantage_min": -0.6046523898839951, "advantage_std": 0.6398416385054588, "completion_length": 2803.06258392334, "epoch": 0.2662857142857143, "grad_norm": 0.31811264157295227, "kl": 0.377960205078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.800643086250121e-07, "loss": 0.0386, "reward": 0.09723816439509392, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09723816439509392, "reward_after_std": 0.6398416422307491, "reward_before_mean": 0.17845547664910555, "reward_before_std": 0.6404211223125458, "reward_change_max": 0.00018434226512908936, "reward_change_mean": -0.08121731854043901, "reward_change_min": -0.16058277525007725, "reward_change_std": 0.06039998144842684, "reward_std": 0.6398416813462973, "rewards/cosine_scaled_reward": -0.1503555942326784, "rewards/format_reward": 0.4791666716337204, "step": 233 }, { "advantage_max": 0.788167878985405, "advantage_mean": 1.3038516488705909e-08, "advantage_min": -0.6035764142870903, "advantage_std": 0.5259315725415945, "completion_length": 2721.187568664551, "epoch": 0.2674285714285714, "grad_norm": 0.27355051040649414, "kl": 0.3179168701171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.770536555792944e-07, "loss": 0.0403, "reward": 0.13820985238999128, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.13820985238999128, "reward_after_std": 0.5259315762668848, "reward_before_mean": 0.22728153504431248, "reward_before_std": 0.5203691460192204, "reward_change_max": 0.00012595951557159424, "reward_change_mean": -0.08907167753204703, "reward_change_min": -0.16210692562162876, "reward_change_std": 0.06338219251483679, "reward_std": 0.5259315762668848, "rewards/cosine_scaled_reward": -0.12594256736338139, "rewards/format_reward": 0.4791666716337204, "step": 234 }, { "advantage_max": 1.1957035660743713, "advantage_mean": 2.0489098029319308e-08, "advantage_min": -1.0198877342045307, "advantage_std": 0.8198182247579098, "completion_length": 2557.6667251586914, "epoch": 0.26857142857142857, "grad_norm": 0.43036338686943054, "kl": 0.30059814453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.740368101176495e-07, "loss": 0.0263, "reward": 0.5482924771495163, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5482924771495163, "reward_after_std": 0.8198181949555874, "reward_before_mean": 0.6676017071586102, "reward_before_std": 0.8265979290008545, "reward_change_max": 0.0005790963768959045, "reward_change_mean": -0.11930918972939253, "reward_change_min": -0.21050137467682362, "reward_change_std": 0.08706249436363578, "reward_std": 0.819818202406168, "rewards/cosine_scaled_reward": 0.04213416809216142, "rewards/format_reward": 0.5833333432674408, "step": 235 }, { "advantage_max": 1.244063027203083, "advantage_mean": -4.656613428188905e-09, "advantage_min": -0.9987089484930038, "advantage_std": 0.884406566619873, "completion_length": 3065.1458740234375, "epoch": 0.26971428571428574, "grad_norm": 0.8933318257331848, "kl": 0.308349609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.710139192768694e-07, "loss": 0.0414, "reward": 0.1958321612328291, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1958321612328291, "reward_after_std": 0.8844065554440022, "reward_before_mean": 0.28194663720205426, "reward_before_std": 0.9100110270082951, "reward_change_max": 1.2174248695373535e-05, "reward_change_mean": -0.08611448504962027, "reward_change_min": -0.20149937644600868, "reward_change_std": 0.08233956387266517, "reward_std": 0.8844065703451633, "rewards/cosine_scaled_reward": -0.10902668349444866, "rewards/format_reward": 0.5000000055879354, "step": 236 }, { "advantage_max": 1.2769840061664581, "advantage_mean": -3.601114062501409e-08, "advantage_min": -0.9948960766196251, "advantage_std": 0.8261019103229046, "completion_length": 2952.625030517578, "epoch": 0.27085714285714285, "grad_norm": 0.582528829574585, "kl": 0.30987548828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.679851303883891e-07, "loss": 0.0507, "reward": 0.4712846730835736, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4712846730835736, "reward_after_std": 0.8261019252240658, "reward_before_mean": 0.5824574897997081, "reward_before_std": 0.8286083415150642, "reward_change_max": 0.00019712001085281372, "reward_change_mean": -0.11117283953353763, "reward_change_min": -0.19031432271003723, "reward_change_std": 0.07673935976345092, "reward_std": 0.8261019699275494, "rewards/cosine_scaled_reward": -0.00043792277574539185, "rewards/format_reward": 0.5833333414047956, "step": 237 }, { "advantage_max": 1.4660193622112274, "advantage_mean": -4.221995741904294e-08, "advantage_min": -1.3478331565856934, "advantage_std": 1.0443166494369507, "completion_length": 2659.354217529297, "epoch": 0.272, "grad_norm": 1.605945110321045, "kl": 0.28875732421875, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.649505910711058e-07, "loss": 0.0653, "reward": 0.7084148563444614, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7084148563444614, "reward_after_std": 1.0443166941404343, "reward_before_mean": 0.8377973195165396, "reward_before_std": 1.068930521607399, "reward_change_max": 4.151463508605957e-05, "reward_change_mean": -0.1293824641034007, "reward_change_min": -0.24387627188116312, "reward_change_std": 0.10105510335415602, "reward_std": 1.0443167239427567, "rewards/cosine_scaled_reward": 0.08556529941779445, "rewards/format_reward": 0.666666679084301, "step": 238 }, { "advantage_max": 0.8369556181132793, "advantage_mean": -9.313225635132483e-09, "advantage_min": -0.6261783614754677, "advantage_std": 0.5559764578938484, "completion_length": 2174.7708740234375, "epoch": 0.27314285714285713, "grad_norm": 0.5835373997688293, "kl": 0.25140380859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.619104492241847e-07, "loss": 0.0381, "reward": 0.7473947200924158, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7473947200924158, "reward_after_std": 0.5559764541685581, "reward_before_mean": 0.8919164468534291, "reward_before_std": 0.5329598225653172, "reward_change_max": 0.00044843554496765137, "reward_change_mean": -0.14452172303572297, "reward_change_min": -0.2255661329254508, "reward_change_std": 0.08641267288476229, "reward_std": 0.555976465344429, "rewards/cosine_scaled_reward": 0.1751248836517334, "rewards/format_reward": 0.5416666679084301, "step": 239 }, { "advantage_max": 0.6366955451667309, "advantage_mean": 3.1044086745701804e-09, "advantage_min": -0.6315915696322918, "advantage_std": 0.48512188345193863, "completion_length": 2778.166717529297, "epoch": 0.2742857142857143, "grad_norm": 1.1766997575759888, "kl": 0.4515380859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.588648530198504e-07, "loss": 0.0056, "reward": -0.018587548285722733, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.018587548285722733, "reward_after_std": 0.48512188345193863, "reward_before_mean": 0.05906429514288902, "reward_before_std": 0.496453870087862, "reward_change_max": 4.595518112182617e-05, "reward_change_mean": -0.07765184435993433, "reward_change_min": -0.14331062696874142, "reward_change_std": 0.056621880736202, "reward_std": 0.4851218946278095, "rewards/cosine_scaled_reward": -0.2413011882454157, "rewards/format_reward": 0.5416666846722364, "step": 240 }, { "advantage_max": 0.9776230975985527, "advantage_mean": 4.03573130469681e-09, "advantage_min": -0.7462376952171326, "advantage_std": 0.6642108038067818, "completion_length": 2740.2500610351562, "epoch": 0.2754285714285714, "grad_norm": 1.105962872505188, "kl": 0.43316650390625, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.558139508961654e-07, "loss": -0.0129, "reward": 0.13068385515362024, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.13068385515362024, "reward_after_std": 0.6642108038067818, "reward_before_mean": 0.2154865637421608, "reward_before_std": 0.6723109371960163, "reward_change_max": 0.0003187432885169983, "reward_change_mean": -0.08480269648134708, "reward_change_min": -0.17307660542428493, "reward_change_std": 0.06677990918979049, "reward_std": 0.6642108112573624, "rewards/cosine_scaled_reward": -0.13184005906805396, "rewards/format_reward": 0.47916667722165585, "step": 241 }, { "advantage_max": 0.7249393723905087, "advantage_mean": -8.692344732885715e-09, "advantage_min": -0.7632653787732124, "advantage_std": 0.5563169829547405, "completion_length": 2379.2292098999023, "epoch": 0.2765714285714286, "grad_norm": 1.055472493171692, "kl": 0.380096435546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.527578915497951e-07, "loss": -0.0068, "reward": 0.6038695313036442, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6038695313036442, "reward_after_std": 0.5563170053064823, "reward_before_mean": 0.7374595701694489, "reward_before_std": 0.5531607791781425, "reward_change_max": 0.0008292198181152344, "reward_change_mean": -0.1335900668054819, "reward_change_min": -0.2169865220785141, "reward_change_std": 0.0838638637214899, "reward_std": 0.5563170239329338, "rewards/cosine_scaled_reward": -0.027103547006845474, "rewards/format_reward": 0.7916666753590107, "step": 242 }, { "advantage_max": 1.0612565129995346, "advantage_mean": -4.470348402563218e-08, "advantage_min": -1.2809006348252296, "advantage_std": 0.8737168461084366, "completion_length": 2800.3125762939453, "epoch": 0.2777142857142857, "grad_norm": 0.7638176083564758, "kl": 0.39288330078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.496968239287603e-07, "loss": 0.057, "reward": 0.8082469571381807, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8082469571381807, "reward_after_std": 0.8737168610095978, "reward_before_mean": 0.9544677063822746, "reward_before_std": 0.8974614664912224, "reward_change_max": 2.9906630516052246e-05, "reward_change_mean": -0.14622076135128736, "reward_change_min": -0.2652246952056885, "reward_change_std": 0.10711004957556725, "reward_std": 0.8737168833613396, "rewards/cosine_scaled_reward": 0.13348384480923414, "rewards/format_reward": 0.6875000204890966, "step": 243 }, { "advantage_max": 1.3663447052240372, "advantage_mean": -1.552204320631745e-08, "advantage_min": -1.0774092823266983, "advantage_std": 0.935070589184761, "completion_length": 2798.687545776367, "epoch": 0.27885714285714286, "grad_norm": 0.5033749938011169, "kl": 0.48846435546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.466308972251785e-07, "loss": 0.017, "reward": 0.47303982824087143, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.47303982824087143, "reward_after_std": 0.935070589184761, "reward_before_mean": 0.5819773841649294, "reward_before_std": 0.9512156844139099, "reward_change_max": 0.00012916326522827148, "reward_change_mean": -0.108937568962574, "reward_change_min": -0.21076048258692026, "reward_change_std": 0.08802835131064057, "reward_std": 0.9350706189870834, "rewards/cosine_scaled_reward": 0.020155361853539944, "rewards/format_reward": 0.541666679084301, "step": 244 }, { "advantage_max": 1.3429825454950333, "advantage_mean": -1.3193736880801055e-08, "advantage_min": -0.9926909804344177, "advantage_std": 0.8736944049596786, "completion_length": 3171.916717529297, "epoch": 0.28, "grad_norm": 0.7295897603034973, "kl": 0.476318359375, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.435602608679916e-07, "loss": 0.0051, "reward": 0.36052384227514267, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.36052384227514267, "reward_after_std": 0.873694408684969, "reward_before_mean": 0.46028553135693073, "reward_before_std": 0.881868090480566, "reward_change_max": 0.00019543617963790894, "reward_change_mean": -0.0997616951353848, "reward_change_min": -0.20378663670271635, "reward_change_std": 0.08069562003947794, "reward_std": 0.8736944310367107, "rewards/cosine_scaled_reward": -0.01985724247060716, "rewards/format_reward": 0.5000000037252903, "step": 245 }, { "advantage_max": 1.2377834096550941, "advantage_mean": 1.2417632477834672e-09, "advantage_min": -0.9669884629547596, "advantage_std": 0.8542461507022381, "completion_length": 2991.4584045410156, "epoch": 0.28114285714285714, "grad_norm": 1.2443678379058838, "kl": 0.4268798828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.404850645156841e-07, "loss": 0.0874, "reward": 0.3200700846500695, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3200700846500695, "reward_after_std": 0.8542461507022381, "reward_before_mean": 0.4179497007280588, "reward_before_std": 0.8701425269246101, "reward_change_max": 0.0, "reward_change_mean": -0.09787960723042488, "reward_change_min": -0.21520974393934011, "reward_change_std": 0.08369475090876222, "reward_std": 0.8542461581528187, "rewards/cosine_scaled_reward": -0.09310848778113723, "rewards/format_reward": 0.6041666828095913, "step": 246 }, { "advantage_max": 0.7722934186458588, "advantage_mean": 4.03573130469681e-09, "advantage_min": -0.6312546953558922, "advantage_std": 0.5287173539400101, "completion_length": 3171.2291870117188, "epoch": 0.2822857142857143, "grad_norm": 0.8870450258255005, "kl": 0.4293212890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.374054580489873e-07, "loss": 0.0132, "reward": 0.06275376630946994, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06275376630946994, "reward_after_std": 0.5287173539400101, "reward_before_mean": 0.14523836690932512, "reward_before_std": 0.5309392772614956, "reward_change_max": 0.0001341402530670166, "reward_change_mean": -0.08248458697926253, "reward_change_min": -0.15690710861235857, "reward_change_std": 0.05856957985088229, "reward_std": 0.5287173762917519, "rewards/cosine_scaled_reward": -0.25029749423265457, "rewards/format_reward": 0.6458333432674408, "step": 247 }, { "advantage_max": 0.9769742004573345, "advantage_mean": -2.1109978431965715e-08, "advantage_min": -1.1480994373559952, "advantage_std": 0.7594380117952824, "completion_length": 2604.81258392334, "epoch": 0.2834285714285714, "grad_norm": 0.4662351608276367, "kl": 0.339996337890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.343215915635761e-07, "loss": 0.0193, "reward": 0.5111812395043671, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5111812395043671, "reward_after_std": 0.7594380043447018, "reward_before_mean": 0.6313334191218019, "reward_before_std": 0.7793891243636608, "reward_change_max": 0.0005892887711524963, "reward_change_mean": -0.12015216751024127, "reward_change_min": -0.209529516287148, "reward_change_std": 0.08815564028918743, "reward_std": 0.7594380229711533, "rewards/cosine_scaled_reward": 0.055250026285648346, "rewards/format_reward": 0.520833345130086, "step": 248 }, { "advantage_max": 1.025376234203577, "advantage_mean": -2.359350625980028e-08, "advantage_min": -1.2076657563447952, "advantage_std": 0.8415849506855011, "completion_length": 2305.395851135254, "epoch": 0.2845714285714286, "grad_norm": 0.4987713396549225, "kl": 0.225341796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.31233615362752e-07, "loss": 0.0169, "reward": 0.8182168155908585, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8182168155908585, "reward_after_std": 0.8415849320590496, "reward_before_mean": 0.9659371078014374, "reward_before_std": 0.8593480549752712, "reward_change_max": 0.00013027340173721313, "reward_change_mean": -0.14772030501626432, "reward_change_min": -0.24447989743202925, "reward_change_std": 0.10552188637666404, "reward_std": 0.8415849320590496, "rewards/cosine_scaled_reward": 0.1496352255344391, "rewards/format_reward": 0.6666666772216558, "step": 249 }, { "advantage_max": 1.0746857300400734, "advantage_mean": -8.071462664904772e-09, "advantage_min": -0.7048100084066391, "advantage_std": 0.6700468733906746, "completion_length": 2668.5000762939453, "epoch": 0.2857142857142857, "grad_norm": 0.6423651576042175, "kl": 0.287353515625, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.281416799501187e-07, "loss": 0.0064, "reward": 0.3210712969303131, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3210712969303131, "reward_after_std": 0.6700468808412552, "reward_before_mean": 0.42153167352080345, "reward_before_std": 0.6627967059612274, "reward_change_max": 0.0004504099488258362, "reward_change_mean": -0.10046036634594202, "reward_change_min": -0.17996104154735804, "reward_change_std": 0.06692047603428364, "reward_std": 0.6700468994677067, "rewards/cosine_scaled_reward": -0.1538175237365067, "rewards/format_reward": 0.7291666828095913, "step": 250 }, { "advantage_max": 1.202012088149786, "advantage_mean": -3.135452805724803e-08, "advantage_min": -0.7174399569630623, "advantage_std": 0.7109669633209705, "completion_length": 2205.416732788086, "epoch": 0.28685714285714287, "grad_norm": 0.6144553422927856, "kl": 0.200531005859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.25045936022246e-07, "loss": -0.0299, "reward": 0.5154064744710922, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5154064744710922, "reward_after_std": 0.7109669931232929, "reward_before_mean": 0.6320094037801027, "reward_before_std": 0.6927180550992489, "reward_change_max": 0.0, "reward_change_mean": -0.11660293349996209, "reward_change_min": -0.1988846454769373, "reward_change_std": 0.073700116481632, "reward_std": 0.7109670229256153, "rewards/cosine_scaled_reward": -0.06941197859123349, "rewards/format_reward": 0.7708333414047956, "step": 251 }, { "advantage_max": 0.7281642220914364, "advantage_mean": -6.208821790032459e-10, "advantage_min": -0.744192611426115, "advantage_std": 0.5717556402087212, "completion_length": 2732.6250762939453, "epoch": 0.288, "grad_norm": 0.36412715911865234, "kl": 0.254913330078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.219465344613258e-07, "loss": -0.0016, "reward": 0.3161662006750703, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3161662006750703, "reward_after_std": 0.5717556513845921, "reward_before_mean": 0.4227669003084884, "reward_before_std": 0.5833228453993797, "reward_change_max": 0.00025747716426849365, "reward_change_mean": -0.10660070087760687, "reward_change_min": -0.1928198728710413, "reward_change_std": 0.07527502113953233, "reward_std": 0.5717556811869144, "rewards/cosine_scaled_reward": -0.09069989109411836, "rewards/format_reward": 0.6041666772216558, "step": 252 }, { "advantage_max": 1.0409824773669243, "advantage_mean": -1.3814618338159335e-08, "advantage_min": -1.000199355185032, "advantage_std": 0.7667047046124935, "completion_length": 2554.875015258789, "epoch": 0.28914285714285715, "grad_norm": 0.5692912936210632, "kl": 0.2001953125, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.188436263278172e-07, "loss": 0.0317, "reward": 0.5202262690290809, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5202262690290809, "reward_after_std": 0.7667046971619129, "reward_before_mean": 0.6402729395776987, "reward_before_std": 0.7784381285309792, "reward_change_max": 0.0, "reward_change_mean": -0.12004667799919844, "reward_change_min": -0.21133162081241608, "reward_change_std": 0.08583518001250923, "reward_std": 0.766704723238945, "rewards/cosine_scaled_reward": 0.0076364679262042046, "rewards/format_reward": 0.6250000074505806, "step": 253 }, { "advantage_max": 1.2403109297156334, "advantage_mean": -9.313229354379615e-10, "advantage_min": -0.7695105597376823, "advantage_std": 0.7703098207712173, "completion_length": 3132.8125915527344, "epoch": 0.29028571428571426, "grad_norm": 0.38544797897338867, "kl": 0.282958984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.157373628530852e-07, "loss": 0.0181, "reward": 0.18571746069937944, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.18571746069937944, "reward_after_std": 0.7703097984194756, "reward_before_mean": 0.27099318616092205, "reward_before_std": 0.7718499638140202, "reward_change_max": 0.0, "reward_change_mean": -0.08527570590376854, "reward_change_min": -0.16410515550523996, "reward_change_std": 0.06472577340900898, "reward_std": 0.7703098133206367, "rewards/cosine_scaled_reward": -0.10408675856888294, "rewards/format_reward": 0.47916667349636555, "step": 254 }, { "advantage_max": 1.3647899143397808, "advantage_mean": -3.042320501078777e-08, "advantage_min": -1.0756276212632656, "advantage_std": 0.9934910573065281, "completion_length": 2786.395950317383, "epoch": 0.2914285714285714, "grad_norm": 0.8177282214164734, "kl": 0.234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.126278954320294e-07, "loss": 0.0246, "reward": 0.36817927472293377, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.36817927472293377, "reward_after_std": 0.9934910237789154, "reward_before_mean": 0.4680708646774292, "reward_before_std": 1.0233058147132397, "reward_change_max": 0.0005884990096092224, "reward_change_mean": -0.09989159693941474, "reward_change_min": -0.22440185863524675, "reward_change_std": 0.09326563123613596, "reward_std": 0.9934910461306572, "rewards/cosine_scaled_reward": -0.026381254196166992, "rewards/format_reward": 0.5208333432674408, "step": 255 }, { "advantage_max": 1.232586644589901, "advantage_mean": -2.793967751602011e-08, "advantage_min": -0.8954048380255699, "advantage_std": 0.7754578627645969, "completion_length": 2832.1250610351562, "epoch": 0.2925714285714286, "grad_norm": 0.8683269619941711, "kl": 0.204010009765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.095153756157051e-07, "loss": 0.0341, "reward": 0.38214224576950073, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.38214224576950073, "reward_after_std": 0.7754578702151775, "reward_before_mean": 0.4855796182528138, "reward_before_std": 0.7749796956777573, "reward_change_max": 0.00045193731784820557, "reward_change_mean": -0.1034374050796032, "reward_change_min": -0.18192225974053144, "reward_change_std": 0.07434211485087872, "reward_std": 0.7754578925669193, "rewards/cosine_scaled_reward": -0.03846019133925438, "rewards/format_reward": 0.562500013038516, "step": 256 }, { "advantage_max": 1.3854403644800186, "advantage_mean": -2.1109979209121832e-08, "advantage_min": -1.1317920796573162, "advantage_std": 0.901426050812006, "completion_length": 3173.6459350585938, "epoch": 0.2937142857142857, "grad_norm": 0.5222746133804321, "kl": 0.26904296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.06399955103937e-07, "loss": 0.0204, "reward": 0.4658824288053438, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4658824288053438, "reward_after_std": 0.9014260284602642, "reward_before_mean": 0.5744250370189548, "reward_before_std": 0.9102607406675816, "reward_change_max": 4.0337443351745605e-05, "reward_change_mean": -0.10854260809719563, "reward_change_min": -0.200001772493124, "reward_change_std": 0.08366946689784527, "reward_std": 0.9014260694384575, "rewards/cosine_scaled_reward": -0.025287493132054806, "rewards/format_reward": 0.6250000149011612, "step": 257 }, { "advantage_max": 1.3329339772462845, "advantage_mean": 9.002785128497948e-09, "advantage_min": -1.2666821628808975, "advantage_std": 0.9974622689187527, "completion_length": 3105.6875915527344, "epoch": 0.2948571428571429, "grad_norm": 1.931921124458313, "kl": 0.2685546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.032817857379256e-07, "loss": 0.0957, "reward": 0.4899730863980949, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4899730863980949, "reward_after_std": 0.9974622763693333, "reward_before_mean": 0.6017860121792182, "reward_before_std": 1.0291601456701756, "reward_change_max": 3.0837953090667725e-05, "reward_change_mean": -0.11181289050728083, "reward_change_min": -0.23886566143482924, "reward_change_std": 0.10037148464471102, "reward_std": 0.9974623024463654, "rewards/cosine_scaled_reward": -0.022023675497621298, "rewards/format_reward": 0.645833358168602, "step": 258 }, { "advantage_max": 0.9765863195061684, "advantage_mean": -1.7384688688615313e-08, "advantage_min": -0.8986761644482613, "advantage_std": 0.6964166983962059, "completion_length": 2581.208381652832, "epoch": 0.296, "grad_norm": 0.5722829699516296, "kl": 0.2742767333984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 6.001610194928464e-07, "loss": -0.0139, "reward": 0.38566339667886496, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.38566339667886496, "reward_after_std": 0.6964167132973671, "reward_before_mean": 0.4943455453030765, "reward_before_std": 0.7021567225456238, "reward_change_max": 0.0011222511529922485, "reward_change_mean": -0.10868217144161463, "reward_change_min": -0.1917388141155243, "reward_change_std": 0.07943293126299977, "reward_std": 0.6964167356491089, "rewards/cosine_scaled_reward": -0.06532722525298595, "rewards/format_reward": 0.6250000111758709, "step": 259 }, { "advantage_max": 1.106583446264267, "advantage_mean": -2.6077032533322608e-08, "advantage_min": -0.7876567766070366, "advantage_std": 0.7193886376917362, "completion_length": 2335.1041870117188, "epoch": 0.29714285714285715, "grad_norm": 0.3414742648601532, "kl": 0.25067138671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.97037808470444e-07, "loss": 0.0166, "reward": 0.7363164806738496, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7363164806738496, "reward_after_std": 0.7193886302411556, "reward_before_mean": 0.8749077282845974, "reward_before_std": 0.7035922594368458, "reward_change_max": 0.00020250678062438965, "reward_change_mean": -0.13859128253534436, "reward_change_min": -0.2278941599652171, "reward_change_std": 0.08999049849808216, "reward_std": 0.7193886563181877, "rewards/cosine_scaled_reward": 0.08328720182180405, "rewards/format_reward": 0.708333345130086, "step": 260 }, { "advantage_max": 1.001849688589573, "advantage_mean": -1.3969839007810236e-08, "advantage_min": -0.8723765462636948, "advantage_std": 0.7362087070941925, "completion_length": 3038.8334045410156, "epoch": 0.29828571428571427, "grad_norm": 0.6822984218597412, "kl": 0.32794189453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.939123048916173e-07, "loss": 0.0571, "reward": 0.10068884119391441, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.10068884119391441, "reward_after_std": 0.7362087145447731, "reward_before_mean": 0.18243083090055734, "reward_before_std": 0.756432544440031, "reward_change_max": 0.0, "reward_change_mean": -0.08174200495705009, "reward_change_min": -0.18367165885865688, "reward_change_std": 0.07381555391475558, "reward_std": 0.7362087294459343, "rewards/cosine_scaled_reward": -0.21086792647838593, "rewards/format_reward": 0.6041666809469461, "step": 261 }, { "advantage_max": 0.9958531409502029, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -1.0813852436840534, "advantage_std": 0.763180959969759, "completion_length": 3018.3334045410156, "epoch": 0.29942857142857143, "grad_norm": 0.37624964118003845, "kl": 0.2686767578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.907846610890011e-07, "loss": 0.0111, "reward": 0.41146421869052574, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.41146421869052574, "reward_after_std": 0.7631809748709202, "reward_before_mean": 0.5222134934738278, "reward_before_std": 0.783074539154768, "reward_change_max": 0.00028192996978759766, "reward_change_mean": -0.11074927542358637, "reward_change_min": -0.19585992395877838, "reward_change_std": 0.08221290446817875, "reward_std": 0.7631809934973717, "rewards/cosine_scaled_reward": -0.13472658768296242, "rewards/format_reward": 0.7916666865348816, "step": 262 }, { "advantage_max": 1.0192478522658348, "advantage_mean": -3.725290353973065e-09, "advantage_min": -0.6987932249903679, "advantage_std": 0.6710875891149044, "completion_length": 3003.0834045410156, "epoch": 0.30057142857142854, "grad_norm": 0.4050155282020569, "kl": 0.33062744140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.87655029499542e-07, "loss": 0.0446, "reward": 0.20357975119259208, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.20357975119259208, "reward_after_std": 0.6710875928401947, "reward_before_mean": 0.2945442688651383, "reward_before_std": 0.6724608726799488, "reward_change_max": 0.0, "reward_change_mean": -0.09096452733501792, "reward_change_min": -0.17119490914046764, "reward_change_std": 0.06640669284388423, "reward_std": 0.6710876412689686, "rewards/cosine_scaled_reward": -0.18606120673939586, "rewards/format_reward": 0.6666666716337204, "step": 263 }, { "advantage_max": 1.0631650909781456, "advantage_mean": -3.849466756467024e-08, "advantage_min": -0.8726494163274765, "advantage_std": 0.7215870209038258, "completion_length": 3126.125, "epoch": 0.3017142857142857, "grad_norm": 0.8072293400764465, "kl": 0.355224609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.845235626570683e-07, "loss": 0.0737, "reward": 0.5229286458343267, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5229286458343267, "reward_after_std": 0.7215870320796967, "reward_before_mean": 0.6427558902651072, "reward_before_std": 0.718892015516758, "reward_change_max": 0.0, "reward_change_mean": -0.11982727702707052, "reward_change_min": -0.19109745789319277, "reward_change_std": 0.07728103827685118, "reward_std": 0.7215870432555676, "rewards/cosine_scaled_reward": -0.0848720595240593, "rewards/format_reward": 0.8125000186264515, "step": 264 }, { "advantage_max": 1.0975913107395172, "advantage_mean": -3.352761424046946e-08, "advantage_min": -0.926276370882988, "advantage_std": 0.7606191523373127, "completion_length": 2679.354217529297, "epoch": 0.3028571428571429, "grad_norm": 0.5886125564575195, "kl": 0.30853271484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.813904131848564e-07, "loss": 0.0555, "reward": 0.6098247529007494, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6098247529007494, "reward_after_std": 0.7606191374361515, "reward_before_mean": 0.7371023533632979, "reward_before_std": 0.7642861232161522, "reward_change_max": 0.0, "reward_change_mean": -0.1272775838151574, "reward_change_min": -0.2192865088582039, "reward_change_std": 0.08723006211221218, "reward_std": 0.7606191672384739, "rewards/cosine_scaled_reward": -0.06894884817302227, "rewards/format_reward": 0.8750000149011612, "step": 265 }, { "advantage_max": 0.9084913916885853, "advantage_mean": -4.967053879312289e-09, "advantage_min": -0.6256868913769722, "advantage_std": 0.5924951620399952, "completion_length": 3102.8334045410156, "epoch": 0.304, "grad_norm": 0.4780454933643341, "kl": 0.46588134765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.78255733788191e-07, "loss": 0.0411, "reward": 0.1223881570622325, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1223881570622325, "reward_after_std": 0.5924951657652855, "reward_before_mean": 0.20807474991306663, "reward_before_std": 0.5930629894137383, "reward_change_max": 0.0004331320524215698, "reward_change_mean": -0.08568659145385027, "reward_change_min": -0.16301941219717264, "reward_change_std": 0.061356313060969114, "reward_std": 0.5924951769411564, "rewards/cosine_scaled_reward": -0.22929597226902843, "rewards/format_reward": 0.666666679084301, "step": 266 }, { "advantage_max": 1.3950220122933388, "advantage_mean": 6.829699084054397e-09, "advantage_min": -0.9353981837630272, "advantage_std": 0.8997549638152122, "completion_length": 3407.604217529297, "epoch": 0.30514285714285716, "grad_norm": 0.9051889777183533, "kl": 0.6298828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.751196772469237e-07, "loss": 0.0458, "reward": 0.0949839185923338, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0949839185923338, "reward_after_std": 0.8997549638152122, "reward_before_mean": 0.16919339634478092, "reward_before_std": 0.915122464299202, "reward_change_max": 0.0003774687647819519, "reward_change_mean": -0.0742094716988504, "reward_change_min": -0.16107130236923695, "reward_change_std": 0.06824947381392121, "reward_std": 0.899754986166954, "rewards/cosine_scaled_reward": -0.1654033064842224, "rewards/format_reward": 0.5000000149011612, "step": 267 }, { "advantage_max": 1.3419854082167149, "advantage_mean": -2.0489097307674342e-08, "advantage_min": -0.9137064553797245, "advantage_std": 0.8581621535122395, "completion_length": 2647.0834197998047, "epoch": 0.3062857142857143, "grad_norm": 0.5118639469146729, "kl": 0.40020751953125, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.71982396408026e-07, "loss": 0.0267, "reward": 0.2210367638617754, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2210367638617754, "reward_after_std": 0.858162172138691, "reward_before_mean": 0.30672881565988064, "reward_before_std": 0.8692396897822618, "reward_change_max": 0.0002562180161476135, "reward_change_mean": -0.0856920457445085, "reward_change_min": -0.19060960225760937, "reward_change_std": 0.07353492826223373, "reward_std": 0.8581621907651424, "rewards/cosine_scaled_reward": -0.1591356061398983, "rewards/format_reward": 0.6250000111758709, "step": 268 }, { "advantage_max": 1.0998243726789951, "advantage_mean": -3.476937859847595e-08, "advantage_min": -0.9841247573494911, "advantage_std": 0.7637926824390888, "completion_length": 2915.8750610351562, "epoch": 0.30742857142857144, "grad_norm": 0.5941024422645569, "kl": 0.42169189453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.688440441781398e-07, "loss": 0.0477, "reward": 0.608371525653638, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.608371525653638, "reward_after_std": 0.7637926861643791, "reward_before_mean": 0.7351412307471037, "reward_before_std": 0.7678135149180889, "reward_change_max": 0.0, "reward_change_mean": -0.12676969449967146, "reward_change_min": -0.22412027046084404, "reward_change_std": 0.08635794976726174, "reward_std": 0.7637927196919918, "rewards/cosine_scaled_reward": -0.03867942001670599, "rewards/format_reward": 0.8125000149011612, "step": 269 }, { "advantage_max": 1.4132294282317162, "advantage_mean": -1.9868216072360667e-08, "advantage_min": -1.029867060482502, "advantage_std": 0.9192027598619461, "completion_length": 2846.041748046875, "epoch": 0.30857142857142855, "grad_norm": 0.4943372309207916, "kl": 0.4290771484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.657047735161255e-07, "loss": 0.0507, "reward": 0.6552489723544568, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6552489723544568, "reward_after_std": 0.9192027673125267, "reward_before_mean": 0.7806877940893173, "reward_before_std": 0.920238234102726, "reward_change_max": 0.00011301040649414062, "reward_change_mean": -0.12543882615864277, "reward_change_min": -0.23829853534698486, "reward_change_std": 0.08876523096114397, "reward_std": 0.9192028120160103, "rewards/cosine_scaled_reward": 0.0049272209871560335, "rewards/format_reward": 0.7708333432674408, "step": 270 }, { "advantage_max": 0.8533101379871368, "advantage_mean": 1.117587122845265e-08, "advantage_min": -0.8463800251483917, "advantage_std": 0.6255792900919914, "completion_length": 2484.5625610351562, "epoch": 0.3097142857142857, "grad_norm": 0.6052711606025696, "kl": 0.325164794921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.625647374256061e-07, "loss": 0.0482, "reward": 0.8622908256947994, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8622908256947994, "reward_after_std": 0.625579297542572, "reward_before_mean": 1.0175694283097982, "reward_before_std": 0.6174435224384069, "reward_change_max": 0.0001889541745185852, "reward_change_mean": -0.15527859865687788, "reward_change_min": -0.24020841717720032, "reward_change_std": 0.09653888270258904, "reward_std": 0.6255793198943138, "rewards/cosine_scaled_reward": 0.15461806394159794, "rewards/format_reward": 0.7083333525806665, "step": 271 }, { "advantage_max": 1.2468843683600426, "advantage_mean": -2.3283064809476173e-08, "advantage_min": -1.231340929865837, "advantage_std": 0.9128948226571083, "completion_length": 2975.2500915527344, "epoch": 0.31085714285714283, "grad_norm": 0.5220039486885071, "kl": 0.556640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.594240889475106e-07, "loss": 0.0567, "reward": 0.5736992135643959, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5736992135643959, "reward_after_std": 0.9128948375582695, "reward_before_mean": 0.6949340249411762, "reward_before_std": 0.9327790774405003, "reward_change_max": 0.0, "reward_change_mean": -0.121234814170748, "reward_change_min": -0.2265043631196022, "reward_change_std": 0.09475089283660054, "reward_std": 0.9128948450088501, "rewards/cosine_scaled_reward": 0.02455033385194838, "rewards/format_reward": 0.6458333544433117, "step": 272 }, { "advantage_max": 1.0025320798158646, "advantage_mean": -3.476937715518602e-08, "advantage_min": -1.0941287279129028, "advantage_std": 0.7559049688279629, "completion_length": 2887.3959350585938, "epoch": 0.312, "grad_norm": 0.577610433101654, "kl": 0.543914794921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.562829811526154e-07, "loss": 0.0487, "reward": 0.6015369053930044, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6015369053930044, "reward_after_std": 0.7559049762785435, "reward_before_mean": 0.7293371063424274, "reward_before_std": 0.7717136181890965, "reward_change_max": 0.0007090941071510315, "reward_change_mean": -0.1278002504259348, "reward_change_min": -0.22420413699001074, "reward_change_std": 0.09308719309046865, "reward_std": 0.755904994904995, "rewards/cosine_scaled_reward": -0.010331441648304462, "rewards/format_reward": 0.7500000186264515, "step": 273 }, { "advantage_max": 0.98700400441885, "advantage_mean": -2.7318795781106076e-08, "advantage_min": -1.1434022709727287, "advantage_std": 0.8054995872080326, "completion_length": 2140.0209197998047, "epoch": 0.31314285714285717, "grad_norm": 0.5769612193107605, "kl": 0.376312255859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.531415671340826e-07, "loss": 0.007, "reward": 1.085528818424791, "reward_advantage_correlation": 1.0, "reward_after_mean": 1.085528818424791, "reward_after_std": 0.8054995872080326, "reward_before_mean": 1.2585789044387639, "reward_before_std": 0.8196575529873371, "reward_change_max": 0.0, "reward_change_mean": -0.17305006738752127, "reward_change_min": -0.2874143682420254, "reward_change_std": 0.1149548664689064, "reward_std": 0.8054995909333229, "rewards/cosine_scaled_reward": 0.2542894408106804, "rewards/format_reward": 0.7500000186264515, "step": 274 }, { "advantage_max": 1.4050886556506157, "advantage_mean": -1.614292433060882e-08, "advantage_min": -1.2774573862552643, "advantage_std": 0.9720547124743462, "completion_length": 2419.020896911621, "epoch": 0.3142857142857143, "grad_norm": 0.4276851415634155, "kl": 0.441986083984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.5e-07, "loss": 0.0204, "reward": 0.9169086045585573, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.9169086045585573, "reward_after_std": 0.9720547050237656, "reward_before_mean": 1.0666476301848888, "reward_before_std": 0.9818779900670052, "reward_change_max": 0.00012986361980438232, "reward_change_mean": -0.14973904564976692, "reward_change_min": -0.26850674487650394, "reward_change_std": 0.10206946218386292, "reward_std": 0.972054734826088, "rewards/cosine_scaled_reward": 0.17915714625269175, "rewards/format_reward": 0.708333345130086, "step": 275 }, { "advantage_max": 1.4430090934038162, "advantage_mean": -1.8626451658843024e-08, "advantage_min": -1.3472927510738373, "advantage_std": 1.0916124135255814, "completion_length": 2607.895896911621, "epoch": 0.31542857142857145, "grad_norm": 0.6251004338264465, "kl": 0.42620849609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.468584328659172e-07, "loss": 0.0515, "reward": 0.9942958541214466, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.9942958541214466, "reward_after_std": 1.0916124545037746, "reward_before_mean": 1.1503968685865402, "reward_before_std": 1.116207093000412, "reward_change_max": 0.0, "reward_change_mean": -0.15610099490731955, "reward_change_min": -0.30604325234889984, "reward_change_std": 0.11970954155549407, "reward_std": 1.091612495481968, "rewards/cosine_scaled_reward": 0.14811508357524872, "rewards/format_reward": 0.8541666865348816, "step": 276 }, { "advantage_max": 1.238736167550087, "advantage_mean": -3.104408841103634e-09, "advantage_min": -0.7994346469640732, "advantage_std": 0.7560862153768539, "completion_length": 2522.9583892822266, "epoch": 0.31657142857142856, "grad_norm": 0.6035568118095398, "kl": 0.409759521484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.437170188473847e-07, "loss": 0.0173, "reward": 0.3422631425783038, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3422631425783038, "reward_after_std": 0.7560862153768539, "reward_before_mean": 0.44170549139380455, "reward_before_std": 0.7494552433490753, "reward_change_max": 0.0009563863277435303, "reward_change_mean": -0.099442342761904, "reward_change_min": -0.18363922648131847, "reward_change_std": 0.07015622220933437, "reward_std": 0.7560862228274345, "rewards/cosine_scaled_reward": -0.08123059757053852, "rewards/format_reward": 0.604166679084301, "step": 277 }, { "advantage_max": 1.2197536677122116, "advantage_mean": -2.6077032755367213e-08, "advantage_min": -0.6300609707832336, "advantage_std": 0.7097632475197315, "completion_length": 2167.6666870117188, "epoch": 0.3177142857142857, "grad_norm": 0.5051394104957581, "kl": 0.344940185546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.405759110524894e-07, "loss": -0.012, "reward": 0.770529605448246, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.770529605448246, "reward_after_std": 0.7097632475197315, "reward_before_mean": 0.9097544327378273, "reward_before_std": 0.6844562515616417, "reward_change_max": 0.0, "reward_change_mean": -0.13922483753412962, "reward_change_min": -0.22356512024998665, "reward_change_std": 0.0822983281686902, "reward_std": 0.7097632624208927, "rewards/cosine_scaled_reward": 0.04862721357494593, "rewards/format_reward": 0.8125000149011612, "step": 278 }, { "advantage_max": 1.5004774332046509, "advantage_mean": -1.4901161971003773e-08, "advantage_min": -1.1552449837327003, "advantage_std": 0.99924161657691, "completion_length": 3084.0000915527344, "epoch": 0.31885714285714284, "grad_norm": 1.2428194284439087, "kl": 0.51416015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.37435262574394e-07, "loss": 0.0967, "reward": 0.4770056903362274, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4770056903362274, "reward_after_std": 0.9992415867745876, "reward_before_mean": 0.5843611843883991, "reward_before_std": 1.010837584733963, "reward_change_max": 0.0008535534143447876, "reward_change_mean": -0.10735548380762339, "reward_change_min": -0.21564262732863426, "reward_change_std": 0.09189391741529107, "reward_std": 0.9992415942251682, "rewards/cosine_scaled_reward": -0.09323608374688774, "rewards/format_reward": 0.770833358168602, "step": 279 }, { "advantage_max": 1.2737629637122154, "advantage_mean": -2.4214387439602802e-08, "advantage_min": -1.3873290121555328, "advantage_std": 0.9990286827087402, "completion_length": 2394.5625381469727, "epoch": 0.32, "grad_norm": 0.9669297337532043, "kl": 0.3808135986328125, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.342952264838747e-07, "loss": 0.0576, "reward": 0.9942773611983284, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.9942773611983284, "reward_after_std": 0.999028667807579, "reward_before_mean": 1.1533079743385315, "reward_before_std": 1.0210191681981087, "reward_change_max": 0.0, "reward_change_mean": -0.15903064515441656, "reward_change_min": -0.28807317093014717, "reward_change_std": 0.11587743367999792, "reward_std": 0.9990286976099014, "rewards/cosine_scaled_reward": 0.18082065833732486, "rewards/format_reward": 0.7916666865348816, "step": 280 }, { "advantage_max": 0.9880325570702553, "advantage_mean": -1.8626452047421083e-09, "advantage_min": -0.7407893761992455, "advantage_std": 0.6236730217933655, "completion_length": 3392.2709350585938, "epoch": 0.3211428571428571, "grad_norm": 1.2373231649398804, "kl": 0.67236328125, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.311559558218603e-07, "loss": 0.0384, "reward": 0.04596708505414426, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.04596708505414426, "reward_after_std": 0.6236730217933655, "reward_before_mean": 0.12323334789834917, "reward_before_std": 0.6242800801992416, "reward_change_max": 0.0001795440912246704, "reward_change_mean": -0.0772662702947855, "reward_change_min": -0.14489263761788607, "reward_change_std": 0.05781849008053541, "reward_std": 0.6236730627715588, "rewards/cosine_scaled_reward": -0.24046666733920574, "rewards/format_reward": 0.6041666753590107, "step": 281 }, { "advantage_max": 1.0304613038897514, "advantage_mean": -2.731879727990716e-08, "advantage_min": -0.9215684104710817, "advantage_std": 0.7275769785046577, "completion_length": 2536.229248046875, "epoch": 0.3222857142857143, "grad_norm": 0.366413950920105, "kl": 0.36102294921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.28017603591974e-07, "loss": 0.0364, "reward": 0.8688683672808111, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8688683672808111, "reward_after_std": 0.7275770045816898, "reward_before_mean": 1.0214176895096898, "reward_before_std": 0.722488921135664, "reward_change_max": 0.0, "reward_change_mean": -0.15254933293908834, "reward_change_min": -0.23969142325222492, "reward_change_std": 0.09497861238196492, "reward_std": 0.7275770120322704, "rewards/cosine_scaled_reward": 0.07320883683860302, "rewards/format_reward": 0.8750000074505806, "step": 282 }, { "advantage_max": 1.31711445748806, "advantage_mean": -8.692343955729598e-09, "advantage_min": -0.9042372852563858, "advantage_std": 0.8224817290902138, "completion_length": 2863.3541870117188, "epoch": 0.32342857142857145, "grad_norm": 1.1226824522018433, "kl": 0.5413818359375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.248803227530763e-07, "loss": 0.0282, "reward": 0.6211966900154948, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6211966900154948, "reward_after_std": 0.8224817328155041, "reward_before_mean": 0.7448102962225676, "reward_before_std": 0.814110279083252, "reward_change_max": 0.0, "reward_change_mean": -0.12361361924558878, "reward_change_min": -0.21856752410531044, "reward_change_std": 0.0800158535130322, "reward_std": 0.8224817700684071, "rewards/cosine_scaled_reward": -0.01301151653751731, "rewards/format_reward": 0.7708333469927311, "step": 283 }, { "advantage_max": 1.0646238774061203, "advantage_mean": -2.1730860499946658e-08, "advantage_min": -1.107110746204853, "advantage_std": 0.7788519412279129, "completion_length": 2708.2500762939453, "epoch": 0.32457142857142857, "grad_norm": 0.45679256319999695, "kl": 0.387939453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.21744266211809e-07, "loss": 0.0454, "reward": 0.8224916737526655, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8224916737526655, "reward_after_std": 0.7788519263267517, "reward_before_mean": 0.9693624749779701, "reward_before_std": 0.784541878849268, "reward_change_max": 0.0, "reward_change_mean": -0.14687081146985292, "reward_change_min": -0.2431420534849167, "reward_change_std": 0.09596487786620855, "reward_std": 0.7788519412279129, "rewards/cosine_scaled_reward": 0.047181230038404465, "rewards/format_reward": 0.8750000223517418, "step": 284 }, { "advantage_max": 0.9761894531548023, "advantage_mean": -2.2351741957304938e-08, "advantage_min": -1.0390981957316399, "advantage_std": 0.7664303183555603, "completion_length": 2226.979232788086, "epoch": 0.32571428571428573, "grad_norm": 0.5204533934593201, "kl": 0.251617431640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.186095868151436e-07, "loss": -0.0131, "reward": 0.5884040435776114, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5884040435776114, "reward_after_std": 0.7664303220808506, "reward_before_mean": 0.7158968299627304, "reward_before_std": 0.7851696014404297, "reward_change_max": 0.0, "reward_change_mean": -0.1274928255006671, "reward_change_min": -0.24082167074084282, "reward_change_std": 0.09329877560958266, "reward_std": 0.7664303407073021, "rewards/cosine_scaled_reward": -0.027468256652355194, "rewards/format_reward": 0.7708333469927311, "step": 285 }, { "advantage_max": 1.318112462759018, "advantage_mean": -1.3038516488705909e-08, "advantage_min": -0.8753121644258499, "advantage_std": 0.8130339942872524, "completion_length": 2779.125045776367, "epoch": 0.32685714285714285, "grad_norm": 1.1802911758422852, "kl": 0.4268951416015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.154764373429315e-07, "loss": 0.073, "reward": 0.59721265360713, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.59721265360713, "reward_after_std": 0.813034001737833, "reward_before_mean": 0.7192092165350914, "reward_before_std": 0.80387282371521, "reward_change_max": 5.13419508934021e-05, "reward_change_mean": -0.12199656944721937, "reward_change_min": -0.2146667167544365, "reward_change_std": 0.08022241853177547, "reward_std": 0.8130340240895748, "rewards/cosine_scaled_reward": -0.036228728480637074, "rewards/format_reward": 0.7916666865348816, "step": 286 }, { "advantage_max": 1.050818793475628, "advantage_mean": -1.738468857759301e-08, "advantage_min": -1.055874053388834, "advantage_std": 0.798844076693058, "completion_length": 2294.9584197998047, "epoch": 0.328, "grad_norm": 0.42027631402015686, "kl": 0.3113861083984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.123449705004581e-07, "loss": 0.018, "reward": 0.5748773384839296, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5748773384839296, "reward_after_std": 0.7988440804183483, "reward_before_mean": 0.6994684183155186, "reward_before_std": 0.816389974206686, "reward_change_max": 0.0005205199122428894, "reward_change_mean": -0.12459108140319586, "reward_change_min": -0.2359716072678566, "reward_change_std": 0.09250497492030263, "reward_std": 0.7988441213965416, "rewards/cosine_scaled_reward": -0.035682463087141514, "rewards/format_reward": 0.770833358168602, "step": 287 }, { "advantage_max": 1.2926170453429222, "advantage_mean": -9.313226134732844e-09, "advantage_min": -1.0218810439109802, "advantage_std": 0.8248549252748489, "completion_length": 2799.666732788086, "epoch": 0.3291428571428571, "grad_norm": 0.7103667855262756, "kl": 0.32928466796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.09215338910999e-07, "loss": 0.0461, "reward": 0.7114497211296111, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7114497211296111, "reward_after_std": 0.8248548954725266, "reward_before_mean": 0.8447219170629978, "reward_before_std": 0.8177867233753204, "reward_change_max": 0.0, "reward_change_mean": -0.13327217940241098, "reward_change_min": -0.22586338967084885, "reward_change_std": 0.086294736713171, "reward_std": 0.8248549252748489, "rewards/cosine_scaled_reward": -0.004722386132925749, "rewards/format_reward": 0.8541666828095913, "step": 288 }, { "advantage_max": 0.8171020597219467, "advantage_mean": -2.4835277168122616e-09, "advantage_min": -0.8185729384422302, "advantage_std": 0.5793700739741325, "completion_length": 2317.0834197998047, "epoch": 0.3302857142857143, "grad_norm": 0.2669129967689514, "kl": 0.31494140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.060876951083828e-07, "loss": 0.03, "reward": 0.7267960589379072, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7267960589379072, "reward_after_std": 0.5793700367212296, "reward_before_mean": 0.8693213667720556, "reward_before_std": 0.5672201681882143, "reward_change_max": 0.0003613904118537903, "reward_change_mean": -0.1425252864137292, "reward_change_min": -0.21070224232971668, "reward_change_std": 0.08446116000413895, "reward_std": 0.5793700478971004, "rewards/cosine_scaled_reward": -0.002839326858520508, "rewards/format_reward": 0.8750000074505806, "step": 289 }, { "advantage_max": 1.2880224622786045, "advantage_mean": -2.1575639885806908e-08, "advantage_min": -0.8768154866993427, "advantage_std": 0.7828379794955254, "completion_length": 2832.7709045410156, "epoch": 0.3314285714285714, "grad_norm": 1.381922960281372, "kl": 0.517333984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 5.02962191529556e-07, "loss": 0.0092, "reward": 0.6558038564398885, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6558038564398885, "reward_after_std": 0.7828379720449448, "reward_before_mean": 0.7833415642380714, "reward_before_std": 0.7664486858993769, "reward_change_max": 0.0001693814992904663, "reward_change_mean": -0.12753771618008614, "reward_change_min": -0.22125390730798244, "reward_change_std": 0.08206295082345605, "reward_std": 0.782837986946106, "rewards/cosine_scaled_reward": -0.07707922626286745, "rewards/format_reward": 0.9375000074505806, "step": 290 }, { "advantage_max": 1.7112552598118782, "advantage_mean": -1.6142924996742636e-08, "advantage_min": -0.9665621928870678, "advantage_std": 1.038928933441639, "completion_length": 2836.479248046875, "epoch": 0.3325714285714286, "grad_norm": 0.6068376302719116, "kl": 0.44158935546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.998389805071536e-07, "loss": 0.0334, "reward": 0.6122805885970592, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6122805885970592, "reward_after_std": 1.0389289036393166, "reward_before_mean": 0.7282662447541952, "reward_before_std": 1.0348973795771599, "reward_change_max": 0.0002287551760673523, "reward_change_mean": -0.11598564963787794, "reward_change_min": -0.23866364359855652, "reward_change_std": 0.08767437376081944, "reward_std": 1.0389289483428001, "rewards/cosine_scaled_reward": -0.042116889264434576, "rewards/format_reward": 0.8125000074505806, "step": 291 }, { "advantage_max": 0.834974117577076, "advantage_mean": 2.483526828633842e-09, "advantage_min": -0.7933941446244717, "advantage_std": 0.5692648068070412, "completion_length": 3146.5625610351562, "epoch": 0.33371428571428574, "grad_norm": 0.9853768348693848, "kl": 0.4609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.967182142620745e-07, "loss": 0.0284, "reward": 0.31484566256403923, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.31484566256403923, "reward_after_std": 0.5692648142576218, "reward_before_mean": 0.41969927214086056, "reward_before_std": 0.5654185600578785, "reward_change_max": 0.0, "reward_change_mean": -0.10485361609607935, "reward_change_min": -0.16875915229320526, "reward_change_std": 0.06821413105353713, "reward_std": 0.569264829158783, "rewards/cosine_scaled_reward": -0.1339003685861826, "rewards/format_reward": 0.6875000186264515, "step": 292 }, { "advantage_max": 0.906766127794981, "advantage_mean": -5.122274235325186e-08, "advantage_min": -0.7590428665280342, "advantage_std": 0.6250920966267586, "completion_length": 2332.7709197998047, "epoch": 0.33485714285714285, "grad_norm": 1.096217393875122, "kl": 0.2679443359375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.93600044896063e-07, "loss": -0.0133, "reward": 0.7127600498497486, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7127600498497486, "reward_after_std": 0.6250920966267586, "reward_before_mean": 0.8526535518467426, "reward_before_std": 0.6178444065153599, "reward_change_max": 0.0003881379961967468, "reward_change_mean": -0.13989355321973562, "reward_change_min": -0.22731606289744377, "reward_change_std": 0.08642083266749978, "reward_std": 0.6250921115279198, "rewards/cosine_scaled_reward": -0.0007565605919808149, "rewards/format_reward": 0.854166679084301, "step": 293 }, { "advantage_max": 0.9793417081236839, "advantage_mean": -2.1730860388924356e-08, "advantage_min": -0.9472349882125854, "advantage_std": 0.7169453538954258, "completion_length": 3070.6875915527344, "epoch": 0.336, "grad_norm": 0.9082557559013367, "kl": 0.422149658203125, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.904846243842949e-07, "loss": 0.0235, "reward": 0.5174050983041525, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5174050983041525, "reward_after_std": 0.7169453427195549, "reward_before_mean": 0.6380012258887291, "reward_before_std": 0.7268225848674774, "reward_change_max": 0.00013885647058486938, "reward_change_mean": -0.12059612292796373, "reward_change_min": -0.21557171922177076, "reward_change_std": 0.08509104792028666, "reward_std": 0.7169453501701355, "rewards/cosine_scaled_reward": -0.01433273358270526, "rewards/format_reward": 0.6666666809469461, "step": 294 }, { "advantage_max": 1.2744291499257088, "advantage_mean": -2.6077031867188794e-08, "advantage_min": -1.3888452351093292, "advantage_std": 1.0510571710765362, "completion_length": 2997.5001068115234, "epoch": 0.33714285714285713, "grad_norm": 0.5747594237327576, "kl": 0.39453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.873721045679706e-07, "loss": 0.0133, "reward": 1.1005014963448048, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 1.1005014963448048, "reward_after_std": 1.0510571599006653, "reward_before_mean": 1.2698866855353117, "reward_before_std": 1.082613728940487, "reward_change_max": 0.0, "reward_change_mean": -0.1693851826712489, "reward_change_min": -0.2982129603624344, "reward_change_std": 0.1265355981886387, "reward_std": 1.0510571897029877, "rewards/cosine_scaled_reward": 0.23910999950021505, "rewards/format_reward": 0.7916666865348816, "step": 295 }, { "advantage_max": 0.8809316977858543, "advantage_mean": -2.0489097363185493e-08, "advantage_min": -0.9141372889280319, "advantage_std": 0.693729005753994, "completion_length": 3155.250030517578, "epoch": 0.3382857142857143, "grad_norm": 0.6257061958312988, "kl": 0.3314208984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.842626371469149e-07, "loss": 0.0234, "reward": 0.43978652730584145, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.43978652730584145, "reward_after_std": 0.6937290169298649, "reward_before_mean": 0.5553540624678135, "reward_before_std": 0.7088712975382805, "reward_change_max": 0.0, "reward_change_mean": -0.11556753842160106, "reward_change_min": -0.20852719619870186, "reward_change_std": 0.08426861232146621, "reward_std": 0.6937290467321873, "rewards/cosine_scaled_reward": -0.09732297994196415, "rewards/format_reward": 0.7500000149011612, "step": 296 }, { "advantage_max": 1.0527857542037964, "advantage_mean": -8.071462664904772e-09, "advantage_min": -0.8540095165371895, "advantage_std": 0.7035439126193523, "completion_length": 3335.041717529297, "epoch": 0.3394285714285714, "grad_norm": 0.4017558693885803, "kl": 0.2974853515625, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.811563736721829e-07, "loss": 0.0169, "reward": 0.5707921356661245, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5707921356661245, "reward_after_std": 0.7035439126193523, "reward_before_mean": 0.6951440321281552, "reward_before_std": 0.6948239048942924, "reward_change_max": 1.8790364265441895e-05, "reward_change_mean": -0.12435187119990587, "reward_change_min": -0.21546250581741333, "reward_change_std": 0.08488783519715071, "reward_std": 0.7035439349710941, "rewards/cosine_scaled_reward": -0.01701133605092764, "rewards/format_reward": 0.7291666902601719, "step": 297 }, { "advantage_max": 0.8668230138719082, "advantage_mean": -7.450581041013038e-09, "advantage_min": -0.713488981127739, "advantage_std": 0.5623467974364758, "completion_length": 2715.2084045410156, "epoch": 0.3405714285714286, "grad_norm": 0.33654358983039856, "kl": 0.1712646484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.780534655386743e-07, "loss": -0.0011, "reward": 0.5386511981487274, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5386511981487274, "reward_after_std": 0.5623467974364758, "reward_before_mean": 0.663620114326477, "reward_before_std": 0.54873913154006, "reward_change_max": 0.00012120604515075684, "reward_change_mean": -0.12496891105547547, "reward_change_min": -0.19604242593050003, "reward_change_std": 0.07404929213225842, "reward_std": 0.5623467974364758, "rewards/cosine_scaled_reward": -0.09527328005060554, "rewards/format_reward": 0.8541666828095913, "step": 298 }, { "advantage_max": 1.1117474511265755, "advantage_mean": -5.277494585786968e-08, "advantage_min": -0.9139253497123718, "advantage_std": 0.7722109295427799, "completion_length": 3096.2084197998047, "epoch": 0.3417142857142857, "grad_norm": 0.6103402376174927, "kl": 0.14898681640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.749540639777539e-07, "loss": 0.0318, "reward": 0.7331810034811497, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7331810034811497, "reward_after_std": 0.7722109220921993, "reward_before_mean": 0.8717173463664949, "reward_before_std": 0.7732285261154175, "reward_change_max": 8.639693260192871e-05, "reward_change_mean": -0.13853636337444186, "reward_change_min": -0.24009987153112888, "reward_change_std": 0.09422993147745728, "reward_std": 0.7722109258174896, "rewards/cosine_scaled_reward": 0.08169198967516422, "rewards/format_reward": 0.7083333469927311, "step": 299 }, { "advantage_max": 1.3425211235880852, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -0.9276892244815826, "advantage_std": 0.9135083742439747, "completion_length": 3236.479217529297, "epoch": 0.34285714285714286, "grad_norm": 1.0381654500961304, "kl": 0.1722412109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.7185832004988133e-07, "loss": 0.0563, "reward": 0.4564114101231098, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4564114101231098, "reward_after_std": 0.9135083518922329, "reward_before_mean": 0.5646192319691181, "reward_before_std": 0.9233267642557621, "reward_change_max": 0.0, "reward_change_mean": -0.10820781346410513, "reward_change_min": -0.2130340477451682, "reward_change_std": 0.08647576486691833, "reward_std": 0.913508377969265, "rewards/cosine_scaled_reward": -0.07185705937445164, "rewards/format_reward": 0.7083333488553762, "step": 300 }, { "advantage_max": 0.8454348556697369, "advantage_mean": -3.725290520506519e-09, "advantage_min": -0.6558514572679996, "advantage_std": 0.5693494603037834, "completion_length": 2719.812530517578, "epoch": 0.344, "grad_norm": 0.5055859088897705, "kl": 0.1689453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.68766384637248e-07, "loss": 0.0221, "reward": 0.32430399395525455, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.32430399395525455, "reward_after_std": 0.5693494640290737, "reward_before_mean": 0.42983817867934704, "reward_before_std": 0.5612027458846569, "reward_change_max": 5.427747964859009e-05, "reward_change_mean": -0.10553418751806021, "reward_change_min": -0.1772037437185645, "reward_change_std": 0.07288656942546368, "reward_std": 0.5693494826555252, "rewards/cosine_scaled_reward": -0.08716425858438015, "rewards/format_reward": 0.6041666734963655, "step": 301 }, { "advantage_max": 1.102532796561718, "advantage_mean": -1.428027990302283e-08, "advantage_min": -0.8525043353438377, "advantage_std": 0.7396768815815449, "completion_length": 2854.2083892822266, "epoch": 0.34514285714285714, "grad_norm": 0.38935554027557373, "kl": 0.149932861328125, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.656784084364238e-07, "loss": 0.0121, "reward": 0.798190388828516, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.798190388828516, "reward_after_std": 0.7396768666803837, "reward_before_mean": 0.942897442728281, "reward_before_std": 0.7333322390913963, "reward_change_max": 8.501112461090088e-05, "reward_change_mean": -0.14470706274732947, "reward_change_min": -0.24838530272245407, "reward_change_std": 0.09280881565064192, "reward_std": 0.739676907658577, "rewards/cosine_scaled_reward": 0.06519871880300343, "rewards/format_reward": 0.8125000074505806, "step": 302 }, { "advantage_max": 1.1021673679351807, "advantage_mean": -5.277494774524882e-09, "advantage_min": -1.2830260694026947, "advantage_std": 0.8821544721722603, "completion_length": 2851.916748046875, "epoch": 0.3462857142857143, "grad_norm": 0.24663765728473663, "kl": 0.149932861328125, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.6259454195101267e-07, "loss": -0.0177, "reward": 0.7762341545894742, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7762341545894742, "reward_after_std": 0.8821544609963894, "reward_before_mean": 0.9183368273079395, "reward_before_std": 0.9060549177229404, "reward_change_max": 0.0, "reward_change_mean": -0.1421026550233364, "reward_change_min": -0.2564028147608042, "reward_change_std": 0.10370448138564825, "reward_std": 0.8821544758975506, "rewards/cosine_scaled_reward": 0.07375173456966877, "rewards/format_reward": 0.7708333507180214, "step": 303 }, { "advantage_max": 0.8954240456223488, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.7039155513048172, "advantage_std": 0.6271420307457447, "completion_length": 3038.6459350585938, "epoch": 0.3474285714285714, "grad_norm": 0.29657599329948425, "kl": 0.173858642578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.59514935484316e-07, "loss": 0.0226, "reward": 0.5110467355698347, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5110467355698347, "reward_after_std": 0.6271420381963253, "reward_before_mean": 0.6330133527517319, "reward_before_std": 0.6232540085911751, "reward_change_max": 0.0, "reward_change_mean": -0.12196661904454231, "reward_change_min": -0.21976127475500107, "reward_change_std": 0.08127523586153984, "reward_std": 0.6271420530974865, "rewards/cosine_scaled_reward": -0.08974333480000496, "rewards/format_reward": 0.8125000037252903, "step": 304 }, { "advantage_max": 0.9175121039152145, "advantage_mean": -8.381903671139668e-09, "advantage_min": -0.6949810571968555, "advantage_std": 0.5830536782741547, "completion_length": 3196.8958740234375, "epoch": 0.3485714285714286, "grad_norm": 0.1946064531803131, "kl": 0.188507080078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.5643973913200837e-07, "loss": 0.0192, "reward": 0.47706126113189384, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.47706126113189384, "reward_after_std": 0.5830536782741547, "reward_before_mean": 0.595546220894903, "reward_before_std": 0.5727699100971222, "reward_change_max": 8.014589548110962e-05, "reward_change_mean": -0.11848497577011585, "reward_change_min": -0.18940737936645746, "reward_change_std": 0.07213541446253657, "reward_std": 0.5830537006258965, "rewards/cosine_scaled_reward": -0.014726895838975906, "rewards/format_reward": 0.6250000055879354, "step": 305 }, { "advantage_max": 1.2761941775679588, "advantage_mean": -4.004687093051018e-08, "advantage_min": -1.1722888499498367, "advantage_std": 0.9643121734261513, "completion_length": 3052.8334045410156, "epoch": 0.3497142857142857, "grad_norm": 0.893444299697876, "kl": 0.17730712890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.5336910277482155e-07, "loss": 0.0269, "reward": 0.9540768321603537, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.9540768321603537, "reward_after_std": 0.9643121659755707, "reward_before_mean": 1.1099904502625577, "reward_before_std": 0.9816011004149914, "reward_change_max": 0.0, "reward_change_mean": -0.15591357089579105, "reward_change_min": -0.29588284343481064, "reward_change_std": 0.11661555664613843, "reward_std": 0.9643121883273125, "rewards/cosine_scaled_reward": 0.16957851639017463, "rewards/format_reward": 0.7708333507180214, "step": 306 }, { "advantage_max": 1.2773448526859283, "advantage_mean": -1.552204287325054e-08, "advantage_min": -1.1109627187252045, "advantage_std": 0.8845279589295387, "completion_length": 2951.166748046875, "epoch": 0.35085714285714287, "grad_norm": 0.8295795321464539, "kl": 0.1986083984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.503031760712397e-07, "loss": 0.0341, "reward": 0.4779685065150261, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4779685065150261, "reward_after_std": 0.8845279365777969, "reward_before_mean": 0.5894667021930218, "reward_before_std": 0.8981221094727516, "reward_change_max": 0.0001471191644668579, "reward_change_mean": -0.11149819893762469, "reward_change_min": -0.21481576841324568, "reward_change_std": 0.08700747834518552, "reward_std": 0.8845279365777969, "rewards/cosine_scaled_reward": -0.08026665821671486, "rewards/format_reward": 0.7500000204890966, "step": 307 }, { "advantage_max": 1.3166191279888153, "advantage_mean": -2.4524828390326547e-08, "advantage_min": -0.987498015165329, "advantage_std": 0.9139077328145504, "completion_length": 3266.2709350585938, "epoch": 0.352, "grad_norm": 0.409768670797348, "kl": 0.284423828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.4724210845020494e-07, "loss": 0.0142, "reward": 0.44744166173040867, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.44744166173040867, "reward_after_std": 0.9139077588915825, "reward_before_mean": 0.555396830663085, "reward_before_std": 0.9306747391819954, "reward_change_max": 0.0, "reward_change_mean": -0.10795515077188611, "reward_change_min": -0.22507263347506523, "reward_change_std": 0.0874811913818121, "reward_std": 0.9139078035950661, "rewards/cosine_scaled_reward": -0.06605160096660256, "rewards/format_reward": 0.6875000093132257, "step": 308 }, { "advantage_max": 1.3376531079411507, "advantage_mean": -1.6453365614399473e-08, "advantage_min": -1.0297049805521965, "advantage_std": 0.9688749574124813, "completion_length": 3037.020950317383, "epoch": 0.35314285714285715, "grad_norm": 0.8121221661567688, "kl": 0.24346923828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.441860491038345e-07, "loss": 0.0187, "reward": 0.40955112874507904, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.40955112874507904, "reward_after_std": 0.9688749276101589, "reward_before_mean": 0.5138961069751531, "reward_before_std": 0.99384855479002, "reward_change_max": 0.0001594945788383484, "reward_change_mean": -0.10434500779956579, "reward_change_min": -0.22386514395475388, "reward_change_std": 0.09169822558760643, "reward_std": 0.9688749499619007, "rewards/cosine_scaled_reward": -0.06596861826255918, "rewards/format_reward": 0.6458333488553762, "step": 309 }, { "advantage_max": 1.017396479845047, "advantage_mean": -1.1796752963366686e-08, "advantage_min": -0.9639165177941322, "advantage_std": 0.7473810315132141, "completion_length": 2709.916732788086, "epoch": 0.35428571428571426, "grad_norm": 0.41897907853126526, "kl": 0.2879638671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.4113514698014953e-07, "loss": 0.0421, "reward": 0.46168462419882417, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.46168462419882417, "reward_after_std": 0.7473810240626335, "reward_before_mean": 0.5762034375220537, "reward_before_std": 0.7604282721877098, "reward_change_max": 0.0, "reward_change_mean": -0.11451879888772964, "reward_change_min": -0.22137450985610485, "reward_change_std": 0.08378444751724601, "reward_std": 0.7473810315132141, "rewards/cosine_scaled_reward": -0.07648163288831711, "rewards/format_reward": 0.729166679084301, "step": 310 }, { "advantage_max": 1.0443845875561237, "advantage_mean": -3.818422714130243e-08, "advantage_min": -1.2661523073911667, "advantage_std": 0.8653931841254234, "completion_length": 2787.8126220703125, "epoch": 0.3554285714285714, "grad_norm": 1.1151634454727173, "kl": 0.28533935546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.3808955077581546e-07, "loss": 0.068, "reward": 0.7599540562368929, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7599540562368929, "reward_after_std": 0.8653931654989719, "reward_before_mean": 0.9020670726895332, "reward_before_std": 0.8923026695847511, "reward_change_max": 1.7762184143066406e-05, "reward_change_mean": -0.14211303647607565, "reward_change_min": -0.26006680727005005, "reward_change_std": 0.10649908194318414, "reward_std": 0.865393191576004, "rewards/cosine_scaled_reward": 0.076033522374928, "rewards/format_reward": 0.7500000260770321, "step": 311 }, { "advantage_max": 0.9848646819591522, "advantage_mean": -1.241763458725842e-08, "advantage_min": -0.8820649348199368, "advantage_std": 0.672922782599926, "completion_length": 2488.7500915527344, "epoch": 0.3565714285714286, "grad_norm": 0.5053475499153137, "kl": 0.272918701171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.350494089288943e-07, "loss": 0.0146, "reward": 1.1891271751374006, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 1.1891271751374006, "reward_after_std": 0.6729227676987648, "reward_before_mean": 1.3720673564821482, "reward_before_std": 0.6550161205232143, "reward_change_max": 0.0, "reward_change_mean": -0.18294014502316713, "reward_change_min": -0.2811252400279045, "reward_change_std": 0.10329680563881993, "reward_std": 0.6729227751493454, "rewards/cosine_scaled_reward": 0.2172836670652032, "rewards/format_reward": 0.9375000074505806, "step": 312 }, { "advantage_max": 1.3496510535478592, "advantage_mean": -1.1175871006408045e-08, "advantage_min": -1.0200391113758087, "advantage_std": 0.8189485613256693, "completion_length": 2915.5208892822266, "epoch": 0.3577142857142857, "grad_norm": 0.46193355321884155, "kl": 0.361846923828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.3201486961161093e-07, "loss": 0.02, "reward": 0.9410210661590099, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.9410210661590099, "reward_after_std": 0.8189485538750887, "reward_before_mean": 1.0941763501614332, "reward_before_std": 0.7930347509682178, "reward_change_max": 0.0, "reward_change_mean": -0.15315525699406862, "reward_change_min": -0.24925747141242027, "reward_change_std": 0.09821537788957357, "reward_std": 0.8189485874027014, "rewards/cosine_scaled_reward": 0.1512548227328807, "rewards/format_reward": 0.7916666865348816, "step": 313 }, { "advantage_max": 1.112821839749813, "advantage_mean": -6.643434680153604e-08, "advantage_min": -0.8538470044732094, "advantage_std": 0.7443932630121708, "completion_length": 2624.104248046875, "epoch": 0.3588571428571429, "grad_norm": 0.8362889289855957, "kl": 0.273956298828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.2898608072313045e-07, "loss": 0.0064, "reward": 1.070881293155253, "reward_advantage_correlation": 1.0, "reward_after_mean": 1.070881293155253, "reward_after_std": 0.744393277913332, "reward_before_mean": 1.2406534266774543, "reward_before_std": 0.7279466539621353, "reward_change_max": 0.0, "reward_change_mean": -0.16977215185761452, "reward_change_min": -0.2630367986857891, "reward_change_std": 0.1019047787413001, "reward_std": 0.7443933002650738, "rewards/cosine_scaled_reward": 0.1724100224673748, "rewards/format_reward": 0.8958333432674408, "step": 314 }, { "advantage_max": 1.2765894085168839, "advantage_mean": -2.359350637082258e-08, "advantage_min": -1.1392202600836754, "advantage_std": 0.9222354628145695, "completion_length": 3017.5000610351562, "epoch": 0.36, "grad_norm": 0.67600417137146, "kl": 0.384124755859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.2596318988235037e-07, "loss": 0.0605, "reward": 0.5837108045816422, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5837108045816422, "reward_after_std": 0.9222354479134083, "reward_before_mean": 0.7050627954304218, "reward_before_std": 0.943105410784483, "reward_change_max": 0.0, "reward_change_mean": -0.12135198432952166, "reward_change_min": -0.23846616130322218, "reward_change_std": 0.09463956812396646, "reward_std": 0.9222354926168919, "rewards/cosine_scaled_reward": -0.012051953002810478, "rewards/format_reward": 0.7291666828095913, "step": 315 }, { "advantage_max": 1.2091378793120384, "advantage_mean": 1.8626449826975033e-09, "advantage_min": -0.8695778399705887, "advantage_std": 0.7904097400605679, "completion_length": 3343.4584350585938, "epoch": 0.36114285714285715, "grad_norm": 0.8994598984718323, "kl": 0.4453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.2294634442070553e-07, "loss": 0.0191, "reward": 0.16371046472340822, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16371046472340822, "reward_after_std": 0.790409728884697, "reward_before_mean": 0.2475222758948803, "reward_before_std": 0.7998642735183239, "reward_change_max": 0.00028289854526519775, "reward_change_mean": -0.08381181326694787, "reward_change_min": -0.16807855293154716, "reward_change_std": 0.06676852668169886, "reward_std": 0.7904097698628902, "rewards/cosine_scaled_reward": -0.17832219880074263, "rewards/format_reward": 0.6041666883975267, "step": 316 }, { "advantage_max": 1.1376716941595078, "advantage_mean": -1.5522043539384356e-08, "advantage_min": -1.1290230266749859, "advantage_std": 0.894180990755558, "completion_length": 3227.0625915527344, "epoch": 0.36228571428571427, "grad_norm": 0.6548424959182739, "kl": 0.5126953125, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.1993569137498776e-07, "loss": 0.0612, "reward": 0.5353844849159941, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5353844849159941, "reward_after_std": 0.8941810317337513, "reward_before_mean": 0.6545494571328163, "reward_before_std": 0.9253632761538029, "reward_change_max": 9.782612323760986e-05, "reward_change_mean": -0.11916495487093925, "reward_change_min": -0.24563197046518326, "reward_change_std": 0.0988409100100398, "reward_std": 0.8941810466349125, "rewards/cosine_scaled_reward": -0.03730860911309719, "rewards/format_reward": 0.7291666828095913, "step": 317 }, { "advantage_max": 1.2796707078814507, "advantage_mean": 6.208814573582799e-10, "advantage_min": -1.1328086107969284, "advantage_std": 0.889403197914362, "completion_length": 2370.916763305664, "epoch": 0.36342857142857143, "grad_norm": 0.7421766519546509, "kl": 0.215087890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.1693137748017915e-07, "loss": -0.0195, "reward": 0.7891982682049274, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7891982682049274, "reward_after_std": 0.8894031867384911, "reward_before_mean": 0.9295575264841318, "reward_before_std": 0.8947510085999966, "reward_change_max": 0.000398002564907074, "reward_change_mean": -0.14035920519381762, "reward_change_min": -0.25510624423623085, "reward_change_std": 0.09936856152489781, "reward_std": 0.8894032202661037, "rewards/cosine_scaled_reward": 0.04811206506565213, "rewards/format_reward": 0.8333333544433117, "step": 318 }, { "advantage_max": 1.3221676275134087, "advantage_mean": -2.7939677571531263e-08, "advantage_min": -0.9854774251580238, "advantage_std": 0.8276299238204956, "completion_length": 3122.7917404174805, "epoch": 0.36457142857142855, "grad_norm": 0.7270869016647339, "kl": 0.437164306640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.1393354916230005e-07, "loss": 0.0172, "reward": 0.21932624652981758, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.21932624652981758, "reward_after_std": 0.8276299238204956, "reward_before_mean": 0.30613830126821995, "reward_before_std": 0.8316191211342812, "reward_change_max": 0.00018126517534255981, "reward_change_mean": -0.08681207243353128, "reward_change_min": -0.17364921793341637, "reward_change_std": 0.06764841824769974, "reward_std": 0.8276299461722374, "rewards/cosine_scaled_reward": -0.15943086054176092, "rewards/format_reward": 0.6250000149011612, "step": 319 }, { "advantage_max": 0.8936111852526665, "advantage_mean": -1.2107193581023523e-08, "advantage_min": -0.8252501785755157, "advantage_std": 0.6222108080983162, "completion_length": 2639.7083740234375, "epoch": 0.3657142857142857, "grad_norm": 0.6508719325065613, "kl": 0.24078369140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.1094235253127374e-07, "loss": 0.0466, "reward": 0.7138673812150955, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7138673812150955, "reward_after_std": 0.6222108118236065, "reward_before_mean": 0.8545886836946011, "reward_before_std": 0.6140736099332571, "reward_change_max": 0.0, "reward_change_mean": -0.14072128757834435, "reward_change_min": -0.2278207279741764, "reward_change_std": 0.08737712493166327, "reward_std": 0.6222108118236065, "rewards/cosine_scaled_reward": 0.00021099857985973358, "rewards/format_reward": 0.8541666753590107, "step": 320 }, { "advantage_max": 1.420757032930851, "advantage_mean": -3.290673195044391e-08, "advantage_min": -0.9632410444319248, "advantage_std": 0.9021714180707932, "completion_length": 2570.7084045410156, "epoch": 0.3668571428571429, "grad_norm": 0.9943587779998779, "kl": 0.2659912109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.079579333738039e-07, "loss": 0.0601, "reward": 0.8740504225715995, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8740504225715995, "reward_after_std": 0.9021714143455029, "reward_before_mean": 1.0198165625333786, "reward_before_std": 0.8919219821691513, "reward_change_max": 0.0, "reward_change_mean": -0.14576612878590822, "reward_change_min": -0.251723725348711, "reward_change_std": 0.09419048205018044, "reward_std": 0.9021714515984058, "rewards/cosine_scaled_reward": 0.1036582519300282, "rewards/format_reward": 0.8125000111758709, "step": 321 }, { "advantage_max": 1.508301742374897, "advantage_mean": -1.5211601978037947e-08, "advantage_min": -1.0270988121628761, "advantage_std": 0.931220531463623, "completion_length": 3147.7083892822266, "epoch": 0.368, "grad_norm": 0.33928442001342773, "kl": 0.297698974609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.0498043714627006e-07, "loss": 0.033, "reward": 0.5224417466670275, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5224417466670275, "reward_after_std": 0.9312205240130424, "reward_before_mean": 0.634037971496582, "reward_before_std": 0.9298334568738937, "reward_change_max": 0.00011686980724334717, "reward_change_mean": -0.11159622902050614, "reward_change_min": -0.20448446460068226, "reward_change_std": 0.08062839088961482, "reward_std": 0.9312205761671066, "rewards/cosine_scaled_reward": -0.057981026358902454, "rewards/format_reward": 0.7500000111758709, "step": 322 }, { "advantage_max": 0.8821052312850952, "advantage_mean": -4.159907673884078e-08, "advantage_min": -0.8327700607478619, "advantage_std": 0.6381414122879505, "completion_length": 3029.5000762939453, "epoch": 0.36914285714285716, "grad_norm": 0.443503737449646, "kl": 0.2681427001953125, "lambda_div_used": 0.9000000000000001, "learning_rate": 4.020100089676376e-07, "loss": 0.0181, "reward": 0.4748626947402954, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4748626947402954, "reward_after_std": 0.6381414271891117, "reward_before_mean": 0.5938120167702436, "reward_before_std": 0.6410843506455421, "reward_change_max": 0.00010659545660018921, "reward_change_mean": -0.11894936440512538, "reward_change_min": -0.2031552977859974, "reward_change_std": 0.07823430374264717, "reward_std": 0.6381414458155632, "rewards/cosine_scaled_reward": -0.07809399953112006, "rewards/format_reward": 0.7500000111758709, "step": 323 }, { "advantage_max": 1.1430136933922768, "advantage_mean": -1.800557053455165e-08, "advantage_min": -0.9775372706353664, "advantage_std": 0.7667670547962189, "completion_length": 3201.3959045410156, "epoch": 0.3702857142857143, "grad_norm": 0.37577441334724426, "kl": 0.375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.9904679361238526e-07, "loss": 0.0391, "reward": 0.40068634756607935, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.40068634756607935, "reward_after_std": 0.7667670398950577, "reward_before_mean": 0.5081015911418945, "reward_before_std": 0.7719154208898544, "reward_change_max": 0.0002768710255622864, "reward_change_mean": -0.1074152598157525, "reward_change_min": -0.19459333643317223, "reward_change_std": 0.07792914099991322, "reward_std": 0.7667670398950577, "rewards/cosine_scaled_reward": -0.08969920873641968, "rewards/format_reward": 0.6875000186264515, "step": 324 }, { "advantage_max": 1.1993984952569008, "advantage_mean": -1.2417634698280722e-08, "advantage_min": -0.8039500899612904, "advantage_std": 0.7809208258986473, "completion_length": 3013.3750534057617, "epoch": 0.37142857142857144, "grad_norm": 0.4150097966194153, "kl": 0.376617431640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.9609093550344907e-07, "loss": 0.0383, "reward": 0.3445624615997076, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3445624615997076, "reward_after_std": 0.780920822173357, "reward_before_mean": 0.44553670287132263, "reward_before_std": 0.78526845946908, "reward_change_max": 0.000149555504322052, "reward_change_mean": -0.10097424406558275, "reward_change_min": -0.20076271332800388, "reward_change_std": 0.0749687859788537, "reward_std": 0.7809208557009697, "rewards/cosine_scaled_reward": -0.0897316625341773, "rewards/format_reward": 0.6250000093132257, "step": 325 }, { "advantage_max": 1.1256632208824158, "advantage_mean": -5.463759156221215e-08, "advantage_min": -0.97540083527565, "advantage_std": 0.7677330262959003, "completion_length": 2688.729263305664, "epoch": 0.37257142857142855, "grad_norm": 0.29752317070961, "kl": 0.26910400390625, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.931425787051832e-07, "loss": 0.0378, "reward": 0.792833048501052, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.792833048501052, "reward_after_std": 0.7677330300211906, "reward_before_mean": 0.936364995315671, "reward_before_std": 0.7656964734196663, "reward_change_max": 0.0, "reward_change_mean": -0.14353200886398554, "reward_change_min": -0.2429923191666603, "reward_change_std": 0.09230223717167974, "reward_std": 0.7677330449223518, "rewards/cosine_scaled_reward": 0.051515836268663406, "rewards/format_reward": 0.8333333432674408, "step": 326 }, { "advantage_max": 1.245759092271328, "advantage_mean": -5.401671110405459e-08, "advantage_min": -1.0755857825279236, "advantage_std": 0.8496082611382008, "completion_length": 2883.9375534057617, "epoch": 0.3737142857142857, "grad_norm": 0.5994224548339844, "kl": 0.33734130859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.902018669163384e-07, "loss": 0.0484, "reward": 0.8499644990079105, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8499644990079105, "reward_after_std": 0.8496082648634911, "reward_before_mean": 0.9967611813917756, "reward_before_std": 0.8455248475074768, "reward_change_max": 0.0, "reward_change_mean": -0.14679674478247762, "reward_change_min": -0.23957818932831287, "reward_change_std": 0.09821966802701354, "reward_std": 0.8496082872152328, "rewards/cosine_scaled_reward": 0.1025472705514403, "rewards/format_reward": 0.791666679084301, "step": 327 }, { "advantage_max": 1.5196349136531353, "advantage_mean": -2.7318796225195285e-08, "advantage_min": -0.9332630708813667, "advantage_std": 0.9141910150647163, "completion_length": 3421.2084045410156, "epoch": 0.37485714285714283, "grad_norm": 0.5353078842163086, "kl": 0.3829345703125, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.872689434630585e-07, "loss": 0.0259, "reward": 0.2595358984544873, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2595358984544873, "reward_after_std": 0.9141910225152969, "reward_before_mean": 0.34659624099731445, "reward_before_std": 0.917808011174202, "reward_change_max": 0.0003191158175468445, "reward_change_mean": -0.0870603434741497, "reward_change_min": -0.1851830156520009, "reward_change_std": 0.07253103656694293, "reward_std": 0.9141910411417484, "rewards/cosine_scaled_reward": -0.0767018897458911, "rewards/format_reward": 0.5000000093132257, "step": 328 }, { "advantage_max": 1.1640001758933067, "advantage_mean": -3.7252904094842165e-08, "advantage_min": -0.9084269776940346, "advantage_std": 0.7873306609690189, "completion_length": 2226.8542308807373, "epoch": 0.376, "grad_norm": 0.33948275446891785, "kl": 0.23187255859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.843439512918949e-07, "loss": 0.0203, "reward": 0.8589357230812311, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8589357230812311, "reward_after_std": 0.7873306758701801, "reward_before_mean": 1.0080676265060902, "reward_before_std": 0.779254674911499, "reward_change_max": 9.436160326004028e-05, "reward_change_mean": -0.14913191087543964, "reward_change_min": -0.250965254381299, "reward_change_std": 0.09604426752775908, "reward_std": 0.7873307056725025, "rewards/cosine_scaled_reward": 0.0873671374283731, "rewards/format_reward": 0.8333333395421505, "step": 329 }, { "advantage_max": 1.157850719988346, "advantage_mean": -6.208817349140361e-09, "advantage_min": -0.9563889093697071, "advantage_std": 0.7745515741407871, "completion_length": 2771.8125610351562, "epoch": 0.37714285714285717, "grad_norm": 0.6797903180122375, "kl": 0.32012939453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.8142703296283953e-07, "loss": 0.0015, "reward": 0.566857360303402, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.566857360303402, "reward_after_std": 0.7745515815913677, "reward_before_mean": 0.6887968610972166, "reward_before_std": 0.7713383361697197, "reward_change_max": 0.0, "reward_change_mean": -0.12193948682397604, "reward_change_min": -0.20816569216549397, "reward_change_std": 0.0807517715729773, "reward_std": 0.7745516300201416, "rewards/cosine_scaled_reward": -0.05143492412753403, "rewards/format_reward": 0.7916666846722364, "step": 330 }, { "advantage_max": 1.1063204184174538, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -0.5477707237005234, "advantage_std": 0.6514084860682487, "completion_length": 2844.1458492279053, "epoch": 0.3782857142857143, "grad_norm": 0.8791049718856812, "kl": 0.452423095703125, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.785183306423767e-07, "loss": 0.0217, "reward": -0.13264837488532066, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.13264837488532066, "reward_after_std": 0.6514084823429585, "reward_before_mean": -0.07454398460686207, "reward_before_std": 0.6529114097356796, "reward_change_max": 0.00013102591037750244, "reward_change_mean": -0.05810439004562795, "reward_change_min": -0.11956090945750475, "reward_change_std": 0.04494661255739629, "reward_std": 0.6514085009694099, "rewards/cosine_scaled_reward": -0.24560532672330737, "rewards/format_reward": 0.4166666753590107, "step": 331 }, { "advantage_max": 1.065722979605198, "advantage_mean": 3.3306690738754696e-16, "advantage_min": -1.053912278264761, "advantage_std": 0.8020082227885723, "completion_length": 2842.166717529297, "epoch": 0.37942857142857145, "grad_norm": 0.8386214971542358, "kl": 0.2808074951171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.7561798609655373e-07, "loss": 0.0607, "reward": 0.7436191029846668, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7436191029846668, "reward_after_std": 0.8020082227885723, "reward_before_mean": 0.8833221234381199, "reward_before_std": 0.8112938217818737, "reward_change_max": 3.3855438232421875e-05, "reward_change_mean": -0.13970299996435642, "reward_change_min": -0.25867921486496925, "reward_change_std": 0.09962275065481663, "reward_std": 0.8020082265138626, "rewards/cosine_scaled_reward": 0.0354110449552536, "rewards/format_reward": 0.8125000223517418, "step": 332 }, { "advantage_max": 0.9847873151302338, "advantage_mean": -1.1175871117430347e-08, "advantage_min": -0.936510294675827, "advantage_std": 0.7264337800443172, "completion_length": 2575.6250915527344, "epoch": 0.38057142857142856, "grad_norm": 0.5724052786827087, "kl": 0.20947265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.72726140684072e-07, "loss": 0.0837, "reward": 0.7284279093146324, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7284279093146324, "reward_after_std": 0.7264337874948978, "reward_before_mean": 0.8686392232775688, "reward_before_std": 0.732000857591629, "reward_change_max": 6.556510925292969e-07, "reward_change_mean": -0.14021131303161383, "reward_change_min": -0.232681673951447, "reward_change_std": 0.09069301281124353, "reward_std": 0.7264338135719299, "rewards/cosine_scaled_reward": 0.007236262783408165, "rewards/format_reward": 0.854166679084301, "step": 333 }, { "advantage_max": 0.7661653384566307, "advantage_mean": -9.31322685637781e-10, "advantage_min": -0.93858827278018, "advantage_std": 0.6230427138507366, "completion_length": 3269.1875915527344, "epoch": 0.38171428571428573, "grad_norm": 0.38517114520072937, "kl": 0.3935546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.6984293534939737e-07, "loss": 0.0393, "reward": 0.34002362564206123, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.34002362564206123, "reward_after_std": 0.6230427138507366, "reward_before_mean": 0.44862550497055054, "reward_before_std": 0.6393397077918053, "reward_change_max": 4.176795482635498e-05, "reward_change_mean": -0.1086018648929894, "reward_change_min": -0.1885532783344388, "reward_change_std": 0.07740613957867026, "reward_std": 0.6230427287518978, "rewards/cosine_scaled_reward": -0.14027060009539127, "rewards/format_reward": 0.7291666902601719, "step": 334 }, { "advantage_max": 1.0034086257219315, "advantage_mean": 1.4280279680978225e-08, "advantage_min": -0.8655779659748077, "advantage_std": 0.7271784171462059, "completion_length": 2679.8541717529297, "epoch": 0.38285714285714284, "grad_norm": 0.5499774217605591, "kl": 0.272064208984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.6696851061588994e-07, "loss": 0.0389, "reward": 0.7168587098713033, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7168587098713033, "reward_after_std": 0.7271784208714962, "reward_before_mean": 0.8559580298606306, "reward_before_std": 0.7216392774134874, "reward_change_max": 0.00017315149307250977, "reward_change_mean": -0.13909927383065224, "reward_change_min": -0.2359151355922222, "reward_change_std": 0.09495427086949348, "reward_std": 0.7271784581243992, "rewards/cosine_scaled_reward": 0.06339564686641097, "rewards/format_reward": 0.7291666716337204, "step": 335 }, { "advantage_max": 1.1399415507912636, "advantage_mean": -3.29067312843101e-08, "advantage_min": -1.0761992260813713, "advantage_std": 0.8604688420891762, "completion_length": 3144.4584350585938, "epoch": 0.384, "grad_norm": 0.7100808024406433, "kl": 0.298583984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.641030065789562e-07, "loss": 0.0395, "reward": 0.699290337972343, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.699290337972343, "reward_after_std": 0.8604688420891762, "reward_before_mean": 0.8338933810591698, "reward_before_std": 0.8771174177527428, "reward_change_max": 0.0, "reward_change_mean": -0.1346030319109559, "reward_change_min": -0.24664145708084106, "reward_change_std": 0.09776118211448193, "reward_std": 0.860468864440918, "rewards/cosine_scaled_reward": 0.0002800021320581436, "rewards/format_reward": 0.833333358168602, "step": 336 }, { "advantage_max": 1.0692218244075775, "advantage_mean": -2.8560560083601416e-08, "advantage_min": -1.02787471935153, "advantage_std": 0.8040499426424503, "completion_length": 3072.0000610351562, "epoch": 0.3851428571428571, "grad_norm": 0.36398187279701233, "kl": 0.3551025390625, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.612465628992203e-07, "loss": 0.0351, "reward": 0.6912481244653463, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6912481244653463, "reward_after_std": 0.8040499426424503, "reward_before_mean": 0.8266817089170218, "reward_before_std": 0.8181922435760498, "reward_change_max": 0.0, "reward_change_mean": -0.13543362356722355, "reward_change_min": -0.24755028635263443, "reward_change_std": 0.09691136796027422, "reward_std": 0.8040499612689018, "rewards/cosine_scaled_reward": -0.0033258050680160522, "rewards/format_reward": 0.8333333507180214, "step": 337 }, { "advantage_max": 1.2663331478834152, "advantage_mean": -1.676380706472358e-08, "advantage_min": -0.9755394570529461, "advantage_std": 0.8277748636901379, "completion_length": 2442.7708892822266, "epoch": 0.3862857142857143, "grad_norm": 1.02471125125885, "kl": 0.27752685546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.5839931879571725e-07, "loss": 0.036, "reward": 0.5828519398346543, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5828519398346543, "reward_after_std": 0.8277748636901379, "reward_before_mean": 0.7042157929390669, "reward_before_std": 0.8291206769645214, "reward_change_max": 0.00014644861221313477, "reward_change_mean": -0.12136386055499315, "reward_change_min": -0.21317408978939056, "reward_change_std": 0.08293768810108304, "reward_std": 0.8277748972177505, "rewards/cosine_scaled_reward": -0.022892115055583417, "rewards/format_reward": 0.7500000093132257, "step": 338 }, { "advantage_max": 1.0612557902932167, "advantage_mean": 1.924733389335742e-08, "advantage_min": -0.9199167042970657, "advantage_std": 0.7026827409863472, "completion_length": 3077.7709045410156, "epoch": 0.38742857142857146, "grad_norm": 0.5753820538520813, "kl": 0.400177001953125, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.555614130391079e-07, "loss": 0.0287, "reward": 0.3159185843542218, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3159185843542218, "reward_after_std": 0.7026827484369278, "reward_before_mean": 0.41648441832512617, "reward_before_std": 0.7069019302725792, "reward_change_max": 0.00016146153211593628, "reward_change_mean": -0.10056579299271107, "reward_change_min": -0.1769854985177517, "reward_change_std": 0.07176952017471194, "reward_std": 0.7026827856898308, "rewards/cosine_scaled_reward": -0.08342447318136692, "rewards/format_reward": 0.583333345130086, "step": 339 }, { "advantage_max": 1.4452608078718185, "advantage_mean": -2.0799538091864633e-08, "advantage_min": -0.8381898179650307, "advantage_std": 0.8512439541518688, "completion_length": 2903.0209197998047, "epoch": 0.38857142857142857, "grad_norm": 0.4870928227901459, "kl": 0.34814453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.5273298394491515e-07, "loss": 0.0612, "reward": 0.5209975503385067, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5209975503385067, "reward_after_std": 0.851243931800127, "reward_before_mean": 0.6333642583340406, "reward_before_std": 0.8363424316048622, "reward_change_max": 0.0, "reward_change_mean": -0.1123666986823082, "reward_change_min": -0.20301656797528267, "reward_change_std": 0.07352043082937598, "reward_std": 0.8512439504265785, "rewards/cosine_scaled_reward": -0.09998455084860325, "rewards/format_reward": 0.833333358168602, "step": 340 }, { "advantage_max": 1.1241364851593971, "advantage_mean": -5.898376231883162e-09, "advantage_min": -0.9545042403042316, "advantage_std": 0.8547794707119465, "completion_length": 2718.6875610351562, "epoch": 0.38971428571428574, "grad_norm": 0.7362974286079407, "kl": 0.2999114990234375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.4991416936678276e-07, "loss": 0.0321, "reward": 0.787535191513598, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.787535191513598, "reward_after_std": 0.8547794744372368, "reward_before_mean": 0.9302486808373942, "reward_before_std": 0.8745679631829262, "reward_change_max": 0.0002143457531929016, "reward_change_mean": -0.1427134550176561, "reward_change_min": -0.26017661951482296, "reward_change_std": 0.10592424450442195, "reward_std": 0.854779489338398, "rewards/cosine_scaled_reward": 0.10054099000990391, "rewards/format_reward": 0.7291666753590107, "step": 341 }, { "advantage_max": 1.2140427753329277, "advantage_mean": -7.450580818968433e-09, "advantage_min": -1.0092006474733353, "advantage_std": 0.8508938550949097, "completion_length": 3101.4584350585938, "epoch": 0.39085714285714285, "grad_norm": 0.7306240797042847, "kl": 0.5538330078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.471051066897562e-07, "loss": 0.0329, "reward": 0.34557378385216, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.34557378385216, "reward_after_std": 0.8508938923478127, "reward_before_mean": 0.44565768260508776, "reward_before_std": 0.8663280420005322, "reward_change_max": 0.0, "reward_change_mean": -0.10008389223366976, "reward_change_min": -0.20153771713376045, "reward_change_std": 0.0821488774381578, "reward_std": 0.8508939146995544, "rewards/cosine_scaled_reward": -0.08967117220163345, "rewards/format_reward": 0.6250000149011612, "step": 342 }, { "advantage_max": 1.0857341140508652, "advantage_mean": 1.2417635419925688e-08, "advantage_min": -0.8121387511491776, "advantage_std": 0.7338540963828564, "completion_length": 3103.625030517578, "epoch": 0.392, "grad_norm": 0.8223617076873779, "kl": 0.365509033203125, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.4430593282358777e-07, "loss": 0.0477, "reward": 0.6110813869163394, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6110813869163394, "reward_after_std": 0.733854103833437, "reward_before_mean": 0.7386172357946634, "reward_before_std": 0.7282578460872173, "reward_change_max": 0.00015385448932647705, "reward_change_mean": -0.12753579672425985, "reward_change_min": -0.22481666505336761, "reward_change_std": 0.08690722612664104, "reward_std": 0.7338541373610497, "rewards/cosine_scaled_reward": 0.0463919285684824, "rewards/format_reward": 0.6458333488553762, "step": 343 }, { "advantage_max": 0.8111766427755356, "advantage_mean": -4.718701207551135e-08, "advantage_min": -1.0656887590885162, "advantage_std": 0.7146199978888035, "completion_length": 2326.5834045410156, "epoch": 0.3931428571428571, "grad_norm": 0.4540533125400543, "kl": 0.2355194091796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.4151678419606233e-07, "loss": 0.0034, "reward": 1.0658331364393234, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 1.0658331364393234, "reward_after_std": 0.7146199941635132, "reward_before_mean": 1.2407823614776134, "reward_before_std": 0.7263977602124214, "reward_change_max": 0.00012203305959701538, "reward_change_mean": -0.17494926508516073, "reward_change_min": -0.2789658457040787, "reward_change_std": 0.11326393391937017, "reward_std": 0.7146199978888035, "rewards/cosine_scaled_reward": 0.20372450165450573, "rewards/format_reward": 0.833333358168602, "step": 344 }, { "advantage_max": 1.0266596004366875, "advantage_mean": 5.277494247168946e-09, "advantage_min": -1.0351220294833183, "advantage_std": 0.7456030026078224, "completion_length": 2850.041748046875, "epoch": 0.3942857142857143, "grad_norm": 1.002143144607544, "kl": 0.340423583984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.387377967463493e-07, "loss": -0.0068, "reward": 0.8741661226376891, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.8741661226376891, "reward_after_std": 0.7456029951572418, "reward_before_mean": 1.02674968726933, "reward_before_std": 0.7466263473033905, "reward_change_max": 0.0, "reward_change_mean": -0.1525835506618023, "reward_change_min": -0.2507946826517582, "reward_change_std": 0.09826913708820939, "reward_std": 0.745603010058403, "rewards/cosine_scaled_reward": 0.10712484084069729, "rewards/format_reward": 0.8125000074505806, "step": 345 }, { "advantage_max": 0.791664257645607, "advantage_mean": -3.414849514271623e-09, "advantage_min": -0.6630604565143585, "advantage_std": 0.5060148164629936, "completion_length": 3213.3958740234375, "epoch": 0.3954285714285714, "grad_norm": 1.1361817121505737, "kl": 0.53509521484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.359691059183761e-07, "loss": 0.034, "reward": 0.22787731047719717, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.22787731047719717, "reward_after_std": 0.506014809012413, "reward_before_mean": 0.3254447274375707, "reward_before_std": 0.49645309150218964, "reward_change_max": 7.732957601547241e-05, "reward_change_mean": -0.09756739297881722, "reward_change_min": -0.15314494539052248, "reward_change_std": 0.0625298055820167, "reward_std": 0.5060148164629936, "rewards/cosine_scaled_reward": -0.11852765083312988, "rewards/format_reward": 0.562500013038516, "step": 346 }, { "advantage_max": 0.7966568246483803, "advantage_mean": -1.986821579480491e-08, "advantage_min": -0.7594610974192619, "advantage_std": 0.5565367415547371, "completion_length": 3140.479248046875, "epoch": 0.3965714285714286, "grad_norm": 0.8332196474075317, "kl": 0.4498291015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.3321084665422803e-07, "loss": 0.0203, "reward": 0.19970552437007427, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.19970552437007427, "reward_after_std": 0.5565367415547371, "reward_before_mean": 0.29436481976881623, "reward_before_std": 0.5582841373980045, "reward_change_max": 0.0, "reward_change_mean": -0.09465931123122573, "reward_change_min": -0.16640873905271292, "reward_change_std": 0.06378639955073595, "reward_std": 0.5565367564558983, "rewards/cosine_scaled_reward": -0.22781759407371283, "rewards/format_reward": 0.7500000260770321, "step": 347 }, { "advantage_max": 1.0456115677952766, "advantage_mean": -2.359350537162186e-08, "advantage_min": -0.7629845626652241, "advantage_std": 0.6946265436708927, "completion_length": 2854.5000762939453, "epoch": 0.3977142857142857, "grad_norm": 0.5373441576957703, "kl": 0.4082489013671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.3046315338757026e-07, "loss": 0.0432, "reward": 0.27077578753232956, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.27077578753232956, "reward_after_std": 0.6946265511214733, "reward_before_mean": 0.36715321242809296, "reward_before_std": 0.6996577307581902, "reward_change_max": 6.26593828201294e-05, "reward_change_mean": -0.09637744631618261, "reward_change_min": -0.18722706474363804, "reward_change_std": 0.07272940431721509, "reward_std": 0.6946265697479248, "rewards/cosine_scaled_reward": -0.13934006914496422, "rewards/format_reward": 0.6458333358168602, "step": 348 }, { "advantage_max": 1.3121556118130684, "advantage_mean": -1.1486312123665243e-08, "advantage_min": -0.8878070712089539, "advantage_std": 0.7979016825556755, "completion_length": 3066.3125915527344, "epoch": 0.39885714285714285, "grad_norm": 0.48574212193489075, "kl": 0.22210693359375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.2772616003709616e-07, "loss": 0.0252, "reward": 0.4082105464185588, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4082105464185588, "reward_after_std": 0.7979016974568367, "reward_before_mean": 0.5126420352607965, "reward_before_std": 0.7913182973861694, "reward_change_max": 0.0002217814326286316, "reward_change_mean": -0.10443148808553815, "reward_change_min": -0.18364053964614868, "reward_change_std": 0.07094059698283672, "reward_std": 0.7979017086327076, "rewards/cosine_scaled_reward": -0.1290956644807011, "rewards/format_reward": 0.7708333469927311, "step": 349 }, { "advantage_max": 1.0644586831331253, "advantage_mean": -3.476937715518602e-08, "advantage_min": -0.9515664502978325, "advantage_std": 0.7851700074970722, "completion_length": 2800.104248046875, "epoch": 0.4, "grad_norm": 0.5524913668632507, "kl": 0.424530029296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.250000000000001e-07, "loss": 0.0579, "reward": 0.7616480272263288, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7616480272263288, "reward_after_std": 0.7851700149476528, "reward_before_mean": 0.9033026099205017, "reward_before_std": 0.7952011059969664, "reward_change_max": 0.0, "reward_change_mean": -0.14165459107607603, "reward_change_min": -0.25414634868502617, "reward_change_std": 0.09784402325749397, "reward_std": 0.7851700223982334, "rewards/cosine_scaled_reward": 0.08706796666956507, "rewards/format_reward": 0.7291666716337204, "step": 350 }, { "advantage_max": 1.0468278601765633, "advantage_mean": -5.8518101919702303e-08, "advantage_min": -0.871786467730999, "advantage_std": 0.7094918079674244, "completion_length": 2834.916732788086, "epoch": 0.40114285714285713, "grad_norm": 0.26139718294143677, "kl": 0.2466888427734375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.222848061454764e-07, "loss": 0.0272, "reward": 0.857110857963562, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.857110857963562, "reward_after_std": 0.7094917967915535, "reward_before_mean": 1.0084098004736006, "reward_before_std": 0.6962695913389325, "reward_change_max": 0.0, "reward_change_mean": -0.15129899140447378, "reward_change_min": -0.24284421186894178, "reward_change_std": 0.0927650211378932, "reward_std": 0.7094918265938759, "rewards/cosine_scaled_reward": 0.11878822930157185, "rewards/format_reward": 0.7708333488553762, "step": 351 }, { "advantage_max": 1.302818451076746, "advantage_mean": -2.0489097474207796e-08, "advantage_min": -1.003556728363037, "advantage_std": 0.826737018302083, "completion_length": 2807.3751220703125, "epoch": 0.4022857142857143, "grad_norm": 1.261825442314148, "kl": 0.3189697265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.195807108082429e-07, "loss": 0.0777, "reward": 0.48880088748410344, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.48880088748410344, "reward_after_std": 0.8267370592802763, "reward_before_mean": 0.6008866727352142, "reward_before_std": 0.8274815808981657, "reward_change_max": 0.0, "reward_change_mean": -0.11208577593788505, "reward_change_min": -0.19520364236086607, "reward_change_std": 0.07618892658501863, "reward_std": 0.8267370741814375, "rewards/cosine_scaled_reward": 0.008776647970080376, "rewards/format_reward": 0.5833333469927311, "step": 352 }, { "advantage_max": 1.1022524684667587, "advantage_mean": -3.725290242950763e-09, "advantage_min": -0.8354335837066174, "advantage_std": 0.7104697525501251, "completion_length": 2324.833450317383, "epoch": 0.4034285714285714, "grad_norm": 0.7030723094940186, "kl": 0.220916748046875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.168878457820915e-07, "loss": 0.0001, "reward": 0.7568352874368429, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7568352874368429, "reward_after_std": 0.7104697562754154, "reward_before_mean": 0.8968926519155502, "reward_before_std": 0.6982098333537579, "reward_change_max": 0.0, "reward_change_mean": -0.1400573654100299, "reward_change_min": -0.24266061559319496, "reward_change_std": 0.08802814176306129, "reward_std": 0.7104697898030281, "rewards/cosine_scaled_reward": 0.06302965292707086, "rewards/format_reward": 0.770833358168602, "step": 353 }, { "advantage_max": 0.791136309504509, "advantage_mean": -1.4901161526914564e-08, "advantage_min": -1.0148145444691181, "advantage_std": 0.6643821857869625, "completion_length": 2448.854217529297, "epoch": 0.4045714285714286, "grad_norm": 0.360441654920578, "kl": 0.2796478271484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.142063423134644e-07, "loss": 0.0113, "reward": 0.7733204569667578, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7733204569667578, "reward_after_std": 0.6643821746110916, "reward_before_mean": 0.9214807010721415, "reward_before_std": 0.675186101347208, "reward_change_max": 0.0, "reward_change_mean": -0.14816024899482727, "reward_change_min": -0.23208301700651646, "reward_change_std": 0.0934093315154314, "reward_std": 0.6643821746110916, "rewards/cosine_scaled_reward": 0.12740701530128717, "rewards/format_reward": 0.6666666846722364, "step": 354 }, { "advantage_max": 1.4501032158732414, "advantage_mean": -1.98682153507157e-08, "advantage_min": -1.158358946442604, "advantage_std": 0.9547172710299492, "completion_length": 2681.2084045410156, "epoch": 0.4057142857142857, "grad_norm": 0.4072975218296051, "kl": 0.275482177734375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.115363310950578e-07, "loss": 0.0327, "reward": 0.6503871716558933, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6503871716558933, "reward_after_std": 0.9547172412276268, "reward_before_mean": 0.7749169375747442, "reward_before_std": 0.9606028571724892, "reward_change_max": 9.2335045337677e-05, "reward_change_mean": -0.1245297659188509, "reward_change_min": -0.22305811289697886, "reward_change_std": 0.09232740569859743, "reward_std": 0.9547172635793686, "rewards/cosine_scaled_reward": 0.012458451557904482, "rewards/format_reward": 0.7500000074505806, "step": 355 }, { "advantage_max": 0.966142512857914, "advantage_mean": -8.071462387349015e-09, "advantage_min": -1.140071079134941, "advantage_std": 0.7658510934561491, "completion_length": 2957.479202270508, "epoch": 0.40685714285714286, "grad_norm": 0.5873293280601501, "kl": 0.2852935791015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.0887794225945143e-07, "loss": 0.0161, "reward": 0.6667291913181543, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6667291913181543, "reward_after_std": 0.7658511009067297, "reward_before_mean": 0.8018401209264994, "reward_before_std": 0.7819232614710927, "reward_change_max": 0.00015025585889816284, "reward_change_mean": -0.13511092774569988, "reward_change_min": -0.23053260147571564, "reward_change_std": 0.09672127198427916, "reward_std": 0.7658511158078909, "rewards/cosine_scaled_reward": 0.0155033846385777, "rewards/format_reward": 0.7708333469927311, "step": 356 }, { "advantage_max": 1.0412839315831661, "advantage_mean": -2.235174231812742e-08, "advantage_min": -1.0026082322001457, "advantage_std": 0.763444721698761, "completion_length": 3193.3541870117188, "epoch": 0.408, "grad_norm": 0.664465606212616, "kl": 0.3111572265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.062313053727671e-07, "loss": 0.0059, "reward": 0.4894076222553849, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4894076222553849, "reward_after_std": 0.7634447365999222, "reward_before_mean": 0.6064597554504871, "reward_before_std": 0.775292593985796, "reward_change_max": 0.0004100501537322998, "reward_change_mean": -0.11705213971436024, "reward_change_min": -0.22646293975412846, "reward_change_std": 0.0875883437693119, "reward_std": 0.7634447701275349, "rewards/cosine_scaled_reward": -0.08218680415302515, "rewards/format_reward": 0.770833358168602, "step": 357 }, { "advantage_max": 1.1054131537675858, "advantage_mean": 1.1486311679576033e-08, "advantage_min": -0.9517468959093094, "advantage_std": 0.7820233516395092, "completion_length": 2913.7084045410156, "epoch": 0.40914285714285714, "grad_norm": 0.9437838196754456, "kl": 0.4264373779296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.0359654942835247e-07, "loss": 0.0231, "reward": 0.6220018891617656, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6220018891617656, "reward_after_std": 0.7820233516395092, "reward_before_mean": 0.7498711105436087, "reward_before_std": 0.787973016500473, "reward_change_max": 0.0, "reward_change_mean": -0.12786916503682733, "reward_change_min": -0.22747263871133327, "reward_change_std": 0.09117141552269459, "reward_std": 0.7820233665406704, "rewards/cosine_scaled_reward": -6.447359919548035e-05, "rewards/format_reward": 0.7500000223517418, "step": 358 }, { "advantage_max": 0.9187195301055908, "advantage_mean": -9.934106759423855e-09, "advantage_min": -0.8075991421937943, "advantage_std": 0.6383391171693802, "completion_length": 2860.8334350585938, "epoch": 0.4102857142857143, "grad_norm": 0.2313191443681717, "kl": 0.248382568359375, "lambda_div_used": 0.9000000000000001, "learning_rate": 3.0097380284049523e-07, "loss": 0.0254, "reward": 0.6255717375315726, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6255717375315726, "reward_after_std": 0.6383391208946705, "reward_before_mean": 0.7577385175973177, "reward_before_std": 0.6361598484218121, "reward_change_max": 0.0001360177993774414, "reward_change_mean": -0.1321667837910354, "reward_change_min": -0.2153165964409709, "reward_change_std": 0.0826862514950335, "reward_std": 0.6383391432464123, "rewards/cosine_scaled_reward": -0.027380744460970163, "rewards/format_reward": 0.8125000149011612, "step": 359 }, { "advantage_max": 1.344765804708004, "advantage_mean": -4.47034849138106e-08, "advantage_min": -1.3960507363080978, "advantage_std": 1.0359720475971699, "completion_length": 3082.541778564453, "epoch": 0.4114285714285714, "grad_norm": 0.9665197134017944, "kl": 0.27655029296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.9836319343816397e-07, "loss": 0.0287, "reward": 0.9328369447030127, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.9328369447030127, "reward_after_std": 1.0359720401465893, "reward_before_mean": 1.0855822588782758, "reward_before_std": 1.061440672725439, "reward_change_max": 0.0002325400710105896, "reward_change_mean": -0.15274538099765778, "reward_change_min": -0.2875901088118553, "reward_change_std": 0.1183041324838996, "reward_std": 1.0359720438718796, "rewards/cosine_scaled_reward": 0.1573744739871472, "rewards/format_reward": 0.770833358168602, "step": 360 }, { "advantage_max": 1.1589705422520638, "advantage_mean": 1.2107193525512372e-08, "advantage_min": -1.2056332975625992, "advantage_std": 0.8765405416488647, "completion_length": 3018.666717529297, "epoch": 0.4125714285714286, "grad_norm": 1.0897059440612793, "kl": 0.3499755859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.9576484845877793e-07, "loss": 0.0639, "reward": 0.4580980301834643, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4580980301834643, "reward_after_std": 0.8765405118465424, "reward_before_mean": 0.5701448558829725, "reward_before_std": 0.9030290320515633, "reward_change_max": 0.0002184361219406128, "reward_change_mean": -0.1120468145236373, "reward_change_min": -0.22999682277441025, "reward_change_std": 0.09206069586798549, "reward_std": 0.876540519297123, "rewards/cosine_scaled_reward": -0.03784424933837727, "rewards/format_reward": 0.6458333469927311, "step": 361 }, { "advantage_max": 0.8124096095561981, "advantage_mean": -3.8494666620980666e-08, "advantage_min": -0.8067378401756287, "advantage_std": 0.6147194467484951, "completion_length": 1966.5416717529297, "epoch": 0.4137142857142857, "grad_norm": 0.4702957272529602, "kl": 0.219390869140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.931788945420058e-07, "loss": 0.0247, "reward": 0.5928452904336154, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5928452904336154, "reward_after_std": 0.6147194467484951, "reward_before_mean": 0.7235520584508777, "reward_before_std": 0.6164083369076252, "reward_change_max": 0.0007086694240570068, "reward_change_mean": -0.1307067759335041, "reward_change_min": -0.22446970269083977, "reward_change_std": 0.08686780696734786, "reward_std": 0.6147194840013981, "rewards/cosine_scaled_reward": -0.034057313576340675, "rewards/format_reward": 0.7916666902601719, "step": 362 }, { "advantage_max": 1.1513478010892868, "advantage_mean": -1.2417634920325327e-08, "advantage_min": -1.4183903858065605, "advantage_std": 0.934955932199955, "completion_length": 2242.1042251586914, "epoch": 0.41485714285714287, "grad_norm": 0.4347902536392212, "kl": 0.3087310791015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.9060545772359305e-07, "loss": 0.0381, "reward": 0.7093102987855673, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7093102987855673, "reward_after_std": 0.934955932199955, "reward_before_mean": 0.8446380607783794, "reward_before_std": 0.9661568701267242, "reward_change_max": 0.00010865926742553711, "reward_change_mean": -0.13532774476334453, "reward_change_min": -0.2547372132539749, "reward_change_std": 0.10575387300923467, "reward_std": 0.9349559731781483, "rewards/cosine_scaled_reward": 0.0681523447856307, "rewards/format_reward": 0.708333358168602, "step": 363 }, { "advantage_max": 0.9506546705961227, "advantage_mean": -2.173086155465853e-09, "advantage_min": -0.6697430238127708, "advantage_std": 0.5997806861996651, "completion_length": 2946.916748046875, "epoch": 0.416, "grad_norm": 0.42780113220214844, "kl": 0.3773040771484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.8804466342921987e-07, "loss": 0.0328, "reward": 0.20062502287328243, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.20062502287328243, "reward_after_std": 0.5997806712985039, "reward_before_mean": 0.2927093543112278, "reward_before_std": 0.5955142378807068, "reward_change_max": 0.0, "reward_change_mean": -0.09208432491868734, "reward_change_min": -0.16610389854758978, "reward_change_std": 0.06054901331663132, "reward_std": 0.5997806787490845, "rewards/cosine_scaled_reward": -0.1453120014630258, "rewards/format_reward": 0.5833333414047956, "step": 364 }, { "advantage_max": 0.8594930022954941, "advantage_mean": -3.523503833147146e-08, "advantage_min": -0.6349957399070263, "advantage_std": 0.5609443206340075, "completion_length": 3024.7500915527344, "epoch": 0.41714285714285715, "grad_norm": 0.6753454804420471, "kl": 0.3546142578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.854966364683872e-07, "loss": 0.0101, "reward": 0.6145913098007441, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6145913098007441, "reward_after_std": 0.5609443113207817, "reward_before_mean": 0.7464557122439146, "reward_before_std": 0.5405437704175711, "reward_change_max": 0.0001978650689125061, "reward_change_mean": -0.13186442031292245, "reward_change_min": -0.20231307670474052, "reward_change_std": 0.07916592317633331, "reward_std": 0.5609443187713623, "rewards/cosine_scaled_reward": 0.019061174243688583, "rewards/format_reward": 0.708333345130086, "step": 365 }, { "advantage_max": 1.1641277149319649, "advantage_mean": -6.208817460162663e-09, "advantage_min": -1.2483574375510216, "advantage_std": 0.9762383066117764, "completion_length": 2363.6250534057617, "epoch": 0.41828571428571426, "grad_norm": 1.7422231435775757, "kl": 0.303375244140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.829615010283344e-07, "loss": 0.066, "reward": 0.6917076036334038, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6917076036334038, "reward_after_std": 0.9762383252382278, "reward_before_mean": 0.8244628701359034, "reward_before_std": 1.0097406208515167, "reward_change_max": 0.0004345700144767761, "reward_change_mean": -0.13275525951758027, "reward_change_min": -0.2532838536426425, "reward_change_std": 0.11263198498636484, "reward_std": 0.9762383289635181, "rewards/cosine_scaled_reward": 0.1309814564883709, "rewards/format_reward": 0.562500013038516, "step": 366 }, { "advantage_max": 1.122251644730568, "advantage_mean": -2.599942261483079e-08, "advantage_min": -0.8712231889367104, "advantage_std": 0.7956646084785461, "completion_length": 3046.1250610351562, "epoch": 0.41942857142857143, "grad_norm": 0.8266307711601257, "kl": 0.40216064453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.8043938066798645e-07, "loss": 0.0425, "reward": 0.48944999772356823, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.48944999772356823, "reward_after_std": 0.7956646457314491, "reward_before_mean": 0.6048614283499774, "reward_before_std": 0.804796252399683, "reward_change_max": 0.00014813244342803955, "reward_change_mean": -0.1154114343225956, "reward_change_min": -0.2323084594681859, "reward_change_std": 0.09151481185108423, "reward_std": 0.7956646531820297, "rewards/cosine_scaled_reward": -0.010069283656775951, "rewards/format_reward": 0.6250000074505806, "step": 367 }, { "advantage_max": 0.7003500536084175, "advantage_mean": 2.173085961176824e-09, "advantage_min": -0.5562456995248795, "advantage_std": 0.4822025038301945, "completion_length": 3284.3334045410156, "epoch": 0.4205714285714286, "grad_norm": 0.9353283047676086, "kl": 0.426483154296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.7793039831193133e-07, "loss": 0.0277, "reward": 0.2110738381743431, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2110738381743431, "reward_after_std": 0.48220251128077507, "reward_before_mean": 0.308893536683172, "reward_before_std": 0.4765991158783436, "reward_change_max": 0.00024100393056869507, "reward_change_mean": -0.09781968453899026, "reward_change_min": -0.16675298567861319, "reward_change_std": 0.06492633419111371, "reward_std": 0.48220251500606537, "rewards/cosine_scaled_reward": -0.14763657189905643, "rewards/format_reward": 0.6041666679084301, "step": 368 }, { "advantage_max": 1.342196799814701, "advantage_mean": -2.23517424569053e-08, "advantage_min": -1.038889728486538, "advantage_std": 0.8970093280076981, "completion_length": 2942.5208892822266, "epoch": 0.4217142857142857, "grad_norm": 0.43765193223953247, "kl": 0.38726806640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.7543467624442956e-07, "loss": 0.0149, "reward": 0.7111932290717959, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7111932290717959, "reward_after_std": 0.8970093317329884, "reward_before_mean": 0.8434848883189261, "reward_before_std": 0.8970467895269394, "reward_change_max": 0.00014856457710266113, "reward_change_mean": -0.13229163456708193, "reward_change_min": -0.2422568015754223, "reward_change_std": 0.09590382222086191, "reward_std": 0.8970093578100204, "rewards/cosine_scaled_reward": 0.07799242623150349, "rewards/format_reward": 0.6875000149011612, "step": 369 }, { "advantage_max": 0.7304398790001869, "advantage_mean": 9.313225579621331e-09, "advantage_min": -0.5804145596921444, "advantage_std": 0.4967747814953327, "completion_length": 3077.9584045410156, "epoch": 0.4228571428571429, "grad_norm": 1.005544662475586, "kl": 0.44821929931640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.729523361034538e-07, "loss": 0.0229, "reward": 0.4255966132041067, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4255966132041067, "reward_after_std": 0.4967747703194618, "reward_before_mean": 0.5427881754003465, "reward_before_std": 0.47967731952667236, "reward_change_max": 0.0, "reward_change_mean": -0.11719155265018344, "reward_change_min": -0.18624725379049778, "reward_change_std": 0.07463060226291418, "reward_std": 0.496774785220623, "rewards/cosine_scaled_reward": -0.06193925626575947, "rewards/format_reward": 0.6666666753590107, "step": 370 }, { "advantage_max": 0.8285253420472145, "advantage_mean": -4.346172255420555e-08, "advantage_min": -0.7580652683973312, "advantage_std": 0.5659327665343881, "completion_length": 2190.5417556762695, "epoch": 0.424, "grad_norm": 0.43017879128456116, "kl": 0.3216705322265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.7048349887476037e-07, "loss": 0.0227, "reward": 0.9253095942549407, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.9253095942549407, "reward_after_std": 0.5659327721223235, "reward_before_mean": 1.0867008964996785, "reward_before_std": 0.5481990473344922, "reward_change_max": 7.56382942199707e-05, "reward_change_mean": -0.16139132343232632, "reward_change_min": -0.2438336256891489, "reward_change_std": 0.09487550053745508, "reward_std": 0.5659327721223235, "rewards/cosine_scaled_reward": 0.19960043695755303, "rewards/format_reward": 0.6875000149011612, "step": 371 }, { "advantage_max": 1.2706674709916115, "advantage_mean": -2.1730867105773655e-09, "advantage_min": -0.9622760713100433, "advantage_std": 0.8690132163465023, "completion_length": 3044.3750610351562, "epoch": 0.42514285714285716, "grad_norm": 0.9394458532333374, "kl": 0.3265533447265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.6802828488599294e-07, "loss": 0.0538, "reward": 0.5415406846441329, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5415406846441329, "reward_after_std": 0.8690132237970829, "reward_before_mean": 0.6591636501252651, "reward_before_std": 0.8758064955472946, "reward_change_max": 0.000122852623462677, "reward_change_mean": -0.11762292124330997, "reward_change_min": -0.2325716745108366, "reward_change_std": 0.09012753423303366, "reward_std": 0.8690132312476635, "rewards/cosine_scaled_reward": 0.027498478069901466, "rewards/format_reward": 0.6041666753590107, "step": 372 }, { "advantage_max": 1.1446843966841698, "advantage_mean": -1.3659397946064189e-08, "advantage_min": -1.0722761787474155, "advantage_std": 0.8411088809370995, "completion_length": 2142.3125228881836, "epoch": 0.42628571428571427, "grad_norm": 0.688064694404602, "kl": 0.5388641357421875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.655868138008171e-07, "loss": 0.0153, "reward": 0.7092705629765987, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7092705629765987, "reward_after_std": 0.8411088772118092, "reward_before_mean": 0.8444973253645003, "reward_before_std": 0.8505554907023907, "reward_change_max": 0.0, "reward_change_mean": -0.13522674329578876, "reward_change_min": -0.2483549453318119, "reward_change_std": 0.09696428989991546, "reward_std": 0.84110888838768, "rewards/cosine_scaled_reward": 0.07849864475429058, "rewards/format_reward": 0.6875000074505806, "step": 373 }, { "advantage_max": 1.3396911844611168, "advantage_mean": 6.829699583654758e-09, "advantage_min": -0.8294327259063721, "advantage_std": 0.8366260938346386, "completion_length": 2615.3125762939453, "epoch": 0.42742857142857144, "grad_norm": 0.3500214219093323, "kl": 0.336761474609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.631592046130896e-07, "loss": 0.0417, "reward": 0.4654297400265932, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4654297400265932, "reward_after_std": 0.8366261087357998, "reward_before_mean": 0.5744652273133397, "reward_before_std": 0.8310881219804287, "reward_change_max": 0.0004261508584022522, "reward_change_mean": -0.10903546074405313, "reward_change_min": -0.20918164774775505, "reward_change_std": 0.07874666526913643, "reward_std": 0.836626123636961, "rewards/cosine_scaled_reward": -0.014850735664367676, "rewards/format_reward": 0.604166679084301, "step": 374 }, { "advantage_max": 1.05471608415246, "advantage_mean": -4.346171977864799e-09, "advantage_min": -0.9635727629065514, "advantage_std": 0.7893678471446037, "completion_length": 2934.291702270508, "epoch": 0.42857142857142855, "grad_norm": 0.8874136209487915, "kl": 0.328857421875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.6074557564105724e-07, "loss": 0.0395, "reward": 0.6218974577786867, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6218974577786867, "reward_after_std": 0.7893678620457649, "reward_before_mean": 0.7510060481727123, "reward_before_std": 0.7985278442502022, "reward_change_max": 0.00015664845705032349, "reward_change_mean": -0.12910857424139977, "reward_change_min": -0.22664515953511, "reward_change_std": 0.09451806033030152, "reward_std": 0.7893679030239582, "rewards/cosine_scaled_reward": 0.06300301384180784, "rewards/format_reward": 0.6250000074505806, "step": 375 }, { "advantage_max": 0.8759348541498184, "advantage_mean": -2.048909714114089e-08, "advantage_min": -0.7458357661962509, "advantage_std": 0.5922916419804096, "completion_length": 2636.729202270508, "epoch": 0.4297142857142857, "grad_norm": 0.45231547951698303, "kl": 0.308197021484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.583460445215911e-07, "loss": 0.0328, "reward": 0.4231463046744466, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4231463046744466, "reward_after_std": 0.5922916382551193, "reward_before_mean": 0.5374979162588716, "reward_before_std": 0.5894507952034473, "reward_change_max": 8.402764797210693e-05, "reward_change_mean": -0.1143516362644732, "reward_change_min": -0.18643983826041222, "reward_change_std": 0.07226712163537741, "reward_std": 0.5922916643321514, "rewards/cosine_scaled_reward": -0.09583438094705343, "rewards/format_reward": 0.7291666734963655, "step": 376 }, { "advantage_max": 1.383926510810852, "advantage_mean": 2.4835269063494536e-08, "advantage_min": -0.9945981428027153, "advantage_std": 0.9026396945118904, "completion_length": 3282.8125915527344, "epoch": 0.4308571428571429, "grad_norm": 1.0070041418075562, "kl": 0.3175048828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.5596072820445254e-07, "loss": 0.0442, "reward": 0.44021230190992355, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.44021230190992355, "reward_after_std": 0.902639701962471, "reward_before_mean": 0.5466061439365149, "reward_before_std": 0.9098929949104786, "reward_change_max": 0.00017774105072021484, "reward_change_mean": -0.10639382712543011, "reward_change_min": -0.22205941379070282, "reward_change_std": 0.08372075716033578, "reward_std": 0.9026397354900837, "rewards/cosine_scaled_reward": -0.06003026259713806, "rewards/format_reward": 0.6666666865348816, "step": 377 }, { "advantage_max": 0.9764475971460342, "advantage_mean": -5.246450671125835e-08, "advantage_min": -0.873407207429409, "advantage_std": 0.67820955067873, "completion_length": 2588.2500610351562, "epoch": 0.432, "grad_norm": 0.4015950858592987, "kl": 0.2265777587890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.5358974294659373e-07, "loss": 0.0183, "reward": 0.7747280902694911, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7747280902694911, "reward_after_std": 0.6782095581293106, "reward_before_mean": 0.9191812109202147, "reward_before_std": 0.671349611133337, "reward_change_max": 0.0, "reward_change_mean": -0.14445316838100553, "reward_change_min": -0.24850592855364084, "reward_change_std": 0.09129061782732606, "reward_std": 0.6782095953822136, "rewards/cosine_scaled_reward": 0.09500727988779545, "rewards/format_reward": 0.729166679084301, "step": 378 }, { "advantage_max": 1.2924234345555305, "advantage_mean": -1.6142925329809543e-08, "advantage_min": -0.9254086911678314, "advantage_std": 0.8383762203156948, "completion_length": 3108.2709350585938, "epoch": 0.43314285714285716, "grad_norm": 0.3910759687423706, "kl": 0.40936279296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.512332043064913e-07, "loss": 0.0498, "reward": 0.43546567182056606, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.43546567182056606, "reward_after_std": 0.838376197963953, "reward_before_mean": 0.542799973860383, "reward_before_std": 0.8410528190433979, "reward_change_max": 0.00024215131998062134, "reward_change_mean": -0.10733431112021208, "reward_change_min": -0.19753815606236458, "reward_change_std": 0.0784850474447012, "reward_std": 0.8383762016892433, "rewards/cosine_scaled_reward": -0.03068335447460413, "rewards/format_reward": 0.6041666865348816, "step": 379 }, { "advantage_max": 0.9396484643220901, "advantage_mean": -3.91155505208296e-08, "advantage_min": -0.6655237004160881, "advantage_std": 0.5877484232187271, "completion_length": 2655.104217529297, "epoch": 0.4342857142857143, "grad_norm": 1.0875959396362305, "kl": 0.264251708984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.488912271385139e-07, "loss": -0.0023, "reward": 0.5554260544013232, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5554260544013232, "reward_after_std": 0.5877484120428562, "reward_before_mean": 0.6805648133158684, "reward_before_std": 0.5702716708183289, "reward_change_max": 0.0003554224967956543, "reward_change_mean": -0.12513879756443202, "reward_change_min": -0.19960128888487816, "reward_change_std": 0.07485141255892813, "reward_std": 0.5877484232187271, "rewards/cosine_scaled_reward": -0.04513426497578621, "rewards/format_reward": 0.770833333954215, "step": 380 }, { "advantage_max": 1.091643925756216, "advantage_mean": -1.4280280735690098e-08, "advantage_min": -0.9078643172979355, "advantage_std": 0.7367625050246716, "completion_length": 3038.1459045410156, "epoch": 0.43542857142857144, "grad_norm": 0.650126576423645, "kl": 0.44171142578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.465639255873246e-07, "loss": 0.0265, "reward": 0.2852631863206625, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2852631863206625, "reward_after_std": 0.7367624826729298, "reward_before_mean": 0.3824858106672764, "reward_before_std": 0.7439739629626274, "reward_change_max": 8.361786603927612e-05, "reward_change_mean": -0.09722263645380735, "reward_change_min": -0.18690539337694645, "reward_change_std": 0.0746005296241492, "reward_std": 0.7367625087499619, "rewards/cosine_scaled_reward": -0.13167377142235637, "rewards/format_reward": 0.6458333469927311, "step": 381 }, { "advantage_max": 1.0695801936089993, "advantage_mean": -5.898376675972372e-09, "advantage_min": -0.8300218358635902, "advantage_std": 0.7193886451423168, "completion_length": 2674.8959197998047, "epoch": 0.43657142857142855, "grad_norm": 0.5527292490005493, "kl": 0.3749847412109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.4425141308231765e-07, "loss": 0.0426, "reward": 0.19800740387290716, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.19800740387290716, "reward_after_std": 0.7193886376917362, "reward_before_mean": 0.287336059845984, "reward_before_std": 0.7278939560055733, "reward_change_max": 0.0003422573208808899, "reward_change_mean": -0.08932866249233484, "reward_change_min": -0.18138791900128126, "reward_change_std": 0.07123636966571212, "reward_std": 0.7193886563181877, "rewards/cosine_scaled_reward": -0.20008197613060474, "rewards/format_reward": 0.6875000149011612, "step": 382 }, { "advantage_max": 1.1456727720797062, "advantage_mean": 1.8626449826975033e-09, "advantage_min": -1.1615408807992935, "advantage_std": 0.8968832269310951, "completion_length": 3055.729232788086, "epoch": 0.4377142857142857, "grad_norm": 0.47822508215904236, "kl": 0.319732666015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.4195380233209006e-07, "loss": 0.0526, "reward": 0.6082937435712665, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6082937435712665, "reward_after_std": 0.8968832530081272, "reward_before_mean": 0.7342727006180212, "reward_before_std": 0.9225108250975609, "reward_change_max": 0.00024039298295974731, "reward_change_mean": -0.12597893876954913, "reward_change_min": -0.2359147211536765, "reward_change_std": 0.09706644853577018, "reward_std": 0.896883275359869, "rewards/cosine_scaled_reward": 0.023386333137750626, "rewards/format_reward": 0.6875000186264515, "step": 383 }, { "advantage_max": 1.2379422560334206, "advantage_mean": -1.1175872061119918e-08, "advantage_min": -1.424992460757494, "advantage_std": 0.9928190894424915, "completion_length": 2224.7708740234375, "epoch": 0.43885714285714283, "grad_norm": 0.6266052722930908, "kl": 0.1634674072265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.3967120531894857e-07, "loss": -0.0301, "reward": 1.2281536404043436, "reward_advantage_correlation": 1.0, "reward_after_mean": 1.2281536404043436, "reward_after_std": 0.9928190894424915, "reward_before_mean": 1.4098486751317978, "reward_before_std": 1.014763057231903, "reward_change_max": 0.0, "reward_change_mean": -0.18169502541422844, "reward_change_min": -0.31081850454211235, "reward_change_std": 0.1270025339908898, "reward_std": 0.9928191304206848, "rewards/cosine_scaled_reward": 0.28825767897069454, "rewards/format_reward": 0.8333333469927311, "step": 384 }, { "advantage_max": 1.0953758209943771, "advantage_mean": -3.60111410691033e-08, "advantage_min": -0.8757317326962948, "advantage_std": 0.7891853414475918, "completion_length": 2996.7084045410156, "epoch": 0.44, "grad_norm": 0.6368526220321655, "kl": 0.202911376953125, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.374037332934512e-07, "loss": 0.0445, "reward": 0.5555390305817127, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5555390305817127, "reward_after_std": 0.7891853488981724, "reward_before_mean": 0.6777182146906853, "reward_before_std": 0.7959320954978466, "reward_change_max": 0.00031816959381103516, "reward_change_mean": -0.12217918690294027, "reward_change_min": -0.23252204339951277, "reward_change_std": 0.09129771264269948, "reward_std": 0.7891853675246239, "rewards/cosine_scaled_reward": 0.005525756627321243, "rewards/format_reward": 0.6666666734963655, "step": 385 }, { "advantage_max": 1.217541165649891, "advantage_mean": -3.97364305904091e-08, "advantage_min": -1.2796841636300087, "advantage_std": 0.9395193532109261, "completion_length": 3087.791702270508, "epoch": 0.44114285714285717, "grad_norm": 0.6705446839332581, "kl": 0.3101806640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.3515149676898552e-07, "loss": 0.0018, "reward": 0.665900741238147, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.665900741238147, "reward_after_std": 0.9395193681120872, "reward_before_mean": 0.7959863543510437, "reward_before_std": 0.9656849205493927, "reward_change_max": 0.0, "reward_change_mean": -0.13008562522009015, "reward_change_min": -0.24509291164577007, "reward_change_std": 0.10135661391541362, "reward_std": 0.939519390463829, "rewards/cosine_scaled_reward": 0.033409830182790756, "rewards/format_reward": 0.7291666865348816, "step": 386 }, { "advantage_max": 1.2698442712426186, "advantage_mean": -2.9181440930337033e-08, "advantage_min": -1.0950742289423943, "advantage_std": 0.8762321844696999, "completion_length": 3122.6250762939453, "epoch": 0.4422857142857143, "grad_norm": 0.5375394821166992, "kl": 0.3720245361328125, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.3291460551638237e-07, "loss": 0.0262, "reward": 0.5023184239398688, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5023184239398688, "reward_after_std": 0.8762321695685387, "reward_before_mean": 0.6163734996225685, "reward_before_std": 0.8881137296557426, "reward_change_max": 0.0003941729664802551, "reward_change_mean": -0.11405510362237692, "reward_change_min": -0.21109131071716547, "reward_change_std": 0.08576344698667526, "reward_std": 0.8762321919202805, "rewards/cosine_scaled_reward": -0.014729912392795086, "rewards/format_reward": 0.6458333414047956, "step": 387 }, { "advantage_max": 1.0198524445295334, "advantage_mean": -1.862645199190993e-08, "advantage_min": -0.8256707489490509, "advantage_std": 0.7120313681662083, "completion_length": 2877.916778564453, "epoch": 0.44342857142857145, "grad_norm": 0.4498549699783325, "kl": 0.2955780029296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.306931685585657e-07, "loss": 0.0315, "reward": 0.6390794757753611, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6390794757753611, "reward_after_std": 0.7120313458144665, "reward_before_mean": 0.770656397100538, "reward_before_std": 0.712759368121624, "reward_change_max": 0.0, "reward_change_mean": -0.13157692411914468, "reward_change_min": -0.23201371356844902, "reward_change_std": 0.0902018048800528, "reward_std": 0.7120313681662083, "rewards/cosine_scaled_reward": 0.031161522027105093, "rewards/format_reward": 0.7083333395421505, "step": 388 }, { "advantage_max": 1.2392387315630913, "advantage_mean": -3.352761368535795e-08, "advantage_min": -1.2367150112986565, "advantage_std": 0.9424854889512062, "completion_length": 2731.2084045410156, "epoch": 0.44457142857142856, "grad_norm": 0.47933250665664673, "kl": 0.285736083984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.2848729416523859e-07, "loss": 0.0165, "reward": 0.7998714097775519, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7998714097775519, "reward_after_std": 0.9424855038523674, "reward_before_mean": 0.9414779755752534, "reward_before_std": 0.9657678753137589, "reward_change_max": 0.0003373771905899048, "reward_change_mean": -0.14160654600709677, "reward_change_min": -0.26733815390616655, "reward_change_std": 0.10602262848988175, "reward_std": 0.9424855262041092, "rewards/cosine_scaled_reward": 0.10615562507882714, "rewards/format_reward": 0.729166679084301, "step": 389 }, { "advantage_max": 1.5434679314494133, "advantage_mean": -1.117587122845265e-08, "advantage_min": -0.987445343285799, "advantage_std": 0.9908928908407688, "completion_length": 2833.2501068115234, "epoch": 0.44571428571428573, "grad_norm": 0.9356141090393066, "kl": 0.3245697021484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.2629708984760706e-07, "loss": 0.0495, "reward": 0.40416749380528927, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.40416749380528927, "reward_after_std": 0.9908928833901882, "reward_before_mean": 0.5042689014226198, "reward_before_std": 1.0035798326134682, "reward_change_max": 0.0, "reward_change_mean": -0.10010139970108867, "reward_change_min": -0.22962007019668818, "reward_change_std": 0.08653676975518465, "reward_std": 0.9908929243683815, "rewards/cosine_scaled_reward": -0.06036556634353474, "rewards/format_reward": 0.6250000093132257, "step": 390 }, { "advantage_max": 1.4135627299547195, "advantage_mean": -6.208816238917336e-10, "advantage_min": -1.1502118483185768, "advantage_std": 1.0554303713142872, "completion_length": 2525.7709045410156, "epoch": 0.44685714285714284, "grad_norm": 1.8921399116516113, "kl": 0.26483154296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.2412266235313973e-07, "loss": 0.0684, "reward": 0.6798726860433817, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6798726860433817, "reward_after_std": 1.0554304011166096, "reward_before_mean": 0.8076426119860116, "reward_before_std": 1.0806686542928219, "reward_change_max": 0.000661991536617279, "reward_change_mean": -0.12776989629492164, "reward_change_min": -0.2711303811520338, "reward_change_std": 0.11271528014913201, "reward_std": 1.0554304346442223, "rewards/cosine_scaled_reward": 0.07048794813454151, "rewards/format_reward": 0.6666666753590107, "step": 391 }, { "advantage_max": 1.0657275505363941, "advantage_mean": -2.3903946183567726e-08, "advantage_min": -0.8924349471926689, "advantage_std": 0.7697158344089985, "completion_length": 2493.2708892822266, "epoch": 0.448, "grad_norm": 0.7100505232810974, "kl": 0.3263092041015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.2196411766036487e-07, "loss": 0.0461, "reward": 0.48778675869107246, "reward_advantage_correlation": 0.9999999999999996, "reward_after_mean": 0.48778675869107246, "reward_after_std": 0.769715815782547, "reward_before_mean": 0.6039394419640303, "reward_before_std": 0.7809349689632654, "reward_change_max": 0.00030147284269332886, "reward_change_mean": -0.11615270469337702, "reward_change_min": -0.24055636301636696, "reward_change_std": 0.09163055196404457, "reward_std": 0.7697158381342888, "rewards/cosine_scaled_reward": -0.03136360924690962, "rewards/format_reward": 0.6666666753590107, "step": 392 }, { "advantage_max": 1.5679549127817154, "advantage_mean": -3.973643114552061e-08, "advantage_min": -1.5204594507813454, "advantage_std": 1.2267358228564262, "completion_length": 2753.729278564453, "epoch": 0.4491428571428571, "grad_norm": 1.968538522720337, "kl": 0.2186279296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.1982156097370557e-07, "loss": 0.091, "reward": 1.0800715144723654, "reward_advantage_correlation": 1.0, "reward_after_mean": 1.0800715144723654, "reward_after_std": 1.2267358228564262, "reward_before_mean": 1.242100728675723, "reward_before_std": 1.264978300780058, "reward_change_max": 8.808821439743042e-05, "reward_change_mean": -0.16202920861542225, "reward_change_min": -0.3233691677451134, "reward_change_std": 0.13310158113017678, "reward_std": 1.2267358973622322, "rewards/cosine_scaled_reward": 0.23563368245959282, "rewards/format_reward": 0.7708333507180214, "step": 393 }, { "advantage_max": 1.1774890311062336, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -0.7850269712507725, "advantage_std": 0.7283446006476879, "completion_length": 3308.5834045410156, "epoch": 0.4502857142857143, "grad_norm": 0.6988861560821533, "kl": 0.4105224609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.1769509671835223e-07, "loss": 0.0356, "reward": 0.07187341991811991, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07187341991811991, "reward_after_std": 0.7283445969223976, "reward_before_mean": 0.14786747004836798, "reward_before_std": 0.731137853115797, "reward_change_max": 0.0, "reward_change_mean": -0.07599404593929648, "reward_change_min": -0.15443053469061852, "reward_change_std": 0.060382971074432135, "reward_std": 0.7283446118235588, "rewards/cosine_scaled_reward": -0.17606628267094493, "rewards/format_reward": 0.5000000111758709, "step": 394 }, { "advantage_max": 1.1270059682428837, "advantage_mean": -1.7384688244526103e-08, "advantage_min": -1.0076443776488304, "advantage_std": 0.7832858189940453, "completion_length": 2537.8959197998047, "epoch": 0.4514285714285714, "grad_norm": 0.5426788926124573, "kl": 0.230255126953125, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.1558482853517253e-07, "loss": 0.0156, "reward": 0.8596542216837406, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8596542216837406, "reward_after_std": 0.783285815268755, "reward_before_mean": 1.0088724344968796, "reward_before_std": 0.7832663394510746, "reward_change_max": 0.0, "reward_change_mean": -0.1492181965149939, "reward_change_min": -0.25157369300723076, "reward_change_std": 0.09793313452973962, "reward_std": 0.7832858189940453, "rewards/cosine_scaled_reward": 0.11901953746564686, "rewards/format_reward": 0.7708333469927311, "step": 395 }, { "advantage_max": 1.4208708554506302, "advantage_mean": -2.9802322942806825e-08, "advantage_min": -1.0534027591347694, "advantage_std": 0.9146955572068691, "completion_length": 3212.0626220703125, "epoch": 0.45257142857142857, "grad_norm": 0.7655185461044312, "kl": 0.42913818359375, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.134908592756607e-07, "loss": 0.0721, "reward": 0.55569236446172, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.55569236446172, "reward_after_std": 0.9146955460309982, "reward_before_mean": 0.6718438614625484, "reward_before_std": 0.9166472069919109, "reward_change_max": 0.0, "reward_change_mean": -0.116151487454772, "reward_change_min": -0.2097308114171028, "reward_change_std": 0.0818931176327169, "reward_std": 0.9146955572068691, "rewards/cosine_scaled_reward": 0.002588571864180267, "rewards/format_reward": 0.666666692122817, "step": 396 }, { "advantage_max": 1.170277938246727, "advantage_mean": -9.934107814135729e-09, "advantage_min": -0.7947026267647743, "advantage_std": 0.7339077740907669, "completion_length": 2839.416748046875, "epoch": 0.45371428571428574, "grad_norm": 0.4377634525299072, "kl": 0.171051025390625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.1141329099692406e-07, "loss": 0.0284, "reward": 0.6288949530571699, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6288949530571699, "reward_after_std": 0.7339077889919281, "reward_before_mean": 0.7566788010299206, "reward_before_std": 0.7199768051505089, "reward_change_max": 0.00020315498113632202, "reward_change_mean": -0.1277838561218232, "reward_change_min": -0.20927806198596954, "reward_change_std": 0.08163281762972474, "reward_std": 0.7339078187942505, "rewards/cosine_scaled_reward": -0.00707725714892149, "rewards/format_reward": 0.7708333432674408, "step": 397 }, { "advantage_max": 1.0407202914357185, "advantage_mean": 9.934108424758392e-09, "advantage_min": -0.8572842329740524, "advantage_std": 0.6855289153754711, "completion_length": 2837.166732788086, "epoch": 0.45485714285714285, "grad_norm": 1.068537712097168, "kl": 0.31781005859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.0935222495670968e-07, "loss": 0.0099, "reward": 0.3281229701824486, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3281229701824486, "reward_after_std": 0.6855289153754711, "reward_before_mean": 0.43000722490251064, "reward_before_std": 0.6828913427889347, "reward_change_max": 0.00022487342357635498, "reward_change_mean": -0.10188420582562685, "reward_change_min": -0.18412470445036888, "reward_change_std": 0.07508498663082719, "reward_std": 0.6855289451777935, "rewards/cosine_scaled_reward": -0.10791307222098112, "rewards/format_reward": 0.6458333488553762, "step": 398 }, { "advantage_max": 1.341847501695156, "advantage_mean": -4.346172111091562e-08, "advantage_min": -1.1149200797080994, "advantage_std": 0.9616120122373104, "completion_length": 2445.3333740234375, "epoch": 0.456, "grad_norm": 1.9017913341522217, "kl": 0.23044586181640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.0730776160846853e-07, "loss": 0.098, "reward": 0.8614312242716551, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8614312242716551, "reward_after_std": 0.9616120085120201, "reward_before_mean": 1.0071106106042862, "reward_before_std": 0.9746308140456676, "reward_change_max": 0.0, "reward_change_mean": -0.14567939471453428, "reward_change_min": -0.28493910282850266, "reward_change_std": 0.10870950575917959, "reward_std": 0.9616120085120201, "rewards/cosine_scaled_reward": 0.10772196669131517, "rewards/format_reward": 0.7916666865348816, "step": 399 }, { "advantage_max": 1.374092049896717, "advantage_mean": -1.924733378233512e-08, "advantage_min": -1.041805051267147, "advantage_std": 0.9453696236014366, "completion_length": 2148.9583740234375, "epoch": 0.45714285714285713, "grad_norm": 0.6918766498565674, "kl": 0.1371002197265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.0528000059645995e-07, "loss": 0.0116, "reward": 1.2651937678456306, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 1.2651937678456306, "reward_after_std": 0.9453696310520172, "reward_before_mean": 1.4482441246509552, "reward_before_std": 0.9435032866895199, "reward_change_max": 0.0, "reward_change_mean": -0.18305037543177605, "reward_change_min": -0.3167814239859581, "reward_change_std": 0.12038358487188816, "reward_std": 0.9453696794807911, "rewards/cosine_scaled_reward": 0.2762054104823619, "rewards/format_reward": 0.8958333432674408, "step": 400 }, { "advantage_max": 1.067967213690281, "advantage_mean": -8.071463275527435e-09, "advantage_min": -1.150337852537632, "advantage_std": 0.8019844517111778, "completion_length": 3032.1250915527344, "epoch": 0.4582857142857143, "grad_norm": 0.7639958262443542, "kl": 0.423126220703125, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.032690407508949e-07, "loss": 0.0298, "reward": 0.43492011073976755, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.43492011073976755, "reward_after_std": 0.8019844405353069, "reward_before_mean": 0.5461161928251386, "reward_before_std": 0.8226968869566917, "reward_change_max": 8.496642112731934e-05, "reward_change_mean": -0.11119608022272587, "reward_change_min": -0.19766813702881336, "reward_change_std": 0.08586510363966227, "reward_std": 0.8019844740629196, "rewards/cosine_scaled_reward": -0.06027523800730705, "rewards/format_reward": 0.6666666865348816, "step": 401 }, { "advantage_max": 0.9436207935214043, "advantage_mean": -3.104408685672411e-08, "advantage_min": -0.7495729178190231, "advantage_std": 0.6268677823245525, "completion_length": 2520.229217529297, "epoch": 0.4594285714285714, "grad_norm": 0.8622435927391052, "kl": 0.3449249267578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 2.0127498008311922e-07, "loss": 0.0176, "reward": 0.41957162227481604, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.41957162227481604, "reward_after_std": 0.6268677823245525, "reward_before_mean": 0.5319516197778285, "reward_before_std": 0.6171875484287739, "reward_change_max": 8.885562419891357e-05, "reward_change_mean": -0.11238000728189945, "reward_change_min": -0.1966311875730753, "reward_change_std": 0.07510491507127881, "reward_std": 0.6268677972257137, "rewards/cosine_scaled_reward": -0.07777421269565821, "rewards/format_reward": 0.6875000167638063, "step": 402 }, { "advantage_max": 1.0418415665626526, "advantage_mean": 4.9670538238011375e-09, "advantage_min": -1.0355667266994715, "advantage_std": 0.7518435195088387, "completion_length": 2340.083366394043, "epoch": 0.4605714285714286, "grad_norm": 0.7467604875564575, "kl": 0.1708984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.9929791578083655e-07, "loss": 0.0142, "reward": 0.5140412461478263, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5140412461478263, "reward_after_std": 0.7518434971570969, "reward_before_mean": 0.6334147532470524, "reward_before_std": 0.75905573181808, "reward_change_max": 0.00014617294073104858, "reward_change_mean": -0.11937349662184715, "reward_change_min": -0.19436869025230408, "reward_change_std": 0.08348292578011751, "reward_std": 0.7518435046076775, "rewards/cosine_scaled_reward": 0.014624039176851511, "rewards/format_reward": 0.6041666865348816, "step": 403 }, { "advantage_max": 0.6259809099137783, "advantage_mean": -1.7229467852430957e-08, "advantage_min": -0.5122467614710331, "advantage_std": 0.43664512410759926, "completion_length": 2699.7083740234375, "epoch": 0.4617142857142857, "grad_norm": 0.715581476688385, "kl": 0.31291961669921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.9733794420337213e-07, "loss": 0.0129, "reward": 0.520903637050651, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.520903637050651, "reward_after_std": 0.43664513528347015, "reward_before_mean": 0.6486049126833677, "reward_before_std": 0.41897546872496605, "reward_change_max": 0.0, "reward_change_mean": -0.12770125456154346, "reward_change_min": -0.1968603590503335, "reward_change_std": 0.07488203165121377, "reward_std": 0.43664515018463135, "rewards/cosine_scaled_reward": -0.009030893445014954, "rewards/format_reward": 0.6666666716337204, "step": 404 }, { "advantage_max": 1.0945487841963768, "advantage_mean": -2.7939678071131624e-08, "advantage_min": -1.289386235177517, "advantage_std": 0.9108894169330597, "completion_length": 2435.4792404174805, "epoch": 0.46285714285714286, "grad_norm": 0.494119256734848, "kl": 0.3772735595703125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.9539516087697517e-07, "loss": 0.0535, "reward": 0.7584564303979278, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7584564303979278, "reward_after_std": 0.9108893796801567, "reward_before_mean": 0.8993258298141882, "reward_before_std": 0.942900113761425, "reward_change_max": 0.0, "reward_change_mean": -0.14086939627304673, "reward_change_min": -0.25777516420930624, "reward_change_std": 0.10722628142684698, "reward_std": 0.9108894020318985, "rewards/cosine_scaled_reward": 0.12674623914062977, "rewards/format_reward": 0.6458333507180214, "step": 405 }, { "advantage_max": 1.0511068068444729, "advantage_mean": -6.084641079873165e-08, "advantage_min": -1.0219291038811207, "advantage_std": 0.7899200022220612, "completion_length": 2561.8334045410156, "epoch": 0.464, "grad_norm": 0.665023148059845, "kl": 0.25701904296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.934696604901642e-07, "loss": 0.0351, "reward": 1.0501489378511906, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 1.0501489378511906, "reward_after_std": 0.7899199984967709, "reward_before_mean": 1.219081237912178, "reward_before_std": 0.7882814519107342, "reward_change_max": 0.00011001527309417725, "reward_change_mean": -0.16893231682479382, "reward_change_min": -0.282260874286294, "reward_change_std": 0.10838037542998791, "reward_std": 0.7899200432002544, "rewards/cosine_scaled_reward": 0.18245727149769664, "rewards/format_reward": 0.8541666772216558, "step": 406 }, { "advantage_max": 0.902554202824831, "advantage_mean": -3.7252904538931375e-08, "advantage_min": -0.7991390824317932, "advantage_std": 0.6586268059909344, "completion_length": 2573.2291946411133, "epoch": 0.46514285714285714, "grad_norm": 0.5235826969146729, "kl": 0.2978057861328125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.915615368891117e-07, "loss": 0.0504, "reward": 0.7006958748097531, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7006958748097531, "reward_after_std": 0.6586268097162247, "reward_before_mean": 0.8396258531138301, "reward_before_std": 0.6540814377367496, "reward_change_max": 0.0, "reward_change_mean": -0.13892999943345785, "reward_change_min": -0.2336738519370556, "reward_change_std": 0.09190351748839021, "reward_std": 0.6586268320679665, "rewards/cosine_scaled_reward": 0.07606291398406029, "rewards/format_reward": 0.6875000037252903, "step": 407 }, { "advantage_max": 1.0302674248814583, "advantage_mean": -4.346172205460519e-08, "advantage_min": -1.0587206855416298, "advantage_std": 0.7980893142521381, "completion_length": 2748.6042098999023, "epoch": 0.4662857142857143, "grad_norm": 0.49642956256866455, "kl": 0.37957763671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.8967088307307e-07, "loss": 0.0551, "reward": 0.5339444152486976, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5339444152486976, "reward_after_std": 0.7980893142521381, "reward_before_mean": 0.6553622125647962, "reward_before_std": 0.8187281489372253, "reward_change_max": 0.0005005300045013428, "reward_change_mean": -0.1214177985675633, "reward_change_min": -0.23425185028463602, "reward_change_std": 0.09430251410230994, "reward_std": 0.7980893328785896, "rewards/cosine_scaled_reward": 0.015181094408035278, "rewards/format_reward": 0.6250000111758709, "step": 408 }, { "advantage_max": 1.1790355741977692, "advantage_mean": -9.934107536579972e-09, "advantage_min": -1.1195921525359154, "advantage_std": 0.8626318871974945, "completion_length": 3257.8750610351562, "epoch": 0.4674285714285714, "grad_norm": 1.0251151323318481, "kl": 0.440673828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.8779779118983867e-07, "loss": 0.0102, "reward": 0.34495530603453517, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.34495530603453517, "reward_after_std": 0.8626318797469139, "reward_before_mean": 0.4460237352177501, "reward_before_std": 0.8854938633739948, "reward_change_max": 0.00019879639148712158, "reward_change_mean": -0.1010684184730053, "reward_change_min": -0.21322645619511604, "reward_change_std": 0.0844197073020041, "reward_std": 0.8626318946480751, "rewards/cosine_scaled_reward": -0.1311548100784421, "rewards/format_reward": 0.708333358168602, "step": 409 }, { "advantage_max": 1.0071330815553665, "advantage_mean": 3.1044085080367267e-09, "advantage_min": -0.9616048745810986, "advantage_std": 0.7821291163563728, "completion_length": 2852.8125610351562, "epoch": 0.4685714285714286, "grad_norm": 0.9547456502914429, "kl": 0.550445556640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.8594235253127372e-07, "loss": 0.0562, "reward": 0.2552414983510971, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2552414983510971, "reward_after_std": 0.7821291275322437, "reward_before_mean": 0.3507663235068321, "reward_before_std": 0.804282508790493, "reward_change_max": 6.768107414245605e-05, "reward_change_mean": -0.09552482329308987, "reward_change_min": -0.18692217115312815, "reward_change_std": 0.0792939979583025, "reward_std": 0.7821291498839855, "rewards/cosine_scaled_reward": -0.08503350615501404, "rewards/format_reward": 0.5208333488553762, "step": 410 }, { "advantage_max": 0.9848299585282803, "advantage_mean": -1.8936892498544466e-08, "advantage_min": -1.0076408050954342, "advantage_std": 0.7584920935332775, "completion_length": 3145.854278564453, "epoch": 0.4697142857142857, "grad_norm": 0.4657355844974518, "kl": 0.417236328125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.8410465752883758e-07, "loss": 0.0387, "reward": 0.5571842957288027, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5571842957288027, "reward_after_std": 0.758492112159729, "reward_before_mean": 0.6817128874827176, "reward_before_std": 0.7747105807065964, "reward_change_max": 0.00012461096048355103, "reward_change_mean": -0.12452859850600362, "reward_change_min": -0.22812595777213573, "reward_change_std": 0.0898899482563138, "reward_std": 0.7584921382367611, "rewards/cosine_scaled_reward": -0.013310234993696213, "rewards/format_reward": 0.7083333525806665, "step": 411 }, { "advantage_max": 1.2851012870669365, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -1.1866544671356678, "advantage_std": 0.9409245103597641, "completion_length": 3118.4584045410156, "epoch": 0.47085714285714286, "grad_norm": 0.8590179085731506, "kl": 0.3385009765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.822847957491922e-07, "loss": 0.0475, "reward": 0.6860095746815205, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6860095746815205, "reward_after_std": 0.9409245178103447, "reward_before_mean": 0.8169820861658081, "reward_before_std": 0.9591667316854, "reward_change_max": 0.0, "reward_change_mean": -0.130972508341074, "reward_change_min": -0.2485770285129547, "reward_change_std": 0.09931665565818548, "reward_std": 0.9409245401620865, "rewards/cosine_scaled_reward": 0.012657706625759602, "rewards/format_reward": 0.7916666865348816, "step": 412 }, { "advantage_max": 1.0960227698087692, "advantage_mean": -2.3593505704688766e-08, "advantage_min": -1.1873703114688396, "advantage_std": 0.8447449170053005, "completion_length": 2740.6251068115234, "epoch": 0.472, "grad_norm": 0.8115954995155334, "kl": 0.342376708984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.804828558898332e-07, "loss": 0.0565, "reward": 0.6243542423471808, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6243542423471808, "reward_after_std": 0.8447449207305908, "reward_before_mean": 0.7526226807385683, "reward_before_std": 0.8638904727995396, "reward_change_max": 0.0, "reward_change_mean": -0.12826845049858093, "reward_change_min": -0.22999808378517628, "reward_change_std": 0.09474454261362553, "reward_std": 0.8447449542582035, "rewards/cosine_scaled_reward": 0.022144658491015434, "rewards/format_reward": 0.7083333469927311, "step": 413 }, { "advantage_max": 1.248441867530346, "advantage_mean": 7.76102138111412e-09, "advantage_min": -0.8844185099005699, "advantage_std": 0.8054223507642746, "completion_length": 3167.0209350585938, "epoch": 0.47314285714285714, "grad_norm": 0.6134005784988403, "kl": 0.59326171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.7869892577476722e-07, "loss": 0.072, "reward": 0.06549638044089079, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.06549638044089079, "reward_after_std": 0.8054223656654358, "reward_before_mean": 0.1393390439916402, "reward_before_std": 0.8182446286082268, "reward_change_max": 0.0002498254179954529, "reward_change_mean": -0.07384267030283809, "reward_change_min": -0.15592787880450487, "reward_change_std": 0.06589569803327322, "reward_std": 0.8054223731160164, "rewards/cosine_scaled_reward": -0.1803304860368371, "rewards/format_reward": 0.5000000111758709, "step": 414 }, { "advantage_max": 1.1541544646024704, "advantage_mean": 9.313226023710541e-09, "advantage_min": -0.7554533295333385, "advantage_std": 0.7500776499509811, "completion_length": 3323.0834045410156, "epoch": 0.4742857142857143, "grad_norm": 0.9510299563407898, "kl": 0.78515625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.7693309235023127e-07, "loss": 0.0881, "reward": 0.026195455342531204, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.026195455342531204, "reward_after_std": 0.7500776499509811, "reward_before_mean": 0.0982854962348938, "reward_before_std": 0.7606666944921017, "reward_change_max": 0.0, "reward_change_mean": -0.07209003553725779, "reward_change_min": -0.15941832214593887, "reward_change_std": 0.06286179949529469, "reward_std": 0.7500776574015617, "rewards/cosine_scaled_reward": -0.12794058211147785, "rewards/format_reward": 0.3541666716337204, "step": 415 }, { "advantage_max": 1.1921259351074696, "advantage_mean": -2.4214388605336978e-08, "advantage_min": -1.027435451745987, "advantage_std": 0.8444834239780903, "completion_length": 2599.604263305664, "epoch": 0.4754285714285714, "grad_norm": 0.5963953733444214, "kl": 0.261688232421875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.7518544168045524e-07, "loss": -0.014, "reward": 0.5983389317989349, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5983389317989349, "reward_after_std": 0.8444834165275097, "reward_before_mean": 0.7227395437657833, "reward_before_std": 0.8563327789306641, "reward_change_max": 0.0002361685037612915, "reward_change_mean": -0.1244005998596549, "reward_change_min": -0.24791191704571247, "reward_change_std": 0.09307907475158572, "reward_std": 0.8444834351539612, "rewards/cosine_scaled_reward": -0.03446358081419021, "rewards/format_reward": 0.7916666902601719, "step": 416 }, { "advantage_max": 1.2748055160045624, "advantage_mean": -1.2417634698280722e-08, "advantage_min": -0.8345907405018806, "advantage_std": 0.8307083323597908, "completion_length": 3296.3959045410156, "epoch": 0.4765714285714286, "grad_norm": 0.658596396446228, "kl": 0.4886474609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.7345605894346726e-07, "loss": 0.0529, "reward": 0.01480916142463684, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.01480916142463684, "reward_after_std": 0.8307083398103714, "reward_before_mean": 0.08347079087980092, "reward_before_std": 0.8477018140256405, "reward_change_max": 0.0005599185824394226, "reward_change_mean": -0.06866162805818021, "reward_change_min": -0.17168361693620682, "reward_change_std": 0.06843259232118726, "reward_std": 0.8307083509862423, "rewards/cosine_scaled_reward": -0.16659794500446878, "rewards/format_reward": 0.41666667722165585, "step": 417 }, { "advantage_max": 1.0702146142721176, "advantage_mean": -2.2972623692218974e-08, "advantage_min": -1.0531324371695518, "advantage_std": 0.7828693352639675, "completion_length": 2456.52091217041, "epoch": 0.4777142857142857, "grad_norm": 0.5361673831939697, "kl": 0.20428466796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.7174502842694212e-07, "loss": 0.0059, "reward": 0.9526145923882723, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.9526145923882723, "reward_after_std": 0.7828693278133869, "reward_before_mean": 1.1116907584219007, "reward_before_std": 0.7802752666175365, "reward_change_max": 0.00038120150566101074, "reward_change_mean": -0.15907620172947645, "reward_change_min": -0.25695937499403954, "reward_change_std": 0.10400415351614356, "reward_std": 0.7828693389892578, "rewards/cosine_scaled_reward": 0.1391787314787507, "rewards/format_reward": 0.8333333469927311, "step": 418 }, { "advantage_max": 1.3333582356572151, "advantage_mean": -2.126519908773883e-08, "advantage_min": -1.1237758100032806, "advantage_std": 0.9614992318674922, "completion_length": 3064.479248046875, "epoch": 0.47885714285714287, "grad_norm": 0.8987430930137634, "kl": 0.388641357421875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.7005243352409333e-07, "loss": 0.0697, "reward": 0.7200150811113417, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7200150811113417, "reward_after_std": 0.9614992393180728, "reward_before_mean": 0.8526841946877539, "reward_before_std": 0.9812450297176838, "reward_change_max": 7.344037294387817e-05, "reward_change_mean": -0.1326691498979926, "reward_change_min": -0.2474001133814454, "reward_change_std": 0.10230770613998175, "reward_std": 0.961499254219234, "rewards/cosine_scaled_reward": 0.10342542547732592, "rewards/format_reward": 0.645833345130086, "step": 419 }, { "advantage_max": 0.8133354522287846, "advantage_mean": -2.6077033421501028e-08, "advantage_min": -0.9922023713588715, "advantage_std": 0.6553318947553635, "completion_length": 2480.4375762939453, "epoch": 0.48, "grad_norm": 0.4735735356807709, "kl": 0.289306640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.6837835672960831e-07, "loss": 0.0418, "reward": 0.5324579540174454, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.5324579540174454, "reward_after_std": 0.6553318910300732, "reward_before_mean": 0.657228053547442, "reward_before_std": 0.6681869141757488, "reward_change_max": 0.00011757761240005493, "reward_change_mean": -0.12477011140435934, "reward_change_min": -0.21673501282930374, "reward_change_std": 0.08654853561893106, "reward_std": 0.6553319171071053, "rewards/cosine_scaled_reward": -0.05680265463888645, "rewards/format_reward": 0.7708333469927311, "step": 420 }, { "advantage_max": 1.2419108375906944, "advantage_mean": -1.9247333726823967e-08, "advantage_min": -0.9631900601089001, "advantage_std": 0.8234320022165775, "completion_length": 3331.9375915527344, "epoch": 0.48114285714285715, "grad_norm": 0.5381994843482971, "kl": 0.4439697265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.6672287963562852e-07, "loss": 0.048, "reward": 0.3111051223240793, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3111051223240793, "reward_after_std": 0.8234320022165775, "reward_before_mean": 0.40812125336378813, "reward_before_std": 0.8344125263392925, "reward_change_max": 6.553530693054199e-05, "reward_change_mean": -0.09701612405478954, "reward_change_min": -0.1940639168024063, "reward_change_std": 0.07709357934072614, "reward_std": 0.823432020843029, "rewards/cosine_scaled_reward": -0.1292727179825306, "rewards/format_reward": 0.666666679084301, "step": 421 }, { "advantage_max": 0.9083509668707848, "advantage_mean": 9.313225912688239e-09, "advantage_min": -0.9410501569509506, "advantage_std": 0.7073789015412331, "completion_length": 2946.7084197998047, "epoch": 0.48228571428571426, "grad_norm": 0.702170193195343, "kl": 0.3853759765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.6508608292777203e-07, "loss": 0.0758, "reward": 0.5041839517652988, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5041839517652988, "reward_after_std": 0.7073789089918137, "reward_before_mean": 0.6250940449535847, "reward_before_std": 0.7243058681488037, "reward_change_max": 0.00022362172603607178, "reward_change_mean": -0.12091004336252809, "reward_change_min": -0.21036983001977205, "reward_change_std": 0.08578781271353364, "reward_std": 0.7073789089918137, "rewards/cosine_scaled_reward": -0.010369660332798958, "rewards/format_reward": 0.6458333432674408, "step": 422 }, { "advantage_max": 1.0704151839017868, "advantage_mean": 1.2417634698280722e-09, "advantage_min": -0.9820942431688309, "advantage_std": 0.7664654031395912, "completion_length": 2857.979248046875, "epoch": 0.48342857142857143, "grad_norm": 0.4890437126159668, "kl": 0.3306427001953125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.6346804638120098e-07, "loss": 0.0261, "reward": 0.2683728828560561, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2683728828560561, "reward_after_std": 0.7664653956890106, "reward_before_mean": 0.36373477918095887, "reward_before_std": 0.7833064757287502, "reward_change_max": 0.00026363134384155273, "reward_change_mean": -0.09536188654601574, "reward_change_min": -0.189723776653409, "reward_change_std": 0.07672601472586393, "reward_std": 0.7664654068648815, "rewards/cosine_scaled_reward": -0.16188262542709708, "rewards/format_reward": 0.6875000074505806, "step": 423 }, { "advantage_max": 1.5122771635651588, "advantage_mean": -8.8475646253805e-09, "advantage_min": -0.9919125214219093, "advantage_std": 0.9600072056055069, "completion_length": 3214.666778564453, "epoch": 0.4845714285714286, "grad_norm": 0.8926342725753784, "kl": 0.4385986328125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.6186884885673413e-07, "loss": 0.0271, "reward": 0.34624925162643194, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.34624925162643194, "reward_after_std": 0.9600072205066681, "reward_before_mean": 0.4413239473942667, "reward_before_std": 0.9716643802821636, "reward_change_max": 0.00024952739477157593, "reward_change_mean": -0.09507470857352018, "reward_change_min": -0.20797365996986628, "reward_change_std": 0.07991787604987621, "reward_std": 0.9600072354078293, "rewards/cosine_scaled_reward": -0.10225469525903463, "rewards/format_reward": 0.6458333414047956, "step": 424 }, { "advantage_max": 0.8413158319890499, "advantage_mean": 5.5879355587151736e-09, "advantage_min": -1.0183219015598297, "advantage_std": 0.6862456165254116, "completion_length": 2265.0625762939453, "epoch": 0.4857142857142857, "grad_norm": 1.4573155641555786, "kl": 0.28778076171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.6028856829700258e-07, "loss": 0.0909, "reward": 1.170523855369538, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 1.170523855369538, "reward_after_std": 0.6862456277012825, "reward_before_mean": 1.3542293733917177, "reward_before_std": 0.6802042722702026, "reward_change_max": 0.00010611116886138916, "reward_change_mean": -0.18370548216626048, "reward_change_min": -0.2846982665359974, "reward_change_std": 0.11613735929131508, "reward_std": 0.6862456537783146, "rewards/cosine_scaled_reward": 0.3021146897226572, "rewards/format_reward": 0.7500000149011612, "step": 425 }, { "advantage_max": 1.037292331457138, "advantage_mean": -4.8428776211473235e-08, "advantage_min": -0.7733364477753639, "advantage_std": 0.6561478115618229, "completion_length": 2578.9375762939453, "epoch": 0.4868571428571429, "grad_norm": 0.6892431974411011, "kl": 0.2954254150390625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.5872728172265146e-07, "loss": 0.0218, "reward": 0.6788006233982742, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6788006233982742, "reward_after_std": 0.6561478264629841, "reward_before_mean": 0.8136748373508453, "reward_before_std": 0.6412300877273083, "reward_change_max": 4.620850086212158e-05, "reward_change_mean": -0.13487422419711947, "reward_change_min": -0.21194105129688978, "reward_change_std": 0.08242963580414653, "reward_std": 0.6561478637158871, "rewards/cosine_scaled_reward": 0.0005873972550034523, "rewards/format_reward": 0.8125000111758709, "step": 426 }, { "advantage_max": 1.1418819837272167, "advantage_mean": -1.3038516377683607e-08, "advantage_min": -0.6787732392549515, "advantage_std": 0.7229117751121521, "completion_length": 3192.9584045410156, "epoch": 0.488, "grad_norm": 0.9335200786590576, "kl": 0.42462158203125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.5718506522858572e-07, "loss": 0.0768, "reward": 0.11853348463773727, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11853348463773727, "reward_after_std": 0.7229117825627327, "reward_before_mean": 0.19935083203017712, "reward_before_std": 0.7258897684514523, "reward_change_max": 0.0003618001937866211, "reward_change_mean": -0.08081738196779042, "reward_change_min": -0.16983817890286446, "reward_change_std": 0.06711064896080643, "reward_std": 0.7229118067771196, "rewards/cosine_scaled_reward": -0.11907458305358887, "rewards/format_reward": 0.43750000558793545, "step": 427 }, { "advantage_max": 1.176058478653431, "advantage_mean": -1.2417634531747268e-08, "advantage_min": -0.9602077603340149, "advantage_std": 0.7935724556446075, "completion_length": 3005.7500915527344, "epoch": 0.48914285714285716, "grad_norm": 1.1662989854812622, "kl": 0.6168212890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.5566199398026147e-07, "loss": 0.0661, "reward": 0.18571929726749659, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18571929726749659, "reward_after_std": 0.7935724854469299, "reward_before_mean": 0.2716755969449878, "reward_before_std": 0.807749580591917, "reward_change_max": 0.00014090538024902344, "reward_change_mean": -0.08595629991032183, "reward_change_min": -0.16239676997065544, "reward_change_std": 0.06813311390578747, "reward_std": 0.7935724891722202, "rewards/cosine_scaled_reward": -0.12457887083292007, "rewards/format_reward": 0.5208333469927311, "step": 428 }, { "advantage_max": 1.0645763352513313, "advantage_mean": -1.3969839729455202e-09, "advantage_min": -0.9452930763363838, "advantage_std": 0.7396648898720741, "completion_length": 2547.31258392334, "epoch": 0.49028571428571427, "grad_norm": 0.6578018665313721, "kl": 0.35150146484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.5415814221002265e-07, "loss": 0.0201, "reward": 0.5265534557402134, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5265534557402134, "reward_after_std": 0.7396649047732353, "reward_before_mean": 0.6463110996410251, "reward_before_std": 0.7420226447284222, "reward_change_max": 0.00036235153675079346, "reward_change_mean": -0.11975763086229563, "reward_change_min": -0.21091782487928867, "reward_change_std": 0.08160475362092257, "reward_std": 0.7396649271249771, "rewards/cosine_scaled_reward": -0.0726778069511056, "rewards/format_reward": 0.7916666828095913, "step": 429 }, { "advantage_max": 1.0652894973754883, "advantage_mean": -1.7384688355548406e-08, "advantage_min": -0.9615795835852623, "advantage_std": 0.7271205633878708, "completion_length": 2463.6666870117188, "epoch": 0.49142857142857144, "grad_norm": 1.3026567697525024, "kl": 0.322784423828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.5267358321348285e-07, "loss": -0.0016, "reward": 0.6043818918988109, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6043818918988109, "reward_after_std": 0.727120541036129, "reward_before_mean": 0.7314365280326456, "reward_before_std": 0.7297204360365868, "reward_change_max": 0.0002305358648300171, "reward_change_mean": -0.12705462612211704, "reward_change_min": -0.2095300555229187, "reward_change_std": 0.08553120819851756, "reward_std": 0.7271205447614193, "rewards/cosine_scaled_reward": 0.03238492365926504, "rewards/format_reward": 0.666666679084301, "step": 430 }, { "advantage_max": 0.7961925268173218, "advantage_mean": -5.898376398416616e-09, "advantage_min": -0.7208251170814037, "advantage_std": 0.5705397799611092, "completion_length": 2729.104232788086, "epoch": 0.49257142857142855, "grad_norm": 1.024469256401062, "kl": 0.43096160888671875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.5120838934595337e-07, "loss": 0.0403, "reward": 0.13876285403966904, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.13876285403966904, "reward_after_std": 0.5705397836863995, "reward_before_mean": 0.22783136554062366, "reward_before_std": 0.5793787594884634, "reward_change_max": 0.0009065419435501099, "reward_change_mean": -0.08906851289793849, "reward_change_min": -0.1641480876132846, "reward_change_std": 0.06509990012273192, "reward_std": 0.5705397874116898, "rewards/cosine_scaled_reward": -0.18816765770316124, "rewards/format_reward": 0.6041666865348816, "step": 431 }, { "advantage_max": 0.9430552236735821, "advantage_mean": -3.104408785592483e-09, "advantage_min": -0.7989522684365511, "advantage_std": 0.705777489580214, "completion_length": 3185.3125915527344, "epoch": 0.4937142857142857, "grad_norm": 0.7096803188323975, "kl": 0.595458984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.4976263201891613e-07, "loss": 0.0778, "reward": 0.13925552484579384, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.13925552484579384, "reward_after_std": 0.705777489580214, "reward_before_mean": 0.22575496323406696, "reward_before_std": 0.7232791353017092, "reward_change_max": 0.00015789270401000977, "reward_change_mean": -0.08649945515207946, "reward_change_min": -0.1785599086433649, "reward_change_std": 0.07354599726386368, "reward_std": 0.7057775054126978, "rewards/cosine_scaled_reward": -0.15795585606247187, "rewards/format_reward": 0.5416666753590107, "step": 432 }, { "advantage_max": 0.9957953058183193, "advantage_mean": -4.0978195170460197e-08, "advantage_min": -1.1007677465677261, "advantage_std": 0.7739236429333687, "completion_length": 2984.6250915527344, "epoch": 0.4948571428571429, "grad_norm": 0.754916787147522, "kl": 0.4191131591796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.483363816965435e-07, "loss": 0.0321, "reward": 0.5658143066102639, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5658143066102639, "reward_after_std": 0.773923646658659, "reward_before_mean": 0.6907664993777871, "reward_before_std": 0.7903828285634518, "reward_change_max": 0.00024560093879699707, "reward_change_mean": -0.12495221896097064, "reward_change_min": -0.22821590304374695, "reward_change_std": 0.09297089325264096, "reward_std": 0.7739236764609814, "rewards/cosine_scaled_reward": 0.032883236184716225, "rewards/format_reward": 0.6250000186264515, "step": 433 }, { "advantage_max": 0.766203161329031, "advantage_mean": -5.587935336670569e-09, "advantage_min": -0.6076318752020597, "advantage_std": 0.512691916897893, "completion_length": 3136.812530517578, "epoch": 0.496, "grad_norm": 0.6693951487541199, "kl": 0.37371826171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.469297078922642e-07, "loss": 0.0292, "reward": 0.08525129873305559, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.08525129873305559, "reward_after_std": 0.5126919113099575, "reward_before_mean": 0.16985240951180458, "reward_before_std": 0.5098276436328888, "reward_change_max": 0.0, "reward_change_mean": -0.08460111077874899, "reward_change_min": -0.14587281458079815, "reward_change_std": 0.057145274709910154, "reward_std": 0.5126919187605381, "rewards/cosine_scaled_reward": -0.19632380595430732, "rewards/format_reward": 0.562500013038516, "step": 434 }, { "advantage_max": 0.9805018194019794, "advantage_mean": -3.352761368535795e-08, "advantage_min": -0.7217847239226103, "advantage_std": 0.6718719862401485, "completion_length": 2380.2500534057617, "epoch": 0.49714285714285716, "grad_norm": 1.3378220796585083, "kl": 0.2252197265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.4554267916537495e-07, "loss": -0.0097, "reward": 0.570288053364493, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.570288053364493, "reward_after_std": 0.6718719862401485, "reward_before_mean": 0.6957142185419798, "reward_before_std": 0.6674840692430735, "reward_change_max": 0.0, "reward_change_mean": -0.12542616669088602, "reward_change_min": -0.2183425473049283, "reward_change_std": 0.07983373990282416, "reward_std": 0.6718719974160194, "rewards/cosine_scaled_reward": -0.07922623585909605, "rewards/format_reward": 0.8541666716337204, "step": 435 }, { "advantage_max": 1.3979903608560562, "advantage_mean": -4.967053879312289e-09, "advantage_min": -0.9107687398791313, "advantage_std": 0.8552838861942291, "completion_length": 2412.479217529297, "epoch": 0.4982857142857143, "grad_norm": 0.7646883726119995, "kl": 0.275054931640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.4417536311769885e-07, "loss": -0.0185, "reward": 0.7218849333003163, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7218849333003163, "reward_after_std": 0.8552838936448097, "reward_before_mean": 0.8540548323653638, "reward_before_std": 0.8422790169715881, "reward_change_max": 9.534507989883423e-05, "reward_change_mean": -0.13216988369822502, "reward_change_min": -0.21940491441637278, "reward_change_std": 0.0889145196415484, "reward_std": 0.8552839010953903, "rewards/cosine_scaled_reward": 0.08327740104869008, "rewards/format_reward": 0.6875000149011612, "step": 436 }, { "advantage_max": 1.2850229367613792, "advantage_mean": -1.4901161915492622e-08, "advantage_min": -0.8939843475818634, "advantage_std": 0.845024760812521, "completion_length": 3216.6459045410156, "epoch": 0.49942857142857144, "grad_norm": 0.5741217732429504, "kl": 0.383544921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.4282782639029128e-07, "loss": 0.0534, "reward": 0.37288548797369003, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.37288548797369003, "reward_after_std": 0.8450247794389725, "reward_before_mean": 0.4743971526622772, "reward_before_std": 0.8533979505300522, "reward_change_max": 0.00014210492372512817, "reward_change_mean": -0.10151165537536144, "reward_change_min": -0.2013682834804058, "reward_change_std": 0.08048270735889673, "reward_std": 0.8450247906148434, "rewards/cosine_scaled_reward": -0.09613476321101189, "rewards/format_reward": 0.6666666809469461, "step": 437 }, { "advantage_max": 1.011772993952036, "advantage_mean": 1.8626452047421083e-09, "advantage_min": -1.0724971368908882, "advantage_std": 0.7722194865345955, "completion_length": 3073.229248046875, "epoch": 0.5005714285714286, "grad_norm": 0.7125306129455566, "kl": 0.4136962890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.4150013466019114e-07, "loss": 0.0266, "reward": 0.41990641644224524, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.41990641644224524, "reward_after_std": 0.7722194865345955, "reward_before_mean": 0.5311546549201012, "reward_before_std": 0.7916942909359932, "reward_change_max": 0.00031879544258117676, "reward_change_mean": -0.11124823242425919, "reward_change_min": -0.21365612745285034, "reward_change_std": 0.08680269494652748, "reward_std": 0.7722194939851761, "rewards/cosine_scaled_reward": -0.036506010219454765, "rewards/format_reward": 0.6041666772216558, "step": 438 }, { "advantage_max": 1.165678858757019, "advantage_mean": 1.3969838813521207e-08, "advantage_min": -0.8330172151327133, "advantage_std": 0.7271898277103901, "completion_length": 2788.0208892822266, "epoch": 0.5017142857142857, "grad_norm": 0.5193164348602295, "kl": 0.2929534912109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.4019235263722034e-07, "loss": 0.0461, "reward": 0.04328843858093023, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04328843858093023, "reward_after_std": 0.727189838886261, "reward_before_mean": 0.11658047512173653, "reward_before_std": 0.7333553172647953, "reward_change_max": 0.0002767816185951233, "reward_change_mean": -0.07329201139509678, "reward_change_min": -0.1345710689201951, "reward_change_std": 0.05936182173900306, "reward_std": 0.7271898537874222, "rewards/cosine_scaled_reward": -0.19170977361500263, "rewards/format_reward": 0.5000000074505806, "step": 439 }, { "advantage_max": 0.95999875664711, "advantage_mean": -2.5766592415266132e-08, "advantage_min": -0.671626940369606, "advantage_std": 0.6221942529082298, "completion_length": 3048.4375762939453, "epoch": 0.5028571428571429, "grad_norm": 0.40915608406066895, "kl": 0.3280029296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.3890454406082956e-07, "loss": 0.0393, "reward": 0.22823767503723502, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.22823767503723502, "reward_after_std": 0.6221942454576492, "reward_before_mean": 0.32270977552980185, "reward_before_std": 0.6170362383127213, "reward_change_max": 0.0, "reward_change_mean": -0.09447211399674416, "reward_change_min": -0.17463115137070417, "reward_change_std": 0.06761556677520275, "reward_std": 0.6221942603588104, "rewards/cosine_scaled_reward": -0.09906180715188384, "rewards/format_reward": 0.5208333488553762, "step": 440 }, { "advantage_max": 1.2837908938527107, "advantage_mean": -8.071462387349015e-09, "advantage_min": -1.015868429094553, "advantage_std": 0.8546808660030365, "completion_length": 3028.9583892822266, "epoch": 0.504, "grad_norm": 0.35905030369758606, "kl": 0.2718505859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.3763677169699217e-07, "loss": 0.0362, "reward": 0.4694512798450887, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4694512798450887, "reward_after_std": 0.8546808734536171, "reward_before_mean": 0.5803429093211889, "reward_before_std": 0.8632423244416714, "reward_change_max": 0.00014021247625350952, "reward_change_mean": -0.11089161830022931, "reward_change_min": -0.21306519862264395, "reward_change_std": 0.08292792178690434, "reward_std": 0.8546808958053589, "rewards/cosine_scaled_reward": -0.05357855744659901, "rewards/format_reward": 0.6875000111758709, "step": 441 }, { "advantage_max": 1.0009881034493446, "advantage_mean": -2.220446049250313e-16, "advantage_min": -0.8761501684784889, "advantage_std": 0.7383112497627735, "completion_length": 3243.604248046875, "epoch": 0.5051428571428571, "grad_norm": 0.4938925504684448, "kl": 0.2559814453125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.3638909733514452e-07, "loss": 0.0458, "reward": 0.7655965192243457, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.7655965192243457, "reward_after_std": 0.7383112460374832, "reward_before_mean": 0.9093698719516397, "reward_before_std": 0.742842298001051, "reward_change_max": 0.0, "reward_change_mean": -0.14377336343750358, "reward_change_min": -0.24906758219003677, "reward_change_std": 0.10069033224135637, "reward_std": 0.7383112534880638, "rewards/cosine_scaled_reward": 0.12135160126490518, "rewards/format_reward": 0.6666666734963655, "step": 442 }, { "advantage_max": 1.106526430696249, "advantage_mean": -6.208816905051151e-09, "advantage_min": -0.7966680526733398, "advantage_std": 0.6943358443677425, "completion_length": 3220.4791870117188, "epoch": 0.5062857142857143, "grad_norm": 0.3847130835056305, "kl": 0.34722900390625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.351615817851748e-07, "loss": 0.0509, "reward": 0.33592464402318, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.33592464402318, "reward_after_std": 0.6943358294665813, "reward_before_mean": 0.4375531330006197, "reward_before_std": 0.6889032647013664, "reward_change_max": 0.0, "reward_change_mean": -0.10162849072366953, "reward_change_min": -0.17682076431810856, "reward_change_std": 0.0679366267286241, "reward_std": 0.6943358518183231, "rewards/cosine_scaled_reward": -0.09372343437280506, "rewards/format_reward": 0.6250000223517418, "step": 443 }, { "advantage_max": 1.1278332397341728, "advantage_mean": -1.2417633588057697e-09, "advantage_min": -0.7738394141197205, "advantage_std": 0.7633233778178692, "completion_length": 3012.895896911621, "epoch": 0.5074285714285715, "grad_norm": 0.6785799860954285, "kl": 0.29107666015625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.3395428487445914e-07, "loss": 0.0142, "reward": 0.12987452652305365, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12987452652305365, "reward_after_std": 0.7633233852684498, "reward_before_mean": 0.21220868080854416, "reward_before_std": 0.7764322087168694, "reward_change_max": 0.0003639534115791321, "reward_change_mean": -0.0823341638315469, "reward_change_min": -0.1722795907407999, "reward_change_std": 0.0688524863217026, "reward_std": 0.7633234038949013, "rewards/cosine_scaled_reward": -0.15431233122944832, "rewards/format_reward": 0.5208333395421505, "step": 444 }, { "advantage_max": 1.5314294025301933, "advantage_mean": -2.483526961860605e-08, "advantage_min": -0.8162381164729595, "advantage_std": 0.9250198267400265, "completion_length": 3092.7709045410156, "epoch": 0.5085714285714286, "grad_norm": 1.2392834424972534, "kl": 0.28601837158203125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.3276726544494571e-07, "loss": 0.0478, "reward": 0.495319290086627, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.495319290086627, "reward_after_std": 0.9250198528170586, "reward_before_mean": 0.6040422953665257, "reward_before_std": 0.9202543366700411, "reward_change_max": 9.849667549133301e-05, "reward_change_mean": -0.10872298898175359, "reward_change_min": -0.20648100413382053, "reward_change_std": 0.08262008614838123, "reward_std": 0.925019882619381, "rewards/cosine_scaled_reward": 0.010354459285736084, "rewards/format_reward": 0.5833333414047956, "step": 445 }, { "advantage_max": 1.1504263132810593, "advantage_mean": 7.45058070794613e-09, "advantage_min": -1.0259768851101398, "advantage_std": 0.782253373414278, "completion_length": 3035.2708892822266, "epoch": 0.5097142857142857, "grad_norm": 0.3403970003128052, "kl": 0.2048492431640625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.316005813502869e-07, "loss": 0.0314, "reward": 0.4214175812667236, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4214175812667236, "reward_after_std": 0.7822533883154392, "reward_before_mean": 0.529712213203311, "reward_before_std": 0.7904498800635338, "reward_change_max": 0.00035788118839263916, "reward_change_mean": -0.1082946015521884, "reward_change_min": -0.18838701210916042, "reward_change_std": 0.07935713930055499, "reward_std": 0.7822534218430519, "rewards/cosine_scaled_reward": -0.04764390899799764, "rewards/format_reward": 0.6250000186264515, "step": 446 }, { "advantage_max": 0.8362768590450287, "advantage_mean": -7.76102188071448e-09, "advantage_min": -1.0166893266141415, "advantage_std": 0.6835263110697269, "completion_length": 2488.0625610351562, "epoch": 0.5108571428571429, "grad_norm": 0.8194699883460999, "kl": 0.17340087890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.3045428945301953e-07, "loss": 0.0391, "reward": 0.7818196527659893, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7818196527659893, "reward_after_std": 0.6835263259708881, "reward_before_mean": 0.9298957334831357, "reward_before_std": 0.6929495614022017, "reward_change_max": 0.0, "reward_change_mean": -0.1480760732665658, "reward_change_min": -0.23649182450026274, "reward_change_std": 0.09618731448426843, "reward_std": 0.6835263390094042, "rewards/cosine_scaled_reward": 0.02744785137474537, "rewards/format_reward": 0.8750000074505806, "step": 447 }, { "advantage_max": 1.0442199632525444, "advantage_mean": -7.45058070794613e-09, "advantage_min": -0.869889423251152, "advantage_std": 0.7612986005842686, "completion_length": 2814.187545776367, "epoch": 0.512, "grad_norm": 0.4111681878566742, "kl": 0.24267578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.2932844562179352e-07, "loss": 0.0191, "reward": 0.6223300769925117, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6223300769925117, "reward_after_std": 0.761298593133688, "reward_before_mean": 0.7515882328152657, "reward_before_std": 0.7706958763301373, "reward_change_max": 0.0003437027335166931, "reward_change_mean": -0.12925810925662518, "reward_change_min": -0.2418409138917923, "reward_change_std": 0.0902960505336523, "reward_std": 0.761298593133688, "rewards/cosine_scaled_reward": 0.0007940866053104401, "rewards/format_reward": 0.7500000055879354, "step": 448 }, { "advantage_max": 0.8848019167780876, "advantage_mean": -1.1796752796833232e-08, "advantage_min": -1.0409802421927452, "advantage_std": 0.7183829769492149, "completion_length": 2573.0000610351562, "epoch": 0.5131428571428571, "grad_norm": 0.21112467348575592, "kl": 0.1236724853515625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.2822310472864885e-07, "loss": 0.0106, "reward": 0.6152279544621706, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6152279544621706, "reward_after_std": 0.7183829881250858, "reward_before_mean": 0.7470788657665253, "reward_before_std": 0.7360207587480545, "reward_change_max": 0.0003293454647064209, "reward_change_mean": -0.13185090059414506, "reward_change_min": -0.22446555085480213, "reward_change_std": 0.09343716083094478, "reward_std": 0.7183830142021179, "rewards/cosine_scaled_reward": 0.029789404943585396, "rewards/format_reward": 0.6875000167638063, "step": 449 }, { "advantage_max": 0.9070663601160049, "advantage_mean": 6.208816238917336e-10, "advantage_min": -0.7719485089182854, "advantage_std": 0.6348018012940884, "completion_length": 2985.458366394043, "epoch": 0.5142857142857142, "grad_norm": 0.2994707524776459, "kl": 0.1786041259765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.2713832064634125e-07, "loss": 0.015, "reward": 0.2393764741718769, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2393764741718769, "reward_after_std": 0.6348018012940884, "reward_before_mean": 0.3358287693117745, "reward_before_std": 0.6404321789741516, "reward_change_max": 8.175522089004517e-05, "reward_change_mean": -0.09645229065790772, "reward_change_min": -0.18162197712808847, "reward_change_std": 0.06990106124430895, "reward_std": 0.6348018124699593, "rewards/cosine_scaled_reward": -0.12375229911413044, "rewards/format_reward": 0.5833333395421505, "step": 450 }, { "advantage_max": 1.2291451916098595, "advantage_mean": -4.656612540010485e-09, "advantage_min": -0.9268405549228191, "advantage_std": 0.8121864721179008, "completion_length": 2530.9375534057617, "epoch": 0.5154285714285715, "grad_norm": 0.9159375429153442, "kl": 0.1414642333984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.260741462457165e-07, "loss": 0.0199, "reward": 0.6339735409710556, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6339735409710556, "reward_after_std": 0.8121864832937717, "reward_before_mean": 0.7607014384120703, "reward_before_std": 0.8097108751535416, "reward_change_max": 0.0, "reward_change_mean": -0.1267279153689742, "reward_change_min": -0.22259232308715582, "reward_change_std": 0.0881945351138711, "reward_std": 0.8121864981949329, "rewards/cosine_scaled_reward": 0.07826739549636841, "rewards/format_reward": 0.6041666772216558, "step": 451 }, { "advantage_max": 1.7310744225978851, "advantage_mean": -2.6077032533322608e-08, "advantage_min": -1.0266026742756367, "advantage_std": 1.0066105760633945, "completion_length": 3234.8334045410156, "epoch": 0.5165714285714286, "grad_norm": 0.441463828086853, "kl": 0.22369384765625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.2503063339313356e-07, "loss": 0.0203, "reward": 0.507617705501616, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.507617705501616, "reward_after_std": 1.0066105723381042, "reward_before_mean": 0.6138361915946007, "reward_before_std": 0.9988056197762489, "reward_change_max": 0.00042747706174850464, "reward_change_mean": -0.10621848376467824, "reward_change_min": -0.20366192236542702, "reward_change_std": 0.08123700972646475, "reward_std": 1.0066106170415878, "rewards/cosine_scaled_reward": -0.005581922363489866, "rewards/format_reward": 0.6250000149011612, "step": 452 }, { "advantage_max": 1.331640511751175, "advantage_mean": -2.1730860721991263e-08, "advantage_min": -1.3682594373822212, "advantage_std": 1.0288936495780945, "completion_length": 2916.625102996826, "epoch": 0.5177142857142857, "grad_norm": 0.6561626195907593, "kl": 0.218505859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.2400783294793668e-07, "loss": 0.0292, "reward": 0.7023884258233011, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7023884258233011, "reward_after_std": 1.0288936868309975, "reward_before_mean": 0.8339581657201052, "reward_before_std": 1.0595490783452988, "reward_change_max": 2.3633241653442383e-05, "reward_change_mean": -0.13156970776617527, "reward_change_min": -0.2569648250937462, "reward_change_std": 0.10804524039849639, "reward_std": 1.0288937389850616, "rewards/cosine_scaled_reward": 0.07322905701585114, "rewards/format_reward": 0.6875000223517418, "step": 453 }, { "advantage_max": 1.180284183472395, "advantage_mean": -2.483526828633842e-09, "advantage_min": -0.8286248743534088, "advantage_std": 0.7885466255247593, "completion_length": 2831.9584045410156, "epoch": 0.5188571428571429, "grad_norm": 0.9318577647209167, "kl": 0.230743408203125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.2300579475997657e-07, "loss": 0.0306, "reward": 0.11954196076840162, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11954196076840162, "reward_after_std": 0.788546621799469, "reward_before_mean": 0.19991375133395195, "reward_before_std": 0.8013152182102203, "reward_change_max": 0.0003981366753578186, "reward_change_mean": -0.08037177752703428, "reward_change_min": -0.1756641110405326, "reward_change_std": 0.07187676522880793, "reward_std": 0.7885466404259205, "rewards/cosine_scaled_reward": -0.16045980621129274, "rewards/format_reward": 0.5208333432674408, "step": 454 }, { "advantage_max": 1.049359679222107, "advantage_mean": -4.65661276205509e-09, "advantage_min": -0.8113865703344345, "advantage_std": 0.7036336697638035, "completion_length": 3105.0834045410156, "epoch": 0.52, "grad_norm": 0.4618373215198517, "kl": 0.35552978515625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.220245676671809e-07, "loss": 0.029, "reward": 0.06979416310787201, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.06979416310787201, "reward_after_std": 0.7036336623132229, "reward_before_mean": 0.14800235256552696, "reward_before_std": 0.7142827957868576, "reward_change_max": 0.0004022940993309021, "reward_change_mean": -0.07820818712934852, "reward_change_min": -0.158931165933609, "reward_change_std": 0.06535158446058631, "reward_std": 0.7036336846649647, "rewards/cosine_scaled_reward": -0.20724883582443, "rewards/format_reward": 0.5625000074505806, "step": 455 }, { "advantage_max": 1.2895260006189346, "advantage_mean": 3.104408841103634e-09, "advantage_min": -0.9913373813033104, "advantage_std": 0.830539807677269, "completion_length": 3312.8334045410156, "epoch": 0.5211428571428571, "grad_norm": 0.4404752850532532, "kl": 0.27899169921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.2106419949317388e-07, "loss": 0.0332, "reward": 0.1983939576894045, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1983939576894045, "reward_after_std": 0.8305398598313332, "reward_before_mean": 0.28388113901019096, "reward_before_std": 0.8402737453579903, "reward_change_max": 0.0003027394413948059, "reward_change_mean": -0.08548715803772211, "reward_change_min": -0.18772473465651274, "reward_change_std": 0.07269950956106186, "reward_std": 0.8305399157106876, "rewards/cosine_scaled_reward": -0.170559449121356, "rewards/format_reward": 0.6250000223517418, "step": 456 }, { "advantage_max": 0.982379175722599, "advantage_mean": 5.510325196134147e-09, "advantage_min": -0.8538262210786343, "advantage_std": 0.6569894328713417, "completion_length": 3048.4375610351562, "epoch": 0.5222857142857142, "grad_norm": 0.39246371388435364, "kl": 0.21966552734375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.2012473704494537e-07, "loss": 0.0653, "reward": 0.3919263742864132, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3919263742864132, "reward_after_std": 0.6569894254207611, "reward_before_mean": 0.5009766188450158, "reward_before_std": 0.6586984917521477, "reward_change_max": 0.00028721243143081665, "reward_change_mean": -0.10905022826045752, "reward_change_min": -0.18408588599413633, "reward_change_std": 0.07368638599291444, "reward_std": 0.6569894477725029, "rewards/cosine_scaled_reward": -0.03076169639825821, "rewards/format_reward": 0.5625000074505806, "step": 457 }, { "advantage_max": 1.3128767609596252, "advantage_mean": -2.6077032977411818e-08, "advantage_min": -0.8374460823833942, "advantage_std": 0.8251619078218937, "completion_length": 2803.0000915527344, "epoch": 0.5234285714285715, "grad_norm": 0.8610221743583679, "kl": 0.218505859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1920622611056974e-07, "loss": 0.0219, "reward": 0.32338718697428703, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.32338718697428703, "reward_after_std": 0.8251619152724743, "reward_before_mean": 0.4198822174221277, "reward_before_std": 0.8259499967098236, "reward_change_max": 0.00017363578081130981, "reward_change_mean": -0.09649505442939699, "reward_change_min": -0.19915975630283356, "reward_change_std": 0.07579607097432017, "reward_std": 0.8251619413495064, "rewards/cosine_scaled_reward": -0.14422556664794683, "rewards/format_reward": 0.708333333954215, "step": 458 }, { "advantage_max": 1.0017457380890846, "advantage_mean": -3.72529057601767e-09, "advantage_min": -1.1438434720039368, "advantage_std": 0.8000783547759056, "completion_length": 2578.291717529297, "epoch": 0.5245714285714286, "grad_norm": 0.5348182916641235, "kl": 0.27069091796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1830871145697412e-07, "loss": 0.054, "reward": 0.6234663780778646, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6234663780778646, "reward_after_std": 0.8000783659517765, "reward_before_mean": 0.7535809008404613, "reward_before_std": 0.821883074939251, "reward_change_max": 9.752810001373291e-06, "reward_change_mean": -0.130114508792758, "reward_change_min": -0.23948706313967705, "reward_change_std": 0.09379158820956945, "reward_std": 0.8000783957540989, "rewards/cosine_scaled_reward": 0.04345710389316082, "rewards/format_reward": 0.6666666772216558, "step": 459 }, { "advantage_max": 1.278119184076786, "advantage_mean": -1.0244548737103898e-08, "advantage_min": -0.9226012080907822, "advantage_std": 0.8290133886039257, "completion_length": 3373.6250915527344, "epoch": 0.5257142857142857, "grad_norm": 0.5007908940315247, "kl": 0.3909912109375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1743223682775649e-07, "loss": 0.0507, "reward": 0.09350781515240669, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.09350781515240669, "reward_after_std": 0.8290134109556675, "reward_before_mean": 0.16933209914714098, "reward_before_std": 0.8431417122483253, "reward_change_max": 1.5437602996826172e-05, "reward_change_mean": -0.07582429051399231, "reward_change_min": -0.16332056745886803, "reward_change_std": 0.06777232186868787, "reward_std": 0.8290134258568287, "rewards/cosine_scaled_reward": -0.17575062531977892, "rewards/format_reward": 0.5208333488553762, "step": 460 }, { "advantage_max": 1.0003699101507664, "advantage_mean": 3.104408563547878e-09, "advantage_min": -0.7306459732353687, "advantage_std": 0.6616128720343113, "completion_length": 3015.0000610351562, "epoch": 0.5268571428571428, "grad_norm": 0.7025476694107056, "kl": 0.32537841796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1657684494105386e-07, "loss": 0.0168, "reward": 0.6671255268156528, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6671255268156528, "reward_after_std": 0.6616128571331501, "reward_before_mean": 0.8014153416152112, "reward_before_std": 0.652360200881958, "reward_change_max": 0.0, "reward_change_mean": -0.13428980251774192, "reward_change_min": -0.2290246021002531, "reward_change_std": 0.08612646535038948, "reward_std": 0.6616128906607628, "rewards/cosine_scaled_reward": 0.0882076546549797, "rewards/format_reward": 0.6250000093132257, "step": 461 }, { "advantage_max": 0.9321895092725754, "advantage_mean": 4.346172144398253e-09, "advantage_min": -0.8385965377092361, "advantage_std": 0.6647354513406754, "completion_length": 3085.791763305664, "epoch": 0.528, "grad_norm": 0.46087542176246643, "kl": 0.250518798828125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1574257748745986e-07, "loss": 0.0394, "reward": 0.05673941969871521, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.05673941969871521, "reward_after_std": 0.6647354401648045, "reward_before_mean": 0.13604146614670753, "reward_before_std": 0.6804654821753502, "reward_change_max": 0.00011178851127624512, "reward_change_mean": -0.0793020457495004, "reward_change_min": -0.16675184946507215, "reward_change_std": 0.06660343706607819, "reward_std": 0.6647354438900948, "rewards/cosine_scaled_reward": -0.20281260646879673, "rewards/format_reward": 0.5416666846722364, "step": 462 }, { "advantage_max": 1.1660524047911167, "advantage_mean": -1.7384688633104162e-08, "advantage_min": -0.8106147684156895, "advantage_std": 0.7458341121673584, "completion_length": 3201.1459197998047, "epoch": 0.5291428571428571, "grad_norm": 0.5006526708602905, "kl": 0.373992919921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1492947512799328e-07, "loss": 0.024, "reward": 0.18125806841999292, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18125806841999292, "reward_after_std": 0.7458341140300035, "reward_before_mean": 0.2674159649759531, "reward_before_std": 0.7493571043014526, "reward_change_max": 0.0, "reward_change_mean": -0.0861579212360084, "reward_change_min": -0.16297503747045994, "reward_change_std": 0.0660725818015635, "reward_std": 0.7458341177552938, "rewards/cosine_scaled_reward": -0.1162920305505395, "rewards/format_reward": 0.5000000074505806, "step": 463 }, { "advantage_max": 1.0869667418301105, "advantage_mean": -3.2906732894133484e-08, "advantage_min": -0.6707272604107857, "advantage_std": 0.6322797127068043, "completion_length": 2362.604217529297, "epoch": 0.5302857142857142, "grad_norm": 0.8603577017784119, "kl": 0.2738189697265625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1413757749211602e-07, "loss": -0.0015, "reward": 0.8882392137311399, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8882392137311399, "reward_after_std": 0.632279708981514, "reward_before_mean": 1.0414677765220404, "reward_before_std": 0.5925928438082337, "reward_change_max": 9.009987115859985e-05, "reward_change_mean": -0.15322855673730373, "reward_change_min": -0.22892808262258768, "reward_change_std": 0.08658680645748973, "reward_std": 0.6322797238826752, "rewards/cosine_scaled_reward": 0.11448386963456869, "rewards/format_reward": 0.8125000111758709, "step": 464 }, { "advantage_max": 1.2360083982348442, "advantage_mean": -1.2417636363615259e-09, "advantage_min": -0.8381345644593239, "advantage_std": 0.7440574951469898, "completion_length": 2958.4583740234375, "epoch": 0.5314285714285715, "grad_norm": 0.4997173249721527, "kl": 0.386474609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1336692317580158e-07, "loss": 0.0389, "reward": 0.08523056594276568, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.08523056594276568, "reward_after_std": 0.7440575174987316, "reward_before_mean": 0.1612645024433732, "reward_before_std": 0.7435659244656563, "reward_change_max": 0.00045468658208847046, "reward_change_mean": -0.07603393588215113, "reward_change_min": -0.14358344301581383, "reward_change_std": 0.059222247917205095, "reward_std": 0.744057547301054, "rewards/cosine_scaled_reward": -0.23186775855720043, "rewards/format_reward": 0.625000013038516, "step": 465 }, { "advantage_max": 1.3792641945183277, "advantage_mean": -3.104408785592483e-09, "advantage_min": -0.9853333793580532, "advantage_std": 0.900844220072031, "completion_length": 3042.2500610351562, "epoch": 0.5325714285714286, "grad_norm": 0.44595518708229065, "kl": 0.2362060546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1261754973965422e-07, "loss": 0.0166, "reward": 0.6323232520371675, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.6323232520371675, "reward_after_std": 0.9008442088961601, "reward_before_mean": 0.7564093824476004, "reward_before_std": 0.9056645855307579, "reward_change_max": 0.00016022473573684692, "reward_change_mean": -0.12408614112064242, "reward_change_min": -0.23610640596598387, "reward_change_std": 0.09193866746500134, "reward_std": 0.9008442126214504, "rewards/cosine_scaled_reward": 0.06570468074642122, "rewards/format_reward": 0.6250000074505806, "step": 466 }, { "advantage_max": 1.192842148244381, "advantage_mean": -1.4280279847511679e-08, "advantage_min": -0.8308476731181145, "advantage_std": 0.7643921412527561, "completion_length": 3214.166748046875, "epoch": 0.5337142857142857, "grad_norm": 0.5949474573135376, "kl": 0.2874755859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1188949370707787e-07, "loss": 0.0319, "reward": 0.46276107244193554, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.46276107244193554, "reward_after_std": 0.7643921487033367, "reward_before_mean": 0.5747087150812149, "reward_before_std": 0.7634487450122833, "reward_change_max": 0.00010569393634796143, "reward_change_mean": -0.11194764589890838, "reward_change_min": -0.20960522443056107, "reward_change_std": 0.08081883913837373, "reward_std": 0.7643921747803688, "rewards/cosine_scaled_reward": -0.045978982001543045, "rewards/format_reward": 0.6666666846722364, "step": 467 }, { "advantage_max": 1.1976465657353401, "advantage_mean": -4.3461720111714897e-08, "advantage_min": -0.9038081467151642, "advantage_std": 0.8358358480036259, "completion_length": 3259.104217529297, "epoch": 0.5348571428571428, "grad_norm": 0.9248505234718323, "kl": 0.40380859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1118279056249653e-07, "loss": 0.0779, "reward": 0.3013373212888837, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3013373212888837, "reward_after_std": 0.8358358442783356, "reward_before_mean": 0.3982266914099455, "reward_before_std": 0.8553115110844374, "reward_change_max": 8.344650268554688e-05, "reward_change_mean": -0.09688938385806978, "reward_change_min": -0.21377692930400372, "reward_change_std": 0.08293247455731034, "reward_std": 0.8358358591794968, "rewards/cosine_scaled_reward": -0.07171999849379063, "rewards/format_reward": 0.541666679084301, "step": 468 }, { "advantage_max": 1.1891687586903572, "advantage_mean": -1.986821618338297e-08, "advantage_min": -0.9869108945131302, "advantage_std": 0.7981296181678772, "completion_length": 2615.1875915527344, "epoch": 0.536, "grad_norm": 0.4499533176422119, "kl": 0.39539337158203125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.1049747474962444e-07, "loss": 0.0624, "reward": 0.42476166412234306, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.42476166412234306, "reward_after_std": 0.7981296330690384, "reward_before_mean": 0.5331416502594948, "reward_before_std": 0.8063963502645493, "reward_change_max": 0.0, "reward_change_mean": -0.10837997868657112, "reward_change_min": -0.2086690105497837, "reward_change_std": 0.08071521436795592, "reward_std": 0.7981296479701996, "rewards/cosine_scaled_reward": -0.0250958614051342, "rewards/format_reward": 0.5833333395421505, "step": 469 }, { "advantage_max": 1.1581247821450233, "advantage_mean": 1.365939888975376e-08, "advantage_min": -0.7778391763567924, "advantage_std": 0.722015731036663, "completion_length": 3278.6876220703125, "epoch": 0.5371428571428571, "grad_norm": 0.613844096660614, "kl": 0.3515625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0983357966978745e-07, "loss": 0.0299, "reward": 0.41915637208148837, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.41915637208148837, "reward_after_std": 0.7220157347619534, "reward_before_mean": 0.5277159176766872, "reward_before_std": 0.7151578031480312, "reward_change_max": 0.0, "reward_change_mean": -0.10855951346457005, "reward_change_min": -0.1773476181551814, "reward_change_std": 0.0683783870190382, "reward_std": 0.7220157794654369, "rewards/cosine_scaled_reward": -0.07989205606281757, "rewards/format_reward": 0.6875000186264515, "step": 470 }, { "advantage_max": 1.2751684002578259, "advantage_mean": -1.1020650836357504e-08, "advantage_min": -1.0654745399951935, "advantage_std": 0.8733825888484716, "completion_length": 3038.0625610351562, "epoch": 0.5382857142857143, "grad_norm": 0.466342031955719, "kl": 0.26397705078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0919113768029517e-07, "loss": 0.0334, "reward": 0.7088228650391102, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7088228650391102, "reward_after_std": 0.8733825515955687, "reward_before_mean": 0.841802254319191, "reward_before_std": 0.879924139007926, "reward_change_max": 8.447468280792236e-05, "reward_change_mean": -0.13297936227172613, "reward_change_min": -0.2324786437675357, "reward_change_std": 0.09299619169905782, "reward_std": 0.8733825888484716, "rewards/cosine_scaled_reward": 0.06673444528132677, "rewards/format_reward": 0.7083333525806665, "step": 471 }, { "advantage_max": 1.2374247685074806, "advantage_mean": -1.6453366308288864e-08, "advantage_min": -0.881297804415226, "advantage_std": 0.8313139267265797, "completion_length": 3053.979248046875, "epoch": 0.5394285714285715, "grad_norm": 0.41991597414016724, "kl": 0.28839111328125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0857018009286381e-07, "loss": 0.027, "reward": 0.39327038638293743, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.39327038638293743, "reward_after_std": 0.8313139192759991, "reward_before_mean": 0.49783482804195955, "reward_before_std": 0.8396384790539742, "reward_change_max": 0.0, "reward_change_mean": -0.10456445254385471, "reward_change_min": -0.21990286745131016, "reward_change_std": 0.0824456731788814, "reward_std": 0.8313139192759991, "rewards/cosine_scaled_reward": -0.0948325915960595, "rewards/format_reward": 0.6875000111758709, "step": 472 }, { "advantage_max": 1.042616032063961, "advantage_mean": 9.313226134732844e-09, "advantage_min": -0.8628039546310902, "advantage_std": 0.711292676627636, "completion_length": 3158.979217529297, "epoch": 0.5405714285714286, "grad_norm": 0.7266313433647156, "kl": 0.2745819091796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0797073717209013e-07, "loss": 0.0557, "reward": 0.16985632851719856, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16985632851719856, "reward_after_std": 0.7112926989793777, "reward_before_mean": 0.25732479616999626, "reward_before_std": 0.720692828297615, "reward_change_max": 0.00014837831258773804, "reward_change_mean": -0.08746843785047531, "reward_change_min": -0.16882281191647053, "reward_change_std": 0.06796854501590133, "reward_std": 0.711292702704668, "rewards/cosine_scaled_reward": -0.14217094890773296, "rewards/format_reward": 0.5416666772216558, "step": 473 }, { "advantage_max": 1.1585516035556793, "advantage_mean": -2.0799537953086755e-08, "advantage_min": -0.8155636340379715, "advantage_std": 0.7681524343788624, "completion_length": 2514.1458740234375, "epoch": 0.5417142857142857, "grad_norm": 0.43213552236557007, "kl": 0.24603271484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0739283813397639e-07, "loss": 0.0473, "reward": 0.7646679431200027, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7646679431200027, "reward_after_std": 0.7681524567306042, "reward_before_mean": 0.9050909709185362, "reward_before_std": 0.7603873573243618, "reward_change_max": 0.0, "reward_change_mean": -0.1404230184853077, "reward_change_min": -0.25275282841175795, "reward_change_std": 0.0934959203004837, "reward_std": 0.7681524753570557, "rewards/cosine_scaled_reward": 0.10879547521471977, "rewards/format_reward": 0.687500013038516, "step": 474 }, { "advantage_max": 1.455761842429638, "advantage_mean": -2.4214387495113954e-08, "advantage_min": -1.21954682841897, "advantage_std": 0.9812054596841335, "completion_length": 2689.604263305664, "epoch": 0.5428571428571428, "grad_norm": 1.0528361797332764, "kl": 0.314056396484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.068365111445064e-07, "loss": 0.0525, "reward": 0.8001913847401738, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8001913847401738, "reward_after_std": 0.9812054559588432, "reward_before_mean": 0.9382050596177578, "reward_before_std": 0.9881223868578672, "reward_change_max": 0.00012006610631942749, "reward_change_mean": -0.13801368651911616, "reward_change_min": -0.24854467622935772, "reward_change_std": 0.1031024232506752, "reward_std": 0.981205478310585, "rewards/cosine_scaled_reward": 0.14618585677817464, "rewards/format_reward": 0.6458333469927311, "step": 475 }, { "advantage_max": 1.6118089109659195, "advantage_mean": -2.4835262735223296e-09, "advantage_min": -1.260014183819294, "advantage_std": 1.1286941394209862, "completion_length": 2836.4584350585938, "epoch": 0.544, "grad_norm": 1.7638888359069824, "kl": 0.315277099609375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.063017833182728e-07, "loss": 0.1327, "reward": 0.7785492744296789, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7785492744296789, "reward_after_std": 1.1286941319704056, "reward_before_mean": 0.9117186167277396, "reward_before_std": 1.1516238190233707, "reward_change_max": 0.00016049295663833618, "reward_change_mean": -0.13316931016743183, "reward_change_min": -0.28096507117152214, "reward_change_std": 0.11068580951541662, "reward_std": 1.128694150596857, "rewards/cosine_scaled_reward": 0.09127595031168312, "rewards/format_reward": 0.7291666828095913, "step": 476 }, { "advantage_max": 1.0612576007843018, "advantage_mean": -6.332993651714247e-08, "advantage_min": -1.2174795642495155, "advantage_std": 0.8141191489994526, "completion_length": 2536.416732788086, "epoch": 0.5451428571428572, "grad_norm": 1.627563714981079, "kl": 0.17498779296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0578868071715544e-07, "loss": 0.0816, "reward": 1.1968099847435951, "reward_advantage_correlation": 1.0, "reward_after_mean": 1.1968099847435951, "reward_after_std": 0.8141191452741623, "reward_before_mean": 1.3791041374206543, "reward_before_std": 0.8197311982512474, "reward_change_max": 0.00014181435108184814, "reward_change_mean": -0.18229419272392988, "reward_change_min": -0.280188612639904, "reward_change_std": 0.1127679473720491, "reward_std": 0.8141191527247429, "rewards/cosine_scaled_reward": 0.2312187310308218, "rewards/format_reward": 0.916666679084301, "step": 477 }, { "advantage_max": 1.0841128677129745, "advantage_mean": -8.692343900218447e-09, "advantage_min": -0.8327232263982296, "advantage_std": 0.7114966996014118, "completion_length": 2950.1459045410156, "epoch": 0.5462857142857143, "grad_norm": 0.3843514323234558, "kl": 0.3557281494140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0529722834905125e-07, "loss": 0.0494, "reward": 0.19972308538854122, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.19972308538854122, "reward_after_std": 0.7114967443048954, "reward_before_mean": 0.28897368628531694, "reward_before_std": 0.7166823334991932, "reward_change_max": 2.7410686016082764e-05, "reward_change_mean": -0.08925061579793692, "reward_change_min": -0.16781168803572655, "reward_change_std": 0.0676483353599906, "reward_std": 0.7114967629313469, "rewards/cosine_scaled_reward": -0.14717982709407806, "rewards/format_reward": 0.583333345130086, "step": 478 }, { "advantage_max": 1.284256212413311, "advantage_mean": -1.4280279625467074e-08, "advantage_min": -0.9000277370214462, "advantage_std": 0.8560518994927406, "completion_length": 3038.6875762939453, "epoch": 0.5474285714285714, "grad_norm": 0.36497536301612854, "kl": 0.350341796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0482745016665526e-07, "loss": 0.0193, "reward": 0.265819541644305, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.265819541644305, "reward_after_std": 0.8560518845915794, "reward_before_mean": 0.35775492154061794, "reward_before_std": 0.8692752569913864, "reward_change_max": 0.0, "reward_change_mean": -0.09193538874387741, "reward_change_min": -0.21421499364078045, "reward_change_std": 0.0782300722785294, "reward_std": 0.856051929295063, "rewards/cosine_scaled_reward": -0.1440392080694437, "rewards/format_reward": 0.6458333507180214, "step": 479 }, { "advantage_max": 1.2477297633886337, "advantage_mean": -6.208817904251873e-10, "advantage_min": -0.8636210225522518, "advantage_std": 0.7761502750217915, "completion_length": 2664.708366394043, "epoch": 0.5485714285714286, "grad_norm": 0.6572485566139221, "kl": 0.333465576171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0437936906629334e-07, "loss": 0.0535, "reward": 0.34925887174904346, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.34925887174904346, "reward_after_std": 0.7761502973735332, "reward_before_mean": 0.44965414330363274, "reward_before_std": 0.7742989175021648, "reward_change_max": 0.0005605295300483704, "reward_change_mean": -0.10039527481421828, "reward_change_min": -0.18748685158789158, "reward_change_std": 0.07196449814364314, "reward_std": 0.7761503122746944, "rewards/cosine_scaled_reward": -0.13975626602768898, "rewards/format_reward": 0.729166679084301, "step": 480 }, { "advantage_max": 1.130104836076498, "advantage_mean": -6.519258577419862e-09, "advantage_min": -0.8260618653148413, "advantage_std": 0.7510270290076733, "completion_length": 3182.3959045410156, "epoch": 0.5497142857142857, "grad_norm": 0.7979830503463745, "kl": 0.3316650390625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0395300688680625e-07, "loss": 0.0327, "reward": 0.225021761842072, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.225021761842072, "reward_after_std": 0.7510270290076733, "reward_before_mean": 0.3158915303647518, "reward_before_std": 0.7570718768984079, "reward_change_max": 0.0, "reward_change_mean": -0.09086976759135723, "reward_change_min": -0.186422617174685, "reward_change_std": 0.0734526920132339, "reward_std": 0.7510270439088345, "rewards/cosine_scaled_reward": -0.19622090552002192, "rewards/format_reward": 0.7083333507180214, "step": 481 }, { "advantage_max": 1.2530571520328522, "advantage_mean": -6.208817349140361e-09, "advantage_min": -1.070625051856041, "advantage_std": 0.8753399774432182, "completion_length": 2833.3750762939453, "epoch": 0.5508571428571428, "grad_norm": 1.2419958114624023, "kl": 0.3258514404296875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0354838440848501e-07, "loss": 0.041, "reward": 0.8800165746361017, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.8800165746361017, "reward_after_std": 0.8753399886190891, "reward_before_mean": 1.0293409526348114, "reward_before_std": 0.8727457858622074, "reward_change_max": 0.00011719763278961182, "reward_change_mean": -0.14932430908083916, "reward_change_min": -0.2527621230110526, "reward_change_std": 0.10746940225362778, "reward_std": 0.8753399923443794, "rewards/cosine_scaled_reward": 0.19175377115607262, "rewards/format_reward": 0.645833345130086, "step": 482 }, { "advantage_max": 1.1174155697226524, "advantage_mean": -2.266218279700638e-08, "advantage_min": -0.8972061052918434, "advantage_std": 0.7504922579973936, "completion_length": 3080.7084350585938, "epoch": 0.552, "grad_norm": 0.5229855179786682, "kl": 0.44921875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0316552135205837e-07, "loss": 0.0455, "reward": 0.38609249144792557, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.38609249144792557, "reward_after_std": 0.7504922654479742, "reward_before_mean": 0.4921985100954771, "reward_before_std": 0.7534452546387911, "reward_change_max": 0.0002275332808494568, "reward_change_mean": -0.10610603354871273, "reward_change_min": -0.2087285201996565, "reward_change_std": 0.08030886575579643, "reward_std": 0.7504922710359097, "rewards/cosine_scaled_reward": -0.10806741891428828, "rewards/format_reward": 0.7083333395421505, "step": 483 }, { "advantage_max": 1.2352019250392914, "advantage_mean": -2.4835269951672956e-08, "advantage_min": -1.0424505099654198, "advantage_std": 0.8791324347257614, "completion_length": 2653.541763305664, "epoch": 0.5531428571428572, "grad_norm": 1.738598346710205, "kl": 0.27093505859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0280443637773163e-07, "loss": 0.0924, "reward": 0.667680477257818, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.667680477257818, "reward_after_std": 0.8791324757039547, "reward_before_mean": 0.7975321440026164, "reward_before_std": 0.8878633677959442, "reward_change_max": 0.00017753243446350098, "reward_change_mean": -0.1298516825772822, "reward_change_min": -0.23955402616411448, "reward_change_std": 0.09890555776655674, "reward_std": 0.8791324906051159, "rewards/cosine_scaled_reward": 0.02376605849713087, "rewards/format_reward": 0.7500000186264515, "step": 484 }, { "advantage_max": 1.1654871627688408, "advantage_mean": -2.3903947460324204e-08, "advantage_min": -1.0133636444807053, "advantage_std": 0.8400661423802376, "completion_length": 2845.104248046875, "epoch": 0.5542857142857143, "grad_norm": 0.7887821793556213, "kl": 0.353485107421875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0246514708427701e-07, "loss": 0.0404, "reward": 0.620442176819779, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.620442176819779, "reward_after_std": 0.8400661423802376, "reward_before_mean": 0.747199842473492, "reward_before_std": 0.8507244884967804, "reward_change_max": 0.0, "reward_change_mean": -0.12675767857581377, "reward_change_min": -0.24706415832042694, "reward_change_std": 0.09639016725122929, "reward_std": 0.8400661535561085, "rewards/cosine_scaled_reward": 0.01943324040621519, "rewards/format_reward": 0.7083333507180214, "step": 485 }, { "advantage_max": 1.2393508404493332, "advantage_mean": -2.545615163107584e-08, "advantage_min": -0.7347784452140331, "advantage_std": 0.740032946690917, "completion_length": 2606.354248046875, "epoch": 0.5554285714285714, "grad_norm": 0.6385639309883118, "kl": 0.3489837646484375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0214767000817596e-07, "loss": 0.035, "reward": 0.5880567478016019, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5880567478016019, "reward_after_std": 0.7400329541414976, "reward_before_mean": 0.7102154456079006, "reward_before_std": 0.7220157235860825, "reward_change_max": 0.0, "reward_change_mean": -0.12215872760862112, "reward_change_min": -0.20704109221696854, "reward_change_std": 0.07904792111366987, "reward_std": 0.740032970905304, "rewards/cosine_scaled_reward": -0.01989229116588831, "rewards/format_reward": 0.7500000111758709, "step": 486 }, { "advantage_max": 1.4690705388784409, "advantage_mean": -1.7384688355548406e-08, "advantage_min": -1.2353796288371086, "advantage_std": 1.0522098690271378, "completion_length": 2319.312568664551, "epoch": 0.5565714285714286, "grad_norm": 0.7796392440795898, "kl": 0.3911285400390625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0185202062281336e-07, "loss": 0.0243, "reward": 0.6309237442910671, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.6309237442910671, "reward_after_std": 1.0522098690271378, "reward_before_mean": 0.7531594757456332, "reward_before_std": 1.0770602822303772, "reward_change_max": 0.0, "reward_change_mean": -0.12223571492359042, "reward_change_min": -0.26455293595790863, "reward_change_std": 0.10230514733120799, "reward_std": 1.0522098764777184, "rewards/cosine_scaled_reward": 0.0744963875040412, "rewards/format_reward": 0.6041666772216558, "step": 487 }, { "advantage_max": 0.8800669386982918, "advantage_mean": -2.856055891786724e-08, "advantage_min": -0.706706915050745, "advantage_std": 0.5929678082466125, "completion_length": 2522.666732788086, "epoch": 0.5577142857142857, "grad_norm": 0.30417007207870483, "kl": 0.33161163330078125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0157821333772304e-07, "loss": 0.029, "reward": 0.5513467136770487, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5513467136770487, "reward_after_std": 0.592967800796032, "reward_before_mean": 0.6770103182643652, "reward_before_std": 0.5850005187094212, "reward_change_max": 2.4974346160888672e-05, "reward_change_mean": -0.12566362135112286, "reward_change_min": -0.20945844426751137, "reward_change_std": 0.07968493644148111, "reward_std": 0.5929678119719028, "rewards/cosine_scaled_reward": -0.005244861356914043, "rewards/format_reward": 0.6875000074505806, "step": 488 }, { "advantage_max": 0.7658723294734955, "advantage_mean": 3.725290464995368e-09, "advantage_min": -0.7253937609493732, "advantage_std": 0.5821077227592468, "completion_length": 3303.229217529297, "epoch": 0.5588571428571428, "grad_norm": 1.5999469757080078, "kl": 0.68994140625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.013262614978859e-07, "loss": 0.0506, "reward": -0.12529479584190995, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.12529479584190995, "reward_after_std": 0.5821077264845371, "reward_before_mean": -0.060184720903635025, "reward_before_std": 0.6017212346196175, "reward_change_max": 0.00026082247495651245, "reward_change_mean": -0.06511008506640792, "reward_change_min": -0.1411078181117773, "reward_change_std": 0.05900137731805444, "reward_std": 0.5821077451109886, "rewards/cosine_scaled_reward": -0.25925903022289276, "rewards/format_reward": 0.45833334513008595, "step": 489 }, { "advantage_max": 0.9868172481656075, "advantage_mean": -1.5211602394371582e-08, "advantage_min": -0.8524841107428074, "advantage_std": 0.6771835945546627, "completion_length": 2362.2500534057617, "epoch": 0.56, "grad_norm": 0.821845293045044, "kl": 0.3515167236328125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0109617738307911e-07, "loss": 0.0115, "reward": 0.6511377788410755, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6511377788410755, "reward_after_std": 0.6771836057305336, "reward_before_mean": 0.784302718937397, "reward_before_std": 0.6745435670018196, "reward_change_max": 0.0, "reward_change_mean": -0.1331649529747665, "reward_change_min": -0.21483693923801184, "reward_change_std": 0.0822962406091392, "reward_std": 0.6771836318075657, "rewards/cosine_scaled_reward": 0.0067346952855587006, "rewards/format_reward": 0.7708333488553762, "step": 490 }, { "advantage_max": 1.1321530863642693, "advantage_mean": 1.8626452602532595e-09, "advantage_min": -1.2378709018230438, "advantage_std": 0.9353579767048359, "completion_length": 2859.8750762939453, "epoch": 0.5611428571428572, "grad_norm": 0.9638422131538391, "kl": 0.392730712890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0088797220727779e-07, "loss": 0.0344, "reward": 0.7370665986090899, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.7370665986090899, "reward_after_std": 0.9353579990565777, "reward_before_mean": 0.8755270391702652, "reward_before_std": 0.9644673950970173, "reward_change_max": 0.0, "reward_change_mean": -0.13846042612567544, "reward_change_min": -0.2663510059937835, "reward_change_std": 0.11175253661349416, "reward_std": 0.9353580549359322, "rewards/cosine_scaled_reward": 0.166930191218853, "rewards/format_reward": 0.541666679084301, "step": 491 }, { "advantage_max": 1.1642880029976368, "advantage_mean": -1.7384688244526103e-08, "advantage_min": -0.9464364871382713, "advantage_std": 0.8110112994909286, "completion_length": 2653.62508392334, "epoch": 0.5622857142857143, "grad_norm": 0.6525238752365112, "kl": 0.4062652587890625, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0070165611810855e-07, "loss": 0.0486, "reward": 0.4646597392857075, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4646597392857075, "reward_after_std": 0.811011303216219, "reward_before_mean": 0.577180489897728, "reward_before_std": 0.8244456201791763, "reward_change_max": 0.00011387467384338379, "reward_change_mean": -0.11252076458185911, "reward_change_min": -0.22796605806797743, "reward_change_std": 0.0855433689430356, "reward_std": 0.8110113255679607, "rewards/cosine_scaled_reward": -0.013493089005351067, "rewards/format_reward": 0.6041666828095913, "step": 492 }, { "advantage_max": 0.9895866885781288, "advantage_mean": -3.6632022137883524e-08, "advantage_min": -0.9520559869706631, "advantage_std": 0.7569357864558697, "completion_length": 2731.1875762939453, "epoch": 0.5634285714285714, "grad_norm": 0.7446244955062866, "kl": 0.6183624267578125, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.005372381963547e-07, "loss": 0.0883, "reward": 0.7343036928214133, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.7343036928214133, "reward_after_std": 0.7569357715547085, "reward_before_mean": 0.8745586993172765, "reward_before_std": 0.7680754400789738, "reward_change_max": 0.00016526877880096436, "reward_change_mean": -0.14025500882416964, "reward_change_min": -0.24564260337501764, "reward_change_std": 0.09688371233642101, "reward_std": 0.7569358013570309, "rewards/cosine_scaled_reward": 0.12477933615446091, "rewards/format_reward": 0.6250000018626451, "step": 493 }, { "advantage_max": 1.3244344219565392, "advantage_mean": -4.346172310931706e-09, "advantage_min": -0.9265795089304447, "advantage_std": 0.8622128590941429, "completion_length": 2606.4584350585938, "epoch": 0.5645714285714286, "grad_norm": 0.9010090231895447, "kl": 0.408294677734375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0039472645551372e-07, "loss": 0.0822, "reward": 0.4567707823589444, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4567707823589444, "reward_after_std": 0.8622128367424011, "reward_before_mean": 0.5650326677132398, "reward_before_std": 0.8653600625693798, "reward_change_max": 0.0009450465440750122, "reward_change_mean": -0.10826187301427126, "reward_change_min": -0.20170058961957693, "reward_change_std": 0.07992936880327761, "reward_std": 0.862212885171175, "rewards/cosine_scaled_reward": -0.06123367277905345, "rewards/format_reward": 0.6875000186264515, "step": 494 }, { "advantage_max": 1.2726031877100468, "advantage_mean": -2.1109978431965715e-08, "advantage_min": -1.070934422314167, "advantage_std": 0.9266544282436371, "completion_length": 3046.1459350585938, "epoch": 0.5657142857142857, "grad_norm": 0.9610704183578491, "kl": 0.594482421875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.002741278414069e-07, "loss": 0.0694, "reward": 0.35878500062972307, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.35878500062972307, "reward_after_std": 0.9266544803977013, "reward_before_mean": 0.45935659296810627, "reward_before_std": 0.9541935361921787, "reward_change_max": 0.00044248998165130615, "reward_change_mean": -0.10057160933502018, "reward_change_min": -0.22374487854540348, "reward_change_std": 0.09001280879601836, "reward_std": 0.9266545325517654, "rewards/cosine_scaled_reward": -0.05157171795144677, "rewards/format_reward": 0.5625000149011612, "step": 495 }, { "advantage_max": 0.7787556611001492, "advantage_mean": -2.918144220709351e-08, "advantage_min": -0.6704142689704895, "advantage_std": 0.5368506982922554, "completion_length": 2543.6458740234375, "epoch": 0.5668571428571428, "grad_norm": 0.6127541661262512, "kl": 0.4067840576171875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0017544823184055e-07, "loss": 0.0377, "reward": 0.6063330564647913, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6063330564647913, "reward_after_std": 0.5368506982922554, "reward_before_mean": 0.7393186707049608, "reward_before_std": 0.5250267013907433, "reward_change_max": 0.0, "reward_change_mean": -0.13298564730212092, "reward_change_min": -0.20881926268339157, "reward_change_std": 0.07957718800753355, "reward_std": 0.536850705742836, "rewards/cosine_scaled_reward": 0.015492672100663185, "rewards/format_reward": 0.7083333414047956, "step": 496 }, { "advantage_max": 0.9398028701543808, "advantage_mean": 8.381903615628516e-09, "advantage_min": -0.8651764281094074, "advantage_std": 0.69316166639328, "completion_length": 2541.041748046875, "epoch": 0.568, "grad_norm": 0.8797805309295654, "kl": 0.28133392333984375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0009869243631952e-07, "loss": 0.0176, "reward": 1.1073258856777102, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 1.1073258856777102, "reward_after_std": 0.6931616589426994, "reward_before_mean": 1.2837290642783046, "reward_before_std": 0.680363591760397, "reward_change_max": 0.0, "reward_change_mean": -0.17640314577147365, "reward_change_min": -0.2838462581858039, "reward_change_std": 0.1093319933861494, "reward_std": 0.6931616626679897, "rewards/cosine_scaled_reward": 0.25644785538315773, "rewards/format_reward": 0.7708333507180214, "step": 497 }, { "advantage_max": 1.589196316897869, "advantage_mean": 4.656612789810666e-09, "advantage_min": -0.8506803512573242, "advantage_std": 0.9172989800572395, "completion_length": 3036.6459350585938, "epoch": 0.5691428571428572, "grad_norm": 1.152613639831543, "kl": 0.76190185546875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.000438641958131e-07, "loss": 0.0751, "reward": 0.21077799936756492, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.21077799936756492, "reward_after_std": 0.9172989800572395, "reward_before_mean": 0.29192004108335823, "reward_before_std": 0.9136406257748604, "reward_change_max": 0.0, "reward_change_mean": -0.08114203019067645, "reward_change_min": -0.15874368697404861, "reward_change_std": 0.06351864710450172, "reward_std": 0.9172989800572395, "rewards/cosine_scaled_reward": -0.15612333035096526, "rewards/format_reward": 0.6041666734963655, "step": 498 }, { "advantage_max": 1.6222087368369102, "advantage_mean": 4.967054212379196e-09, "advantage_min": -1.1398800686001778, "advantage_std": 1.0871395617723465, "completion_length": 2844.1250915527344, "epoch": 0.5702857142857143, "grad_norm": 1.1812705993652344, "kl": 0.4490966796875, "lambda_div_used": 0.9000000000000001, "learning_rate": 1.0001096618257236e-07, "loss": 0.066, "reward": 0.5475275591015816, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5475275591015816, "reward_after_std": 1.087139554321766, "reward_before_mean": 0.6594566758722067, "reward_before_std": 1.1065706722438335, "reward_change_max": 0.00030337274074554443, "reward_change_mean": -0.11192911583930254, "reward_change_min": -0.24595525488257408, "reward_change_std": 0.09533399250358343, "reward_std": 1.087139569222927, "rewards/cosine_scaled_reward": -0.0036049976479262114, "rewards/format_reward": 0.6666666902601719, "step": 499 }, { "advantage_max": 1.4636687450110912, "advantage_mean": -2.0489097307674342e-08, "advantage_min": -1.0613201428204775, "advantage_std": 0.9504660218954086, "completion_length": 3248.0833740234375, "epoch": 0.5714285714285714, "grad_norm": 0.7001128196716309, "kl": 0.5921630859375, "lambda_div_used": 0.9000000000000001, "learning_rate": 1e-07, "loss": 0.0719, "reward": 0.3895249618217349, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3895249618217349, "reward_after_std": 0.9504660293459892, "reward_before_mean": 0.48947494849562645, "reward_before_std": 0.9621049575507641, "reward_change_max": 0.00021667778491973877, "reward_change_mean": -0.09994998946785927, "reward_change_min": -0.20708435587584972, "reward_change_std": 0.08308770577423275, "reward_std": 0.9504660815000534, "rewards/cosine_scaled_reward": -0.04692920472007245, "rewards/format_reward": 0.583333345130086, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.023616178158205003, "train_runtime": 58431.9199, "train_samples_per_second": 0.411, "train_steps_per_second": 0.009 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }