Think2SQL-0.5B / trainer_state.json
anonymous-2321's picture
Commit folder
4aa9000 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999559277214632,
"eval_steps": 500,
"global_step": 567,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"clipped_completions_ratio": 0.0078125,
"epoch": 0.0017628911414720142,
"grad_norm": 3.0840760424820304,
"kl": 0.0,
"learning_rate": 1.7543859649122805e-08,
"loss": -0.0042,
"max_completion_length": 464.0,
"max_terminated_completion_length": 459.75,
"mean_completion_length": 120.04296875,
"mean_terminated_completion_length": 118.23617553710938,
"min_completion_length": 21.0,
"min_terminated_completion_length": 21.0,
"num_tokens": 115211.0,
"reward": 0.25845247507095337,
"reward_std": 0.24694325402379036,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.2897005267441273,
"rewards/qatch_metrics/std": 0.37457581236958504,
"rewards/tag_count_reward/mean": 0.244140625,
"rewards/tag_count_reward/std": 0.13581550493836403,
"step": 1
},
{
"clip_ratio": 0.0,
"clipped_completions_ratio": 0.0107421875,
"epoch": 0.00881445570736007,
"grad_norm": 2.803414301509709,
"kl": 0.00023472309112548828,
"learning_rate": 8.771929824561403e-08,
"loss": 0.0299,
"max_completion_length": 2148.625,
"max_terminated_completion_length": 560.25,
"mean_completion_length": 164.86328125,
"mean_terminated_completion_length": 122.24907398223877,
"min_completion_length": 23.3125,
"min_terminated_completion_length": 23.3125,
"num_tokens": 658751.0,
"reward": 0.14076079020742327,
"reward_std": 0.17357240640558302,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.15179980779066682,
"rewards/qatch_metrics/std": 0.285268085077405,
"rewards/tag_count_reward/mean": 0.234619140625,
"rewards/tag_count_reward/std": 0.11686475621536374,
"step": 5
},
{
"clip_ratio": 0.0,
"clipped_completions_ratio": 0.01015625,
"epoch": 0.01762891141472014,
"grad_norm": 2.555865364836083,
"kl": 0.00034499168395996094,
"learning_rate": 1.7543859649122805e-07,
"loss": 0.0691,
"max_completion_length": 1847.5,
"max_terminated_completion_length": 736.65,
"mean_completion_length": 154.3125,
"mean_terminated_completion_length": 130.3932632446289,
"min_completion_length": 21.3,
"min_terminated_completion_length": 21.3,
"num_tokens": 1347167.0,
"reward": 0.12399922087788581,
"reward_std": 0.16416746266186238,
"rewards/format_reward/mean": 0.00078125,
"rewards/format_reward/std": 0.00625,
"rewards/qatch_metrics/mean": 0.13194531546905636,
"rewards/qatch_metrics/std": 0.2816271550953388,
"rewards/tag_count_reward/mean": 0.2353515625,
"rewards/tag_count_reward/std": 0.11751417592167854,
"step": 10
},
{
"clip_ratio": 0.0,
"clipped_completions_ratio": 0.01328125,
"epoch": 0.026443367122080213,
"grad_norm": 1.8510134641989897,
"kl": 0.00035467147827148435,
"learning_rate": 2.631578947368421e-07,
"loss": 0.1161,
"max_completion_length": 1736.25,
"max_terminated_completion_length": 725.25,
"mean_completion_length": 166.71015625,
"mean_terminated_completion_length": 125.76015815734863,
"min_completion_length": 22.4,
"min_terminated_completion_length": 22.4,
"num_tokens": 1999516.0,
"reward": 0.13328830637037753,
"reward_std": 0.18279453851282595,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.1429885433986783,
"rewards/qatch_metrics/std": 0.2840505912899971,
"rewards/tag_count_reward/mean": 0.2349609375,
"rewards/tag_count_reward/std": 0.12176873050630092,
"step": 15
},
{
"clip_ratio": 0.0,
"clipped_completions_ratio": 0.0109375,
"epoch": 0.03525782282944028,
"grad_norm": 2.5276083322305163,
"kl": 0.0008690834045410156,
"learning_rate": 3.508771929824561e-07,
"loss": 0.0909,
"max_completion_length": 1950.25,
"max_terminated_completion_length": 694.35,
"mean_completion_length": 157.1859375,
"mean_terminated_completion_length": 122.76734085083008,
"min_completion_length": 22.95,
"min_terminated_completion_length": 22.95,
"num_tokens": 2662298.0,
"reward": 0.19042691607028245,
"reward_std": 0.19785099737346173,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.20991172092035412,
"rewards/qatch_metrics/std": 0.32454456612467764,
"rewards/tag_count_reward/mean": 0.2400390625,
"rewards/tag_count_reward/std": 0.11891286894679069,
"step": 20
},
{
"clip_ratio": 0.0,
"clipped_completions_ratio": 0.00390625,
"epoch": 0.044072278536800354,
"grad_norm": 1.6421641387454096,
"kl": 0.002597618103027344,
"learning_rate": 4.3859649122807013e-07,
"loss": 0.0598,
"max_completion_length": 1248.55,
"max_terminated_completion_length": 525.75,
"mean_completion_length": 117.29296875,
"mean_terminated_completion_length": 104.7642993927002,
"min_completion_length": 21.5,
"min_terminated_completion_length": 21.5,
"num_tokens": 3274689.0,
"reward": 0.19326679892838,
"reward_std": 0.19503218345344067,
"rewards/format_reward/mean": 0.00234375,
"rewards/format_reward/std": 0.01875,
"rewards/qatch_metrics/mean": 0.21241406723856926,
"rewards/qatch_metrics/std": 0.3405905418097973,
"rewards/tag_count_reward/mean": 0.249609375,
"rewards/tag_count_reward/std": 0.11380729898810386,
"step": 25
},
{
"clip_ratio": 0.0,
"clipped_completions_ratio": 0.00546875,
"epoch": 0.052886734244160426,
"grad_norm": 2.327163237963173,
"kl": 0.004604721069335937,
"learning_rate": 5.263157894736842e-07,
"loss": -0.0155,
"max_completion_length": 1373.3,
"max_terminated_completion_length": 461.75,
"mean_completion_length": 126.6109375,
"mean_terminated_completion_length": 110.83255004882812,
"min_completion_length": 21.45,
"min_terminated_completion_length": 21.45,
"num_tokens": 3903935.0,
"reward": 0.235529076308012,
"reward_std": 0.19208679497241973,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.26164037082344294,
"rewards/qatch_metrics/std": 0.35552939809858797,
"rewards/tag_count_reward/mean": 0.2626953125,
"rewards/tag_count_reward/std": 0.12861518152058124,
"step": 30
},
{
"clip_ratio": 0.0,
"clipped_completions_ratio": 0.003125,
"epoch": 0.06170118995152049,
"grad_norm": 1.9030107685120632,
"kl": 0.006869125366210938,
"learning_rate": 6.140350877192982e-07,
"loss": -0.0291,
"max_completion_length": 637.45,
"max_terminated_completion_length": 475.05,
"mean_completion_length": 110.1609375,
"mean_terminated_completion_length": 106.66331939697265,
"min_completion_length": 19.1,
"min_terminated_completion_length": 19.1,
"num_tokens": 4489885.0,
"reward": 0.23229851759970188,
"reward_std": 0.21139583457261324,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.2574031319469213,
"rewards/qatch_metrics/std": 0.3460362754762173,
"rewards/tag_count_reward/mean": 0.2701171875,
"rewards/tag_count_reward/std": 0.12943687103688717,
"step": 35
},
{
"clip_ratio": 0.0,
"clipped_completions_ratio": 0.0015625,
"epoch": 0.07051564565888056,
"grad_norm": 1.3133674580676309,
"kl": 0.006020355224609375,
"learning_rate": 7.017543859649122e-07,
"loss": -0.0528,
"max_completion_length": 738.35,
"max_terminated_completion_length": 558.2,
"mean_completion_length": 145.91328125,
"mean_terminated_completion_length": 142.51878776550294,
"min_completion_length": 21.1,
"min_terminated_completion_length": 21.1,
"num_tokens": 5151902.0,
"reward": 0.22980495262891054,
"reward_std": 0.18255550526082515,
"rewards/format_reward/mean": 0.00078125,
"rewards/format_reward/std": 0.00625,
"rewards/qatch_metrics/mean": 0.25320573393255474,
"rewards/qatch_metrics/std": 0.3738796763122082,
"rewards/tag_count_reward/mean": 0.2900390625,
"rewards/tag_count_reward/std": 0.15516266897320746,
"step": 40
},
{
"clip_ratio": 0.0,
"clipped_completions_ratio": 0.0015625,
"epoch": 0.07933010136624064,
"grad_norm": 1.065001099510075,
"kl": 0.005857086181640625,
"learning_rate": 7.894736842105263e-07,
"loss": 0.0403,
"max_completion_length": 674.15,
"max_terminated_completion_length": 669.1,
"mean_completion_length": 172.17578125,
"mean_terminated_completion_length": 171.6076644897461,
"min_completion_length": 20.2,
"min_terminated_completion_length": 20.2,
"num_tokens": 5870847.0,
"reward": 0.23959124982357025,
"reward_std": 0.21271923929452896,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/qatch_metrics/mean": 0.26344375535845754,
"rewards/qatch_metrics/std": 0.36381270438432695,
"rewards/tag_count_reward/mean": 0.31328125,
"rewards/tag_count_reward/std": 0.16481443196535112,
"step": 45
},
{
"clip_ratio": 0.0,
"clipped_completions_ratio": 0.00390625,
"epoch": 0.08814455707360071,
"grad_norm": 1.4568973490349373,
"kl": 0.005655670166015625,
"learning_rate": 8.771929824561403e-07,
"loss": -0.0555,
"max_completion_length": 1026.65,
"max_terminated_completion_length": 500.6,
"mean_completion_length": 171.38203125,
"mean_terminated_completion_length": 158.90636672973633,
"min_completion_length": 20.45,
"min_terminated_completion_length": 20.45,
"num_tokens": 6568024.0,
"reward": 0.22149190343916417,
"reward_std": 0.21185415983200073,
"rewards/format_reward/mean": 0.00390625,
"rewards/format_reward/std": 0.027518405020236968,
"rewards/qatch_metrics/mean": 0.24082917235791684,
"rewards/qatch_metrics/std": 0.36300159245729446,
"rewards/tag_count_reward/mean": 0.3279296875,
"rewards/tag_count_reward/std": 0.16671581640839578,
"step": 50
},
{
"clip_ratio": 0.0,
"clipped_completions_ratio": 0.00390625,
"epoch": 0.09695901278096078,
"grad_norm": 1.0074202050705572,
"kl": 0.007281494140625,
"learning_rate": 9.649122807017545e-07,
"loss": 0.0364,
"max_completion_length": 1240.75,
"max_terminated_completion_length": 517.85,
"mean_completion_length": 194.9171875,
"mean_terminated_completion_length": 182.60486450195313,
"min_completion_length": 23.85,
"min_terminated_completion_length": 23.85,
"num_tokens": 7285134.0,
"reward": 0.2235423892736435,
"reward_std": 0.21293668523430825,
"rewards/format_reward/mean": 0.00859375,
"rewards/format_reward/std": 0.053823620080947876,
"rewards/qatch_metrics/mean": 0.24047266095876693,
"rewards/qatch_metrics/std": 0.35300029441714287,
"rewards/tag_count_reward/mean": 0.365625,
"rewards/tag_count_reward/std": 0.17709428519010545,
"step": 55
},
{
"clip_ratio": 0.0,
"clipped_completions_ratio": 0.00625,
"epoch": 0.10577346848832085,
"grad_norm": 0.9807961478644736,
"kl": 0.01302337646484375,
"learning_rate": 1e-06,
"loss": -0.0139,
"max_completion_length": 1103.85,
"max_terminated_completion_length": 560.8,
"mean_completion_length": 248.34296875,
"mean_terminated_completion_length": 238.09479522705078,
"min_completion_length": 38.0,
"min_terminated_completion_length": 38.0,
"num_tokens": 8039813.0,
"reward": 0.23491878062486649,
"reward_std": 0.21809776537120343,
"rewards/format_reward/mean": 0.01953125,
"rewards/format_reward/std": 0.11232657507061958,
"rewards/qatch_metrics/mean": 0.24687135666608812,
"rewards/qatch_metrics/std": 0.3717411242425442,
"rewards/tag_count_reward/mean": 0.4625,
"rewards/tag_count_reward/std": 0.2135901317000389,
"step": 60
},
{
"clip_ratio": 0.0,
"clipped_completions_ratio": 0.00234375,
"epoch": 0.11458792419568092,
"grad_norm": 0.8603200985963705,
"kl": 0.01912841796875,
"learning_rate": 1e-06,
"loss": 0.0333,
"max_completion_length": 497.9,
"max_terminated_completion_length": 495.0,
"mean_completion_length": 230.2484375,
"mean_terminated_completion_length": 229.92258987426757,
"min_completion_length": 42.5,
"min_terminated_completion_length": 42.5,
"num_tokens": 8806307.0,
"reward": 0.24188727661967277,
"reward_std": 0.216167426854372,
"rewards/format_reward/mean": 0.0640625,
"rewards/format_reward/std": 0.22365741804242134,
"rewards/qatch_metrics/mean": 0.24616562593728303,
"rewards/qatch_metrics/std": 0.3404053032398224,
"rewards/tag_count_reward/mean": 0.5248046875,
"rewards/tag_count_reward/std": 0.23647152334451677,
"step": 65
},
{
"clip_ratio": 0.0,
"clipped_completions_ratio": 0.0046875,
"epoch": 0.12340237990304098,
"grad_norm": 0.9335750466856336,
"kl": 0.02579345703125,
"learning_rate": 1e-06,
"loss": 0.0811,
"max_completion_length": 1368.45,
"max_terminated_completion_length": 485.4,
"mean_completion_length": 222.75390625,
"mean_terminated_completion_length": 207.36048126220703,
"min_completion_length": 34.7,
"min_terminated_completion_length": 34.7,
"num_tokens": 9576424.0,
"reward": 0.24611930586397648,
"reward_std": 0.22903760597109796,
"rewards/format_reward/mean": 0.134375,
"rewards/format_reward/std": 0.3337091006338596,
"rewards/qatch_metrics/mean": 0.24012656770646573,
"rewards/qatch_metrics/std": 0.33230473324656484,
"rewards/tag_count_reward/mean": 0.571484375,
"rewards/tag_count_reward/std": 0.26808963865041735,
"step": 70
},
{
"clip_ratio": 0.0,
"clipped_completions_ratio": 0.00234375,
"epoch": 0.13221683561040107,
"grad_norm": 1.2284202473881727,
"kl": 0.0276123046875,
"learning_rate": 1e-06,
"loss": 0.0495,
"max_completion_length": 927.9,
"max_terminated_completion_length": 564.7,
"mean_completion_length": 181.3296875,
"mean_terminated_completion_length": 172.18771896362304,
"min_completion_length": 23.45,
"min_terminated_completion_length": 23.45,
"num_tokens": 10314206.0,
"reward": 0.24534607045352458,
"reward_std": 0.2094151984900236,
"rewards/format_reward/mean": 0.23828125,
"rewards/format_reward/std": 0.4178183376789093,
"rewards/qatch_metrics/mean": 0.22585521470755338,
"rewards/qatch_metrics/std": 0.32939945682883265,
"rewards/tag_count_reward/mean": 0.5908203125,
"rewards/tag_count_reward/std": 0.2986594527959824,
"step": 75
},
{
"clip_ratio": 0.0,
"clipped_completions_ratio": 0.00390625,
"epoch": 0.14103129131776113,
"grad_norm": 1.142379485289819,
"kl": 0.051806640625,
"learning_rate": 1e-06,
"loss": 0.0794,
"max_completion_length": 725.95,
"max_terminated_completion_length": 543.8,
"mean_completion_length": 172.446875,
"mean_terminated_completion_length": 165.8302963256836,
"min_completion_length": 39.55,
"min_terminated_completion_length": 39.55,
"num_tokens": 11001674.0,
"reward": 0.3384398899972439,
"reward_std": 0.24649502858519554,
"rewards/format_reward/mean": 0.54296875,
"rewards/format_reward/std": 0.4884683877229691,
"rewards/qatch_metrics/mean": 0.28772110007703305,
"rewards/qatch_metrics/std": 0.38454234302043916,
"rewards/tag_count_reward/mean": 0.7916015625,
"rewards/tag_count_reward/std": 0.2731324777007103,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2780.4,
"completions/max_terminated_length": 941.2,
"completions/mean_length": 178.7609375,
"completions/mean_terminated_length": 163.41346435546876,
"completions/min_length": 26.8,
"completions/min_terminated_length": 26.8,
"epoch": 0.1498457470251212,
"grad_norm": 2.58777855961075,
"kl": 0.07294921875,
"learning_rate": 1e-06,
"loss": 0.1454,
"num_tokens": 717486.0,
"reward": 0.312135910987854,
"reward_std": 0.2266964465379715,
"rewards/format_reward/mean": 0.77265625,
"rewards/format_reward/std": 0.4172531723976135,
"rewards/qatch_metrics/mean": 0.22495078444480895,
"rewards/qatch_metrics/std": 0.3556622087955475,
"rewards/tag_count_reward/mean": 0.8732421875,
"rewards/tag_count_reward/std": 0.25258718729019164,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 743.6,
"completions/max_terminated_length": 743.6,
"completions/mean_length": 140.06875,
"completions/mean_terminated_length": 140.06875,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.15866020273248127,
"grad_norm": 1.3938857270995895,
"kl": 0.0706787109375,
"learning_rate": 1e-06,
"loss": 0.0265,
"num_tokens": 1363254.0,
"reward": 0.3290148377418518,
"reward_std": 0.220520544052124,
"rewards/format_reward/mean": 0.75703125,
"rewards/format_reward/std": 0.4248744070529938,
"rewards/qatch_metrics/mean": 0.24871459007263183,
"rewards/qatch_metrics/std": 0.367422616481781,
"rewards/tag_count_reward/mean": 0.8380859375,
"rewards/tag_count_reward/std": 0.2936785161495209,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 2624.8,
"completions/max_terminated_length": 1089.0,
"completions/mean_length": 144.7390625,
"completions/mean_terminated_length": 132.38715515136718,
"completions/min_length": 38.6,
"completions/min_terminated_length": 38.6,
"epoch": 0.16747465843984133,
"grad_norm": 1.271013018051317,
"kl": 0.0874755859375,
"learning_rate": 1e-06,
"loss": 0.1072,
"num_tokens": 1984568.0,
"reward": 0.4227922260761261,
"reward_std": 0.22174089550971984,
"rewards/format_reward/mean": 0.9515625,
"rewards/format_reward/std": 0.21198658645153046,
"rewards/qatch_metrics/mean": 0.32884793281555175,
"rewards/qatch_metrics/std": 0.411483907699585,
"rewards/tag_count_reward/mean": 0.9623046875,
"rewards/tag_count_reward/std": 0.15252943634986876,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 1262.2,
"completions/max_terminated_length": 539.4,
"completions/mean_length": 135.00625,
"completions/mean_terminated_length": 128.81027221679688,
"completions/min_length": 37.4,
"completions/min_terminated_length": 37.4,
"epoch": 0.17628911414720141,
"grad_norm": 1.4288833040090947,
"kl": 0.093994140625,
"learning_rate": 1e-06,
"loss": 0.0551,
"num_tokens": 2650624.0,
"reward": 0.3893065094947815,
"reward_std": 0.21477862894535066,
"rewards/format_reward/mean": 0.96171875,
"rewards/format_reward/std": 0.1881812334060669,
"rewards/qatch_metrics/mean": 0.28727005124092103,
"rewards/qatch_metrics/std": 0.3896294891834259,
"rewards/tag_count_reward/mean": 0.9791015625,
"rewards/tag_count_reward/std": 0.11189484894275666,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 1890.2,
"completions/max_terminated_length": 462.4,
"completions/mean_length": 132.55703125,
"completions/mean_terminated_length": 120.11187286376953,
"completions/min_length": 35.8,
"completions/min_terminated_length": 35.8,
"epoch": 0.18510356985456147,
"grad_norm": 1.1514380769030061,
"kl": 0.09775390625,
"learning_rate": 1e-06,
"loss": 0.0498,
"num_tokens": 3266889.0,
"reward": 0.3714154362678528,
"reward_std": 0.1868872672319412,
"rewards/format_reward/mean": 0.9484375,
"rewards/format_reward/std": 0.21911896765232086,
"rewards/qatch_metrics/mean": 0.2678416669368744,
"rewards/qatch_metrics/std": 0.36030757427215576,
"rewards/tag_count_reward/mean": 0.978125,
"rewards/tag_count_reward/std": 0.10167990401387214,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00234375,
"completions/max_length": 2149.8,
"completions/max_terminated_length": 642.6,
"completions/mean_length": 131.7640625,
"completions/mean_terminated_length": 122.44811248779297,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.19391802556192156,
"grad_norm": 1.1648416805076183,
"kl": 0.0974853515625,
"learning_rate": 1e-06,
"loss": 0.1006,
"num_tokens": 3884011.0,
"reward": 0.45167279839515684,
"reward_std": 0.2551474153995514,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.2396955519914627,
"rewards/qatch_metrics/mean": 0.3636867344379425,
"rewards/qatch_metrics/std": 0.4195810675621033,
"rewards/tag_count_reward/mean": 0.97578125,
"rewards/tag_count_reward/std": 0.10661737024784088,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1134.6,
"completions/max_terminated_length": 383.0,
"completions/mean_length": 126.67265625,
"completions/mean_terminated_length": 123.56720428466797,
"completions/min_length": 32.8,
"completions/min_terminated_length": 32.8,
"epoch": 0.20273248126928162,
"grad_norm": 1.112770785107892,
"kl": 0.0981201171875,
"learning_rate": 1e-06,
"loss": 0.0582,
"num_tokens": 4527864.0,
"reward": 0.45048635005950927,
"reward_std": 0.24212915897369386,
"rewards/format_reward/mean": 0.90234375,
"rewards/format_reward/std": 0.2969411134719849,
"rewards/qatch_metrics/mean": 0.3671622335910797,
"rewards/qatch_metrics/std": 0.4069118857383728,
"rewards/tag_count_reward/mean": 0.96328125,
"rewards/tag_count_reward/std": 0.12083393186330796,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 2718.2,
"completions/max_terminated_length": 666.6,
"completions/mean_length": 135.815625,
"completions/mean_terminated_length": 123.39766387939453,
"completions/min_length": 40.6,
"completions/min_terminated_length": 40.6,
"epoch": 0.2115469369766417,
"grad_norm": 2.8196404883813453,
"kl": 0.10830078125,
"learning_rate": 1e-06,
"loss": 0.0955,
"num_tokens": 5204604.0,
"reward": 0.4444663166999817,
"reward_std": 0.22749231457710267,
"rewards/format_reward/mean": 0.9078125,
"rewards/format_reward/std": 0.28959383964538576,
"rewards/qatch_metrics/mean": 0.3591492176055908,
"rewards/qatch_metrics/std": 0.4204003632068634,
"rewards/tag_count_reward/mean": 0.9681640625,
"rewards/tag_count_reward/std": 0.11976957470178604,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 2639.8,
"completions/max_terminated_length": 459.2,
"completions/mean_length": 147.5625,
"completions/mean_terminated_length": 135.18407287597657,
"completions/min_length": 34.8,
"completions/min_terminated_length": 34.8,
"epoch": 0.22036139268400176,
"grad_norm": 1.2443606977665935,
"kl": 0.09527587890625,
"learning_rate": 1e-06,
"loss": 0.0903,
"num_tokens": 5850844.0,
"reward": 0.4391818165779114,
"reward_std": 0.24111129343509674,
"rewards/format_reward/mean": 0.88359375,
"rewards/format_reward/std": 0.32070607542991636,
"rewards/qatch_metrics/mean": 0.35628697872161863,
"rewards/qatch_metrics/std": 0.41108678579330443,
"rewards/tag_count_reward/mean": 0.9595703125,
"rewards/tag_count_reward/std": 0.1286213666200638,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00234375,
"completions/max_length": 2638.2,
"completions/max_terminated_length": 729.4,
"completions/mean_length": 148.103125,
"completions/mean_terminated_length": 138.83536682128906,
"completions/min_length": 43.4,
"completions/min_terminated_length": 43.4,
"epoch": 0.22917584839136185,
"grad_norm": 1.2078511430760708,
"kl": 0.08446044921875,
"learning_rate": 1e-06,
"loss": 0.1052,
"num_tokens": 6480960.0,
"reward": 0.44752122163772584,
"reward_std": 0.23120047450065612,
"rewards/format_reward/mean": 0.87109375,
"rewards/format_reward/std": 0.33249542117118835,
"rewards/qatch_metrics/mean": 0.3680166721343994,
"rewards/qatch_metrics/std": 0.4190321207046509,
"rewards/tag_count_reward/mean": 0.951953125,
"rewards/tag_count_reward/std": 0.14520585983991624,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 2009.2,
"completions/max_terminated_length": 904.6,
"completions/mean_length": 152.75390625,
"completions/mean_terminated_length": 146.58113708496094,
"completions/min_length": 44.2,
"completions/min_terminated_length": 44.2,
"epoch": 0.2379903040987219,
"grad_norm": 0.9663165749537755,
"kl": 0.084228515625,
"learning_rate": 1e-06,
"loss": 0.0976,
"num_tokens": 7141781.0,
"reward": 0.4069031774997711,
"reward_std": 0.24234023094177246,
"rewards/format_reward/mean": 0.86015625,
"rewards/format_reward/std": 0.34610814452171323,
"rewards/qatch_metrics/mean": 0.3220804750919342,
"rewards/qatch_metrics/std": 0.4035941183567047,
"rewards/tag_count_reward/mean": 0.9423828125,
"rewards/tag_count_reward/std": 0.16873225271701814,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1271.4,
"completions/max_terminated_length": 543.8,
"completions/mean_length": 142.09921875,
"completions/mean_terminated_length": 139.02362060546875,
"completions/min_length": 46.4,
"completions/min_terminated_length": 46.4,
"epoch": 0.24680475980608196,
"grad_norm": 1.120567941307361,
"kl": 0.0905029296875,
"learning_rate": 1e-06,
"loss": 0.0802,
"num_tokens": 7791572.0,
"reward": 0.44534188508987427,
"reward_std": 0.24042359590530396,
"rewards/format_reward/mean": 0.878125,
"rewards/format_reward/std": 0.3254675090312958,
"rewards/qatch_metrics/mean": 0.36461407542228697,
"rewards/qatch_metrics/std": 0.42095342874526975,
"rewards/tag_count_reward/mean": 0.9521484375,
"rewards/tag_count_reward/std": 0.15115214437246322,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1145.2,
"completions/max_terminated_length": 412.0,
"completions/mean_length": 134.07890625,
"completions/mean_terminated_length": 130.97618713378907,
"completions/min_length": 40.8,
"completions/min_terminated_length": 40.8,
"epoch": 0.255619215513442,
"grad_norm": 1.2960180615344776,
"kl": 0.090283203125,
"learning_rate": 1e-06,
"loss": 0.0413,
"num_tokens": 8406681.0,
"reward": 0.4552301824092865,
"reward_std": 0.238674333691597,
"rewards/format_reward/mean": 0.9109375,
"rewards/format_reward/std": 0.280667769908905,
"rewards/qatch_metrics/mean": 0.3717781364917755,
"rewards/qatch_metrics/std": 0.4111446261405945,
"rewards/tag_count_reward/mean": 0.9625,
"rewards/tag_count_reward/std": 0.13774650245904924,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1219.0,
"completions/max_terminated_length": 542.6,
"completions/mean_length": 137.4578125,
"completions/mean_terminated_length": 134.36369934082032,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.26443367122080214,
"grad_norm": 1.1814014331114655,
"kl": 0.091650390625,
"learning_rate": 1e-06,
"loss": 0.0474,
"num_tokens": 9060515.0,
"reward": 0.4308152377605438,
"reward_std": 0.25072828829288485,
"rewards/format_reward/mean": 0.92265625,
"rewards/format_reward/std": 0.2654747039079666,
"rewards/qatch_metrics/mean": 0.3416989743709564,
"rewards/qatch_metrics/std": 0.4145464479923248,
"rewards/tag_count_reward/mean": 0.962109375,
"rewards/tag_count_reward/std": 0.14527225494384766,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 3343.2,
"completions/max_terminated_length": 465.0,
"completions/mean_length": 146.34140625,
"completions/mean_terminated_length": 133.97686157226562,
"completions/min_length": 39.6,
"completions/min_terminated_length": 39.6,
"epoch": 0.2732481269281622,
"grad_norm": 1.119412659054034,
"kl": 0.0925048828125,
"learning_rate": 1e-06,
"loss": 0.108,
"num_tokens": 9726008.0,
"reward": 0.4162511765956879,
"reward_std": 0.22437838315963746,
"rewards/format_reward/mean": 0.93671875,
"rewards/format_reward/std": 0.2371742010116577,
"rewards/qatch_metrics/mean": 0.3222440242767334,
"rewards/qatch_metrics/std": 0.3945153594017029,
"rewards/tag_count_reward/mean": 0.9734375,
"rewards/tag_count_reward/std": 0.11057026386260986,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 494.8,
"completions/max_terminated_length": 494.8,
"completions/mean_length": 125.86171875,
"completions/mean_terminated_length": 125.86171875,
"completions/min_length": 38.4,
"completions/min_terminated_length": 38.4,
"epoch": 0.28206258263552225,
"grad_norm": 1.0577575352944335,
"kl": 0.110693359375,
"learning_rate": 1e-06,
"loss": 0.0378,
"num_tokens": 10365223.0,
"reward": 0.49046963453292847,
"reward_std": 0.22210898101329804,
"rewards/format_reward/mean": 0.95234375,
"rewards/format_reward/std": 0.21217795908451081,
"rewards/qatch_metrics/mean": 0.40703229904174804,
"rewards/qatch_metrics/std": 0.4201949179172516,
"rewards/tag_count_reward/mean": 0.98515625,
"rewards/tag_count_reward/std": 0.07672805488109588,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1195.2,
"completions/max_terminated_length": 445.8,
"completions/mean_length": 134.734375,
"completions/mean_terminated_length": 131.63231811523437,
"completions/min_length": 39.2,
"completions/min_terminated_length": 39.2,
"epoch": 0.2908770383428823,
"grad_norm": 1.1665219069490207,
"kl": 0.0982666015625,
"learning_rate": 1e-06,
"loss": 0.0211,
"num_tokens": 10984403.0,
"reward": 0.47783067226409914,
"reward_std": 0.2358974426984787,
"rewards/format_reward/mean": 0.96015625,
"rewards/format_reward/std": 0.1951357364654541,
"rewards/qatch_metrics/mean": 0.3911289095878601,
"rewards/qatch_metrics/std": 0.4188136160373688,
"rewards/tag_count_reward/mean": 0.987109375,
"rewards/tag_count_reward/std": 0.06311970800161362,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1121.6,
"completions/max_terminated_length": 376.6,
"completions/mean_length": 148.51640625,
"completions/mean_terminated_length": 145.42068176269532,
"completions/min_length": 42.6,
"completions/min_terminated_length": 42.6,
"epoch": 0.2996914940502424,
"grad_norm": 1.217339460476114,
"kl": 0.0956298828125,
"learning_rate": 1e-06,
"loss": 0.046,
"num_tokens": 11677144.0,
"reward": 0.446321702003479,
"reward_std": 0.23696185946464537,
"rewards/format_reward/mean": 0.925,
"rewards/format_reward/std": 0.2625602900981903,
"rewards/qatch_metrics/mean": 0.35908021926879885,
"rewards/qatch_metrics/std": 0.3949739336967468,
"rewards/tag_count_reward/mean": 0.9720703125,
"rewards/tag_count_reward/std": 0.1141625314950943,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 1986.6,
"completions/max_terminated_length": 548.0,
"completions/mean_length": 159.27265625,
"completions/mean_terminated_length": 153.1066864013672,
"completions/min_length": 40.2,
"completions/min_terminated_length": 40.2,
"epoch": 0.3085059497576025,
"grad_norm": 1.1604091154811884,
"kl": 0.0948486328125,
"learning_rate": 1e-06,
"loss": 0.0555,
"num_tokens": 12347509.0,
"reward": 0.43740702271461485,
"reward_std": 0.2165643662214279,
"rewards/format_reward/mean": 0.8984375,
"rewards/format_reward/std": 0.30079524517059325,
"rewards/qatch_metrics/mean": 0.35224584937095643,
"rewards/qatch_metrics/std": 0.4033379018306732,
"rewards/tag_count_reward/mean": 0.9630859375,
"rewards/tag_count_reward/std": 0.12287088185548782,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00234375,
"completions/max_length": 1919.6,
"completions/max_terminated_length": 460.2,
"completions/mean_length": 161.9671875,
"completions/mean_terminated_length": 152.73724365234375,
"completions/min_length": 41.4,
"completions/min_terminated_length": 41.4,
"epoch": 0.31732040546496254,
"grad_norm": 1.016812518785413,
"kl": 0.0965087890625,
"learning_rate": 1e-06,
"loss": 0.0535,
"num_tokens": 13026443.0,
"reward": 0.49172326922416687,
"reward_std": 0.2285678654909134,
"rewards/format_reward/mean": 0.903125,
"rewards/format_reward/std": 0.2938369959592819,
"rewards/qatch_metrics/mean": 0.41588308215141295,
"rewards/qatch_metrics/std": 0.43461284041404724,
"rewards/tag_count_reward/mean": 0.958203125,
"rewards/tag_count_reward/std": 0.13759158551692963,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1250.8,
"completions/max_terminated_length": 516.6,
"completions/mean_length": 153.14375,
"completions/mean_terminated_length": 150.06268920898438,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.3261348611723226,
"grad_norm": 2.943551495922741,
"kl": 0.1255859375,
"learning_rate": 1e-06,
"loss": 0.0417,
"num_tokens": 13684611.0,
"reward": 0.4662940502166748,
"reward_std": 0.22654231786727905,
"rewards/format_reward/mean": 0.909375,
"rewards/format_reward/std": 0.2848878413438797,
"rewards/qatch_metrics/mean": 0.3857020795345306,
"rewards/qatch_metrics/std": 0.4162748992443085,
"rewards/tag_count_reward/mean": 0.9501953125,
"rewards/tag_count_reward/std": 0.1399885058403015,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 2653.0,
"completions/max_terminated_length": 786.2,
"completions/mean_length": 160.4765625,
"completions/mean_terminated_length": 148.1515380859375,
"completions/min_length": 47.6,
"completions/min_terminated_length": 47.6,
"epoch": 0.33494931687968266,
"grad_norm": 1.1104622942084277,
"kl": 0.0940185546875,
"learning_rate": 1e-06,
"loss": 0.0702,
"num_tokens": 14338485.0,
"reward": 0.5249280750751495,
"reward_std": 0.22171878814697266,
"rewards/format_reward/mean": 0.89765625,
"rewards/format_reward/std": 0.3031489491462708,
"rewards/qatch_metrics/mean": 0.45655598640441897,
"rewards/qatch_metrics/std": 0.43402122855186465,
"rewards/tag_count_reward/mean": 0.941796875,
"rewards/tag_count_reward/std": 0.16171995401382447,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1142.6,
"completions/max_terminated_length": 420.2,
"completions/mean_length": 142.35078125,
"completions/mean_terminated_length": 139.25772094726562,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.34376377258704277,
"grad_norm": 1.0499982391674234,
"kl": 0.1005126953125,
"learning_rate": 1e-06,
"loss": 0.0386,
"num_tokens": 14972214.0,
"reward": 0.46582343578338625,
"reward_std": 0.22475437819957733,
"rewards/format_reward/mean": 0.93671875,
"rewards/format_reward/std": 0.2403053015470505,
"rewards/qatch_metrics/mean": 0.3805643320083618,
"rewards/qatch_metrics/std": 0.4070888340473175,
"rewards/tag_count_reward/mean": 0.9734375,
"rewards/tag_count_reward/std": 0.11377856433391571,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 2080.4,
"completions/max_terminated_length": 718.8,
"completions/mean_length": 147.2296875,
"completions/mean_terminated_length": 141.05357666015624,
"completions/min_length": 41.2,
"completions/min_terminated_length": 41.2,
"epoch": 0.35257822829440283,
"grad_norm": 1.0220429404852622,
"kl": 0.1034912109375,
"learning_rate": 1e-06,
"loss": 0.0575,
"num_tokens": 15639820.0,
"reward": 0.4750072777271271,
"reward_std": 0.2135873943567276,
"rewards/format_reward/mean": 0.92421875,
"rewards/format_reward/std": 0.26192537546157835,
"rewards/qatch_metrics/mean": 0.39318412244319917,
"rewards/qatch_metrics/std": 0.39594073295593263,
"rewards/tag_count_reward/mean": 0.967578125,
"rewards/tag_count_reward/std": 0.1268332213163376,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 1109.2,
"completions/max_terminated_length": 358.6,
"completions/mean_length": 139.92421875,
"completions/mean_terminated_length": 133.72031860351564,
"completions/min_length": 44.0,
"completions/min_terminated_length": 44.0,
"epoch": 0.3613926840017629,
"grad_norm": 1.1782501720519973,
"kl": 0.1066162109375,
"learning_rate": 1e-06,
"loss": 0.0398,
"num_tokens": 16303995.0,
"reward": 0.4705925226211548,
"reward_std": 0.18503921926021577,
"rewards/format_reward/mean": 0.95,
"rewards/format_reward/std": 0.21313293874263764,
"rewards/qatch_metrics/mean": 0.3844171941280365,
"rewards/qatch_metrics/std": 0.37895620465278623,
"rewards/tag_count_reward/mean": 0.9767578125,
"rewards/tag_count_reward/std": 0.10622683316469192,
"step": 205
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1162.8,
"completions/max_terminated_length": 442.2,
"completions/mean_length": 137.20234375,
"completions/mean_terminated_length": 134.1083190917969,
"completions/min_length": 42.8,
"completions/min_terminated_length": 42.8,
"epoch": 0.37020713970912295,
"grad_norm": 1.08337248687343,
"kl": 0.103369140625,
"learning_rate": 1e-06,
"loss": 0.021,
"num_tokens": 16916398.0,
"reward": 0.5206815063953399,
"reward_std": 0.23496688902378082,
"rewards/format_reward/mean": 0.9484375,
"rewards/format_reward/std": 0.22106002569198607,
"rewards/qatch_metrics/mean": 0.44340285658836365,
"rewards/qatch_metrics/std": 0.4299581289291382,
"rewards/tag_count_reward/mean": 0.97890625,
"rewards/tag_count_reward/std": 0.0956751674413681,
"step": 210
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 2602.8,
"completions/max_terminated_length": 365.2,
"completions/mean_length": 152.73984375,
"completions/mean_terminated_length": 140.39059753417968,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.37902159541648306,
"grad_norm": 1.0746862969416158,
"kl": 0.1025390625,
"learning_rate": 1e-06,
"loss": 0.0729,
"num_tokens": 17589905.0,
"reward": 0.4809570789337158,
"reward_std": 0.2217806786298752,
"rewards/format_reward/mean": 0.95,
"rewards/format_reward/std": 0.2149397164583206,
"rewards/qatch_metrics/mean": 0.3963695228099823,
"rewards/qatch_metrics/std": 0.4306588113307953,
"rewards/tag_count_reward/mean": 0.980859375,
"rewards/tag_count_reward/std": 0.09641121476888656,
"step": 215
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 2013.2,
"completions/max_terminated_length": 590.6,
"completions/mean_length": 151.1234375,
"completions/mean_terminated_length": 144.9534942626953,
"completions/min_length": 43.6,
"completions/min_terminated_length": 43.6,
"epoch": 0.3878360511238431,
"grad_norm": 1.1093719128866055,
"kl": 0.0995849609375,
"learning_rate": 1e-06,
"loss": 0.0538,
"num_tokens": 18226287.0,
"reward": 0.5117225289344788,
"reward_std": 0.23414760828018188,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.2397076427936554,
"rewards/qatch_metrics/mean": 0.43437943458557127,
"rewards/qatch_metrics/std": 0.4301003873348236,
"rewards/tag_count_reward/mean": 0.975,
"rewards/tag_count_reward/std": 0.10860425382852554,
"step": 220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 1985.0,
"completions/max_terminated_length": 565.6,
"completions/mean_length": 158.75078125,
"completions/mean_terminated_length": 152.5970947265625,
"completions/min_length": 43.6,
"completions/min_terminated_length": 43.6,
"epoch": 0.3966505068312032,
"grad_norm": 1.0904499601554711,
"kl": 0.1022705078125,
"learning_rate": 1e-06,
"loss": 0.0853,
"num_tokens": 18930000.0,
"reward": 0.5386084854602814,
"reward_std": 0.19289222061634065,
"rewards/format_reward/mean": 0.9125,
"rewards/format_reward/std": 0.2818691849708557,
"rewards/qatch_metrics/mean": 0.46959454417228697,
"rewards/qatch_metrics/std": 0.42910557985305786,
"rewards/tag_count_reward/mean": 0.9640625,
"rewards/tag_count_reward/std": 0.13219460248947143,
"step": 225
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 1952.6,
"completions/max_terminated_length": 454.4,
"completions/mean_length": 162.29296875,
"completions/mean_terminated_length": 156.1261993408203,
"completions/min_length": 52.4,
"completions/min_terminated_length": 52.4,
"epoch": 0.40546496253856323,
"grad_norm": 1.0851594244493181,
"kl": 0.1014404296875,
"learning_rate": 1e-06,
"loss": 0.0638,
"num_tokens": 19599671.0,
"reward": 0.5117276430130004,
"reward_std": 0.24680890440940856,
"rewards/format_reward/mean": 0.8921875,
"rewards/format_reward/std": 0.30913242101669314,
"rewards/qatch_metrics/mean": 0.4411179721355438,
"rewards/qatch_metrics/std": 0.43982199430465696,
"rewards/tag_count_reward/mean": 0.951171875,
"rewards/tag_count_reward/std": 0.15551512241363524,
"step": 230
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1092.6,
"completions/max_terminated_length": 378.0,
"completions/mean_length": 151.4390625,
"completions/mean_terminated_length": 148.35447387695314,
"completions/min_length": 42.8,
"completions/min_terminated_length": 42.8,
"epoch": 0.4142794182459233,
"grad_norm": 0.9747077221272922,
"kl": 0.1077880859375,
"learning_rate": 1e-06,
"loss": 0.0322,
"num_tokens": 20243993.0,
"reward": 0.5243084728717804,
"reward_std": 0.23080018162727356,
"rewards/format_reward/mean": 0.91640625,
"rewards/format_reward/std": 0.2711502879858017,
"rewards/qatch_metrics/mean": 0.4524148523807526,
"rewards/qatch_metrics/std": 0.4158477485179901,
"rewards/tag_count_reward/mean": 0.9623046875,
"rewards/tag_count_reward/std": 0.1330309897661209,
"step": 235
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1154.2,
"completions/max_terminated_length": 407.8,
"completions/mean_length": 149.76953125,
"completions/mean_terminated_length": 146.6907531738281,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"epoch": 0.4230938739532834,
"grad_norm": 0.9985959553205879,
"kl": 0.1137451171875,
"learning_rate": 1e-06,
"loss": 0.0347,
"num_tokens": 20888562.0,
"reward": 0.45308218002319334,
"reward_std": 0.21805870532989502,
"rewards/format_reward/mean": 0.92109375,
"rewards/format_reward/std": 0.2692244678735733,
"rewards/qatch_metrics/mean": 0.36815963983535765,
"rewards/qatch_metrics/std": 0.4063821077346802,
"rewards/tag_count_reward/mean": 0.9607421875,
"rewards/tag_count_reward/std": 0.14251872897148132,
"step": 240
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1154.8,
"completions/max_terminated_length": 422.6,
"completions/mean_length": 141.10703125,
"completions/mean_terminated_length": 138.00346984863282,
"completions/min_length": 42.4,
"completions/min_terminated_length": 42.4,
"epoch": 0.43190832966064346,
"grad_norm": 1.206317234569705,
"kl": 0.1264892578125,
"learning_rate": 1e-06,
"loss": 0.033,
"num_tokens": 21529883.0,
"reward": 0.5218020260334015,
"reward_std": 0.21800511479377746,
"rewards/format_reward/mean": 0.94296875,
"rewards/format_reward/std": 0.23152050971984864,
"rewards/qatch_metrics/mean": 0.44588152766227723,
"rewards/qatch_metrics/std": 0.41684806942939756,
"rewards/tag_count_reward/mean": 0.9701171875,
"rewards/tag_count_reward/std": 0.12424642890691757,
"step": 245
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00234375,
"completions/max_length": 1911.8,
"completions/max_terminated_length": 458.6,
"completions/mean_length": 154.50234375,
"completions/mean_terminated_length": 145.24712524414062,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"epoch": 0.4407227853680035,
"grad_norm": 1.0646070893278259,
"kl": 0.108837890625,
"learning_rate": 1e-06,
"loss": 0.0425,
"num_tokens": 22202974.0,
"reward": 0.5250638484954834,
"reward_std": 0.23499601781368257,
"rewards/format_reward/mean": 0.92578125,
"rewards/format_reward/std": 0.26019937098026275,
"rewards/qatch_metrics/mean": 0.4519937574863434,
"rewards/qatch_metrics/std": 0.43260250687599183,
"rewards/tag_count_reward/mean": 0.9658203125,
"rewards/tag_count_reward/std": 0.13058245778083802,
"step": 250
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1164.6,
"completions/max_terminated_length": 436.8,
"completions/mean_length": 149.890625,
"completions/mean_terminated_length": 146.80228271484376,
"completions/min_length": 50.8,
"completions/min_terminated_length": 50.8,
"epoch": 0.4495372410753636,
"grad_norm": 1.0628232924013388,
"kl": 0.1126220703125,
"learning_rate": 1e-06,
"loss": 0.0354,
"num_tokens": 22890018.0,
"reward": 0.49608793258666994,
"reward_std": 0.24011301696300508,
"rewards/format_reward/mean": 0.91640625,
"rewards/format_reward/std": 0.2767932593822479,
"rewards/qatch_metrics/mean": 0.4192716181278229,
"rewards/qatch_metrics/std": 0.4066218316555023,
"rewards/tag_count_reward/mean": 0.961328125,
"rewards/tag_count_reward/std": 0.1395171895623207,
"step": 255
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 716.8,
"completions/max_terminated_length": 716.8,
"completions/mean_length": 156.6890625,
"completions/mean_terminated_length": 156.6890625,
"completions/min_length": 47.4,
"completions/min_terminated_length": 47.4,
"epoch": 0.4583516967827237,
"grad_norm": 1.0620716307535578,
"kl": 0.1146728515625,
"learning_rate": 1e-06,
"loss": 0.0473,
"num_tokens": 23537540.0,
"reward": 0.46006324887275696,
"reward_std": 0.20722155570983886,
"rewards/format_reward/mean": 0.9140625,
"rewards/format_reward/std": 0.2800079345703125,
"rewards/qatch_metrics/mean": 0.3774526119232178,
"rewards/qatch_metrics/std": 0.40044850707054136,
"rewards/tag_count_reward/mean": 0.9564453125,
"rewards/tag_count_reward/std": 0.15245147049427032,
"step": 260
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 556.2,
"completions/max_terminated_length": 556.2,
"completions/mean_length": 143.2,
"completions/mean_terminated_length": 143.2,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.46716615249008375,
"grad_norm": 1.0417871765342246,
"kl": 0.120166015625,
"learning_rate": 1e-06,
"loss": 0.0284,
"num_tokens": 24185028.0,
"reward": 0.6036619067192077,
"reward_std": 0.20974452793598175,
"rewards/format_reward/mean": 0.9546875,
"rewards/format_reward/std": 0.20255258679389954,
"rewards/qatch_metrics/mean": 0.5404294610023499,
"rewards/qatch_metrics/std": 0.4255226194858551,
"rewards/tag_count_reward/mean": 0.9765625,
"rewards/tag_count_reward/std": 0.11209065765142441,
"step": 265
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1263.4,
"completions/max_terminated_length": 545.8,
"completions/mean_length": 153.36640625,
"completions/mean_terminated_length": 150.29201049804686,
"completions/min_length": 48.2,
"completions/min_terminated_length": 48.2,
"epoch": 0.4759806081974438,
"grad_norm": 1.2230142515821079,
"kl": 0.12080078125,
"learning_rate": 1e-06,
"loss": 0.0454,
"num_tokens": 24848617.0,
"reward": 0.5000587105751038,
"reward_std": 0.19942412078380584,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.2082734227180481,
"rewards/qatch_metrics/mean": 0.4188075542449951,
"rewards/qatch_metrics/std": 0.4028447926044464,
"rewards/tag_count_reward/mean": 0.9751953125,
"rewards/tag_count_reward/std": 0.11493908390402793,
"step": 270
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 545.0,
"completions/max_terminated_length": 545.0,
"completions/mean_length": 146.3046875,
"completions/mean_terminated_length": 146.3046875,
"completions/min_length": 45.6,
"completions/min_terminated_length": 45.6,
"epoch": 0.48479506390480387,
"grad_norm": 1.11053365840032,
"kl": 0.1185302734375,
"learning_rate": 1e-06,
"loss": 0.0403,
"num_tokens": 25530191.0,
"reward": 0.5597240447998046,
"reward_std": 0.22672632932662964,
"rewards/format_reward/mean": 0.96328125,
"rewards/format_reward/std": 0.18625771403312683,
"rewards/qatch_metrics/mean": 0.4874395847320557,
"rewards/qatch_metrics/std": 0.41059340834617614,
"rewards/tag_count_reward/mean": 0.9814453125,
"rewards/tag_count_reward/std": 0.09651189893484116,
"step": 275
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1201.8,
"completions/max_terminated_length": 482.4,
"completions/mean_length": 142.7421875,
"completions/mean_terminated_length": 139.64728698730468,
"completions/min_length": 45.2,
"completions/min_terminated_length": 45.2,
"epoch": 0.4936095196121639,
"grad_norm": 1.0811085458763714,
"kl": 0.126318359375,
"learning_rate": 1e-06,
"loss": 0.0566,
"num_tokens": 26189061.0,
"reward": 0.4923192024230957,
"reward_std": 0.197740375995636,
"rewards/format_reward/mean": 0.9578125,
"rewards/format_reward/std": 0.1986761748790741,
"rewards/qatch_metrics/mean": 0.40876016914844515,
"rewards/qatch_metrics/std": 0.3987727761268616,
"rewards/tag_count_reward/mean": 0.9818359375,
"rewards/tag_count_reward/std": 0.08791020289063453,
"step": 280
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1150.6,
"completions/max_terminated_length": 1150.6,
"completions/mean_length": 149.32734375,
"completions/mean_terminated_length": 149.32734375,
"completions/min_length": 43.6,
"completions/min_terminated_length": 43.6,
"epoch": 0.502423975319524,
"grad_norm": 1.061030164323382,
"kl": 0.1281982421875,
"learning_rate": 1e-06,
"loss": 0.0294,
"num_tokens": 26873144.0,
"reward": 0.47796512842178346,
"reward_std": 0.21353891789913176,
"rewards/format_reward/mean": 0.95,
"rewards/format_reward/std": 0.21780532896518706,
"rewards/qatch_metrics/mean": 0.392907041311264,
"rewards/qatch_metrics/std": 0.4109325408935547,
"rewards/tag_count_reward/mean": 0.9798828125,
"rewards/tag_count_reward/std": 0.09564688950777053,
"step": 285
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 1897.2,
"completions/max_terminated_length": 495.2,
"completions/mean_length": 149.51796875,
"completions/mean_terminated_length": 143.3479248046875,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"epoch": 0.511238431026884,
"grad_norm": 1.0941932001692303,
"kl": 0.1298095703125,
"learning_rate": 1e-06,
"loss": 0.0626,
"num_tokens": 27539071.0,
"reward": 0.4637163817882538,
"reward_std": 0.18355560302734375,
"rewards/format_reward/mean": 0.92109375,
"rewards/format_reward/std": 0.2646732360124588,
"rewards/qatch_metrics/mean": 0.38052110075950624,
"rewards/qatch_metrics/std": 0.3895488500595093,
"rewards/tag_count_reward/mean": 0.96328125,
"rewards/tag_count_reward/std": 0.1275038242340088,
"step": 290
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 840.2,
"completions/max_terminated_length": 840.2,
"completions/mean_length": 146.7,
"completions/mean_terminated_length": 146.7,
"completions/min_length": 44.2,
"completions/min_terminated_length": 44.2,
"epoch": 0.5200528867342442,
"grad_norm": 1.156974504094534,
"kl": 0.127880859375,
"learning_rate": 1e-06,
"loss": 0.0401,
"num_tokens": 28189663.0,
"reward": 0.4947424054145813,
"reward_std": 0.2121095508337021,
"rewards/format_reward/mean": 0.890625,
"rewards/format_reward/std": 0.308673033118248,
"rewards/qatch_metrics/mean": 0.42144557237625124,
"rewards/qatch_metrics/std": 0.4137202322483063,
"rewards/tag_count_reward/mean": 0.9490234375,
"rewards/tag_count_reward/std": 0.15444399118423463,
"step": 295
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1255.6,
"completions/max_terminated_length": 607.6,
"completions/mean_length": 160.95703125,
"completions/mean_terminated_length": 157.87724609375,
"completions/min_length": 46.4,
"completions/min_terminated_length": 46.4,
"epoch": 0.5288673424416043,
"grad_norm": 1.0193761202428142,
"kl": 0.1197265625,
"learning_rate": 1e-06,
"loss": 0.045,
"num_tokens": 28883656.0,
"reward": 0.5260525703430176,
"reward_std": 0.21179039478302003,
"rewards/format_reward/mean": 0.9359375,
"rewards/format_reward/std": 0.24322082698345185,
"rewards/qatch_metrics/mean": 0.4514336109161377,
"rewards/qatch_metrics/std": 0.42365469336509703,
"rewards/tag_count_reward/mean": 0.9748046875,
"rewards/tag_count_reward/std": 0.10406171679496765,
"step": 300
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 2045.6,
"completions/max_terminated_length": 651.8,
"completions/mean_length": 167.04921875,
"completions/mean_terminated_length": 160.9025909423828,
"completions/min_length": 47.4,
"completions/min_terminated_length": 47.4,
"epoch": 0.5376817981489643,
"grad_norm": 1.0505969033833242,
"kl": 0.1183837890625,
"learning_rate": 1e-06,
"loss": 0.074,
"num_tokens": 29572327.0,
"reward": 0.5555553436279297,
"reward_std": 0.23490612506866454,
"rewards/format_reward/mean": 0.9421875,
"rewards/format_reward/std": 0.23279777467250823,
"rewards/qatch_metrics/mean": 0.48544191718101504,
"rewards/qatch_metrics/std": 0.4043150365352631,
"rewards/tag_count_reward/mean": 0.97421875,
"rewards/tag_count_reward/std": 0.11451640278100968,
"step": 305
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00234375,
"completions/max_length": 1991.8,
"completions/max_terminated_length": 547.4,
"completions/mean_length": 163.9890625,
"completions/mean_terminated_length": 154.75635986328126,
"completions/min_length": 48.6,
"completions/min_terminated_length": 48.6,
"epoch": 0.5464962538563244,
"grad_norm": 1.0184747818610653,
"kl": 0.1130615234375,
"learning_rate": 1e-06,
"loss": 0.0564,
"num_tokens": 30237305.0,
"reward": 0.5232127249240875,
"reward_std": 0.22367032766342163,
"rewards/format_reward/mean": 0.92890625,
"rewards/format_reward/std": 0.25724474191665647,
"rewards/qatch_metrics/mean": 0.4492989718914032,
"rewards/qatch_metrics/std": 0.4237508654594421,
"rewards/tag_count_reward/mean": 0.968359375,
"rewards/tag_count_reward/std": 0.12655377388000488,
"step": 310
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1175.4,
"completions/max_terminated_length": 489.4,
"completions/mean_length": 165.95546875,
"completions/mean_terminated_length": 162.88382873535156,
"completions/min_length": 52.8,
"completions/min_terminated_length": 52.8,
"epoch": 0.5553107095636844,
"grad_norm": 0.926469518807395,
"kl": 0.116796875,
"learning_rate": 1e-06,
"loss": 0.034,
"num_tokens": 30935440.0,
"reward": 0.5955564260482789,
"reward_std": 0.22866220772266388,
"rewards/format_reward/mean": 0.91328125,
"rewards/format_reward/std": 0.28102830052375793,
"rewards/qatch_metrics/mean": 0.5368104040622711,
"rewards/qatch_metrics/std": 0.4246533751487732,
"rewards/tag_count_reward/mean": 0.9587890625,
"rewards/tag_count_reward/std": 0.1424473986029625,
"step": 315
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 832.0,
"completions/max_terminated_length": 832.0,
"completions/mean_length": 173.453125,
"completions/mean_terminated_length": 173.453125,
"completions/min_length": 38.4,
"completions/min_terminated_length": 38.4,
"epoch": 0.5641251652710445,
"grad_norm": 1.013203860912236,
"kl": 0.1189208984375,
"learning_rate": 1e-06,
"loss": 0.0382,
"num_tokens": 31617748.0,
"reward": 0.5389631450176239,
"reward_std": 0.21291258931159973,
"rewards/format_reward/mean": 0.90859375,
"rewards/format_reward/std": 0.28675017356872556,
"rewards/qatch_metrics/mean": 0.47106875777244567,
"rewards/qatch_metrics/std": 0.42714625000953677,
"rewards/tag_count_reward/mean": 0.95390625,
"rewards/tag_count_reward/std": 0.16191803216934203,
"step": 320
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 2694.6,
"completions/max_terminated_length": 553.8,
"completions/mean_length": 184.3796875,
"completions/mean_terminated_length": 172.1176513671875,
"completions/min_length": 44.8,
"completions/min_terminated_length": 44.8,
"epoch": 0.5729396209784046,
"grad_norm": 1.0308637085090815,
"kl": 0.1224853515625,
"learning_rate": 1e-06,
"loss": 0.0671,
"num_tokens": 32294954.0,
"reward": 0.5857669234275817,
"reward_std": 0.215561243891716,
"rewards/format_reward/mean": 0.9359375,
"rewards/format_reward/std": 0.24411277770996093,
"rewards/qatch_metrics/mean": 0.5224210977554321,
"rewards/qatch_metrics/std": 0.4328895270824432,
"rewards/tag_count_reward/mean": 0.9623046875,
"rewards/tag_count_reward/std": 0.15089936107397078,
"step": 325
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 496.4,
"completions/max_terminated_length": 496.4,
"completions/mean_length": 158.73125,
"completions/mean_terminated_length": 158.73125,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.5817540766857646,
"grad_norm": 13.29499090198915,
"kl": 0.1685302734375,
"learning_rate": 1e-06,
"loss": 0.0302,
"num_tokens": 32972034.0,
"reward": 0.5693387031555176,
"reward_std": 0.20702467262744903,
"rewards/format_reward/mean": 0.91640625,
"rewards/format_reward/std": 0.2766654253005981,
"rewards/qatch_metrics/mean": 0.5060119867324829,
"rewards/qatch_metrics/std": 0.4102466404438019,
"rewards/tag_count_reward/mean": 0.9517578125,
"rewards/tag_count_reward/std": 0.16461062729358672,
"step": 330
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 1304.4,
"completions/max_terminated_length": 579.8,
"completions/mean_length": 160.171875,
"completions/mean_terminated_length": 154.0051055908203,
"completions/min_length": 44.2,
"completions/min_terminated_length": 44.2,
"epoch": 0.5905685323931247,
"grad_norm": 0.935442580551547,
"kl": 0.1378662109375,
"learning_rate": 1e-06,
"loss": 0.0212,
"num_tokens": 33644622.0,
"reward": 0.5727396726608276,
"reward_std": 0.21638197600841522,
"rewards/format_reward/mean": 0.8984375,
"rewards/format_reward/std": 0.3004028916358948,
"rewards/qatch_metrics/mean": 0.5123223960399628,
"rewards/qatch_metrics/std": 0.43056052923202515,
"rewards/tag_count_reward/mean": 0.9484375,
"rewards/tag_count_reward/std": 0.15865270793437958,
"step": 335
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 1336.4,
"completions/max_terminated_length": 600.0,
"completions/mean_length": 169.71640625,
"completions/mean_terminated_length": 163.57119750976562,
"completions/min_length": 29.2,
"completions/min_terminated_length": 29.2,
"epoch": 0.5993829881004848,
"grad_norm": 1.4959634071669228,
"kl": 0.1254638671875,
"learning_rate": 1e-06,
"loss": 0.0391,
"num_tokens": 34325763.0,
"reward": 0.5609373271465301,
"reward_std": 0.22974819540977479,
"rewards/format_reward/mean": 0.88046875,
"rewards/format_reward/std": 0.3236552834510803,
"rewards/qatch_metrics/mean": 0.5021367311477661,
"rewards/qatch_metrics/std": 0.4220038175582886,
"rewards/tag_count_reward/mean": 0.921484375,
"rewards/tag_count_reward/std": 0.21414475739002228,
"step": 340
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2653.0,
"completions/max_terminated_length": 500.6,
"completions/mean_length": 174.47734375,
"completions/mean_terminated_length": 159.10546264648437,
"completions/min_length": 33.6,
"completions/min_terminated_length": 33.6,
"epoch": 0.6081974438078449,
"grad_norm": 0.8936813027459303,
"kl": 0.1334228515625,
"learning_rate": 1e-06,
"loss": 0.068,
"num_tokens": 35041510.0,
"reward": 0.5479878842830658,
"reward_std": 0.24380851686000823,
"rewards/format_reward/mean": 0.915625,
"rewards/format_reward/std": 0.2778396010398865,
"rewards/qatch_metrics/mean": 0.4811346590518951,
"rewards/qatch_metrics/std": 0.43007351756095885,
"rewards/tag_count_reward/mean": 0.94921875,
"rewards/tag_count_reward/std": 0.17109024226665498,
"step": 345
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 997.4,
"completions/max_terminated_length": 997.4,
"completions/mean_length": 170.0734375,
"completions/mean_terminated_length": 170.0734375,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.617011899515205,
"grad_norm": 0.8929486085452816,
"kl": 0.119775390625,
"learning_rate": 1e-06,
"loss": 0.0198,
"num_tokens": 35733284.0,
"reward": 0.5295659184455872,
"reward_std": 0.2090097412467003,
"rewards/format_reward/mean": 0.909375,
"rewards/format_reward/std": 0.2840398609638214,
"rewards/qatch_metrics/mean": 0.46024298667907715,
"rewards/qatch_metrics/std": 0.4201698362827301,
"rewards/tag_count_reward/mean": 0.9484375,
"rewards/tag_count_reward/std": 0.171261465549469,
"step": 350
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 2099.4,
"completions/max_terminated_length": 687.6,
"completions/mean_length": 177.484375,
"completions/mean_terminated_length": 171.34949951171876,
"completions/min_length": 25.4,
"completions/min_terminated_length": 25.4,
"epoch": 0.625826355222565,
"grad_norm": 0.971309519624497,
"kl": 0.11083984375,
"learning_rate": 1e-06,
"loss": 0.0143,
"num_tokens": 36425760.0,
"reward": 0.5487818002700806,
"reward_std": 0.23004478216171265,
"rewards/format_reward/mean": 0.83671875,
"rewards/format_reward/std": 0.3683928668498993,
"rewards/qatch_metrics/mean": 0.49449974298477173,
"rewards/qatch_metrics/std": 0.4279952645301819,
"rewards/tag_count_reward/mean": 0.895703125,
"rewards/tag_count_reward/std": 0.2428739696741104,
"step": 355
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 1983.4,
"completions/max_terminated_length": 626.4,
"completions/mean_length": 189.1375,
"completions/mean_terminated_length": 183.03312072753906,
"completions/min_length": 46.6,
"completions/min_terminated_length": 46.6,
"epoch": 0.6346408109299251,
"grad_norm": 1.085714923968051,
"kl": 0.111279296875,
"learning_rate": 1e-06,
"loss": 0.0412,
"num_tokens": 37146560.0,
"reward": 0.5465562880039215,
"reward_std": 0.2106493055820465,
"rewards/format_reward/mean": 0.9015625,
"rewards/format_reward/std": 0.2943433105945587,
"rewards/qatch_metrics/mean": 0.4813575744628906,
"rewards/qatch_metrics/std": 0.41973625421524047,
"rewards/tag_count_reward/mean": 0.944921875,
"rewards/tag_count_reward/std": 0.1730790615081787,
"step": 360
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1243.6,
"completions/max_terminated_length": 550.0,
"completions/mean_length": 177.896875,
"completions/mean_terminated_length": 174.8417541503906,
"completions/min_length": 26.8,
"completions/min_terminated_length": 26.8,
"epoch": 0.6434552666372851,
"grad_norm": 0.9896824938718284,
"kl": 0.115478515625,
"learning_rate": 1e-06,
"loss": 0.041,
"num_tokens": 37832268.0,
"reward": 0.5586158275604248,
"reward_std": 0.2251075476408005,
"rewards/format_reward/mean": 0.87421875,
"rewards/format_reward/std": 0.32932343482971194,
"rewards/qatch_metrics/mean": 0.49995704293251036,
"rewards/qatch_metrics/std": 0.4248849630355835,
"rewards/tag_count_reward/mean": 0.924609375,
"rewards/tag_count_reward/std": 0.20914700627326965,
"step": 365
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1282.6,
"completions/max_terminated_length": 555.8,
"completions/mean_length": 170.38203125,
"completions/mean_terminated_length": 167.313427734375,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.6522697223446452,
"grad_norm": 0.9278166717820118,
"kl": 0.1187255859375,
"learning_rate": 1e-06,
"loss": 0.0193,
"num_tokens": 38555925.0,
"reward": 0.5743588328361511,
"reward_std": 0.20262247920036316,
"rewards/format_reward/mean": 0.8546875,
"rewards/format_reward/std": 0.3492723762989044,
"rewards/qatch_metrics/mean": 0.5217640638351441,
"rewards/qatch_metrics/std": 0.4110603451728821,
"rewards/tag_count_reward/mean": 0.9078125,
"rewards/tag_count_reward/std": 0.22944335341453553,
"step": 370
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 458.4,
"completions/max_terminated_length": 458.4,
"completions/mean_length": 171.5578125,
"completions/mean_terminated_length": 171.5578125,
"completions/min_length": 33.8,
"completions/min_terminated_length": 33.8,
"epoch": 0.6610841780520053,
"grad_norm": 0.9296512691094065,
"kl": 0.1155029296875,
"learning_rate": 1e-06,
"loss": 0.0006,
"num_tokens": 39234879.0,
"reward": 0.6218206763267518,
"reward_std": 0.20180206298828124,
"rewards/format_reward/mean": 0.90703125,
"rewards/format_reward/std": 0.29024410247802734,
"rewards/qatch_metrics/mean": 0.5695247530937195,
"rewards/qatch_metrics/std": 0.43560155630111697,
"rewards/tag_count_reward/mean": 0.9404296875,
"rewards/tag_count_reward/std": 0.1924948960542679,
"step": 375
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1532.6,
"completions/max_terminated_length": 808.2,
"completions/mean_length": 177.89375,
"completions/mean_terminated_length": 174.8284454345703,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.6698986337593653,
"grad_norm": 0.9179048520524131,
"kl": 0.1192138671875,
"learning_rate": 1e-06,
"loss": -0.0033,
"num_tokens": 39936711.0,
"reward": 0.548683899641037,
"reward_std": 0.20842809975147247,
"rewards/format_reward/mean": 0.8875,
"rewards/format_reward/std": 0.3159508228302002,
"rewards/qatch_metrics/mean": 0.48670990467071534,
"rewards/qatch_metrics/std": 0.42913843393325807,
"rewards/tag_count_reward/mean": 0.924609375,
"rewards/tag_count_reward/std": 0.2165108621120453,
"step": 380
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1210.4,
"completions/max_terminated_length": 476.8,
"completions/mean_length": 185.3,
"completions/mean_terminated_length": 182.22890930175782,
"completions/min_length": 37.8,
"completions/min_terminated_length": 37.8,
"epoch": 0.6787130894667255,
"grad_norm": 1.0609912938864583,
"kl": 0.1187255859375,
"learning_rate": 1e-06,
"loss": 0.0192,
"num_tokens": 40628007.0,
"reward": 0.5760585784912109,
"reward_std": 0.22054702043533325,
"rewards/format_reward/mean": 0.88984375,
"rewards/format_reward/std": 0.3102767616510391,
"rewards/qatch_metrics/mean": 0.5183065176010132,
"rewards/qatch_metrics/std": 0.4284651458263397,
"rewards/tag_count_reward/mean": 0.9302734375,
"rewards/tag_count_reward/std": 0.20196012556552886,
"step": 385
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1221.8,
"completions/max_terminated_length": 604.6,
"completions/mean_length": 170.103125,
"completions/mean_terminated_length": 167.0412139892578,
"completions/min_length": 28.4,
"completions/min_terminated_length": 28.4,
"epoch": 0.6875275451740855,
"grad_norm": 1.0349341835531938,
"kl": 0.1177978515625,
"learning_rate": 1e-06,
"loss": 0.0142,
"num_tokens": 41336699.0,
"reward": 0.5688169717788696,
"reward_std": 0.21753813624382018,
"rewards/format_reward/mean": 0.8546875,
"rewards/format_reward/std": 0.35243783593177797,
"rewards/qatch_metrics/mean": 0.5149684965610504,
"rewards/qatch_metrics/std": 0.4279025971889496,
"rewards/tag_count_reward/mean": 0.9125,
"rewards/tag_count_reward/std": 0.22199150621891023,
"step": 390
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 1952.2,
"completions/max_terminated_length": 1016.6,
"completions/mean_length": 194.11171875,
"completions/mean_terminated_length": 188.0023986816406,
"completions/min_length": 36.4,
"completions/min_terminated_length": 36.4,
"epoch": 0.6963420008814456,
"grad_norm": 1.1198091080655637,
"kl": 0.1160400390625,
"learning_rate": 1e-06,
"loss": 0.0349,
"num_tokens": 42057866.0,
"reward": 0.5434407353401184,
"reward_std": 0.2424723982810974,
"rewards/format_reward/mean": 0.8625,
"rewards/format_reward/std": 0.3433255970478058,
"rewards/qatch_metrics/mean": 0.48393073081970217,
"rewards/qatch_metrics/std": 0.4233227550983429,
"rewards/tag_count_reward/mean": 0.9169921875,
"rewards/tag_count_reward/std": 0.21575720310211183,
"step": 395
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1565.8,
"completions/max_terminated_length": 859.6,
"completions/mean_length": 195.30625,
"completions/mean_terminated_length": 192.24614868164062,
"completions/min_length": 36.6,
"completions/min_terminated_length": 36.6,
"epoch": 0.7051564565888057,
"grad_norm": 0.9364233280028263,
"kl": 0.109765625,
"learning_rate": 1e-06,
"loss": -0.009,
"num_tokens": 42802098.0,
"reward": 0.532480639219284,
"reward_std": 0.20860818028450012,
"rewards/format_reward/mean": 0.8578125,
"rewards/format_reward/std": 0.3485052168369293,
"rewards/qatch_metrics/mean": 0.4721968710422516,
"rewards/qatch_metrics/std": 0.4088200509548187,
"rewards/tag_count_reward/mean": 0.906640625,
"rewards/tag_count_reward/std": 0.23517801761627197,
"step": 400
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1322.0,
"completions/max_terminated_length": 674.6,
"completions/mean_length": 191.78125,
"completions/mean_terminated_length": 188.7188934326172,
"completions/min_length": 31.8,
"completions/min_terminated_length": 31.8,
"epoch": 0.7139709122961657,
"grad_norm": 0.9184749770599845,
"kl": 0.109228515625,
"learning_rate": 1e-06,
"loss": 0.0074,
"num_tokens": 43511370.0,
"reward": 0.6411563873291015,
"reward_std": 0.206281441450119,
"rewards/format_reward/mean": 0.89609375,
"rewards/format_reward/std": 0.30500052571296693,
"rewards/qatch_metrics/mean": 0.5939271092414856,
"rewards/qatch_metrics/std": 0.4107288718223572,
"rewards/tag_count_reward/mean": 0.9341796875,
"rewards/tag_count_reward/std": 0.2010919064283371,
"step": 405
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 2370.0,
"completions/max_terminated_length": 1083.0,
"completions/mean_length": 203.7921875,
"completions/mean_terminated_length": 197.71029968261718,
"completions/min_length": 41.8,
"completions/min_terminated_length": 41.8,
"epoch": 0.7227853680035258,
"grad_norm": 0.9498324772801405,
"kl": 0.112841796875,
"learning_rate": 1e-06,
"loss": 0.023,
"num_tokens": 44257808.0,
"reward": 0.6159097194671631,
"reward_std": 0.18089311718940734,
"rewards/format_reward/mean": 0.88828125,
"rewards/format_reward/std": 0.31010690331459045,
"rewards/qatch_metrics/mean": 0.5651557564735412,
"rewards/qatch_metrics/std": 0.407144832611084,
"rewards/tag_count_reward/mean": 0.933984375,
"rewards/tag_count_reward/std": 0.19542383253574372,
"step": 410
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 1977.8,
"completions/max_terminated_length": 646.4,
"completions/mean_length": 204.78828125,
"completions/mean_terminated_length": 192.57474365234376,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.7315998237108858,
"grad_norm": 0.8990363515461627,
"kl": 0.108154296875,
"learning_rate": 1e-06,
"loss": 0.0018,
"num_tokens": 44986913.0,
"reward": 0.5512337327003479,
"reward_std": 0.20042451322078705,
"rewards/format_reward/mean": 0.84296875,
"rewards/format_reward/std": 0.3632165014743805,
"rewards/qatch_metrics/mean": 0.4962239682674408,
"rewards/qatch_metrics/std": 0.4146161139011383,
"rewards/tag_count_reward/mean": 0.9029296875,
"rewards/tag_count_reward/std": 0.23338495790958405,
"step": 415
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1267.8,
"completions/max_terminated_length": 545.2,
"completions/mean_length": 207.953125,
"completions/mean_terminated_length": 204.91087341308594,
"completions/min_length": 45.0,
"completions/min_terminated_length": 45.0,
"epoch": 0.7404142794182459,
"grad_norm": 0.8949487847379094,
"kl": 0.1084228515625,
"learning_rate": 1e-06,
"loss": 0.0089,
"num_tokens": 45704165.0,
"reward": 0.5654355883598328,
"reward_std": 0.21594917476177217,
"rewards/format_reward/mean": 0.85703125,
"rewards/format_reward/std": 0.35001330375671386,
"rewards/qatch_metrics/mean": 0.5105997562408447,
"rewards/qatch_metrics/std": 0.42142562866210936,
"rewards/tag_count_reward/mean": 0.914453125,
"rewards/tag_count_reward/std": 0.22119783163070678,
"step": 420
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 1989.4,
"completions/max_terminated_length": 602.8,
"completions/mean_length": 194.7515625,
"completions/mean_terminated_length": 188.6415222167969,
"completions/min_length": 25.6,
"completions/min_terminated_length": 25.6,
"epoch": 0.749228735125606,
"grad_norm": 0.8443386346612493,
"kl": 0.11494140625,
"learning_rate": 1e-06,
"loss": 0.0033,
"num_tokens": 46430055.0,
"reward": 0.5958282589912415,
"reward_std": 0.19520920515060425,
"rewards/format_reward/mean": 0.87109375,
"rewards/format_reward/std": 0.3312843978404999,
"rewards/qatch_metrics/mean": 0.544264841079712,
"rewards/qatch_metrics/std": 0.4191899299621582,
"rewards/tag_count_reward/mean": 0.921875,
"rewards/tag_count_reward/std": 0.21222967505455018,
"step": 425
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 2666.0,
"completions/max_terminated_length": 567.2,
"completions/mean_length": 208.08203125,
"completions/mean_terminated_length": 195.87457275390625,
"completions/min_length": 32.6,
"completions/min_terminated_length": 32.6,
"epoch": 0.7580431908329661,
"grad_norm": 0.9046123034832724,
"kl": 0.1042724609375,
"learning_rate": 1e-06,
"loss": 0.0387,
"num_tokens": 47156176.0,
"reward": 0.6103429317474365,
"reward_std": 0.22614607214927673,
"rewards/format_reward/mean": 0.86640625,
"rewards/format_reward/std": 0.3397656261920929,
"rewards/qatch_metrics/mean": 0.5620072841644287,
"rewards/qatch_metrics/std": 0.4072328984737396,
"rewards/tag_count_reward/mean": 0.919921875,
"rewards/tag_count_reward/std": 0.21127038300037385,
"step": 430
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1371.4,
"completions/max_terminated_length": 650.8,
"completions/mean_length": 200.70390625,
"completions/mean_terminated_length": 197.6367401123047,
"completions/min_length": 30.8,
"completions/min_terminated_length": 30.8,
"epoch": 0.7668576465403262,
"grad_norm": 0.8934328290702316,
"kl": 0.11044921875,
"learning_rate": 1e-06,
"loss": -0.0073,
"num_tokens": 47893749.0,
"reward": 0.526023668050766,
"reward_std": 0.22173346281051637,
"rewards/format_reward/mean": 0.89765625,
"rewards/format_reward/std": 0.3027026534080505,
"rewards/qatch_metrics/mean": 0.457994270324707,
"rewards/qatch_metrics/std": 0.4191239416599274,
"rewards/tag_count_reward/mean": 0.9392578125,
"rewards/tag_count_reward/std": 0.19010738730430604,
"step": 435
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 1307.0,
"completions/max_terminated_length": 619.6,
"completions/mean_length": 213.5671875,
"completions/mean_terminated_length": 207.5018341064453,
"completions/min_length": 33.6,
"completions/min_terminated_length": 33.6,
"epoch": 0.7756721022476862,
"grad_norm": 0.7709350601632311,
"kl": 0.1083984375,
"learning_rate": 1e-06,
"loss": 0.0201,
"num_tokens": 48637627.0,
"reward": 0.5134056210517883,
"reward_std": 0.19459065198898315,
"rewards/format_reward/mean": 0.8875,
"rewards/format_reward/std": 0.31534498929977417,
"rewards/qatch_metrics/mean": 0.44468907117843626,
"rewards/qatch_metrics/std": 0.3993754625320435,
"rewards/tag_count_reward/mean": 0.9333984375,
"rewards/tag_count_reward/std": 0.19436517655849456,
"step": 440
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1255.0,
"completions/max_terminated_length": 573.8,
"completions/mean_length": 213.83359375,
"completions/mean_terminated_length": 210.8156005859375,
"completions/min_length": 29.8,
"completions/min_terminated_length": 29.8,
"epoch": 0.7844865579550463,
"grad_norm": 0.9730549470078724,
"kl": 0.11259765625,
"learning_rate": 1e-06,
"loss": 0.0194,
"num_tokens": 49400534.0,
"reward": 0.5392698287963867,
"reward_std": 0.21343457698822021,
"rewards/format_reward/mean": 0.89375,
"rewards/format_reward/std": 0.30757365822792054,
"rewards/qatch_metrics/mean": 0.4741064965724945,
"rewards/qatch_metrics/std": 0.42416965365409853,
"rewards/tag_count_reward/mean": 0.9380859375,
"rewards/tag_count_reward/std": 0.18792852461338044,
"step": 445
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 657.8,
"completions/max_terminated_length": 657.8,
"completions/mean_length": 191.94375,
"completions/mean_terminated_length": 191.94375,
"completions/min_length": 28.4,
"completions/min_terminated_length": 28.4,
"epoch": 0.7933010136624064,
"grad_norm": 0.8068324955270447,
"kl": 0.12177734375,
"learning_rate": 1e-06,
"loss": -0.007,
"num_tokens": 50124014.0,
"reward": 0.5979775786399841,
"reward_std": 0.18400471210479735,
"rewards/format_reward/mean": 0.88125,
"rewards/format_reward/std": 0.32406928539276125,
"rewards/qatch_metrics/mean": 0.545139092206955,
"rewards/qatch_metrics/std": 0.40265028476715087,
"rewards/tag_count_reward/mean": 0.9296875,
"rewards/tag_count_reward/std": 0.19919731020927428,
"step": 450
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1665.4,
"completions/max_terminated_length": 989.6,
"completions/mean_length": 191.44140625,
"completions/mean_terminated_length": 188.39669494628907,
"completions/min_length": 26.8,
"completions/min_terminated_length": 26.8,
"epoch": 0.8021154693697664,
"grad_norm": 0.9661639043459677,
"kl": 0.120654296875,
"learning_rate": 1e-06,
"loss": 0.0026,
"num_tokens": 50856579.0,
"reward": 0.5794390618801117,
"reward_std": 0.2076917439699173,
"rewards/format_reward/mean": 0.91953125,
"rewards/format_reward/std": 0.27112471759319307,
"rewards/qatch_metrics/mean": 0.5176190257072448,
"rewards/qatch_metrics/std": 0.41045997142791746,
"rewards/tag_count_reward/mean": 0.9501953125,
"rewards/tag_count_reward/std": 0.17092148661613465,
"step": 455
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 578.6,
"completions/max_terminated_length": 578.6,
"completions/mean_length": 185.61171875,
"completions/mean_terminated_length": 185.61171875,
"completions/min_length": 31.8,
"completions/min_terminated_length": 31.8,
"epoch": 0.8109299250771265,
"grad_norm": 0.859237972466753,
"kl": 0.1249755859375,
"learning_rate": 1e-06,
"loss": -0.0045,
"num_tokens": 51568034.0,
"reward": 0.5465836644172668,
"reward_std": 0.17585654258728028,
"rewards/format_reward/mean": 0.93515625,
"rewards/format_reward/std": 0.24650255739688873,
"rewards/qatch_metrics/mean": 0.4766333520412445,
"rewards/qatch_metrics/std": 0.3998740196228027,
"rewards/tag_count_reward/mean": 0.95859375,
"rewards/tag_count_reward/std": 0.15964243412017823,
"step": 460
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1242.8,
"completions/max_terminated_length": 579.6,
"completions/mean_length": 193.2140625,
"completions/mean_terminated_length": 190.17308959960937,
"completions/min_length": 37.8,
"completions/min_terminated_length": 37.8,
"epoch": 0.8197443807844865,
"grad_norm": 0.7384329915764564,
"kl": 0.112744140625,
"learning_rate": 1e-06,
"loss": -0.0108,
"num_tokens": 52261716.0,
"reward": 0.6089231491088867,
"reward_std": 0.18263671100139617,
"rewards/format_reward/mean": 0.92421875,
"rewards/format_reward/std": 0.26459681391716006,
"rewards/qatch_metrics/mean": 0.5516513049602508,
"rewards/qatch_metrics/std": 0.4096936106681824,
"rewards/tag_count_reward/mean": 0.951953125,
"rewards/tag_count_reward/std": 0.17155620753765105,
"step": 465
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 591.8,
"completions/max_terminated_length": 591.8,
"completions/mean_length": 196.45390625,
"completions/mean_terminated_length": 196.45390625,
"completions/min_length": 31.2,
"completions/min_terminated_length": 31.2,
"epoch": 0.8285588364918466,
"grad_norm": 1.0025973293270143,
"kl": 0.1135009765625,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 52953113.0,
"reward": 0.6507824778556823,
"reward_std": 0.20129505693912506,
"rewards/format_reward/mean": 0.91484375,
"rewards/format_reward/std": 0.27930967807769774,
"rewards/qatch_metrics/mean": 0.6021958470344544,
"rewards/qatch_metrics/std": 0.3972749710083008,
"rewards/tag_count_reward/mean": 0.9486328125,
"rewards/tag_count_reward/std": 0.1716614156961441,
"step": 470
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 638.0,
"completions/max_terminated_length": 638.0,
"completions/mean_length": 213.11171875,
"completions/mean_terminated_length": 213.11171875,
"completions/min_length": 34.6,
"completions/min_terminated_length": 34.6,
"epoch": 0.8373732921992068,
"grad_norm": 0.8516678257406487,
"kl": 0.108935546875,
"learning_rate": 1e-06,
"loss": 0.0161,
"num_tokens": 53659128.0,
"reward": 0.5703884243965149,
"reward_std": 0.20974204540252686,
"rewards/format_reward/mean": 0.9171875,
"rewards/format_reward/std": 0.27448596358299254,
"rewards/qatch_metrics/mean": 0.5072354257106781,
"rewards/qatch_metrics/std": 0.42126131653785703,
"rewards/tag_count_reward/mean": 0.950390625,
"rewards/tag_count_reward/std": 0.16825708746910095,
"step": 475
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 1347.6,
"completions/max_terminated_length": 668.8,
"completions/mean_length": 221.3375,
"completions/mean_terminated_length": 215.31790161132812,
"completions/min_length": 35.8,
"completions/min_terminated_length": 35.8,
"epoch": 0.8461877479065668,
"grad_norm": 0.8676387330449279,
"kl": 0.1122314453125,
"learning_rate": 1e-06,
"loss": -0.0145,
"num_tokens": 54425080.0,
"reward": 0.5916900038719177,
"reward_std": 0.20206353664398194,
"rewards/format_reward/mean": 0.88203125,
"rewards/format_reward/std": 0.3205987274646759,
"rewards/qatch_metrics/mean": 0.5378453254699707,
"rewards/qatch_metrics/std": 0.41082814931869505,
"rewards/tag_count_reward/mean": 0.9263671875,
"rewards/tag_count_reward/std": 0.20377787947654724,
"step": 480
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1610.6,
"completions/max_terminated_length": 923.2,
"completions/mean_length": 224.6875,
"completions/mean_terminated_length": 221.65450744628907,
"completions/min_length": 27.4,
"completions/min_terminated_length": 27.4,
"epoch": 0.8550022036139269,
"grad_norm": 0.7904555962961125,
"kl": 0.109765625,
"learning_rate": 1e-06,
"loss": 0.0017,
"num_tokens": 55227528.0,
"reward": 0.5810864806175232,
"reward_std": 0.2380138784646988,
"rewards/format_reward/mean": 0.853125,
"rewards/format_reward/std": 0.3541332304477692,
"rewards/qatch_metrics/mean": 0.5300695478916169,
"rewards/qatch_metrics/std": 0.4341892719268799,
"rewards/tag_count_reward/mean": 0.904296875,
"rewards/tag_count_reward/std": 0.23573453426361085,
"step": 485
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00234375,
"completions/max_length": 1990.4,
"completions/max_terminated_length": 623.2,
"completions/mean_length": 218.89296875,
"completions/mean_terminated_length": 209.80899353027343,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.8638166593212869,
"grad_norm": 0.8342126282923776,
"kl": 0.106396484375,
"learning_rate": 1e-06,
"loss": 0.0082,
"num_tokens": 55974703.0,
"reward": 0.5714076519012451,
"reward_std": 0.20491171181201934,
"rewards/format_reward/mean": 0.8515625,
"rewards/format_reward/std": 0.35569257140159605,
"rewards/qatch_metrics/mean": 0.5185333371162415,
"rewards/qatch_metrics/std": 0.41171206831932067,
"rewards/tag_count_reward/mean": 0.9099609375,
"rewards/tag_count_reward/std": 0.22593727111816406,
"step": 490
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 1373.0,
"completions/max_terminated_length": 707.4,
"completions/mean_length": 209.70390625,
"completions/mean_terminated_length": 203.64548950195314,
"completions/min_length": 37.2,
"completions/min_terminated_length": 37.2,
"epoch": 0.872631115028647,
"grad_norm": 0.8070043867292849,
"kl": 0.1049560546875,
"learning_rate": 1e-06,
"loss": -0.0094,
"num_tokens": 56709412.0,
"reward": 0.6168586254119873,
"reward_std": 0.20435989499092103,
"rewards/format_reward/mean": 0.85546875,
"rewards/format_reward/std": 0.34868985414505005,
"rewards/qatch_metrics/mean": 0.5714651107788086,
"rewards/qatch_metrics/std": 0.42900125980377196,
"rewards/tag_count_reward/mean": 0.911328125,
"rewards/tag_count_reward/std": 0.22259356081485748,
"step": 495
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1373.4,
"completions/max_terminated_length": 661.4,
"completions/mean_length": 218.89609375,
"completions/mean_terminated_length": 215.85523681640626,
"completions/min_length": 38.6,
"completions/min_terminated_length": 38.6,
"epoch": 0.881445570736007,
"grad_norm": 0.8578105605653167,
"kl": 0.1009765625,
"learning_rate": 1e-06,
"loss": -0.0007,
"num_tokens": 57464591.0,
"reward": 0.5597202479839325,
"reward_std": 0.22529322803020477,
"rewards/format_reward/mean": 0.865625,
"rewards/format_reward/std": 0.3411052882671356,
"rewards/qatch_metrics/mean": 0.5026809990406036,
"rewards/qatch_metrics/std": 0.4211664915084839,
"rewards/tag_count_reward/mean": 0.917578125,
"rewards/tag_count_reward/std": 0.21666719317436217,
"step": 500
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 1334.0,
"completions/max_terminated_length": 611.0,
"completions/mean_length": 209.959375,
"completions/mean_terminated_length": 203.84853515625,
"completions/min_length": 26.2,
"completions/min_terminated_length": 26.2,
"epoch": 0.8902600264433671,
"grad_norm": 0.8021142913277886,
"kl": 0.108203125,
"learning_rate": 1e-06,
"loss": -0.0275,
"num_tokens": 58213131.0,
"reward": 0.5699510633945465,
"reward_std": 0.2101448118686676,
"rewards/format_reward/mean": 0.82578125,
"rewards/format_reward/std": 0.3783671915531158,
"rewards/qatch_metrics/mean": 0.5208638191223145,
"rewards/qatch_metrics/std": 0.41592952609062195,
"rewards/tag_count_reward/mean": 0.8927734375,
"rewards/tag_count_reward/std": 0.2440448522567749,
"step": 505
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2336.6,
"completions/max_terminated_length": 925.2,
"completions/mean_length": 245.92890625,
"completions/mean_terminated_length": 230.83008422851563,
"completions/min_length": 29.8,
"completions/min_terminated_length": 29.8,
"epoch": 0.8990744821507272,
"grad_norm": 0.7883812832224967,
"kl": 0.100732421875,
"learning_rate": 1e-06,
"loss": 0.0226,
"num_tokens": 59018080.0,
"reward": 0.5421915054321289,
"reward_std": 0.21591668128967284,
"rewards/format_reward/mean": 0.80546875,
"rewards/format_reward/std": 0.3959254801273346,
"rewards/qatch_metrics/mean": 0.49160627126693723,
"rewards/qatch_metrics/std": 0.41863099932670594,
"rewards/tag_count_reward/mean": 0.8755859375,
"rewards/tag_count_reward/std": 0.2644981533288956,
"step": 510
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 618.4,
"completions/max_terminated_length": 618.4,
"completions/mean_length": 208.48984375,
"completions/mean_terminated_length": 208.48984375,
"completions/min_length": 34.8,
"completions/min_terminated_length": 34.8,
"epoch": 0.9078889378580872,
"grad_norm": 0.903154309567232,
"kl": 0.111083984375,
"learning_rate": 1e-06,
"loss": -0.0072,
"num_tokens": 59739139.0,
"reward": 0.632229495048523,
"reward_std": 0.19765791296958923,
"rewards/format_reward/mean": 0.83125,
"rewards/format_reward/std": 0.368955659866333,
"rewards/qatch_metrics/mean": 0.593443238735199,
"rewards/qatch_metrics/std": 0.4310678899288177,
"rewards/tag_count_reward/mean": 0.8935546875,
"rewards/tag_count_reward/std": 0.23964128494262696,
"step": 515
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1252.4,
"completions/max_terminated_length": 611.6,
"completions/mean_length": 212.459375,
"completions/mean_terminated_length": 209.4299072265625,
"completions/min_length": 33.4,
"completions/min_terminated_length": 33.4,
"epoch": 0.9167033935654474,
"grad_norm": 0.8346843956683994,
"kl": 0.1077392578125,
"learning_rate": 1e-06,
"loss": 0.0163,
"num_tokens": 60466559.0,
"reward": 0.5584656774997712,
"reward_std": 0.23274661898612975,
"rewards/format_reward/mean": 0.8796875,
"rewards/format_reward/std": 0.32524962425231935,
"rewards/qatch_metrics/mean": 0.4991369664669037,
"rewards/qatch_metrics/std": 0.41381397247314455,
"rewards/tag_count_reward/mean": 0.924609375,
"rewards/tag_count_reward/std": 0.21089179813861847,
"step": 520
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1248.4,
"completions/max_terminated_length": 638.4,
"completions/mean_length": 210.87109375,
"completions/mean_terminated_length": 207.83172912597655,
"completions/min_length": 28.4,
"completions/min_terminated_length": 28.4,
"epoch": 0.9255178492728074,
"grad_norm": 0.9740281006839744,
"kl": 0.1169189453125,
"learning_rate": 1e-06,
"loss": 0.0021,
"num_tokens": 61228986.0,
"reward": 0.5737495183944702,
"reward_std": 0.221232670545578,
"rewards/format_reward/mean": 0.8328125,
"rewards/format_reward/std": 0.3710750341415405,
"rewards/qatch_metrics/mean": 0.5245283961296081,
"rewards/qatch_metrics/std": 0.42740072011947633,
"rewards/tag_count_reward/mean": 0.8923828125,
"rewards/tag_count_reward/std": 0.24619007110595703,
"step": 525
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00234375,
"completions/max_length": 2158.4,
"completions/max_terminated_length": 753.4,
"completions/mean_length": 201.10390625,
"completions/mean_terminated_length": 191.97602233886718,
"completions/min_length": 21.4,
"completions/min_terminated_length": 21.4,
"epoch": 0.9343323049801675,
"grad_norm": 0.8087287498541261,
"kl": 0.1182373046875,
"learning_rate": 1e-06,
"loss": -0.0014,
"num_tokens": 61969759.0,
"reward": 0.5955200791358948,
"reward_std": 0.2014760673046112,
"rewards/format_reward/mean": 0.84140625,
"rewards/format_reward/std": 0.35906914472579954,
"rewards/qatch_metrics/mean": 0.5486127734184265,
"rewards/qatch_metrics/std": 0.39445692896842954,
"rewards/tag_count_reward/mean": 0.901171875,
"rewards/tag_count_reward/std": 0.23216934502124786,
"step": 530
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 1311.6,
"completions/max_terminated_length": 636.6,
"completions/mean_length": 200.63671875,
"completions/mean_terminated_length": 194.58396911621094,
"completions/min_length": 31.6,
"completions/min_terminated_length": 31.6,
"epoch": 0.9431467606875276,
"grad_norm": 1.0244359727988313,
"kl": 0.1111083984375,
"learning_rate": 1e-06,
"loss": -0.0009,
"num_tokens": 62673742.0,
"reward": 0.5657651543617248,
"reward_std": 0.18060422837734222,
"rewards/format_reward/mean": 0.88671875,
"rewards/format_reward/std": 0.3158248126506805,
"rewards/qatch_metrics/mean": 0.5065067887306214,
"rewards/qatch_metrics/std": 0.3941995918750763,
"rewards/tag_count_reward/mean": 0.93125,
"rewards/tag_count_reward/std": 0.1997167259454727,
"step": 535
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00234375,
"completions/max_length": 1975.2,
"completions/max_terminated_length": 528.8,
"completions/mean_length": 192.11484375,
"completions/mean_terminated_length": 182.94750061035157,
"completions/min_length": 21.8,
"completions/min_terminated_length": 21.8,
"epoch": 0.9519612163948876,
"grad_norm": 0.9450830973150219,
"kl": 0.106689453125,
"learning_rate": 1e-06,
"loss": 0.0018,
"num_tokens": 63388433.0,
"reward": 0.5951115846633911,
"reward_std": 0.19603927731513976,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.34651567935943606,
"rewards/qatch_metrics/mean": 0.5458343744277954,
"rewards/qatch_metrics/std": 0.4278919756412506,
"rewards/tag_count_reward/mean": 0.904296875,
"rewards/tag_count_reward/std": 0.24298664927482605,
"step": 540
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00234375,
"completions/max_length": 1540.4,
"completions/max_terminated_length": 1090.8,
"completions/mean_length": 190.159375,
"completions/mean_terminated_length": 181.03360290527343,
"completions/min_length": 35.8,
"completions/min_terminated_length": 35.8,
"epoch": 0.9607756721022477,
"grad_norm": 0.9291286887400723,
"kl": 0.1108642578125,
"learning_rate": 1e-06,
"loss": 0.0375,
"num_tokens": 64092413.0,
"reward": 0.6795460700988769,
"reward_std": 0.20811468064785005,
"rewards/format_reward/mean": 0.91328125,
"rewards/format_reward/std": 0.280303093791008,
"rewards/qatch_metrics/mean": 0.6362651109695434,
"rewards/qatch_metrics/std": 0.4117369055747986,
"rewards/tag_count_reward/mean": 0.9478515625,
"rewards/tag_count_reward/std": 0.18190329372882844,
"step": 545
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 560.4,
"completions/max_terminated_length": 560.4,
"completions/mean_length": 181.5375,
"completions/mean_terminated_length": 181.5375,
"completions/min_length": 35.6,
"completions/min_terminated_length": 35.6,
"epoch": 0.9695901278096077,
"grad_norm": 0.9490785592275803,
"kl": 0.1166015625,
"learning_rate": 1e-06,
"loss": -0.0105,
"num_tokens": 64759389.0,
"reward": 0.577846372127533,
"reward_std": 0.19823800325393676,
"rewards/format_reward/mean": 0.91328125,
"rewards/format_reward/std": 0.28074146509170533,
"rewards/qatch_metrics/mean": 0.5166414439678192,
"rewards/qatch_metrics/std": 0.4310955286026001,
"rewards/tag_count_reward/mean": 0.9474609375,
"rewards/tag_count_reward/std": 0.1789928376674652,
"step": 550
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 2067.2,
"completions/max_terminated_length": 755.6,
"completions/mean_length": 197.115625,
"completions/mean_terminated_length": 191.02464904785157,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.9784045835169678,
"grad_norm": 0.90442977548872,
"kl": 0.108544921875,
"learning_rate": 1e-06,
"loss": 0.0203,
"num_tokens": 65477873.0,
"reward": 0.5961843609809876,
"reward_std": 0.20585475862026215,
"rewards/format_reward/mean": 0.92421875,
"rewards/format_reward/std": 0.2640294134616852,
"rewards/qatch_metrics/mean": 0.5363083481788635,
"rewards/qatch_metrics/std": 0.41726168990135193,
"rewards/tag_count_reward/mean": 0.9580078125,
"rewards/tag_count_reward/std": 0.1562621772289276,
"step": 555
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 3376.8,
"completions/max_terminated_length": 587.8,
"completions/mean_length": 195.74296875,
"completions/mean_terminated_length": 183.5094757080078,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.9872190392243279,
"grad_norm": 0.9502895043232452,
"kl": 0.112939453125,
"learning_rate": 1e-06,
"loss": 0.0127,
"num_tokens": 66214488.0,
"reward": 0.6302931666374206,
"reward_std": 0.21948930323123933,
"rewards/format_reward/mean": 0.88515625,
"rewards/format_reward/std": 0.31716270446777345,
"rewards/qatch_metrics/mean": 0.5827322959899902,
"rewards/qatch_metrics/std": 0.4163429081439972,
"rewards/tag_count_reward/mean": 0.9291015625,
"rewards/tag_count_reward/std": 0.20648659765720367,
"step": 560
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 3388.8,
"completions/max_terminated_length": 552.8,
"completions/mean_length": 203.7375,
"completions/mean_terminated_length": 191.53135375976564,
"completions/min_length": 29.8,
"completions/min_terminated_length": 29.8,
"epoch": 0.996033494931688,
"grad_norm": 1.0451866474159504,
"kl": 0.110400390625,
"learning_rate": 1e-06,
"loss": 0.0071,
"num_tokens": 66948152.0,
"reward": 0.5372519016265869,
"reward_std": 0.20013673603534698,
"rewards/format_reward/mean": 0.8640625,
"rewards/format_reward/std": 0.34239274859428404,
"rewards/qatch_metrics/mean": 0.4767416715621948,
"rewards/qatch_metrics/std": 0.3842666923999786,
"rewards/tag_count_reward/mean": 0.9123046875,
"rewards/tag_count_reward/std": 0.23215168714523315,
"step": 565
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.001953125,
"completions/max_length": 2463.5,
"completions/max_terminated_length": 763.0,
"completions/mean_length": 191.5,
"completions/mean_terminated_length": 183.86861419677734,
"completions/min_length": 30.5,
"completions/min_terminated_length": 30.5,
"epoch": 0.999559277214632,
"kl": 0.11181640625,
"num_tokens": 67212456.0,
"reward": 0.6837565302848816,
"reward_std": 0.19374996423721313,
"rewards/format_reward/mean": 0.8828125,
"rewards/format_reward/std": 0.32204362750053406,
"rewards/qatch_metrics/mean": 0.6466471254825592,
"rewards/qatch_metrics/std": 0.3814842849969864,
"rewards/tag_count_reward/mean": 0.91650390625,
"rewards/tag_count_reward/std": 0.2326364442706108,
"step": 567,
"total_flos": 0.0,
"train_loss": 0.0005930395028184331,
"train_runtime": 32371.3135,
"train_samples_per_second": 0.28,
"train_steps_per_second": 0.018
}
],
"logging_steps": 5,
"max_steps": 567,
"num_input_tokens_seen": 67212456,
"num_train_epochs": 1,
"save_steps": 5,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}