{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.7, "completions/max_terminated_length": 486.7, "completions/mean_length": 378.275, "completions/mean_terminated_length": 378.275, "completions/min_length": 300.6, "completions/min_terminated_length": 300.6, "epoch": 0.0020815986677768525, "grad_norm": 0.036591879402560146, "kl": 0.00189361572265625, "learning_rate": 9.99991340007382e-07, "loss": 0.0001, "num_tokens": 257710.0, "reward": 1.83125, "reward_std": 0.2132595658302307, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.19231742918491362, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0408231720328331, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.8, "completions/max_terminated_length": 461.8, "completions/mean_length": 346.6625, "completions/mean_terminated_length": 346.6625, "completions/min_length": 255.4, "completions/min_terminated_length": 255.4, "epoch": 0.004163197335553705, "grad_norm": 0.06523851911226544, "kl": 0.0028778076171875, "learning_rate": 9.999614046155623e-07, "loss": 0.0001, "num_tokens": 514611.0, "reward": 1.9541666746139525, "reward_std": 0.12878680378198623, "rewards/accuracy_reward/mean": 0.95, "rewards/accuracy_reward/std": 0.11700168251991272, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00416666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01178511455655098, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.3, "completions/max_terminated_length": 473.3, "completions/mean_length": 360.95, "completions/mean_terminated_length": 360.95, "completions/min_length": 274.2, "completions/min_terminated_length": 274.2, "epoch": 0.0062447960033305576, "grad_norm": 5.522680149040527, "kl": 0.006317138671875, "learning_rate": 9.99910088190945e-07, "loss": 0.0003, "num_tokens": 770799.0, "reward": 1.789285707473755, "reward_std": 0.2921529281884432, "rewards/accuracy_reward/mean": 0.775, "rewards/accuracy_reward/std": 0.2112731844186783, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.026785714365541936, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.059929624944925305, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.2, "completions/max_terminated_length": 520.2, "completions/mean_length": 393.15, "completions/mean_terminated_length": 393.15, "completions/min_length": 297.4, "completions/min_terminated_length": 297.4, "epoch": 0.00832639467110741, "grad_norm": 0.28591884336780005, "kl": 0.0102813720703125, "learning_rate": 9.998373929280957e-07, "loss": 0.0004, "num_tokens": 1046283.0, "reward": 1.89375, "reward_std": 0.2680151164531708, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.22220885157585143, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04580627083778381, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.9, "completions/max_terminated_length": 479.9, "completions/mean_length": 377.4125, "completions/mean_terminated_length": 377.4125, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.010407993338884263, "grad_norm": 6.083028786361364, "kl": 0.01175537109375, "learning_rate": 9.997433219358542e-07, "loss": 0.0005, "num_tokens": 1262500.0, "reward": 1.7572916746139526, "reward_std": 0.31888280510902406, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.3202547788619995, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00729166716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.013684006035327911, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.4, "completions/max_terminated_length": 468.4, "completions/mean_length": 363.9625, "completions/mean_terminated_length": 363.9625, "completions/min_length": 262.7, "completions/min_terminated_length": 262.7, "epoch": 0.012489592006661115, "grad_norm": 4.769209326375173, "kl": 0.013623046875, "learning_rate": 9.996278792372007e-07, "loss": 0.0005, "num_tokens": 1535937.0, "reward": 1.55625, "reward_std": 0.27814957946538926, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.25500801801681516, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.3, "completions/max_terminated_length": 469.3, "completions/mean_length": 352.2625, "completions/mean_terminated_length": 352.2625, "completions/min_length": 249.5, "completions/min_terminated_length": 249.5, "epoch": 0.014571190674437969, "grad_norm": 5.546453307423874, "kl": 0.0191650390625, "learning_rate": 9.994910697690848e-07, "loss": 0.0008, "num_tokens": 1795742.0, "reward": 1.8916666746139525, "reward_std": 0.2752553790807724, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.25587469935417173, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03563483357429505, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.1, "completions/max_terminated_length": 541.1, "completions/mean_length": 387.55, "completions/mean_terminated_length": 387.55, "completions/min_length": 271.5, "completions/min_terminated_length": 271.5, "epoch": 0.01665278934221482, "grad_norm": 3.9471842106226727, "kl": 0.015997314453125, "learning_rate": 9.993328993822132e-07, "loss": 0.0006, "num_tokens": 2068658.0, "reward": 1.775, "reward_std": 0.20214119255542756, "rewards/accuracy_reward/mean": 0.7625, "rewards/accuracy_reward/std": 0.20957585871219636, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.018898223340511323, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.2, "completions/max_terminated_length": 513.2, "completions/mean_length": 391.1, "completions/mean_terminated_length": 391.1, "completions/min_length": 270.3, "completions/min_terminated_length": 270.3, "epoch": 0.018734388009991675, "grad_norm": 0.10980281422900377, "kl": 0.0157958984375, "learning_rate": 9.99153374840801e-07, "loss": 0.0006, "num_tokens": 2313026.0, "reward": 1.625, "reward_std": 0.2109176844358444, "rewards/accuracy_reward/mean": 0.5875, "rewards/accuracy_reward/std": 0.13509859144687653, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03750000149011612, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08040101677179337, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.7, "completions/max_terminated_length": 537.7, "completions/mean_length": 412.25, "completions/mean_terminated_length": 412.25, "completions/min_length": 318.3, "completions/min_terminated_length": 318.3, "epoch": 0.020815986677768527, "grad_norm": 0.11232662339475162, "kl": 0.014801025390625, "learning_rate": 9.989525038222806e-07, "loss": 0.0006, "num_tokens": 2583670.0, "reward": 1.85, "reward_std": 0.25040294229984283, "rewards/accuracy_reward/mean": 0.8375, "rewards/accuracy_reward/std": 0.25586686432361605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.5, "completions/max_terminated_length": 461.5, "completions/mean_length": 364.275, "completions/mean_terminated_length": 364.275, "completions/min_length": 281.2, "completions/min_terminated_length": 281.2, "epoch": 0.02289758534554538, "grad_norm": 0.17438948467243226, "kl": 0.018988037109375, "learning_rate": 9.987302949169748e-07, "loss": 0.0008, "num_tokens": 2849020.0, "reward": 1.8645833253860473, "reward_std": 0.23733972944319248, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.23144719302654265, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.002083333395421505, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.00589255727827549, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 352.5875, "completions/mean_terminated_length": 352.5875, "completions/min_length": 236.4, "completions/min_terminated_length": 236.4, "epoch": 0.02497918401332223, "grad_norm": 5.1924414165917305, "kl": 0.02213134765625, "learning_rate": 9.984867576277293e-07, "loss": 0.0009, "num_tokens": 3103243.0, "reward": 1.8191666722297668, "reward_std": 0.24814118593931198, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.2386084347963333, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.006666666828095913, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01885618269443512, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.5, "completions/max_terminated_length": 416.5, "completions/mean_length": 327.925, "completions/mean_terminated_length": 327.925, "completions/min_length": 242.6, "completions/min_terminated_length": 242.6, "epoch": 0.027060782681099085, "grad_norm": 0.2046874877968627, "kl": 0.024169921875, "learning_rate": 9.982219023695053e-07, "loss": 0.001, "num_tokens": 3339501.0, "reward": 1.7830357074737548, "reward_std": 0.2340016055852175, "rewards/accuracy_reward/mean": 0.775, "rewards/accuracy_reward/std": 0.2112731844186783, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.008035714365541935, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.022728431969881058, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 294.975, "completions/mean_terminated_length": 294.975, "completions/min_length": 208.2, "completions/min_terminated_length": 208.2, "epoch": 0.029142381348875937, "grad_norm": 0.1422658013873825, "kl": 0.02890625, "learning_rate": 9.979357404689349e-07, "loss": 0.0012, "num_tokens": 3607371.0, "reward": 1.8625, "reward_std": 0.16875659823417663, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.1334012657403946, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.6, "completions/max_terminated_length": 452.6, "completions/mean_length": 341.3125, "completions/mean_terminated_length": 341.3125, "completions/min_length": 249.2, "completions/min_terminated_length": 249.2, "epoch": 0.03122398001665279, "grad_norm": 6.274183399039419, "kl": 0.025732421875, "learning_rate": 9.97628284163837e-07, "loss": 0.001, "num_tokens": 3859748.0, "reward": 1.8802689909934998, "reward_std": 0.21689079953357576, "rewards/accuracy_reward/mean": 0.8511023391969502, "rewards/accuracy_reward/std": 0.11791455755010247, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04166666567325592, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07967560291290283, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.7, "completions/max_terminated_length": 469.7, "completions/mean_length": 342.8625, "completions/mean_terminated_length": 342.8625, "completions/min_length": 248.8, "completions/min_terminated_length": 248.8, "epoch": 0.03330557868442964, "grad_norm": 4.393785318086197, "kl": 0.03060302734375, "learning_rate": 9.97299546602693e-07, "loss": 0.0012, "num_tokens": 4128401.0, "reward": 1.934375, "reward_std": 0.17850233241915703, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.1632926881313324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.009375, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02651650384068489, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 557.4, "completions/max_terminated_length": 514.4, "completions/mean_length": 408.15, "completions/mean_terminated_length": 400.9410736083984, "completions/min_length": 293.4, "completions/min_terminated_length": 293.4, "epoch": 0.03538717735220649, "grad_norm": 0.14407093242039448, "kl": 0.02762451171875, "learning_rate": 9.969495418440855e-07, "loss": 0.0011, "num_tokens": 4386837.0, "reward": 1.7154840588569642, "reward_std": 0.12313865721225739, "rewards/accuracy_reward/mean": 0.7217340528964996, "rewards/accuracy_reward/std": 0.11099039763212204, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.2, "completions/max_terminated_length": 594.2, "completions/mean_length": 469.9625, "completions/mean_terminated_length": 469.9625, "completions/min_length": 344.2, "completions/min_terminated_length": 344.2, "epoch": 0.03746877601998335, "grad_norm": 4.568523263801617, "kl": 0.0269287109375, "learning_rate": 9.965782848560961e-07, "loss": 0.0011, "num_tokens": 4635530.0, "reward": 1.740625, "reward_std": 0.36733007729053496, "rewards/accuracy_reward/mean": 0.7, "rewards/accuracy_reward/std": 0.3130935370922089, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.040625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08054837882518769, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.5, "completions/max_terminated_length": 553.5, "completions/mean_length": 426.9375, "completions/mean_terminated_length": 426.9375, "completions/min_length": 300.1, "completions/min_terminated_length": 300.1, "epoch": 0.0395503746877602, "grad_norm": 4.8879251566662925, "kl": 0.0309326171875, "learning_rate": 9.961857915156661e-07, "loss": 0.0012, "num_tokens": 4900021.0, "reward": 1.9059523820877076, "reward_std": 0.23443024940788745, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.20580926835536956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.018452381156384944, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0521912157535553, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.6, "completions/max_terminated_length": 615.6, "completions/mean_length": 441.1625, "completions/mean_terminated_length": 441.1625, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.041631973355537054, "grad_norm": 4.457740693953414, "kl": 0.029052734375, "learning_rate": 9.95772078607917e-07, "loss": 0.0012, "num_tokens": 5177594.0, "reward": 1.746875, "reward_std": 0.15954835414886476, "rewards/accuracy_reward/mean": 0.7375, "rewards/accuracy_reward/std": 0.1595182627439499, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.009375, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02651650384068489, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 392.4625, "completions/mean_terminated_length": 392.4625, "completions/min_length": 311.9, "completions/min_terminated_length": 311.9, "epoch": 0.043713572023313906, "grad_norm": 0.20284855124141812, "kl": 0.03153076171875, "learning_rate": 9.953371638254334e-07, "loss": 0.0013, "num_tokens": 5458767.0, "reward": 1.8024999976158143, "reward_std": 0.20019548237323762, "rewards/accuracy_reward/mean": 0.8, "rewards/accuracy_reward/std": 0.2032530963420868, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.002500000037252903, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.00707106813788414, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 340.35, "completions/mean_terminated_length": 340.35, "completions/min_length": 236.8, "completions/min_terminated_length": 236.8, "epoch": 0.04579517069109076, "grad_norm": 0.12400643377970895, "kl": 0.0336181640625, "learning_rate": 9.94881065767505e-07, "loss": 0.0013, "num_tokens": 5714411.0, "reward": 1.9456944465637207, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.9394444465637207, "rewards/accuracy_reward/std": 0.07071067690849304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.2, "completions/max_terminated_length": 501.2, "completions/mean_length": 379.25, "completions/mean_terminated_length": 379.25, "completions/min_length": 290.2, "completions/min_terminated_length": 290.2, "epoch": 0.04787676935886761, "grad_norm": 4.5686106575902805, "kl": 0.0332763671875, "learning_rate": 9.94403803939333e-07, "loss": 0.0013, "num_tokens": 5967975.0, "reward": 1.8041666746139526, "reward_std": 0.12878680378198623, "rewards/accuracy_reward/mean": 0.775, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02916666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08249579146504402, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.7, "completions/max_terminated_length": 483.7, "completions/mean_length": 377.3625, "completions/mean_terminated_length": 377.3625, "completions/min_length": 292.5, "completions/min_terminated_length": 292.5, "epoch": 0.04995836802664446, "grad_norm": 0.19094757033893725, "kl": 0.0373779296875, "learning_rate": 9.939053987511937e-07, "loss": 0.0015, "num_tokens": 6220364.0, "reward": 1.8916666746139525, "reward_std": 0.10983104258775711, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.09804592728614807, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00416666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01178511455655098, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.8, "completions/max_terminated_length": 489.8, "completions/mean_length": 383.15, "completions/mean_terminated_length": 383.15, "completions/min_length": 289.7, "completions/min_terminated_length": 289.7, "epoch": 0.05203996669442131, "grad_norm": 4.443711087325592, "kl": 0.0322509765625, "learning_rate": 9.933858715175687e-07, "loss": 0.0013, "num_tokens": 6465384.0, "reward": 1.7186701416969299, "reward_std": 0.19897533096373082, "rewards/accuracy_reward/mean": 0.7143844276666641, "rewards/accuracy_reward/std": 0.1334012657403946, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01678571440279484, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04747716933488846, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.8, "completions/max_terminated_length": 473.8, "completions/mean_length": 350.7125, "completions/mean_terminated_length": 350.7125, "completions/min_length": 257.3, "completions/min_terminated_length": 257.3, "epoch": 0.05412156536219817, "grad_norm": 5.545971354020349, "kl": 0.03245849609375, "learning_rate": 9.928452444562298e-07, "loss": 0.0013, "num_tokens": 6740745.0, "reward": 1.9385416626930236, "reward_std": 0.14416180178523064, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.12416292428970337, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.013541666977107524, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03830162100493908, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.8, "completions/max_terminated_length": 437.8, "completions/mean_length": 324.8875, "completions/mean_terminated_length": 324.8875, "completions/min_length": 238.8, "completions/min_terminated_length": 238.8, "epoch": 0.05620316402997502, "grad_norm": 5.431553614830727, "kl": 0.0399658203125, "learning_rate": 9.92283540687292e-07, "loss": 0.0016, "num_tokens": 7001736.0, "reward": 1.7467147350311278, "reward_std": 0.22841061986982822, "rewards/accuracy_reward/mean": 0.7102564103901386, "rewards/accuracy_reward/std": 0.14603425860404967, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.036458333395421504, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08728792332112789, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.4, "completions/max_terminated_length": 418.4, "completions/mean_length": 328.3875, "completions/mean_terminated_length": 328.3875, "completions/min_length": 237.5, "completions/min_terminated_length": 237.5, "epoch": 0.058284762697751874, "grad_norm": 5.238127325433427, "kl": 0.043896484375, "learning_rate": 9.917007842322228e-07, "loss": 0.0018, "num_tokens": 7259391.0, "reward": 1.88125, "reward_std": 0.2597082987427711, "rewards/accuracy_reward/mean": 0.8375, "rewards/accuracy_reward/std": 0.12793734967708587, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11397495716810227, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.5, "completions/max_terminated_length": 402.5, "completions/mean_length": 312.5375, "completions/mean_terminated_length": 312.5375, "completions/min_length": 235.2, "completions/min_terminated_length": 235.2, "epoch": 0.060366361365528726, "grad_norm": 5.339336919324121, "kl": 0.05615234375, "learning_rate": 9.910970000128159e-07, "loss": 0.0022, "num_tokens": 7532170.0, "reward": 1.8760416746139525, "reward_std": 0.2622366651892662, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.1595182627439499, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06354166679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.13723524883389474, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 302.975, "completions/mean_terminated_length": 302.975, "completions/min_length": 214.8, "completions/min_terminated_length": 214.8, "epoch": 0.06244796003330558, "grad_norm": 0.2096105552296755, "kl": 0.05185546875, "learning_rate": 9.904722138501244e-07, "loss": 0.0021, "num_tokens": 7784800.0, "reward": 1.9462037086486816, "reward_std": 0.13529810905456544, "rewards/accuracy_reward/mean": 0.8920370370149613, "rewards/accuracy_reward/std": 0.07071067690849304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05416666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08084159195423127, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.3, "completions/max_terminated_length": 379.3, "completions/mean_length": 316.3875, "completions/mean_terminated_length": 316.3875, "completions/min_length": 248.6, "completions/min_terminated_length": 248.6, "epoch": 0.06452955870108243, "grad_norm": 0.19562255789500088, "kl": 0.044677734375, "learning_rate": 9.89826452463358e-07, "loss": 0.0018, "num_tokens": 8023007.0, "reward": 1.8239700555801392, "reward_std": 0.2665687516331673, "rewards/accuracy_reward/mean": 0.7791486293077469, "rewards/accuracy_reward/std": 0.2205115258693695, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04482142850756645, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06473954916000366, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.9, "completions/max_terminated_length": 467.9, "completions/mean_length": 357.7, "completions/mean_terminated_length": 357.7, "completions/min_length": 250.5, "completions/min_terminated_length": 250.5, "epoch": 0.06661115736885928, "grad_norm": 4.368009131540421, "kl": 0.0458740234375, "learning_rate": 9.89159743468739e-07, "loss": 0.0018, "num_tokens": 8269055.0, "reward": 1.8868750095367433, "reward_std": 0.136459456756711, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03687500022351742, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08300721384584904, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 377.475, "completions/mean_terminated_length": 377.475, "completions/min_length": 246.1, "completions/min_terminated_length": 246.1, "epoch": 0.06869275603663613, "grad_norm": 5.429243629442194, "kl": 0.0504150390625, "learning_rate": 9.884721153783223e-07, "loss": 0.002, "num_tokens": 8520957.0, "reward": 1.6583147287368774, "reward_std": 0.16093675643205643, "rewards/accuracy_reward/mean": 0.6437313750386238, "rewards/accuracy_reward/std": 0.14980084896087648, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01458333432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.027368012070655822, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.2, "completions/max_terminated_length": 546.2, "completions/mean_length": 405.0375, "completions/mean_terminated_length": 405.0375, "completions/min_length": 285.6, "completions/min_terminated_length": 285.6, "epoch": 0.07077435470441298, "grad_norm": 4.8225656345010695, "kl": 0.047509765625, "learning_rate": 9.87763597598775e-07, "loss": 0.0019, "num_tokens": 8777248.0, "reward": 1.9, "reward_std": 0.07071067690849304, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.3, "completions/max_terminated_length": 567.3, "completions/mean_length": 407.8375, "completions/mean_terminated_length": 407.8375, "completions/min_length": 295.5, "completions/min_terminated_length": 295.5, "epoch": 0.07285595337218984, "grad_norm": 5.072558874680301, "kl": 0.0479248046875, "learning_rate": 9.8703422043012e-07, "loss": 0.0019, "num_tokens": 9052947.0, "reward": 1.8925000190734864, "reward_std": 0.24598965793848038, "rewards/accuracy_reward/mean": 0.8375, "rewards/accuracy_reward/std": 0.12793734967708587, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05500000044703483, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11805230379104614, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.1, "completions/max_terminated_length": 542.1, "completions/mean_length": 415.9375, "completions/mean_terminated_length": 415.9375, "completions/min_length": 292.3, "completions/min_terminated_length": 292.3, "epoch": 0.0749375520399667, "grad_norm": 5.68978401195414, "kl": 0.04268798828125, "learning_rate": 9.862840150644394e-07, "loss": 0.0017, "num_tokens": 9315070.0, "reward": 1.849039077758789, "reward_std": 0.06558054089546203, "rewards/accuracy_reward/mean": 0.8448724031448365, "rewards/accuracy_reward/std": 0.06210371255874634, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00416666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01178511455655098, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.3, "completions/max_terminated_length": 436.3, "completions/mean_length": 318.0625, "completions/mean_terminated_length": 318.0625, "completions/min_length": 208.2, "completions/min_terminated_length": 208.2, "epoch": 0.07701915070774355, "grad_norm": 0.25251783885777485, "kl": 0.0482177734375, "learning_rate": 9.855130135845404e-07, "loss": 0.0019, "num_tokens": 9576043.0, "reward": 1.93067307472229, "reward_std": 0.2553727373480797, "rewards/accuracy_reward/mean": 0.8744230777025223, "rewards/accuracy_reward/std": 0.12793734967708587, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.12743539363145828, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.7, "completions/max_terminated_length": 542.7, "completions/mean_length": 392.825, "completions/mean_terminated_length": 392.825, "completions/min_length": 272.9, "completions/min_terminated_length": 272.9, "epoch": 0.0791007493755204, "grad_norm": 0.1796751788206183, "kl": 0.043994140625, "learning_rate": 9.847212489625844e-07, "loss": 0.0018, "num_tokens": 9832709.0, "reward": 1.9983333587646483, "reward_std": 0.05785674601793289, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010833333618938923, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02250140383839607, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.1, "completions/max_terminated_length": 492.1, "completions/mean_length": 373.2375, "completions/mean_terminated_length": 373.2375, "completions/min_length": 264.7, "completions/min_terminated_length": 264.7, "epoch": 0.08118234804329726, "grad_norm": 0.2718271523009852, "kl": 0.04495849609375, "learning_rate": 9.839087550586756e-07, "loss": 0.0018, "num_tokens": 10111256.0, "reward": 1.8685897588729858, "reward_std": 0.06406784504652023, "rewards/accuracy_reward/mean": 0.8369230777025223, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03166666720062494, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06406784281134606, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 356.025, "completions/mean_terminated_length": 356.025, "completions/min_length": 246.7, "completions/min_terminated_length": 246.7, "epoch": 0.08326394671107411, "grad_norm": 0.22730851394419502, "kl": 0.0422119140625, "learning_rate": 9.830755666194136e-07, "loss": 0.0017, "num_tokens": 10386938.0, "reward": 1.9456876635551452, "reward_std": 0.01767766922712326, "rewards/accuracy_reward/mean": 0.9394376605749131, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.6, "completions/max_terminated_length": 498.6, "completions/mean_length": 388.375, "completions/mean_terminated_length": 388.375, "completions/min_length": 271.4, "completions/min_terminated_length": 271.4, "epoch": 0.08534554537885096, "grad_norm": 5.316651335823694, "kl": 0.045263671875, "learning_rate": 9.822217192764078e-07, "loss": 0.0018, "num_tokens": 10647280.0, "reward": 1.9, "reward_std": 0.11700168251991272, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.3, "completions/max_terminated_length": 483.3, "completions/mean_length": 375.525, "completions/mean_terminated_length": 375.525, "completions/min_length": 268.7, "completions/min_terminated_length": 268.7, "epoch": 0.08742714404662781, "grad_norm": 0.16182374320605225, "kl": 0.04052734375, "learning_rate": 9.813472495447527e-07, "loss": 0.0016, "num_tokens": 10923474.0, "reward": 1.8870895028114318, "reward_std": 0.11237782835960389, "rewards/accuracy_reward/mean": 0.866256158053875, "rewards/accuracy_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02083333358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05892556756734848, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.8, "completions/max_terminated_length": 505.8, "completions/mean_length": 379.7, "completions/mean_terminated_length": 379.7, "completions/min_length": 259.8, "completions/min_terminated_length": 259.8, "epoch": 0.08950874271440466, "grad_norm": 0.15437642494043818, "kl": 0.0384033203125, "learning_rate": 9.804521948214671e-07, "loss": 0.0015, "num_tokens": 11187162.0, "reward": 1.821875, "reward_std": 0.13258251920342445, "rewards/accuracy_reward/mean": 0.8, "rewards/accuracy_reward/std": 0.07071067690849304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.021875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06187184229493141, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.5, "completions/max_terminated_length": 500.5, "completions/mean_length": 393.1375, "completions/mean_terminated_length": 393.1375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.09159034138218151, "grad_norm": 0.2088322620701091, "kl": 0.0418701171875, "learning_rate": 9.795365933838946e-07, "loss": 0.0017, "num_tokens": 11460861.0, "reward": 1.9864583492279053, "reward_std": 0.09016691148281097, "rewards/accuracy_reward/mean": 0.95, "rewards/accuracy_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03645833432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03671465814113617, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 394.0875, "completions/mean_terminated_length": 394.0875, "completions/min_length": 301.1, "completions/min_terminated_length": 301.1, "epoch": 0.09367194004995837, "grad_norm": 0.14926889020190243, "kl": 0.03753662109375, "learning_rate": 9.786004843880663e-07, "loss": 0.0015, "num_tokens": 11682516.0, "reward": 2.0981250047683715, "reward_std": 0.056880544126033786, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.09812500476837158, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0568805381655693, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.5, "completions/max_terminated_length": 449.5, "completions/mean_length": 354.5375, "completions/mean_terminated_length": 354.5375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.09575353871773522, "grad_norm": 0.19510909785876593, "kl": 0.0388916015625, "learning_rate": 9.776439078670266e-07, "loss": 0.0016, "num_tokens": 11949799.0, "reward": 1.8666666746139526, "reward_std": 0.12705429196357726, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0471404530107975, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.5, "completions/max_terminated_length": 507.5, "completions/mean_length": 383.425, "completions/mean_terminated_length": 383.425, "completions/min_length": 281.3, "completions/min_terminated_length": 281.3, "epoch": 0.09783513738551207, "grad_norm": 5.56912057426005, "kl": 0.042724609375, "learning_rate": 9.766669047291212e-07, "loss": 0.0017, "num_tokens": 12211601.0, "reward": 1.8916666746139525, "reward_std": 0.08512316644191742, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.034930617362260816, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.1, "completions/max_terminated_length": 502.1, "completions/mean_length": 392.5125, "completions/mean_terminated_length": 392.5125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.09991673605328892, "grad_norm": 5.648352316551891, "kl": 0.043408203125, "learning_rate": 9.756695167562477e-07, "loss": 0.0017, "num_tokens": 12472202.0, "reward": 1.7966435194015502, "reward_std": 0.12664942545816302, "rewards/accuracy_reward/mean": 0.7643518518656492, "rewards/accuracy_reward/std": 0.05289790946990251, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03229166716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07375151664018631, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 370.5875, "completions/mean_terminated_length": 370.5875, "completions/min_length": 278.2, "completions/min_terminated_length": 278.2, "epoch": 0.10199833472106577, "grad_norm": 4.6762294422697215, "kl": 0.048583984375, "learning_rate": 9.746517866020685e-07, "loss": 0.0019, "num_tokens": 12731665.0, "reward": 1.7166666746139527, "reward_std": 0.19317471832036973, "rewards/accuracy_reward/mean": 0.7125, "rewards/accuracy_reward/std": 0.18138959705829621, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00416666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01178511455655098, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 346.7, "completions/mean_terminated_length": 346.7, "completions/min_length": 225.1, "completions/min_terminated_length": 225.1, "epoch": 0.10407993338884262, "grad_norm": 4.944784659074124, "kl": 0.0518310546875, "learning_rate": 9.736137577901864e-07, "loss": 0.0021, "num_tokens": 13005057.0, "reward": 1.8979166746139526, "reward_std": 0.12586160004138947, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.10606601536273956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010416667163372039, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019795581698417664, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.7, "completions/max_terminated_length": 476.7, "completions/mean_length": 332.4625, "completions/mean_terminated_length": 332.4625, "completions/min_length": 208.4, "completions/min_terminated_length": 208.4, "epoch": 0.10616153205661949, "grad_norm": 5.837744557276559, "kl": 0.0501220703125, "learning_rate": 9.725554747122847e-07, "loss": 0.002, "num_tokens": 13278646.0, "reward": 1.9418981552124024, "reward_std": 0.14538512378931046, "rewards/accuracy_reward/mean": 0.8939814820885659, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04791666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1100297823548317, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.6, "completions/max_terminated_length": 439.6, "completions/mean_length": 324.5875, "completions/mean_terminated_length": 324.5875, "completions/min_length": 196.9, "completions/min_terminated_length": 196.9, "epoch": 0.10824313072439634, "grad_norm": 5.582698319005468, "kl": 0.0571044921875, "learning_rate": 9.714769826262268e-07, "loss": 0.0023, "num_tokens": 13529733.0, "reward": 1.8927083492279053, "reward_std": 0.13453055396676064, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.08880758583545685, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03020833395421505, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04572295844554901, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 316.6875, "completions/mean_terminated_length": 316.6875, "completions/min_length": 196.7, "completions/min_terminated_length": 196.7, "epoch": 0.1103247293921732, "grad_norm": 0.29368782924063275, "kl": 0.0617919921875, "learning_rate": 9.703783276541226e-07, "loss": 0.0025, "num_tokens": 13802252.0, "reward": 2.009375, "reward_std": 0.07692329585552216, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.021875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.052874819934368135, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.7, "completions/max_terminated_length": 491.7, "completions/mean_length": 361.0, "completions/mean_terminated_length": 361.0, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.11240632805995004, "grad_norm": 0.12367376826560877, "kl": 0.0528076171875, "learning_rate": 9.69259556780355e-07, "loss": 0.0021, "num_tokens": 14057620.0, "reward": 1.76875, "reward_std": 0.17920753061771394, "rewards/accuracy_reward/mean": 0.7625, "rewards/accuracy_reward/std": 0.185156187415123, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 340.775, "completions/mean_terminated_length": 340.775, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.1144879267277269, "grad_norm": 0.13246155741271956, "kl": 0.05185546875, "learning_rate": 9.6812071784957e-07, "loss": 0.0021, "num_tokens": 14323042.0, "reward": 1.8, "reward_std": 0.14548112079501152, "rewards/accuracy_reward/mean": 0.7875, "rewards/accuracy_reward/std": 0.12793734967708587, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.027439431101083756, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.5, "completions/max_terminated_length": 452.5, "completions/mean_length": 353.1625, "completions/mean_terminated_length": 353.1625, "completions/min_length": 242.9, "completions/min_terminated_length": 242.9, "epoch": 0.11656952539550375, "grad_norm": 0.16536302838147435, "kl": 0.0464111328125, "learning_rate": 9.669618595646326e-07, "loss": 0.0019, "num_tokens": 14601999.0, "reward": 1.9729166746139526, "reward_std": 0.0812177062034607, "rewards/accuracy_reward/mean": 0.9625, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01041666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02946278378367424, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.8, "completions/max_terminated_length": 523.8, "completions/mean_length": 402.1625, "completions/mean_terminated_length": 402.1625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.1186511240632806, "grad_norm": 5.102191465543784, "kl": 0.044482421875, "learning_rate": 9.657830314845423e-07, "loss": 0.0018, "num_tokens": 14880164.0, "reward": 2.00625, "reward_std": 0.01767766922712326, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.7, "completions/max_terminated_length": 511.7, "completions/mean_length": 403.7875, "completions/mean_terminated_length": 403.7875, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.12073272273105745, "grad_norm": 0.17497369248614117, "kl": 0.040087890625, "learning_rate": 9.64584284022314e-07, "loss": 0.0016, "num_tokens": 15134971.0, "reward": 2.0322916746139525, "reward_std": 0.07375151813030242, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03229166716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07375151664018631, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.2, "completions/max_terminated_length": 521.2, "completions/mean_length": 390.7375, "completions/mean_terminated_length": 390.7375, "completions/min_length": 274.5, "completions/min_terminated_length": 274.5, "epoch": 0.1228143213988343, "grad_norm": 0.16337098172815034, "kl": 0.04287109375, "learning_rate": 9.633656684428226e-07, "loss": 0.0017, "num_tokens": 15387070.0, "reward": 1.9728573560714722, "reward_std": 0.16534992158412934, "rewards/accuracy_reward/mean": 0.9132739961147308, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.059583334252238274, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09020403549075126, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.4, "completions/max_terminated_length": 445.4, "completions/mean_length": 344.3875, "completions/mean_terminated_length": 344.3875, "completions/min_length": 234.9, "completions/min_terminated_length": 234.9, "epoch": 0.12489592006661115, "grad_norm": 0.19085037941157149, "kl": 0.0505859375, "learning_rate": 9.6212723686061e-07, "loss": 0.002, "num_tokens": 15656085.0, "reward": 2.0025000095367433, "reward_std": 0.07778174281120301, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.015000000037252903, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.042426406592130664, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.5, "completions/max_terminated_length": 516.5, "completions/mean_length": 376.7875, "completions/mean_terminated_length": 376.7875, "completions/min_length": 261.9, "completions/min_terminated_length": 261.9, "epoch": 0.126977518734388, "grad_norm": 0.18065584438130572, "kl": 0.04794921875, "learning_rate": 9.608690422376572e-07, "loss": 0.0019, "num_tokens": 15930868.0, "reward": 1.9028934240341187, "reward_std": 0.08380073457956314, "rewards/accuracy_reward/mean": 0.8815243899822235, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.021369047462940216, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0375097319483757, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.1, "completions/max_terminated_length": 498.1, "completions/mean_length": 373.925, "completions/mean_terminated_length": 373.925, "completions/min_length": 272.3, "completions/min_terminated_length": 272.3, "epoch": 0.12905911740216486, "grad_norm": 0.1464513024616147, "kl": 0.0505615234375, "learning_rate": 9.595911383811186e-07, "loss": 0.002, "num_tokens": 16202262.0, "reward": 1.5799382686614991, "reward_std": 0.18771235942840575, "rewards/accuracy_reward/mean": 0.5799382716417313, "rewards/accuracy_reward/std": 0.18771235942840575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.6, "completions/max_terminated_length": 491.6, "completions/mean_length": 384.05, "completions/mean_terminated_length": 384.05, "completions/min_length": 271.8, "completions/min_terminated_length": 271.8, "epoch": 0.1311407160699417, "grad_norm": 5.53441289953858, "kl": 0.0549072265625, "learning_rate": 9.58293579941021e-07, "loss": 0.0022, "num_tokens": 16469482.0, "reward": 1.6495346546173095, "reward_std": 0.19612031616270542, "rewards/accuracy_reward/mean": 0.6130763038992881, "rewards/accuracy_reward/std": 0.13490437557920815, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03645833432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06681375280022621, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.3, "completions/max_terminated_length": 492.3, "completions/mean_length": 367.45, "completions/mean_terminated_length": 367.45, "completions/min_length": 274.3, "completions/min_terminated_length": 274.3, "epoch": 0.13322231473771856, "grad_norm": 5.668662679753432, "kl": 0.05673828125, "learning_rate": 9.56976422407927e-07, "loss": 0.0023, "num_tokens": 16731918.0, "reward": 1.7243749976158143, "reward_std": 0.11841271668672562, "rewards/accuracy_reward/mean": 0.7125, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02437499985098839, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.047702043503522876, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.3, "completions/max_terminated_length": 458.3, "completions/mean_length": 342.9625, "completions/mean_terminated_length": 342.9625, "completions/min_length": 239.3, "completions/min_terminated_length": 239.3, "epoch": 0.1353039134054954, "grad_norm": 0.18411771479742783, "kl": 0.053857421875, "learning_rate": 9.556397221105614e-07, "loss": 0.0022, "num_tokens": 17006411.0, "reward": 1.8183712363243103, "reward_std": 0.0936412863433361, "rewards/accuracy_reward/mean": 0.7871212124824524, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125000111758709, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04735027924180031, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.8, "completions/max_terminated_length": 436.8, "completions/mean_length": 349.0875, "completions/mean_terminated_length": 349.0875, "completions/min_length": 262.7, "completions/min_terminated_length": 262.7, "epoch": 0.13738551207327226, "grad_norm": 5.511871839955663, "kl": 0.0563720703125, "learning_rate": 9.542835362134027e-07, "loss": 0.0023, "num_tokens": 17261754.0, "reward": 2.063749980926514, "reward_std": 0.17790938653051852, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07624999973922968, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.15386090911924838, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.5, "completions/max_terminated_length": 428.5, "completions/mean_length": 331.75, "completions/mean_terminated_length": 331.75, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.13946711074104912, "grad_norm": 5.592099803836835, "kl": 0.0670654296875, "learning_rate": 9.529079227142383e-07, "loss": 0.0027, "num_tokens": 17534422.0, "reward": 1.973035740852356, "reward_std": 0.20074295550584792, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 0.15235702097415924, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0355357151478529, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07101892232894898, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.8, "completions/max_terminated_length": 480.8, "completions/mean_length": 370.5125, "completions/mean_terminated_length": 370.5125, "completions/min_length": 264.4, "completions/min_terminated_length": 264.4, "epoch": 0.14154870940882597, "grad_norm": 0.18265559151343294, "kl": 0.055810546875, "learning_rate": 9.515129404416833e-07, "loss": 0.0022, "num_tokens": 17771455.0, "reward": 2.057083344459534, "reward_std": 0.12538987398147583, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06958333496004343, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09003453925251961, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.2, "completions/max_terminated_length": 518.2, "completions/mean_length": 393.5, "completions/mean_terminated_length": 393.5, "completions/min_length": 295.4, "completions/min_terminated_length": 295.4, "epoch": 0.14363030807660282, "grad_norm": 4.6895976996574165, "kl": 0.05400390625, "learning_rate": 9.500986490526667e-07, "loss": 0.0022, "num_tokens": 18016639.0, "reward": 1.9352083563804627, "reward_std": 0.13558952510356903, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06020833440124988, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09784583821892738, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 546.2, "completions/max_terminated_length": 482.9, "completions/mean_length": 368.3875, "completions/mean_terminated_length": 359.7875, "completions/min_length": 275.2, "completions/min_terminated_length": 275.2, "epoch": 0.14571190674437967, "grad_norm": 0.3020241800583253, "kl": 0.057861328125, "learning_rate": 9.486651090298781e-07, "loss": 0.0023, "num_tokens": 18254198.0, "reward": 1.9729166746139526, "reward_std": 0.17288785427808762, "rewards/accuracy_reward/mean": 0.975, "rewards/accuracy_reward/std": 0.07071067690849304, "rewards/format_reward/mean": 0.975, "rewards/format_reward/std": 0.07071067690849304, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04459637701511383, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.8, "completions/max_terminated_length": 470.8, "completions/mean_length": 375.4125, "completions/mean_terminated_length": 375.4125, "completions/min_length": 293.4, "completions/min_terminated_length": 293.4, "epoch": 0.14779350541215652, "grad_norm": 4.671918257015997, "kl": 0.0577392578125, "learning_rate": 9.472123816791822e-07, "loss": 0.0023, "num_tokens": 18496943.0, "reward": 1.9375000238418578, "reward_std": 0.18562961220741273, "rewards/accuracy_reward/mean": 0.9125, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03750000111758709, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06862791702151298, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.1, "completions/max_terminated_length": 474.1, "completions/mean_length": 357.7625, "completions/mean_terminated_length": 357.7625, "completions/min_length": 270.7, "completions/min_terminated_length": 270.7, "epoch": 0.1498751040799334, "grad_norm": 5.427919347031838, "kl": 0.05341796875, "learning_rate": 9.457405291269969e-07, "loss": 0.0021, "num_tokens": 18735012.0, "reward": 1.9375, "reward_std": 0.0816463440656662, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.4, "completions/max_terminated_length": 456.4, "completions/mean_length": 330.775, "completions/mean_terminated_length": 330.775, "completions/min_length": 240.2, "completions/min_terminated_length": 240.2, "epoch": 0.15195670274771025, "grad_norm": 0.16748703582364957, "kl": 0.05107421875, "learning_rate": 9.442496143176363e-07, "loss": 0.002, "num_tokens": 18980186.0, "reward": 1.829650616645813, "reward_std": 0.11030184328556061, "rewards/accuracy_reward/mean": 0.8088172636926174, "rewards/accuracy_reward/std": 0.07071067690849304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.020833334326744078, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03959116339683533, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.7, "completions/max_terminated_length": 496.7, "completions/mean_length": 382.625, "completions/mean_terminated_length": 382.625, "completions/min_length": 263.4, "completions/min_terminated_length": 263.4, "epoch": 0.1540383014154871, "grad_norm": 5.032838834280366, "kl": 0.054248046875, "learning_rate": 9.427397010106189e-07, "loss": 0.0022, "num_tokens": 19251148.0, "reward": 1.7395833492279054, "reward_std": 0.08753891214728356, "rewards/accuracy_reward/mean": 0.725, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01458333358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04124789834022522, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.2, "completions/max_terminated_length": 537.2, "completions/mean_length": 413.0125, "completions/mean_terminated_length": 413.0125, "completions/min_length": 303.7, "completions/min_terminated_length": 303.7, "epoch": 0.15611990008326396, "grad_norm": 0.19302912446240003, "kl": 0.0564208984375, "learning_rate": 9.412108537779411e-07, "loss": 0.0023, "num_tokens": 19464021.0, "reward": 1.8952083587646484, "reward_std": 0.16093288138508796, "rewards/accuracy_reward/mean": 0.825, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0702083345502615, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11464186012744904, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.6, "completions/max_terminated_length": 537.6, "completions/mean_length": 413.95, "completions/mean_terminated_length": 413.95, "completions/min_length": 327.7, "completions/min_terminated_length": 327.7, "epoch": 0.1582014987510408, "grad_norm": 0.145118164583351, "kl": 0.0509765625, "learning_rate": 9.396631380013151e-07, "loss": 0.002, "num_tokens": 19717945.0, "reward": 1.9875, "reward_std": 0.03535533845424652, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.7, "completions/max_terminated_length": 493.7, "completions/mean_length": 370.6375, "completions/mean_terminated_length": 370.6375, "completions/min_length": 269.5, "completions/min_terminated_length": 269.5, "epoch": 0.16028309741881766, "grad_norm": 0.20241676308921436, "kl": 0.0595703125, "learning_rate": 9.38096619869374e-07, "loss": 0.0024, "num_tokens": 19979964.0, "reward": 1.800843644142151, "reward_std": 0.18159112185239792, "rewards/accuracy_reward/mean": 0.7644894897937775, "rewards/accuracy_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04885416626930237, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10429937615990639, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.3, "completions/max_terminated_length": 475.3, "completions/mean_length": 370.35, "completions/mean_terminated_length": 370.35, "completions/min_length": 281.1, "completions/min_terminated_length": 281.1, "epoch": 0.1623646960865945, "grad_norm": 0.185958325228719, "kl": 0.0536865234375, "learning_rate": 9.365113663748398e-07, "loss": 0.0021, "num_tokens": 20248624.0, "reward": 1.9333333492279052, "reward_std": 0.08614101856946946, "rewards/accuracy_reward/mean": 0.9125, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02083333358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05078567415475845, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.9, "completions/max_terminated_length": 448.9, "completions/mean_length": 354.825, "completions/mean_terminated_length": 354.825, "completions/min_length": 242.5, "completions/min_terminated_length": 242.5, "epoch": 0.16444629475437136, "grad_norm": 0.1756506621150048, "kl": 0.0556396484375, "learning_rate": 9.349074453116597e-07, "loss": 0.0022, "num_tokens": 20502578.0, "reward": 1.7383797764778137, "reward_std": 0.1407657042145729, "rewards/accuracy_reward/mean": 0.7000464394688606, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03833333365619183, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06858938410878182, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.8, "completions/max_terminated_length": 495.8, "completions/mean_length": 377.775, "completions/mean_terminated_length": 377.775, "completions/min_length": 285.2, "completions/min_terminated_length": 285.2, "epoch": 0.16652789342214822, "grad_norm": 0.12984067526566828, "kl": 0.0520263671875, "learning_rate": 9.332849252721059e-07, "loss": 0.0021, "num_tokens": 20774224.0, "reward": 1.8625, "reward_std": 0.05175491571426392, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.9, "completions/max_terminated_length": 516.9, "completions/mean_length": 394.05, "completions/mean_terminated_length": 394.05, "completions/min_length": 287.7, "completions/min_terminated_length": 287.7, "epoch": 0.16860949208992507, "grad_norm": 0.1747884207833455, "kl": 0.0486083984375, "learning_rate": 9.316438756438429e-07, "loss": 0.0019, "num_tokens": 21028532.0, "reward": 1.9053641319274903, "reward_std": 0.04150375239551067, "rewards/accuracy_reward/mean": 0.9053641200065613, "rewards/accuracy_reward/std": 0.04150375053286552, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.8, "completions/max_terminated_length": 489.8, "completions/mean_length": 340.1, "completions/mean_terminated_length": 340.1, "completions/min_length": 230.9, "completions/min_terminated_length": 230.9, "epoch": 0.17069109075770192, "grad_norm": 4.988214414987532, "kl": 0.0572509765625, "learning_rate": 9.299843666069601e-07, "loss": 0.0023, "num_tokens": 21294116.0, "reward": 1.9336805582046508, "reward_std": 0.2032353922724724, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.07071067690849304, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0711805559694767, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11472872197628022, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.4, "completions/max_terminated_length": 425.4, "completions/mean_length": 318.6875, "completions/mean_terminated_length": 318.6875, "completions/min_length": 221.7, "completions/min_terminated_length": 221.7, "epoch": 0.17277268942547877, "grad_norm": 0.23283062060027884, "kl": 0.0542236328125, "learning_rate": 9.283064691309696e-07, "loss": 0.0022, "num_tokens": 21558139.0, "reward": 1.8541666746139527, "reward_std": 0.10436713248491288, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.09258201122283935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00416666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01178511455655098, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.3, "completions/max_terminated_length": 464.3, "completions/mean_length": 358.775, "completions/mean_terminated_length": 358.775, "completions/min_length": 262.1, "completions/min_terminated_length": 262.1, "epoch": 0.17485428809325562, "grad_norm": 5.329373226907204, "kl": 0.0521240234375, "learning_rate": 9.266102549717725e-07, "loss": 0.0021, "num_tokens": 21820729.0, "reward": 1.728125, "reward_std": 0.07954951077699661, "rewards/accuracy_reward/mean": 0.7125, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.015625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.044194172322750094, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.2, "completions/max_terminated_length": 471.2, "completions/mean_length": 367.5, "completions/mean_terminated_length": 367.5, "completions/min_length": 269.8, "completions/min_terminated_length": 269.8, "epoch": 0.17693588676103247, "grad_norm": 5.154771321049736, "kl": 0.0549072265625, "learning_rate": 9.248957966685891e-07, "loss": 0.0022, "num_tokens": 22088897.0, "reward": 1.85, "reward_std": 0.24768393635749816, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.1687566041946411, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0375, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10606601536273956, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.3, "completions/max_terminated_length": 474.3, "completions/mean_length": 367.5125, "completions/mean_terminated_length": 367.5125, "completions/min_length": 280.7, "completions/min_terminated_length": 280.7, "epoch": 0.17901748542880933, "grad_norm": 0.18589784255518843, "kl": 0.05771484375, "learning_rate": 9.231631675408574e-07, "loss": 0.0023, "num_tokens": 22350946.0, "reward": 2.025, "reward_std": 0.046291005611419675, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.046291005611419675, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.7, "completions/max_terminated_length": 517.7, "completions/mean_length": 390.7375, "completions/mean_terminated_length": 390.7375, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.18109908409658618, "grad_norm": 0.1660448340192837, "kl": 0.054931640625, "learning_rate": 9.214124416850976e-07, "loss": 0.0022, "num_tokens": 22609893.0, "reward": 1.9479166746139527, "reward_std": 0.0812177062034607, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01041666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02946278378367424, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.9, "completions/max_terminated_length": 506.9, "completions/mean_length": 404.5, "completions/mean_terminated_length": 404.5, "completions/min_length": 308.9, "completions/min_terminated_length": 308.9, "epoch": 0.18318068276436303, "grad_norm": 0.2285617215764742, "kl": 0.0521240234375, "learning_rate": 9.196436939717427e-07, "loss": 0.0021, "num_tokens": 22887917.0, "reward": 2.008333349227905, "reward_std": 0.023570242524147033, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00833333358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02357022911310196, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 408.55, "completions/mean_terminated_length": 408.55, "completions/min_length": 278.2, "completions/min_terminated_length": 278.2, "epoch": 0.18526228143213988, "grad_norm": 0.15322648795694802, "kl": 0.0478515625, "learning_rate": 9.178570000419372e-07, "loss": 0.0019, "num_tokens": 23155369.0, "reward": 1.9081249952316284, "reward_std": 0.015569546818733215, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.008124999701976776, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.015569545328617096, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 417.725, "completions/mean_terminated_length": 417.725, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.18734388009991673, "grad_norm": 0.13314228532582625, "kl": 0.0444580078125, "learning_rate": 9.160524363043022e-07, "loss": 0.0018, "num_tokens": 23428683.0, "reward": 1.93125, "reward_std": 0.06396867483854293, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.6, "completions/max_terminated_length": 544.6, "completions/mean_length": 408.4, "completions/mean_terminated_length": 408.4, "completions/min_length": 313.4, "completions/min_terminated_length": 313.4, "epoch": 0.18942547876769358, "grad_norm": 0.1512692723591993, "kl": 0.0454833984375, "learning_rate": 9.14230079931668e-07, "loss": 0.0018, "num_tokens": 23665659.0, "reward": 1.90625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05303300768136978, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 459.6625, "completions/mean_terminated_length": 459.6625, "completions/min_length": 346.2, "completions/min_terminated_length": 346.2, "epoch": 0.19150707743547044, "grad_norm": 0.20503774244974085, "kl": 0.051025390625, "learning_rate": 9.123900088577726e-07, "loss": 0.002, "num_tokens": 23943032.0, "reward": 1.883680558204651, "reward_std": 0.12932045757770538, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02118055559694767, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04767410829663277, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 431.025, "completions/mean_terminated_length": 431.025, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.1935886761032473, "grad_norm": 0.21587849440014878, "kl": 0.0527587890625, "learning_rate": 9.105323017739304e-07, "loss": 0.0021, "num_tokens": 24210330.0, "reward": 1.88125, "reward_std": 0.12246951609849929, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0408231720328331, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.5, "completions/max_terminated_length": 530.5, "completions/mean_length": 398.3375, "completions/mean_terminated_length": 398.3375, "completions/min_length": 286.4, "completions/min_terminated_length": 286.4, "epoch": 0.19567027477102414, "grad_norm": 4.54774855713076, "kl": 0.05244140625, "learning_rate": 9.086570381256662e-07, "loss": 0.0021, "num_tokens": 24472445.0, "reward": 1.8766865015029908, "reward_std": 0.10292123556137085, "rewards/accuracy_reward/mean": 0.8543650805950165, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02232142835855484, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05663023442029953, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.4, "completions/max_terminated_length": 532.4, "completions/mean_length": 420.3625, "completions/mean_terminated_length": 420.3625, "completions/min_length": 320.3, "completions/min_terminated_length": 320.3, "epoch": 0.197751873438801, "grad_norm": 4.543023975771067, "kl": 0.05146484375, "learning_rate": 9.067642981093174e-07, "loss": 0.0021, "num_tokens": 24737890.0, "reward": 1.9541666746139525, "reward_std": 0.1006326362490654, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02916666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05434163063764572, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.6, "completions/max_terminated_length": 610.6, "completions/mean_length": 474.5625, "completions/mean_terminated_length": 474.5625, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.19983347210657784, "grad_norm": 4.854188023647615, "kl": 0.050244140625, "learning_rate": 9.048541626686046e-07, "loss": 0.002, "num_tokens": 25006047.0, "reward": 1.7625, "reward_std": 0.22051936089992524, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.1851640224456787, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.6, "completions/max_terminated_length": 564.6, "completions/mean_length": 441.7375, "completions/mean_terminated_length": 441.7375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.2019150707743547, "grad_norm": 0.19432002582692567, "kl": 0.0546142578125, "learning_rate": 9.029267134911708e-07, "loss": 0.0022, "num_tokens": 25285434.0, "reward": 1.6375, "reward_std": 0.05175491571426392, "rewards/accuracy_reward/mean": 0.6375, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.1, "completions/max_terminated_length": 585.1, "completions/mean_length": 437.9375, "completions/mean_terminated_length": 437.9375, "completions/min_length": 310.5, "completions/min_terminated_length": 310.5, "epoch": 0.20399666944213155, "grad_norm": 4.642014207555178, "kl": 0.0569580078125, "learning_rate": 9.009820330050866e-07, "loss": 0.0023, "num_tokens": 25557781.0, "reward": 1.8791666746139526, "reward_std": 0.27757782191038133, "rewards/accuracy_reward/mean": 0.8375, "rewards/accuracy_reward/std": 0.1957120805978775, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0416666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09299983680248261, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.3, "completions/max_terminated_length": 495.3, "completions/mean_length": 391.825, "completions/mean_terminated_length": 391.825, "completions/min_length": 268.7, "completions/min_terminated_length": 268.7, "epoch": 0.2060782681099084, "grad_norm": 0.2008078809099005, "kl": 0.0586181640625, "learning_rate": 8.990202043753261e-07, "loss": 0.0023, "num_tokens": 25833711.0, "reward": 1.6260989665985108, "reward_std": 0.10520716309547425, "rewards/accuracy_reward/mean": 0.626098969578743, "rewards/accuracy_reward/std": 0.10520716905593872, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.6, "completions/max_terminated_length": 591.6, "completions/mean_length": 453.2875, "completions/mean_terminated_length": 453.2875, "completions/min_length": 334.4, "completions/min_terminated_length": 334.4, "epoch": 0.20815986677768525, "grad_norm": 0.25846319187573186, "kl": 0.058935546875, "learning_rate": 8.9704131150021e-07, "loss": 0.0024, "num_tokens": 26111030.0, "reward": 1.853125, "reward_std": 0.133001758903265, "rewards/accuracy_reward/mean": 0.8375, "rewards/accuracy_reward/std": 0.08880758583545685, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.015625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04419417306780815, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.2, "completions/max_terminated_length": 508.2, "completions/mean_length": 393.5875, "completions/mean_terminated_length": 393.5875, "completions/min_length": 281.8, "completions/min_terminated_length": 281.8, "epoch": 0.21024146544546213, "grad_norm": 3.665098494767977, "kl": 0.0560546875, "learning_rate": 8.950454390078177e-07, "loss": 0.0022, "num_tokens": 26357917.0, "reward": 1.9770833253860474, "reward_std": 0.07685204781591892, "rewards/accuracy_reward/mean": 0.9625, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.014583333395421505, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03514297790825367, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 415.25, "completions/mean_terminated_length": 415.25, "completions/min_length": 316.6, "completions/min_terminated_length": 316.6, "epoch": 0.21232306411323898, "grad_norm": 4.557729215755654, "kl": 0.0542724609375, "learning_rate": 8.930326722523685e-07, "loss": 0.0022, "num_tokens": 26629521.0, "reward": 1.9354166746139527, "reward_std": 0.06791418939828872, "rewards/accuracy_reward/mean": 0.9125, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03255883827805519, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 411.9375, "completions/mean_terminated_length": 411.9375, "completions/min_length": 260.7, "completions/min_terminated_length": 260.7, "epoch": 0.21440466278101583, "grad_norm": 0.13656288522156476, "kl": 0.052783203125, "learning_rate": 8.910030973105705e-07, "loss": 0.0021, "num_tokens": 26886244.0, "reward": 1.9420833587646484, "reward_std": 0.08647008240222931, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.017083333618938924, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.040179073065519336, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.4, "completions/max_terminated_length": 538.4, "completions/mean_length": 409.45, "completions/mean_terminated_length": 409.45, "completions/min_length": 304.9, "completions/min_terminated_length": 304.9, "epoch": 0.21648626144879268, "grad_norm": 0.15406730218953882, "kl": 0.0578125, "learning_rate": 8.889568009779402e-07, "loss": 0.0023, "num_tokens": 27159272.0, "reward": 1.975, "reward_std": 0.11898414641618729, "rewards/accuracy_reward/mean": 0.9625, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.043555130064487454, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.6, "completions/max_terminated_length": 521.6, "completions/mean_length": 393.9, "completions/mean_terminated_length": 393.9, "completions/min_length": 285.3, "completions/min_terminated_length": 285.3, "epoch": 0.21856786011656953, "grad_norm": 4.232997152833753, "kl": 0.0549072265625, "learning_rate": 8.868938707650907e-07, "loss": 0.0022, "num_tokens": 27409456.0, "reward": 1.9552083492279053, "reward_std": 0.0955127865076065, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0552083358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09551278054714203, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 411.0875, "completions/mean_terminated_length": 411.0875, "completions/min_length": 298.4, "completions/min_terminated_length": 298.4, "epoch": 0.2206494587843464, "grad_norm": 0.1563578337439989, "kl": 0.056884765625, "learning_rate": 8.848143948939892e-07, "loss": 0.0023, "num_tokens": 27669359.0, "reward": 1.8354166746139526, "reward_std": 0.09050626158714295, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.055150920152664186, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.4, "completions/max_terminated_length": 520.4, "completions/mean_length": 376.4875, "completions/mean_terminated_length": 376.4875, "completions/min_length": 271.2, "completions/min_terminated_length": 271.2, "epoch": 0.22273105745212324, "grad_norm": 4.773738463091718, "kl": 0.061328125, "learning_rate": 8.827184622941835e-07, "loss": 0.0025, "num_tokens": 27930574.0, "reward": 1.8539583444595338, "reward_std": 0.10452117174863815, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04145833365619182, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06916583105921745, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 406.65, "completions/mean_terminated_length": 406.65, "completions/min_length": 298.1, "completions/min_terminated_length": 298.1, "epoch": 0.2248126561199001, "grad_norm": 4.74427330850803, "kl": 0.0505615234375, "learning_rate": 8.806061625990002e-07, "loss": 0.002, "num_tokens": 28201482.0, "reward": 1.7759617567062378, "reward_std": 0.10142084583640099, "rewards/accuracy_reward/mean": 0.7728367522358894, "rewards/accuracy_reward/std": 0.09258201122283935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.003125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.00883883461356163, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.7, "completions/max_terminated_length": 482.7, "completions/mean_length": 365.625, "completions/mean_terminated_length": 365.625, "completions/min_length": 251.1, "completions/min_terminated_length": 251.1, "epoch": 0.22689425478767694, "grad_norm": 0.24058862628684932, "kl": 0.0642822265625, "learning_rate": 8.784775861417099e-07, "loss": 0.0026, "num_tokens": 28466388.0, "reward": 1.909375, "reward_std": 0.2374896600842476, "rewards/accuracy_reward/mean": 0.8375, "rewards/accuracy_reward/std": 0.12793734967708587, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.071875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.12523438036441803, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 346.65, "completions/mean_terminated_length": 346.65, "completions/min_length": 252.7, "completions/min_terminated_length": 252.7, "epoch": 0.2289758534554538, "grad_norm": 4.515835434457469, "kl": 0.0667724609375, "learning_rate": 8.763328239516656e-07, "loss": 0.0027, "num_tokens": 28738368.0, "reward": 2.0379166841506957, "reward_std": 0.09141271561384201, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03791666682809591, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0914127141237259, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.4, "completions/max_terminated_length": 482.4, "completions/mean_length": 376.075, "completions/mean_terminated_length": 376.075, "completions/min_length": 260.6, "completions/min_terminated_length": 260.6, "epoch": 0.23105745212323064, "grad_norm": 0.23711930079187077, "kl": 0.0647216796875, "learning_rate": 8.741719677504088e-07, "loss": 0.0026, "num_tokens": 28987782.0, "reward": 1.8104339241981506, "reward_std": 0.1613641142845154, "rewards/accuracy_reward/mean": 0.7791839212179184, "rewards/accuracy_reward/std": 0.08880758583545685, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07255653142929078, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.8, "completions/max_terminated_length": 503.8, "completions/mean_length": 378.325, "completions/mean_terminated_length": 378.325, "completions/min_length": 277.4, "completions/min_terminated_length": 277.4, "epoch": 0.2331390507910075, "grad_norm": 4.704217859900306, "kl": 0.063037109375, "learning_rate": 8.719951099477472e-07, "loss": 0.0025, "num_tokens": 29264176.0, "reward": 1.8729166746139527, "reward_std": 0.11827037632465362, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06481812223792076, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.2, "completions/max_terminated_length": 480.2, "completions/mean_length": 363.95, "completions/mean_terminated_length": 363.95, "completions/min_length": 251.7, "completions/min_terminated_length": 251.7, "epoch": 0.23522064945878435, "grad_norm": 4.448082281129015, "kl": 0.0650634765625, "learning_rate": 8.698023436378028e-07, "loss": 0.0026, "num_tokens": 29535068.0, "reward": 1.775, "reward_std": 0.17422050833702088, "rewards/accuracy_reward/mean": 0.775, "rewards/accuracy_reward/std": 0.17422052025794982, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.7, "completions/max_terminated_length": 488.7, "completions/mean_length": 356.075, "completions/mean_terminated_length": 356.075, "completions/min_length": 263.8, "completions/min_terminated_length": 263.8, "epoch": 0.2373022481265612, "grad_norm": 0.16881450891644154, "kl": 0.0620849609375, "learning_rate": 8.675937625950312e-07, "loss": 0.0025, "num_tokens": 29766794.0, "reward": 1.9022201299667358, "reward_std": 0.09568586796522141, "rewards/accuracy_reward/mean": 0.8511784493923187, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05104166865348816, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09568586498498917, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.3, "completions/max_terminated_length": 472.3, "completions/mean_length": 368.425, "completions/mean_terminated_length": 368.425, "completions/min_length": 268.7, "completions/min_terminated_length": 268.7, "epoch": 0.23938384679433805, "grad_norm": 0.20587238677147113, "kl": 0.0624267578125, "learning_rate": 8.653694612702105e-07, "loss": 0.0025, "num_tokens": 30015268.0, "reward": 1.7605624437332152, "reward_std": 0.1828530788421631, "rewards/accuracy_reward/mean": 0.7288957685232162, "rewards/accuracy_reward/std": 0.11700168251991272, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03166666720062494, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06585140451788903, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.6, "completions/max_terminated_length": 553.6, "completions/mean_length": 418.95, "completions/mean_terminated_length": 418.95, "completions/min_length": 299.9, "completions/min_terminated_length": 299.9, "epoch": 0.2414654454621149, "grad_norm": 4.886301286542531, "kl": 0.06640625, "learning_rate": 8.631295347864023e-07, "loss": 0.0027, "num_tokens": 30292240.0, "reward": 1.9337500095367433, "reward_std": 0.16898389756679535, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07125000059604644, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08733755946159363, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.5, "completions/max_terminated_length": 530.5, "completions/mean_length": 372.5, "completions/mean_terminated_length": 372.5, "completions/min_length": 269.5, "completions/min_terminated_length": 269.5, "epoch": 0.24354704412989175, "grad_norm": 0.1671711093190659, "kl": 0.0646484375, "learning_rate": 8.608740789348843e-07, "loss": 0.0026, "num_tokens": 30557720.0, "reward": 1.8916666746139525, "reward_std": 0.18054171949625014, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07916666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.14518638029694558, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.4, "completions/max_terminated_length": 545.4, "completions/mean_length": 404.7, "completions/mean_terminated_length": 404.7, "completions/min_length": 287.3, "completions/min_terminated_length": 287.3, "epoch": 0.2456286427976686, "grad_norm": 4.336197463116693, "kl": 0.0661865234375, "learning_rate": 8.586031901710526e-07, "loss": 0.0026, "num_tokens": 30815472.0, "reward": 2.0014583349227903, "reward_std": 0.10190928652882576, "rewards/accuracy_reward/mean": 0.975, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.026458334550261496, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.055618280172348024, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.3, "completions/max_terminated_length": 536.3, "completions/mean_length": 403.5625, "completions/mean_terminated_length": 403.5625, "completions/min_length": 303.8, "completions/min_terminated_length": 303.8, "epoch": 0.24771024146544546, "grad_norm": 0.20563799179064923, "kl": 0.05634765625, "learning_rate": 8.563169656102984e-07, "loss": 0.0023, "num_tokens": 31074925.0, "reward": 2.0166666746139525, "reward_std": 0.03563483655452728, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03563483357429505, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.6, "completions/max_terminated_length": 486.6, "completions/mean_length": 372.4, "completions/mean_terminated_length": 372.4, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.2497918401332223, "grad_norm": 4.6014497468287345, "kl": 0.062255859375, "learning_rate": 8.540155030238532e-07, "loss": 0.0025, "num_tokens": 31344357.0, "reward": 2.018750023841858, "reward_std": 0.10257088243961335, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125000111758709, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06721552982926368, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.8, "completions/max_terminated_length": 493.8, "completions/mean_length": 380.4875, "completions/mean_terminated_length": 380.4875, "completions/min_length": 274.5, "completions/min_terminated_length": 274.5, "epoch": 0.25187343880099916, "grad_norm": 0.21384116814461035, "kl": 0.06572265625, "learning_rate": 8.516989008346083e-07, "loss": 0.0026, "num_tokens": 31610260.0, "reward": 1.8972916841506957, "reward_std": 0.15455301925539972, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04729166775941849, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10088659450411797, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.5, "completions/max_terminated_length": 581.5, "completions/mean_length": 413.1125, "completions/mean_terminated_length": 413.1125, "completions/min_length": 293.6, "completions/min_terminated_length": 293.6, "epoch": 0.253955037468776, "grad_norm": 5.141278583858697, "kl": 0.0589111328125, "learning_rate": 8.493672581129058e-07, "loss": 0.0024, "num_tokens": 31874765.0, "reward": 1.928125, "reward_std": 0.1801423728466034, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.1334012657403946, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.028125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05825847387313843, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.4, "completions/max_terminated_length": 644.4, "completions/mean_length": 464.075, "completions/mean_terminated_length": 464.075, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.25603663613655286, "grad_norm": 0.16354981078889747, "kl": 0.0588134765625, "learning_rate": 8.470206745723017e-07, "loss": 0.0024, "num_tokens": 32135427.0, "reward": 1.9770833492279052, "reward_std": 0.1731685608625412, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05208333544433117, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09118002727627754, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.2, "completions/max_terminated_length": 530.2, "completions/mean_length": 423.5625, "completions/mean_terminated_length": 423.5625, "completions/min_length": 319.8, "completions/min_terminated_length": 319.8, "epoch": 0.2581182348043297, "grad_norm": 0.19910532213269125, "kl": 0.0617431640625, "learning_rate": 8.446592505653017e-07, "loss": 0.0025, "num_tokens": 32356120.0, "reward": 1.8947438836097716, "reward_std": 0.08823015540838242, "rewards/accuracy_reward/mean": 0.8710831701755524, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02366071417927742, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.052874819934368135, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.4, "completions/max_terminated_length": 575.4, "completions/mean_length": 432.0625, "completions/mean_terminated_length": 432.0625, "completions/min_length": 317.1, "completions/min_terminated_length": 317.1, "epoch": 0.26019983347210657, "grad_norm": 0.15896187258464695, "kl": 0.0609375, "learning_rate": 8.422830870790692e-07, "loss": 0.0024, "num_tokens": 32630189.0, "reward": 1.851994562149048, "reward_std": 0.045456858724355696, "rewards/accuracy_reward/mean": 0.8359231412410736, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01607142873108387, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.045456863939762115, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 404.2625, "completions/mean_terminated_length": 404.2625, "completions/min_length": 292.4, "completions/min_terminated_length": 292.4, "epoch": 0.2622814321398834, "grad_norm": 0.18398079197328773, "kl": 0.061572265625, "learning_rate": 8.39892285731107e-07, "loss": 0.0025, "num_tokens": 32885290.0, "reward": 2.0916666746139527, "reward_std": 0.17030038088560104, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.10416666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.134945035725832, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.3, "completions/max_terminated_length": 475.3, "completions/mean_length": 373.975, "completions/mean_terminated_length": 373.975, "completions/min_length": 245.1, "completions/min_terminated_length": 245.1, "epoch": 0.26436303080766027, "grad_norm": 0.17496923546167426, "kl": 0.0584228515625, "learning_rate": 8.374869487649116e-07, "loss": 0.0023, "num_tokens": 33162864.0, "reward": 1.9354166746139527, "reward_std": 0.10017346739768981, "rewards/accuracy_reward/mean": 0.9125, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06481812223792076, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.4, "completions/max_terminated_length": 470.4, "completions/mean_length": 346.4375, "completions/mean_terminated_length": 346.4375, "completions/min_length": 230.7, "completions/min_terminated_length": 230.7, "epoch": 0.2664446294754371, "grad_norm": 0.20407585116288846, "kl": 0.0647705078125, "learning_rate": 8.350671790456003e-07, "loss": 0.0026, "num_tokens": 33433515.0, "reward": 1.9375, "reward_std": 0.05175491571426392, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.5, "completions/max_terminated_length": 490.5, "completions/mean_length": 355.9625, "completions/mean_terminated_length": 355.9625, "completions/min_length": 251.3, "completions/min_terminated_length": 251.3, "epoch": 0.268526228143214, "grad_norm": 0.2047612335149913, "kl": 0.0600830078125, "learning_rate": 8.326330800555123e-07, "loss": 0.0024, "num_tokens": 33692176.0, "reward": 1.881944465637207, "reward_std": 0.1617008775472641, "rewards/accuracy_reward/mean": 0.8486111111938953, "rewards/accuracy_reward/std": 0.12246559858322144, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03333333432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07071067690849304, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.7, "completions/max_terminated_length": 462.7, "completions/mean_length": 349.3375, "completions/mean_terminated_length": 349.3375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.2706078268109908, "grad_norm": 5.268008904357831, "kl": 0.0579833984375, "learning_rate": 8.301847558897836e-07, "loss": 0.0023, "num_tokens": 33953955.0, "reward": 1.900000023841858, "reward_std": 0.2489195354282856, "rewards/accuracy_reward/mean": 0.7875, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.11250000409781932, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1672731988132, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.4, "completions/max_terminated_length": 464.4, "completions/mean_length": 354.8, "completions/mean_terminated_length": 354.8, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.2726894254787677, "grad_norm": 0.3512517591663718, "kl": 0.064794921875, "learning_rate": 8.27722311251895e-07, "loss": 0.0026, "num_tokens": 34226955.0, "reward": 2.015625, "reward_std": 0.03627826422452927, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.015625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03627826571464539, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 351.8125, "completions/mean_terminated_length": 351.8125, "completions/min_length": 255.2, "completions/min_terminated_length": 255.2, "epoch": 0.27477102414654453, "grad_norm": 0.16467258407693672, "kl": 0.06689453125, "learning_rate": 8.25245851449194e-07, "loss": 0.0027, "num_tokens": 34498260.0, "reward": 1.871875, "reward_std": 0.14545682817697525, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.09258201122283935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.021875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.052874819934368135, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.3, "completions/max_terminated_length": 471.3, "completions/mean_length": 360.8125, "completions/mean_terminated_length": 360.8125, "completions/min_length": 251.7, "completions/min_terminated_length": 251.7, "epoch": 0.2768526228143214, "grad_norm": 0.20861058312857134, "kl": 0.068603515625, "learning_rate": 8.227554823883925e-07, "loss": 0.0027, "num_tokens": 34769973.0, "reward": 1.91875, "reward_std": 0.07373789101839065, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07147541642189026, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.9, "completions/max_terminated_length": 491.9, "completions/mean_length": 368.7, "completions/mean_terminated_length": 368.7, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.27893422148209823, "grad_norm": 0.18466009019316756, "kl": 0.063134765625, "learning_rate": 8.202513105710365e-07, "loss": 0.0025, "num_tokens": 35023525.0, "reward": 1.8916666746139525, "reward_std": 0.04714045971632004, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00416666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01178511455655098, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.5, "completions/max_terminated_length": 487.5, "completions/mean_length": 365.75, "completions/mean_terminated_length": 365.75, "completions/min_length": 244.4, "completions/min_terminated_length": 244.4, "epoch": 0.2810158201498751, "grad_norm": 0.1978909757097866, "kl": 0.067431640625, "learning_rate": 8.17733443088952e-07, "loss": 0.0027, "num_tokens": 35281873.0, "reward": 1.8048369646072389, "reward_std": 0.15534310936927795, "rewards/accuracy_reward/mean": 0.791920292377472, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025416667200624944, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0437350295484066, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.2, "completions/max_terminated_length": 539.2, "completions/mean_length": 410.275, "completions/mean_terminated_length": 410.275, "completions/min_length": 317.2, "completions/min_terminated_length": 317.2, "epoch": 0.28309741881765194, "grad_norm": 0.16131378238013733, "kl": 0.060498046875, "learning_rate": 8.152019876196652e-07, "loss": 0.0024, "num_tokens": 35539495.0, "reward": 2.0125, "reward_std": 0.023145502805709837, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.023145502805709837, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.4, "completions/max_terminated_length": 490.4, "completions/mean_length": 375.1625, "completions/mean_terminated_length": 375.1625, "completions/min_length": 269.8, "completions/min_terminated_length": 269.8, "epoch": 0.2851790174854288, "grad_norm": 0.16965838916322384, "kl": 0.0564453125, "learning_rate": 8.126570524217972e-07, "loss": 0.0023, "num_tokens": 35810644.0, "reward": 2.003125, "reward_std": 0.00883883461356163, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.003125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.00883883461356163, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.7, "completions/max_terminated_length": 476.7, "completions/mean_length": 342.0125, "completions/mean_terminated_length": 342.0125, "completions/min_length": 234.7, "completions/min_terminated_length": 234.7, "epoch": 0.28726061615320564, "grad_norm": 5.830801542117954, "kl": 0.0645263671875, "learning_rate": 8.100987463304354e-07, "loss": 0.0026, "num_tokens": 36076309.0, "reward": 1.9291666746139526, "reward_std": 0.23552957624197007, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.10350984334945679, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06666666865348816, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09666440933942795, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.6, "completions/max_terminated_length": 419.6, "completions/mean_length": 318.6, "completions/mean_terminated_length": 318.6, "completions/min_length": 225.5, "completions/min_terminated_length": 225.5, "epoch": 0.2893422148209825, "grad_norm": 1.1801320667855104, "kl": 0.06484375, "learning_rate": 8.075271787524775e-07, "loss": 0.0026, "num_tokens": 36342709.0, "reward": 2.019166660308838, "reward_std": 0.048203670978546144, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.019166667014360428, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.048203660547733305, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.3, "completions/max_terminated_length": 457.3, "completions/mean_length": 336.9875, "completions/mean_terminated_length": 336.9875, "completions/min_length": 233.7, "completions/min_terminated_length": 233.7, "epoch": 0.29142381348875934, "grad_norm": 0.19315056407617454, "kl": 0.059814453125, "learning_rate": 8.049424596619543e-07, "loss": 0.0024, "num_tokens": 36608124.0, "reward": 1.8322916746139526, "reward_std": 0.07292233854532242, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01979166716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03756699562072754, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.7, "completions/max_terminated_length": 444.7, "completions/mean_length": 340.5375, "completions/mean_terminated_length": 340.5375, "completions/min_length": 244.1, "completions/min_terminated_length": 244.1, "epoch": 0.2935054121565362, "grad_norm": 0.19456407294224826, "kl": 0.0667724609375, "learning_rate": 8.023446995953251e-07, "loss": 0.0027, "num_tokens": 36878607.0, "reward": 1.8760416746139525, "reward_std": 0.11459305360913277, "rewards/accuracy_reward/mean": 0.825, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05104166865348816, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06830205097794532, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.4, "completions/max_terminated_length": 485.4, "completions/mean_length": 370.2, "completions/mean_terminated_length": 370.2, "completions/min_length": 262.5, "completions/min_terminated_length": 262.5, "epoch": 0.29558701082431305, "grad_norm": 0.22498138258141726, "kl": 0.0623291015625, "learning_rate": 7.99734009646752e-07, "loss": 0.0025, "num_tokens": 37139823.0, "reward": 1.7394469141960145, "reward_std": 0.12612001225352287, "rewards/accuracy_reward/mean": 0.7196552403271198, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01979166716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.044473668187856676, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.3, "completions/max_terminated_length": 529.3, "completions/mean_length": 398.975, "completions/mean_terminated_length": 398.975, "completions/min_length": 316.4, "completions/min_terminated_length": 316.4, "epoch": 0.29766860949208995, "grad_norm": 0.2608680560915561, "kl": 0.0619384765625, "learning_rate": 7.971105014633477e-07, "loss": 0.0025, "num_tokens": 37388309.0, "reward": 1.8760273218154908, "reward_std": 0.07142658531665802, "rewards/accuracy_reward/mean": 0.8609231412410736, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01510416716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.025135573744773865, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 521.5, "completions/max_terminated_length": 485.4, "completions/mean_length": 390.675, "completions/mean_terminated_length": 384.0750030517578, "completions/min_length": 311.4, "completions/min_terminated_length": 311.4, "epoch": 0.2997502081598668, "grad_norm": 0.21420320209665428, "kl": 0.067578125, "learning_rate": 7.94474287240402e-07, "loss": 0.0027, "num_tokens": 37614403.0, "reward": 2.065416693687439, "reward_std": 0.1892769455909729, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0904166653752327, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11856626570224763, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 398.1125, "completions/mean_terminated_length": 398.1125, "completions/min_length": 295.3, "completions/min_terminated_length": 295.3, "epoch": 0.30183180682764366, "grad_norm": 0.23938732824134978, "kl": 0.0615234375, "learning_rate": 7.918254797165824e-07, "loss": 0.0025, "num_tokens": 37860732.0, "reward": 1.9497023820877075, "reward_std": 0.09799748845398426, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.049702381156384944, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09799748659133911, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.8, "completions/max_terminated_length": 511.8, "completions/mean_length": 379.275, "completions/mean_terminated_length": 379.275, "completions/min_length": 279.9, "completions/min_terminated_length": 279.9, "epoch": 0.3039134054954205, "grad_norm": 4.628092242634228, "kl": 0.069482421875, "learning_rate": 7.891641921691144e-07, "loss": 0.0028, "num_tokens": 38129138.0, "reward": 1.8937500238418579, "reward_std": 0.17325654327869416, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04375000149011612, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0947420835494995, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.9, "completions/max_terminated_length": 461.9, "completions/mean_length": 373.3625, "completions/mean_terminated_length": 373.3625, "completions/min_length": 278.3, "completions/min_terminated_length": 278.3, "epoch": 0.30599500416319736, "grad_norm": 0.20059865536203564, "kl": 0.070068359375, "learning_rate": 7.864905384089354e-07, "loss": 0.0028, "num_tokens": 38387623.0, "reward": 1.98125, "reward_std": 0.12118750065565109, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04375, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06943259090185165, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.6, "completions/max_terminated_length": 472.6, "completions/mean_length": 342.3375, "completions/mean_terminated_length": 342.3375, "completions/min_length": 243.5, "completions/min_terminated_length": 243.5, "epoch": 0.3080766028309742, "grad_norm": 0.17844621860586748, "kl": 0.0716064453125, "learning_rate": 7.838046327758292e-07, "loss": 0.0029, "num_tokens": 38656674.0, "reward": 1.9824664831161498, "reward_std": 0.13602151721715927, "rewards/accuracy_reward/mean": 0.9543414890766144, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.028125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0716336041688919, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.5, "completions/max_terminated_length": 470.5, "completions/mean_length": 350.9875, "completions/mean_terminated_length": 350.9875, "completions/min_length": 256.5, "completions/min_terminated_length": 256.5, "epoch": 0.31015820149875106, "grad_norm": 0.22326710213332043, "kl": 0.0666259765625, "learning_rate": 7.811065901335347e-07, "loss": 0.0027, "num_tokens": 38927993.0, "reward": 1.9309977531433105, "reward_std": 0.044473668187856676, "rewards/accuracy_reward/mean": 0.9112060777842999, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01979166716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.044473668187856676, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.9, "completions/max_terminated_length": 456.9, "completions/mean_length": 359.5625, "completions/mean_terminated_length": 359.5625, "completions/min_length": 262.1, "completions/min_terminated_length": 262.1, "epoch": 0.3122398001665279, "grad_norm": 4.832125835729535, "kl": 0.0694091796875, "learning_rate": 7.783965258648353e-07, "loss": 0.0028, "num_tokens": 39202846.0, "reward": 2.0133333444595336, "reward_std": 0.08462430983781814, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025833333283662795, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04926896393299103, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.4, "completions/max_terminated_length": 460.4, "completions/mean_length": 365.575, "completions/mean_terminated_length": 365.575, "completions/min_length": 269.8, "completions/min_terminated_length": 269.8, "epoch": 0.31432139883430477, "grad_norm": 4.430038915172002, "kl": 0.0671142578125, "learning_rate": 7.756745558666229e-07, "loss": 0.0027, "num_tokens": 39458492.0, "reward": 1.9916666746139526, "reward_std": 0.13425071388483048, "rewards/accuracy_reward/mean": 0.9625, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02916666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08249579146504402, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.9, "completions/max_terminated_length": 443.9, "completions/mean_length": 354.75, "completions/mean_terminated_length": 354.75, "completions/min_length": 263.9, "completions/min_terminated_length": 263.9, "epoch": 0.3164029975020816, "grad_norm": 4.471834571798772, "kl": 0.065234375, "learning_rate": 7.729407965449426e-07, "loss": 0.0026, "num_tokens": 39732600.0, "reward": 1.9228125095367432, "reward_std": 0.06452349089086055, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0228125000372529, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06452349312603474, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.9, "completions/max_terminated_length": 485.9, "completions/mean_length": 384.075, "completions/mean_terminated_length": 384.075, "completions/min_length": 273.5, "completions/min_terminated_length": 273.5, "epoch": 0.31848459616985847, "grad_norm": 0.13877808954631102, "kl": 0.0644287109375, "learning_rate": 7.701953648100141e-07, "loss": 0.0026, "num_tokens": 40000910.0, "reward": 2.00625, "reward_std": 0.07255652844905854, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03720119297504425, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.9, "completions/max_terminated_length": 472.9, "completions/mean_length": 362.0875, "completions/mean_terminated_length": 362.0875, "completions/min_length": 236.1, "completions/min_terminated_length": 236.1, "epoch": 0.3205661948376353, "grad_norm": 4.941392135414514, "kl": 0.0575927734375, "learning_rate": 7.674383780712325e-07, "loss": 0.0023, "num_tokens": 40270589.0, "reward": 1.9339583396911622, "reward_std": 0.13531450778245926, "rewards/accuracy_reward/mean": 0.8816666670143605, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05229166746139526, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08902351260185241, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.5, "completions/max_terminated_length": 472.5, "completions/mean_length": 358.6625, "completions/mean_terminated_length": 358.6625, "completions/min_length": 261.9, "completions/min_terminated_length": 261.9, "epoch": 0.3226477935054122, "grad_norm": 0.20067276246483776, "kl": 0.0630859375, "learning_rate": 7.646699542321468e-07, "loss": 0.0025, "num_tokens": 40525298.0, "reward": 2.028541684150696, "reward_std": 0.06255088374018669, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02854166701436043, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06255088374018669, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.8, "completions/max_terminated_length": 404.8, "completions/mean_length": 305.4375, "completions/mean_terminated_length": 305.4375, "completions/min_length": 226.5, "completions/min_terminated_length": 226.5, "epoch": 0.324729392173189, "grad_norm": 0.23733834977909768, "kl": 0.0707275390625, "learning_rate": 7.618902116854171e-07, "loss": 0.0028, "num_tokens": 40790061.0, "reward": 1.944444465637207, "reward_std": 0.05745632499456406, "rewards/accuracy_reward/mean": 0.9111111111938953, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03333333432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05745632201433182, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.5, "completions/max_terminated_length": 398.5, "completions/mean_length": 289.0625, "completions/mean_terminated_length": 289.0625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.3268109908409659, "grad_norm": 5.195397649911, "kl": 0.06748046875, "learning_rate": 7.590992693077532e-07, "loss": 0.0027, "num_tokens": 41060354.0, "reward": 2.0458333492279053, "reward_std": 0.09181488454341888, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04583333432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09181488156318665, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.6, "completions/max_terminated_length": 401.6, "completions/mean_length": 322.5875, "completions/mean_terminated_length": 322.5875, "completions/min_length": 233.6, "completions/min_terminated_length": 233.6, "epoch": 0.3288925895087427, "grad_norm": 5.1645519441733745, "kl": 0.07841796875, "learning_rate": 7.56297246454829e-07, "loss": 0.0031, "num_tokens": 41323545.0, "reward": 1.9291666746139526, "reward_std": 0.08249579817056656, "rewards/accuracy_reward/mean": 0.9125, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0471404530107975, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 333.3875, "completions/mean_terminated_length": 333.3875, "completions/min_length": 243.5, "completions/min_terminated_length": 243.5, "epoch": 0.3309741881765196, "grad_norm": 0.1884882034576032, "kl": 0.0700927734375, "learning_rate": 7.534842629561791e-07, "loss": 0.0028, "num_tokens": 41586160.0, "reward": 1.8593750238418578, "reward_std": 0.09315259978175164, "rewards/accuracy_reward/mean": 0.8375, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02187500111758709, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04139767438173294, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.6, "completions/max_terminated_length": 504.6, "completions/mean_length": 363.3, "completions/mean_terminated_length": 363.3, "completions/min_length": 246.5, "completions/min_terminated_length": 246.5, "epoch": 0.33305578684429643, "grad_norm": 5.826370704218823, "kl": 0.0677001953125, "learning_rate": 7.506604391100748e-07, "loss": 0.0027, "num_tokens": 41847152.0, "reward": 1.8416666746139527, "reward_std": 0.0497533455491066, "rewards/accuracy_reward/mean": 0.8, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04166666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0497533343732357, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.9, "completions/max_terminated_length": 443.9, "completions/mean_length": 335.6875, "completions/mean_terminated_length": 335.6875, "completions/min_length": 254.6, "completions/min_terminated_length": 254.6, "epoch": 0.3351373855120733, "grad_norm": 4.972314064995392, "kl": 0.072509765625, "learning_rate": 7.478258956783781e-07, "loss": 0.0029, "num_tokens": 42112783.0, "reward": 2.008333349227905, "reward_std": 0.17427795231342316, "rewards/accuracy_reward/mean": 0.9625, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04583333432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09263160824775696, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.1, "completions/max_terminated_length": 423.1, "completions/mean_length": 339.675, "completions/mean_terminated_length": 339.675, "completions/min_length": 249.8, "completions/min_terminated_length": 249.8, "epoch": 0.33721898417985013, "grad_norm": 5.179177647842328, "kl": 0.068310546875, "learning_rate": 7.44980753881378e-07, "loss": 0.0027, "num_tokens": 42361445.0, "reward": 2.0614583492279053, "reward_std": 0.11628946885466576, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0614583358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11628946885466576, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.5, "completions/max_terminated_length": 485.5, "completions/mean_length": 388.5375, "completions/mean_terminated_length": 388.5375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.339300582847627, "grad_norm": 0.23406569969548927, "kl": 0.0636474609375, "learning_rate": 7.421251353926073e-07, "loss": 0.0025, "num_tokens": 42602776.0, "reward": 1.8666666746139526, "reward_std": 0.16328328996896743, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.1514981746673584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00416666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01178511455655098, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.8, "completions/max_terminated_length": 514.8, "completions/mean_length": 406.3125, "completions/mean_terminated_length": 406.3125, "completions/min_length": 305.1, "completions/min_terminated_length": 305.1, "epoch": 0.34138218151540384, "grad_norm": 0.23754229470691682, "kl": 0.067724609375, "learning_rate": 7.39259162333637e-07, "loss": 0.0027, "num_tokens": 42821513.0, "reward": 1.9469507694244386, "reward_std": 0.059881458431482314, "rewards/accuracy_reward/mean": 0.9075757578015328, "rewards/accuracy_reward/std": 0.0014284986071288585, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03937500081956387, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05958408713340759, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.4, "completions/max_terminated_length": 468.4, "completions/mean_length": 387.175, "completions/mean_terminated_length": 387.175, "completions/min_length": 313.5, "completions/min_terminated_length": 313.5, "epoch": 0.3434637801831807, "grad_norm": 0.2073197430914097, "kl": 0.07919921875, "learning_rate": 7.363829572688566e-07, "loss": 0.0032, "num_tokens": 43080103.0, "reward": 1.7793055534362794, "reward_std": 0.16691839694976807, "rewards/accuracy_reward/mean": 0.7555555552244186, "rewards/accuracy_reward/std": 0.09974325299263001, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.023750000074505805, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06717514395713806, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 428.1125, "completions/mean_terminated_length": 428.1125, "completions/min_length": 313.9, "completions/min_terminated_length": 313.9, "epoch": 0.34554537885095754, "grad_norm": 0.18312263174837315, "kl": 0.0740478515625, "learning_rate": 7.334966432002301e-07, "loss": 0.003, "num_tokens": 43326488.0, "reward": 1.8846933603286744, "reward_std": 0.10355282425880433, "rewards/accuracy_reward/mean": 0.8513600140810013, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03333333395421505, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05179789587855339, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.2, "completions/max_terminated_length": 568.2, "completions/mean_length": 429.7875, "completions/mean_terminated_length": 429.7875, "completions/min_length": 330.5, "completions/min_terminated_length": 330.5, "epoch": 0.3476269775187344, "grad_norm": 0.1688036787174455, "kl": 0.071240234375, "learning_rate": 7.30600343562037e-07, "loss": 0.0028, "num_tokens": 43604655.0, "reward": 1.785193634033203, "reward_std": 0.09075549244880676, "rewards/accuracy_reward/mean": 0.7685269489884377, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03900056481361389, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.7, "completions/max_terminated_length": 520.7, "completions/mean_length": 382.1125, "completions/mean_terminated_length": 382.1125, "completions/min_length": 284.6, "completions/min_terminated_length": 284.6, "epoch": 0.34970857618651124, "grad_norm": 0.1915634889403887, "kl": 0.07587890625, "learning_rate": 7.276941822155931e-07, "loss": 0.003, "num_tokens": 43879256.0, "reward": 1.83125, "reward_std": 0.11884753406047821, "rewards/accuracy_reward/mean": 0.825, "rewards/accuracy_reward/std": 0.11700168251991272, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.7, "completions/max_terminated_length": 562.7, "completions/mean_length": 421.9875, "completions/mean_terminated_length": 421.9875, "completions/min_length": 326.7, "completions/min_terminated_length": 326.7, "epoch": 0.3517901748542881, "grad_norm": 0.268720747030889, "kl": 0.066748046875, "learning_rate": 7.247782834439546e-07, "loss": 0.0027, "num_tokens": 44151383.0, "reward": 1.7979166746139525, "reward_std": 0.1014419287443161, "rewards/accuracy_reward/mean": 0.7875, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010416667163372039, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019795581698417664, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.1, "completions/max_terminated_length": 496.1, "completions/mean_length": 371.375, "completions/mean_terminated_length": 371.375, "completions/min_length": 280.7, "completions/min_terminated_length": 280.7, "epoch": 0.35387177352206495, "grad_norm": 4.259811558741672, "kl": 0.076416015625, "learning_rate": 7.218527719466013e-07, "loss": 0.0031, "num_tokens": 44415861.0, "reward": 1.7114583373069763, "reward_std": 0.1656905271112919, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03645833358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09497984722256661, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.2, "completions/max_terminated_length": 580.2, "completions/mean_length": 442.4125, "completions/mean_terminated_length": 442.4125, "completions/min_length": 340.7, "completions/min_terminated_length": 340.7, "epoch": 0.3559533721898418, "grad_norm": 0.1856302711758979, "kl": 0.06845703125, "learning_rate": 7.189177728341051e-07, "loss": 0.0027, "num_tokens": 44673238.0, "reward": 1.653946590423584, "reward_std": 0.1017528209136799, "rewards/accuracy_reward/mean": 0.6508215961977839, "rewards/accuracy_reward/std": 0.09291398625355214, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.003125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.00883883461356163, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.4, "completions/max_terminated_length": 508.4, "completions/mean_length": 400.5125, "completions/mean_terminated_length": 400.5125, "completions/min_length": 308.3, "completions/min_terminated_length": 308.3, "epoch": 0.35803497085761865, "grad_norm": 4.670734841619906, "kl": 0.0704345703125, "learning_rate": 7.159734116227795e-07, "loss": 0.0028, "num_tokens": 44950983.0, "reward": 1.9545833349227906, "reward_std": 0.12122158259153366, "rewards/accuracy_reward/mean": 0.9125, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04208333380520344, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08586623594164848, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.6, "completions/max_terminated_length": 589.6, "completions/mean_length": 448.6375, "completions/mean_terminated_length": 448.6375, "completions/min_length": 337.9, "completions/min_terminated_length": 337.9, "epoch": 0.3601165695253955, "grad_norm": 0.1816425284023795, "kl": 0.074951171875, "learning_rate": 7.130198142293112e-07, "loss": 0.003, "num_tokens": 45214626.0, "reward": 1.8920833468437195, "reward_std": 0.1463622696697712, "rewards/accuracy_reward/mean": 0.8, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.09208333436399699, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.14636227563023568, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.4, "completions/max_terminated_length": 600.4, "completions/mean_length": 460.5875, "completions/mean_terminated_length": 460.5875, "completions/min_length": 348.4, "completions/min_terminated_length": 348.4, "epoch": 0.36219816819317235, "grad_norm": 0.14018851586139344, "kl": 0.0724853515625, "learning_rate": 7.100571069653758e-07, "loss": 0.0029, "num_tokens": 45477105.0, "reward": 1.9604166746139526, "reward_std": 0.1159398838877678, "rewards/accuracy_reward/mean": 0.95, "rewards/accuracy_reward/std": 0.09258201122283935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01041666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0233578659594059, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.6, "completions/max_terminated_length": 563.6, "completions/mean_length": 434.2125, "completions/mean_terminated_length": 434.2125, "completions/min_length": 333.6, "completions/min_terminated_length": 333.6, "epoch": 0.3642797668609492, "grad_norm": 0.17898776181926912, "kl": 0.07880859375, "learning_rate": 7.07085416532236e-07, "loss": 0.0032, "num_tokens": 45759426.0, "reward": 1.862916684150696, "reward_std": 0.08998610377311707, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.012916666828095913, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03653385192155838, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.5, "completions/max_terminated_length": 630.5, "completions/mean_length": 460.35, "completions/mean_terminated_length": 460.35, "completions/min_length": 352.4, "completions/min_terminated_length": 352.4, "epoch": 0.36636136552872606, "grad_norm": 0.1815748637730769, "kl": 0.0739013671875, "learning_rate": 7.041048700153225e-07, "loss": 0.003, "num_tokens": 46035510.0, "reward": 1.803125, "reward_std": 0.1404043108224869, "rewards/accuracy_reward/mean": 0.7875, "rewards/accuracy_reward/std": 0.10520716905593872, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.015625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03519715070724487, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.7, "completions/max_terminated_length": 533.7, "completions/mean_length": 403.125, "completions/mean_terminated_length": 403.125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.3684429641965029, "grad_norm": 0.1846933022988794, "kl": 0.0753173828125, "learning_rate": 7.011155948788004e-07, "loss": 0.003, "num_tokens": 46302008.0, "reward": 1.89375, "reward_std": 0.09932401329278946, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.9, "completions/max_terminated_length": 505.9, "completions/mean_length": 361.125, "completions/mean_terminated_length": 361.125, "completions/min_length": 261.7, "completions/min_terminated_length": 261.7, "epoch": 0.37052456286427976, "grad_norm": 0.24274231022975615, "kl": 0.0776611328125, "learning_rate": 6.981177189601168e-07, "loss": 0.0031, "num_tokens": 46562450.0, "reward": 1.8025000095367432, "reward_std": 0.09913797974586487, "rewards/accuracy_reward/mean": 0.775, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02750000059604645, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07040632367134095, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.4, "completions/max_terminated_length": 481.4, "completions/mean_length": 378.1, "completions/mean_terminated_length": 378.1, "completions/min_length": 277.3, "completions/min_terminated_length": 277.3, "epoch": 0.3726061615320566, "grad_norm": 5.690030544072626, "kl": 0.0618408203125, "learning_rate": 6.951113704645347e-07, "loss": 0.0025, "num_tokens": 46828018.0, "reward": 1.9410416722297668, "reward_std": 0.10387353450059891, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.041041666828095916, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10387352779507637, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.3, "completions/max_terminated_length": 487.3, "completions/mean_length": 371.975, "completions/mean_terminated_length": 371.975, "completions/min_length": 259.7, "completions/min_terminated_length": 259.7, "epoch": 0.37468776019983346, "grad_norm": 0.22872967977661343, "kl": 0.0654296875, "learning_rate": 6.920966779596499e-07, "loss": 0.0026, "num_tokens": 47102656.0, "reward": 1.8604166746139525, "reward_std": 0.0851863980293274, "rewards/accuracy_reward/mean": 0.8375, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.053312502801418304, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 374.075, "completions/mean_terminated_length": 374.075, "completions/min_length": 266.2, "completions/min_terminated_length": 266.2, "epoch": 0.3767693588676103, "grad_norm": 0.24099285333188789, "kl": 0.06142578125, "learning_rate": 6.890737703698929e-07, "loss": 0.0025, "num_tokens": 47372798.0, "reward": 1.9625, "reward_std": 0.05175491571426392, "rewards/accuracy_reward/mean": 0.9625, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.9, "completions/max_terminated_length": 469.9, "completions/mean_length": 360.4625, "completions/mean_terminated_length": 360.4625, "completions/min_length": 260.6, "completions/min_terminated_length": 260.6, "epoch": 0.37885095753538717, "grad_norm": 0.18257577519312274, "kl": 0.0643310546875, "learning_rate": 6.860427769710151e-07, "loss": 0.0026, "num_tokens": 47636883.0, "reward": 1.959791660308838, "reward_std": 0.09345597624778748, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03479166626930237, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04716496616601944, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.2, "completions/max_terminated_length": 516.2, "completions/mean_length": 378.2375, "completions/mean_terminated_length": 378.2375, "completions/min_length": 260.1, "completions/min_terminated_length": 260.1, "epoch": 0.380932556203164, "grad_norm": 0.1648102773611354, "kl": 0.06181640625, "learning_rate": 6.830038273845607e-07, "loss": 0.0025, "num_tokens": 47887326.0, "reward": 2.092500019073486, "reward_std": 0.13325504809617997, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0925000011920929, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.13325505405664445, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.5, "completions/max_terminated_length": 487.5, "completions/mean_length": 377.3625, "completions/mean_terminated_length": 377.3625, "completions/min_length": 277.9, "completions/min_terminated_length": 277.9, "epoch": 0.38301415487094087, "grad_norm": 0.16638683505842794, "kl": 0.05546875, "learning_rate": 6.799570515723232e-07, "loss": 0.0022, "num_tokens": 48132531.0, "reward": 1.9729166746139526, "reward_std": 0.10860317051410676, "rewards/accuracy_reward/mean": 0.95, "rewards/accuracy_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.055150920152664186, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.7, "completions/max_terminated_length": 417.7, "completions/mean_length": 319.4625, "completions/mean_terminated_length": 319.4625, "completions/min_length": 228.9, "completions/min_terminated_length": 228.9, "epoch": 0.3850957535387177, "grad_norm": 6.452846913122507, "kl": 0.067578125, "learning_rate": 6.769025798307872e-07, "loss": 0.0027, "num_tokens": 48376000.0, "reward": 1.9947916746139527, "reward_std": 0.12939899265766144, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06979166865348815, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.083107990026474, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.8, "completions/max_terminated_length": 469.8, "completions/mean_length": 329.225, "completions/mean_terminated_length": 329.225, "completions/min_length": 214.2, "completions/min_terminated_length": 214.2, "epoch": 0.3871773522064946, "grad_norm": 0.1827940633944976, "kl": 0.0648193359375, "learning_rate": 6.738405427855569e-07, "loss": 0.0026, "num_tokens": 48642826.0, "reward": 1.7982400417327882, "reward_std": 0.14018386900424956, "rewards/accuracy_reward/mean": 0.7607400402426719, "rewards/accuracy_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03750000111758709, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08673161640763283, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.8, "completions/max_terminated_length": 474.8, "completions/mean_length": 355.0125, "completions/mean_terminated_length": 355.0125, "completions/min_length": 251.2, "completions/min_terminated_length": 251.2, "epoch": 0.3892589508742714, "grad_norm": 0.18147870737021074, "kl": 0.0661865234375, "learning_rate": 6.707710713857695e-07, "loss": 0.0026, "num_tokens": 48884763.0, "reward": 1.9114583492279054, "reward_std": 0.1345927134156227, "rewards/accuracy_reward/mean": 0.9125, "rewards/accuracy_reward/std": 0.09804592728614807, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01145833395421505, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02546912059187889, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.6, "completions/max_terminated_length": 433.6, "completions/mean_length": 307.8875, "completions/mean_terminated_length": 307.8875, "completions/min_length": 200.6, "completions/min_terminated_length": 200.6, "epoch": 0.3913405495420483, "grad_norm": 5.501740334693203, "kl": 0.06533203125, "learning_rate": 6.676942968984947e-07, "loss": 0.0026, "num_tokens": 49132586.0, "reward": 2.0125, "reward_std": 0.0902341976761818, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05487886220216751, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.1, "completions/max_terminated_length": 409.1, "completions/mean_length": 308.0, "completions/mean_terminated_length": 308.0, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.39342214820982513, "grad_norm": 0.25039435539094473, "kl": 0.0704833984375, "learning_rate": 6.646103509031218e-07, "loss": 0.0028, "num_tokens": 49393066.0, "reward": 1.8256556272506714, "reward_std": 0.046291005611419675, "rewards/accuracy_reward/mean": 0.8256556272506714, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.3, "completions/max_terminated_length": 401.3, "completions/mean_length": 275.8875, "completions/mean_terminated_length": 275.8875, "completions/min_length": 164.6, "completions/min_terminated_length": 164.6, "epoch": 0.395503746877602, "grad_norm": 0.25691871199244704, "kl": 0.0763427734375, "learning_rate": 6.61519365285732e-07, "loss": 0.0031, "num_tokens": 49655017.0, "reward": 1.844861125946045, "reward_std": 0.05957057476043701, "rewards/accuracy_reward/mean": 0.8111111111938953, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03375000059604645, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05957057476043701, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.8, "completions/max_terminated_length": 371.8, "completions/mean_length": 266.1375, "completions/mean_terminated_length": 266.1375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.39758534554537883, "grad_norm": 0.20036486923463614, "kl": 0.0826171875, "learning_rate": 6.584214722334587e-07, "loss": 0.0033, "num_tokens": 49910500.0, "reward": 2.0104166746139525, "reward_std": 0.0197955846786499, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010416667163372039, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019795581698417664, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.7, "completions/max_terminated_length": 427.7, "completions/mean_length": 326.55, "completions/mean_terminated_length": 326.55, "completions/min_length": 239.1, "completions/min_terminated_length": 239.1, "epoch": 0.3996669442131557, "grad_norm": 0.1801963277310025, "kl": 0.066650390625, "learning_rate": 6.553168042288344e-07, "loss": 0.0027, "num_tokens": 50147856.0, "reward": 1.8151979327201844, "reward_std": 0.0197955846786499, "rewards/accuracy_reward/mean": 0.804781262204051, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010416667163372039, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019795581698417664, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.6, "completions/max_terminated_length": 432.6, "completions/mean_length": 321.975, "completions/mean_terminated_length": 321.975, "completions/min_length": 220.6, "completions/min_terminated_length": 220.6, "epoch": 0.40174854288093254, "grad_norm": 4.238761240943084, "kl": 0.0733154296875, "learning_rate": 6.522054940441245e-07, "loss": 0.0029, "num_tokens": 50385742.0, "reward": 2.0125, "reward_std": 0.22073246538639069, "rewards/accuracy_reward/mean": 0.9125, "rewards/accuracy_reward/std": 0.09804592728614807, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.1, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.13836860954761504, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.5, "completions/max_terminated_length": 444.5, "completions/mean_length": 331.9875, "completions/mean_terminated_length": 331.9875, "completions/min_length": 223.5, "completions/min_terminated_length": 223.5, "epoch": 0.4038301415487094, "grad_norm": 7.556629315451778, "kl": 0.0675048828125, "learning_rate": 6.490876747356502e-07, "loss": 0.0027, "num_tokens": 50613853.0, "reward": 1.9708333492279053, "reward_std": 0.11572751551866531, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04583333432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06943650841712952, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.2, "completions/max_terminated_length": 479.2, "completions/mean_length": 348.8625, "completions/mean_terminated_length": 348.8625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.40591174021648624, "grad_norm": 4.456618141615971, "kl": 0.067236328125, "learning_rate": 6.459634796380971e-07, "loss": 0.0027, "num_tokens": 50879938.0, "reward": 2.015416693687439, "reward_std": 0.08786429166793823, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.027916667237877845, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05250894501805305, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.5, "completions/max_terminated_length": 445.5, "completions/mean_length": 344.4125, "completions/mean_terminated_length": 344.4125, "completions/min_length": 251.2, "completions/min_terminated_length": 251.2, "epoch": 0.4079933388842631, "grad_norm": 4.5171789641781315, "kl": 0.101806640625, "learning_rate": 6.428330423588145e-07, "loss": 0.0041, "num_tokens": 51134059.0, "reward": 1.921875, "reward_std": 0.04604002460837364, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.021875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.046040027588605884, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.2, "completions/max_terminated_length": 545.2, "completions/mean_length": 395.8375, "completions/mean_terminated_length": 395.8375, "completions/min_length": 272.7, "completions/min_terminated_length": 272.7, "epoch": 0.41007493755203994, "grad_norm": 0.16951611801469424, "kl": 0.0731689453125, "learning_rate": 6.396964967721005e-07, "loss": 0.0029, "num_tokens": 51394838.0, "reward": 1.8356250047683715, "reward_std": 0.22055620830506087, "rewards/accuracy_reward/mean": 0.8033333331346512, "rewards/accuracy_reward/std": 0.13616152815520763, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03229166716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08439468294382095, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.8, "completions/max_terminated_length": 561.8, "completions/mean_length": 404.5375, "completions/mean_terminated_length": 404.5375, "completions/min_length": 282.6, "completions/min_terminated_length": 282.6, "epoch": 0.4121565362198168, "grad_norm": 0.16229005912790964, "kl": 0.071826171875, "learning_rate": 6.365539770134771e-07, "loss": 0.0029, "num_tokens": 51658337.0, "reward": 1.9541666746139525, "reward_std": 0.08854104951024055, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05416666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08854104280471801, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.4, "completions/max_terminated_length": 535.4, "completions/mean_length": 414.8125, "completions/mean_terminated_length": 414.8125, "completions/min_length": 324.1, "completions/min_terminated_length": 324.1, "epoch": 0.41423813488759365, "grad_norm": 4.555204301813374, "kl": 0.0670654296875, "learning_rate": 6.334056174739544e-07, "loss": 0.0027, "num_tokens": 51922002.0, "reward": 1.8643336772918702, "reward_std": 0.10620288997888565, "rewards/accuracy_reward/mean": 0.8149586588144302, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04937499985098839, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10620288476347924, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.5, "completions/max_terminated_length": 536.5, "completions/mean_length": 414.0125, "completions/mean_terminated_length": 414.0125, "completions/min_length": 318.2, "completions/min_terminated_length": 318.2, "epoch": 0.4163197335553705, "grad_norm": 4.905038947000655, "kl": 0.0641845703125, "learning_rate": 6.302515527942821e-07, "loss": 0.0026, "num_tokens": 52200915.0, "reward": 1.9447916746139526, "reward_std": 0.10023652911186218, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04479166716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10023652613162995, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.5, "completions/max_terminated_length": 497.5, "completions/mean_length": 371.0, "completions/mean_terminated_length": 371.0, "completions/min_length": 262.3, "completions/min_terminated_length": 262.3, "epoch": 0.4184013322231474, "grad_norm": 0.2606902179556751, "kl": 0.0752685546875, "learning_rate": 6.270919178591931e-07, "loss": 0.003, "num_tokens": 52447019.0, "reward": 2.019791674613953, "reward_std": 0.046982268989086154, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01979166679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.046982265263795855, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.4, "completions/max_terminated_length": 517.4, "completions/mean_length": 385.0125, "completions/mean_terminated_length": 385.0125, "completions/min_length": 272.6, "completions/min_terminated_length": 272.6, "epoch": 0.42048293089092426, "grad_norm": 0.1756738079044623, "kl": 0.0638427734375, "learning_rate": 6.239268477916339e-07, "loss": 0.0026, "num_tokens": 52718236.0, "reward": 1.990625, "reward_std": 0.10128694772720337, "rewards/accuracy_reward/mean": 0.9625, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.028125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05512984022498131, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.9, "completions/max_terminated_length": 510.9, "completions/mean_length": 380.925, "completions/mean_terminated_length": 380.925, "completions/min_length": 278.6, "completions/min_terminated_length": 278.6, "epoch": 0.4225645295587011, "grad_norm": 0.15933891953595988, "kl": 0.0642578125, "learning_rate": 6.207564779469866e-07, "loss": 0.0026, "num_tokens": 52978670.0, "reward": 1.9325980424880982, "reward_std": 0.04247846901416778, "rewards/accuracy_reward/mean": 0.9138480395078659, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04247846454381943, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.2, "completions/max_terminated_length": 476.2, "completions/mean_length": 362.675, "completions/mean_terminated_length": 362.675, "completions/min_length": 262.6, "completions/min_terminated_length": 262.6, "epoch": 0.42464612822647796, "grad_norm": 5.441756902551618, "kl": 0.0628662109375, "learning_rate": 6.175809439072801e-07, "loss": 0.0025, "num_tokens": 53253164.0, "reward": 1.85625, "reward_std": 0.1047879233956337, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.7, "completions/max_terminated_length": 470.7, "completions/mean_length": 361.175, "completions/mean_terminated_length": 361.175, "completions/min_length": 255.9, "completions/min_terminated_length": 255.9, "epoch": 0.4267277268942548, "grad_norm": 0.1751402303368657, "kl": 0.0652587890625, "learning_rate": 6.144003814753918e-07, "loss": 0.0026, "num_tokens": 53526690.0, "reward": 1.950000023841858, "reward_std": 0.11148266792297364, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06250000409781933, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07612732574343681, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.6, "completions/max_terminated_length": 490.6, "completions/mean_length": 397.1375, "completions/mean_terminated_length": 397.1375, "completions/min_length": 282.8, "completions/min_terminated_length": 282.8, "epoch": 0.42880932556203166, "grad_norm": 0.19036129861503978, "kl": 0.0601806640625, "learning_rate": 6.112149266692408e-07, "loss": 0.0024, "num_tokens": 53775445.0, "reward": 1.8781250238418579, "reward_std": 0.08377420753240586, "rewards/accuracy_reward/mean": 0.8, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07812500447034836, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08377420306205749, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.2, "completions/max_terminated_length": 533.2, "completions/mean_length": 396.175, "completions/mean_terminated_length": 396.175, "completions/min_length": 268.6, "completions/min_terminated_length": 268.6, "epoch": 0.4308909242298085, "grad_norm": 0.19970426413421907, "kl": 0.0605224609375, "learning_rate": 6.080247157159698e-07, "loss": 0.0024, "num_tokens": 54048691.0, "reward": 1.8385416746139527, "reward_std": 0.08529684022068977, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02604166679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06144712120294571, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.4, "completions/max_terminated_length": 492.4, "completions/mean_length": 363.4, "completions/mean_terminated_length": 363.4, "completions/min_length": 244.8, "completions/min_terminated_length": 244.8, "epoch": 0.43297252289758537, "grad_norm": 4.437547266147232, "kl": 0.0663330078125, "learning_rate": 6.048298850461199e-07, "loss": 0.0027, "num_tokens": 54284083.0, "reward": 2.0614583492279053, "reward_std": 0.08593156784772873, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06145833432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08593156784772873, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.8, "completions/max_terminated_length": 444.8, "completions/mean_length": 346.9875, "completions/mean_terminated_length": 346.9875, "completions/min_length": 253.2, "completions/min_terminated_length": 253.2, "epoch": 0.4350541215653622, "grad_norm": 0.16638263391399472, "kl": 0.066162109375, "learning_rate": 6.016305712877963e-07, "loss": 0.0026, "num_tokens": 54530666.0, "reward": 1.9469957947731018, "reward_std": 0.06380395293235779, "rewards/accuracy_reward/mean": 0.91262077242136, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03437500111758709, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06380394622683525, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.7, "completions/max_terminated_length": 455.7, "completions/mean_length": 349.175, "completions/mean_terminated_length": 349.175, "completions/min_length": 243.2, "completions/min_terminated_length": 243.2, "epoch": 0.43713572023313907, "grad_norm": 0.7141845653139242, "kl": 0.06982421875, "learning_rate": 5.984269112608248e-07, "loss": 0.0028, "num_tokens": 54807816.0, "reward": 1.8172561049461364, "reward_std": 0.011785121262073516, "rewards/accuracy_reward/mean": 0.8130894303321838, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00416666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01178511455655098, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.8, "completions/max_terminated_length": 485.8, "completions/mean_length": 374.5875, "completions/mean_terminated_length": 374.5875, "completions/min_length": 271.4, "completions/min_terminated_length": 271.4, "epoch": 0.4392173189009159, "grad_norm": 5.696974098605386, "kl": 0.0635009765625, "learning_rate": 5.952190419709015e-07, "loss": 0.0025, "num_tokens": 55041135.0, "reward": 1.9550000309944153, "reward_std": 0.2741450160741806, "rewards/accuracy_reward/mean": 0.8091666668653488, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.1458333373069763, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.2038055345416069, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.3, "completions/max_terminated_length": 505.3, "completions/mean_length": 371.55, "completions/mean_terminated_length": 371.55, "completions/min_length": 271.3, "completions/min_terminated_length": 271.3, "epoch": 0.4412989175686928, "grad_norm": 5.4845719987127595, "kl": 0.0662353515625, "learning_rate": 5.920071006037328e-07, "loss": 0.0027, "num_tokens": 55270747.0, "reward": 1.959375, "reward_std": 0.13593488335609435, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 0.08880758583545685, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.021875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06187184154987335, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.1, "completions/max_terminated_length": 562.1, "completions/mean_length": 406.5875, "completions/mean_terminated_length": 406.5875, "completions/min_length": 296.7, "completions/min_terminated_length": 296.7, "epoch": 0.4433805162364696, "grad_norm": 0.16148691721988143, "kl": 0.05693359375, "learning_rate": 5.88791224519169e-07, "loss": 0.0023, "num_tokens": 55517802.0, "reward": 1.9183068990707397, "reward_std": 0.22507139891386033, "rewards/accuracy_reward/mean": 0.8165211647748947, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.10178571604192257, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.17878038063645363, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.6, "completions/max_terminated_length": 518.6, "completions/mean_length": 393.7625, "completions/mean_terminated_length": 393.7625, "completions/min_length": 280.5, "completions/min_terminated_length": 280.5, "epoch": 0.4454621149042465, "grad_norm": 0.1931719760035152, "kl": 0.067578125, "learning_rate": 5.8557155124533e-07, "loss": 0.0027, "num_tokens": 55776967.0, "reward": 1.9604166746139526, "reward_std": 0.05612906813621521, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06041666865348816, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.056129063665866855, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.4, "completions/max_terminated_length": 628.4, "completions/mean_length": 486.4125, "completions/mean_terminated_length": 486.4125, "completions/min_length": 391.4, "completions/min_terminated_length": 391.4, "epoch": 0.4475437135720233, "grad_norm": 0.17084291616318495, "kl": 0.0604736328125, "learning_rate": 5.82348218472724e-07, "loss": 0.0024, "num_tokens": 56029360.0, "reward": 1.875, "reward_std": 0.19062008559703827, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.15526476502418518, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.3, "completions/max_terminated_length": 599.3, "completions/mean_length": 443.6875, "completions/mean_terminated_length": 443.6875, "completions/min_length": 295.2, "completions/min_terminated_length": 295.2, "epoch": 0.4496253122398002, "grad_norm": 0.48374155371617983, "kl": 0.06611328125, "learning_rate": 5.791213640483591e-07, "loss": 0.0026, "num_tokens": 56290727.0, "reward": 1.69375, "reward_std": 0.05303300768136978, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.4, "completions/max_terminated_length": 588.4, "completions/mean_length": 403.6, "completions/mean_terminated_length": 403.6, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.45170691090757703, "grad_norm": 0.1445031069139877, "kl": 0.0674072265625, "learning_rate": 5.758911259698479e-07, "loss": 0.0027, "num_tokens": 56535423.0, "reward": 2.0375, "reward_std": 0.05717712491750717, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0375, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05717712789773941, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 582.1, "completions/max_terminated_length": 548.7, "completions/mean_length": 419.825, "completions/mean_terminated_length": 414.7017883300781, "completions/min_length": 302.8, "completions/min_terminated_length": 302.8, "epoch": 0.4537885095753539, "grad_norm": 0.2039012828985732, "kl": 0.063671875, "learning_rate": 5.726576423795064e-07, "loss": 0.0025, "num_tokens": 56776377.0, "reward": 1.996666669845581, "reward_std": 0.08670328855514527, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.021666666865348815, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03576821386814118, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.7, "completions/max_terminated_length": 615.7, "completions/mean_length": 450.525, "completions/mean_terminated_length": 450.525, "completions/min_length": 322.9, "completions/min_terminated_length": 322.9, "epoch": 0.45587010824313073, "grad_norm": 0.2191739216969976, "kl": 0.05869140625, "learning_rate": 5.694210515584457e-07, "loss": 0.0023, "num_tokens": 57055635.0, "reward": 2.0091666698455812, "reward_std": 0.08777731209993363, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.021666666865348815, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05242196917533874, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.4, "completions/max_terminated_length": 534.4, "completions/mean_length": 397.9875, "completions/mean_terminated_length": 397.9875, "completions/min_length": 292.4, "completions/min_terminated_length": 292.4, "epoch": 0.4579517069109076, "grad_norm": 0.18071086089547078, "kl": 0.065087890625, "learning_rate": 5.661814919206594e-07, "loss": 0.0026, "num_tokens": 57296922.0, "reward": 2.028869080543518, "reward_std": 0.05570702590048313, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.028869048692286015, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05570701584219932, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.9, "completions/max_terminated_length": 531.9, "completions/mean_length": 400.8125, "completions/mean_terminated_length": 400.8125, "completions/min_length": 292.9, "completions/min_terminated_length": 292.9, "epoch": 0.46003330557868444, "grad_norm": 4.088657909363055, "kl": 0.0660888671875, "learning_rate": 5.629391020071032e-07, "loss": 0.0026, "num_tokens": 57544875.0, "reward": 1.8931250095367431, "reward_std": 0.23395789116621019, "rewards/accuracy_reward/mean": 0.8, "rewards/accuracy_reward/std": 0.09258201122283935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.09312500022351741, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1542452432215214, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.7, "completions/max_terminated_length": 529.7, "completions/mean_length": 409.4875, "completions/mean_terminated_length": 409.4875, "completions/min_length": 315.7, "completions/min_terminated_length": 315.7, "epoch": 0.4621149042464613, "grad_norm": 4.76066463942462, "kl": 0.1063720703125, "learning_rate": 5.59694020479771e-07, "loss": 0.0043, "num_tokens": 57816154.0, "reward": 1.96875, "reward_std": 0.1952166110277176, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 0.12793734967708587, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0672792598605156, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.3, "completions/max_terminated_length": 669.3, "completions/mean_length": 536.5375, "completions/mean_terminated_length": 536.5375, "completions/min_length": 384.3, "completions/min_terminated_length": 384.3, "epoch": 0.46419650291423814, "grad_norm": 4.334216934519029, "kl": 0.0681884765625, "learning_rate": 5.564463861157637e-07, "loss": 0.0027, "num_tokens": 58090021.0, "reward": 1.3, "reward_std": 0.4292363554239273, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.39388103485107423, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 723.7, "completions/max_terminated_length": 663.9, "completions/mean_length": 526.05, "completions/mean_terminated_length": 515.7785766601562, "completions/min_length": 369.8, "completions/min_terminated_length": 369.8, "epoch": 0.466278101582015, "grad_norm": 4.1930076280017206, "kl": 0.0716796875, "learning_rate": 5.531963378013561e-07, "loss": 0.0029, "num_tokens": 58377081.0, "reward": 1.3382021546363831, "reward_std": 0.4580157116055489, "rewards/accuracy_reward/mean": 0.2954938292503357, "rewards/accuracy_reward/std": 0.3434520088136196, "rewards/format_reward/mean": 0.975, "rewards/format_reward/std": 0.07071067690849304, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0677083346992731, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09890521839261054, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.5, "completions/max_terminated_length": 625.5, "completions/mean_length": 462.5625, "completions/mean_terminated_length": 462.5625, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.46835970024979184, "grad_norm": 4.5367121933873085, "kl": 0.071923828125, "learning_rate": 5.49944014526056e-07, "loss": 0.0029, "num_tokens": 58648062.0, "reward": 1.5307243108749389, "reward_std": 0.37956870198249815, "rewards/accuracy_reward/mean": 0.47405762821435926, "rewards/accuracy_reward/std": 0.37618621438741684, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.056666669249534604, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09018309488892555, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.9, "completions/max_terminated_length": 581.9, "completions/mean_length": 448.6125, "completions/mean_terminated_length": 448.6125, "completions/min_length": 322.2, "completions/min_terminated_length": 322.2, "epoch": 0.4704412989175687, "grad_norm": 4.212489303625003, "kl": 0.076708984375, "learning_rate": 5.46689555376661e-07, "loss": 0.0031, "num_tokens": 58888839.0, "reward": 1.778541672229767, "reward_std": 0.324503193795681, "rewards/accuracy_reward/mean": 0.7375, "rewards/accuracy_reward/std": 0.23144719302654265, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05354166869074106, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10384699180722237, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.6, "completions/max_terminated_length": 660.6, "completions/mean_length": 502.925, "completions/mean_terminated_length": 502.925, "completions/min_length": 350.9, "completions/min_terminated_length": 350.9, "epoch": 0.47252289758534555, "grad_norm": 0.16742959807896826, "kl": 0.0670166015625, "learning_rate": 5.434330995313097e-07, "loss": 0.0027, "num_tokens": 59157993.0, "reward": 1.8049999952316285, "reward_std": 0.20901573747396468, "rewards/accuracy_reward/mean": 0.8, "rewards/accuracy_reward/std": 0.1948736011981964, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.005000000074505806, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01414213627576828, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 469.2, "completions/mean_terminated_length": 469.2, "completions/min_length": 332.7, "completions/min_terminated_length": 332.7, "epoch": 0.4746044962531224, "grad_norm": 4.835954112188709, "kl": 0.07080078125, "learning_rate": 5.401747862535307e-07, "loss": 0.0028, "num_tokens": 59428345.0, "reward": 1.8166666984558106, "reward_std": 0.37507805973291397, "rewards/accuracy_reward/mean": 0.7375, "rewards/accuracy_reward/std": 0.2929195284843445, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07916666977107525, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10120401307940483, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.1, "completions/max_terminated_length": 625.1, "completions/mean_length": 485.1125, "completions/mean_terminated_length": 485.1125, "completions/min_length": 351.9, "completions/min_terminated_length": 351.9, "epoch": 0.47668609492089925, "grad_norm": 5.6497051467213755, "kl": 0.0648193359375, "learning_rate": 5.369147548862859e-07, "loss": 0.0026, "num_tokens": 59682330.0, "reward": 1.883750033378601, "reward_std": 0.2730850502848625, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.2150476098060608, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07125000171363353, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11164017990231515, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.6, "completions/max_terminated_length": 507.6, "completions/mean_length": 397.8375, "completions/mean_terminated_length": 397.8375, "completions/min_length": 302.7, "completions/min_terminated_length": 302.7, "epoch": 0.4787676935886761, "grad_norm": 4.905315583455296, "kl": 0.071826171875, "learning_rate": 5.336531448460124e-07, "loss": 0.0029, "num_tokens": 59941981.0, "reward": 2.0729166746139525, "reward_std": 0.14378461316227914, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.085416666790843, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10842926502227783, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 598.5, "completions/max_terminated_length": 559.3, "completions/mean_length": 441.3, "completions/mean_terminated_length": 434.8089294433594, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.48084929225645295, "grad_norm": 0.15381638622442467, "kl": 0.0681396484375, "learning_rate": 5.303900956166593e-07, "loss": 0.0027, "num_tokens": 60197093.0, "reward": 1.8645833492279054, "reward_std": 0.10891140550374985, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01458333358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.033108004927635194, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.6, "completions/max_terminated_length": 533.6, "completions/mean_length": 399.675, "completions/mean_terminated_length": 399.675, "completions/min_length": 283.6, "completions/min_terminated_length": 283.6, "epoch": 0.4829308909242298, "grad_norm": 0.41233767876846544, "kl": 0.06923828125, "learning_rate": 5.271257467437234e-07, "loss": 0.0028, "num_tokens": 60472171.0, "reward": 1.780847954750061, "reward_std": 0.07071067690849304, "rewards/accuracy_reward/mean": 0.7808479532599449, "rewards/accuracy_reward/std": 0.07071067690849304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.4, "completions/max_terminated_length": 484.4, "completions/mean_length": 382.8125, "completions/mean_terminated_length": 382.8125, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.48501248959200666, "grad_norm": 0.18389590944709894, "kl": 0.0666259765625, "learning_rate": 5.238602378282815e-07, "loss": 0.0027, "num_tokens": 60746292.0, "reward": 1.975, "reward_std": 0.046291005611419675, "rewards/accuracy_reward/mean": 0.975, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.9, "completions/max_terminated_length": 560.9, "completions/mean_length": 413.85, "completions/mean_terminated_length": 413.85, "completions/min_length": 292.3, "completions/min_terminated_length": 292.3, "epoch": 0.4870940882597835, "grad_norm": 4.615879338831406, "kl": 0.063720703125, "learning_rate": 5.205937085210197e-07, "loss": 0.0026, "num_tokens": 60959256.0, "reward": 1.927529764175415, "reward_std": 0.19220280051231384, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.14056250751018523, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02752976268529892, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05164029598236084, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.5, "completions/max_terminated_length": 532.5, "completions/mean_length": 394.55, "completions/mean_terminated_length": 394.55, "completions/min_length": 278.9, "completions/min_terminated_length": 278.9, "epoch": 0.48917568692756036, "grad_norm": 0.14942227839170505, "kl": 0.0636962890625, "learning_rate": 5.173262985162614e-07, "loss": 0.0026, "num_tokens": 61230564.0, "reward": 1.9645833492279052, "reward_std": 0.20971630662679672, "rewards/accuracy_reward/mean": 0.95, "rewards/accuracy_reward/std": 0.11700168251991272, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02708333358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07660323679447174, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.9, "completions/max_terminated_length": 494.9, "completions/mean_length": 397.2625, "completions/mean_terminated_length": 397.2625, "completions/min_length": 299.9, "completions/min_terminated_length": 299.9, "epoch": 0.4912572855953372, "grad_norm": 4.289089774953379, "kl": 0.0595458984375, "learning_rate": 5.140581475459938e-07, "loss": 0.0024, "num_tokens": 61471321.0, "reward": 1.8956249952316284, "reward_std": 0.13828388042747974, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.033125000260770324, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07691512294113637, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 411.4875, "completions/mean_terminated_length": 411.4875, "completions/min_length": 305.9, "completions/min_terminated_length": 305.9, "epoch": 0.49333888426311406, "grad_norm": 0.18518897583184304, "kl": 0.0660888671875, "learning_rate": 5.107893953738915e-07, "loss": 0.0026, "num_tokens": 61730760.0, "reward": 1.86875, "reward_std": 0.11623437106609344, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.029124119877815248, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.4, "completions/max_terminated_length": 507.4, "completions/mean_length": 381.625, "completions/mean_terminated_length": 381.625, "completions/min_length": 281.5, "completions/min_terminated_length": 281.5, "epoch": 0.4954204829308909, "grad_norm": 5.476492140212813, "kl": 0.0659912109375, "learning_rate": 5.075201817893396e-07, "loss": 0.0026, "num_tokens": 62006714.0, "reward": 2.021250009536743, "reward_std": 0.19753799736499786, "rewards/accuracy_reward/mean": 0.95, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07125000096857548, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.13994049057364463, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 602.2, "completions/max_terminated_length": 577.7, "completions/mean_length": 450.0625, "completions/mean_terminated_length": 444.56964721679685, "completions/min_length": 321.8, "completions/min_terminated_length": 321.8, "epoch": 0.49750208159866777, "grad_norm": 0.1584734476787758, "kl": 0.06484375, "learning_rate": 5.04250646601456e-07, "loss": 0.0026, "num_tokens": 62237823.0, "reward": 1.7147321462631226, "reward_std": 0.1240307368338108, "rewards/accuracy_reward/mean": 0.6970238089561462, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03020833358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05063929483294487, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.2, "completions/max_terminated_length": 526.2, "completions/mean_length": 416.0375, "completions/mean_terminated_length": 416.0375, "completions/min_length": 310.8, "completions/min_terminated_length": 310.8, "epoch": 0.4995836802664446, "grad_norm": 4.671426097531064, "kl": 0.0638916015625, "learning_rate": 5.009809296331118e-07, "loss": 0.0026, "num_tokens": 62514826.0, "reward": 1.6633333444595337, "reward_std": 0.09387510269880295, "rewards/accuracy_reward/mean": 0.6125, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.050833333656191824, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05851975753903389, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.6, "completions/max_terminated_length": 542.6, "completions/mean_length": 406.35, "completions/mean_terminated_length": 406.35, "completions/min_length": 275.3, "completions/min_terminated_length": 275.3, "epoch": 0.5016652789342215, "grad_norm": 0.14630449525115286, "kl": 0.13447265625, "learning_rate": 4.977111707149521e-07, "loss": 0.0054, "num_tokens": 62779366.0, "reward": 1.875, "reward_std": 0.19686797261238098, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.1687566041946411, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.1, "completions/max_terminated_length": 562.1, "completions/mean_length": 406.1375, "completions/mean_terminated_length": 406.1375, "completions/min_length": 286.6, "completions/min_terminated_length": 286.6, "epoch": 0.5037468776019983, "grad_norm": 5.956207329606263, "kl": 0.058935546875, "learning_rate": 4.944415096794161e-07, "loss": 0.0024, "num_tokens": 63043865.0, "reward": 1.9383333444595336, "reward_std": 0.1557971253991127, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.1334012657403946, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.013333333283662796, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02661053091287613, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.4, "completions/max_terminated_length": 460.4, "completions/mean_length": 372.1625, "completions/mean_terminated_length": 372.1625, "completions/min_length": 275.5, "completions/min_terminated_length": 275.5, "epoch": 0.5058284762697752, "grad_norm": 0.23284235184340876, "kl": 0.06318359375, "learning_rate": 4.911720863547568e-07, "loss": 0.0025, "num_tokens": 63314838.0, "reward": 1.8229166746139527, "reward_std": 0.1730016589164734, "rewards/accuracy_reward/mean": 0.8, "rewards/accuracy_reward/std": 0.14056250751018523, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0412478968501091, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.1, "completions/max_terminated_length": 510.1, "completions/mean_length": 388.9125, "completions/mean_terminated_length": 388.9125, "completions/min_length": 298.4, "completions/min_terminated_length": 298.4, "epoch": 0.507910074937552, "grad_norm": 0.15603568789200578, "kl": 0.057958984375, "learning_rate": 4.879030405590619e-07, "loss": 0.0023, "num_tokens": 63586063.0, "reward": 1.6361075520515442, "reward_std": 0.14981908798217775, "rewards/accuracy_reward/mean": 0.6298575364053249, "rewards/accuracy_reward/std": 0.13214141875505447, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.5, "completions/max_terminated_length": 488.5, "completions/mean_length": 379.2125, "completions/mean_terminated_length": 379.2125, "completions/min_length": 273.7, "completions/min_terminated_length": 273.7, "epoch": 0.5099916736053289, "grad_norm": 0.16532706939561498, "kl": 0.0598388671875, "learning_rate": 4.84634512094273e-07, "loss": 0.0024, "num_tokens": 63857800.0, "reward": 1.93125, "reward_std": 0.18097035735845565, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.1632926881313324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.8, "completions/max_terminated_length": 504.8, "completions/mean_length": 396.375, "completions/mean_terminated_length": 396.375, "completions/min_length": 267.5, "completions/min_terminated_length": 267.5, "epoch": 0.5120732722731057, "grad_norm": 0.162376196003175, "kl": 0.056689453125, "learning_rate": 4.813666407402089e-07, "loss": 0.0023, "num_tokens": 64129614.0, "reward": 1.8337500095367432, "reward_std": 0.1141713872551918, "rewards/accuracy_reward/mean": 0.825, "rewards/accuracy_reward/std": 0.09974325299263001, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.008749999850988389, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019339685142040253, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.5, "completions/max_terminated_length": 432.5, "completions/mean_length": 361.125, "completions/mean_terminated_length": 361.125, "completions/min_length": 285.3, "completions/min_terminated_length": 285.3, "epoch": 0.5141548709408826, "grad_norm": 0.20639651283098462, "kl": 0.061962890625, "learning_rate": 4.780995662485859e-07, "loss": 0.0025, "num_tokens": 64384904.0, "reward": 1.865046989917755, "reward_std": 0.18655484169721603, "rewards/accuracy_reward/mean": 0.8521303236484528, "rewards/accuracy_reward/std": 0.17045392990112304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.012916666828095913, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03653385192155838, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.7, "completions/max_terminated_length": 491.7, "completions/mean_length": 360.4875, "completions/mean_terminated_length": 360.4875, "completions/min_length": 249.7, "completions/min_terminated_length": 249.7, "epoch": 0.5162364696086594, "grad_norm": 0.18282518864741693, "kl": 0.0615234375, "learning_rate": 4.748334283370432e-07, "loss": 0.0025, "num_tokens": 64643279.0, "reward": 1.9632417678833007, "reward_std": 0.19427025616168975, "rewards/accuracy_reward/mean": 0.8944917440414428, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06875000260770321, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.12050200030207633, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.3, "completions/max_terminated_length": 492.3, "completions/mean_length": 369.575, "completions/mean_terminated_length": 369.575, "completions/min_length": 271.2, "completions/min_terminated_length": 271.2, "epoch": 0.5183180682764363, "grad_norm": 5.280030094416361, "kl": 0.06513671875, "learning_rate": 4.7156836668316567e-07, "loss": 0.0026, "num_tokens": 64894389.0, "reward": 1.9135416984558105, "reward_std": 0.25629419833421707, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.12416292428970337, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06354166604578496, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.13213126733899117, "step": 2490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 388.825, "completions/mean_terminated_length": 388.825, "completions/min_length": 264.2, "completions/min_terminated_length": 264.2, "epoch": 0.5203996669442131, "grad_norm": 0.17958774514499543, "kl": 0.063232421875, "learning_rate": 4.683045209185126e-07, "loss": 0.0025, "num_tokens": 65171735.0, "reward": 2.0, "reward_std": 0.11624701544642449, "rewards/accuracy_reward/mean": 0.975, "rewards/accuracy_reward/std": 0.07071067690849304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06279476955533028, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.4, "completions/max_terminated_length": 435.4, "completions/mean_length": 341.0875, "completions/mean_terminated_length": 341.0875, "completions/min_length": 234.7, "completions/min_terminated_length": 234.7, "epoch": 0.52248126561199, "grad_norm": 4.911714810824962, "kl": 0.0594970703125, "learning_rate": 4.6504203062264465e-07, "loss": 0.0024, "num_tokens": 65437782.0, "reward": 1.7674168467521667, "reward_std": 0.11884753406047821, "rewards/accuracy_reward/mean": 0.7361668512225151, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07255653142929078, "step": 2510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 330.1, "completions/mean_terminated_length": 330.1, "completions/min_length": 245.1, "completions/min_terminated_length": 245.1, "epoch": 0.5245628642797668, "grad_norm": 5.248564218688146, "kl": 0.06259765625, "learning_rate": 4.617810353171559e-07, "loss": 0.0025, "num_tokens": 65705374.0, "reward": 1.8066666722297668, "reward_std": 0.22095786333084105, "rewards/accuracy_reward/mean": 0.775, "rewards/accuracy_reward/std": 0.09974325299263001, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04416666682809591, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11436765491962433, "step": 2520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.3, "completions/max_terminated_length": 493.3, "completions/mean_length": 391.3875, "completions/mean_terminated_length": 391.3875, "completions/min_length": 278.1, "completions/min_terminated_length": 278.1, "epoch": 0.5266444629475437, "grad_norm": 0.16378324718640833, "kl": 0.0593505859375, "learning_rate": 4.58521674459706e-07, "loss": 0.0024, "num_tokens": 65962061.0, "reward": 1.85, "reward_std": 0.15782093107700348, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.15782093703746797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 2530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.3, "completions/max_terminated_length": 471.3, "completions/mean_length": 373.475, "completions/mean_terminated_length": 373.475, "completions/min_length": 266.7, "completions/min_terminated_length": 266.7, "epoch": 0.5287260616153205, "grad_norm": 0.1830028884331716, "kl": 0.0652587890625, "learning_rate": 4.5526408743805766e-07, "loss": 0.0026, "num_tokens": 66221843.0, "reward": 1.6479166746139526, "reward_std": 0.19362604022026061, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.1569620907306671, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03666396141052246, "step": 2540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.5, "completions/max_terminated_length": 485.5, "completions/mean_length": 390.7625, "completions/mean_terminated_length": 390.7625, "completions/min_length": 307.8, "completions/min_terminated_length": 307.8, "epoch": 0.5308076602830974, "grad_norm": 4.514230348134004, "kl": 0.0629150390625, "learning_rate": 4.5200841356411383e-07, "loss": 0.0025, "num_tokens": 66499336.0, "reward": 1.8791666746139526, "reward_std": 0.12090870141983032, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0471404530107975, "step": 2550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 408.0, "completions/mean_terminated_length": 408.0, "completions/min_length": 297.9, "completions/min_terminated_length": 297.9, "epoch": 0.5328892589508742, "grad_norm": 5.057172959976599, "kl": 0.0637451171875, "learning_rate": 4.487547920679619e-07, "loss": 0.0026, "num_tokens": 66777848.0, "reward": 1.8416666746139527, "reward_std": 0.25614635050296786, "rewards/accuracy_reward/mean": 0.8375, "rewards/accuracy_reward/std": 0.185156187415123, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03563483357429505, "step": 2560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.7, "completions/max_terminated_length": 436.7, "completions/mean_length": 347.7625, "completions/mean_terminated_length": 347.7625, "completions/min_length": 246.6, "completions/min_terminated_length": 246.6, "epoch": 0.5349708576186512, "grad_norm": 0.16058155449876269, "kl": 0.06494140625, "learning_rate": 4.455033620919181e-07, "loss": 0.0026, "num_tokens": 67050053.0, "reward": 1.9739131927490234, "reward_std": 0.12320148199796677, "rewards/accuracy_reward/mean": 0.9145381838083267, "rewards/accuracy_reward/std": 0.040609382838010785, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.059375002048909664, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08259210474789143, "step": 2570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.2, "completions/max_terminated_length": 461.2, "completions/mean_length": 364.925, "completions/mean_terminated_length": 364.925, "completions/min_length": 255.9, "completions/min_terminated_length": 255.9, "epoch": 0.537052456286428, "grad_norm": 4.827380252982668, "kl": 0.0681396484375, "learning_rate": 4.422542626845778e-07, "loss": 0.0027, "num_tokens": 67311335.0, "reward": 1.9572916746139526, "reward_std": 0.12070775479078293, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03229166679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07775685265660286, "step": 2580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 369.9375, "completions/mean_terminated_length": 369.9375, "completions/min_length": 249.2, "completions/min_terminated_length": 249.2, "epoch": 0.5391340549542049, "grad_norm": 0.14251222545674874, "kl": 0.066455078125, "learning_rate": 4.390076327948682e-07, "loss": 0.0027, "num_tokens": 67576626.0, "reward": 1.7372291088104248, "reward_std": 0.15879597142338753, "rewards/accuracy_reward/mean": 0.7163957685232163, "rewards/accuracy_reward/std": 0.12793734967708587, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.020833334326744078, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03085862174630165, "step": 2590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.1, "completions/max_terminated_length": 435.1, "completions/mean_length": 354.675, "completions/mean_terminated_length": 354.675, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.5412156536219817, "grad_norm": 0.1917964302311964, "kl": 0.0631103515625, "learning_rate": 4.3576361126610726e-07, "loss": 0.0025, "num_tokens": 67844256.0, "reward": 1.8813888907432557, "reward_std": 0.10606601536273956, "rewards/accuracy_reward/mean": 0.8938888892531395, "rewards/accuracy_reward/std": 0.07071067690849304, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.5, "completions/max_terminated_length": 498.5, "completions/mean_length": 363.65, "completions/mean_terminated_length": 363.65, "completions/min_length": 263.3, "completions/min_terminated_length": 263.3, "epoch": 0.5432972522897586, "grad_norm": 0.1959125425760896, "kl": 0.062109375, "learning_rate": 4.325223368300651e-07, "loss": 0.0025, "num_tokens": 68103620.0, "reward": 1.9702083468437195, "reward_std": 0.12276247590780258, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.08270833585411311, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09280072674155235, "step": 2610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 342.5875, "completions/mean_terminated_length": 342.5875, "completions/min_length": 251.3, "completions/min_terminated_length": 251.3, "epoch": 0.5453788509575354, "grad_norm": 5.143451269883383, "kl": 0.0701416015625, "learning_rate": 4.2928394810103183e-07, "loss": 0.0028, "num_tokens": 68354763.0, "reward": 1.8983333587646485, "reward_std": 0.1375915750861168, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02333333417773247, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.050481320917606355, "step": 2620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.8, "completions/max_terminated_length": 455.8, "completions/mean_length": 337.075, "completions/mean_terminated_length": 337.075, "completions/min_length": 223.5, "completions/min_terminated_length": 223.5, "epoch": 0.5474604496253123, "grad_norm": 4.888783306760334, "kl": 0.0669677734375, "learning_rate": 4.2604858356988845e-07, "loss": 0.0027, "num_tokens": 68620449.0, "reward": 1.9479166746139527, "reward_std": 0.17002529054880142, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.1334012657403946, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06481812223792076, "step": 2630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.2, "completions/max_terminated_length": 449.2, "completions/mean_length": 344.775, "completions/mean_terminated_length": 344.775, "completions/min_length": 231.1, "completions/min_terminated_length": 231.1, "epoch": 0.5495420482930891, "grad_norm": 0.20047462316762774, "kl": 0.0705078125, "learning_rate": 4.2281638159818576e-07, "loss": 0.0028, "num_tokens": 68864487.0, "reward": 2.021875, "reward_std": 0.13440237641334535, "rewards/accuracy_reward/mean": 0.975, "rewards/accuracy_reward/std": 0.07071067690849304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.046875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07023735865950584, "step": 2640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.3, "completions/max_terminated_length": 424.3, "completions/mean_length": 332.1, "completions/mean_terminated_length": 332.1, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.551623646960866, "grad_norm": 0.19745363394939197, "kl": 0.06005859375, "learning_rate": 4.195874804122262e-07, "loss": 0.0024, "num_tokens": 69110799.0, "reward": 1.9602083206176757, "reward_std": 0.09825282096862793, "rewards/accuracy_reward/mean": 0.95, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010208333283662796, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.011142565310001374, "step": 2650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.5, "completions/max_terminated_length": 468.5, "completions/mean_length": 352.8875, "completions/mean_terminated_length": 352.8875, "completions/min_length": 252.8, "completions/min_terminated_length": 252.8, "epoch": 0.5537052456286428, "grad_norm": 0.1280210870103763, "kl": 0.0635986328125, "learning_rate": 4.163620180971532e-07, "loss": 0.0025, "num_tokens": 69375502.0, "reward": 1.971875, "reward_std": 0.09685598835349082, "rewards/accuracy_reward/mean": 0.9625, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.009375, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02651650384068489, "step": 2660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 502.2, "completions/max_terminated_length": 475.0, "completions/mean_length": 364.0, "completions/mean_terminated_length": 358.0821472167969, "completions/min_length": 239.6, "completions/min_terminated_length": 239.6, "epoch": 0.5557868442964197, "grad_norm": 3.3723511611967147, "kl": 0.06787109375, "learning_rate": 4.13140132591045e-07, "loss": 0.0027, "num_tokens": 69649910.0, "reward": 1.943750023841858, "reward_std": 0.18331822901964187, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06875000149011612, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1288616955280304, "step": 2670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.6, "completions/max_terminated_length": 501.6, "completions/mean_length": 399.7625, "completions/mean_terminated_length": 399.7625, "completions/min_length": 297.2, "completions/min_terminated_length": 297.2, "epoch": 0.5578684429641965, "grad_norm": 0.1468120352133046, "kl": 0.05986328125, "learning_rate": 4.099219616790171e-07, "loss": 0.0024, "num_tokens": 69927915.0, "reward": 1.8760416746139525, "reward_std": 0.11300802528858185, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01354166716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03136167526245117, "step": 2680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.3, "completions/max_terminated_length": 481.3, "completions/mean_length": 356.825, "completions/mean_terminated_length": 356.825, "completions/min_length": 249.6, "completions/min_terminated_length": 249.6, "epoch": 0.5599500416319734, "grad_norm": 5.091825810338028, "kl": 0.0645263671875, "learning_rate": 4.067076429873283e-07, "loss": 0.0026, "num_tokens": 70181997.0, "reward": 1.8833333492279052, "reward_std": 0.14142135977745057, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03333333395421505, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07071068063378334, "step": 2690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.2, "completions/max_terminated_length": 493.2, "completions/mean_length": 371.875, "completions/mean_terminated_length": 371.875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.5620316402997502, "grad_norm": 0.2079987682202225, "kl": 0.0571044921875, "learning_rate": 4.034973139774962e-07, "loss": 0.0023, "num_tokens": 70393499.0, "reward": 1.8135416746139525, "reward_std": 0.07906274311244488, "rewards/accuracy_reward/mean": 0.7895833333954215, "rewards/accuracy_reward/std": 0.04124789573252201, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02395833432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03781484961509705, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 550.9, "completions/max_terminated_length": 517.6, "completions/mean_length": 385.8875, "completions/mean_terminated_length": 379.0964294433594, "completions/min_length": 268.4, "completions/min_terminated_length": 268.4, "epoch": 0.5641132389675271, "grad_norm": 0.1933390995078618, "kl": 0.0705810546875, "learning_rate": 4.002911119404181e-07, "loss": 0.0028, "num_tokens": 70653330.0, "reward": 1.84375, "reward_std": 0.26869996935129165, "rewards/accuracy_reward/mean": 0.8, "rewards/accuracy_reward/std": 0.1632926881313324, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0772959053516388, "step": 2710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 347.4, "completions/mean_terminated_length": 347.4, "completions/min_length": 247.9, "completions/min_terminated_length": 247.9, "epoch": 0.5661948376353039, "grad_norm": 4.915739208482373, "kl": 0.063818359375, "learning_rate": 3.9708917399050003e-07, "loss": 0.0026, "num_tokens": 70890210.0, "reward": 1.9677083492279053, "reward_std": 0.2370162934064865, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.1632926881313324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.09270833544433117, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09098203107714653, "step": 2720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.1, "completions/max_terminated_length": 443.1, "completions/mean_length": 338.5875, "completions/mean_terminated_length": 338.5875, "completions/min_length": 243.1, "completions/min_terminated_length": 243.1, "epoch": 0.5682764363030808, "grad_norm": 0.1713205405963468, "kl": 0.061669921875, "learning_rate": 3.9389163705979205e-07, "loss": 0.0025, "num_tokens": 71162329.0, "reward": 2.040000009536743, "reward_std": 0.05196775794029236, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04000000208616257, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05196775794029236, "step": 2730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 381.9, "completions/mean_terminated_length": 381.9, "completions/min_length": 266.8, "completions/min_terminated_length": 266.8, "epoch": 0.5703580349708576, "grad_norm": 0.1860626994363445, "kl": 0.0588623046875, "learning_rate": 3.9069863789213386e-07, "loss": 0.0024, "num_tokens": 71402497.0, "reward": 1.8729166746139527, "reward_std": 0.09573607742786408, "rewards/accuracy_reward/mean": 0.8375, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03541666865348816, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06739883720874787, "step": 2740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.3, "completions/max_terminated_length": 515.3, "completions/mean_length": 394.8875, "completions/mean_terminated_length": 394.8875, "completions/min_length": 308.3, "completions/min_terminated_length": 308.3, "epoch": 0.5724396336386345, "grad_norm": 0.20655811983357472, "kl": 0.0579345703125, "learning_rate": 3.875103130373055e-07, "loss": 0.0023, "num_tokens": 71672064.0, "reward": 1.945562446117401, "reward_std": 0.1405719131231308, "rewards/accuracy_reward/mean": 0.9038957685232163, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04166666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09428090453147889, "step": 2750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.9, "completions/max_terminated_length": 508.9, "completions/mean_length": 378.6625, "completions/mean_terminated_length": 378.6625, "completions/min_length": 272.4, "completions/min_terminated_length": 272.4, "epoch": 0.5745212323064113, "grad_norm": 0.16434709986724766, "kl": 0.05673828125, "learning_rate": 3.843267988451888e-07, "loss": 0.0023, "num_tokens": 71944285.0, "reward": 1.9539583444595336, "reward_std": 0.1520232580602169, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06645833402872085, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1166679285466671, "step": 2760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.9, "completions/max_terminated_length": 443.9, "completions/mean_length": 339.8625, "completions/mean_terminated_length": 339.8625, "completions/min_length": 247.5, "completions/min_terminated_length": 247.5, "epoch": 0.5766028309741882, "grad_norm": 0.21011433719763795, "kl": 0.06376953125, "learning_rate": 3.81148231459935e-07, "loss": 0.0026, "num_tokens": 72169274.0, "reward": 1.9620498180389405, "reward_std": 0.07314258962869644, "rewards/accuracy_reward/mean": 0.8918414890766144, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07020833343267441, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07314259260892868, "step": 2770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.7, "completions/max_terminated_length": 516.7, "completions/mean_length": 389.175, "completions/mean_terminated_length": 389.175, "completions/min_length": 288.7, "completions/min_terminated_length": 288.7, "epoch": 0.578684429641965, "grad_norm": 4.132988613381757, "kl": 0.057421875, "learning_rate": 3.779747468141444e-07, "loss": 0.0023, "num_tokens": 72423584.0, "reward": 1.95291668176651, "reward_std": 0.19983291178941726, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.14056250751018523, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05291666910052299, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07267622202634812, "step": 2780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.3, "completions/max_terminated_length": 490.3, "completions/mean_length": 371.2375, "completions/mean_terminated_length": 371.2375, "completions/min_length": 279.5, "completions/min_terminated_length": 279.5, "epoch": 0.5807660283097419, "grad_norm": 0.14167042199242594, "kl": 0.059326171875, "learning_rate": 3.748064806230512e-07, "loss": 0.0024, "num_tokens": 72694027.0, "reward": 1.8518012285232544, "reward_std": 0.05197432786226273, "rewards/accuracy_reward/mean": 0.8243012249469757, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02750000059604645, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05197431892156601, "step": 2790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.7, "completions/max_terminated_length": 505.7, "completions/mean_length": 389.2125, "completions/mean_terminated_length": 389.2125, "completions/min_length": 268.3, "completions/min_terminated_length": 268.3, "epoch": 0.5828476269775187, "grad_norm": 0.16260582148160752, "kl": 0.06240234375, "learning_rate": 3.716435683787212e-07, "loss": 0.0025, "num_tokens": 72959636.0, "reward": 2.0104166746139525, "reward_std": 0.0197955846786499, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010416667163372039, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019795581698417664, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.5, "completions/max_terminated_length": 535.5, "completions/mean_length": 391.6375, "completions/mean_terminated_length": 391.6375, "completions/min_length": 282.2, "completions/min_terminated_length": 282.2, "epoch": 0.5849292256452956, "grad_norm": 5.398047142968695, "kl": 0.0581298828125, "learning_rate": 3.684861453442559e-07, "loss": 0.0023, "num_tokens": 73183623.0, "reward": 2.0072916746139526, "reward_std": 0.07827533856034279, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.08229166865348816, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.031984337419271466, "step": 2810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.7, "completions/max_terminated_length": 476.7, "completions/mean_length": 370.0, "completions/mean_terminated_length": 370.0, "completions/min_length": 276.7, "completions/min_terminated_length": 276.7, "epoch": 0.5870108243130724, "grad_norm": 5.209234303596355, "kl": 0.062939453125, "learning_rate": 3.653343465480094e-07, "loss": 0.0025, "num_tokens": 73446071.0, "reward": 1.8541666746139527, "reward_std": 0.23748018741607665, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.20411194264888763, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0471404530107975, "step": 2820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.3, "completions/max_terminated_length": 488.3, "completions/mean_length": 364.9375, "completions/mean_terminated_length": 364.9375, "completions/min_length": 258.3, "completions/min_terminated_length": 258.3, "epoch": 0.5890924229808493, "grad_norm": 0.20315120680488988, "kl": 0.0669189453125, "learning_rate": 3.6218830677781287e-07, "loss": 0.0027, "num_tokens": 73676026.0, "reward": 2.000000023841858, "reward_std": 0.17773192301392554, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 0.08880758583545685, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06250000298023224, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1002311997115612, "step": 2830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.2, "completions/max_terminated_length": 491.2, "completions/mean_length": 383.925, "completions/mean_terminated_length": 383.925, "completions/min_length": 285.8, "completions/min_terminated_length": 285.8, "epoch": 0.5911740216486261, "grad_norm": 0.14883941440587328, "kl": 0.0576416015625, "learning_rate": 3.590481605752107e-07, "loss": 0.0023, "num_tokens": 73943828.0, "reward": 1.9854166746139525, "reward_std": 0.07575379610061646, "rewards/accuracy_reward/mean": 0.975, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01041666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02946278378367424, "step": 2840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.5, "completions/max_terminated_length": 536.5, "completions/mean_length": 407.9375, "completions/mean_terminated_length": 407.9375, "completions/min_length": 308.1, "completions/min_terminated_length": 308.1, "epoch": 0.593255620316403, "grad_norm": 0.23106299258902835, "kl": 0.0578369140625, "learning_rate": 3.559140422297069e-07, "loss": 0.0023, "num_tokens": 74182295.0, "reward": 1.9865277886390686, "reward_std": 0.10382884740829468, "rewards/accuracy_reward/mean": 0.944861114025116, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04166666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07501916810870171, "step": 2850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.3, "completions/max_terminated_length": 485.3, "completions/mean_length": 392.3125, "completions/mean_terminated_length": 392.3125, "completions/min_length": 307.9, "completions/min_terminated_length": 307.9, "epoch": 0.5953372189841799, "grad_norm": 0.1436455249167597, "kl": 0.0568603515625, "learning_rate": 3.527860857730214e-07, "loss": 0.0023, "num_tokens": 74426280.0, "reward": 1.91875, "reward_std": 0.12745261490345, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04580627083778381, "step": 2860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.3, "completions/max_terminated_length": 534.3, "completions/mean_length": 420.6375, "completions/mean_terminated_length": 420.6375, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.5974188176519567, "grad_norm": 0.14423937571477746, "kl": 0.0572265625, "learning_rate": 3.4966442497335936e-07, "loss": 0.0023, "num_tokens": 74684595.0, "reward": 2.044166684150696, "reward_std": 0.07888686656951904, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04416666869074106, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07888686880469323, "step": 2870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.9, "completions/max_terminated_length": 545.9, "completions/mean_length": 419.775, "completions/mean_terminated_length": 419.775, "completions/min_length": 316.9, "completions/min_terminated_length": 316.9, "epoch": 0.5995004163197336, "grad_norm": 0.11281943802862444, "kl": 0.0521728515625, "learning_rate": 3.4654919332968923e-07, "loss": 0.0021, "num_tokens": 74949489.0, "reward": 2.0072916746139526, "reward_std": 0.013684011995792389, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00729166716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.013684006035327911, "step": 2880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.4, "completions/max_terminated_length": 524.4, "completions/mean_length": 398.8625, "completions/mean_terminated_length": 398.8625, "completions/min_length": 311.7, "completions/min_terminated_length": 311.7, "epoch": 0.6015820149875104, "grad_norm": 0.12108732261018262, "kl": 0.059130859375, "learning_rate": 3.4344052406603485e-07, "loss": 0.0024, "num_tokens": 75223470.0, "reward": 1.9645833492279052, "reward_std": 0.1357921063899994, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0395833358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06500234007835388, "step": 2890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.9, "completions/max_terminated_length": 490.9, "completions/mean_length": 368.6375, "completions/mean_terminated_length": 368.6375, "completions/min_length": 271.1, "completions/min_terminated_length": 271.1, "epoch": 0.6036636136552873, "grad_norm": 5.105473836739841, "kl": 0.0620361328125, "learning_rate": 3.40338550125777e-07, "loss": 0.0025, "num_tokens": 75492369.0, "reward": 1.8625, "reward_std": 0.12246559262275696, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.3, "completions/max_terminated_length": 595.3, "completions/mean_length": 426.4375, "completions/mean_terminated_length": 426.4375, "completions/min_length": 309.2, "completions/min_terminated_length": 309.2, "epoch": 0.6057452123230641, "grad_norm": 0.171884299035333, "kl": 0.0556884765625, "learning_rate": 3.37243404165969e-07, "loss": 0.0022, "num_tokens": 75738228.0, "reward": 2.0052083492279054, "reward_std": 0.055492520332336426, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01770833432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.020137180387973786, "step": 2910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.3, "completions/max_terminated_length": 462.3, "completions/mean_length": 367.375, "completions/mean_terminated_length": 367.375, "completions/min_length": 275.3, "completions/min_terminated_length": 275.3, "epoch": 0.607826810990841, "grad_norm": 0.19991084845817067, "kl": 0.0576416015625, "learning_rate": 3.341552185516623e-07, "loss": 0.0023, "num_tokens": 76005202.0, "reward": 1.90625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05303300768136978, "step": 2920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.2, "completions/max_terminated_length": 508.2, "completions/mean_length": 387.7125, "completions/mean_terminated_length": 387.7125, "completions/min_length": 284.2, "completions/min_terminated_length": 284.2, "epoch": 0.6099084096586178, "grad_norm": 5.137834016006699, "kl": 0.0586181640625, "learning_rate": 3.310741253502474e-07, "loss": 0.0023, "num_tokens": 76280163.0, "reward": 1.9791666746139527, "reward_std": 0.2614880561828613, "rewards/accuracy_reward/mean": 0.9125, "rewards/accuracy_reward/std": 0.1595182627439499, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06666666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.12458351105451584, "step": 2930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.8, "completions/max_terminated_length": 468.8, "completions/mean_length": 354.7375, "completions/mean_terminated_length": 354.7375, "completions/min_length": 245.1, "completions/min_terminated_length": 245.1, "epoch": 0.6119900083263947, "grad_norm": 5.197754833651674, "kl": 0.0572021484375, "learning_rate": 3.280002563258047e-07, "loss": 0.0023, "num_tokens": 76547126.0, "reward": 1.9916666984558105, "reward_std": 0.20014614909887313, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.10416666939854621, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1184998020529747, "step": 2940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.4, "completions/max_terminated_length": 547.4, "completions/mean_length": 405.85, "completions/mean_terminated_length": 405.85, "completions/min_length": 297.5, "completions/min_terminated_length": 297.5, "epoch": 0.6140716069941715, "grad_norm": 5.091353448827943, "kl": 0.0522705078125, "learning_rate": 3.249337429334705e-07, "loss": 0.0021, "num_tokens": 76813202.0, "reward": 1.8640625, "reward_std": 0.22569140791893005, "rewards/accuracy_reward/mean": 0.8375, "rewards/accuracy_reward/std": 0.18138959705829621, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0265625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.053110551089048386, "step": 2950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.7, "completions/max_terminated_length": 496.7, "completions/mean_length": 362.95, "completions/mean_terminated_length": 362.95, "completions/min_length": 259.4, "completions/min_terminated_length": 259.4, "epoch": 0.6161532056619484, "grad_norm": 0.147757377653093, "kl": 0.057275390625, "learning_rate": 3.21874716313814e-07, "loss": 0.0023, "num_tokens": 77081446.0, "reward": 2.014285707473755, "reward_std": 0.10369245111942291, "rewards/accuracy_reward/mean": 0.975, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03928571343421936, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.057401442527771, "step": 2960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.9, "completions/max_terminated_length": 485.9, "completions/mean_length": 373.225, "completions/mean_terminated_length": 373.225, "completions/min_length": 284.8, "completions/min_terminated_length": 284.8, "epoch": 0.6182348043297252, "grad_norm": 0.14559764289523203, "kl": 0.0578369140625, "learning_rate": 3.188233072872306e-07, "loss": 0.0023, "num_tokens": 77327368.0, "reward": 1.7632440447807312, "reward_std": 0.12269835770130158, "rewards/accuracy_reward/mean": 0.7351190477609635, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.028125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04105201661586762, "step": 2970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.8, "completions/max_terminated_length": 483.8, "completions/mean_length": 368.7375, "completions/mean_terminated_length": 368.7375, "completions/min_length": 264.9, "completions/min_terminated_length": 264.9, "epoch": 0.6203164029975021, "grad_norm": 0.18554386606188442, "kl": 0.060205078125, "learning_rate": 3.157796463483462e-07, "loss": 0.0024, "num_tokens": 77598443.0, "reward": 1.8989583492279052, "reward_std": 0.05251617282629013, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01145833395421505, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02546912059187889, "step": 2980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.3, "completions/max_terminated_length": 528.3, "completions/mean_length": 372.1, "completions/mean_terminated_length": 372.1, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.6223980016652789, "grad_norm": 6.080061352103668, "kl": 0.0625244140625, "learning_rate": 3.12743863660437e-07, "loss": 0.0025, "num_tokens": 77874619.0, "reward": 1.974608850479126, "reward_std": 0.11803357228636742, "rewards/accuracy_reward/mean": 0.9141921669244766, "rewards/accuracy_reward/std": 0.0017611147835850717, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06041666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11627245470881462, "step": 2990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.2, "completions/max_terminated_length": 496.2, "completions/mean_length": 373.2875, "completions/mean_terminated_length": 373.2875, "completions/min_length": 238.4, "completions/min_terminated_length": 238.4, "epoch": 0.6244796003330558, "grad_norm": 0.21710130755298088, "kl": 0.0623046875, "learning_rate": 3.097160890498625e-07, "loss": 0.0025, "num_tokens": 78151298.0, "reward": 1.8583333492279053, "reward_std": 0.11615225374698639, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.09258201122283935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00833333358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02357022911310196, "step": 3000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.8, "completions/max_terminated_length": 512.8, "completions/mean_length": 383.425, "completions/mean_terminated_length": 383.425, "completions/min_length": 280.5, "completions/min_terminated_length": 280.5, "epoch": 0.6265611990008326, "grad_norm": 0.1315134028604458, "kl": 0.0558837890625, "learning_rate": 3.0669645200051453e-07, "loss": 0.0022, "num_tokens": 78410436.0, "reward": 1.7776818871498108, "reward_std": 0.15103521551936866, "rewards/accuracy_reward/mean": 0.7585152305662632, "rewards/accuracy_reward/std": 0.12693723943084478, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.019166667759418488, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.031952467560768125, "step": 3010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.9, "completions/max_terminated_length": 472.9, "completions/mean_length": 388.525, "completions/mean_terminated_length": 388.525, "completions/min_length": 291.6, "completions/min_terminated_length": 291.6, "epoch": 0.6286427976686095, "grad_norm": 0.12651115020522632, "kl": 0.0575927734375, "learning_rate": 3.036850816482785e-07, "loss": 0.0023, "num_tokens": 78678670.0, "reward": 1.7510416746139525, "reward_std": 0.111685012280941, "rewards/accuracy_reward/mean": 0.7375, "rewards/accuracy_reward/std": 0.12246559858322144, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01354166716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01988932639360428, "step": 3020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.2, "completions/max_terminated_length": 489.2, "completions/mean_length": 351.825, "completions/mean_terminated_length": 351.825, "completions/min_length": 243.7, "completions/min_terminated_length": 243.7, "epoch": 0.6307243963363863, "grad_norm": 4.418440689589054, "kl": 0.05615234375, "learning_rate": 3.006821067755121e-07, "loss": 0.0022, "num_tokens": 78940568.0, "reward": 1.8958333492279054, "reward_std": 0.19499201476573944, "rewards/accuracy_reward/mean": 0.825, "rewards/accuracy_reward/std": 0.07071067690849304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07083333544433117, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.12428133860230446, "step": 3030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.3, "completions/max_terminated_length": 450.3, "completions/mean_length": 349.75, "completions/mean_terminated_length": 349.75, "completions/min_length": 248.7, "completions/min_terminated_length": 248.7, "epoch": 0.6328059950041632, "grad_norm": 0.15336856419939696, "kl": 0.0598876953125, "learning_rate": 2.9768765580553646e-07, "loss": 0.0024, "num_tokens": 79202124.0, "reward": 1.9208333373069764, "reward_std": 0.14433692693710326, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.09804592728614807, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05833333432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06811279505491256, "step": 3040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.9, "completions/max_terminated_length": 497.9, "completions/mean_length": 367.9, "completions/mean_terminated_length": 367.9, "completions/min_length": 271.1, "completions/min_terminated_length": 271.1, "epoch": 0.63488759367194, "grad_norm": 0.19183319995044523, "kl": 0.0605224609375, "learning_rate": 2.9470185679714575e-07, "loss": 0.0024, "num_tokens": 79452556.0, "reward": 1.9670833587646483, "reward_std": 0.07115457355976104, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0795833358541131, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.044742978364229205, "step": 3050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.7, "completions/max_terminated_length": 450.7, "completions/mean_length": 349.8375, "completions/mean_terminated_length": 349.8375, "completions/min_length": 248.9, "completions/min_terminated_length": 248.9, "epoch": 0.6369691923397169, "grad_norm": 4.946905661215974, "kl": 0.054736328125, "learning_rate": 2.917248374391291e-07, "loss": 0.0022, "num_tokens": 79705863.0, "reward": 1.9729166746139526, "reward_std": 0.1014419287443161, "rewards/accuracy_reward/mean": 0.9625, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010416667163372039, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019795581698417664, "step": 3060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.8, "completions/max_terminated_length": 434.8, "completions/mean_length": 321.625, "completions/mean_terminated_length": 321.625, "completions/min_length": 231.3, "completions/min_terminated_length": 231.3, "epoch": 0.6390507910074937, "grad_norm": 0.20955413749497104, "kl": 0.0560302734375, "learning_rate": 2.887567250448112e-07, "loss": 0.0022, "num_tokens": 79962649.0, "reward": 2.025, "reward_std": 0.053452253341674805, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02500000074505806, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05345225036144256, "step": 3070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.9, "completions/max_terminated_length": 475.9, "completions/mean_length": 369.7, "completions/mean_terminated_length": 369.7, "completions/min_length": 274.3, "completions/min_terminated_length": 274.3, "epoch": 0.6411323896752706, "grad_norm": 0.16739936123712726, "kl": 0.0539794921875, "learning_rate": 2.8579764654660684e-07, "loss": 0.0022, "num_tokens": 80219937.0, "reward": 1.8879098296165466, "reward_std": 0.11641737371683121, "rewards/accuracy_reward/mean": 0.8499931506812572, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03791666682809591, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06466245353221893, "step": 3080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.7, "completions/max_terminated_length": 511.7, "completions/mean_length": 388.375, "completions/mean_terminated_length": 388.375, "completions/min_length": 290.6, "completions/min_terminated_length": 290.6, "epoch": 0.6432139883430474, "grad_norm": 0.15903268299642787, "kl": 0.0543212890625, "learning_rate": 2.828477284905931e-07, "loss": 0.0022, "num_tokens": 80473399.0, "reward": 1.83125, "reward_std": 0.134679351747036, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05303300768136978, "step": 3090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 412.75, "completions/mean_terminated_length": 412.75, "completions/min_length": 327.6, "completions/min_terminated_length": 327.6, "epoch": 0.6452955870108243, "grad_norm": 0.23672273143469844, "kl": 0.0563232421875, "learning_rate": 2.7990709703109715e-07, "loss": 0.0023, "num_tokens": 80726755.0, "reward": 1.7434523820877075, "reward_std": 0.21723176091909407, "rewards/accuracy_reward/mean": 0.725, "rewards/accuracy_reward/std": 0.17045392990112304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.018452381156384944, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0521912157535553, "step": 3100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 374.0875, "completions/mean_terminated_length": 374.0875, "completions/min_length": 295.4, "completions/min_terminated_length": 295.4, "epoch": 0.6473771856786011, "grad_norm": 4.4358192605003675, "kl": 0.056396484375, "learning_rate": 2.7697587792530224e-07, "loss": 0.0023, "num_tokens": 80972154.0, "reward": 1.9489583373069763, "reward_std": 0.09627838134765625, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 0.08880758583545685, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01145833358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01602174937725067, "step": 3110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.4, "completions/max_terminated_length": 473.4, "completions/mean_length": 374.3875, "completions/mean_terminated_length": 374.3875, "completions/min_length": 270.2, "completions/min_terminated_length": 270.2, "epoch": 0.649458784346378, "grad_norm": 4.299687592517207, "kl": 0.0558349609375, "learning_rate": 2.740541965278674e-07, "loss": 0.0022, "num_tokens": 81222465.0, "reward": 1.9568055629730225, "reward_std": 0.09925851821899415, "rewards/accuracy_reward/mean": 0.925555557012558, "rewards/accuracy_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04580627083778381, "step": 3120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.6, "completions/max_terminated_length": 499.6, "completions/mean_length": 371.25, "completions/mean_terminated_length": 371.25, "completions/min_length": 261.7, "completions/min_terminated_length": 261.7, "epoch": 0.6515403830141548, "grad_norm": 0.15859439093876504, "kl": 0.0567138671875, "learning_rate": 2.711421777855697e-07, "loss": 0.0023, "num_tokens": 81472885.0, "reward": 2.0004166841506956, "reward_std": 0.18078695088624955, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 0.08880758583545685, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06291666850447655, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10276310220360756, "step": 3130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.8, "completions/max_terminated_length": 537.8, "completions/mean_length": 401.4875, "completions/mean_terminated_length": 401.4875, "completions/min_length": 297.4, "completions/min_terminated_length": 297.4, "epoch": 0.6536219816819318, "grad_norm": 0.11178847338664576, "kl": 0.0556884765625, "learning_rate": 2.682399462319581e-07, "loss": 0.0022, "num_tokens": 81735388.0, "reward": 1.8754166841506958, "reward_std": 0.24388935342431067, "rewards/accuracy_reward/mean": 0.775, "rewards/accuracy_reward/std": 0.09974325299263001, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.10041666850447654, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.15540080443024634, "step": 3140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.6, "completions/max_terminated_length": 545.6, "completions/mean_length": 389.975, "completions/mean_terminated_length": 389.975, "completions/min_length": 262.5, "completions/min_terminated_length": 262.5, "epoch": 0.6557035803497085, "grad_norm": 5.338136743789175, "kl": 0.05400390625, "learning_rate": 2.6534762598202924e-07, "loss": 0.0022, "num_tokens": 81997114.0, "reward": 1.8625, "reward_std": 0.12246559262275696, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.12246559858322144, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 3150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.4, "completions/max_terminated_length": 488.4, "completions/mean_length": 392.0, "completions/mean_terminated_length": 392.0, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.6577851790174855, "grad_norm": 4.662728227249034, "kl": 0.0537109375, "learning_rate": 2.624653407269192e-07, "loss": 0.0021, "num_tokens": 82271194.0, "reward": 1.8606499552726745, "reward_std": 0.12930927574634551, "rewards/accuracy_reward/mean": 0.8189832538366317, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.041666668653488156, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08301825821399689, "step": 3160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.6, "completions/max_terminated_length": 503.6, "completions/mean_length": 390.125, "completions/mean_terminated_length": 390.125, "completions/min_length": 278.4, "completions/min_terminated_length": 278.4, "epoch": 0.6598667776852623, "grad_norm": 0.13954351267435355, "kl": 0.053955078125, "learning_rate": 2.595932137286138e-07, "loss": 0.0022, "num_tokens": 82518412.0, "reward": 1.9166666746139527, "reward_std": 0.09428090751171112, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02916666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05892556607723236, "step": 3170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 403.7375, "completions/mean_terminated_length": 403.7375, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.6619483763530392, "grad_norm": 4.888555596115262, "kl": 0.0542724609375, "learning_rate": 2.567313678146771e-07, "loss": 0.0022, "num_tokens": 82771527.0, "reward": 1.965625023841858, "reward_std": 0.2425983279943466, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.12793734967708587, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07812500149011611, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.14271903932094573, "step": 3180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.3, "completions/max_terminated_length": 574.3, "completions/mean_length": 430.6875, "completions/mean_terminated_length": 430.6875, "completions/min_length": 298.8, "completions/min_terminated_length": 298.8, "epoch": 0.664029975020816, "grad_norm": 5.0577996446349625, "kl": 0.05341796875, "learning_rate": 2.5387992537299963e-07, "loss": 0.0021, "num_tokens": 83018006.0, "reward": 1.9958333492279052, "reward_std": 0.11996905207633972, "rewards/accuracy_reward/mean": 0.975, "rewards/accuracy_reward/std": 0.07071067690849304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02083333395421505, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.049258365482091906, "step": 3190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.2, "completions/max_terminated_length": 497.2, "completions/mean_length": 382.7125, "completions/mean_terminated_length": 382.7125, "completions/min_length": 289.5, "completions/min_terminated_length": 289.5, "epoch": 0.6661115736885929, "grad_norm": 5.030928942983225, "kl": 0.0546875, "learning_rate": 2.510390083465621e-07, "loss": 0.0022, "num_tokens": 83286223.0, "reward": 2.031250023841858, "reward_std": 0.14403526857495308, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.13125000409781934, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.14403527304530145, "step": 3200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.6, "completions/max_terminated_length": 549.6, "completions/mean_length": 404.025, "completions/mean_terminated_length": 404.025, "completions/min_length": 284.5, "completions/min_terminated_length": 284.5, "epoch": 0.6681931723563697, "grad_norm": 4.124145506186141, "kl": 0.056201171875, "learning_rate": 2.482087382282238e-07, "loss": 0.0022, "num_tokens": 83532273.0, "reward": 1.8395833492279052, "reward_std": 0.06592325270175933, "rewards/accuracy_reward/mean": 0.8, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.039583335444331166, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0659232459962368, "step": 3210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.7, "completions/max_terminated_length": 486.7, "completions/mean_length": 373.9375, "completions/mean_terminated_length": 373.9375, "completions/min_length": 283.7, "completions/min_terminated_length": 283.7, "epoch": 0.6702747710241466, "grad_norm": 0.1474085241752631, "kl": 0.0574462890625, "learning_rate": 2.453892360555233e-07, "loss": 0.0023, "num_tokens": 83769724.0, "reward": 1.8802083492279054, "reward_std": 0.14919540733098985, "rewards/accuracy_reward/mean": 0.8375, "rewards/accuracy_reward/std": 0.08880758583545685, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04270833432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07760776579380035, "step": 3220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.7, "completions/max_terminated_length": 503.7, "completions/mean_length": 400.4375, "completions/mean_terminated_length": 400.4375, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.6723563696919234, "grad_norm": 4.803506917401603, "kl": 0.05517578125, "learning_rate": 2.425806224055055e-07, "loss": 0.0022, "num_tokens": 84031639.0, "reward": 1.9375, "reward_std": 0.0816463440656662, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, "step": 3230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 626.1, "completions/max_terminated_length": 579.7, "completions/mean_length": 410.5, "completions/mean_terminated_length": 402.5732147216797, "completions/min_length": 285.4, "completions/min_terminated_length": 285.4, "epoch": 0.6744379683597003, "grad_norm": 4.429967274639083, "kl": 0.0574951171875, "learning_rate": 2.3978301738956287e-07, "loss": 0.0023, "num_tokens": 84303743.0, "reward": 1.70625, "reward_std": 0.2609692007303238, "rewards/accuracy_reward/mean": 0.7, "rewards/accuracy_reward/std": 0.2205115258693695, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05303300768136978, "step": 3240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.6, "completions/max_terminated_length": 541.6, "completions/mean_length": 409.725, "completions/mean_terminated_length": 409.725, "completions/min_length": 309.1, "completions/min_terminated_length": 309.1, "epoch": 0.6765195670274771, "grad_norm": 0.2112119584367916, "kl": 0.052734375, "learning_rate": 2.369965406482996e-07, "loss": 0.0021, "num_tokens": 84533249.0, "reward": 1.9125, "reward_std": 0.17943965792655944, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.14433693289756774, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.046291005611419675, "step": 3250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.4, "completions/max_terminated_length": 519.4, "completions/mean_length": 392.3125, "completions/mean_terminated_length": 392.3125, "completions/min_length": 289.2, "completions/min_terminated_length": 289.2, "epoch": 0.678601165695254, "grad_norm": 0.1459616982889113, "kl": 0.0552490234375, "learning_rate": 2.342213113464155e-07, "loss": 0.0022, "num_tokens": 84815362.0, "reward": 1.926360011100769, "reward_std": 0.0947591558098793, "rewards/accuracy_reward/mean": 0.9013600140810013, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07071067690849304, "step": 3260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.7, "completions/max_terminated_length": 527.7, "completions/mean_length": 401.0125, "completions/mean_terminated_length": 401.0125, "completions/min_length": 303.3, "completions/min_terminated_length": 303.3, "epoch": 0.6806827643630308, "grad_norm": 0.15128101407838257, "kl": 0.0541748046875, "learning_rate": 2.3145744816760915e-07, "loss": 0.0022, "num_tokens": 85090907.0, "reward": 1.9145833492279052, "reward_std": 0.03310801088809967, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01458333358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.033108004927635194, "step": 3270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.5, "completions/max_terminated_length": 533.5, "completions/mean_length": 425.7125, "completions/mean_terminated_length": 425.7125, "completions/min_length": 311.1, "completions/min_terminated_length": 311.1, "epoch": 0.6827643630308077, "grad_norm": 4.464712293634347, "kl": 0.0533447265625, "learning_rate": 2.287050693095028e-07, "loss": 0.0021, "num_tokens": 85364564.0, "reward": 1.8083333492279052, "reward_std": 0.27998869568109513, "rewards/accuracy_reward/mean": 0.775, "rewards/accuracy_reward/std": 0.2205115258693695, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03333333395421505, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08461370393633842, "step": 3280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.5, "completions/max_terminated_length": 579.5, "completions/mean_length": 445.3375, "completions/mean_terminated_length": 445.3375, "completions/min_length": 323.6, "completions/min_terminated_length": 323.6, "epoch": 0.6848459616985845, "grad_norm": 0.12651191382442867, "kl": 0.0567626953125, "learning_rate": 2.25964292478588e-07, "loss": 0.0023, "num_tokens": 85614319.0, "reward": 1.9358333587646483, "reward_std": 0.15191497951745986, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.07071067690849304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.060833333805203435, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08951258435845375, "step": 3290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.5, "completions/max_terminated_length": 497.5, "completions/mean_length": 379.4375, "completions/mean_terminated_length": 379.4375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.6869275603663614, "grad_norm": 0.15966499931168038, "kl": 0.058203125, "learning_rate": 2.2323523488519035e-07, "loss": 0.0023, "num_tokens": 85889306.0, "reward": 1.940625, "reward_std": 0.21795205026865005, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.15782093703746797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.040625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06013111919164658, "step": 3300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.1, "completions/max_terminated_length": 580.1, "completions/mean_length": 430.5375, "completions/mean_terminated_length": 430.5375, "completions/min_length": 316.3, "completions/min_terminated_length": 316.3, "epoch": 0.6890091590341382, "grad_norm": 5.482841175278612, "kl": 0.0480712890625, "learning_rate": 2.2051801323845898e-07, "loss": 0.0019, "num_tokens": 86165773.0, "reward": 1.89375, "reward_std": 0.05303300768136978, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 3310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.2, "completions/max_terminated_length": 495.2, "completions/mean_length": 395.0625, "completions/mean_terminated_length": 395.0625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.6910907577019151, "grad_norm": 4.34674602323066, "kl": 0.0560791015625, "learning_rate": 2.178127437413738e-07, "loss": 0.0022, "num_tokens": 86430674.0, "reward": 1.9328063368797301, "reward_std": 0.18096388429403304, "rewards/accuracy_reward/mean": 0.8786396577954292, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05416666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11062439307570457, "step": 3320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.6, "completions/max_terminated_length": 518.6, "completions/mean_length": 395.825, "completions/mean_terminated_length": 395.825, "completions/min_length": 284.9, "completions/min_terminated_length": 284.9, "epoch": 0.6931723563696919, "grad_norm": 4.461438229384425, "kl": 0.056396484375, "learning_rate": 2.1511954208577687e-07, "loss": 0.0023, "num_tokens": 86678692.0, "reward": 1.8700993537902832, "reward_std": 0.10521658658981323, "rewards/accuracy_reward/mean": 0.8492660000920296, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02083333358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05892556756734848, "step": 3330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.6, "completions/max_terminated_length": 467.6, "completions/mean_length": 359.1625, "completions/mean_terminated_length": 359.1625, "completions/min_length": 271.4, "completions/min_terminated_length": 271.4, "epoch": 0.6952539550374688, "grad_norm": 0.1587957283713179, "kl": 0.05439453125, "learning_rate": 2.1243852344742456e-07, "loss": 0.0022, "num_tokens": 86955689.0, "reward": 1.933035707473755, "reward_std": 0.14633138179779054, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.07071067690849304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03303571343421936, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07562070488929748, "step": 3340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.4, "completions/max_terminated_length": 528.4, "completions/mean_length": 403.3875, "completions/mean_terminated_length": 403.3875, "completions/min_length": 299.8, "completions/min_terminated_length": 299.8, "epoch": 0.6973355537052456, "grad_norm": 0.16912400051864426, "kl": 0.0544921875, "learning_rate": 2.0976980248106207e-07, "loss": 0.0022, "num_tokens": 87199200.0, "reward": 1.7490820646286012, "reward_std": 0.14865545853972434, "rewards/accuracy_reward/mean": 0.6969987243413925, "rewards/accuracy_reward/std": 0.09258201122283935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05208333432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08596487566828728, "step": 3350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.3, "completions/max_terminated_length": 476.3, "completions/mean_length": 364.2625, "completions/mean_terminated_length": 364.2625, "completions/min_length": 250.6, "completions/min_terminated_length": 250.6, "epoch": 0.6994171523730225, "grad_norm": 0.13838109852259403, "kl": 0.0562255859375, "learning_rate": 2.071134933155198e-07, "loss": 0.0022, "num_tokens": 87448773.0, "reward": 1.866287887096405, "reward_std": 0.153206467628479, "rewards/accuracy_reward/mean": 0.8621212124824524, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0471404530107975, "step": 3360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.7, "completions/max_terminated_length": 534.7, "completions/mean_length": 399.6125, "completions/mean_terminated_length": 399.6125, "completions/min_length": 272.4, "completions/min_terminated_length": 272.4, "epoch": 0.7014987510407993, "grad_norm": 5.846906136391809, "kl": 0.0546630859375, "learning_rate": 2.0446970954883397e-07, "loss": 0.0022, "num_tokens": 87726590.0, "reward": 1.8574824571609496, "reward_std": 0.12330644056200982, "rewards/accuracy_reward/mean": 0.8253991156816483, "rewards/accuracy_reward/std": 0.008560963720083237, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.044583334028720854, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0793901264667511, "step": 3370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.4, "completions/max_terminated_length": 525.4, "completions/mean_length": 409.4125, "completions/mean_terminated_length": 409.4125, "completions/min_length": 288.9, "completions/min_terminated_length": 288.9, "epoch": 0.7035803497085762, "grad_norm": 0.12400396976068362, "kl": 0.052490234375, "learning_rate": 2.018385642433859e-07, "loss": 0.0021, "num_tokens": 87991615.0, "reward": 1.8695168137550353, "reward_std": 0.09889537543058395, "rewards/accuracy_reward/mean": 0.8528501406311989, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0471404530107975, "step": 3380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 392.4875, "completions/mean_terminated_length": 392.4875, "completions/min_length": 276.5, "completions/min_terminated_length": 276.5, "epoch": 0.705661948376353, "grad_norm": 0.1491549885780442, "kl": 0.0510986328125, "learning_rate": 1.9922016992107004e-07, "loss": 0.002, "num_tokens": 88234326.0, "reward": 1.8433712124824524, "reward_std": 0.14787373542785645, "rewards/accuracy_reward/mean": 0.7996212124824524, "rewards/accuracy_reward/std": 0.10520716905593872, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04375, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04266657531261444, "step": 3390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.5, "completions/max_terminated_length": 522.5, "completions/mean_length": 392.575, "completions/mean_terminated_length": 392.575, "completions/min_length": 272.3, "completions/min_terminated_length": 272.3, "epoch": 0.7077435470441299, "grad_norm": 0.14457577960469306, "kl": 0.05, "learning_rate": 1.9661463855847953e-07, "loss": 0.002, "num_tokens": 88502596.0, "reward": 2.0260416746139525, "reward_std": 0.1515728861093521, "rewards/accuracy_reward/mean": 0.975, "rewards/accuracy_reward/std": 0.07071067690849304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05104166716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08086220920085907, "step": 3400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.2, "completions/max_terminated_length": 523.2, "completions/mean_length": 421.3625, "completions/mean_terminated_length": 421.3625, "completions/min_length": 318.8, "completions/min_terminated_length": 318.8, "epoch": 0.7098251457119067, "grad_norm": 0.16354366705945111, "kl": 0.0482666015625, "learning_rate": 1.9402208158211846e-07, "loss": 0.0019, "num_tokens": 88766889.0, "reward": 1.8375, "reward_std": 0.12288875579833984, "rewards/accuracy_reward/mean": 0.825, "rewards/accuracy_reward/std": 0.09974325299263001, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.023145502805709837, "step": 3410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.3, "completions/max_terminated_length": 512.3, "completions/mean_length": 407.425, "completions/mean_terminated_length": 407.425, "completions/min_length": 316.9, "completions/min_terminated_length": 316.9, "epoch": 0.7119067443796836, "grad_norm": 0.15199017450102612, "kl": 0.050537109375, "learning_rate": 1.9144260986363663e-07, "loss": 0.002, "num_tokens": 89049659.0, "reward": 2.031250023841858, "reward_std": 0.14454624205827712, "rewards/accuracy_reward/mean": 0.9625, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06875000074505806, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0927913174033165, "step": 3420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.8, "completions/max_terminated_length": 474.8, "completions/mean_length": 375.9875, "completions/mean_terminated_length": 375.9875, "completions/min_length": 281.8, "completions/min_terminated_length": 281.8, "epoch": 0.7139883430474604, "grad_norm": 0.18400026901801167, "kl": 0.0502197265625, "learning_rate": 1.888763337150877e-07, "loss": 0.002, "num_tokens": 89307282.0, "reward": 1.9940972447395324, "reward_std": 0.13174496293067933, "rewards/accuracy_reward/mean": 0.9305555552244187, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06354166828095913, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.13174496218562126, "step": 3430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.8, "completions/max_terminated_length": 539.8, "completions/mean_length": 406.5, "completions/mean_terminated_length": 406.5, "completions/min_length": 295.3, "completions/min_terminated_length": 295.3, "epoch": 0.7160699417152373, "grad_norm": 0.15960429674340176, "kl": 0.0587646484375, "learning_rate": 1.8632336288421275e-07, "loss": 0.0024, "num_tokens": 89514746.0, "reward": 1.9389583468437195, "reward_std": 0.23162795454263688, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.20411194264888763, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03895833436399698, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07336622476577759, "step": 3440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.9, "completions/max_terminated_length": 524.9, "completions/mean_length": 408.7875, "completions/mean_terminated_length": 408.7875, "completions/min_length": 305.4, "completions/min_terminated_length": 305.4, "epoch": 0.7181515403830142, "grad_norm": 0.15281608095911914, "kl": 0.052734375, "learning_rate": 1.837838065497448e-07, "loss": 0.0021, "num_tokens": 89773265.0, "reward": 1.9989583492279053, "reward_std": 0.08846957683563232, "rewards/accuracy_reward/mean": 0.9625, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03645833432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03671466112136841, "step": 3450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.4, "completions/max_terminated_length": 515.4, "completions/mean_length": 383.75, "completions/mean_terminated_length": 383.75, "completions/min_length": 280.2, "completions/min_terminated_length": 280.2, "epoch": 0.720233139050791, "grad_norm": 0.166348730186093, "kl": 0.0537109375, "learning_rate": 1.8125777331674224e-07, "loss": 0.0021, "num_tokens": 90026781.0, "reward": 2.0, "reward_std": 0.09258201122283935, "rewards/accuracy_reward/mean": 0.975, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.046291005611419675, "step": 3460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.3, "completions/max_terminated_length": 514.3, "completions/mean_length": 385.7, "completions/mean_terminated_length": 385.7, "completions/min_length": 262.9, "completions/min_terminated_length": 262.9, "epoch": 0.7223147377185679, "grad_norm": 5.263532396427543, "kl": 0.0520263671875, "learning_rate": 1.7874537121194233e-07, "loss": 0.0021, "num_tokens": 90303189.0, "reward": 2.007232141494751, "reward_std": 0.15389785990118982, "rewards/accuracy_reward/mean": 0.95, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0572321429848671, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07758329436182976, "step": 3470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.9, "completions/max_terminated_length": 546.9, "completions/mean_length": 430.1875, "completions/mean_terminated_length": 430.1875, "completions/min_length": 326.8, "completions/min_terminated_length": 326.8, "epoch": 0.7243963363863447, "grad_norm": 4.1697785482475815, "kl": 0.052490234375, "learning_rate": 1.7624670767914241e-07, "loss": 0.0021, "num_tokens": 90550892.0, "reward": 1.869166660308838, "reward_std": 0.11857073605060578, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05666666626930237, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0832154020667076, "step": 3480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.7, "completions/max_terminated_length": 530.7, "completions/mean_length": 395.625, "completions/mean_terminated_length": 395.625, "completions/min_length": 286.3, "completions/min_terminated_length": 286.3, "epoch": 0.7264779350541216, "grad_norm": 4.735500880156546, "kl": 0.0527099609375, "learning_rate": 1.7376188957460464e-07, "loss": 0.0021, "num_tokens": 90796390.0, "reward": 1.964305579662323, "reward_std": 0.14060550779104233, "rewards/accuracy_reward/mean": 0.9055555552244187, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05875000134110451, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11017159223556519, "step": 3490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.8, "completions/max_terminated_length": 492.8, "completions/mean_length": 375.3875, "completions/mean_terminated_length": 375.3875, "completions/min_length": 280.2, "completions/min_terminated_length": 280.2, "epoch": 0.7285595337218984, "grad_norm": 4.588985089684356, "kl": 0.048681640625, "learning_rate": 1.7129102316248644e-07, "loss": 0.0019, "num_tokens": 91058349.0, "reward": 1.8958333492279054, "reward_std": 0.20283454060554504, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.12793734967708587, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03333333395421505, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08277528658509255, "step": 3500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.3, "completions/max_terminated_length": 519.3, "completions/mean_length": 394.225, "completions/mean_terminated_length": 394.225, "completions/min_length": 268.7, "completions/min_terminated_length": 268.7, "epoch": 0.7306411323896753, "grad_norm": 5.032917532131076, "kl": 0.0522216796875, "learning_rate": 1.688342141102958e-07, "loss": 0.0021, "num_tokens": 91316167.0, "reward": 1.6864583492279053, "reward_std": 0.19432369619607925, "rewards/accuracy_reward/mean": 0.675, "rewards/accuracy_reward/std": 0.17422052025794982, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01145833358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03240906372666359, "step": 3510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 381.075, "completions/mean_terminated_length": 381.075, "completions/min_length": 286.4, "completions/min_terminated_length": 286.4, "epoch": 0.7327227310574521, "grad_norm": 4.764262378414541, "kl": 0.0566162109375, "learning_rate": 1.6639156748437316e-07, "loss": 0.0023, "num_tokens": 91570893.0, "reward": 1.9031250119209289, "reward_std": 0.14553493782877922, "rewards/accuracy_reward/mean": 0.825, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07812500074505806, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10751301869750023, "step": 3520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.7, "completions/max_terminated_length": 476.7, "completions/mean_length": 366.8875, "completions/mean_terminated_length": 366.8875, "completions/min_length": 263.1, "completions/min_terminated_length": 263.1, "epoch": 0.734804329725229, "grad_norm": 5.087437731335537, "kl": 0.058740234375, "learning_rate": 1.6396318774539658e-07, "loss": 0.0024, "num_tokens": 91840780.0, "reward": 1.8559027910232544, "reward_std": 0.15110048055648803, "rewards/accuracy_reward/mean": 0.841944444179535, "rewards/accuracy_reward/std": 0.1334012657403946, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.013958334363996983, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02309281751513481, "step": 3530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.4, "completions/max_terminated_length": 477.4, "completions/mean_length": 367.7125, "completions/mean_terminated_length": 367.7125, "completions/min_length": 255.8, "completions/min_terminated_length": 255.8, "epoch": 0.7368859283930058, "grad_norm": 5.792350963783144, "kl": 0.0564453125, "learning_rate": 1.6154917874391642e-07, "loss": 0.0023, "num_tokens": 92082925.0, "reward": 1.8333333492279054, "reward_std": 0.22360757291316985, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.185156187415123, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.020833334326744078, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03845139443874359, "step": 3540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.8, "completions/max_terminated_length": 494.8, "completions/mean_length": 377.575, "completions/mean_terminated_length": 377.575, "completions/min_length": 278.2, "completions/min_terminated_length": 278.2, "epoch": 0.7389675270607827, "grad_norm": 4.6887900541716405, "kl": 0.064013671875, "learning_rate": 1.5914964371591282e-07, "loss": 0.0026, "num_tokens": 92359075.0, "reward": 1.8440972328186036, "reward_std": 0.18152772933244704, "rewards/accuracy_reward/mean": 0.7847222223877907, "rewards/accuracy_reward/std": 0.12093005329370499, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.059375, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07280750945210457, "step": 3550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 345.2875, "completions/mean_terminated_length": 345.2875, "completions/min_length": 262.8, "completions/min_terminated_length": 262.8, "epoch": 0.7410491257285595, "grad_norm": 0.1742682978076689, "kl": 0.0579833984375, "learning_rate": 1.56764685278381e-07, "loss": 0.0023, "num_tokens": 92629834.0, "reward": 1.8572916746139527, "reward_std": 0.18208086043596267, "rewards/accuracy_reward/mean": 0.8375, "rewards/accuracy_reward/std": 0.13509859144687653, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01979166679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.046982265263795855, "step": 3560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.1, "completions/max_terminated_length": 448.1, "completions/mean_length": 337.6625, "completions/mean_terminated_length": 337.6625, "completions/min_length": 246.8, "completions/min_terminated_length": 246.8, "epoch": 0.7431307243963364, "grad_norm": 5.953804007612613, "kl": 0.0570068359375, "learning_rate": 1.5439440542494315e-07, "loss": 0.0023, "num_tokens": 92876415.0, "reward": 1.7963137030601501, "reward_std": 0.1349079929292202, "rewards/accuracy_reward/mean": 0.796313701570034, "rewards/accuracy_reward/std": 0.1349079929292202, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 3570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.4, "completions/max_terminated_length": 465.4, "completions/mean_length": 363.775, "completions/mean_terminated_length": 363.775, "completions/min_length": 256.4, "completions/min_terminated_length": 256.4, "epoch": 0.7452123230641132, "grad_norm": 5.446112817760046, "kl": 0.0608154296875, "learning_rate": 1.5203890552148624e-07, "loss": 0.0024, "num_tokens": 93138461.0, "reward": 2.0, "reward_std": 0.08408745229244233, "rewards/accuracy_reward/mean": 0.975, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.037796446681022645, "step": 3580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.7, "completions/max_terminated_length": 510.7, "completions/mean_length": 382.1375, "completions/mean_terminated_length": 382.1375, "completions/min_length": 275.3, "completions/min_terminated_length": 275.3, "epoch": 0.7472939217318901, "grad_norm": 5.55625074684671, "kl": 0.0570556640625, "learning_rate": 1.496982863018275e-07, "loss": 0.0023, "num_tokens": 93372936.0, "reward": 1.9674107313156128, "reward_std": 0.1747075505554676, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04241071492433548, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0875972904264927, "step": 3590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.4, "completions/max_terminated_length": 495.4, "completions/mean_length": 385.175, "completions/mean_terminated_length": 385.175, "completions/min_length": 295.8, "completions/min_terminated_length": 295.8, "epoch": 0.7493755203996669, "grad_norm": 0.15568327909789098, "kl": 0.0562744140625, "learning_rate": 1.473726478634061e-07, "loss": 0.0023, "num_tokens": 93635342.0, "reward": 1.8378610372543336, "reward_std": 0.07071067690849304, "rewards/accuracy_reward/mean": 0.8253610402345657, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, "step": 3600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.5, "completions/max_terminated_length": 467.5, "completions/mean_length": 359.9625, "completions/mean_terminated_length": 359.9625, "completions/min_length": 254.2, "completions/min_terminated_length": 254.2, "epoch": 0.7514571190674438, "grad_norm": 4.720343495448124, "kl": 0.058642578125, "learning_rate": 1.4506208966300248e-07, "loss": 0.0023, "num_tokens": 93907019.0, "reward": 2.0114583492279055, "reward_std": 0.1769111342728138, "rewards/accuracy_reward/mean": 0.9625, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04895833395421505, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10314287841320038, "step": 3610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.9, "completions/max_terminated_length": 497.9, "completions/mean_length": 392.3125, "completions/mean_terminated_length": 392.3125, "completions/min_length": 279.9, "completions/min_terminated_length": 279.9, "epoch": 0.7535387177352206, "grad_norm": 0.12252307448017402, "kl": 0.060302734375, "learning_rate": 1.4276671051248572e-07, "loss": 0.0024, "num_tokens": 94168604.0, "reward": 2.0208333492279054, "reward_std": 0.16390654146671296, "rewards/accuracy_reward/mean": 0.9625, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05833333432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11761552840471268, "step": 3620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.1, "completions/max_terminated_length": 473.1, "completions/mean_length": 364.425, "completions/mean_terminated_length": 364.425, "completions/min_length": 269.3, "completions/min_terminated_length": 269.3, "epoch": 0.7556203164029975, "grad_norm": 5.028606701234733, "kl": 0.06298828125, "learning_rate": 1.4048660857458637e-07, "loss": 0.0025, "num_tokens": 94429414.0, "reward": 2.0260416984558107, "reward_std": 0.1510403722524643, "rewards/accuracy_reward/mean": 0.95, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0760416690260172, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06393011137843133, "step": 3630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.6, "completions/max_terminated_length": 423.6, "completions/mean_length": 347.525, "completions/mean_terminated_length": 347.525, "completions/min_length": 261.4, "completions/min_terminated_length": 261.4, "epoch": 0.7577019150707743, "grad_norm": 4.9837741307547825, "kl": 0.0615234375, "learning_rate": 1.3822188135870034e-07, "loss": 0.0025, "num_tokens": 94693016.0, "reward": 1.9754166841506957, "reward_std": 0.10829881578683853, "rewards/accuracy_reward/mean": 0.95, "rewards/accuracy_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025416667759418487, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05484656691551208, "step": 3640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.2, "completions/max_terminated_length": 508.2, "completions/mean_length": 392.175, "completions/mean_terminated_length": 392.175, "completions/min_length": 287.5, "completions/min_terminated_length": 287.5, "epoch": 0.7597835137385512, "grad_norm": 0.15479196965058603, "kl": 0.054833984375, "learning_rate": 1.359726257167172e-07, "loss": 0.0022, "num_tokens": 94951070.0, "reward": 1.7885703325271607, "reward_std": 0.08867817372083664, "rewards/accuracy_reward/mean": 0.7698203206062317, "rewards/accuracy_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875000074505806, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03522591739892959, "step": 3650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.6, "completions/max_terminated_length": 534.6, "completions/mean_length": 413.35, "completions/mean_terminated_length": 413.35, "completions/min_length": 298.3, "completions/min_terminated_length": 298.3, "epoch": 0.761865112406328, "grad_norm": 0.12937673677540254, "kl": 0.0532470703125, "learning_rate": 1.3373893783887934e-07, "loss": 0.0021, "num_tokens": 95189962.0, "reward": 2.0135416746139527, "reward_std": 0.03136168122291565, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01354166716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03136167526245117, "step": 3660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.6, "completions/max_terminated_length": 452.6, "completions/mean_length": 346.75, "completions/mean_terminated_length": 346.75, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.763946711074105, "grad_norm": 4.407489858408879, "kl": 0.0561767578125, "learning_rate": 1.3152091324966797e-07, "loss": 0.0022, "num_tokens": 95462542.0, "reward": 1.9885416746139526, "reward_std": 0.16378697603940964, "rewards/accuracy_reward/mean": 0.95, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03854166679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07667671665549278, "step": 3670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.8, "completions/max_terminated_length": 495.8, "completions/mean_length": 379.0, "completions/mean_terminated_length": 379.0, "completions/min_length": 265.4, "completions/min_terminated_length": 265.4, "epoch": 0.7660283097418817, "grad_norm": 0.14732401576120982, "kl": 0.0573486328125, "learning_rate": 1.2931864680371783e-07, "loss": 0.0023, "num_tokens": 95730854.0, "reward": 1.9910416722297668, "reward_std": 0.16076218485832214, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.11604166850447654, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.12301851361989975, "step": 3680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.1, "completions/max_terminated_length": 438.1, "completions/mean_length": 353.7375, "completions/mean_terminated_length": 353.7375, "completions/min_length": 266.4, "completions/min_terminated_length": 266.4, "epoch": 0.7681099084096586, "grad_norm": 0.16920726378100856, "kl": 0.0602294921875, "learning_rate": 1.27132232681761e-07, "loss": 0.0024, "num_tokens": 95991633.0, "reward": 1.984730076789856, "reward_std": 0.08931128680706024, "rewards/accuracy_reward/mean": 0.962855052947998, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02187500037252903, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.053955940157175065, "step": 3690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.9, "completions/max_terminated_length": 502.9, "completions/mean_length": 400.55, "completions/mean_terminated_length": 400.55, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.7701915070774354, "grad_norm": 0.15757015594848228, "kl": 0.05205078125, "learning_rate": 1.2496176438659944e-07, "loss": 0.0021, "num_tokens": 96244909.0, "reward": 1.984375, "reward_std": 0.05585952997207642, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.084375, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.055859526991844176, "step": 3700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 390.425, "completions/mean_terminated_length": 390.425, "completions/min_length": 284.6, "completions/min_terminated_length": 284.6, "epoch": 0.7722731057452124, "grad_norm": 0.13615266257106715, "kl": 0.052587890625, "learning_rate": 1.2280733473910527e-07, "loss": 0.0021, "num_tokens": 96508287.0, "reward": 1.8875, "reward_std": 0.03535533845424652, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 3710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.6, "completions/max_terminated_length": 507.6, "completions/mean_length": 372.5125, "completions/mean_terminated_length": 372.5125, "completions/min_length": 272.9, "completions/min_terminated_length": 272.9, "epoch": 0.7743547044129891, "grad_norm": 0.14070110173334624, "kl": 0.0517578125, "learning_rate": 1.2066903587425264e-07, "loss": 0.0021, "num_tokens": 96783880.0, "reward": 1.8479166746139526, "reward_std": 0.0954555444419384, "rewards/accuracy_reward/mean": 0.825, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.057878179103136064, "step": 3720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.1, "completions/max_terminated_length": 494.1, "completions/mean_length": 368.1125, "completions/mean_terminated_length": 368.1125, "completions/min_length": 251.1, "completions/min_terminated_length": 251.1, "epoch": 0.776436303080766, "grad_norm": 0.22048760698173916, "kl": 0.0575927734375, "learning_rate": 1.1854695923717656e-07, "loss": 0.0023, "num_tokens": 97046017.0, "reward": 1.9729166746139526, "reward_std": 0.07117186933755874, "rewards/accuracy_reward/mean": 0.9625, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01041666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02946278378367424, "step": 3730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.3, "completions/max_terminated_length": 498.3, "completions/mean_length": 374.3625, "completions/mean_terminated_length": 374.3625, "completions/min_length": 275.8, "completions/min_terminated_length": 275.8, "epoch": 0.7785179017485429, "grad_norm": 4.826237703846462, "kl": 0.051904296875, "learning_rate": 1.1644119557926247e-07, "loss": 0.0021, "num_tokens": 97296238.0, "reward": 2.005625009536743, "reward_std": 0.07153735160827637, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.018124999850988387, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03618201315402984, "step": 3740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.2, "completions/max_terminated_length": 447.2, "completions/mean_length": 344.9625, "completions/mean_terminated_length": 344.9625, "completions/min_length": 258.7, "completions/min_terminated_length": 258.7, "epoch": 0.7805995004163198, "grad_norm": 4.434990243764791, "kl": 0.0589599609375, "learning_rate": 1.1435183495426542e-07, "loss": 0.0024, "num_tokens": 97540299.0, "reward": 2.059313750267029, "reward_std": 0.16124168485403062, "rewards/accuracy_reward/mean": 0.9759803950786591, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.08333333358168601, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.16124166548252106, "step": 3750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.6, "completions/max_terminated_length": 453.6, "completions/mean_length": 361.3, "completions/mean_terminated_length": 361.3, "completions/min_length": 281.2, "completions/min_terminated_length": 281.2, "epoch": 0.7826810990840966, "grad_norm": 0.1697122880636255, "kl": 0.0545654296875, "learning_rate": 1.1227896671445864e-07, "loss": 0.0022, "num_tokens": 97791299.0, "reward": 2.0145833492279053, "reward_std": 0.08838834911584854, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02708333432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05303300768136978, "step": 3760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.1, "completions/max_terminated_length": 465.1, "completions/mean_length": 356.1625, "completions/mean_terminated_length": 356.1625, "completions/min_length": 261.5, "completions/min_terminated_length": 261.5, "epoch": 0.7847626977518735, "grad_norm": 4.72204657136494, "kl": 0.0535888671875, "learning_rate": 1.1022267950681247e-07, "loss": 0.0021, "num_tokens": 98050592.0, "reward": 1.8523929595947266, "reward_std": 0.1882556490600109, "rewards/accuracy_reward/mean": 0.8294762820005417, "rewards/accuracy_reward/std": 0.1349431467242539, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.053312502801418304, "step": 3770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 416.4875, "completions/mean_terminated_length": 416.4875, "completions/min_length": 296.6, "completions/min_terminated_length": 296.6, "epoch": 0.7868442964196503, "grad_norm": 0.13573945644742816, "kl": 0.0557861328125, "learning_rate": 1.0818306126920346e-07, "loss": 0.0022, "num_tokens": 98276623.0, "reward": 1.9375, "reward_std": 0.2537449184805155, "rewards/accuracy_reward/mean": 0.8375, "rewards/accuracy_reward/std": 0.1687566041946411, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.10000000204890966, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10358962155878544, "step": 3780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.9, "completions/max_terminated_length": 510.9, "completions/mean_length": 385.95, "completions/mean_terminated_length": 385.95, "completions/min_length": 274.2, "completions/min_terminated_length": 274.2, "epoch": 0.7889258950874272, "grad_norm": 0.15732118591221703, "kl": 0.054443359375, "learning_rate": 1.061601992266532e-07, "loss": 0.0022, "num_tokens": 98548259.0, "reward": 1.8893749952316283, "reward_std": 0.19129744172096252, "rewards/accuracy_reward/mean": 0.8375, "rewards/accuracy_reward/std": 0.08880758583545685, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05187499970197677, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1024898573756218, "step": 3790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.9, "completions/max_terminated_length": 477.9, "completions/mean_length": 369.2, "completions/mean_terminated_length": 369.2, "completions/min_length": 283.5, "completions/min_terminated_length": 283.5, "epoch": 0.791007493755204, "grad_norm": 0.12779789300769948, "kl": 0.05419921875, "learning_rate": 1.0415417988759916e-07, "loss": 0.0022, "num_tokens": 98811235.0, "reward": 1.9833333492279053, "reward_std": 0.17567494064569472, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.12416292428970337, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05833333488553762, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05609594918787479, "step": 3800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.4, "completions/max_terminated_length": 509.4, "completions/mean_length": 397.2625, "completions/mean_terminated_length": 397.2625, "completions/min_length": 295.9, "completions/min_terminated_length": 295.9, "epoch": 0.7930890924229809, "grad_norm": 0.12194587886020418, "kl": 0.0489990234375, "learning_rate": 1.0216508904019339e-07, "loss": 0.002, "num_tokens": 99083496.0, "reward": 1.9875, "reward_std": 0.03535533845424652, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 3810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.9, "completions/max_terminated_length": 433.9, "completions/mean_length": 349.325, "completions/mean_terminated_length": 349.325, "completions/min_length": 257.7, "completions/min_terminated_length": 257.7, "epoch": 0.7951706910907577, "grad_norm": 5.718242192993079, "kl": 0.052880859375, "learning_rate": 1.0019301174863582e-07, "loss": 0.0021, "num_tokens": 99355562.0, "reward": 1.9729166746139526, "reward_std": 0.11110913455486297, "rewards/accuracy_reward/mean": 0.9625, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01041666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02946278378367424, "step": 3820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.1, "completions/max_terminated_length": 471.1, "completions/mean_length": 379.6, "completions/mean_terminated_length": 379.6, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.7972522897585346, "grad_norm": 0.14164906888614892, "kl": 0.058984375, "learning_rate": 9.82380323495347e-08, "loss": 0.0024, "num_tokens": 99614210.0, "reward": 1.7272916793823243, "reward_std": 0.22196788042783738, "rewards/accuracy_reward/mean": 0.675, "rewards/accuracy_reward/std": 0.12416292428970337, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05229166727513075, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.12513141110539436, "step": 3830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.3, "completions/max_terminated_length": 489.3, "completions/mean_length": 370.675, "completions/mean_terminated_length": 370.675, "completions/min_length": 280.6, "completions/min_terminated_length": 280.6, "epoch": 0.7993338884263114, "grad_norm": 0.1499050279028627, "kl": 0.05205078125, "learning_rate": 9.630023444830104e-08, "loss": 0.0021, "num_tokens": 99891040.0, "reward": 1.9086726307868958, "reward_std": 0.005892548337578773, "rewards/accuracy_reward/mean": 0.9065893024206162, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.002083333395421505, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.00589255727827549, "step": 3840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.9, "completions/max_terminated_length": 468.9, "completions/mean_length": 369.65, "completions/mean_terminated_length": 369.65, "completions/min_length": 284.8, "completions/min_terminated_length": 284.8, "epoch": 0.8014154870940883, "grad_norm": 6.713483614256585, "kl": 0.0563232421875, "learning_rate": 9.437970091557251e-08, "loss": 0.0023, "num_tokens": 100145740.0, "reward": 1.915625, "reward_std": 0.11031491830945014, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.065625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09391534104943275, "step": 3850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.2, "completions/max_terminated_length": 468.2, "completions/mean_length": 368.1625, "completions/mean_terminated_length": 368.1625, "completions/min_length": 279.3, "completions/min_terminated_length": 279.3, "epoch": 0.8034970857618651, "grad_norm": 5.113616133939523, "kl": 0.0548828125, "learning_rate": 9.247651388367e-08, "loss": 0.0022, "num_tokens": 100417977.0, "reward": 1.8708333492279052, "reward_std": 0.143629489839077, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.13509859144687653, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00833333358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02357022911310196, "step": 3860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.5, "completions/max_terminated_length": 485.5, "completions/mean_length": 370.7875, "completions/mean_terminated_length": 370.7875, "completions/min_length": 284.7, "completions/min_terminated_length": 284.7, "epoch": 0.805578684429642, "grad_norm": 0.11922368664970306, "kl": 0.0568603515625, "learning_rate": 9.059075474308459e-08, "loss": 0.0023, "num_tokens": 100693856.0, "reward": 1.9239583492279053, "reward_std": 0.16635380387306214, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.11700168251991272, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02395833395421505, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.049352110177278516, "step": 3870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.8, "completions/max_terminated_length": 540.8, "completions/mean_length": 390.1875, "completions/mean_terminated_length": 390.1875, "completions/min_length": 284.9, "completions/min_terminated_length": 284.9, "epoch": 0.8076602830974188, "grad_norm": 0.1291060533060879, "kl": 0.0534423828125, "learning_rate": 8.872250413899785e-08, "loss": 0.0021, "num_tokens": 100965423.0, "reward": 1.828125, "reward_std": 0.18752237744629383, "rewards/accuracy_reward/mean": 0.8, "rewards/accuracy_reward/std": 0.1334012657403946, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.028125000186264515, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.054121119901537895, "step": 3880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.1, "completions/max_terminated_length": 484.1, "completions/mean_length": 380.1375, "completions/mean_terminated_length": 380.1375, "completions/min_length": 279.2, "completions/min_terminated_length": 279.2, "epoch": 0.8097418817651957, "grad_norm": 4.878083007917422, "kl": 0.0498291015625, "learning_rate": 8.687184196783138e-08, "loss": 0.002, "num_tokens": 101216602.0, "reward": 1.9375, "reward_std": 0.12793734967708587, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 0.12793734967708587, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 3890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.4, "completions/max_terminated_length": 473.4, "completions/mean_length": 374.0, "completions/mean_terminated_length": 374.0, "completions/min_length": 280.6, "completions/min_terminated_length": 280.6, "epoch": 0.8118234804329725, "grad_norm": 0.1329960691761566, "kl": 0.0533447265625, "learning_rate": 8.503884737383188e-08, "loss": 0.0021, "num_tokens": 101477834.0, "reward": 1.9260416746139526, "reward_std": 0.14048119634389877, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.10520716905593872, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03854166716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.055533173680305484, "step": 3900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.2, "completions/max_terminated_length": 478.2, "completions/mean_length": 384.25, "completions/mean_terminated_length": 384.25, "completions/min_length": 282.7, "completions/min_terminated_length": 282.7, "epoch": 0.8139050791007494, "grad_norm": 0.12332734924314498, "kl": 0.046728515625, "learning_rate": 8.32235987456853e-08, "loss": 0.0019, "num_tokens": 101753350.0, "reward": 1.7395833492279054, "reward_std": 0.10045296251773835, "rewards/accuracy_reward/mean": 0.7, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03958333395421505, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1004529558122158, "step": 3910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.8, "completions/max_terminated_length": 497.8, "completions/mean_length": 373.0875, "completions/mean_terminated_length": 373.0875, "completions/min_length": 272.1, "completions/min_terminated_length": 272.1, "epoch": 0.8159866777685262, "grad_norm": 0.1412123156702961, "kl": 0.051416015625, "learning_rate": 8.142617371316473e-08, "loss": 0.0021, "num_tokens": 102031781.0, "reward": 1.9260416746139526, "reward_std": 0.10721644759178162, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.07071067690849304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02604166716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.054992732405662534, "step": 3920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.8, "completions/max_terminated_length": 496.8, "completions/mean_length": 389.35, "completions/mean_terminated_length": 389.35, "completions/min_length": 311.6, "completions/min_terminated_length": 311.6, "epoch": 0.8180682764363031, "grad_norm": 0.21222767791413055, "kl": 0.0527099609375, "learning_rate": 7.964664914381086e-08, "loss": 0.0021, "num_tokens": 102307233.0, "reward": 1.9333333492279052, "reward_std": 0.16807903349399567, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.10350984334945679, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03333333432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06456920504570007, "step": 3930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.4, "completions/max_terminated_length": 505.4, "completions/mean_length": 372.7625, "completions/mean_terminated_length": 372.7625, "completions/min_length": 265.9, "completions/min_terminated_length": 265.9, "epoch": 0.8201498751040799, "grad_norm": 4.518923590747868, "kl": 0.052734375, "learning_rate": 7.788510113964436e-08, "loss": 0.0021, "num_tokens": 102566030.0, "reward": 1.7533788442611695, "reward_std": 0.12879444248974323, "rewards/accuracy_reward/mean": 0.7203431375324726, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03303571436554194, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0934391088783741, "step": 3940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.3, "completions/max_terminated_length": 509.3, "completions/mean_length": 386.6125, "completions/mean_terminated_length": 386.6125, "completions/min_length": 279.7, "completions/min_terminated_length": 279.7, "epoch": 0.8222314737718568, "grad_norm": 4.401170932607157, "kl": 0.052294921875, "learning_rate": 7.614160503391159e-08, "loss": 0.0021, "num_tokens": 102811719.0, "reward": 1.6110849499702453, "reward_std": 0.2531979136168957, "rewards/accuracy_reward/mean": 0.580876623466611, "rewards/accuracy_reward/std": 0.2234456790611148, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03020833432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.055492518842220305, "step": 3950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.4, "completions/max_terminated_length": 514.4, "completions/mean_length": 400.0375, "completions/mean_terminated_length": 400.0375, "completions/min_length": 301.6, "completions/min_terminated_length": 301.6, "epoch": 0.8243130724396336, "grad_norm": 5.5389473156402955, "kl": 0.052783203125, "learning_rate": 7.441623538786267e-08, "loss": 0.0021, "num_tokens": 103049338.0, "reward": 1.8135416746139525, "reward_std": 0.19410984218120575, "rewards/accuracy_reward/mean": 0.7875, "rewards/accuracy_reward/std": 0.1388651818037033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02604166716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0552446648478508, "step": 3960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 379.9875, "completions/mean_terminated_length": 379.9875, "completions/min_length": 303.7, "completions/min_terminated_length": 303.7, "epoch": 0.8263946711074105, "grad_norm": 0.18763065293743852, "kl": 0.05126953125, "learning_rate": 7.270906598756354e-08, "loss": 0.002, "num_tokens": 103302593.0, "reward": 1.8629722237586974, "reward_std": 0.134679351747036, "rewards/accuracy_reward/mean": 0.8567222222685814, "rewards/accuracy_reward/std": 0.11700168251991272, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 3970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.5, "completions/max_terminated_length": 479.5, "completions/mean_length": 364.2375, "completions/mean_terminated_length": 364.2375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.8284762697751873, "grad_norm": 4.675829318699885, "kl": 0.0544189453125, "learning_rate": 7.102016984073939e-08, "loss": 0.0022, "num_tokens": 103572132.0, "reward": 2.0260416984558107, "reward_std": 0.15645610243082048, "rewards/accuracy_reward/mean": 0.95, "rewards/accuracy_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07604166828095912, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10300384685397149, "step": 3980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.6, "completions/max_terminated_length": 512.6, "completions/mean_length": 390.25, "completions/mean_terminated_length": 390.25, "completions/min_length": 280.6, "completions/min_terminated_length": 280.6, "epoch": 0.8305578684429642, "grad_norm": 4.376198711871789, "kl": 0.0482421875, "learning_rate": 6.934961917365323e-08, "loss": 0.0019, "num_tokens": 103836776.0, "reward": 1.919166672229767, "reward_std": 0.04006379246711731, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.019166667386889456, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0400637723505497, "step": 3990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.6, "completions/max_terminated_length": 484.6, "completions/mean_length": 385.4125, "completions/mean_terminated_length": 385.4125, "completions/min_length": 285.9, "completions/min_terminated_length": 285.9, "epoch": 0.832639467110741, "grad_norm": 0.17280059406529635, "kl": 0.052587890625, "learning_rate": 6.769748542801696e-08, "loss": 0.0021, "num_tokens": 104067641.0, "reward": 2.00625, "reward_std": 0.01767766922712326, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 4000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 407.825, "completions/mean_terminated_length": 407.825, "completions/min_length": 311.8, "completions/min_terminated_length": 311.8, "epoch": 0.8347210657785179, "grad_norm": 0.12326887803289459, "kl": 0.0458984375, "learning_rate": 6.606383925793596e-08, "loss": 0.0018, "num_tokens": 104344899.0, "reward": 1.9331209182739257, "reward_std": 0.08897026628255844, "rewards/accuracy_reward/mean": 0.9143708974123002, "rewards/accuracy_reward/std": 0.0612799197435379, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.018750001117587088, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.027690339833498, "step": 4010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.2, "completions/max_terminated_length": 474.2, "completions/mean_length": 365.825, "completions/mean_terminated_length": 365.825, "completions/min_length": 271.2, "completions/min_terminated_length": 271.2, "epoch": 0.8368026644462948, "grad_norm": 5.814823012738687, "kl": 0.051123046875, "learning_rate": 6.444875052688764e-08, "loss": 0.002, "num_tokens": 104612149.0, "reward": 1.6837563395500184, "reward_std": 0.1435942307114601, "rewards/accuracy_reward/mean": 0.6403039485216141, "rewards/accuracy_reward/std": 0.08880758583545685, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.043452383019030094, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05773126855492592, "step": 4020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.5, "completions/max_terminated_length": 458.5, "completions/mean_length": 355.375, "completions/mean_terminated_length": 355.375, "completions/min_length": 262.6, "completions/min_terminated_length": 262.6, "epoch": 0.8388842631140716, "grad_norm": 0.09901890393356158, "kl": 0.04814453125, "learning_rate": 6.285228830473421e-08, "loss": 0.0019, "num_tokens": 104885179.0, "reward": 1.890000009536743, "reward_std": 0.06549679189920425, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.014999999850988387, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02910144701600075, "step": 4030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.3, "completions/max_terminated_length": 495.3, "completions/mean_length": 384.3375, "completions/mean_terminated_length": 384.3375, "completions/min_length": 280.3, "completions/min_terminated_length": 280.3, "epoch": 0.8409658617818485, "grad_norm": 0.14455133341003648, "kl": 0.0484130859375, "learning_rate": 6.127452086476748e-08, "loss": 0.0019, "num_tokens": 105137526.0, "reward": 2.01875, "reward_std": 0.044035982340574265, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.044035985320806506, "step": 4040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.1, "completions/max_terminated_length": 498.1, "completions/mean_length": 391.0, "completions/mean_terminated_length": 391.0, "completions/min_length": 293.5, "completions/min_terminated_length": 293.5, "epoch": 0.8430474604496253, "grad_norm": 0.1210642036264325, "kl": 0.0546875, "learning_rate": 5.971551568079097e-08, "loss": 0.0022, "num_tokens": 105381174.0, "reward": 1.9229166746139525, "reward_std": 0.1885521873831749, "rewards/accuracy_reward/mean": 0.9125, "rewards/accuracy_reward/std": 0.1687566041946411, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010416667163372039, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019795581698417664, "step": 4050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.8, "completions/max_terminated_length": 507.8, "completions/mean_length": 384.9375, "completions/mean_terminated_length": 384.9375, "completions/min_length": 279.5, "completions/min_terminated_length": 279.5, "epoch": 0.8451290591174022, "grad_norm": 4.399537829335464, "kl": 0.050439453125, "learning_rate": 5.817533942423286e-08, "loss": 0.002, "num_tokens": 105646977.0, "reward": 2.0375, "reward_std": 0.06348394006490707, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0375, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06348394006490707, "step": 4060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.4, "completions/max_terminated_length": 509.4, "completions/mean_length": 384.5625, "completions/mean_terminated_length": 384.5625, "completions/min_length": 289.5, "completions/min_terminated_length": 289.5, "epoch": 0.847210657785179, "grad_norm": 0.13733316919087182, "kl": 0.0467529296875, "learning_rate": 5.665405796129552e-08, "loss": 0.0019, "num_tokens": 105904598.0, "reward": 1.9947916746139527, "reward_std": 0.09533035308122635, "rewards/accuracy_reward/mean": 0.975, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01979166716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.049039344489574435, "step": 4070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.6, "completions/max_terminated_length": 522.6, "completions/mean_length": 388.0875, "completions/mean_terminated_length": 388.0875, "completions/min_length": 283.2, "completions/min_terminated_length": 283.2, "epoch": 0.8492922564529559, "grad_norm": 0.11794495477424068, "kl": 0.049267578125, "learning_rate": 5.515173635013859e-08, "loss": 0.002, "num_tokens": 106185101.0, "reward": 1.996875, "reward_std": 0.06187184229493141, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.009375, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02651650384068489, "step": 4080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.1, "completions/max_terminated_length": 474.1, "completions/mean_length": 363.825, "completions/mean_terminated_length": 363.825, "completions/min_length": 269.1, "completions/min_terminated_length": 269.1, "epoch": 0.8513738551207327, "grad_norm": 4.565056845803448, "kl": 0.060888671875, "learning_rate": 5.3668438838096685e-08, "loss": 0.0024, "num_tokens": 106455751.0, "reward": 1.994494080543518, "reward_std": 0.12382127717137337, "rewards/accuracy_reward/mean": 0.9625, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03199404887855053, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05725074335932732, "step": 4090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.6, "completions/max_terminated_length": 520.6, "completions/mean_length": 399.1625, "completions/mean_terminated_length": 399.1625, "completions/min_length": 299.4, "completions/min_terminated_length": 299.4, "epoch": 0.8534554537885096, "grad_norm": 0.17276149318032222, "kl": 0.047607421875, "learning_rate": 5.2204228858931664e-08, "loss": 0.0019, "num_tokens": 106710260.0, "reward": 1.9436021447181702, "reward_std": 0.06134198904037476, "rewards/accuracy_reward/mean": 0.9061021506786346, "rewards/accuracy_reward/std": 0.0015966927632689476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0375, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06094194948673248, "step": 4100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.2, "completions/max_terminated_length": 491.2, "completions/mean_length": 363.0875, "completions/mean_terminated_length": 363.0875, "completions/min_length": 269.9, "completions/min_terminated_length": 269.9, "epoch": 0.8555370524562864, "grad_norm": 0.1553358408491195, "kl": 0.0528076171875, "learning_rate": 5.0759169030120454e-08, "loss": 0.0021, "num_tokens": 106952595.0, "reward": 1.9416666746139526, "reward_std": 0.1220408782362938, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.034930617362260816, "step": 4110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.1, "completions/max_terminated_length": 503.1, "completions/mean_length": 376.4375, "completions/mean_terminated_length": 376.4375, "completions/min_length": 282.8, "completions/min_terminated_length": 282.8, "epoch": 0.8576186511240633, "grad_norm": 0.12064916321985647, "kl": 0.05244140625, "learning_rate": 4.933332115017619e-08, "loss": 0.0021, "num_tokens": 107223014.0, "reward": 1.8802083492279054, "reward_std": 0.1497805744409561, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03020833395421505, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06267031356692314, "step": 4120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.3, "completions/max_terminated_length": 499.3, "completions/mean_length": 362.0, "completions/mean_terminated_length": 362.0, "completions/min_length": 266.8, "completions/min_terminated_length": 266.8, "epoch": 0.8597002497918401, "grad_norm": 0.15994207017756168, "kl": 0.0513671875, "learning_rate": 4.7926746196006675e-08, "loss": 0.0021, "num_tokens": 107485078.0, "reward": 1.9061742424964905, "reward_std": 0.09082945436239243, "rewards/accuracy_reward/mean": 0.8624242424964905, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04375, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09082945436239243, "step": 4130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.3, "completions/max_terminated_length": 477.3, "completions/mean_length": 370.5875, "completions/mean_terminated_length": 370.5875, "completions/min_length": 269.4, "completions/min_terminated_length": 269.4, "epoch": 0.861781848459617, "grad_norm": 5.466497474038428, "kl": 0.0529296875, "learning_rate": 4.653950432030518e-08, "loss": 0.0021, "num_tokens": 107733645.0, "reward": 1.9285416841506957, "reward_std": 0.2662425719201565, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.19317627549171448, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04104166775941849, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08089874014258384, "step": 4140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.8, "completions/max_terminated_length": 453.8, "completions/mean_length": 361.725, "completions/mean_terminated_length": 361.725, "completions/min_length": 260.3, "completions/min_terminated_length": 260.3, "epoch": 0.8638634471273938, "grad_norm": 0.15802322397669666, "kl": 0.048291015625, "learning_rate": 4.51716548489795e-08, "loss": 0.0019, "num_tokens": 107984727.0, "reward": 1.6659722208976746, "reward_std": 0.13679726421833038, "rewards/accuracy_reward/mean": 0.6555555552244187, "rewards/accuracy_reward/std": 0.11700168251991272, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010416667163372039, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019795581698417664, "step": 4150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.8, "completions/max_terminated_length": 501.8, "completions/mean_length": 390.4625, "completions/mean_terminated_length": 390.4625, "completions/min_length": 291.4, "completions/min_terminated_length": 291.4, "epoch": 0.8659450457951707, "grad_norm": 5.800661850820499, "kl": 0.0455322265625, "learning_rate": 4.382325627861383e-08, "loss": 0.0018, "num_tokens": 108246356.0, "reward": 1.927916669845581, "reward_std": 0.21918051093816757, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.1388651818037033, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.052916666865348815, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09879994541406631, "step": 4160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.8, "completions/max_terminated_length": 493.8, "completions/mean_length": 373.4, "completions/mean_terminated_length": 373.4, "completions/min_length": 294.4, "completions/min_terminated_length": 294.4, "epoch": 0.8680266444629475, "grad_norm": 5.428300158144188, "kl": 0.046630859375, "learning_rate": 4.2494366273967355e-08, "loss": 0.0019, "num_tokens": 108514156.0, "reward": 1.971875, "reward_std": 0.10024693906307221, "rewards/accuracy_reward/mean": 0.975, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 0.9875, "rewards/format_reward/std": 0.03535533845424652, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.009375, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.018600596487522124, "step": 4170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.5, "completions/max_terminated_length": 469.5, "completions/mean_length": 370.7, "completions/mean_terminated_length": 370.7, "completions/min_length": 269.8, "completions/min_terminated_length": 269.8, "epoch": 0.8701082431307244, "grad_norm": 0.121734992020979, "kl": 0.050244140625, "learning_rate": 4.118504166550846e-08, "loss": 0.002, "num_tokens": 108791212.0, "reward": 1.69375, "reward_std": 0.13467935025691985, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.12793734967708587, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 4180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.1, "completions/max_terminated_length": 513.1, "completions/mean_length": 378.575, "completions/mean_terminated_length": 378.575, "completions/min_length": 251.3, "completions/min_terminated_length": 251.3, "epoch": 0.8721898417985012, "grad_norm": 0.1551012228146172, "kl": 0.052099609375, "learning_rate": 3.989533844698412e-08, "loss": 0.0021, "num_tokens": 109063354.0, "reward": 1.7762298107147216, "reward_std": 0.10243196031078697, "rewards/accuracy_reward/mean": 0.7595631256699562, "rewards/accuracy_reward/std": 0.08766918759793044, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03563483357429505, "step": 4190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.1, "completions/max_terminated_length": 479.1, "completions/mean_length": 377.55, "completions/mean_terminated_length": 377.55, "completions/min_length": 288.3, "completions/min_terminated_length": 288.3, "epoch": 0.8742714404662781, "grad_norm": 5.053903913258786, "kl": 0.0511474609375, "learning_rate": 3.862531177302536e-08, "loss": 0.002, "num_tokens": 109324198.0, "reward": 2.0260416746139525, "reward_std": 0.13661691546440125, "rewards/accuracy_reward/mean": 0.975, "rewards/accuracy_reward/std": 0.07071067690849304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05104166679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10126157030463219, "step": 4200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.6, "completions/max_terminated_length": 514.6, "completions/mean_length": 400.325, "completions/mean_terminated_length": 400.325, "completions/min_length": 302.7, "completions/min_terminated_length": 302.7, "epoch": 0.8763530391340549, "grad_norm": 0.14070777923623243, "kl": 0.051708984375, "learning_rate": 3.737501595678877e-08, "loss": 0.0021, "num_tokens": 109582328.0, "reward": 1.8760416746139525, "reward_std": 0.12483179420232773, "rewards/accuracy_reward/mean": 0.8416666686534882, "rewards/accuracy_reward/std": 0.07071067690849304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.034375, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05412111729383469, "step": 4210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.6, "completions/max_terminated_length": 504.6, "completions/mean_length": 371.075, "completions/mean_terminated_length": 371.075, "completions/min_length": 272.1, "completions/min_terminated_length": 272.1, "epoch": 0.8784346378018318, "grad_norm": 0.14022921478443767, "kl": 0.0544189453125, "learning_rate": 3.6144504467633177e-08, "loss": 0.0022, "num_tokens": 109838110.0, "reward": 1.8385416865348816, "reward_std": 0.12297167479991913, "rewards/accuracy_reward/mean": 0.7875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05104166828095913, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08761632815003395, "step": 4220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.7, "completions/max_terminated_length": 468.7, "completions/mean_length": 349.05, "completions/mean_terminated_length": 349.05, "completions/min_length": 232.4, "completions/min_terminated_length": 232.4, "epoch": 0.8805162364696086, "grad_norm": 0.17054655274531946, "kl": 0.0545166015625, "learning_rate": 3.493382992883376e-08, "loss": 0.0022, "num_tokens": 110089850.0, "reward": 2.0010416746139525, "reward_std": 0.20167978554964067, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 0.12246559858322144, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06354166679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10278442576527595, "step": 4230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.7, "completions/max_terminated_length": 511.7, "completions/mean_length": 408.425, "completions/mean_terminated_length": 408.425, "completions/min_length": 302.5, "completions/min_terminated_length": 302.5, "epoch": 0.8825978351373855, "grad_norm": 3.487135276663561, "kl": 0.04794921875, "learning_rate": 3.3743044115331074e-08, "loss": 0.0019, "num_tokens": 110337772.0, "reward": 1.9349905014038087, "reward_std": 0.07198155298829079, "rewards/accuracy_reward/mean": 0.9112405106425285, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.023750000074505805, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03662621006369591, "step": 4240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.3, "completions/max_terminated_length": 506.3, "completions/mean_length": 377.6125, "completions/mean_terminated_length": 377.6125, "completions/min_length": 267.9, "completions/min_terminated_length": 267.9, "epoch": 0.8846794338051623, "grad_norm": 5.377135867331848, "kl": 0.050146484375, "learning_rate": 3.257219795151706e-08, "loss": 0.002, "num_tokens": 110570869.0, "reward": 1.9635416746139527, "reward_std": 0.13173307850956917, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03854166716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08544206991791725, "step": 4250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.5, "completions/max_terminated_length": 515.5, "completions/mean_length": 396.6875, "completions/mean_terminated_length": 396.6875, "completions/min_length": 289.1, "completions/min_terminated_length": 289.1, "epoch": 0.8867610324729392, "grad_norm": 0.13949685317961896, "kl": 0.0470703125, "learning_rate": 3.1421341509057286e-08, "loss": 0.0019, "num_tokens": 110818524.0, "reward": 1.9802773118019104, "reward_std": 0.08302186951041221, "rewards/accuracy_reward/mean": 0.9177773147821426, "rewards/accuracy_reward/std": 0.009221436083316803, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07380043268203736, "step": 4260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.5, "completions/max_terminated_length": 467.5, "completions/mean_length": 376.3, "completions/mean_terminated_length": 376.3, "completions/min_length": 285.1, "completions/min_terminated_length": 285.1, "epoch": 0.888842631140716, "grad_norm": 5.148213381792115, "kl": 0.050048828125, "learning_rate": 3.029052400474946e-08, "loss": 0.002, "num_tokens": 111091996.0, "reward": 1.8268019556999207, "reward_std": 0.05303300768136978, "rewards/accuracy_reward/mean": 0.808051960915327, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05303300768136978, "step": 4270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.9, "completions/max_terminated_length": 506.9, "completions/mean_length": 394.7375, "completions/mean_terminated_length": 394.7375, "completions/min_length": 295.7, "completions/min_terminated_length": 295.7, "epoch": 0.890924229808493, "grad_norm": 5.657416392972012, "kl": 0.04599609375, "learning_rate": 2.917979379841884e-08, "loss": 0.0018, "num_tokens": 111356711.0, "reward": 1.96245219707489, "reward_std": 0.0872782051563263, "rewards/accuracy_reward/mean": 0.8841188423335552, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07833333555608987, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06079131290316582, "step": 4280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.6, "completions/max_terminated_length": 499.6, "completions/mean_length": 390.7625, "completions/mean_terminated_length": 390.7625, "completions/min_length": 297.9, "completions/min_terminated_length": 297.9, "epoch": 0.8930058284762697, "grad_norm": 0.12242317928805745, "kl": 0.0466796875, "learning_rate": 2.8089198390850054e-08, "loss": 0.0019, "num_tokens": 111639116.0, "reward": 1.9114583492279054, "reward_std": 0.025469133257865907, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01145833395421505, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02546912059187889, "step": 4290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.9, "completions/max_terminated_length": 488.9, "completions/mean_length": 364.4, "completions/mean_terminated_length": 364.4, "completions/min_length": 255.2, "completions/min_terminated_length": 255.2, "epoch": 0.8950874271440467, "grad_norm": 5.237753278009117, "kl": 0.052197265625, "learning_rate": 2.701878442175548e-08, "loss": 0.0021, "num_tokens": 111890132.0, "reward": 2.075000023841858, "reward_std": 0.12417701482772828, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.08750000447034836, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08882166296243668, "step": 4300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.6, "completions/max_terminated_length": 478.6, "completions/mean_length": 384.3875, "completions/mean_terminated_length": 384.3875, "completions/min_length": 279.2, "completions/min_terminated_length": 279.2, "epoch": 0.8971690258118235, "grad_norm": 0.1476525406524929, "kl": 0.04580078125, "learning_rate": 2.59685976677812e-08, "loss": 0.0018, "num_tokens": 112171523.0, "reward": 1.825, "reward_std": 0.11700168251991272, "rewards/accuracy_reward/mean": 0.825, "rewards/accuracy_reward/std": 0.11700168251991272, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 4310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.1, "completions/max_terminated_length": 482.1, "completions/mean_length": 372.5, "completions/mean_terminated_length": 372.5, "completions/min_length": 268.7, "completions/min_terminated_length": 268.7, "epoch": 0.8992506244796004, "grad_norm": 4.272435531356291, "kl": 0.0462158203125, "learning_rate": 2.493868304054858e-08, "loss": 0.0018, "num_tokens": 112429075.0, "reward": 2.0447916746139527, "reward_std": 0.2241403728723526, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 0.12246559858322144, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.10729166865348816, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11172061711549759, "step": 4320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.2, "completions/max_terminated_length": 517.2, "completions/mean_length": 405.9875, "completions/mean_terminated_length": 405.9875, "completions/min_length": 321.7, "completions/min_terminated_length": 321.7, "epoch": 0.9013322231473772, "grad_norm": 0.10872554480013581, "kl": 0.049560546875, "learning_rate": 2.3929084584734583e-08, "loss": 0.002, "num_tokens": 112662970.0, "reward": 1.8549999952316285, "reward_std": 0.17398233264684676, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.10606601536273956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.042500002309679985, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07622460052371025, "step": 4330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.7, "completions/max_terminated_length": 478.7, "completions/mean_length": 389.0625, "completions/mean_terminated_length": 389.0625, "completions/min_length": 282.1, "completions/min_terminated_length": 282.1, "epoch": 0.9034138218151541, "grad_norm": 0.14089223495537415, "kl": 0.049951171875, "learning_rate": 2.293984547618716e-08, "loss": 0.002, "num_tokens": 112917615.0, "reward": 2.0250000238418577, "reward_std": 0.05036096572875977, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02500000037252903, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05036095306277275, "step": 4340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.4, "completions/max_terminated_length": 507.4, "completions/mean_length": 394.7375, "completions/mean_terminated_length": 394.7375, "completions/min_length": 280.5, "completions/min_terminated_length": 280.5, "epoch": 0.9054954204829309, "grad_norm": 4.912712839875028, "kl": 0.0505859375, "learning_rate": 2.197100802007967e-08, "loss": 0.002, "num_tokens": 113192594.0, "reward": 1.78125, "reward_std": 0.18097035735845565, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.09258201122283935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0883883461356163, "step": 4350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.5, "completions/max_terminated_length": 472.5, "completions/mean_length": 366.4375, "completions/mean_terminated_length": 366.4375, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.9075770191507078, "grad_norm": 5.1684488225695855, "kl": 0.053271484375, "learning_rate": 2.102261364910113e-08, "loss": 0.0021, "num_tokens": 113466493.0, "reward": 1.95, "reward_std": 0.11700168251991272, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07071067690849304, "step": 4360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.3, "completions/max_terminated_length": 504.3, "completions/mean_length": 391.375, "completions/mean_terminated_length": 391.375, "completions/min_length": 294.6, "completions/min_terminated_length": 294.6, "epoch": 0.9096586178184846, "grad_norm": 4.333634506581281, "kl": 0.049365234375, "learning_rate": 2.009470292168458e-08, "loss": 0.002, "num_tokens": 113730683.0, "reward": 1.6584088206291199, "reward_std": 0.07365696355700493, "rewards/accuracy_reward/mean": 0.6448671489953994, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01354166679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03830161839723587, "step": 4370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.1, "completions/max_terminated_length": 468.1, "completions/mean_length": 361.65, "completions/mean_terminated_length": 361.65, "completions/min_length": 264.7, "completions/min_terminated_length": 264.7, "epoch": 0.9117402164862615, "grad_norm": 0.7362022148606795, "kl": 0.0502685546875, "learning_rate": 1.9187315520272474e-08, "loss": 0.002, "num_tokens": 113970759.0, "reward": 1.9650000095367433, "reward_std": 0.18890444338321685, "rewards/accuracy_reward/mean": 0.9125, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.052500000596046446, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10725810527801513, "step": 4380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.7, "completions/max_terminated_length": 485.7, "completions/mean_length": 369.4, "completions/mean_terminated_length": 369.4, "completions/min_length": 264.4, "completions/min_terminated_length": 264.4, "epoch": 0.9138218151540383, "grad_norm": 0.12468607156671294, "kl": 0.0517578125, "learning_rate": 1.8300490249619937e-08, "loss": 0.0021, "num_tokens": 114219847.0, "reward": 2.00625, "reward_std": 0.01767766922712326, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 4390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.3, "completions/max_terminated_length": 476.3, "completions/mean_length": 357.1625, "completions/mean_terminated_length": 357.1625, "completions/min_length": 256.5, "completions/min_terminated_length": 256.5, "epoch": 0.9159034138218152, "grad_norm": 4.715800520092029, "kl": 0.0498046875, "learning_rate": 1.743426503513462e-08, "loss": 0.002, "num_tokens": 114492356.0, "reward": 2.035416674613953, "reward_std": 0.09050626158714295, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03541666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09050625860691071, "step": 4400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.6, "completions/max_terminated_length": 479.6, "completions/mean_length": 379.25, "completions/mean_terminated_length": 379.25, "completions/min_length": 296.2, "completions/min_terminated_length": 296.2, "epoch": 0.917985012489592, "grad_norm": 0.11666760088151248, "kl": 0.0504638671875, "learning_rate": 1.6588676921255595e-08, "loss": 0.002, "num_tokens": 114754464.0, "reward": 1.9166666746139527, "reward_std": 0.03747325390577316, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03747325092554092, "step": 4410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.5, "completions/max_terminated_length": 459.5, "completions/mean_length": 347.1125, "completions/mean_terminated_length": 347.1125, "completions/min_length": 246.7, "completions/min_terminated_length": 246.7, "epoch": 0.9200666111573689, "grad_norm": 5.083016362521967, "kl": 0.0513427734375, "learning_rate": 1.5763762069868626e-08, "loss": 0.0021, "num_tokens": 115024633.0, "reward": 1.9514627933502198, "reward_std": 0.21071887612342835, "rewards/accuracy_reward/mean": 0.8639627665281295, "rewards/accuracy_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.08750000260770321, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.15726661458611488, "step": 4420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.2, "completions/max_terminated_length": 488.2, "completions/mean_length": 383.3625, "completions/mean_terminated_length": 383.3625, "completions/min_length": 282.2, "completions/min_terminated_length": 282.2, "epoch": 0.9221482098251457, "grad_norm": 0.12144909342997057, "kl": 0.0490234375, "learning_rate": 1.495955575875979e-08, "loss": 0.002, "num_tokens": 115292630.0, "reward": 1.804674792289734, "reward_std": 0.1397224634885788, "rewards/accuracy_reward/mean": 0.8005081295967102, "rewards/accuracy_reward/std": 0.12793734967708587, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00416666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01178511455655098, "step": 4430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.3, "completions/max_terminated_length": 478.3, "completions/mean_length": 355.2875, "completions/mean_terminated_length": 355.2875, "completions/min_length": 251.4, "completions/min_terminated_length": 251.4, "epoch": 0.9242298084929226, "grad_norm": 4.635990166700676, "kl": 0.0520751953125, "learning_rate": 1.4176092380106862e-08, "loss": 0.0021, "num_tokens": 115569397.0, "reward": 1.8619230747222901, "reward_std": 0.10850712358951568, "rewards/accuracy_reward/mean": 0.8244230777025223, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0375, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07315178513526917, "step": 4440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.4, "completions/max_terminated_length": 492.4, "completions/mean_length": 399.45, "completions/mean_terminated_length": 399.45, "completions/min_length": 303.8, "completions/min_terminated_length": 303.8, "epoch": 0.9263114071606994, "grad_norm": 0.14106025786661242, "kl": 0.048828125, "learning_rate": 1.3413405439008485e-08, "loss": 0.002, "num_tokens": 115844865.0, "reward": 1.9447916746139526, "reward_std": 0.05844042226672173, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04479166865348816, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05844042524695396, "step": 4450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.4, "completions/max_terminated_length": 484.4, "completions/mean_length": 369.9125, "completions/mean_terminated_length": 369.9125, "completions/min_length": 275.3, "completions/min_terminated_length": 275.3, "epoch": 0.9283930058284763, "grad_norm": 0.14363075656265512, "kl": 0.0481689453125, "learning_rate": 1.2671527552051476e-08, "loss": 0.0019, "num_tokens": 116098690.0, "reward": 2.037500023841858, "reward_std": 0.07791186273097991, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05000000074505806, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09743538349866868, "step": 4460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.9, "completions/max_terminated_length": 504.9, "completions/mean_length": 386.75, "completions/mean_terminated_length": 386.75, "completions/min_length": 273.1, "completions/min_terminated_length": 273.1, "epoch": 0.9304746044962531, "grad_norm": 0.12277997686559079, "kl": 0.0497802734375, "learning_rate": 1.1950490445915562e-08, "loss": 0.002, "num_tokens": 116348966.0, "reward": 1.60625, "reward_std": 0.1352471113204956, "rewards/accuracy_reward/mean": 0.5875, "rewards/accuracy_reward/std": 0.09804592728614807, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03720119297504425, "step": 4470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.6, "completions/max_terminated_length": 466.6, "completions/mean_length": 368.3625, "completions/mean_terminated_length": 368.3625, "completions/min_length": 283.7, "completions/min_terminated_length": 283.7, "epoch": 0.93255620316403, "grad_norm": 4.998444783741019, "kl": 0.053369140625, "learning_rate": 1.1250324956017021e-08, "loss": 0.0021, "num_tokens": 116571507.0, "reward": 1.9235119104385376, "reward_std": 0.13009742498397828, "rewards/accuracy_reward/mean": 0.9047619044780731, "rewards/accuracy_reward/std": 0.08711026012897491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05303300768136978, "step": 4480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.7, "completions/max_terminated_length": 476.7, "completions/mean_length": 377.8875, "completions/mean_terminated_length": 377.8875, "completions/min_length": 295.8, "completions/min_terminated_length": 295.8, "epoch": 0.9346378018318068, "grad_norm": 0.14538984556042098, "kl": 0.04931640625, "learning_rate": 1.0571061025189898e-08, "loss": 0.002, "num_tokens": 116851034.0, "reward": 1.9, "reward_std": 0.14603425860404967, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.09974325299263001, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.046291005611419675, "step": 4490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.5, "completions/max_terminated_length": 483.5, "completions/mean_length": 379.5875, "completions/mean_terminated_length": 379.5875, "completions/min_length": 273.1, "completions/min_terminated_length": 273.1, "epoch": 0.9367194004995837, "grad_norm": 0.15471626446744527, "kl": 0.0471435546875, "learning_rate": 9.912727702405089e-09, "loss": 0.0019, "num_tokens": 117130249.0, "reward": 1.7438988089561462, "reward_std": 0.13729031383991241, "rewards/accuracy_reward/mean": 0.6720238097012043, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.071875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1263546496629715, "step": 4500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.2, "completions/max_terminated_length": 474.2, "completions/mean_length": 348.65, "completions/mean_terminated_length": 348.65, "completions/min_length": 258.1, "completions/min_terminated_length": 258.1, "epoch": 0.9388009991673605, "grad_norm": 0.13552586888561896, "kl": 0.05390625, "learning_rate": 9.275353141528719e-09, "loss": 0.0022, "num_tokens": 117376645.0, "reward": 2.06875, "reward_std": 0.09932401329278946, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.08125, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06396867483854293, "step": 4510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.1, "completions/max_terminated_length": 445.1, "completions/mean_length": 345.2375, "completions/mean_terminated_length": 345.2375, "completions/min_length": 248.9, "completions/min_terminated_length": 248.9, "epoch": 0.9408825978351374, "grad_norm": 5.084349277875754, "kl": 0.054052734375, "learning_rate": 8.658964600117447e-09, "loss": 0.0022, "num_tokens": 117625392.0, "reward": 1.76875, "reward_std": 0.11572359055280686, "rewards/accuracy_reward/mean": 0.7625, "rewards/accuracy_reward/std": 0.09804592728614807, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, "step": 4520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 373.4, "completions/mean_terminated_length": 373.4, "completions/min_length": 270.1, "completions/min_terminated_length": 270.1, "epoch": 0.9429641965029142, "grad_norm": 0.14517700306479756, "kl": 0.054833984375, "learning_rate": 8.063588438253333e-09, "loss": 0.0022, "num_tokens": 117882216.0, "reward": 1.7432638883590699, "reward_std": 0.0941609364002943, "rewards/accuracy_reward/mean": 0.7213888883590698, "rewards/accuracy_reward/std": 0.07419009134173393, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.021875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06187184229493141, "step": 4530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.7, "completions/max_terminated_length": 547.7, "completions/mean_length": 412.8125, "completions/mean_terminated_length": 412.8125, "completions/min_length": 308.7, "completions/min_terminated_length": 308.7, "epoch": 0.9450457951706911, "grad_norm": 4.660371503146494, "kl": 0.0468017578125, "learning_rate": 7.489250117416301e-09, "loss": 0.0019, "num_tokens": 118157841.0, "reward": 1.9556250333786012, "reward_std": 0.18665989637374877, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.0816463440656662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06812500152736903, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11871083304286004, "step": 4540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.8, "completions/max_terminated_length": 464.8, "completions/mean_length": 356.1625, "completions/mean_terminated_length": 356.1625, "completions/min_length": 260.9, "completions/min_terminated_length": 260.9, "epoch": 0.9471273938384679, "grad_norm": 0.1227115198824497, "kl": 0.0503173828125, "learning_rate": 6.935974199395123e-09, "loss": 0.002, "num_tokens": 118433734.0, "reward": 1.9291666746139526, "reward_std": 0.0709901750087738, "rewards/accuracy_reward/mean": 0.9, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02916666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07099017202854156, "step": 4550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.6, "completions/max_terminated_length": 482.6, "completions/mean_length": 380.225, "completions/mean_terminated_length": 380.225, "completions/min_length": 284.2, "completions/min_terminated_length": 284.2, "epoch": 0.9492089925062448, "grad_norm": 0.17771861834744349, "kl": 0.0508544921875, "learning_rate": 6.403784345237473e-09, "loss": 0.002, "num_tokens": 118691128.0, "reward": 2.040000009536743, "reward_std": 0.04670769646763802, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.039999999850988385, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.046707694232463834, "step": 4560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 381.675, "completions/mean_terminated_length": 381.675, "completions/min_length": 275.8, "completions/min_terminated_length": 275.8, "epoch": 0.9512905911740216, "grad_norm": 4.884031037119129, "kl": 0.0503662109375, "learning_rate": 5.892703314237468e-09, "loss": 0.002, "num_tokens": 118937798.0, "reward": 1.871875, "reward_std": 0.0782714195549488, "rewards/accuracy_reward/mean": 0.8625, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.009375, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02651650384068489, "step": 4570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 383.35, "completions/mean_terminated_length": 383.35, "completions/min_length": 274.2, "completions/min_terminated_length": 274.2, "epoch": 0.9533721898417985, "grad_norm": 4.1868205565680325, "kl": 0.04984130859375, "learning_rate": 5.402752962962887e-09, "loss": 0.002, "num_tokens": 119217538.0, "reward": 1.8479166746139526, "reward_std": 0.10860317498445511, "rewards/accuracy_reward/mean": 0.8375, "rewards/accuracy_reward/std": 0.08880758583545685, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010416667163372039, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019795581698417664, "step": 4580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 354.875, "completions/mean_terminated_length": 354.875, "completions/min_length": 269.5, "completions/min_terminated_length": 269.5, "epoch": 0.9554537885095754, "grad_norm": 5.821238368131713, "kl": 0.0527587890625, "learning_rate": 4.933954244320138e-09, "loss": 0.0021, "num_tokens": 119476040.0, "reward": 1.9512932300567627, "reward_std": 0.14620633274316788, "rewards/accuracy_reward/mean": 0.9023348808288574, "rewards/accuracy_reward/std": 0.051754921674728394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04895833395421505, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09445140585303306, "step": 4590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.8, "completions/max_terminated_length": 494.8, "completions/mean_length": 379.55, "completions/mean_terminated_length": 379.55, "completions/min_length": 275.4, "completions/min_terminated_length": 275.4, "epoch": 0.9575353871773522, "grad_norm": 4.872306940707049, "kl": 0.0545166015625, "learning_rate": 4.486327206658314e-09, "loss": 0.0022, "num_tokens": 119739980.0, "reward": 1.9541666746139525, "reward_std": 0.11295498609542846, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02916666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06666397675871849, "step": 4600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.7, "completions/max_terminated_length": 493.7, "completions/mean_length": 383.775, "completions/mean_terminated_length": 383.775, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.9596169858451291, "grad_norm": 0.14209871790245543, "kl": 0.0483154296875, "learning_rate": 4.059890992911819e-09, "loss": 0.0019, "num_tokens": 120000802.0, "reward": 1.9552083492279053, "reward_std": 0.08975056856870652, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03020833432674408, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.043459554016590116, "step": 4610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.7, "completions/max_terminated_length": 499.7, "completions/mean_length": 382.0375, "completions/mean_terminated_length": 382.0375, "completions/min_length": 269.8, "completions/min_terminated_length": 269.8, "epoch": 0.9616985845129059, "grad_norm": 5.254881507252644, "kl": 0.0486572265625, "learning_rate": 3.6546638397817463e-09, "loss": 0.0019, "num_tokens": 120273181.0, "reward": 2.0375, "reward_std": 0.2587729513645172, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 0.08880758583545685, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.1, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.16996537446975707, "step": 4620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.6, "completions/max_terminated_length": 461.6, "completions/mean_length": 377.8375, "completions/mean_terminated_length": 377.8375, "completions/min_length": 284.2, "completions/min_terminated_length": 284.2, "epoch": 0.9637801831806828, "grad_norm": 0.13488627147120524, "kl": 0.0506103515625, "learning_rate": 3.2706630769558372e-09, "loss": 0.002, "num_tokens": 120528808.0, "reward": 1.843750011920929, "reward_std": 0.08866784423589706, "rewards/accuracy_reward/mean": 0.808333333581686, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03541666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08866784125566482, "step": 4630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.7, "completions/max_terminated_length": 454.7, "completions/mean_length": 358.6875, "completions/mean_terminated_length": 358.6875, "completions/min_length": 260.3, "completions/min_terminated_length": 260.3, "epoch": 0.9658617818484596, "grad_norm": 5.075488307300403, "kl": 0.0509033203125, "learning_rate": 2.9079051263675713e-09, "loss": 0.002, "num_tokens": 120792255.0, "reward": 2.015625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.015625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04419417306780815, "step": 4640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.5, "completions/max_terminated_length": 477.5, "completions/mean_length": 391.5375, "completions/mean_terminated_length": 391.5375, "completions/min_length": 272.2, "completions/min_terminated_length": 272.2, "epoch": 0.9679433805162365, "grad_norm": 0.1276972947224501, "kl": 0.053271484375, "learning_rate": 2.5664055014936738e-09, "loss": 0.0021, "num_tokens": 121067802.0, "reward": 1.83125, "reward_std": 0.19529284089803695, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.1595182627439499, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05303300768136978, "step": 4650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.3, "completions/max_terminated_length": 453.3, "completions/mean_length": 347.075, "completions/mean_terminated_length": 347.075, "completions/min_length": 253.3, "completions/min_terminated_length": 253.3, "epoch": 0.9700249791840133, "grad_norm": 0.16208269923991245, "kl": 0.054052734375, "learning_rate": 2.2461788066908127e-09, "loss": 0.0022, "num_tokens": 121329352.0, "reward": 1.8862499952316285, "reward_std": 0.2228688657283783, "rewards/accuracy_reward/mean": 0.8375, "rewards/accuracy_reward/std": 0.15865941643714904, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0487500011920929, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08947228491306305, "step": 4660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.6, "completions/max_terminated_length": 502.6, "completions/mean_length": 395.3125, "completions/mean_terminated_length": 395.3125, "completions/min_length": 307.1, "completions/min_terminated_length": 307.1, "epoch": 0.9721065778517902, "grad_norm": 0.13886386977374376, "kl": 0.050390625, "learning_rate": 1.9472387365710995e-09, "loss": 0.002, "num_tokens": 121598809.0, "reward": 1.614305555820465, "reward_std": 0.0757537841796875, "rewards/accuracy_reward/mean": 0.6038888901472091, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01041666679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02946278378367424, "step": 4670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.7, "completions/max_terminated_length": 482.7, "completions/mean_length": 353.9875, "completions/mean_terminated_length": 353.9875, "completions/min_length": 245.7, "completions/min_terminated_length": 245.7, "epoch": 0.974188176519567, "grad_norm": 0.17509149030292293, "kl": 0.050244140625, "learning_rate": 1.6695980754162231e-09, "loss": 0.002, "num_tokens": 121863576.0, "reward": 2.008333349227905, "reward_std": 0.08614101856946946, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02083333358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05078567415475845, "step": 4680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.1, "completions/max_terminated_length": 503.1, "completions/mean_length": 382.1625, "completions/mean_terminated_length": 382.1625, "completions/min_length": 287.6, "completions/min_terminated_length": 287.6, "epoch": 0.9762697751873439, "grad_norm": 0.13843234084015008, "kl": 0.05654296875, "learning_rate": 1.4132686966307761e-09, "loss": 0.0023, "num_tokens": 122105629.0, "reward": 1.9668092727661133, "reward_std": 0.09535977803170681, "rewards/accuracy_reward/mean": 0.9126425817608833, "rewards/accuracy_reward/std": 0.003666847199201584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05416666716337204, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09169291257858277, "step": 4690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.2, "completions/max_terminated_length": 531.2, "completions/mean_length": 380.175, "completions/mean_terminated_length": 380.175, "completions/min_length": 246.6, "completions/min_terminated_length": 246.6, "epoch": 0.9783513738551207, "grad_norm": 0.1318637209902438, "kl": 0.0500732421875, "learning_rate": 1.1782615622347169e-09, "loss": 0.002, "num_tokens": 122371067.0, "reward": 1.691041684150696, "reward_std": 0.18236968517303467, "rewards/accuracy_reward/mean": 0.6375, "rewards/accuracy_reward/std": 0.12246559858322144, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05354166775941849, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.059904086589813235, "step": 4700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.6, "completions/max_terminated_length": 488.6, "completions/mean_length": 381.2875, "completions/mean_terminated_length": 381.2875, "completions/min_length": 286.8, "completions/min_terminated_length": 286.8, "epoch": 0.9804329725228976, "grad_norm": 0.13333367658511264, "kl": 0.0505615234375, "learning_rate": 9.64586722394356e-10, "loss": 0.002, "num_tokens": 122644978.0, "reward": 1.7597916722297668, "reward_std": 0.21156217083334922, "rewards/accuracy_reward/mean": 0.725, "rewards/accuracy_reward/std": 0.15782093703746797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03479166869074106, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06111666113138199, "step": 4710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.3, "completions/max_terminated_length": 470.3, "completions/mean_length": 385.225, "completions/mean_terminated_length": 385.225, "completions/min_length": 262.8, "completions/min_terminated_length": 262.8, "epoch": 0.9825145711906744, "grad_norm": 0.13765351771426185, "kl": 0.0531005859375, "learning_rate": 7.722533149924771e-10, "loss": 0.0021, "num_tokens": 122896092.0, "reward": 1.8212500095367432, "reward_std": 0.12279465645551682, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.09804592728614807, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.008750000037252903, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0247487373650074, "step": 4720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.1, "completions/max_terminated_length": 524.1, "completions/mean_length": 382.8375, "completions/mean_terminated_length": 382.8375, "completions/min_length": 276.8, "completions/min_terminated_length": 276.8, "epoch": 0.9845961698584513, "grad_norm": 0.1636076030531112, "kl": 0.05126953125, "learning_rate": 6.012695652378163e-10, "loss": 0.0021, "num_tokens": 123137623.0, "reward": 2.0483333349227903, "reward_std": 0.11749013215303421, "rewards/accuracy_reward/mean": 0.9875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0608333345502615, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08213478401303291, "step": 4730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.9, "completions/max_terminated_length": 515.9, "completions/mean_length": 371.475, "completions/mean_terminated_length": 371.475, "completions/min_length": 234.8, "completions/min_terminated_length": 234.8, "epoch": 0.9866777685262281, "grad_norm": 0.12834150880736073, "kl": 0.05224609375, "learning_rate": 4.5164278531312214e-10, "loss": 0.0021, "num_tokens": 123391573.0, "reward": 1.9947916746139527, "reward_std": 0.0983980879187584, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06979166865348815, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07026949226856231, "step": 4740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.4, "completions/max_terminated_length": 484.4, "completions/mean_length": 364.5875, "completions/mean_terminated_length": 364.5875, "completions/min_length": 257.5, "completions/min_terminated_length": 257.5, "epoch": 0.988759367194005, "grad_norm": 4.537840716568053, "kl": 0.05213623046875, "learning_rate": 3.233793740625157e-10, "loss": 0.0021, "num_tokens": 123664276.0, "reward": 1.9135416746139526, "reward_std": 0.0927880972623825, "rewards/accuracy_reward/mean": 0.8875, "rewards/accuracy_reward/std": 0.03535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02604166679084301, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06574104949831963, "step": 4750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.3, "completions/max_terminated_length": 477.3, "completions/mean_length": 376.2, "completions/mean_terminated_length": 376.2, "completions/min_length": 289.8, "completions/min_terminated_length": 289.8, "epoch": 0.9908409658617818, "grad_norm": 0.14229700398876058, "kl": 0.0517578125, "learning_rate": 2.1648481671787679e-10, "loss": 0.0021, "num_tokens": 123922900.0, "reward": 1.8545833587646485, "reward_std": 0.18718414306640624, "rewards/accuracy_reward/mean": 0.8375, "rewards/accuracy_reward/std": 0.1388651818037033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.017083333618938924, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04831896647810936, "step": 4760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.6, "completions/max_terminated_length": 466.6, "completions/mean_length": 369.1875, "completions/mean_terminated_length": 369.1875, "completions/min_length": 265.5, "completions/min_terminated_length": 265.5, "epoch": 0.9929225645295587, "grad_norm": 4.5997180217266225, "kl": 0.0514404296875, "learning_rate": 1.309636846639761e-10, "loss": 0.0021, "num_tokens": 124164635.0, "reward": 1.8562393307685852, "reward_std": 0.12708007395267487, "rewards/accuracy_reward/mean": 0.847905983030796, "rewards/accuracy_reward/std": 0.10350984334945679, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00833333358168602, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02357022911310196, "step": 4770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.3, "completions/max_terminated_length": 522.3, "completions/mean_length": 375.0, "completions/mean_terminated_length": 375.0, "completions/min_length": 273.6, "completions/min_terminated_length": 273.6, "epoch": 0.9950041631973355, "grad_norm": 0.1592605584163004, "kl": 0.0489990234375, "learning_rate": 6.68196352435757e-11, "loss": 0.002, "num_tokens": 124430099.0, "reward": 1.9211574077606202, "reward_std": 0.11545017808675766, "rewards/accuracy_reward/mean": 0.8774074077606201, "rewards/accuracy_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04375, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06199793368577957, "step": 4780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.4, "completions/max_terminated_length": 502.4, "completions/mean_length": 394.6875, "completions/mean_terminated_length": 394.6875, "completions/min_length": 295.4, "completions/min_terminated_length": 295.4, "epoch": 0.9970857618651124, "grad_norm": 0.1317878118742546, "kl": 0.048974609375, "learning_rate": 2.4055411600332197e-11, "loss": 0.002, "num_tokens": 124675202.0, "reward": 1.8478713989257813, "reward_std": 0.11516052857041359, "rewards/accuracy_reward/mean": 0.7989130437374115, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04895833283662796, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06886952444911003, "step": 4790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 356.8, "completions/mean_terminated_length": 356.8, "completions/min_length": 261.9, "completions/min_terminated_length": 261.9, "epoch": 0.9991673605328892, "grad_norm": 0.12595486365913136, "kl": 0.0521484375, "learning_rate": 2.6728425620015094e-12, "loss": 0.0021, "num_tokens": 124950954.0, "reward": 1.940625, "reward_std": 0.09048517867922783, "rewards/accuracy_reward/mean": 0.925, "rewards/accuracy_reward/std": 0.046291005611419675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.015625, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04419417306780815, "step": 4800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.75, "completions/max_terminated_length": 457.75, "completions/mean_length": 377.71875, "completions/mean_terminated_length": 377.71875, "completions/min_length": 287.25, "completions/min_terminated_length": 287.25, "epoch": 1.0, "kl": 0.05474853515625, "num_tokens": 125036766.0, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, "step": 4804, "total_flos": 0.0, "train_loss": 0.002239806222792197, "train_runtime": 195994.6659, "train_samples_per_second": 0.025, "train_steps_per_second": 0.025 } ], "logging_steps": 10, "max_steps": 4804, "num_input_tokens_seen": 125036766, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }