diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13985 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 4804, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.7, + "completions/max_terminated_length": 486.7, + "completions/mean_length": 378.275, + "completions/mean_terminated_length": 378.275, + "completions/min_length": 300.6, + "completions/min_terminated_length": 300.6, + "epoch": 0.0020815986677768525, + "grad_norm": 0.036591879402560146, + "kl": 0.00189361572265625, + "learning_rate": 9.99991340007382e-07, + "loss": 0.0001, + "num_tokens": 257710.0, + "reward": 1.83125, + "reward_std": 0.2132595658302307, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 0.19231742918491362, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0408231720328331, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.8, + "completions/max_terminated_length": 461.8, + "completions/mean_length": 346.6625, + "completions/mean_terminated_length": 346.6625, + "completions/min_length": 255.4, + "completions/min_terminated_length": 255.4, + "epoch": 0.004163197335553705, + "grad_norm": 0.06523851911226544, + "kl": 0.0028778076171875, + "learning_rate": 9.999614046155623e-07, + "loss": 0.0001, + "num_tokens": 514611.0, + "reward": 1.9541666746139525, + "reward_std": 0.12878680378198623, + "rewards/accuracy_reward/mean": 0.95, + "rewards/accuracy_reward/std": 0.11700168251991272, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00416666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01178511455655098, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.3, + "completions/max_terminated_length": 473.3, + "completions/mean_length": 360.95, + "completions/mean_terminated_length": 360.95, + "completions/min_length": 274.2, + "completions/min_terminated_length": 274.2, + "epoch": 0.0062447960033305576, + "grad_norm": 5.522680149040527, + "kl": 0.006317138671875, + "learning_rate": 9.99910088190945e-07, + "loss": 0.0003, + "num_tokens": 770799.0, + "reward": 1.789285707473755, + "reward_std": 0.2921529281884432, + "rewards/accuracy_reward/mean": 0.775, + "rewards/accuracy_reward/std": 0.2112731844186783, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.026785714365541936, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.059929624944925305, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.2, + "completions/max_terminated_length": 520.2, + "completions/mean_length": 393.15, + "completions/mean_terminated_length": 393.15, + "completions/min_length": 297.4, + "completions/min_terminated_length": 297.4, + "epoch": 0.00832639467110741, + "grad_norm": 0.28591884336780005, + "kl": 0.0102813720703125, + "learning_rate": 9.998373929280957e-07, + "loss": 0.0004, + "num_tokens": 1046283.0, + "reward": 1.89375, + "reward_std": 0.2680151164531708, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.22220885157585143, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04580627083778381, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.9, + "completions/max_terminated_length": 479.9, + "completions/mean_length": 377.4125, + "completions/mean_terminated_length": 377.4125, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.010407993338884263, + "grad_norm": 6.083028786361364, + "kl": 0.01175537109375, + "learning_rate": 9.997433219358542e-07, + "loss": 0.0005, + "num_tokens": 1262500.0, + "reward": 1.7572916746139526, + "reward_std": 0.31888280510902406, + "rewards/accuracy_reward/mean": 0.75, + "rewards/accuracy_reward/std": 0.3202547788619995, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00729166716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.013684006035327911, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.4, + "completions/max_terminated_length": 468.4, + "completions/mean_length": 363.9625, + "completions/mean_terminated_length": 363.9625, + "completions/min_length": 262.7, + "completions/min_terminated_length": 262.7, + "epoch": 0.012489592006661115, + "grad_norm": 4.769209326375173, + "kl": 0.013623046875, + "learning_rate": 9.996278792372007e-07, + "loss": 0.0005, + "num_tokens": 1535937.0, + "reward": 1.55625, + "reward_std": 0.27814957946538926, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.25500801801681516, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.3, + "completions/max_terminated_length": 469.3, + "completions/mean_length": 352.2625, + "completions/mean_terminated_length": 352.2625, + "completions/min_length": 249.5, + "completions/min_terminated_length": 249.5, + "epoch": 0.014571190674437969, + "grad_norm": 5.546453307423874, + "kl": 0.0191650390625, + "learning_rate": 9.994910697690848e-07, + "loss": 0.0008, + "num_tokens": 1795742.0, + "reward": 1.8916666746139525, + "reward_std": 0.2752553790807724, + "rewards/accuracy_reward/mean": 0.875, + "rewards/accuracy_reward/std": 0.25587469935417173, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03563483357429505, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.1, + "completions/max_terminated_length": 541.1, + "completions/mean_length": 387.55, + "completions/mean_terminated_length": 387.55, + "completions/min_length": 271.5, + "completions/min_terminated_length": 271.5, + "epoch": 0.01665278934221482, + "grad_norm": 3.9471842106226727, + "kl": 0.015997314453125, + "learning_rate": 9.993328993822132e-07, + "loss": 0.0006, + "num_tokens": 2068658.0, + "reward": 1.775, + "reward_std": 0.20214119255542756, + "rewards/accuracy_reward/mean": 0.7625, + "rewards/accuracy_reward/std": 0.20957585871219636, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.018898223340511323, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.2, + "completions/max_terminated_length": 513.2, + "completions/mean_length": 391.1, + "completions/mean_terminated_length": 391.1, + "completions/min_length": 270.3, + "completions/min_terminated_length": 270.3, + "epoch": 0.018734388009991675, + "grad_norm": 0.10980281422900377, + "kl": 0.0157958984375, + "learning_rate": 9.99153374840801e-07, + "loss": 0.0006, + "num_tokens": 2313026.0, + "reward": 1.625, + "reward_std": 0.2109176844358444, + "rewards/accuracy_reward/mean": 0.5875, + "rewards/accuracy_reward/std": 0.13509859144687653, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03750000149011612, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08040101677179337, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.7, + "completions/max_terminated_length": 537.7, + "completions/mean_length": 412.25, + "completions/mean_terminated_length": 412.25, + "completions/min_length": 318.3, + "completions/min_terminated_length": 318.3, + "epoch": 0.020815986677768527, + "grad_norm": 0.11232662339475162, + "kl": 0.014801025390625, + "learning_rate": 9.989525038222806e-07, + "loss": 0.0006, + "num_tokens": 2583670.0, + "reward": 1.85, + "reward_std": 0.25040294229984283, + "rewards/accuracy_reward/mean": 0.8375, + "rewards/accuracy_reward/std": 0.25586686432361605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.5, + "completions/max_terminated_length": 461.5, + "completions/mean_length": 364.275, + "completions/mean_terminated_length": 364.275, + "completions/min_length": 281.2, + "completions/min_terminated_length": 281.2, + "epoch": 0.02289758534554538, + "grad_norm": 0.17438948467243226, + "kl": 0.018988037109375, + "learning_rate": 9.987302949169748e-07, + "loss": 0.0008, + "num_tokens": 2849020.0, + "reward": 1.8645833253860473, + "reward_std": 0.23733972944319248, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.23144719302654265, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.002083333395421505, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.00589255727827549, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 352.5875, + "completions/mean_terminated_length": 352.5875, + "completions/min_length": 236.4, + "completions/min_terminated_length": 236.4, + "epoch": 0.02497918401332223, + "grad_norm": 5.1924414165917305, + "kl": 0.02213134765625, + "learning_rate": 9.984867576277293e-07, + "loss": 0.0009, + "num_tokens": 3103243.0, + "reward": 1.8191666722297668, + "reward_std": 0.24814118593931198, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 0.2386084347963333, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.006666666828095913, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01885618269443512, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.5, + "completions/max_terminated_length": 416.5, + "completions/mean_length": 327.925, + "completions/mean_terminated_length": 327.925, + "completions/min_length": 242.6, + "completions/min_terminated_length": 242.6, + "epoch": 0.027060782681099085, + "grad_norm": 0.2046874877968627, + "kl": 0.024169921875, + "learning_rate": 9.982219023695053e-07, + "loss": 0.001, + "num_tokens": 3339501.0, + "reward": 1.7830357074737548, + "reward_std": 0.2340016055852175, + "rewards/accuracy_reward/mean": 0.775, + "rewards/accuracy_reward/std": 0.2112731844186783, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.008035714365541935, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.022728431969881058, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 294.975, + "completions/mean_terminated_length": 294.975, + "completions/min_length": 208.2, + "completions/min_terminated_length": 208.2, + "epoch": 0.029142381348875937, + "grad_norm": 0.1422658013873825, + "kl": 0.02890625, + "learning_rate": 9.979357404689349e-07, + "loss": 0.0012, + "num_tokens": 3607371.0, + "reward": 1.8625, + "reward_std": 0.16875659823417663, + "rewards/accuracy_reward/mean": 0.85, + "rewards/accuracy_reward/std": 0.1334012657403946, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.6, + "completions/max_terminated_length": 452.6, + "completions/mean_length": 341.3125, + "completions/mean_terminated_length": 341.3125, + "completions/min_length": 249.2, + "completions/min_terminated_length": 249.2, + "epoch": 0.03122398001665279, + "grad_norm": 6.274183399039419, + "kl": 0.025732421875, + "learning_rate": 9.97628284163837e-07, + "loss": 0.001, + "num_tokens": 3859748.0, + "reward": 1.8802689909934998, + "reward_std": 0.21689079953357576, + "rewards/accuracy_reward/mean": 0.8511023391969502, + "rewards/accuracy_reward/std": 0.11791455755010247, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04166666567325592, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07967560291290283, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.7, + "completions/max_terminated_length": 469.7, + "completions/mean_length": 342.8625, + "completions/mean_terminated_length": 342.8625, + "completions/min_length": 248.8, + "completions/min_terminated_length": 248.8, + "epoch": 0.03330557868442964, + "grad_norm": 4.393785318086197, + "kl": 0.03060302734375, + "learning_rate": 9.97299546602693e-07, + "loss": 0.0012, + "num_tokens": 4128401.0, + "reward": 1.934375, + "reward_std": 0.17850233241915703, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.1632926881313324, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.009375, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02651650384068489, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 557.4, + "completions/max_terminated_length": 514.4, + "completions/mean_length": 408.15, + "completions/mean_terminated_length": 400.9410736083984, + "completions/min_length": 293.4, + "completions/min_terminated_length": 293.4, + "epoch": 0.03538717735220649, + "grad_norm": 0.14407093242039448, + "kl": 0.02762451171875, + "learning_rate": 9.969495418440855e-07, + "loss": 0.0011, + "num_tokens": 4386837.0, + "reward": 1.7154840588569642, + "reward_std": 0.12313865721225739, + "rewards/accuracy_reward/mean": 0.7217340528964996, + "rewards/accuracy_reward/std": 0.11099039763212204, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.2, + "completions/max_terminated_length": 594.2, + "completions/mean_length": 469.9625, + "completions/mean_terminated_length": 469.9625, + "completions/min_length": 344.2, + "completions/min_terminated_length": 344.2, + "epoch": 0.03746877601998335, + "grad_norm": 4.568523263801617, + "kl": 0.0269287109375, + "learning_rate": 9.965782848560961e-07, + "loss": 0.0011, + "num_tokens": 4635530.0, + "reward": 1.740625, + "reward_std": 0.36733007729053496, + "rewards/accuracy_reward/mean": 0.7, + "rewards/accuracy_reward/std": 0.3130935370922089, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.040625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08054837882518769, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.5, + "completions/max_terminated_length": 553.5, + "completions/mean_length": 426.9375, + "completions/mean_terminated_length": 426.9375, + "completions/min_length": 300.1, + "completions/min_terminated_length": 300.1, + "epoch": 0.0395503746877602, + "grad_norm": 4.8879251566662925, + "kl": 0.0309326171875, + "learning_rate": 9.961857915156661e-07, + "loss": 0.0012, + "num_tokens": 4900021.0, + "reward": 1.9059523820877076, + "reward_std": 0.23443024940788745, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.20580926835536956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.018452381156384944, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0521912157535553, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.6, + "completions/max_terminated_length": 615.6, + "completions/mean_length": 441.1625, + "completions/mean_terminated_length": 441.1625, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.041631973355537054, + "grad_norm": 4.457740693953414, + "kl": 0.029052734375, + "learning_rate": 9.95772078607917e-07, + "loss": 0.0012, + "num_tokens": 5177594.0, + "reward": 1.746875, + "reward_std": 0.15954835414886476, + "rewards/accuracy_reward/mean": 0.7375, + "rewards/accuracy_reward/std": 0.1595182627439499, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.009375, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02651650384068489, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 392.4625, + "completions/mean_terminated_length": 392.4625, + "completions/min_length": 311.9, + "completions/min_terminated_length": 311.9, + "epoch": 0.043713572023313906, + "grad_norm": 0.20284855124141812, + "kl": 0.03153076171875, + "learning_rate": 9.953371638254334e-07, + "loss": 0.0013, + "num_tokens": 5458767.0, + "reward": 1.8024999976158143, + "reward_std": 0.20019548237323762, + "rewards/accuracy_reward/mean": 0.8, + "rewards/accuracy_reward/std": 0.2032530963420868, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.002500000037252903, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.00707106813788414, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 340.35, + "completions/mean_terminated_length": 340.35, + "completions/min_length": 236.8, + "completions/min_terminated_length": 236.8, + "epoch": 0.04579517069109076, + "grad_norm": 0.12400643377970895, + "kl": 0.0336181640625, + "learning_rate": 9.94881065767505e-07, + "loss": 0.0013, + "num_tokens": 5714411.0, + "reward": 1.9456944465637207, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.9394444465637207, + "rewards/accuracy_reward/std": 0.07071067690849304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.2, + "completions/max_terminated_length": 501.2, + "completions/mean_length": 379.25, + "completions/mean_terminated_length": 379.25, + "completions/min_length": 290.2, + "completions/min_terminated_length": 290.2, + "epoch": 0.04787676935886761, + "grad_norm": 4.5686106575902805, + "kl": 0.0332763671875, + "learning_rate": 9.94403803939333e-07, + "loss": 0.0013, + "num_tokens": 5967975.0, + "reward": 1.8041666746139526, + "reward_std": 0.12878680378198623, + "rewards/accuracy_reward/mean": 0.775, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02916666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08249579146504402, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.7, + "completions/max_terminated_length": 483.7, + "completions/mean_length": 377.3625, + "completions/mean_terminated_length": 377.3625, + "completions/min_length": 292.5, + "completions/min_terminated_length": 292.5, + "epoch": 0.04995836802664446, + "grad_norm": 0.19094757033893725, + "kl": 0.0373779296875, + "learning_rate": 9.939053987511937e-07, + "loss": 0.0015, + "num_tokens": 6220364.0, + "reward": 1.8916666746139525, + "reward_std": 0.10983104258775711, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.09804592728614807, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00416666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01178511455655098, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.8, + "completions/max_terminated_length": 489.8, + "completions/mean_length": 383.15, + "completions/mean_terminated_length": 383.15, + "completions/min_length": 289.7, + "completions/min_terminated_length": 289.7, + "epoch": 0.05203996669442131, + "grad_norm": 4.443711087325592, + "kl": 0.0322509765625, + "learning_rate": 9.933858715175687e-07, + "loss": 0.0013, + "num_tokens": 6465384.0, + "reward": 1.7186701416969299, + "reward_std": 0.19897533096373082, + "rewards/accuracy_reward/mean": 0.7143844276666641, + "rewards/accuracy_reward/std": 0.1334012657403946, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01678571440279484, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04747716933488846, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.8, + "completions/max_terminated_length": 473.8, + "completions/mean_length": 350.7125, + "completions/mean_terminated_length": 350.7125, + "completions/min_length": 257.3, + "completions/min_terminated_length": 257.3, + "epoch": 0.05412156536219817, + "grad_norm": 5.545971354020349, + "kl": 0.03245849609375, + "learning_rate": 9.928452444562298e-07, + "loss": 0.0013, + "num_tokens": 6740745.0, + "reward": 1.9385416626930236, + "reward_std": 0.14416180178523064, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.12416292428970337, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.013541666977107524, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03830162100493908, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.8, + "completions/max_terminated_length": 437.8, + "completions/mean_length": 324.8875, + "completions/mean_terminated_length": 324.8875, + "completions/min_length": 238.8, + "completions/min_terminated_length": 238.8, + "epoch": 0.05620316402997502, + "grad_norm": 5.431553614830727, + "kl": 0.0399658203125, + "learning_rate": 9.92283540687292e-07, + "loss": 0.0016, + "num_tokens": 7001736.0, + "reward": 1.7467147350311278, + "reward_std": 0.22841061986982822, + "rewards/accuracy_reward/mean": 0.7102564103901386, + "rewards/accuracy_reward/std": 0.14603425860404967, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.036458333395421504, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08728792332112789, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.4, + "completions/max_terminated_length": 418.4, + "completions/mean_length": 328.3875, + "completions/mean_terminated_length": 328.3875, + "completions/min_length": 237.5, + "completions/min_terminated_length": 237.5, + "epoch": 0.058284762697751874, + "grad_norm": 5.238127325433427, + "kl": 0.043896484375, + "learning_rate": 9.917007842322228e-07, + "loss": 0.0018, + "num_tokens": 7259391.0, + "reward": 1.88125, + "reward_std": 0.2597082987427711, + "rewards/accuracy_reward/mean": 0.8375, + "rewards/accuracy_reward/std": 0.12793734967708587, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11397495716810227, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.5, + "completions/max_terminated_length": 402.5, + "completions/mean_length": 312.5375, + "completions/mean_terminated_length": 312.5375, + "completions/min_length": 235.2, + "completions/min_terminated_length": 235.2, + "epoch": 0.060366361365528726, + "grad_norm": 5.339336919324121, + "kl": 0.05615234375, + "learning_rate": 9.910970000128159e-07, + "loss": 0.0022, + "num_tokens": 7532170.0, + "reward": 1.8760416746139525, + "reward_std": 0.2622366651892662, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 0.1595182627439499, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06354166679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.13723524883389474, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 302.975, + "completions/mean_terminated_length": 302.975, + "completions/min_length": 214.8, + "completions/min_terminated_length": 214.8, + "epoch": 0.06244796003330558, + "grad_norm": 0.2096105552296755, + "kl": 0.05185546875, + "learning_rate": 9.904722138501244e-07, + "loss": 0.0021, + "num_tokens": 7784800.0, + "reward": 1.9462037086486816, + "reward_std": 0.13529810905456544, + "rewards/accuracy_reward/mean": 0.8920370370149613, + "rewards/accuracy_reward/std": 0.07071067690849304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05416666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08084159195423127, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.3, + "completions/max_terminated_length": 379.3, + "completions/mean_length": 316.3875, + "completions/mean_terminated_length": 316.3875, + "completions/min_length": 248.6, + "completions/min_terminated_length": 248.6, + "epoch": 0.06452955870108243, + "grad_norm": 0.19562255789500088, + "kl": 0.044677734375, + "learning_rate": 9.89826452463358e-07, + "loss": 0.0018, + "num_tokens": 8023007.0, + "reward": 1.8239700555801392, + "reward_std": 0.2665687516331673, + "rewards/accuracy_reward/mean": 0.7791486293077469, + "rewards/accuracy_reward/std": 0.2205115258693695, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04482142850756645, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06473954916000366, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.9, + "completions/max_terminated_length": 467.9, + "completions/mean_length": 357.7, + "completions/mean_terminated_length": 357.7, + "completions/min_length": 250.5, + "completions/min_terminated_length": 250.5, + "epoch": 0.06661115736885928, + "grad_norm": 4.368009131540421, + "kl": 0.0458740234375, + "learning_rate": 9.89159743468739e-07, + "loss": 0.0018, + "num_tokens": 8269055.0, + "reward": 1.8868750095367433, + "reward_std": 0.136459456756711, + "rewards/accuracy_reward/mean": 0.85, + "rewards/accuracy_reward/std": 0.05345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03687500022351742, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08300721384584904, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 377.475, + "completions/mean_terminated_length": 377.475, + "completions/min_length": 246.1, + "completions/min_terminated_length": 246.1, + "epoch": 0.06869275603663613, + "grad_norm": 5.429243629442194, + "kl": 0.0504150390625, + "learning_rate": 9.884721153783223e-07, + "loss": 0.002, + "num_tokens": 8520957.0, + "reward": 1.6583147287368774, + "reward_std": 0.16093675643205643, + "rewards/accuracy_reward/mean": 0.6437313750386238, + "rewards/accuracy_reward/std": 0.14980084896087648, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01458333432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.027368012070655822, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.2, + "completions/max_terminated_length": 546.2, + "completions/mean_length": 405.0375, + "completions/mean_terminated_length": 405.0375, + "completions/min_length": 285.6, + "completions/min_terminated_length": 285.6, + "epoch": 0.07077435470441298, + "grad_norm": 4.8225656345010695, + "kl": 0.047509765625, + "learning_rate": 9.87763597598775e-07, + "loss": 0.0019, + "num_tokens": 8777248.0, + "reward": 1.9, + "reward_std": 0.07071067690849304, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.3, + "completions/max_terminated_length": 567.3, + "completions/mean_length": 407.8375, + "completions/mean_terminated_length": 407.8375, + "completions/min_length": 295.5, + "completions/min_terminated_length": 295.5, + "epoch": 0.07285595337218984, + "grad_norm": 5.072558874680301, + "kl": 0.0479248046875, + "learning_rate": 9.8703422043012e-07, + "loss": 0.0019, + "num_tokens": 9052947.0, + "reward": 1.8925000190734864, + "reward_std": 0.24598965793848038, + "rewards/accuracy_reward/mean": 0.8375, + "rewards/accuracy_reward/std": 0.12793734967708587, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05500000044703483, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11805230379104614, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.1, + "completions/max_terminated_length": 542.1, + "completions/mean_length": 415.9375, + "completions/mean_terminated_length": 415.9375, + "completions/min_length": 292.3, + "completions/min_terminated_length": 292.3, + "epoch": 0.0749375520399667, + "grad_norm": 5.68978401195414, + "kl": 0.04268798828125, + "learning_rate": 9.862840150644394e-07, + "loss": 0.0017, + "num_tokens": 9315070.0, + "reward": 1.849039077758789, + "reward_std": 0.06558054089546203, + "rewards/accuracy_reward/mean": 0.8448724031448365, + "rewards/accuracy_reward/std": 0.06210371255874634, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00416666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01178511455655098, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.3, + "completions/max_terminated_length": 436.3, + "completions/mean_length": 318.0625, + "completions/mean_terminated_length": 318.0625, + "completions/min_length": 208.2, + "completions/min_terminated_length": 208.2, + "epoch": 0.07701915070774355, + "grad_norm": 0.25251783885777485, + "kl": 0.0482177734375, + "learning_rate": 9.855130135845404e-07, + "loss": 0.0019, + "num_tokens": 9576043.0, + "reward": 1.93067307472229, + "reward_std": 0.2553727373480797, + "rewards/accuracy_reward/mean": 0.8744230777025223, + "rewards/accuracy_reward/std": 0.12793734967708587, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.12743539363145828, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.7, + "completions/max_terminated_length": 542.7, + "completions/mean_length": 392.825, + "completions/mean_terminated_length": 392.825, + "completions/min_length": 272.9, + "completions/min_terminated_length": 272.9, + "epoch": 0.0791007493755204, + "grad_norm": 0.1796751788206183, + "kl": 0.043994140625, + "learning_rate": 9.847212489625844e-07, + "loss": 0.0018, + "num_tokens": 9832709.0, + "reward": 1.9983333587646483, + "reward_std": 0.05785674601793289, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010833333618938923, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02250140383839607, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.1, + "completions/max_terminated_length": 492.1, + "completions/mean_length": 373.2375, + "completions/mean_terminated_length": 373.2375, + "completions/min_length": 264.7, + "completions/min_terminated_length": 264.7, + "epoch": 0.08118234804329726, + "grad_norm": 0.2718271523009852, + "kl": 0.04495849609375, + "learning_rate": 9.839087550586756e-07, + "loss": 0.0018, + "num_tokens": 10111256.0, + "reward": 1.8685897588729858, + "reward_std": 0.06406784504652023, + "rewards/accuracy_reward/mean": 0.8369230777025223, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03166666720062494, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06406784281134606, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 356.025, + "completions/mean_terminated_length": 356.025, + "completions/min_length": 246.7, + "completions/min_terminated_length": 246.7, + "epoch": 0.08326394671107411, + "grad_norm": 0.22730851394419502, + "kl": 0.0422119140625, + "learning_rate": 9.830755666194136e-07, + "loss": 0.0017, + "num_tokens": 10386938.0, + "reward": 1.9456876635551452, + "reward_std": 0.01767766922712326, + "rewards/accuracy_reward/mean": 0.9394376605749131, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.6, + "completions/max_terminated_length": 498.6, + "completions/mean_length": 388.375, + "completions/mean_terminated_length": 388.375, + "completions/min_length": 271.4, + "completions/min_terminated_length": 271.4, + "epoch": 0.08534554537885096, + "grad_norm": 5.316651335823694, + "kl": 0.045263671875, + "learning_rate": 9.822217192764078e-07, + "loss": 0.0018, + "num_tokens": 10647280.0, + "reward": 1.9, + "reward_std": 0.11700168251991272, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.3, + "completions/max_terminated_length": 483.3, + "completions/mean_length": 375.525, + "completions/mean_terminated_length": 375.525, + "completions/min_length": 268.7, + "completions/min_terminated_length": 268.7, + "epoch": 0.08742714404662781, + "grad_norm": 0.16182374320605225, + "kl": 0.04052734375, + "learning_rate": 9.813472495447527e-07, + "loss": 0.0016, + "num_tokens": 10923474.0, + "reward": 1.8870895028114318, + "reward_std": 0.11237782835960389, + "rewards/accuracy_reward/mean": 0.866256158053875, + "rewards/accuracy_reward/std": 0.05345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02083333358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05892556756734848, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.8, + "completions/max_terminated_length": 505.8, + "completions/mean_length": 379.7, + "completions/mean_terminated_length": 379.7, + "completions/min_length": 259.8, + "completions/min_terminated_length": 259.8, + "epoch": 0.08950874271440466, + "grad_norm": 0.15437642494043818, + "kl": 0.0384033203125, + "learning_rate": 9.804521948214671e-07, + "loss": 0.0015, + "num_tokens": 11187162.0, + "reward": 1.821875, + "reward_std": 0.13258251920342445, + "rewards/accuracy_reward/mean": 0.8, + "rewards/accuracy_reward/std": 0.07071067690849304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.021875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06187184229493141, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.5, + "completions/max_terminated_length": 500.5, + "completions/mean_length": 393.1375, + "completions/mean_terminated_length": 393.1375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.09159034138218151, + "grad_norm": 0.2088322620701091, + "kl": 0.0418701171875, + "learning_rate": 9.795365933838946e-07, + "loss": 0.0017, + "num_tokens": 11460861.0, + "reward": 1.9864583492279053, + "reward_std": 0.09016691148281097, + "rewards/accuracy_reward/mean": 0.95, + "rewards/accuracy_reward/std": 0.05345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03645833432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03671465814113617, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 394.0875, + "completions/mean_terminated_length": 394.0875, + "completions/min_length": 301.1, + "completions/min_terminated_length": 301.1, + "epoch": 0.09367194004995837, + "grad_norm": 0.14926889020190243, + "kl": 0.03753662109375, + "learning_rate": 9.786004843880663e-07, + "loss": 0.0015, + "num_tokens": 11682516.0, + "reward": 2.0981250047683715, + "reward_std": 0.056880544126033786, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.09812500476837158, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0568805381655693, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.5, + "completions/max_terminated_length": 449.5, + "completions/mean_length": 354.5375, + "completions/mean_terminated_length": 354.5375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.09575353871773522, + "grad_norm": 0.19510909785876593, + "kl": 0.0388916015625, + "learning_rate": 9.776439078670266e-07, + "loss": 0.0016, + "num_tokens": 11949799.0, + "reward": 1.8666666746139526, + "reward_std": 0.12705429196357726, + "rewards/accuracy_reward/mean": 0.85, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0471404530107975, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.5, + "completions/max_terminated_length": 507.5, + "completions/mean_length": 383.425, + "completions/mean_terminated_length": 383.425, + "completions/min_length": 281.3, + "completions/min_terminated_length": 281.3, + "epoch": 0.09783513738551207, + "grad_norm": 5.56912057426005, + "kl": 0.042724609375, + "learning_rate": 9.766669047291212e-07, + "loss": 0.0017, + "num_tokens": 12211601.0, + "reward": 1.8916666746139525, + "reward_std": 0.08512316644191742, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.034930617362260816, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.1, + "completions/max_terminated_length": 502.1, + "completions/mean_length": 392.5125, + "completions/mean_terminated_length": 392.5125, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.09991673605328892, + "grad_norm": 5.648352316551891, + "kl": 0.043408203125, + "learning_rate": 9.756695167562477e-07, + "loss": 0.0017, + "num_tokens": 12472202.0, + "reward": 1.7966435194015502, + "reward_std": 0.12664942545816302, + "rewards/accuracy_reward/mean": 0.7643518518656492, + "rewards/accuracy_reward/std": 0.05289790946990251, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03229166716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07375151664018631, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 370.5875, + "completions/mean_terminated_length": 370.5875, + "completions/min_length": 278.2, + "completions/min_terminated_length": 278.2, + "epoch": 0.10199833472106577, + "grad_norm": 4.6762294422697215, + "kl": 0.048583984375, + "learning_rate": 9.746517866020685e-07, + "loss": 0.0019, + "num_tokens": 12731665.0, + "reward": 1.7166666746139527, + "reward_std": 0.19317471832036973, + "rewards/accuracy_reward/mean": 0.7125, + "rewards/accuracy_reward/std": 0.18138959705829621, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00416666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01178511455655098, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 346.7, + "completions/mean_terminated_length": 346.7, + "completions/min_length": 225.1, + "completions/min_terminated_length": 225.1, + "epoch": 0.10407993338884262, + "grad_norm": 4.944784659074124, + "kl": 0.0518310546875, + "learning_rate": 9.736137577901864e-07, + "loss": 0.0021, + "num_tokens": 13005057.0, + "reward": 1.8979166746139526, + "reward_std": 0.12586160004138947, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.10606601536273956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010416667163372039, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019795581698417664, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.7, + "completions/max_terminated_length": 476.7, + "completions/mean_length": 332.4625, + "completions/mean_terminated_length": 332.4625, + "completions/min_length": 208.4, + "completions/min_terminated_length": 208.4, + "epoch": 0.10616153205661949, + "grad_norm": 5.837744557276559, + "kl": 0.0501220703125, + "learning_rate": 9.725554747122847e-07, + "loss": 0.002, + "num_tokens": 13278646.0, + "reward": 1.9418981552124024, + "reward_std": 0.14538512378931046, + "rewards/accuracy_reward/mean": 0.8939814820885659, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04791666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1100297823548317, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.6, + "completions/max_terminated_length": 439.6, + "completions/mean_length": 324.5875, + "completions/mean_terminated_length": 324.5875, + "completions/min_length": 196.9, + "completions/min_terminated_length": 196.9, + "epoch": 0.10824313072439634, + "grad_norm": 5.582698319005468, + "kl": 0.0571044921875, + "learning_rate": 9.714769826262268e-07, + "loss": 0.0023, + "num_tokens": 13529733.0, + "reward": 1.8927083492279053, + "reward_std": 0.13453055396676064, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.08880758583545685, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03020833395421505, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04572295844554901, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 316.6875, + "completions/mean_terminated_length": 316.6875, + "completions/min_length": 196.7, + "completions/min_terminated_length": 196.7, + "epoch": 0.1103247293921732, + "grad_norm": 0.29368782924063275, + "kl": 0.0617919921875, + "learning_rate": 9.703783276541226e-07, + "loss": 0.0025, + "num_tokens": 13802252.0, + "reward": 2.009375, + "reward_std": 0.07692329585552216, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.021875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.052874819934368135, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.7, + "completions/max_terminated_length": 491.7, + "completions/mean_length": 361.0, + "completions/mean_terminated_length": 361.0, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.11240632805995004, + "grad_norm": 0.12367376826560877, + "kl": 0.0528076171875, + "learning_rate": 9.69259556780355e-07, + "loss": 0.0021, + "num_tokens": 14057620.0, + "reward": 1.76875, + "reward_std": 0.17920753061771394, + "rewards/accuracy_reward/mean": 0.7625, + "rewards/accuracy_reward/std": 0.185156187415123, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 340.775, + "completions/mean_terminated_length": 340.775, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.1144879267277269, + "grad_norm": 0.13246155741271956, + "kl": 0.05185546875, + "learning_rate": 9.6812071784957e-07, + "loss": 0.0021, + "num_tokens": 14323042.0, + "reward": 1.8, + "reward_std": 0.14548112079501152, + "rewards/accuracy_reward/mean": 0.7875, + "rewards/accuracy_reward/std": 0.12793734967708587, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.027439431101083756, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.5, + "completions/max_terminated_length": 452.5, + "completions/mean_length": 353.1625, + "completions/mean_terminated_length": 353.1625, + "completions/min_length": 242.9, + "completions/min_terminated_length": 242.9, + "epoch": 0.11656952539550375, + "grad_norm": 0.16536302838147435, + "kl": 0.0464111328125, + "learning_rate": 9.669618595646326e-07, + "loss": 0.0019, + "num_tokens": 14601999.0, + "reward": 1.9729166746139526, + "reward_std": 0.0812177062034607, + "rewards/accuracy_reward/mean": 0.9625, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01041666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02946278378367424, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.8, + "completions/max_terminated_length": 523.8, + "completions/mean_length": 402.1625, + "completions/mean_terminated_length": 402.1625, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.1186511240632806, + "grad_norm": 5.102191465543784, + "kl": 0.044482421875, + "learning_rate": 9.657830314845423e-07, + "loss": 0.0018, + "num_tokens": 14880164.0, + "reward": 2.00625, + "reward_std": 0.01767766922712326, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.7, + "completions/max_terminated_length": 511.7, + "completions/mean_length": 403.7875, + "completions/mean_terminated_length": 403.7875, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.12073272273105745, + "grad_norm": 0.17497369248614117, + "kl": 0.040087890625, + "learning_rate": 9.64584284022314e-07, + "loss": 0.0016, + "num_tokens": 15134971.0, + "reward": 2.0322916746139525, + "reward_std": 0.07375151813030242, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03229166716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07375151664018631, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.2, + "completions/max_terminated_length": 521.2, + "completions/mean_length": 390.7375, + "completions/mean_terminated_length": 390.7375, + "completions/min_length": 274.5, + "completions/min_terminated_length": 274.5, + "epoch": 0.1228143213988343, + "grad_norm": 0.16337098172815034, + "kl": 0.04287109375, + "learning_rate": 9.633656684428226e-07, + "loss": 0.0017, + "num_tokens": 15387070.0, + "reward": 1.9728573560714722, + "reward_std": 0.16534992158412934, + "rewards/accuracy_reward/mean": 0.9132739961147308, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.059583334252238274, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09020403549075126, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.4, + "completions/max_terminated_length": 445.4, + "completions/mean_length": 344.3875, + "completions/mean_terminated_length": 344.3875, + "completions/min_length": 234.9, + "completions/min_terminated_length": 234.9, + "epoch": 0.12489592006661115, + "grad_norm": 0.19085037941157149, + "kl": 0.0505859375, + "learning_rate": 9.6212723686061e-07, + "loss": 0.002, + "num_tokens": 15656085.0, + "reward": 2.0025000095367433, + "reward_std": 0.07778174281120301, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.015000000037252903, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.042426406592130664, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.5, + "completions/max_terminated_length": 516.5, + "completions/mean_length": 376.7875, + "completions/mean_terminated_length": 376.7875, + "completions/min_length": 261.9, + "completions/min_terminated_length": 261.9, + "epoch": 0.126977518734388, + "grad_norm": 0.18065584438130572, + "kl": 0.04794921875, + "learning_rate": 9.608690422376572e-07, + "loss": 0.0019, + "num_tokens": 15930868.0, + "reward": 1.9028934240341187, + "reward_std": 0.08380073457956314, + "rewards/accuracy_reward/mean": 0.8815243899822235, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.021369047462940216, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0375097319483757, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.1, + "completions/max_terminated_length": 498.1, + "completions/mean_length": 373.925, + "completions/mean_terminated_length": 373.925, + "completions/min_length": 272.3, + "completions/min_terminated_length": 272.3, + "epoch": 0.12905911740216486, + "grad_norm": 0.1464513024616147, + "kl": 0.0505615234375, + "learning_rate": 9.595911383811186e-07, + "loss": 0.002, + "num_tokens": 16202262.0, + "reward": 1.5799382686614991, + "reward_std": 0.18771235942840575, + "rewards/accuracy_reward/mean": 0.5799382716417313, + "rewards/accuracy_reward/std": 0.18771235942840575, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.6, + "completions/max_terminated_length": 491.6, + "completions/mean_length": 384.05, + "completions/mean_terminated_length": 384.05, + "completions/min_length": 271.8, + "completions/min_terminated_length": 271.8, + "epoch": 0.1311407160699417, + "grad_norm": 5.53441289953858, + "kl": 0.0549072265625, + "learning_rate": 9.58293579941021e-07, + "loss": 0.0022, + "num_tokens": 16469482.0, + "reward": 1.6495346546173095, + "reward_std": 0.19612031616270542, + "rewards/accuracy_reward/mean": 0.6130763038992881, + "rewards/accuracy_reward/std": 0.13490437557920815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03645833432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06681375280022621, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.3, + "completions/max_terminated_length": 492.3, + "completions/mean_length": 367.45, + "completions/mean_terminated_length": 367.45, + "completions/min_length": 274.3, + "completions/min_terminated_length": 274.3, + "epoch": 0.13322231473771856, + "grad_norm": 5.668662679753432, + "kl": 0.05673828125, + "learning_rate": 9.56976422407927e-07, + "loss": 0.0023, + "num_tokens": 16731918.0, + "reward": 1.7243749976158143, + "reward_std": 0.11841271668672562, + "rewards/accuracy_reward/mean": 0.7125, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02437499985098839, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.047702043503522876, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.3, + "completions/max_terminated_length": 458.3, + "completions/mean_length": 342.9625, + "completions/mean_terminated_length": 342.9625, + "completions/min_length": 239.3, + "completions/min_terminated_length": 239.3, + "epoch": 0.1353039134054954, + "grad_norm": 0.18411771479742783, + "kl": 0.053857421875, + "learning_rate": 9.556397221105614e-07, + "loss": 0.0022, + "num_tokens": 17006411.0, + "reward": 1.8183712363243103, + "reward_std": 0.0936412863433361, + "rewards/accuracy_reward/mean": 0.7871212124824524, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125000111758709, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04735027924180031, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.8, + "completions/max_terminated_length": 436.8, + "completions/mean_length": 349.0875, + "completions/mean_terminated_length": 349.0875, + "completions/min_length": 262.7, + "completions/min_terminated_length": 262.7, + "epoch": 0.13738551207327226, + "grad_norm": 5.511871839955663, + "kl": 0.0563720703125, + "learning_rate": 9.542835362134027e-07, + "loss": 0.0023, + "num_tokens": 17261754.0, + "reward": 2.063749980926514, + "reward_std": 0.17790938653051852, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07624999973922968, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.15386090911924838, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.5, + "completions/max_terminated_length": 428.5, + "completions/mean_length": 331.75, + "completions/mean_terminated_length": 331.75, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.13946711074104912, + "grad_norm": 5.592099803836835, + "kl": 0.0670654296875, + "learning_rate": 9.529079227142383e-07, + "loss": 0.0027, + "num_tokens": 17534422.0, + "reward": 1.973035740852356, + "reward_std": 0.20074295550584792, + "rewards/accuracy_reward/mean": 0.9375, + "rewards/accuracy_reward/std": 0.15235702097415924, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0355357151478529, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07101892232894898, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.8, + "completions/max_terminated_length": 480.8, + "completions/mean_length": 370.5125, + "completions/mean_terminated_length": 370.5125, + "completions/min_length": 264.4, + "completions/min_terminated_length": 264.4, + "epoch": 0.14154870940882597, + "grad_norm": 0.18265559151343294, + "kl": 0.055810546875, + "learning_rate": 9.515129404416833e-07, + "loss": 0.0022, + "num_tokens": 17771455.0, + "reward": 2.057083344459534, + "reward_std": 0.12538987398147583, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06958333496004343, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09003453925251961, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.2, + "completions/max_terminated_length": 518.2, + "completions/mean_length": 393.5, + "completions/mean_terminated_length": 393.5, + "completions/min_length": 295.4, + "completions/min_terminated_length": 295.4, + "epoch": 0.14363030807660282, + "grad_norm": 4.6895976996574165, + "kl": 0.05400390625, + "learning_rate": 9.500986490526667e-07, + "loss": 0.0022, + "num_tokens": 18016639.0, + "reward": 1.9352083563804627, + "reward_std": 0.13558952510356903, + "rewards/accuracy_reward/mean": 0.875, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06020833440124988, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09784583821892738, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 546.2, + "completions/max_terminated_length": 482.9, + "completions/mean_length": 368.3875, + "completions/mean_terminated_length": 359.7875, + "completions/min_length": 275.2, + "completions/min_terminated_length": 275.2, + "epoch": 0.14571190674437967, + "grad_norm": 0.3020241800583253, + "kl": 0.057861328125, + "learning_rate": 9.486651090298781e-07, + "loss": 0.0023, + "num_tokens": 18254198.0, + "reward": 1.9729166746139526, + "reward_std": 0.17288785427808762, + "rewards/accuracy_reward/mean": 0.975, + "rewards/accuracy_reward/std": 0.07071067690849304, + "rewards/format_reward/mean": 0.975, + "rewards/format_reward/std": 0.07071067690849304, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04459637701511383, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.8, + "completions/max_terminated_length": 470.8, + "completions/mean_length": 375.4125, + "completions/mean_terminated_length": 375.4125, + "completions/min_length": 293.4, + "completions/min_terminated_length": 293.4, + "epoch": 0.14779350541215652, + "grad_norm": 4.671918257015997, + "kl": 0.0577392578125, + "learning_rate": 9.472123816791822e-07, + "loss": 0.0023, + "num_tokens": 18496943.0, + "reward": 1.9375000238418578, + "reward_std": 0.18562961220741273, + "rewards/accuracy_reward/mean": 0.9125, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03750000111758709, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06862791702151298, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.1, + "completions/max_terminated_length": 474.1, + "completions/mean_length": 357.7625, + "completions/mean_terminated_length": 357.7625, + "completions/min_length": 270.7, + "completions/min_terminated_length": 270.7, + "epoch": 0.1498751040799334, + "grad_norm": 5.427919347031838, + "kl": 0.05341796875, + "learning_rate": 9.457405291269969e-07, + "loss": 0.0021, + "num_tokens": 18735012.0, + "reward": 1.9375, + "reward_std": 0.0816463440656662, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.4, + "completions/max_terminated_length": 456.4, + "completions/mean_length": 330.775, + "completions/mean_terminated_length": 330.775, + "completions/min_length": 240.2, + "completions/min_terminated_length": 240.2, + "epoch": 0.15195670274771025, + "grad_norm": 0.16748703582364957, + "kl": 0.05107421875, + "learning_rate": 9.442496143176363e-07, + "loss": 0.002, + "num_tokens": 18980186.0, + "reward": 1.829650616645813, + "reward_std": 0.11030184328556061, + "rewards/accuracy_reward/mean": 0.8088172636926174, + "rewards/accuracy_reward/std": 0.07071067690849304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.020833334326744078, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03959116339683533, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.7, + "completions/max_terminated_length": 496.7, + "completions/mean_length": 382.625, + "completions/mean_terminated_length": 382.625, + "completions/min_length": 263.4, + "completions/min_terminated_length": 263.4, + "epoch": 0.1540383014154871, + "grad_norm": 5.032838834280366, + "kl": 0.054248046875, + "learning_rate": 9.427397010106189e-07, + "loss": 0.0022, + "num_tokens": 19251148.0, + "reward": 1.7395833492279054, + "reward_std": 0.08753891214728356, + "rewards/accuracy_reward/mean": 0.725, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01458333358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04124789834022522, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.2, + "completions/max_terminated_length": 537.2, + "completions/mean_length": 413.0125, + "completions/mean_terminated_length": 413.0125, + "completions/min_length": 303.7, + "completions/min_terminated_length": 303.7, + "epoch": 0.15611990008326396, + "grad_norm": 0.19302912446240003, + "kl": 0.0564208984375, + "learning_rate": 9.412108537779411e-07, + "loss": 0.0023, + "num_tokens": 19464021.0, + "reward": 1.8952083587646484, + "reward_std": 0.16093288138508796, + "rewards/accuracy_reward/mean": 0.825, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0702083345502615, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11464186012744904, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.6, + "completions/max_terminated_length": 537.6, + "completions/mean_length": 413.95, + "completions/mean_terminated_length": 413.95, + "completions/min_length": 327.7, + "completions/min_terminated_length": 327.7, + "epoch": 0.1582014987510408, + "grad_norm": 0.145118164583351, + "kl": 0.0509765625, + "learning_rate": 9.396631380013151e-07, + "loss": 0.002, + "num_tokens": 19717945.0, + "reward": 1.9875, + "reward_std": 0.03535533845424652, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.7, + "completions/max_terminated_length": 493.7, + "completions/mean_length": 370.6375, + "completions/mean_terminated_length": 370.6375, + "completions/min_length": 269.5, + "completions/min_terminated_length": 269.5, + "epoch": 0.16028309741881766, + "grad_norm": 0.20241676308921436, + "kl": 0.0595703125, + "learning_rate": 9.38096619869374e-07, + "loss": 0.0024, + "num_tokens": 19979964.0, + "reward": 1.800843644142151, + "reward_std": 0.18159112185239792, + "rewards/accuracy_reward/mean": 0.7644894897937775, + "rewards/accuracy_reward/std": 0.05345224738121033, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04885416626930237, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10429937615990639, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.3, + "completions/max_terminated_length": 475.3, + "completions/mean_length": 370.35, + "completions/mean_terminated_length": 370.35, + "completions/min_length": 281.1, + "completions/min_terminated_length": 281.1, + "epoch": 0.1623646960865945, + "grad_norm": 0.185958325228719, + "kl": 0.0536865234375, + "learning_rate": 9.365113663748398e-07, + "loss": 0.0021, + "num_tokens": 20248624.0, + "reward": 1.9333333492279052, + "reward_std": 0.08614101856946946, + "rewards/accuracy_reward/mean": 0.9125, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02083333358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05078567415475845, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.9, + "completions/max_terminated_length": 448.9, + "completions/mean_length": 354.825, + "completions/mean_terminated_length": 354.825, + "completions/min_length": 242.5, + "completions/min_terminated_length": 242.5, + "epoch": 0.16444629475437136, + "grad_norm": 0.1756506621150048, + "kl": 0.0556396484375, + "learning_rate": 9.349074453116597e-07, + "loss": 0.0022, + "num_tokens": 20502578.0, + "reward": 1.7383797764778137, + "reward_std": 0.1407657042145729, + "rewards/accuracy_reward/mean": 0.7000464394688606, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03833333365619183, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06858938410878182, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.8, + "completions/max_terminated_length": 495.8, + "completions/mean_length": 377.775, + "completions/mean_terminated_length": 377.775, + "completions/min_length": 285.2, + "completions/min_terminated_length": 285.2, + "epoch": 0.16652789342214822, + "grad_norm": 0.12984067526566828, + "kl": 0.0520263671875, + "learning_rate": 9.332849252721059e-07, + "loss": 0.0021, + "num_tokens": 20774224.0, + "reward": 1.8625, + "reward_std": 0.05175491571426392, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.9, + "completions/max_terminated_length": 516.9, + "completions/mean_length": 394.05, + "completions/mean_terminated_length": 394.05, + "completions/min_length": 287.7, + "completions/min_terminated_length": 287.7, + "epoch": 0.16860949208992507, + "grad_norm": 0.1747884207833455, + "kl": 0.0486083984375, + "learning_rate": 9.316438756438429e-07, + "loss": 0.0019, + "num_tokens": 21028532.0, + "reward": 1.9053641319274903, + "reward_std": 0.04150375239551067, + "rewards/accuracy_reward/mean": 0.9053641200065613, + "rewards/accuracy_reward/std": 0.04150375053286552, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.8, + "completions/max_terminated_length": 489.8, + "completions/mean_length": 340.1, + "completions/mean_terminated_length": 340.1, + "completions/min_length": 230.9, + "completions/min_terminated_length": 230.9, + "epoch": 0.17069109075770192, + "grad_norm": 4.988214414987532, + "kl": 0.0572509765625, + "learning_rate": 9.299843666069601e-07, + "loss": 0.0023, + "num_tokens": 21294116.0, + "reward": 1.9336805582046508, + "reward_std": 0.2032353922724724, + "rewards/accuracy_reward/mean": 0.875, + "rewards/accuracy_reward/std": 0.07071067690849304, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0711805559694767, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11472872197628022, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.4, + "completions/max_terminated_length": 425.4, + "completions/mean_length": 318.6875, + "completions/mean_terminated_length": 318.6875, + "completions/min_length": 221.7, + "completions/min_terminated_length": 221.7, + "epoch": 0.17277268942547877, + "grad_norm": 0.23283062060027884, + "kl": 0.0542236328125, + "learning_rate": 9.283064691309696e-07, + "loss": 0.0022, + "num_tokens": 21558139.0, + "reward": 1.8541666746139527, + "reward_std": 0.10436713248491288, + "rewards/accuracy_reward/mean": 0.85, + "rewards/accuracy_reward/std": 0.09258201122283935, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00416666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01178511455655098, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.3, + "completions/max_terminated_length": 464.3, + "completions/mean_length": 358.775, + "completions/mean_terminated_length": 358.775, + "completions/min_length": 262.1, + "completions/min_terminated_length": 262.1, + "epoch": 0.17485428809325562, + "grad_norm": 5.329373226907204, + "kl": 0.0521240234375, + "learning_rate": 9.266102549717725e-07, + "loss": 0.0021, + "num_tokens": 21820729.0, + "reward": 1.728125, + "reward_std": 0.07954951077699661, + "rewards/accuracy_reward/mean": 0.7125, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.015625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.044194172322750094, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.2, + "completions/max_terminated_length": 471.2, + "completions/mean_length": 367.5, + "completions/mean_terminated_length": 367.5, + "completions/min_length": 269.8, + "completions/min_terminated_length": 269.8, + "epoch": 0.17693588676103247, + "grad_norm": 5.154771321049736, + "kl": 0.0549072265625, + "learning_rate": 9.248957966685891e-07, + "loss": 0.0022, + "num_tokens": 22088897.0, + "reward": 1.85, + "reward_std": 0.24768393635749816, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 0.1687566041946411, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0375, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10606601536273956, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.3, + "completions/max_terminated_length": 474.3, + "completions/mean_length": 367.5125, + "completions/mean_terminated_length": 367.5125, + "completions/min_length": 280.7, + "completions/min_terminated_length": 280.7, + "epoch": 0.17901748542880933, + "grad_norm": 0.18589784255518843, + "kl": 0.05771484375, + "learning_rate": 9.231631675408574e-07, + "loss": 0.0023, + "num_tokens": 22350946.0, + "reward": 2.025, + "reward_std": 0.046291005611419675, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.046291005611419675, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.7, + "completions/max_terminated_length": 517.7, + "completions/mean_length": 390.7375, + "completions/mean_terminated_length": 390.7375, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.18109908409658618, + "grad_norm": 0.1660448340192837, + "kl": 0.054931640625, + "learning_rate": 9.214124416850976e-07, + "loss": 0.0022, + "num_tokens": 22609893.0, + "reward": 1.9479166746139527, + "reward_std": 0.0812177062034607, + "rewards/accuracy_reward/mean": 0.9375, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01041666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02946278378367424, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.9, + "completions/max_terminated_length": 506.9, + "completions/mean_length": 404.5, + "completions/mean_terminated_length": 404.5, + "completions/min_length": 308.9, + "completions/min_terminated_length": 308.9, + "epoch": 0.18318068276436303, + "grad_norm": 0.2285617215764742, + "kl": 0.0521240234375, + "learning_rate": 9.196436939717427e-07, + "loss": 0.0021, + "num_tokens": 22887917.0, + "reward": 2.008333349227905, + "reward_std": 0.023570242524147033, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00833333358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02357022911310196, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 408.55, + "completions/mean_terminated_length": 408.55, + "completions/min_length": 278.2, + "completions/min_terminated_length": 278.2, + "epoch": 0.18526228143213988, + "grad_norm": 0.15322648795694802, + "kl": 0.0478515625, + "learning_rate": 9.178570000419372e-07, + "loss": 0.0019, + "num_tokens": 23155369.0, + "reward": 1.9081249952316284, + "reward_std": 0.015569546818733215, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.008124999701976776, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.015569545328617096, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 417.725, + "completions/mean_terminated_length": 417.725, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.18734388009991673, + "grad_norm": 0.13314228532582625, + "kl": 0.0444580078125, + "learning_rate": 9.160524363043022e-07, + "loss": 0.0018, + "num_tokens": 23428683.0, + "reward": 1.93125, + "reward_std": 0.06396867483854293, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.6, + "completions/max_terminated_length": 544.6, + "completions/mean_length": 408.4, + "completions/mean_terminated_length": 408.4, + "completions/min_length": 313.4, + "completions/min_terminated_length": 313.4, + "epoch": 0.18942547876769358, + "grad_norm": 0.1512692723591993, + "kl": 0.0454833984375, + "learning_rate": 9.14230079931668e-07, + "loss": 0.0018, + "num_tokens": 23665659.0, + "reward": 1.90625, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05303300768136978, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 459.6625, + "completions/mean_terminated_length": 459.6625, + "completions/min_length": 346.2, + "completions/min_terminated_length": 346.2, + "epoch": 0.19150707743547044, + "grad_norm": 0.20503774244974085, + "kl": 0.051025390625, + "learning_rate": 9.123900088577726e-07, + "loss": 0.002, + "num_tokens": 23943032.0, + "reward": 1.883680558204651, + "reward_std": 0.12932045757770538, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02118055559694767, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04767410829663277, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 431.025, + "completions/mean_terminated_length": 431.025, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.1935886761032473, + "grad_norm": 0.21587849440014878, + "kl": 0.0527587890625, + "learning_rate": 9.105323017739304e-07, + "loss": 0.0021, + "num_tokens": 24210330.0, + "reward": 1.88125, + "reward_std": 0.12246951609849929, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0408231720328331, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.5, + "completions/max_terminated_length": 530.5, + "completions/mean_length": 398.3375, + "completions/mean_terminated_length": 398.3375, + "completions/min_length": 286.4, + "completions/min_terminated_length": 286.4, + "epoch": 0.19567027477102414, + "grad_norm": 4.54774855713076, + "kl": 0.05244140625, + "learning_rate": 9.086570381256662e-07, + "loss": 0.0021, + "num_tokens": 24472445.0, + "reward": 1.8766865015029908, + "reward_std": 0.10292123556137085, + "rewards/accuracy_reward/mean": 0.8543650805950165, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02232142835855484, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05663023442029953, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.4, + "completions/max_terminated_length": 532.4, + "completions/mean_length": 420.3625, + "completions/mean_terminated_length": 420.3625, + "completions/min_length": 320.3, + "completions/min_terminated_length": 320.3, + "epoch": 0.197751873438801, + "grad_norm": 4.543023975771067, + "kl": 0.05146484375, + "learning_rate": 9.067642981093174e-07, + "loss": 0.0021, + "num_tokens": 24737890.0, + "reward": 1.9541666746139525, + "reward_std": 0.1006326362490654, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02916666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05434163063764572, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.6, + "completions/max_terminated_length": 610.6, + "completions/mean_length": 474.5625, + "completions/mean_terminated_length": 474.5625, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.19983347210657784, + "grad_norm": 4.854188023647615, + "kl": 0.050244140625, + "learning_rate": 9.048541626686046e-07, + "loss": 0.002, + "num_tokens": 25006047.0, + "reward": 1.7625, + "reward_std": 0.22051936089992524, + "rewards/accuracy_reward/mean": 0.75, + "rewards/accuracy_reward/std": 0.1851640224456787, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.6, + "completions/max_terminated_length": 564.6, + "completions/mean_length": 441.7375, + "completions/mean_terminated_length": 441.7375, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.2019150707743547, + "grad_norm": 0.19432002582692567, + "kl": 0.0546142578125, + "learning_rate": 9.029267134911708e-07, + "loss": 0.0022, + "num_tokens": 25285434.0, + "reward": 1.6375, + "reward_std": 0.05175491571426392, + "rewards/accuracy_reward/mean": 0.6375, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.1, + "completions/max_terminated_length": 585.1, + "completions/mean_length": 437.9375, + "completions/mean_terminated_length": 437.9375, + "completions/min_length": 310.5, + "completions/min_terminated_length": 310.5, + "epoch": 0.20399666944213155, + "grad_norm": 4.642014207555178, + "kl": 0.0569580078125, + "learning_rate": 9.009820330050866e-07, + "loss": 0.0023, + "num_tokens": 25557781.0, + "reward": 1.8791666746139526, + "reward_std": 0.27757782191038133, + "rewards/accuracy_reward/mean": 0.8375, + "rewards/accuracy_reward/std": 0.1957120805978775, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0416666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09299983680248261, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.3, + "completions/max_terminated_length": 495.3, + "completions/mean_length": 391.825, + "completions/mean_terminated_length": 391.825, + "completions/min_length": 268.7, + "completions/min_terminated_length": 268.7, + "epoch": 0.2060782681099084, + "grad_norm": 0.2008078809099005, + "kl": 0.0586181640625, + "learning_rate": 8.990202043753261e-07, + "loss": 0.0023, + "num_tokens": 25833711.0, + "reward": 1.6260989665985108, + "reward_std": 0.10520716309547425, + "rewards/accuracy_reward/mean": 0.626098969578743, + "rewards/accuracy_reward/std": 0.10520716905593872, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.6, + "completions/max_terminated_length": 591.6, + "completions/mean_length": 453.2875, + "completions/mean_terminated_length": 453.2875, + "completions/min_length": 334.4, + "completions/min_terminated_length": 334.4, + "epoch": 0.20815986677768525, + "grad_norm": 0.25846319187573186, + "kl": 0.058935546875, + "learning_rate": 8.9704131150021e-07, + "loss": 0.0024, + "num_tokens": 26111030.0, + "reward": 1.853125, + "reward_std": 0.133001758903265, + "rewards/accuracy_reward/mean": 0.8375, + "rewards/accuracy_reward/std": 0.08880758583545685, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.015625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04419417306780815, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.2, + "completions/max_terminated_length": 508.2, + "completions/mean_length": 393.5875, + "completions/mean_terminated_length": 393.5875, + "completions/min_length": 281.8, + "completions/min_terminated_length": 281.8, + "epoch": 0.21024146544546213, + "grad_norm": 3.665098494767977, + "kl": 0.0560546875, + "learning_rate": 8.950454390078177e-07, + "loss": 0.0022, + "num_tokens": 26357917.0, + "reward": 1.9770833253860474, + "reward_std": 0.07685204781591892, + "rewards/accuracy_reward/mean": 0.9625, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.014583333395421505, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03514297790825367, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 415.25, + "completions/mean_terminated_length": 415.25, + "completions/min_length": 316.6, + "completions/min_terminated_length": 316.6, + "epoch": 0.21232306411323898, + "grad_norm": 4.557729215755654, + "kl": 0.0542724609375, + "learning_rate": 8.930326722523685e-07, + "loss": 0.0022, + "num_tokens": 26629521.0, + "reward": 1.9354166746139527, + "reward_std": 0.06791418939828872, + "rewards/accuracy_reward/mean": 0.9125, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03255883827805519, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 411.9375, + "completions/mean_terminated_length": 411.9375, + "completions/min_length": 260.7, + "completions/min_terminated_length": 260.7, + "epoch": 0.21440466278101583, + "grad_norm": 0.13656288522156476, + "kl": 0.052783203125, + "learning_rate": 8.910030973105705e-07, + "loss": 0.0021, + "num_tokens": 26886244.0, + "reward": 1.9420833587646484, + "reward_std": 0.08647008240222931, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.017083333618938924, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.040179073065519336, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.4, + "completions/max_terminated_length": 538.4, + "completions/mean_length": 409.45, + "completions/mean_terminated_length": 409.45, + "completions/min_length": 304.9, + "completions/min_terminated_length": 304.9, + "epoch": 0.21648626144879268, + "grad_norm": 0.15406730218953882, + "kl": 0.0578125, + "learning_rate": 8.889568009779402e-07, + "loss": 0.0023, + "num_tokens": 27159272.0, + "reward": 1.975, + "reward_std": 0.11898414641618729, + "rewards/accuracy_reward/mean": 0.9625, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.043555130064487454, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.6, + "completions/max_terminated_length": 521.6, + "completions/mean_length": 393.9, + "completions/mean_terminated_length": 393.9, + "completions/min_length": 285.3, + "completions/min_terminated_length": 285.3, + "epoch": 0.21856786011656953, + "grad_norm": 4.232997152833753, + "kl": 0.0549072265625, + "learning_rate": 8.868938707650907e-07, + "loss": 0.0022, + "num_tokens": 27409456.0, + "reward": 1.9552083492279053, + "reward_std": 0.0955127865076065, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0552083358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09551278054714203, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 411.0875, + "completions/mean_terminated_length": 411.0875, + "completions/min_length": 298.4, + "completions/min_terminated_length": 298.4, + "epoch": 0.2206494587843464, + "grad_norm": 0.1563578337439989, + "kl": 0.056884765625, + "learning_rate": 8.848143948939892e-07, + "loss": 0.0023, + "num_tokens": 27669359.0, + "reward": 1.8354166746139526, + "reward_std": 0.09050626158714295, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.055150920152664186, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.4, + "completions/max_terminated_length": 520.4, + "completions/mean_length": 376.4875, + "completions/mean_terminated_length": 376.4875, + "completions/min_length": 271.2, + "completions/min_terminated_length": 271.2, + "epoch": 0.22273105745212324, + "grad_norm": 4.773738463091718, + "kl": 0.061328125, + "learning_rate": 8.827184622941835e-07, + "loss": 0.0025, + "num_tokens": 27930574.0, + "reward": 1.8539583444595338, + "reward_std": 0.10452117174863815, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04145833365619182, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06916583105921745, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 406.65, + "completions/mean_terminated_length": 406.65, + "completions/min_length": 298.1, + "completions/min_terminated_length": 298.1, + "epoch": 0.2248126561199001, + "grad_norm": 4.74427330850803, + "kl": 0.0505615234375, + "learning_rate": 8.806061625990002e-07, + "loss": 0.002, + "num_tokens": 28201482.0, + "reward": 1.7759617567062378, + "reward_std": 0.10142084583640099, + "rewards/accuracy_reward/mean": 0.7728367522358894, + "rewards/accuracy_reward/std": 0.09258201122283935, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.003125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.00883883461356163, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.7, + "completions/max_terminated_length": 482.7, + "completions/mean_length": 365.625, + "completions/mean_terminated_length": 365.625, + "completions/min_length": 251.1, + "completions/min_terminated_length": 251.1, + "epoch": 0.22689425478767694, + "grad_norm": 0.24058862628684932, + "kl": 0.0642822265625, + "learning_rate": 8.784775861417099e-07, + "loss": 0.0026, + "num_tokens": 28466388.0, + "reward": 1.909375, + "reward_std": 0.2374896600842476, + "rewards/accuracy_reward/mean": 0.8375, + "rewards/accuracy_reward/std": 0.12793734967708587, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.071875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.12523438036441803, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 346.65, + "completions/mean_terminated_length": 346.65, + "completions/min_length": 252.7, + "completions/min_terminated_length": 252.7, + "epoch": 0.2289758534554538, + "grad_norm": 4.515835434457469, + "kl": 0.0667724609375, + "learning_rate": 8.763328239516656e-07, + "loss": 0.0027, + "num_tokens": 28738368.0, + "reward": 2.0379166841506957, + "reward_std": 0.09141271561384201, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03791666682809591, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0914127141237259, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.4, + "completions/max_terminated_length": 482.4, + "completions/mean_length": 376.075, + "completions/mean_terminated_length": 376.075, + "completions/min_length": 260.6, + "completions/min_terminated_length": 260.6, + "epoch": 0.23105745212323064, + "grad_norm": 0.23711930079187077, + "kl": 0.0647216796875, + "learning_rate": 8.741719677504088e-07, + "loss": 0.0026, + "num_tokens": 28987782.0, + "reward": 1.8104339241981506, + "reward_std": 0.1613641142845154, + "rewards/accuracy_reward/mean": 0.7791839212179184, + "rewards/accuracy_reward/std": 0.08880758583545685, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07255653142929078, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.8, + "completions/max_terminated_length": 503.8, + "completions/mean_length": 378.325, + "completions/mean_terminated_length": 378.325, + "completions/min_length": 277.4, + "completions/min_terminated_length": 277.4, + "epoch": 0.2331390507910075, + "grad_norm": 4.704217859900306, + "kl": 0.063037109375, + "learning_rate": 8.719951099477472e-07, + "loss": 0.0025, + "num_tokens": 29264176.0, + "reward": 1.8729166746139527, + "reward_std": 0.11827037632465362, + "rewards/accuracy_reward/mean": 0.85, + "rewards/accuracy_reward/std": 0.05345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06481812223792076, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.2, + "completions/max_terminated_length": 480.2, + "completions/mean_length": 363.95, + "completions/mean_terminated_length": 363.95, + "completions/min_length": 251.7, + "completions/min_terminated_length": 251.7, + "epoch": 0.23522064945878435, + "grad_norm": 4.448082281129015, + "kl": 0.0650634765625, + "learning_rate": 8.698023436378028e-07, + "loss": 0.0026, + "num_tokens": 29535068.0, + "reward": 1.775, + "reward_std": 0.17422050833702088, + "rewards/accuracy_reward/mean": 0.775, + "rewards/accuracy_reward/std": 0.17422052025794982, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.7, + "completions/max_terminated_length": 488.7, + "completions/mean_length": 356.075, + "completions/mean_terminated_length": 356.075, + "completions/min_length": 263.8, + "completions/min_terminated_length": 263.8, + "epoch": 0.2373022481265612, + "grad_norm": 0.16881450891644154, + "kl": 0.0620849609375, + "learning_rate": 8.675937625950312e-07, + "loss": 0.0025, + "num_tokens": 29766794.0, + "reward": 1.9022201299667358, + "reward_std": 0.09568586796522141, + "rewards/accuracy_reward/mean": 0.8511784493923187, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05104166865348816, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09568586498498917, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.3, + "completions/max_terminated_length": 472.3, + "completions/mean_length": 368.425, + "completions/mean_terminated_length": 368.425, + "completions/min_length": 268.7, + "completions/min_terminated_length": 268.7, + "epoch": 0.23938384679433805, + "grad_norm": 0.20587238677147113, + "kl": 0.0624267578125, + "learning_rate": 8.653694612702105e-07, + "loss": 0.0025, + "num_tokens": 30015268.0, + "reward": 1.7605624437332152, + "reward_std": 0.1828530788421631, + "rewards/accuracy_reward/mean": 0.7288957685232162, + "rewards/accuracy_reward/std": 0.11700168251991272, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03166666720062494, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06585140451788903, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.6, + "completions/max_terminated_length": 553.6, + "completions/mean_length": 418.95, + "completions/mean_terminated_length": 418.95, + "completions/min_length": 299.9, + "completions/min_terminated_length": 299.9, + "epoch": 0.2414654454621149, + "grad_norm": 4.886301286542531, + "kl": 0.06640625, + "learning_rate": 8.631295347864023e-07, + "loss": 0.0027, + "num_tokens": 30292240.0, + "reward": 1.9337500095367433, + "reward_std": 0.16898389756679535, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07125000059604644, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08733755946159363, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.5, + "completions/max_terminated_length": 530.5, + "completions/mean_length": 372.5, + "completions/mean_terminated_length": 372.5, + "completions/min_length": 269.5, + "completions/min_terminated_length": 269.5, + "epoch": 0.24354704412989175, + "grad_norm": 0.1671711093190659, + "kl": 0.0646484375, + "learning_rate": 8.608740789348843e-07, + "loss": 0.0026, + "num_tokens": 30557720.0, + "reward": 1.8916666746139525, + "reward_std": 0.18054171949625014, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07916666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.14518638029694558, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.4, + "completions/max_terminated_length": 545.4, + "completions/mean_length": 404.7, + "completions/mean_terminated_length": 404.7, + "completions/min_length": 287.3, + "completions/min_terminated_length": 287.3, + "epoch": 0.2456286427976686, + "grad_norm": 4.336197463116693, + "kl": 0.0661865234375, + "learning_rate": 8.586031901710526e-07, + "loss": 0.0026, + "num_tokens": 30815472.0, + "reward": 2.0014583349227903, + "reward_std": 0.10190928652882576, + "rewards/accuracy_reward/mean": 0.975, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.026458334550261496, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.055618280172348024, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.3, + "completions/max_terminated_length": 536.3, + "completions/mean_length": 403.5625, + "completions/mean_terminated_length": 403.5625, + "completions/min_length": 303.8, + "completions/min_terminated_length": 303.8, + "epoch": 0.24771024146544546, + "grad_norm": 0.20563799179064923, + "kl": 0.05634765625, + "learning_rate": 8.563169656102984e-07, + "loss": 0.0023, + "num_tokens": 31074925.0, + "reward": 2.0166666746139525, + "reward_std": 0.03563483655452728, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03563483357429505, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.6, + "completions/max_terminated_length": 486.6, + "completions/mean_length": 372.4, + "completions/mean_terminated_length": 372.4, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.2497918401332223, + "grad_norm": 4.6014497468287345, + "kl": 0.062255859375, + "learning_rate": 8.540155030238532e-07, + "loss": 0.0025, + "num_tokens": 31344357.0, + "reward": 2.018750023841858, + "reward_std": 0.10257088243961335, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125000111758709, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06721552982926368, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.8, + "completions/max_terminated_length": 493.8, + "completions/mean_length": 380.4875, + "completions/mean_terminated_length": 380.4875, + "completions/min_length": 274.5, + "completions/min_terminated_length": 274.5, + "epoch": 0.25187343880099916, + "grad_norm": 0.21384116814461035, + "kl": 0.06572265625, + "learning_rate": 8.516989008346083e-07, + "loss": 0.0026, + "num_tokens": 31610260.0, + "reward": 1.8972916841506957, + "reward_std": 0.15455301925539972, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04729166775941849, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10088659450411797, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.5, + "completions/max_terminated_length": 581.5, + "completions/mean_length": 413.1125, + "completions/mean_terminated_length": 413.1125, + "completions/min_length": 293.6, + "completions/min_terminated_length": 293.6, + "epoch": 0.253955037468776, + "grad_norm": 5.141278583858697, + "kl": 0.0589111328125, + "learning_rate": 8.493672581129058e-07, + "loss": 0.0024, + "num_tokens": 31874765.0, + "reward": 1.928125, + "reward_std": 0.1801423728466034, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.1334012657403946, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.028125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05825847387313843, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.4, + "completions/max_terminated_length": 644.4, + "completions/mean_length": 464.075, + "completions/mean_terminated_length": 464.075, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.25603663613655286, + "grad_norm": 0.16354981078889747, + "kl": 0.0588134765625, + "learning_rate": 8.470206745723017e-07, + "loss": 0.0024, + "num_tokens": 32135427.0, + "reward": 1.9770833492279052, + "reward_std": 0.1731685608625412, + "rewards/accuracy_reward/mean": 0.9375, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05208333544433117, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09118002727627754, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.2, + "completions/max_terminated_length": 530.2, + "completions/mean_length": 423.5625, + "completions/mean_terminated_length": 423.5625, + "completions/min_length": 319.8, + "completions/min_terminated_length": 319.8, + "epoch": 0.2581182348043297, + "grad_norm": 0.19910532213269125, + "kl": 0.0617431640625, + "learning_rate": 8.446592505653017e-07, + "loss": 0.0025, + "num_tokens": 32356120.0, + "reward": 1.8947438836097716, + "reward_std": 0.08823015540838242, + "rewards/accuracy_reward/mean": 0.8710831701755524, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02366071417927742, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.052874819934368135, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.4, + "completions/max_terminated_length": 575.4, + "completions/mean_length": 432.0625, + "completions/mean_terminated_length": 432.0625, + "completions/min_length": 317.1, + "completions/min_terminated_length": 317.1, + "epoch": 0.26019983347210657, + "grad_norm": 0.15896187258464695, + "kl": 0.0609375, + "learning_rate": 8.422830870790692e-07, + "loss": 0.0024, + "num_tokens": 32630189.0, + "reward": 1.851994562149048, + "reward_std": 0.045456858724355696, + "rewards/accuracy_reward/mean": 0.8359231412410736, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01607142873108387, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.045456863939762115, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 404.2625, + "completions/mean_terminated_length": 404.2625, + "completions/min_length": 292.4, + "completions/min_terminated_length": 292.4, + "epoch": 0.2622814321398834, + "grad_norm": 0.18398079197328773, + "kl": 0.061572265625, + "learning_rate": 8.39892285731107e-07, + "loss": 0.0025, + "num_tokens": 32885290.0, + "reward": 2.0916666746139527, + "reward_std": 0.17030038088560104, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.10416666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.134945035725832, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.3, + "completions/max_terminated_length": 475.3, + "completions/mean_length": 373.975, + "completions/mean_terminated_length": 373.975, + "completions/min_length": 245.1, + "completions/min_terminated_length": 245.1, + "epoch": 0.26436303080766027, + "grad_norm": 0.17496923546167426, + "kl": 0.0584228515625, + "learning_rate": 8.374869487649116e-07, + "loss": 0.0023, + "num_tokens": 33162864.0, + "reward": 1.9354166746139527, + "reward_std": 0.10017346739768981, + "rewards/accuracy_reward/mean": 0.9125, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06481812223792076, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.4, + "completions/max_terminated_length": 470.4, + "completions/mean_length": 346.4375, + "completions/mean_terminated_length": 346.4375, + "completions/min_length": 230.7, + "completions/min_terminated_length": 230.7, + "epoch": 0.2664446294754371, + "grad_norm": 0.20407585116288846, + "kl": 0.0647705078125, + "learning_rate": 8.350671790456003e-07, + "loss": 0.0026, + "num_tokens": 33433515.0, + "reward": 1.9375, + "reward_std": 0.05175491571426392, + "rewards/accuracy_reward/mean": 0.9375, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.5, + "completions/max_terminated_length": 490.5, + "completions/mean_length": 355.9625, + "completions/mean_terminated_length": 355.9625, + "completions/min_length": 251.3, + "completions/min_terminated_length": 251.3, + "epoch": 0.268526228143214, + "grad_norm": 0.2047612335149913, + "kl": 0.0600830078125, + "learning_rate": 8.326330800555123e-07, + "loss": 0.0024, + "num_tokens": 33692176.0, + "reward": 1.881944465637207, + "reward_std": 0.1617008775472641, + "rewards/accuracy_reward/mean": 0.8486111111938953, + "rewards/accuracy_reward/std": 0.12246559858322144, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03333333432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07071067690849304, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.7, + "completions/max_terminated_length": 462.7, + "completions/mean_length": 349.3375, + "completions/mean_terminated_length": 349.3375, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.2706078268109908, + "grad_norm": 5.268008904357831, + "kl": 0.0579833984375, + "learning_rate": 8.301847558897836e-07, + "loss": 0.0023, + "num_tokens": 33953955.0, + "reward": 1.900000023841858, + "reward_std": 0.2489195354282856, + "rewards/accuracy_reward/mean": 0.7875, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.11250000409781932, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1672731988132, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.4, + "completions/max_terminated_length": 464.4, + "completions/mean_length": 354.8, + "completions/mean_terminated_length": 354.8, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.2726894254787677, + "grad_norm": 0.3512517591663718, + "kl": 0.064794921875, + "learning_rate": 8.27722311251895e-07, + "loss": 0.0026, + "num_tokens": 34226955.0, + "reward": 2.015625, + "reward_std": 0.03627826422452927, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.015625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03627826571464539, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 351.8125, + "completions/mean_terminated_length": 351.8125, + "completions/min_length": 255.2, + "completions/min_terminated_length": 255.2, + "epoch": 0.27477102414654453, + "grad_norm": 0.16467258407693672, + "kl": 0.06689453125, + "learning_rate": 8.25245851449194e-07, + "loss": 0.0027, + "num_tokens": 34498260.0, + "reward": 1.871875, + "reward_std": 0.14545682817697525, + "rewards/accuracy_reward/mean": 0.85, + "rewards/accuracy_reward/std": 0.09258201122283935, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.021875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.052874819934368135, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.3, + "completions/max_terminated_length": 471.3, + "completions/mean_length": 360.8125, + "completions/mean_terminated_length": 360.8125, + "completions/min_length": 251.7, + "completions/min_terminated_length": 251.7, + "epoch": 0.2768526228143214, + "grad_norm": 0.20861058312857134, + "kl": 0.068603515625, + "learning_rate": 8.227554823883925e-07, + "loss": 0.0027, + "num_tokens": 34769973.0, + "reward": 1.91875, + "reward_std": 0.07373789101839065, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07147541642189026, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.9, + "completions/max_terminated_length": 491.9, + "completions/mean_length": 368.7, + "completions/mean_terminated_length": 368.7, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.27893422148209823, + "grad_norm": 0.18466009019316756, + "kl": 0.063134765625, + "learning_rate": 8.202513105710365e-07, + "loss": 0.0025, + "num_tokens": 35023525.0, + "reward": 1.8916666746139525, + "reward_std": 0.04714045971632004, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00416666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01178511455655098, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.5, + "completions/max_terminated_length": 487.5, + "completions/mean_length": 365.75, + "completions/mean_terminated_length": 365.75, + "completions/min_length": 244.4, + "completions/min_terminated_length": 244.4, + "epoch": 0.2810158201498751, + "grad_norm": 0.1978909757097866, + "kl": 0.067431640625, + "learning_rate": 8.17733443088952e-07, + "loss": 0.0027, + "num_tokens": 35281873.0, + "reward": 1.8048369646072389, + "reward_std": 0.15534310936927795, + "rewards/accuracy_reward/mean": 0.791920292377472, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025416667200624944, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0437350295484066, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.2, + "completions/max_terminated_length": 539.2, + "completions/mean_length": 410.275, + "completions/mean_terminated_length": 410.275, + "completions/min_length": 317.2, + "completions/min_terminated_length": 317.2, + "epoch": 0.28309741881765194, + "grad_norm": 0.16131378238013733, + "kl": 0.060498046875, + "learning_rate": 8.152019876196652e-07, + "loss": 0.0024, + "num_tokens": 35539495.0, + "reward": 2.0125, + "reward_std": 0.023145502805709837, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.023145502805709837, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.4, + "completions/max_terminated_length": 490.4, + "completions/mean_length": 375.1625, + "completions/mean_terminated_length": 375.1625, + "completions/min_length": 269.8, + "completions/min_terminated_length": 269.8, + "epoch": 0.2851790174854288, + "grad_norm": 0.16965838916322384, + "kl": 0.0564453125, + "learning_rate": 8.126570524217972e-07, + "loss": 0.0023, + "num_tokens": 35810644.0, + "reward": 2.003125, + "reward_std": 0.00883883461356163, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.003125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.00883883461356163, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.7, + "completions/max_terminated_length": 476.7, + "completions/mean_length": 342.0125, + "completions/mean_terminated_length": 342.0125, + "completions/min_length": 234.7, + "completions/min_terminated_length": 234.7, + "epoch": 0.28726061615320564, + "grad_norm": 5.830801542117954, + "kl": 0.0645263671875, + "learning_rate": 8.100987463304354e-07, + "loss": 0.0026, + "num_tokens": 36076309.0, + "reward": 1.9291666746139526, + "reward_std": 0.23552957624197007, + "rewards/accuracy_reward/mean": 0.875, + "rewards/accuracy_reward/std": 0.10350984334945679, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06666666865348816, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09666440933942795, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.6, + "completions/max_terminated_length": 419.6, + "completions/mean_length": 318.6, + "completions/mean_terminated_length": 318.6, + "completions/min_length": 225.5, + "completions/min_terminated_length": 225.5, + "epoch": 0.2893422148209825, + "grad_norm": 1.1801320667855104, + "kl": 0.06484375, + "learning_rate": 8.075271787524775e-07, + "loss": 0.0026, + "num_tokens": 36342709.0, + "reward": 2.019166660308838, + "reward_std": 0.048203670978546144, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.019166667014360428, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.048203660547733305, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.3, + "completions/max_terminated_length": 457.3, + "completions/mean_length": 336.9875, + "completions/mean_terminated_length": 336.9875, + "completions/min_length": 233.7, + "completions/min_terminated_length": 233.7, + "epoch": 0.29142381348875934, + "grad_norm": 0.19315056407617454, + "kl": 0.059814453125, + "learning_rate": 8.049424596619543e-07, + "loss": 0.0024, + "num_tokens": 36608124.0, + "reward": 1.8322916746139526, + "reward_std": 0.07292233854532242, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01979166716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03756699562072754, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.7, + "completions/max_terminated_length": 444.7, + "completions/mean_length": 340.5375, + "completions/mean_terminated_length": 340.5375, + "completions/min_length": 244.1, + "completions/min_terminated_length": 244.1, + "epoch": 0.2935054121565362, + "grad_norm": 0.19456407294224826, + "kl": 0.0667724609375, + "learning_rate": 8.023446995953251e-07, + "loss": 0.0027, + "num_tokens": 36878607.0, + "reward": 1.8760416746139525, + "reward_std": 0.11459305360913277, + "rewards/accuracy_reward/mean": 0.825, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05104166865348816, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06830205097794532, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.4, + "completions/max_terminated_length": 485.4, + "completions/mean_length": 370.2, + "completions/mean_terminated_length": 370.2, + "completions/min_length": 262.5, + "completions/min_terminated_length": 262.5, + "epoch": 0.29558701082431305, + "grad_norm": 0.22498138258141726, + "kl": 0.0623291015625, + "learning_rate": 7.99734009646752e-07, + "loss": 0.0025, + "num_tokens": 37139823.0, + "reward": 1.7394469141960145, + "reward_std": 0.12612001225352287, + "rewards/accuracy_reward/mean": 0.7196552403271198, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01979166716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.044473668187856676, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.3, + "completions/max_terminated_length": 529.3, + "completions/mean_length": 398.975, + "completions/mean_terminated_length": 398.975, + "completions/min_length": 316.4, + "completions/min_terminated_length": 316.4, + "epoch": 0.29766860949208995, + "grad_norm": 0.2608680560915561, + "kl": 0.0619384765625, + "learning_rate": 7.971105014633477e-07, + "loss": 0.0025, + "num_tokens": 37388309.0, + "reward": 1.8760273218154908, + "reward_std": 0.07142658531665802, + "rewards/accuracy_reward/mean": 0.8609231412410736, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01510416716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.025135573744773865, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 521.5, + "completions/max_terminated_length": 485.4, + "completions/mean_length": 390.675, + "completions/mean_terminated_length": 384.0750030517578, + "completions/min_length": 311.4, + "completions/min_terminated_length": 311.4, + "epoch": 0.2997502081598668, + "grad_norm": 0.21420320209665428, + "kl": 0.067578125, + "learning_rate": 7.94474287240402e-07, + "loss": 0.0027, + "num_tokens": 37614403.0, + "reward": 2.065416693687439, + "reward_std": 0.1892769455909729, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0904166653752327, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11856626570224763, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 398.1125, + "completions/mean_terminated_length": 398.1125, + "completions/min_length": 295.3, + "completions/min_terminated_length": 295.3, + "epoch": 0.30183180682764366, + "grad_norm": 0.23938732824134978, + "kl": 0.0615234375, + "learning_rate": 7.918254797165824e-07, + "loss": 0.0025, + "num_tokens": 37860732.0, + "reward": 1.9497023820877075, + "reward_std": 0.09799748845398426, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.049702381156384944, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09799748659133911, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.8, + "completions/max_terminated_length": 511.8, + "completions/mean_length": 379.275, + "completions/mean_terminated_length": 379.275, + "completions/min_length": 279.9, + "completions/min_terminated_length": 279.9, + "epoch": 0.3039134054954205, + "grad_norm": 4.628092242634228, + "kl": 0.069482421875, + "learning_rate": 7.891641921691144e-07, + "loss": 0.0028, + "num_tokens": 38129138.0, + "reward": 1.8937500238418579, + "reward_std": 0.17325654327869416, + "rewards/accuracy_reward/mean": 0.85, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04375000149011612, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0947420835494995, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.9, + "completions/max_terminated_length": 461.9, + "completions/mean_length": 373.3625, + "completions/mean_terminated_length": 373.3625, + "completions/min_length": 278.3, + "completions/min_terminated_length": 278.3, + "epoch": 0.30599500416319736, + "grad_norm": 0.20059865536203564, + "kl": 0.070068359375, + "learning_rate": 7.864905384089354e-07, + "loss": 0.0028, + "num_tokens": 38387623.0, + "reward": 1.98125, + "reward_std": 0.12118750065565109, + "rewards/accuracy_reward/mean": 0.9375, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04375, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06943259090185165, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.6, + "completions/max_terminated_length": 472.6, + "completions/mean_length": 342.3375, + "completions/mean_terminated_length": 342.3375, + "completions/min_length": 243.5, + "completions/min_terminated_length": 243.5, + "epoch": 0.3080766028309742, + "grad_norm": 0.17844621860586748, + "kl": 0.0716064453125, + "learning_rate": 7.838046327758292e-07, + "loss": 0.0029, + "num_tokens": 38656674.0, + "reward": 1.9824664831161498, + "reward_std": 0.13602151721715927, + "rewards/accuracy_reward/mean": 0.9543414890766144, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.028125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0716336041688919, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.5, + "completions/max_terminated_length": 470.5, + "completions/mean_length": 350.9875, + "completions/mean_terminated_length": 350.9875, + "completions/min_length": 256.5, + "completions/min_terminated_length": 256.5, + "epoch": 0.31015820149875106, + "grad_norm": 0.22326710213332043, + "kl": 0.0666259765625, + "learning_rate": 7.811065901335347e-07, + "loss": 0.0027, + "num_tokens": 38927993.0, + "reward": 1.9309977531433105, + "reward_std": 0.044473668187856676, + "rewards/accuracy_reward/mean": 0.9112060777842999, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01979166716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.044473668187856676, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.9, + "completions/max_terminated_length": 456.9, + "completions/mean_length": 359.5625, + "completions/mean_terminated_length": 359.5625, + "completions/min_length": 262.1, + "completions/min_terminated_length": 262.1, + "epoch": 0.3122398001665279, + "grad_norm": 4.832125835729535, + "kl": 0.0694091796875, + "learning_rate": 7.783965258648353e-07, + "loss": 0.0028, + "num_tokens": 39202846.0, + "reward": 2.0133333444595336, + "reward_std": 0.08462430983781814, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025833333283662795, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04926896393299103, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.4, + "completions/max_terminated_length": 460.4, + "completions/mean_length": 365.575, + "completions/mean_terminated_length": 365.575, + "completions/min_length": 269.8, + "completions/min_terminated_length": 269.8, + "epoch": 0.31432139883430477, + "grad_norm": 4.430038915172002, + "kl": 0.0671142578125, + "learning_rate": 7.756745558666229e-07, + "loss": 0.0027, + "num_tokens": 39458492.0, + "reward": 1.9916666746139526, + "reward_std": 0.13425071388483048, + "rewards/accuracy_reward/mean": 0.9625, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02916666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08249579146504402, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.9, + "completions/max_terminated_length": 443.9, + "completions/mean_length": 354.75, + "completions/mean_terminated_length": 354.75, + "completions/min_length": 263.9, + "completions/min_terminated_length": 263.9, + "epoch": 0.3164029975020816, + "grad_norm": 4.471834571798772, + "kl": 0.065234375, + "learning_rate": 7.729407965449426e-07, + "loss": 0.0026, + "num_tokens": 39732600.0, + "reward": 1.9228125095367432, + "reward_std": 0.06452349089086055, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0228125000372529, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06452349312603474, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.9, + "completions/max_terminated_length": 485.9, + "completions/mean_length": 384.075, + "completions/mean_terminated_length": 384.075, + "completions/min_length": 273.5, + "completions/min_terminated_length": 273.5, + "epoch": 0.31848459616985847, + "grad_norm": 0.13877808954631102, + "kl": 0.0644287109375, + "learning_rate": 7.701953648100141e-07, + "loss": 0.0026, + "num_tokens": 40000910.0, + "reward": 2.00625, + "reward_std": 0.07255652844905854, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03720119297504425, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.9, + "completions/max_terminated_length": 472.9, + "completions/mean_length": 362.0875, + "completions/mean_terminated_length": 362.0875, + "completions/min_length": 236.1, + "completions/min_terminated_length": 236.1, + "epoch": 0.3205661948376353, + "grad_norm": 4.941392135414514, + "kl": 0.0575927734375, + "learning_rate": 7.674383780712325e-07, + "loss": 0.0023, + "num_tokens": 40270589.0, + "reward": 1.9339583396911622, + "reward_std": 0.13531450778245926, + "rewards/accuracy_reward/mean": 0.8816666670143605, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05229166746139526, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08902351260185241, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.5, + "completions/max_terminated_length": 472.5, + "completions/mean_length": 358.6625, + "completions/mean_terminated_length": 358.6625, + "completions/min_length": 261.9, + "completions/min_terminated_length": 261.9, + "epoch": 0.3226477935054122, + "grad_norm": 0.20067276246483776, + "kl": 0.0630859375, + "learning_rate": 7.646699542321468e-07, + "loss": 0.0025, + "num_tokens": 40525298.0, + "reward": 2.028541684150696, + "reward_std": 0.06255088374018669, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02854166701436043, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06255088374018669, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.8, + "completions/max_terminated_length": 404.8, + "completions/mean_length": 305.4375, + "completions/mean_terminated_length": 305.4375, + "completions/min_length": 226.5, + "completions/min_terminated_length": 226.5, + "epoch": 0.324729392173189, + "grad_norm": 0.23733834977909768, + "kl": 0.0707275390625, + "learning_rate": 7.618902116854171e-07, + "loss": 0.0028, + "num_tokens": 40790061.0, + "reward": 1.944444465637207, + "reward_std": 0.05745632499456406, + "rewards/accuracy_reward/mean": 0.9111111111938953, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03333333432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05745632201433182, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.5, + "completions/max_terminated_length": 398.5, + "completions/mean_length": 289.0625, + "completions/mean_terminated_length": 289.0625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.3268109908409659, + "grad_norm": 5.195397649911, + "kl": 0.06748046875, + "learning_rate": 7.590992693077532e-07, + "loss": 0.0027, + "num_tokens": 41060354.0, + "reward": 2.0458333492279053, + "reward_std": 0.09181488454341888, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04583333432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09181488156318665, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.6, + "completions/max_terminated_length": 401.6, + "completions/mean_length": 322.5875, + "completions/mean_terminated_length": 322.5875, + "completions/min_length": 233.6, + "completions/min_terminated_length": 233.6, + "epoch": 0.3288925895087427, + "grad_norm": 5.1645519441733745, + "kl": 0.07841796875, + "learning_rate": 7.56297246454829e-07, + "loss": 0.0031, + "num_tokens": 41323545.0, + "reward": 1.9291666746139526, + "reward_std": 0.08249579817056656, + "rewards/accuracy_reward/mean": 0.9125, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0471404530107975, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 333.3875, + "completions/mean_terminated_length": 333.3875, + "completions/min_length": 243.5, + "completions/min_terminated_length": 243.5, + "epoch": 0.3309741881765196, + "grad_norm": 0.1884882034576032, + "kl": 0.0700927734375, + "learning_rate": 7.534842629561791e-07, + "loss": 0.0028, + "num_tokens": 41586160.0, + "reward": 1.8593750238418578, + "reward_std": 0.09315259978175164, + "rewards/accuracy_reward/mean": 0.8375, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02187500111758709, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04139767438173294, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.6, + "completions/max_terminated_length": 504.6, + "completions/mean_length": 363.3, + "completions/mean_terminated_length": 363.3, + "completions/min_length": 246.5, + "completions/min_terminated_length": 246.5, + "epoch": 0.33305578684429643, + "grad_norm": 5.826370704218823, + "kl": 0.0677001953125, + "learning_rate": 7.506604391100748e-07, + "loss": 0.0027, + "num_tokens": 41847152.0, + "reward": 1.8416666746139527, + "reward_std": 0.0497533455491066, + "rewards/accuracy_reward/mean": 0.8, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04166666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0497533343732357, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.9, + "completions/max_terminated_length": 443.9, + "completions/mean_length": 335.6875, + "completions/mean_terminated_length": 335.6875, + "completions/min_length": 254.6, + "completions/min_terminated_length": 254.6, + "epoch": 0.3351373855120733, + "grad_norm": 4.972314064995392, + "kl": 0.072509765625, + "learning_rate": 7.478258956783781e-07, + "loss": 0.0029, + "num_tokens": 42112783.0, + "reward": 2.008333349227905, + "reward_std": 0.17427795231342316, + "rewards/accuracy_reward/mean": 0.9625, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04583333432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09263160824775696, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.1, + "completions/max_terminated_length": 423.1, + "completions/mean_length": 339.675, + "completions/mean_terminated_length": 339.675, + "completions/min_length": 249.8, + "completions/min_terminated_length": 249.8, + "epoch": 0.33721898417985013, + "grad_norm": 5.179177647842328, + "kl": 0.068310546875, + "learning_rate": 7.44980753881378e-07, + "loss": 0.0027, + "num_tokens": 42361445.0, + "reward": 2.0614583492279053, + "reward_std": 0.11628946885466576, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0614583358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11628946885466576, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.5, + "completions/max_terminated_length": 485.5, + "completions/mean_length": 388.5375, + "completions/mean_terminated_length": 388.5375, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.339300582847627, + "grad_norm": 0.23406569969548927, + "kl": 0.0636474609375, + "learning_rate": 7.421251353926073e-07, + "loss": 0.0025, + "num_tokens": 42602776.0, + "reward": 1.8666666746139526, + "reward_std": 0.16328328996896743, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.1514981746673584, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00416666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01178511455655098, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.8, + "completions/max_terminated_length": 514.8, + "completions/mean_length": 406.3125, + "completions/mean_terminated_length": 406.3125, + "completions/min_length": 305.1, + "completions/min_terminated_length": 305.1, + "epoch": 0.34138218151540384, + "grad_norm": 0.23754229470691682, + "kl": 0.067724609375, + "learning_rate": 7.39259162333637e-07, + "loss": 0.0027, + "num_tokens": 42821513.0, + "reward": 1.9469507694244386, + "reward_std": 0.059881458431482314, + "rewards/accuracy_reward/mean": 0.9075757578015328, + "rewards/accuracy_reward/std": 0.0014284986071288585, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03937500081956387, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05958408713340759, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.4, + "completions/max_terminated_length": 468.4, + "completions/mean_length": 387.175, + "completions/mean_terminated_length": 387.175, + "completions/min_length": 313.5, + "completions/min_terminated_length": 313.5, + "epoch": 0.3434637801831807, + "grad_norm": 0.2073197430914097, + "kl": 0.07919921875, + "learning_rate": 7.363829572688566e-07, + "loss": 0.0032, + "num_tokens": 43080103.0, + "reward": 1.7793055534362794, + "reward_std": 0.16691839694976807, + "rewards/accuracy_reward/mean": 0.7555555552244186, + "rewards/accuracy_reward/std": 0.09974325299263001, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.023750000074505805, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06717514395713806, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 428.1125, + "completions/mean_terminated_length": 428.1125, + "completions/min_length": 313.9, + "completions/min_terminated_length": 313.9, + "epoch": 0.34554537885095754, + "grad_norm": 0.18312263174837315, + "kl": 0.0740478515625, + "learning_rate": 7.334966432002301e-07, + "loss": 0.003, + "num_tokens": 43326488.0, + "reward": 1.8846933603286744, + "reward_std": 0.10355282425880433, + "rewards/accuracy_reward/mean": 0.8513600140810013, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03333333395421505, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05179789587855339, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.2, + "completions/max_terminated_length": 568.2, + "completions/mean_length": 429.7875, + "completions/mean_terminated_length": 429.7875, + "completions/min_length": 330.5, + "completions/min_terminated_length": 330.5, + "epoch": 0.3476269775187344, + "grad_norm": 0.1688036787174455, + "kl": 0.071240234375, + "learning_rate": 7.30600343562037e-07, + "loss": 0.0028, + "num_tokens": 43604655.0, + "reward": 1.785193634033203, + "reward_std": 0.09075549244880676, + "rewards/accuracy_reward/mean": 0.7685269489884377, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03900056481361389, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.7, + "completions/max_terminated_length": 520.7, + "completions/mean_length": 382.1125, + "completions/mean_terminated_length": 382.1125, + "completions/min_length": 284.6, + "completions/min_terminated_length": 284.6, + "epoch": 0.34970857618651124, + "grad_norm": 0.1915634889403887, + "kl": 0.07587890625, + "learning_rate": 7.276941822155931e-07, + "loss": 0.003, + "num_tokens": 43879256.0, + "reward": 1.83125, + "reward_std": 0.11884753406047821, + "rewards/accuracy_reward/mean": 0.825, + "rewards/accuracy_reward/std": 0.11700168251991272, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.7, + "completions/max_terminated_length": 562.7, + "completions/mean_length": 421.9875, + "completions/mean_terminated_length": 421.9875, + "completions/min_length": 326.7, + "completions/min_terminated_length": 326.7, + "epoch": 0.3517901748542881, + "grad_norm": 0.268720747030889, + "kl": 0.066748046875, + "learning_rate": 7.247782834439546e-07, + "loss": 0.0027, + "num_tokens": 44151383.0, + "reward": 1.7979166746139525, + "reward_std": 0.1014419287443161, + "rewards/accuracy_reward/mean": 0.7875, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010416667163372039, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019795581698417664, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.1, + "completions/max_terminated_length": 496.1, + "completions/mean_length": 371.375, + "completions/mean_terminated_length": 371.375, + "completions/min_length": 280.7, + "completions/min_terminated_length": 280.7, + "epoch": 0.35387177352206495, + "grad_norm": 4.259811558741672, + "kl": 0.076416015625, + "learning_rate": 7.218527719466013e-07, + "loss": 0.0031, + "num_tokens": 44415861.0, + "reward": 1.7114583373069763, + "reward_std": 0.1656905271112919, + "rewards/accuracy_reward/mean": 0.6875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03645833358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09497984722256661, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.2, + "completions/max_terminated_length": 580.2, + "completions/mean_length": 442.4125, + "completions/mean_terminated_length": 442.4125, + "completions/min_length": 340.7, + "completions/min_terminated_length": 340.7, + "epoch": 0.3559533721898418, + "grad_norm": 0.1856302711758979, + "kl": 0.06845703125, + "learning_rate": 7.189177728341051e-07, + "loss": 0.0027, + "num_tokens": 44673238.0, + "reward": 1.653946590423584, + "reward_std": 0.1017528209136799, + "rewards/accuracy_reward/mean": 0.6508215961977839, + "rewards/accuracy_reward/std": 0.09291398625355214, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.003125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.00883883461356163, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.4, + "completions/max_terminated_length": 508.4, + "completions/mean_length": 400.5125, + "completions/mean_terminated_length": 400.5125, + "completions/min_length": 308.3, + "completions/min_terminated_length": 308.3, + "epoch": 0.35803497085761865, + "grad_norm": 4.670734841619906, + "kl": 0.0704345703125, + "learning_rate": 7.159734116227795e-07, + "loss": 0.0028, + "num_tokens": 44950983.0, + "reward": 1.9545833349227906, + "reward_std": 0.12122158259153366, + "rewards/accuracy_reward/mean": 0.9125, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04208333380520344, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08586623594164848, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.6, + "completions/max_terminated_length": 589.6, + "completions/mean_length": 448.6375, + "completions/mean_terminated_length": 448.6375, + "completions/min_length": 337.9, + "completions/min_terminated_length": 337.9, + "epoch": 0.3601165695253955, + "grad_norm": 0.1816425284023795, + "kl": 0.074951171875, + "learning_rate": 7.130198142293112e-07, + "loss": 0.003, + "num_tokens": 45214626.0, + "reward": 1.8920833468437195, + "reward_std": 0.1463622696697712, + "rewards/accuracy_reward/mean": 0.8, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.09208333436399699, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.14636227563023568, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.4, + "completions/max_terminated_length": 600.4, + "completions/mean_length": 460.5875, + "completions/mean_terminated_length": 460.5875, + "completions/min_length": 348.4, + "completions/min_terminated_length": 348.4, + "epoch": 0.36219816819317235, + "grad_norm": 0.14018851586139344, + "kl": 0.0724853515625, + "learning_rate": 7.100571069653758e-07, + "loss": 0.0029, + "num_tokens": 45477105.0, + "reward": 1.9604166746139526, + "reward_std": 0.1159398838877678, + "rewards/accuracy_reward/mean": 0.95, + "rewards/accuracy_reward/std": 0.09258201122283935, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01041666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0233578659594059, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.6, + "completions/max_terminated_length": 563.6, + "completions/mean_length": 434.2125, + "completions/mean_terminated_length": 434.2125, + "completions/min_length": 333.6, + "completions/min_terminated_length": 333.6, + "epoch": 0.3642797668609492, + "grad_norm": 0.17898776181926912, + "kl": 0.07880859375, + "learning_rate": 7.07085416532236e-07, + "loss": 0.0032, + "num_tokens": 45759426.0, + "reward": 1.862916684150696, + "reward_std": 0.08998610377311707, + "rewards/accuracy_reward/mean": 0.85, + "rewards/accuracy_reward/std": 0.05345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.012916666828095913, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03653385192155838, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.5, + "completions/max_terminated_length": 630.5, + "completions/mean_length": 460.35, + "completions/mean_terminated_length": 460.35, + "completions/min_length": 352.4, + "completions/min_terminated_length": 352.4, + "epoch": 0.36636136552872606, + "grad_norm": 0.1815748637730769, + "kl": 0.0739013671875, + "learning_rate": 7.041048700153225e-07, + "loss": 0.003, + "num_tokens": 46035510.0, + "reward": 1.803125, + "reward_std": 0.1404043108224869, + "rewards/accuracy_reward/mean": 0.7875, + "rewards/accuracy_reward/std": 0.10520716905593872, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.015625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03519715070724487, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.7, + "completions/max_terminated_length": 533.7, + "completions/mean_length": 403.125, + "completions/mean_terminated_length": 403.125, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.3684429641965029, + "grad_norm": 0.1846933022988794, + "kl": 0.0753173828125, + "learning_rate": 7.011155948788004e-07, + "loss": 0.003, + "num_tokens": 46302008.0, + "reward": 1.89375, + "reward_std": 0.09932401329278946, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.9, + "completions/max_terminated_length": 505.9, + "completions/mean_length": 361.125, + "completions/mean_terminated_length": 361.125, + "completions/min_length": 261.7, + "completions/min_terminated_length": 261.7, + "epoch": 0.37052456286427976, + "grad_norm": 0.24274231022975615, + "kl": 0.0776611328125, + "learning_rate": 6.981177189601168e-07, + "loss": 0.0031, + "num_tokens": 46562450.0, + "reward": 1.8025000095367432, + "reward_std": 0.09913797974586487, + "rewards/accuracy_reward/mean": 0.775, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02750000059604645, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07040632367134095, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.4, + "completions/max_terminated_length": 481.4, + "completions/mean_length": 378.1, + "completions/mean_terminated_length": 378.1, + "completions/min_length": 277.3, + "completions/min_terminated_length": 277.3, + "epoch": 0.3726061615320566, + "grad_norm": 5.690030544072626, + "kl": 0.0618408203125, + "learning_rate": 6.951113704645347e-07, + "loss": 0.0025, + "num_tokens": 46828018.0, + "reward": 1.9410416722297668, + "reward_std": 0.10387353450059891, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.041041666828095916, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10387352779507637, + "step": 1790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.3, + "completions/max_terminated_length": 487.3, + "completions/mean_length": 371.975, + "completions/mean_terminated_length": 371.975, + "completions/min_length": 259.7, + "completions/min_terminated_length": 259.7, + "epoch": 0.37468776019983346, + "grad_norm": 0.22872967977661343, + "kl": 0.0654296875, + "learning_rate": 6.920966779596499e-07, + "loss": 0.0026, + "num_tokens": 47102656.0, + "reward": 1.8604166746139525, + "reward_std": 0.0851863980293274, + "rewards/accuracy_reward/mean": 0.8375, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.053312502801418304, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 374.075, + "completions/mean_terminated_length": 374.075, + "completions/min_length": 266.2, + "completions/min_terminated_length": 266.2, + "epoch": 0.3767693588676103, + "grad_norm": 0.24099285333188789, + "kl": 0.06142578125, + "learning_rate": 6.890737703698929e-07, + "loss": 0.0025, + "num_tokens": 47372798.0, + "reward": 1.9625, + "reward_std": 0.05175491571426392, + "rewards/accuracy_reward/mean": 0.9625, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 1810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.9, + "completions/max_terminated_length": 469.9, + "completions/mean_length": 360.4625, + "completions/mean_terminated_length": 360.4625, + "completions/min_length": 260.6, + "completions/min_terminated_length": 260.6, + "epoch": 0.37885095753538717, + "grad_norm": 0.18257577519312274, + "kl": 0.0643310546875, + "learning_rate": 6.860427769710151e-07, + "loss": 0.0026, + "num_tokens": 47636883.0, + "reward": 1.959791660308838, + "reward_std": 0.09345597624778748, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03479166626930237, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04716496616601944, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.2, + "completions/max_terminated_length": 516.2, + "completions/mean_length": 378.2375, + "completions/mean_terminated_length": 378.2375, + "completions/min_length": 260.1, + "completions/min_terminated_length": 260.1, + "epoch": 0.380932556203164, + "grad_norm": 0.1648102773611354, + "kl": 0.06181640625, + "learning_rate": 6.830038273845607e-07, + "loss": 0.0025, + "num_tokens": 47887326.0, + "reward": 2.092500019073486, + "reward_std": 0.13325504809617997, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0925000011920929, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.13325505405664445, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.5, + "completions/max_terminated_length": 487.5, + "completions/mean_length": 377.3625, + "completions/mean_terminated_length": 377.3625, + "completions/min_length": 277.9, + "completions/min_terminated_length": 277.9, + "epoch": 0.38301415487094087, + "grad_norm": 0.16638683505842794, + "kl": 0.05546875, + "learning_rate": 6.799570515723232e-07, + "loss": 0.0022, + "num_tokens": 48132531.0, + "reward": 1.9729166746139526, + "reward_std": 0.10860317051410676, + "rewards/accuracy_reward/mean": 0.95, + "rewards/accuracy_reward/std": 0.05345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.055150920152664186, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.7, + "completions/max_terminated_length": 417.7, + "completions/mean_length": 319.4625, + "completions/mean_terminated_length": 319.4625, + "completions/min_length": 228.9, + "completions/min_terminated_length": 228.9, + "epoch": 0.3850957535387177, + "grad_norm": 6.452846913122507, + "kl": 0.067578125, + "learning_rate": 6.769025798307872e-07, + "loss": 0.0027, + "num_tokens": 48376000.0, + "reward": 1.9947916746139527, + "reward_std": 0.12939899265766144, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06979166865348815, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.083107990026474, + "step": 1850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.8, + "completions/max_terminated_length": 469.8, + "completions/mean_length": 329.225, + "completions/mean_terminated_length": 329.225, + "completions/min_length": 214.2, + "completions/min_terminated_length": 214.2, + "epoch": 0.3871773522064946, + "grad_norm": 0.1827940633944976, + "kl": 0.0648193359375, + "learning_rate": 6.738405427855569e-07, + "loss": 0.0026, + "num_tokens": 48642826.0, + "reward": 1.7982400417327882, + "reward_std": 0.14018386900424956, + "rewards/accuracy_reward/mean": 0.7607400402426719, + "rewards/accuracy_reward/std": 0.05345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03750000111758709, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08673161640763283, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.8, + "completions/max_terminated_length": 474.8, + "completions/mean_length": 355.0125, + "completions/mean_terminated_length": 355.0125, + "completions/min_length": 251.2, + "completions/min_terminated_length": 251.2, + "epoch": 0.3892589508742714, + "grad_norm": 0.18147870737021074, + "kl": 0.0661865234375, + "learning_rate": 6.707710713857695e-07, + "loss": 0.0026, + "num_tokens": 48884763.0, + "reward": 1.9114583492279054, + "reward_std": 0.1345927134156227, + "rewards/accuracy_reward/mean": 0.9125, + "rewards/accuracy_reward/std": 0.09804592728614807, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01145833395421505, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02546912059187889, + "step": 1870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.6, + "completions/max_terminated_length": 433.6, + "completions/mean_length": 307.8875, + "completions/mean_terminated_length": 307.8875, + "completions/min_length": 200.6, + "completions/min_terminated_length": 200.6, + "epoch": 0.3913405495420483, + "grad_norm": 5.501740334693203, + "kl": 0.06533203125, + "learning_rate": 6.676942968984947e-07, + "loss": 0.0026, + "num_tokens": 49132586.0, + "reward": 2.0125, + "reward_std": 0.0902341976761818, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05487886220216751, + "step": 1880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.1, + "completions/max_terminated_length": 409.1, + "completions/mean_length": 308.0, + "completions/mean_terminated_length": 308.0, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.39342214820982513, + "grad_norm": 0.25039435539094473, + "kl": 0.0704833984375, + "learning_rate": 6.646103509031218e-07, + "loss": 0.0028, + "num_tokens": 49393066.0, + "reward": 1.8256556272506714, + "reward_std": 0.046291005611419675, + "rewards/accuracy_reward/mean": 0.8256556272506714, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 1890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.3, + "completions/max_terminated_length": 401.3, + "completions/mean_length": 275.8875, + "completions/mean_terminated_length": 275.8875, + "completions/min_length": 164.6, + "completions/min_terminated_length": 164.6, + "epoch": 0.395503746877602, + "grad_norm": 0.25691871199244704, + "kl": 0.0763427734375, + "learning_rate": 6.61519365285732e-07, + "loss": 0.0031, + "num_tokens": 49655017.0, + "reward": 1.844861125946045, + "reward_std": 0.05957057476043701, + "rewards/accuracy_reward/mean": 0.8111111111938953, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03375000059604645, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05957057476043701, + "step": 1900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.8, + "completions/max_terminated_length": 371.8, + "completions/mean_length": 266.1375, + "completions/mean_terminated_length": 266.1375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.39758534554537883, + "grad_norm": 0.20036486923463614, + "kl": 0.0826171875, + "learning_rate": 6.584214722334587e-07, + "loss": 0.0033, + "num_tokens": 49910500.0, + "reward": 2.0104166746139525, + "reward_std": 0.0197955846786499, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010416667163372039, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019795581698417664, + "step": 1910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.7, + "completions/max_terminated_length": 427.7, + "completions/mean_length": 326.55, + "completions/mean_terminated_length": 326.55, + "completions/min_length": 239.1, + "completions/min_terminated_length": 239.1, + "epoch": 0.3996669442131557, + "grad_norm": 0.1801963277310025, + "kl": 0.066650390625, + "learning_rate": 6.553168042288344e-07, + "loss": 0.0027, + "num_tokens": 50147856.0, + "reward": 1.8151979327201844, + "reward_std": 0.0197955846786499, + "rewards/accuracy_reward/mean": 0.804781262204051, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010416667163372039, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019795581698417664, + "step": 1920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.6, + "completions/max_terminated_length": 432.6, + "completions/mean_length": 321.975, + "completions/mean_terminated_length": 321.975, + "completions/min_length": 220.6, + "completions/min_terminated_length": 220.6, + "epoch": 0.40174854288093254, + "grad_norm": 4.238761240943084, + "kl": 0.0733154296875, + "learning_rate": 6.522054940441245e-07, + "loss": 0.0029, + "num_tokens": 50385742.0, + "reward": 2.0125, + "reward_std": 0.22073246538639069, + "rewards/accuracy_reward/mean": 0.9125, + "rewards/accuracy_reward/std": 0.09804592728614807, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.1, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.13836860954761504, + "step": 1930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.5, + "completions/max_terminated_length": 444.5, + "completions/mean_length": 331.9875, + "completions/mean_terminated_length": 331.9875, + "completions/min_length": 223.5, + "completions/min_terminated_length": 223.5, + "epoch": 0.4038301415487094, + "grad_norm": 7.556629315451778, + "kl": 0.0675048828125, + "learning_rate": 6.490876747356502e-07, + "loss": 0.0027, + "num_tokens": 50613853.0, + "reward": 1.9708333492279053, + "reward_std": 0.11572751551866531, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04583333432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06943650841712952, + "step": 1940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.2, + "completions/max_terminated_length": 479.2, + "completions/mean_length": 348.8625, + "completions/mean_terminated_length": 348.8625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.40591174021648624, + "grad_norm": 4.456618141615971, + "kl": 0.067236328125, + "learning_rate": 6.459634796380971e-07, + "loss": 0.0027, + "num_tokens": 50879938.0, + "reward": 2.015416693687439, + "reward_std": 0.08786429166793823, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.027916667237877845, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05250894501805305, + "step": 1950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.5, + "completions/max_terminated_length": 445.5, + "completions/mean_length": 344.4125, + "completions/mean_terminated_length": 344.4125, + "completions/min_length": 251.2, + "completions/min_terminated_length": 251.2, + "epoch": 0.4079933388842631, + "grad_norm": 4.5171789641781315, + "kl": 0.101806640625, + "learning_rate": 6.428330423588145e-07, + "loss": 0.0041, + "num_tokens": 51134059.0, + "reward": 1.921875, + "reward_std": 0.04604002460837364, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.021875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.046040027588605884, + "step": 1960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.2, + "completions/max_terminated_length": 545.2, + "completions/mean_length": 395.8375, + "completions/mean_terminated_length": 395.8375, + "completions/min_length": 272.7, + "completions/min_terminated_length": 272.7, + "epoch": 0.41007493755203994, + "grad_norm": 0.16951611801469424, + "kl": 0.0731689453125, + "learning_rate": 6.396964967721005e-07, + "loss": 0.0029, + "num_tokens": 51394838.0, + "reward": 1.8356250047683715, + "reward_std": 0.22055620830506087, + "rewards/accuracy_reward/mean": 0.8033333331346512, + "rewards/accuracy_reward/std": 0.13616152815520763, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03229166716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08439468294382095, + "step": 1970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.8, + "completions/max_terminated_length": 561.8, + "completions/mean_length": 404.5375, + "completions/mean_terminated_length": 404.5375, + "completions/min_length": 282.6, + "completions/min_terminated_length": 282.6, + "epoch": 0.4121565362198168, + "grad_norm": 0.16229005912790964, + "kl": 0.071826171875, + "learning_rate": 6.365539770134771e-07, + "loss": 0.0029, + "num_tokens": 51658337.0, + "reward": 1.9541666746139525, + "reward_std": 0.08854104951024055, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05416666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08854104280471801, + "step": 1980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.4, + "completions/max_terminated_length": 535.4, + "completions/mean_length": 414.8125, + "completions/mean_terminated_length": 414.8125, + "completions/min_length": 324.1, + "completions/min_terminated_length": 324.1, + "epoch": 0.41423813488759365, + "grad_norm": 4.555204301813374, + "kl": 0.0670654296875, + "learning_rate": 6.334056174739544e-07, + "loss": 0.0027, + "num_tokens": 51922002.0, + "reward": 1.8643336772918702, + "reward_std": 0.10620288997888565, + "rewards/accuracy_reward/mean": 0.8149586588144302, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04937499985098839, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10620288476347924, + "step": 1990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.5, + "completions/max_terminated_length": 536.5, + "completions/mean_length": 414.0125, + "completions/mean_terminated_length": 414.0125, + "completions/min_length": 318.2, + "completions/min_terminated_length": 318.2, + "epoch": 0.4163197335553705, + "grad_norm": 4.905038947000655, + "kl": 0.0641845703125, + "learning_rate": 6.302515527942821e-07, + "loss": 0.0026, + "num_tokens": 52200915.0, + "reward": 1.9447916746139526, + "reward_std": 0.10023652911186218, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04479166716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10023652613162995, + "step": 2000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.5, + "completions/max_terminated_length": 497.5, + "completions/mean_length": 371.0, + "completions/mean_terminated_length": 371.0, + "completions/min_length": 262.3, + "completions/min_terminated_length": 262.3, + "epoch": 0.4184013322231474, + "grad_norm": 0.2606902179556751, + "kl": 0.0752685546875, + "learning_rate": 6.270919178591931e-07, + "loss": 0.003, + "num_tokens": 52447019.0, + "reward": 2.019791674613953, + "reward_std": 0.046982268989086154, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01979166679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.046982265263795855, + "step": 2010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.4, + "completions/max_terminated_length": 517.4, + "completions/mean_length": 385.0125, + "completions/mean_terminated_length": 385.0125, + "completions/min_length": 272.6, + "completions/min_terminated_length": 272.6, + "epoch": 0.42048293089092426, + "grad_norm": 0.1756738079044623, + "kl": 0.0638427734375, + "learning_rate": 6.239268477916339e-07, + "loss": 0.0026, + "num_tokens": 52718236.0, + "reward": 1.990625, + "reward_std": 0.10128694772720337, + "rewards/accuracy_reward/mean": 0.9625, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.028125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05512984022498131, + "step": 2020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.9, + "completions/max_terminated_length": 510.9, + "completions/mean_length": 380.925, + "completions/mean_terminated_length": 380.925, + "completions/min_length": 278.6, + "completions/min_terminated_length": 278.6, + "epoch": 0.4225645295587011, + "grad_norm": 0.15933891953595988, + "kl": 0.0642578125, + "learning_rate": 6.207564779469866e-07, + "loss": 0.0026, + "num_tokens": 52978670.0, + "reward": 1.9325980424880982, + "reward_std": 0.04247846901416778, + "rewards/accuracy_reward/mean": 0.9138480395078659, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04247846454381943, + "step": 2030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.2, + "completions/max_terminated_length": 476.2, + "completions/mean_length": 362.675, + "completions/mean_terminated_length": 362.675, + "completions/min_length": 262.6, + "completions/min_terminated_length": 262.6, + "epoch": 0.42464612822647796, + "grad_norm": 5.441756902551618, + "kl": 0.0628662109375, + "learning_rate": 6.175809439072801e-07, + "loss": 0.0025, + "num_tokens": 53253164.0, + "reward": 1.85625, + "reward_std": 0.1047879233956337, + "rewards/accuracy_reward/mean": 0.85, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 2040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.7, + "completions/max_terminated_length": 470.7, + "completions/mean_length": 361.175, + "completions/mean_terminated_length": 361.175, + "completions/min_length": 255.9, + "completions/min_terminated_length": 255.9, + "epoch": 0.4267277268942548, + "grad_norm": 0.1751402303368657, + "kl": 0.0652587890625, + "learning_rate": 6.144003814753918e-07, + "loss": 0.0026, + "num_tokens": 53526690.0, + "reward": 1.950000023841858, + "reward_std": 0.11148266792297364, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06250000409781933, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07612732574343681, + "step": 2050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.6, + "completions/max_terminated_length": 490.6, + "completions/mean_length": 397.1375, + "completions/mean_terminated_length": 397.1375, + "completions/min_length": 282.8, + "completions/min_terminated_length": 282.8, + "epoch": 0.42880932556203166, + "grad_norm": 0.19036129861503978, + "kl": 0.0601806640625, + "learning_rate": 6.112149266692408e-07, + "loss": 0.0024, + "num_tokens": 53775445.0, + "reward": 1.8781250238418579, + "reward_std": 0.08377420753240586, + "rewards/accuracy_reward/mean": 0.8, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07812500447034836, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08377420306205749, + "step": 2060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.2, + "completions/max_terminated_length": 533.2, + "completions/mean_length": 396.175, + "completions/mean_terminated_length": 396.175, + "completions/min_length": 268.6, + "completions/min_terminated_length": 268.6, + "epoch": 0.4308909242298085, + "grad_norm": 0.19970426413421907, + "kl": 0.0605224609375, + "learning_rate": 6.080247157159698e-07, + "loss": 0.0024, + "num_tokens": 54048691.0, + "reward": 1.8385416746139527, + "reward_std": 0.08529684022068977, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02604166679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06144712120294571, + "step": 2070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.4, + "completions/max_terminated_length": 492.4, + "completions/mean_length": 363.4, + "completions/mean_terminated_length": 363.4, + "completions/min_length": 244.8, + "completions/min_terminated_length": 244.8, + "epoch": 0.43297252289758537, + "grad_norm": 4.437547266147232, + "kl": 0.0663330078125, + "learning_rate": 6.048298850461199e-07, + "loss": 0.0027, + "num_tokens": 54284083.0, + "reward": 2.0614583492279053, + "reward_std": 0.08593156784772873, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06145833432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08593156784772873, + "step": 2080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.8, + "completions/max_terminated_length": 444.8, + "completions/mean_length": 346.9875, + "completions/mean_terminated_length": 346.9875, + "completions/min_length": 253.2, + "completions/min_terminated_length": 253.2, + "epoch": 0.4350541215653622, + "grad_norm": 0.16638263391399472, + "kl": 0.066162109375, + "learning_rate": 6.016305712877963e-07, + "loss": 0.0026, + "num_tokens": 54530666.0, + "reward": 1.9469957947731018, + "reward_std": 0.06380395293235779, + "rewards/accuracy_reward/mean": 0.91262077242136, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03437500111758709, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06380394622683525, + "step": 2090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.7, + "completions/max_terminated_length": 455.7, + "completions/mean_length": 349.175, + "completions/mean_terminated_length": 349.175, + "completions/min_length": 243.2, + "completions/min_terminated_length": 243.2, + "epoch": 0.43713572023313907, + "grad_norm": 0.7141845653139242, + "kl": 0.06982421875, + "learning_rate": 5.984269112608248e-07, + "loss": 0.0028, + "num_tokens": 54807816.0, + "reward": 1.8172561049461364, + "reward_std": 0.011785121262073516, + "rewards/accuracy_reward/mean": 0.8130894303321838, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00416666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01178511455655098, + "step": 2100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.8, + "completions/max_terminated_length": 485.8, + "completions/mean_length": 374.5875, + "completions/mean_terminated_length": 374.5875, + "completions/min_length": 271.4, + "completions/min_terminated_length": 271.4, + "epoch": 0.4392173189009159, + "grad_norm": 5.696974098605386, + "kl": 0.0635009765625, + "learning_rate": 5.952190419709015e-07, + "loss": 0.0025, + "num_tokens": 55041135.0, + "reward": 1.9550000309944153, + "reward_std": 0.2741450160741806, + "rewards/accuracy_reward/mean": 0.8091666668653488, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.1458333373069763, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.2038055345416069, + "step": 2110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.3, + "completions/max_terminated_length": 505.3, + "completions/mean_length": 371.55, + "completions/mean_terminated_length": 371.55, + "completions/min_length": 271.3, + "completions/min_terminated_length": 271.3, + "epoch": 0.4412989175686928, + "grad_norm": 5.4845719987127595, + "kl": 0.0662353515625, + "learning_rate": 5.920071006037328e-07, + "loss": 0.0027, + "num_tokens": 55270747.0, + "reward": 1.959375, + "reward_std": 0.13593488335609435, + "rewards/accuracy_reward/mean": 0.9375, + "rewards/accuracy_reward/std": 0.08880758583545685, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.021875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06187184154987335, + "step": 2120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.1, + "completions/max_terminated_length": 562.1, + "completions/mean_length": 406.5875, + "completions/mean_terminated_length": 406.5875, + "completions/min_length": 296.7, + "completions/min_terminated_length": 296.7, + "epoch": 0.4433805162364696, + "grad_norm": 0.16148691721988143, + "kl": 0.05693359375, + "learning_rate": 5.88791224519169e-07, + "loss": 0.0023, + "num_tokens": 55517802.0, + "reward": 1.9183068990707397, + "reward_std": 0.22507139891386033, + "rewards/accuracy_reward/mean": 0.8165211647748947, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.10178571604192257, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.17878038063645363, + "step": 2130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.6, + "completions/max_terminated_length": 518.6, + "completions/mean_length": 393.7625, + "completions/mean_terminated_length": 393.7625, + "completions/min_length": 280.5, + "completions/min_terminated_length": 280.5, + "epoch": 0.4454621149042465, + "grad_norm": 0.1931719760035152, + "kl": 0.067578125, + "learning_rate": 5.8557155124533e-07, + "loss": 0.0027, + "num_tokens": 55776967.0, + "reward": 1.9604166746139526, + "reward_std": 0.05612906813621521, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06041666865348816, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.056129063665866855, + "step": 2140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.4, + "completions/max_terminated_length": 628.4, + "completions/mean_length": 486.4125, + "completions/mean_terminated_length": 486.4125, + "completions/min_length": 391.4, + "completions/min_terminated_length": 391.4, + "epoch": 0.4475437135720233, + "grad_norm": 0.17084291616318495, + "kl": 0.0604736328125, + "learning_rate": 5.82348218472724e-07, + "loss": 0.0024, + "num_tokens": 56029360.0, + "reward": 1.875, + "reward_std": 0.19062008559703827, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.15526476502418518, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, + "step": 2150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.3, + "completions/max_terminated_length": 599.3, + "completions/mean_length": 443.6875, + "completions/mean_terminated_length": 443.6875, + "completions/min_length": 295.2, + "completions/min_terminated_length": 295.2, + "epoch": 0.4496253122398002, + "grad_norm": 0.48374155371617983, + "kl": 0.06611328125, + "learning_rate": 5.791213640483591e-07, + "loss": 0.0026, + "num_tokens": 56290727.0, + "reward": 1.69375, + "reward_std": 0.05303300768136978, + "rewards/accuracy_reward/mean": 0.6875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 2160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.4, + "completions/max_terminated_length": 588.4, + "completions/mean_length": 403.6, + "completions/mean_terminated_length": 403.6, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.45170691090757703, + "grad_norm": 0.1445031069139877, + "kl": 0.0674072265625, + "learning_rate": 5.758911259698479e-07, + "loss": 0.0027, + "num_tokens": 56535423.0, + "reward": 2.0375, + "reward_std": 0.05717712491750717, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0375, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05717712789773941, + "step": 2170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 582.1, + "completions/max_terminated_length": 548.7, + "completions/mean_length": 419.825, + "completions/mean_terminated_length": 414.7017883300781, + "completions/min_length": 302.8, + "completions/min_terminated_length": 302.8, + "epoch": 0.4537885095753539, + "grad_norm": 0.2039012828985732, + "kl": 0.063671875, + "learning_rate": 5.726576423795064e-07, + "loss": 0.0025, + "num_tokens": 56776377.0, + "reward": 1.996666669845581, + "reward_std": 0.08670328855514527, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.021666666865348815, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03576821386814118, + "step": 2180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.7, + "completions/max_terminated_length": 615.7, + "completions/mean_length": 450.525, + "completions/mean_terminated_length": 450.525, + "completions/min_length": 322.9, + "completions/min_terminated_length": 322.9, + "epoch": 0.45587010824313073, + "grad_norm": 0.2191739216969976, + "kl": 0.05869140625, + "learning_rate": 5.694210515584457e-07, + "loss": 0.0023, + "num_tokens": 57055635.0, + "reward": 2.0091666698455812, + "reward_std": 0.08777731209993363, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.021666666865348815, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05242196917533874, + "step": 2190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.4, + "completions/max_terminated_length": 534.4, + "completions/mean_length": 397.9875, + "completions/mean_terminated_length": 397.9875, + "completions/min_length": 292.4, + "completions/min_terminated_length": 292.4, + "epoch": 0.4579517069109076, + "grad_norm": 0.18071086089547078, + "kl": 0.065087890625, + "learning_rate": 5.661814919206594e-07, + "loss": 0.0026, + "num_tokens": 57296922.0, + "reward": 2.028869080543518, + "reward_std": 0.05570702590048313, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.028869048692286015, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05570701584219932, + "step": 2200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.9, + "completions/max_terminated_length": 531.9, + "completions/mean_length": 400.8125, + "completions/mean_terminated_length": 400.8125, + "completions/min_length": 292.9, + "completions/min_terminated_length": 292.9, + "epoch": 0.46003330557868444, + "grad_norm": 4.088657909363055, + "kl": 0.0660888671875, + "learning_rate": 5.629391020071032e-07, + "loss": 0.0026, + "num_tokens": 57544875.0, + "reward": 1.8931250095367431, + "reward_std": 0.23395789116621019, + "rewards/accuracy_reward/mean": 0.8, + "rewards/accuracy_reward/std": 0.09258201122283935, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.09312500022351741, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1542452432215214, + "step": 2210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.7, + "completions/max_terminated_length": 529.7, + "completions/mean_length": 409.4875, + "completions/mean_terminated_length": 409.4875, + "completions/min_length": 315.7, + "completions/min_terminated_length": 315.7, + "epoch": 0.4621149042464613, + "grad_norm": 4.76066463942462, + "kl": 0.1063720703125, + "learning_rate": 5.59694020479771e-07, + "loss": 0.0043, + "num_tokens": 57816154.0, + "reward": 1.96875, + "reward_std": 0.1952166110277176, + "rewards/accuracy_reward/mean": 0.9375, + "rewards/accuracy_reward/std": 0.12793734967708587, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0672792598605156, + "step": 2220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.3, + "completions/max_terminated_length": 669.3, + "completions/mean_length": 536.5375, + "completions/mean_terminated_length": 536.5375, + "completions/min_length": 384.3, + "completions/min_terminated_length": 384.3, + "epoch": 0.46419650291423814, + "grad_norm": 4.334216934519029, + "kl": 0.0681884765625, + "learning_rate": 5.564463861157637e-07, + "loss": 0.0027, + "num_tokens": 58090021.0, + "reward": 1.3, + "reward_std": 0.4292363554239273, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.39388103485107423, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 2230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 723.7, + "completions/max_terminated_length": 663.9, + "completions/mean_length": 526.05, + "completions/mean_terminated_length": 515.7785766601562, + "completions/min_length": 369.8, + "completions/min_terminated_length": 369.8, + "epoch": 0.466278101582015, + "grad_norm": 4.1930076280017206, + "kl": 0.0716796875, + "learning_rate": 5.531963378013561e-07, + "loss": 0.0029, + "num_tokens": 58377081.0, + "reward": 1.3382021546363831, + "reward_std": 0.4580157116055489, + "rewards/accuracy_reward/mean": 0.2954938292503357, + "rewards/accuracy_reward/std": 0.3434520088136196, + "rewards/format_reward/mean": 0.975, + "rewards/format_reward/std": 0.07071067690849304, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0677083346992731, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09890521839261054, + "step": 2240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.5, + "completions/max_terminated_length": 625.5, + "completions/mean_length": 462.5625, + "completions/mean_terminated_length": 462.5625, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.46835970024979184, + "grad_norm": 4.5367121933873085, + "kl": 0.071923828125, + "learning_rate": 5.49944014526056e-07, + "loss": 0.0029, + "num_tokens": 58648062.0, + "reward": 1.5307243108749389, + "reward_std": 0.37956870198249815, + "rewards/accuracy_reward/mean": 0.47405762821435926, + "rewards/accuracy_reward/std": 0.37618621438741684, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.056666669249534604, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09018309488892555, + "step": 2250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.9, + "completions/max_terminated_length": 581.9, + "completions/mean_length": 448.6125, + "completions/mean_terminated_length": 448.6125, + "completions/min_length": 322.2, + "completions/min_terminated_length": 322.2, + "epoch": 0.4704412989175687, + "grad_norm": 4.212489303625003, + "kl": 0.076708984375, + "learning_rate": 5.46689555376661e-07, + "loss": 0.0031, + "num_tokens": 58888839.0, + "reward": 1.778541672229767, + "reward_std": 0.324503193795681, + "rewards/accuracy_reward/mean": 0.7375, + "rewards/accuracy_reward/std": 0.23144719302654265, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05354166869074106, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10384699180722237, + "step": 2260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.6, + "completions/max_terminated_length": 660.6, + "completions/mean_length": 502.925, + "completions/mean_terminated_length": 502.925, + "completions/min_length": 350.9, + "completions/min_terminated_length": 350.9, + "epoch": 0.47252289758534555, + "grad_norm": 0.16742959807896826, + "kl": 0.0670166015625, + "learning_rate": 5.434330995313097e-07, + "loss": 0.0027, + "num_tokens": 59157993.0, + "reward": 1.8049999952316285, + "reward_std": 0.20901573747396468, + "rewards/accuracy_reward/mean": 0.8, + "rewards/accuracy_reward/std": 0.1948736011981964, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.005000000074505806, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01414213627576828, + "step": 2270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 469.2, + "completions/mean_terminated_length": 469.2, + "completions/min_length": 332.7, + "completions/min_terminated_length": 332.7, + "epoch": 0.4746044962531224, + "grad_norm": 4.835954112188709, + "kl": 0.07080078125, + "learning_rate": 5.401747862535307e-07, + "loss": 0.0028, + "num_tokens": 59428345.0, + "reward": 1.8166666984558106, + "reward_std": 0.37507805973291397, + "rewards/accuracy_reward/mean": 0.7375, + "rewards/accuracy_reward/std": 0.2929195284843445, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07916666977107525, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10120401307940483, + "step": 2280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.1, + "completions/max_terminated_length": 625.1, + "completions/mean_length": 485.1125, + "completions/mean_terminated_length": 485.1125, + "completions/min_length": 351.9, + "completions/min_terminated_length": 351.9, + "epoch": 0.47668609492089925, + "grad_norm": 5.6497051467213755, + "kl": 0.0648193359375, + "learning_rate": 5.369147548862859e-07, + "loss": 0.0026, + "num_tokens": 59682330.0, + "reward": 1.883750033378601, + "reward_std": 0.2730850502848625, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 0.2150476098060608, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07125000171363353, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11164017990231515, + "step": 2290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.6, + "completions/max_terminated_length": 507.6, + "completions/mean_length": 397.8375, + "completions/mean_terminated_length": 397.8375, + "completions/min_length": 302.7, + "completions/min_terminated_length": 302.7, + "epoch": 0.4787676935886761, + "grad_norm": 4.905315583455296, + "kl": 0.071826171875, + "learning_rate": 5.336531448460124e-07, + "loss": 0.0029, + "num_tokens": 59941981.0, + "reward": 2.0729166746139525, + "reward_std": 0.14378461316227914, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.085416666790843, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10842926502227783, + "step": 2300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 598.5, + "completions/max_terminated_length": 559.3, + "completions/mean_length": 441.3, + "completions/mean_terminated_length": 434.8089294433594, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.48084929225645295, + "grad_norm": 0.15381638622442467, + "kl": 0.0681396484375, + "learning_rate": 5.303900956166593e-07, + "loss": 0.0027, + "num_tokens": 60197093.0, + "reward": 1.8645833492279054, + "reward_std": 0.10891140550374985, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01458333358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.033108004927635194, + "step": 2310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.6, + "completions/max_terminated_length": 533.6, + "completions/mean_length": 399.675, + "completions/mean_terminated_length": 399.675, + "completions/min_length": 283.6, + "completions/min_terminated_length": 283.6, + "epoch": 0.4829308909242298, + "grad_norm": 0.41233767876846544, + "kl": 0.06923828125, + "learning_rate": 5.271257467437234e-07, + "loss": 0.0028, + "num_tokens": 60472171.0, + "reward": 1.780847954750061, + "reward_std": 0.07071067690849304, + "rewards/accuracy_reward/mean": 0.7808479532599449, + "rewards/accuracy_reward/std": 0.07071067690849304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 2320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.4, + "completions/max_terminated_length": 484.4, + "completions/mean_length": 382.8125, + "completions/mean_terminated_length": 382.8125, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.48501248959200666, + "grad_norm": 0.18389590944709894, + "kl": 0.0666259765625, + "learning_rate": 5.238602378282815e-07, + "loss": 0.0027, + "num_tokens": 60746292.0, + "reward": 1.975, + "reward_std": 0.046291005611419675, + "rewards/accuracy_reward/mean": 0.975, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 2330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.9, + "completions/max_terminated_length": 560.9, + "completions/mean_length": 413.85, + "completions/mean_terminated_length": 413.85, + "completions/min_length": 292.3, + "completions/min_terminated_length": 292.3, + "epoch": 0.4870940882597835, + "grad_norm": 4.615879338831406, + "kl": 0.063720703125, + "learning_rate": 5.205937085210197e-07, + "loss": 0.0026, + "num_tokens": 60959256.0, + "reward": 1.927529764175415, + "reward_std": 0.19220280051231384, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.14056250751018523, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02752976268529892, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05164029598236084, + "step": 2340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.5, + "completions/max_terminated_length": 532.5, + "completions/mean_length": 394.55, + "completions/mean_terminated_length": 394.55, + "completions/min_length": 278.9, + "completions/min_terminated_length": 278.9, + "epoch": 0.48917568692756036, + "grad_norm": 0.14942227839170505, + "kl": 0.0636962890625, + "learning_rate": 5.173262985162614e-07, + "loss": 0.0026, + "num_tokens": 61230564.0, + "reward": 1.9645833492279052, + "reward_std": 0.20971630662679672, + "rewards/accuracy_reward/mean": 0.95, + "rewards/accuracy_reward/std": 0.11700168251991272, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02708333358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07660323679447174, + "step": 2350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.9, + "completions/max_terminated_length": 494.9, + "completions/mean_length": 397.2625, + "completions/mean_terminated_length": 397.2625, + "completions/min_length": 299.9, + "completions/min_terminated_length": 299.9, + "epoch": 0.4912572855953372, + "grad_norm": 4.289089774953379, + "kl": 0.0595458984375, + "learning_rate": 5.140581475459938e-07, + "loss": 0.0024, + "num_tokens": 61471321.0, + "reward": 1.8956249952316284, + "reward_std": 0.13828388042747974, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.033125000260770324, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07691512294113637, + "step": 2360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 411.4875, + "completions/mean_terminated_length": 411.4875, + "completions/min_length": 305.9, + "completions/min_terminated_length": 305.9, + "epoch": 0.49333888426311406, + "grad_norm": 0.18518897583184304, + "kl": 0.0660888671875, + "learning_rate": 5.107893953738915e-07, + "loss": 0.0026, + "num_tokens": 61730760.0, + "reward": 1.86875, + "reward_std": 0.11623437106609344, + "rewards/accuracy_reward/mean": 0.85, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.029124119877815248, + "step": 2370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.4, + "completions/max_terminated_length": 507.4, + "completions/mean_length": 381.625, + "completions/mean_terminated_length": 381.625, + "completions/min_length": 281.5, + "completions/min_terminated_length": 281.5, + "epoch": 0.4954204829308909, + "grad_norm": 5.476492140212813, + "kl": 0.0659912109375, + "learning_rate": 5.075201817893396e-07, + "loss": 0.0026, + "num_tokens": 62006714.0, + "reward": 2.021250009536743, + "reward_std": 0.19753799736499786, + "rewards/accuracy_reward/mean": 0.95, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07125000096857548, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.13994049057364463, + "step": 2380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 602.2, + "completions/max_terminated_length": 577.7, + "completions/mean_length": 450.0625, + "completions/mean_terminated_length": 444.56964721679685, + "completions/min_length": 321.8, + "completions/min_terminated_length": 321.8, + "epoch": 0.49750208159866777, + "grad_norm": 0.1584734476787758, + "kl": 0.06484375, + "learning_rate": 5.04250646601456e-07, + "loss": 0.0026, + "num_tokens": 62237823.0, + "reward": 1.7147321462631226, + "reward_std": 0.1240307368338108, + "rewards/accuracy_reward/mean": 0.6970238089561462, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03020833358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05063929483294487, + "step": 2390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.2, + "completions/max_terminated_length": 526.2, + "completions/mean_length": 416.0375, + "completions/mean_terminated_length": 416.0375, + "completions/min_length": 310.8, + "completions/min_terminated_length": 310.8, + "epoch": 0.4995836802664446, + "grad_norm": 4.671426097531064, + "kl": 0.0638916015625, + "learning_rate": 5.009809296331118e-07, + "loss": 0.0026, + "num_tokens": 62514826.0, + "reward": 1.6633333444595337, + "reward_std": 0.09387510269880295, + "rewards/accuracy_reward/mean": 0.6125, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.050833333656191824, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05851975753903389, + "step": 2400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.6, + "completions/max_terminated_length": 542.6, + "completions/mean_length": 406.35, + "completions/mean_terminated_length": 406.35, + "completions/min_length": 275.3, + "completions/min_terminated_length": 275.3, + "epoch": 0.5016652789342215, + "grad_norm": 0.14630449525115286, + "kl": 0.13447265625, + "learning_rate": 4.977111707149521e-07, + "loss": 0.0054, + "num_tokens": 62779366.0, + "reward": 1.875, + "reward_std": 0.19686797261238098, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.1687566041946411, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, + "step": 2410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.1, + "completions/max_terminated_length": 562.1, + "completions/mean_length": 406.1375, + "completions/mean_terminated_length": 406.1375, + "completions/min_length": 286.6, + "completions/min_terminated_length": 286.6, + "epoch": 0.5037468776019983, + "grad_norm": 5.956207329606263, + "kl": 0.058935546875, + "learning_rate": 4.944415096794161e-07, + "loss": 0.0024, + "num_tokens": 63043865.0, + "reward": 1.9383333444595336, + "reward_std": 0.1557971253991127, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.1334012657403946, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.013333333283662796, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02661053091287613, + "step": 2420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.4, + "completions/max_terminated_length": 460.4, + "completions/mean_length": 372.1625, + "completions/mean_terminated_length": 372.1625, + "completions/min_length": 275.5, + "completions/min_terminated_length": 275.5, + "epoch": 0.5058284762697752, + "grad_norm": 0.23284235184340876, + "kl": 0.06318359375, + "learning_rate": 4.911720863547568e-07, + "loss": 0.0025, + "num_tokens": 63314838.0, + "reward": 1.8229166746139527, + "reward_std": 0.1730016589164734, + "rewards/accuracy_reward/mean": 0.8, + "rewards/accuracy_reward/std": 0.14056250751018523, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0412478968501091, + "step": 2430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.1, + "completions/max_terminated_length": 510.1, + "completions/mean_length": 388.9125, + "completions/mean_terminated_length": 388.9125, + "completions/min_length": 298.4, + "completions/min_terminated_length": 298.4, + "epoch": 0.507910074937552, + "grad_norm": 0.15603568789200578, + "kl": 0.057958984375, + "learning_rate": 4.879030405590619e-07, + "loss": 0.0023, + "num_tokens": 63586063.0, + "reward": 1.6361075520515442, + "reward_std": 0.14981908798217775, + "rewards/accuracy_reward/mean": 0.6298575364053249, + "rewards/accuracy_reward/std": 0.13214141875505447, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 2440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.5, + "completions/max_terminated_length": 488.5, + "completions/mean_length": 379.2125, + "completions/mean_terminated_length": 379.2125, + "completions/min_length": 273.7, + "completions/min_terminated_length": 273.7, + "epoch": 0.5099916736053289, + "grad_norm": 0.16532706939561498, + "kl": 0.0598388671875, + "learning_rate": 4.84634512094273e-07, + "loss": 0.0024, + "num_tokens": 63857800.0, + "reward": 1.93125, + "reward_std": 0.18097035735845565, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.1632926881313324, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 2450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.8, + "completions/max_terminated_length": 504.8, + "completions/mean_length": 396.375, + "completions/mean_terminated_length": 396.375, + "completions/min_length": 267.5, + "completions/min_terminated_length": 267.5, + "epoch": 0.5120732722731057, + "grad_norm": 0.162376196003175, + "kl": 0.056689453125, + "learning_rate": 4.813666407402089e-07, + "loss": 0.0023, + "num_tokens": 64129614.0, + "reward": 1.8337500095367432, + "reward_std": 0.1141713872551918, + "rewards/accuracy_reward/mean": 0.825, + "rewards/accuracy_reward/std": 0.09974325299263001, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.008749999850988389, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019339685142040253, + "step": 2460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.5, + "completions/max_terminated_length": 432.5, + "completions/mean_length": 361.125, + "completions/mean_terminated_length": 361.125, + "completions/min_length": 285.3, + "completions/min_terminated_length": 285.3, + "epoch": 0.5141548709408826, + "grad_norm": 0.20639651283098462, + "kl": 0.061962890625, + "learning_rate": 4.780995662485859e-07, + "loss": 0.0025, + "num_tokens": 64384904.0, + "reward": 1.865046989917755, + "reward_std": 0.18655484169721603, + "rewards/accuracy_reward/mean": 0.8521303236484528, + "rewards/accuracy_reward/std": 0.17045392990112304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.012916666828095913, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03653385192155838, + "step": 2470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.7, + "completions/max_terminated_length": 491.7, + "completions/mean_length": 360.4875, + "completions/mean_terminated_length": 360.4875, + "completions/min_length": 249.7, + "completions/min_terminated_length": 249.7, + "epoch": 0.5162364696086594, + "grad_norm": 0.18282518864741693, + "kl": 0.0615234375, + "learning_rate": 4.748334283370432e-07, + "loss": 0.0025, + "num_tokens": 64643279.0, + "reward": 1.9632417678833007, + "reward_std": 0.19427025616168975, + "rewards/accuracy_reward/mean": 0.8944917440414428, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06875000260770321, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.12050200030207633, + "step": 2480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.3, + "completions/max_terminated_length": 492.3, + "completions/mean_length": 369.575, + "completions/mean_terminated_length": 369.575, + "completions/min_length": 271.2, + "completions/min_terminated_length": 271.2, + "epoch": 0.5183180682764363, + "grad_norm": 5.280030094416361, + "kl": 0.06513671875, + "learning_rate": 4.7156836668316567e-07, + "loss": 0.0026, + "num_tokens": 64894389.0, + "reward": 1.9135416984558105, + "reward_std": 0.25629419833421707, + "rewards/accuracy_reward/mean": 0.85, + "rewards/accuracy_reward/std": 0.12416292428970337, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06354166604578496, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.13213126733899117, + "step": 2490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 388.825, + "completions/mean_terminated_length": 388.825, + "completions/min_length": 264.2, + "completions/min_terminated_length": 264.2, + "epoch": 0.5203996669442131, + "grad_norm": 0.17958774514499543, + "kl": 0.063232421875, + "learning_rate": 4.683045209185126e-07, + "loss": 0.0025, + "num_tokens": 65171735.0, + "reward": 2.0, + "reward_std": 0.11624701544642449, + "rewards/accuracy_reward/mean": 0.975, + "rewards/accuracy_reward/std": 0.07071067690849304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06279476955533028, + "step": 2500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.4, + "completions/max_terminated_length": 435.4, + "completions/mean_length": 341.0875, + "completions/mean_terminated_length": 341.0875, + "completions/min_length": 234.7, + "completions/min_terminated_length": 234.7, + "epoch": 0.52248126561199, + "grad_norm": 4.911714810824962, + "kl": 0.0594970703125, + "learning_rate": 4.6504203062264465e-07, + "loss": 0.0024, + "num_tokens": 65437782.0, + "reward": 1.7674168467521667, + "reward_std": 0.11884753406047821, + "rewards/accuracy_reward/mean": 0.7361668512225151, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07255653142929078, + "step": 2510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 330.1, + "completions/mean_terminated_length": 330.1, + "completions/min_length": 245.1, + "completions/min_terminated_length": 245.1, + "epoch": 0.5245628642797668, + "grad_norm": 5.248564218688146, + "kl": 0.06259765625, + "learning_rate": 4.617810353171559e-07, + "loss": 0.0025, + "num_tokens": 65705374.0, + "reward": 1.8066666722297668, + "reward_std": 0.22095786333084105, + "rewards/accuracy_reward/mean": 0.775, + "rewards/accuracy_reward/std": 0.09974325299263001, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04416666682809591, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11436765491962433, + "step": 2520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.3, + "completions/max_terminated_length": 493.3, + "completions/mean_length": 391.3875, + "completions/mean_terminated_length": 391.3875, + "completions/min_length": 278.1, + "completions/min_terminated_length": 278.1, + "epoch": 0.5266444629475437, + "grad_norm": 0.16378324718640833, + "kl": 0.0593505859375, + "learning_rate": 4.58521674459706e-07, + "loss": 0.0024, + "num_tokens": 65962061.0, + "reward": 1.85, + "reward_std": 0.15782093107700348, + "rewards/accuracy_reward/mean": 0.85, + "rewards/accuracy_reward/std": 0.15782093703746797, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 2530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.3, + "completions/max_terminated_length": 471.3, + "completions/mean_length": 373.475, + "completions/mean_terminated_length": 373.475, + "completions/min_length": 266.7, + "completions/min_terminated_length": 266.7, + "epoch": 0.5287260616153205, + "grad_norm": 0.1830028884331716, + "kl": 0.0652587890625, + "learning_rate": 4.5526408743805766e-07, + "loss": 0.0026, + "num_tokens": 66221843.0, + "reward": 1.6479166746139526, + "reward_std": 0.19362604022026061, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.1569620907306671, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03666396141052246, + "step": 2540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.5, + "completions/max_terminated_length": 485.5, + "completions/mean_length": 390.7625, + "completions/mean_terminated_length": 390.7625, + "completions/min_length": 307.8, + "completions/min_terminated_length": 307.8, + "epoch": 0.5308076602830974, + "grad_norm": 4.514230348134004, + "kl": 0.0629150390625, + "learning_rate": 4.5200841356411383e-07, + "loss": 0.0025, + "num_tokens": 66499336.0, + "reward": 1.8791666746139526, + "reward_std": 0.12090870141983032, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0471404530107975, + "step": 2550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 408.0, + "completions/mean_terminated_length": 408.0, + "completions/min_length": 297.9, + "completions/min_terminated_length": 297.9, + "epoch": 0.5328892589508742, + "grad_norm": 5.057172959976599, + "kl": 0.0637451171875, + "learning_rate": 4.487547920679619e-07, + "loss": 0.0026, + "num_tokens": 66777848.0, + "reward": 1.8416666746139527, + "reward_std": 0.25614635050296786, + "rewards/accuracy_reward/mean": 0.8375, + "rewards/accuracy_reward/std": 0.185156187415123, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03563483357429505, + "step": 2560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.7, + "completions/max_terminated_length": 436.7, + "completions/mean_length": 347.7625, + "completions/mean_terminated_length": 347.7625, + "completions/min_length": 246.6, + "completions/min_terminated_length": 246.6, + "epoch": 0.5349708576186512, + "grad_norm": 0.16058155449876269, + "kl": 0.06494140625, + "learning_rate": 4.455033620919181e-07, + "loss": 0.0026, + "num_tokens": 67050053.0, + "reward": 1.9739131927490234, + "reward_std": 0.12320148199796677, + "rewards/accuracy_reward/mean": 0.9145381838083267, + "rewards/accuracy_reward/std": 0.040609382838010785, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.059375002048909664, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08259210474789143, + "step": 2570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.2, + "completions/max_terminated_length": 461.2, + "completions/mean_length": 364.925, + "completions/mean_terminated_length": 364.925, + "completions/min_length": 255.9, + "completions/min_terminated_length": 255.9, + "epoch": 0.537052456286428, + "grad_norm": 4.827380252982668, + "kl": 0.0681396484375, + "learning_rate": 4.422542626845778e-07, + "loss": 0.0027, + "num_tokens": 67311335.0, + "reward": 1.9572916746139526, + "reward_std": 0.12070775479078293, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03229166679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07775685265660286, + "step": 2580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 369.9375, + "completions/mean_terminated_length": 369.9375, + "completions/min_length": 249.2, + "completions/min_terminated_length": 249.2, + "epoch": 0.5391340549542049, + "grad_norm": 0.14251222545674874, + "kl": 0.066455078125, + "learning_rate": 4.390076327948682e-07, + "loss": 0.0027, + "num_tokens": 67576626.0, + "reward": 1.7372291088104248, + "reward_std": 0.15879597142338753, + "rewards/accuracy_reward/mean": 0.7163957685232163, + "rewards/accuracy_reward/std": 0.12793734967708587, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.020833334326744078, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03085862174630165, + "step": 2590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.1, + "completions/max_terminated_length": 435.1, + "completions/mean_length": 354.675, + "completions/mean_terminated_length": 354.675, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.5412156536219817, + "grad_norm": 0.1917964302311964, + "kl": 0.0631103515625, + "learning_rate": 4.3576361126610726e-07, + "loss": 0.0025, + "num_tokens": 67844256.0, + "reward": 1.8813888907432557, + "reward_std": 0.10606601536273956, + "rewards/accuracy_reward/mean": 0.8938888892531395, + "rewards/accuracy_reward/std": 0.07071067690849304, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 2600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.5, + "completions/max_terminated_length": 498.5, + "completions/mean_length": 363.65, + "completions/mean_terminated_length": 363.65, + "completions/min_length": 263.3, + "completions/min_terminated_length": 263.3, + "epoch": 0.5432972522897586, + "grad_norm": 0.1959125425760896, + "kl": 0.062109375, + "learning_rate": 4.325223368300651e-07, + "loss": 0.0025, + "num_tokens": 68103620.0, + "reward": 1.9702083468437195, + "reward_std": 0.12276247590780258, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.08270833585411311, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09280072674155235, + "step": 2610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 342.5875, + "completions/mean_terminated_length": 342.5875, + "completions/min_length": 251.3, + "completions/min_terminated_length": 251.3, + "epoch": 0.5453788509575354, + "grad_norm": 5.143451269883383, + "kl": 0.0701416015625, + "learning_rate": 4.2928394810103183e-07, + "loss": 0.0028, + "num_tokens": 68354763.0, + "reward": 1.8983333587646485, + "reward_std": 0.1375915750861168, + "rewards/accuracy_reward/mean": 0.875, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02333333417773247, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.050481320917606355, + "step": 2620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.8, + "completions/max_terminated_length": 455.8, + "completions/mean_length": 337.075, + "completions/mean_terminated_length": 337.075, + "completions/min_length": 223.5, + "completions/min_terminated_length": 223.5, + "epoch": 0.5474604496253123, + "grad_norm": 4.888783306760334, + "kl": 0.0669677734375, + "learning_rate": 4.2604858356988845e-07, + "loss": 0.0027, + "num_tokens": 68620449.0, + "reward": 1.9479166746139527, + "reward_std": 0.17002529054880142, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.1334012657403946, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06481812223792076, + "step": 2630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.2, + "completions/max_terminated_length": 449.2, + "completions/mean_length": 344.775, + "completions/mean_terminated_length": 344.775, + "completions/min_length": 231.1, + "completions/min_terminated_length": 231.1, + "epoch": 0.5495420482930891, + "grad_norm": 0.20047462316762774, + "kl": 0.0705078125, + "learning_rate": 4.2281638159818576e-07, + "loss": 0.0028, + "num_tokens": 68864487.0, + "reward": 2.021875, + "reward_std": 0.13440237641334535, + "rewards/accuracy_reward/mean": 0.975, + "rewards/accuracy_reward/std": 0.07071067690849304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.046875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07023735865950584, + "step": 2640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.3, + "completions/max_terminated_length": 424.3, + "completions/mean_length": 332.1, + "completions/mean_terminated_length": 332.1, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.551623646960866, + "grad_norm": 0.19745363394939197, + "kl": 0.06005859375, + "learning_rate": 4.195874804122262e-07, + "loss": 0.0024, + "num_tokens": 69110799.0, + "reward": 1.9602083206176757, + "reward_std": 0.09825282096862793, + "rewards/accuracy_reward/mean": 0.95, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010208333283662796, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.011142565310001374, + "step": 2650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.5, + "completions/max_terminated_length": 468.5, + "completions/mean_length": 352.8875, + "completions/mean_terminated_length": 352.8875, + "completions/min_length": 252.8, + "completions/min_terminated_length": 252.8, + "epoch": 0.5537052456286428, + "grad_norm": 0.1280210870103763, + "kl": 0.0635986328125, + "learning_rate": 4.163620180971532e-07, + "loss": 0.0025, + "num_tokens": 69375502.0, + "reward": 1.971875, + "reward_std": 0.09685598835349082, + "rewards/accuracy_reward/mean": 0.9625, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.009375, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02651650384068489, + "step": 2660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 502.2, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 364.0, + "completions/mean_terminated_length": 358.0821472167969, + "completions/min_length": 239.6, + "completions/min_terminated_length": 239.6, + "epoch": 0.5557868442964197, + "grad_norm": 3.3723511611967147, + "kl": 0.06787109375, + "learning_rate": 4.13140132591045e-07, + "loss": 0.0027, + "num_tokens": 69649910.0, + "reward": 1.943750023841858, + "reward_std": 0.18331822901964187, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06875000149011612, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1288616955280304, + "step": 2670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.6, + "completions/max_terminated_length": 501.6, + "completions/mean_length": 399.7625, + "completions/mean_terminated_length": 399.7625, + "completions/min_length": 297.2, + "completions/min_terminated_length": 297.2, + "epoch": 0.5578684429641965, + "grad_norm": 0.1468120352133046, + "kl": 0.05986328125, + "learning_rate": 4.099219616790171e-07, + "loss": 0.0024, + "num_tokens": 69927915.0, + "reward": 1.8760416746139525, + "reward_std": 0.11300802528858185, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01354166716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03136167526245117, + "step": 2680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.3, + "completions/max_terminated_length": 481.3, + "completions/mean_length": 356.825, + "completions/mean_terminated_length": 356.825, + "completions/min_length": 249.6, + "completions/min_terminated_length": 249.6, + "epoch": 0.5599500416319734, + "grad_norm": 5.091825810338028, + "kl": 0.0645263671875, + "learning_rate": 4.067076429873283e-07, + "loss": 0.0026, + "num_tokens": 70181997.0, + "reward": 1.8833333492279052, + "reward_std": 0.14142135977745057, + "rewards/accuracy_reward/mean": 0.85, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03333333395421505, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07071068063378334, + "step": 2690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.2, + "completions/max_terminated_length": 493.2, + "completions/mean_length": 371.875, + "completions/mean_terminated_length": 371.875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.5620316402997502, + "grad_norm": 0.2079987682202225, + "kl": 0.0571044921875, + "learning_rate": 4.034973139774962e-07, + "loss": 0.0023, + "num_tokens": 70393499.0, + "reward": 1.8135416746139525, + "reward_std": 0.07906274311244488, + "rewards/accuracy_reward/mean": 0.7895833333954215, + "rewards/accuracy_reward/std": 0.04124789573252201, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02395833432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03781484961509705, + "step": 2700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 550.9, + "completions/max_terminated_length": 517.6, + "completions/mean_length": 385.8875, + "completions/mean_terminated_length": 379.0964294433594, + "completions/min_length": 268.4, + "completions/min_terminated_length": 268.4, + "epoch": 0.5641132389675271, + "grad_norm": 0.1933390995078618, + "kl": 0.0705810546875, + "learning_rate": 4.002911119404181e-07, + "loss": 0.0028, + "num_tokens": 70653330.0, + "reward": 1.84375, + "reward_std": 0.26869996935129165, + "rewards/accuracy_reward/mean": 0.8, + "rewards/accuracy_reward/std": 0.1632926881313324, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0772959053516388, + "step": 2710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 347.4, + "completions/mean_terminated_length": 347.4, + "completions/min_length": 247.9, + "completions/min_terminated_length": 247.9, + "epoch": 0.5661948376353039, + "grad_norm": 4.915739208482373, + "kl": 0.063818359375, + "learning_rate": 3.9708917399050003e-07, + "loss": 0.0026, + "num_tokens": 70890210.0, + "reward": 1.9677083492279053, + "reward_std": 0.2370162934064865, + "rewards/accuracy_reward/mean": 0.875, + "rewards/accuracy_reward/std": 0.1632926881313324, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.09270833544433117, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09098203107714653, + "step": 2720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.1, + "completions/max_terminated_length": 443.1, + "completions/mean_length": 338.5875, + "completions/mean_terminated_length": 338.5875, + "completions/min_length": 243.1, + "completions/min_terminated_length": 243.1, + "epoch": 0.5682764363030808, + "grad_norm": 0.1713205405963468, + "kl": 0.061669921875, + "learning_rate": 3.9389163705979205e-07, + "loss": 0.0025, + "num_tokens": 71162329.0, + "reward": 2.040000009536743, + "reward_std": 0.05196775794029236, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04000000208616257, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05196775794029236, + "step": 2730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 381.9, + "completions/mean_terminated_length": 381.9, + "completions/min_length": 266.8, + "completions/min_terminated_length": 266.8, + "epoch": 0.5703580349708576, + "grad_norm": 0.1860626994363445, + "kl": 0.0588623046875, + "learning_rate": 3.9069863789213386e-07, + "loss": 0.0024, + "num_tokens": 71402497.0, + "reward": 1.8729166746139527, + "reward_std": 0.09573607742786408, + "rewards/accuracy_reward/mean": 0.8375, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03541666865348816, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06739883720874787, + "step": 2740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.3, + "completions/max_terminated_length": 515.3, + "completions/mean_length": 394.8875, + "completions/mean_terminated_length": 394.8875, + "completions/min_length": 308.3, + "completions/min_terminated_length": 308.3, + "epoch": 0.5724396336386345, + "grad_norm": 0.20655811983357472, + "kl": 0.0579345703125, + "learning_rate": 3.875103130373055e-07, + "loss": 0.0023, + "num_tokens": 71672064.0, + "reward": 1.945562446117401, + "reward_std": 0.1405719131231308, + "rewards/accuracy_reward/mean": 0.9038957685232163, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04166666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09428090453147889, + "step": 2750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.9, + "completions/max_terminated_length": 508.9, + "completions/mean_length": 378.6625, + "completions/mean_terminated_length": 378.6625, + "completions/min_length": 272.4, + "completions/min_terminated_length": 272.4, + "epoch": 0.5745212323064113, + "grad_norm": 0.16434709986724766, + "kl": 0.05673828125, + "learning_rate": 3.843267988451888e-07, + "loss": 0.0023, + "num_tokens": 71944285.0, + "reward": 1.9539583444595336, + "reward_std": 0.1520232580602169, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06645833402872085, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1166679285466671, + "step": 2760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.9, + "completions/max_terminated_length": 443.9, + "completions/mean_length": 339.8625, + "completions/mean_terminated_length": 339.8625, + "completions/min_length": 247.5, + "completions/min_terminated_length": 247.5, + "epoch": 0.5766028309741882, + "grad_norm": 0.21011433719763795, + "kl": 0.06376953125, + "learning_rate": 3.81148231459935e-07, + "loss": 0.0026, + "num_tokens": 72169274.0, + "reward": 1.9620498180389405, + "reward_std": 0.07314258962869644, + "rewards/accuracy_reward/mean": 0.8918414890766144, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07020833343267441, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07314259260892868, + "step": 2770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.7, + "completions/max_terminated_length": 516.7, + "completions/mean_length": 389.175, + "completions/mean_terminated_length": 389.175, + "completions/min_length": 288.7, + "completions/min_terminated_length": 288.7, + "epoch": 0.578684429641965, + "grad_norm": 4.132988613381757, + "kl": 0.057421875, + "learning_rate": 3.779747468141444e-07, + "loss": 0.0023, + "num_tokens": 72423584.0, + "reward": 1.95291668176651, + "reward_std": 0.19983291178941726, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.14056250751018523, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05291666910052299, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07267622202634812, + "step": 2780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.3, + "completions/max_terminated_length": 490.3, + "completions/mean_length": 371.2375, + "completions/mean_terminated_length": 371.2375, + "completions/min_length": 279.5, + "completions/min_terminated_length": 279.5, + "epoch": 0.5807660283097419, + "grad_norm": 0.14167042199242594, + "kl": 0.059326171875, + "learning_rate": 3.748064806230512e-07, + "loss": 0.0024, + "num_tokens": 72694027.0, + "reward": 1.8518012285232544, + "reward_std": 0.05197432786226273, + "rewards/accuracy_reward/mean": 0.8243012249469757, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02750000059604645, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05197431892156601, + "step": 2790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.7, + "completions/max_terminated_length": 505.7, + "completions/mean_length": 389.2125, + "completions/mean_terminated_length": 389.2125, + "completions/min_length": 268.3, + "completions/min_terminated_length": 268.3, + "epoch": 0.5828476269775187, + "grad_norm": 0.16260582148160752, + "kl": 0.06240234375, + "learning_rate": 3.716435683787212e-07, + "loss": 0.0025, + "num_tokens": 72959636.0, + "reward": 2.0104166746139525, + "reward_std": 0.0197955846786499, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010416667163372039, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019795581698417664, + "step": 2800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.5, + "completions/max_terminated_length": 535.5, + "completions/mean_length": 391.6375, + "completions/mean_terminated_length": 391.6375, + "completions/min_length": 282.2, + "completions/min_terminated_length": 282.2, + "epoch": 0.5849292256452956, + "grad_norm": 5.398047142968695, + "kl": 0.0581298828125, + "learning_rate": 3.684861453442559e-07, + "loss": 0.0023, + "num_tokens": 73183623.0, + "reward": 2.0072916746139526, + "reward_std": 0.07827533856034279, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.08229166865348816, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.031984337419271466, + "step": 2810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.7, + "completions/max_terminated_length": 476.7, + "completions/mean_length": 370.0, + "completions/mean_terminated_length": 370.0, + "completions/min_length": 276.7, + "completions/min_terminated_length": 276.7, + "epoch": 0.5870108243130724, + "grad_norm": 5.209234303596355, + "kl": 0.062939453125, + "learning_rate": 3.653343465480094e-07, + "loss": 0.0025, + "num_tokens": 73446071.0, + "reward": 1.8541666746139527, + "reward_std": 0.23748018741607665, + "rewards/accuracy_reward/mean": 0.85, + "rewards/accuracy_reward/std": 0.20411194264888763, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0471404530107975, + "step": 2820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.3, + "completions/max_terminated_length": 488.3, + "completions/mean_length": 364.9375, + "completions/mean_terminated_length": 364.9375, + "completions/min_length": 258.3, + "completions/min_terminated_length": 258.3, + "epoch": 0.5890924229808493, + "grad_norm": 0.20315120680488988, + "kl": 0.0669189453125, + "learning_rate": 3.6218830677781287e-07, + "loss": 0.0027, + "num_tokens": 73676026.0, + "reward": 2.000000023841858, + "reward_std": 0.17773192301392554, + "rewards/accuracy_reward/mean": 0.9375, + "rewards/accuracy_reward/std": 0.08880758583545685, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06250000298023224, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1002311997115612, + "step": 2830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.2, + "completions/max_terminated_length": 491.2, + "completions/mean_length": 383.925, + "completions/mean_terminated_length": 383.925, + "completions/min_length": 285.8, + "completions/min_terminated_length": 285.8, + "epoch": 0.5911740216486261, + "grad_norm": 0.14883941440587328, + "kl": 0.0576416015625, + "learning_rate": 3.590481605752107e-07, + "loss": 0.0023, + "num_tokens": 73943828.0, + "reward": 1.9854166746139525, + "reward_std": 0.07575379610061646, + "rewards/accuracy_reward/mean": 0.975, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01041666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02946278378367424, + "step": 2840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.5, + "completions/max_terminated_length": 536.5, + "completions/mean_length": 407.9375, + "completions/mean_terminated_length": 407.9375, + "completions/min_length": 308.1, + "completions/min_terminated_length": 308.1, + "epoch": 0.593255620316403, + "grad_norm": 0.23106299258902835, + "kl": 0.0578369140625, + "learning_rate": 3.559140422297069e-07, + "loss": 0.0023, + "num_tokens": 74182295.0, + "reward": 1.9865277886390686, + "reward_std": 0.10382884740829468, + "rewards/accuracy_reward/mean": 0.944861114025116, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04166666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07501916810870171, + "step": 2850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.3, + "completions/max_terminated_length": 485.3, + "completions/mean_length": 392.3125, + "completions/mean_terminated_length": 392.3125, + "completions/min_length": 307.9, + "completions/min_terminated_length": 307.9, + "epoch": 0.5953372189841799, + "grad_norm": 0.1436455249167597, + "kl": 0.0568603515625, + "learning_rate": 3.527860857730214e-07, + "loss": 0.0023, + "num_tokens": 74426280.0, + "reward": 1.91875, + "reward_std": 0.12745261490345, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04580627083778381, + "step": 2860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.3, + "completions/max_terminated_length": 534.3, + "completions/mean_length": 420.6375, + "completions/mean_terminated_length": 420.6375, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.5974188176519567, + "grad_norm": 0.14423937571477746, + "kl": 0.0572265625, + "learning_rate": 3.4966442497335936e-07, + "loss": 0.0023, + "num_tokens": 74684595.0, + "reward": 2.044166684150696, + "reward_std": 0.07888686656951904, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04416666869074106, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07888686880469323, + "step": 2870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.9, + "completions/max_terminated_length": 545.9, + "completions/mean_length": 419.775, + "completions/mean_terminated_length": 419.775, + "completions/min_length": 316.9, + "completions/min_terminated_length": 316.9, + "epoch": 0.5995004163197336, + "grad_norm": 0.11281943802862444, + "kl": 0.0521728515625, + "learning_rate": 3.4654919332968923e-07, + "loss": 0.0021, + "num_tokens": 74949489.0, + "reward": 2.0072916746139526, + "reward_std": 0.013684011995792389, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00729166716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.013684006035327911, + "step": 2880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.4, + "completions/max_terminated_length": 524.4, + "completions/mean_length": 398.8625, + "completions/mean_terminated_length": 398.8625, + "completions/min_length": 311.7, + "completions/min_terminated_length": 311.7, + "epoch": 0.6015820149875104, + "grad_norm": 0.12108732261018262, + "kl": 0.059130859375, + "learning_rate": 3.4344052406603485e-07, + "loss": 0.0024, + "num_tokens": 75223470.0, + "reward": 1.9645833492279052, + "reward_std": 0.1357921063899994, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0395833358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06500234007835388, + "step": 2890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.9, + "completions/max_terminated_length": 490.9, + "completions/mean_length": 368.6375, + "completions/mean_terminated_length": 368.6375, + "completions/min_length": 271.1, + "completions/min_terminated_length": 271.1, + "epoch": 0.6036636136552873, + "grad_norm": 5.105473836739841, + "kl": 0.0620361328125, + "learning_rate": 3.40338550125777e-07, + "loss": 0.0025, + "num_tokens": 75492369.0, + "reward": 1.8625, + "reward_std": 0.12246559262275696, + "rewards/accuracy_reward/mean": 0.85, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, + "step": 2900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.3, + "completions/max_terminated_length": 595.3, + "completions/mean_length": 426.4375, + "completions/mean_terminated_length": 426.4375, + "completions/min_length": 309.2, + "completions/min_terminated_length": 309.2, + "epoch": 0.6057452123230641, + "grad_norm": 0.171884299035333, + "kl": 0.0556884765625, + "learning_rate": 3.37243404165969e-07, + "loss": 0.0022, + "num_tokens": 75738228.0, + "reward": 2.0052083492279054, + "reward_std": 0.055492520332336426, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01770833432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.020137180387973786, + "step": 2910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.3, + "completions/max_terminated_length": 462.3, + "completions/mean_length": 367.375, + "completions/mean_terminated_length": 367.375, + "completions/min_length": 275.3, + "completions/min_terminated_length": 275.3, + "epoch": 0.607826810990841, + "grad_norm": 0.19991084845817067, + "kl": 0.0576416015625, + "learning_rate": 3.341552185516623e-07, + "loss": 0.0023, + "num_tokens": 76005202.0, + "reward": 1.90625, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05303300768136978, + "step": 2920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.2, + "completions/max_terminated_length": 508.2, + "completions/mean_length": 387.7125, + "completions/mean_terminated_length": 387.7125, + "completions/min_length": 284.2, + "completions/min_terminated_length": 284.2, + "epoch": 0.6099084096586178, + "grad_norm": 5.137834016006699, + "kl": 0.0586181640625, + "learning_rate": 3.310741253502474e-07, + "loss": 0.0023, + "num_tokens": 76280163.0, + "reward": 1.9791666746139527, + "reward_std": 0.2614880561828613, + "rewards/accuracy_reward/mean": 0.9125, + "rewards/accuracy_reward/std": 0.1595182627439499, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06666666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.12458351105451584, + "step": 2930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.8, + "completions/max_terminated_length": 468.8, + "completions/mean_length": 354.7375, + "completions/mean_terminated_length": 354.7375, + "completions/min_length": 245.1, + "completions/min_terminated_length": 245.1, + "epoch": 0.6119900083263947, + "grad_norm": 5.197754833651674, + "kl": 0.0572021484375, + "learning_rate": 3.280002563258047e-07, + "loss": 0.0023, + "num_tokens": 76547126.0, + "reward": 1.9916666984558105, + "reward_std": 0.20014614909887313, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.10416666939854621, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1184998020529747, + "step": 2940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.4, + "completions/max_terminated_length": 547.4, + "completions/mean_length": 405.85, + "completions/mean_terminated_length": 405.85, + "completions/min_length": 297.5, + "completions/min_terminated_length": 297.5, + "epoch": 0.6140716069941715, + "grad_norm": 5.091353448827943, + "kl": 0.0522705078125, + "learning_rate": 3.249337429334705e-07, + "loss": 0.0021, + "num_tokens": 76813202.0, + "reward": 1.8640625, + "reward_std": 0.22569140791893005, + "rewards/accuracy_reward/mean": 0.8375, + "rewards/accuracy_reward/std": 0.18138959705829621, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0265625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.053110551089048386, + "step": 2950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.7, + "completions/max_terminated_length": 496.7, + "completions/mean_length": 362.95, + "completions/mean_terminated_length": 362.95, + "completions/min_length": 259.4, + "completions/min_terminated_length": 259.4, + "epoch": 0.6161532056619484, + "grad_norm": 0.147757377653093, + "kl": 0.057275390625, + "learning_rate": 3.21874716313814e-07, + "loss": 0.0023, + "num_tokens": 77081446.0, + "reward": 2.014285707473755, + "reward_std": 0.10369245111942291, + "rewards/accuracy_reward/mean": 0.975, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03928571343421936, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.057401442527771, + "step": 2960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.9, + "completions/max_terminated_length": 485.9, + "completions/mean_length": 373.225, + "completions/mean_terminated_length": 373.225, + "completions/min_length": 284.8, + "completions/min_terminated_length": 284.8, + "epoch": 0.6182348043297252, + "grad_norm": 0.14559764289523203, + "kl": 0.0578369140625, + "learning_rate": 3.188233072872306e-07, + "loss": 0.0023, + "num_tokens": 77327368.0, + "reward": 1.7632440447807312, + "reward_std": 0.12269835770130158, + "rewards/accuracy_reward/mean": 0.7351190477609635, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.028125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04105201661586762, + "step": 2970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.8, + "completions/max_terminated_length": 483.8, + "completions/mean_length": 368.7375, + "completions/mean_terminated_length": 368.7375, + "completions/min_length": 264.9, + "completions/min_terminated_length": 264.9, + "epoch": 0.6203164029975021, + "grad_norm": 0.18554386606188442, + "kl": 0.060205078125, + "learning_rate": 3.157796463483462e-07, + "loss": 0.0024, + "num_tokens": 77598443.0, + "reward": 1.8989583492279052, + "reward_std": 0.05251617282629013, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01145833395421505, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02546912059187889, + "step": 2980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.3, + "completions/max_terminated_length": 528.3, + "completions/mean_length": 372.1, + "completions/mean_terminated_length": 372.1, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.6223980016652789, + "grad_norm": 6.080061352103668, + "kl": 0.0625244140625, + "learning_rate": 3.12743863660437e-07, + "loss": 0.0025, + "num_tokens": 77874619.0, + "reward": 1.974608850479126, + "reward_std": 0.11803357228636742, + "rewards/accuracy_reward/mean": 0.9141921669244766, + "rewards/accuracy_reward/std": 0.0017611147835850717, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06041666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11627245470881462, + "step": 2990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.2, + "completions/max_terminated_length": 496.2, + "completions/mean_length": 373.2875, + "completions/mean_terminated_length": 373.2875, + "completions/min_length": 238.4, + "completions/min_terminated_length": 238.4, + "epoch": 0.6244796003330558, + "grad_norm": 0.21710130755298088, + "kl": 0.0623046875, + "learning_rate": 3.097160890498625e-07, + "loss": 0.0025, + "num_tokens": 78151298.0, + "reward": 1.8583333492279053, + "reward_std": 0.11615225374698639, + "rewards/accuracy_reward/mean": 0.85, + "rewards/accuracy_reward/std": 0.09258201122283935, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00833333358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02357022911310196, + "step": 3000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.8, + "completions/max_terminated_length": 512.8, + "completions/mean_length": 383.425, + "completions/mean_terminated_length": 383.425, + "completions/min_length": 280.5, + "completions/min_terminated_length": 280.5, + "epoch": 0.6265611990008326, + "grad_norm": 0.1315134028604458, + "kl": 0.0558837890625, + "learning_rate": 3.0669645200051453e-07, + "loss": 0.0022, + "num_tokens": 78410436.0, + "reward": 1.7776818871498108, + "reward_std": 0.15103521551936866, + "rewards/accuracy_reward/mean": 0.7585152305662632, + "rewards/accuracy_reward/std": 0.12693723943084478, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.019166667759418488, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.031952467560768125, + "step": 3010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.9, + "completions/max_terminated_length": 472.9, + "completions/mean_length": 388.525, + "completions/mean_terminated_length": 388.525, + "completions/min_length": 291.6, + "completions/min_terminated_length": 291.6, + "epoch": 0.6286427976686095, + "grad_norm": 0.12651115020522632, + "kl": 0.0575927734375, + "learning_rate": 3.036850816482785e-07, + "loss": 0.0023, + "num_tokens": 78678670.0, + "reward": 1.7510416746139525, + "reward_std": 0.111685012280941, + "rewards/accuracy_reward/mean": 0.7375, + "rewards/accuracy_reward/std": 0.12246559858322144, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01354166716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01988932639360428, + "step": 3020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.2, + "completions/max_terminated_length": 489.2, + "completions/mean_length": 351.825, + "completions/mean_terminated_length": 351.825, + "completions/min_length": 243.7, + "completions/min_terminated_length": 243.7, + "epoch": 0.6307243963363863, + "grad_norm": 4.418440689589054, + "kl": 0.05615234375, + "learning_rate": 3.006821067755121e-07, + "loss": 0.0022, + "num_tokens": 78940568.0, + "reward": 1.8958333492279054, + "reward_std": 0.19499201476573944, + "rewards/accuracy_reward/mean": 0.825, + "rewards/accuracy_reward/std": 0.07071067690849304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07083333544433117, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.12428133860230446, + "step": 3030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.3, + "completions/max_terminated_length": 450.3, + "completions/mean_length": 349.75, + "completions/mean_terminated_length": 349.75, + "completions/min_length": 248.7, + "completions/min_terminated_length": 248.7, + "epoch": 0.6328059950041632, + "grad_norm": 0.15336856419939696, + "kl": 0.0598876953125, + "learning_rate": 2.9768765580553646e-07, + "loss": 0.0024, + "num_tokens": 79202124.0, + "reward": 1.9208333373069764, + "reward_std": 0.14433692693710326, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.09804592728614807, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05833333432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06811279505491256, + "step": 3040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.9, + "completions/max_terminated_length": 497.9, + "completions/mean_length": 367.9, + "completions/mean_terminated_length": 367.9, + "completions/min_length": 271.1, + "completions/min_terminated_length": 271.1, + "epoch": 0.63488759367194, + "grad_norm": 0.19183319995044523, + "kl": 0.0605224609375, + "learning_rate": 2.9470185679714575e-07, + "loss": 0.0024, + "num_tokens": 79452556.0, + "reward": 1.9670833587646483, + "reward_std": 0.07115457355976104, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0795833358541131, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.044742978364229205, + "step": 3050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.7, + "completions/max_terminated_length": 450.7, + "completions/mean_length": 349.8375, + "completions/mean_terminated_length": 349.8375, + "completions/min_length": 248.9, + "completions/min_terminated_length": 248.9, + "epoch": 0.6369691923397169, + "grad_norm": 4.946905661215974, + "kl": 0.054736328125, + "learning_rate": 2.917248374391291e-07, + "loss": 0.0022, + "num_tokens": 79705863.0, + "reward": 1.9729166746139526, + "reward_std": 0.1014419287443161, + "rewards/accuracy_reward/mean": 0.9625, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010416667163372039, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019795581698417664, + "step": 3060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.8, + "completions/max_terminated_length": 434.8, + "completions/mean_length": 321.625, + "completions/mean_terminated_length": 321.625, + "completions/min_length": 231.3, + "completions/min_terminated_length": 231.3, + "epoch": 0.6390507910074937, + "grad_norm": 0.20955413749497104, + "kl": 0.0560302734375, + "learning_rate": 2.887567250448112e-07, + "loss": 0.0022, + "num_tokens": 79962649.0, + "reward": 2.025, + "reward_std": 0.053452253341674805, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02500000074505806, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05345225036144256, + "step": 3070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.9, + "completions/max_terminated_length": 475.9, + "completions/mean_length": 369.7, + "completions/mean_terminated_length": 369.7, + "completions/min_length": 274.3, + "completions/min_terminated_length": 274.3, + "epoch": 0.6411323896752706, + "grad_norm": 0.16739936123712726, + "kl": 0.0539794921875, + "learning_rate": 2.8579764654660684e-07, + "loss": 0.0022, + "num_tokens": 80219937.0, + "reward": 1.8879098296165466, + "reward_std": 0.11641737371683121, + "rewards/accuracy_reward/mean": 0.8499931506812572, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03791666682809591, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06466245353221893, + "step": 3080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.7, + "completions/max_terminated_length": 511.7, + "completions/mean_length": 388.375, + "completions/mean_terminated_length": 388.375, + "completions/min_length": 290.6, + "completions/min_terminated_length": 290.6, + "epoch": 0.6432139883430474, + "grad_norm": 0.15903268299642787, + "kl": 0.0543212890625, + "learning_rate": 2.828477284905931e-07, + "loss": 0.0022, + "num_tokens": 80473399.0, + "reward": 1.83125, + "reward_std": 0.134679351747036, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05303300768136978, + "step": 3090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 412.75, + "completions/mean_terminated_length": 412.75, + "completions/min_length": 327.6, + "completions/min_terminated_length": 327.6, + "epoch": 0.6452955870108243, + "grad_norm": 0.23672273143469844, + "kl": 0.0563232421875, + "learning_rate": 2.7990709703109715e-07, + "loss": 0.0023, + "num_tokens": 80726755.0, + "reward": 1.7434523820877075, + "reward_std": 0.21723176091909407, + "rewards/accuracy_reward/mean": 0.725, + "rewards/accuracy_reward/std": 0.17045392990112304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.018452381156384944, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0521912157535553, + "step": 3100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 374.0875, + "completions/mean_terminated_length": 374.0875, + "completions/min_length": 295.4, + "completions/min_terminated_length": 295.4, + "epoch": 0.6473771856786011, + "grad_norm": 4.4358192605003675, + "kl": 0.056396484375, + "learning_rate": 2.7697587792530224e-07, + "loss": 0.0023, + "num_tokens": 80972154.0, + "reward": 1.9489583373069763, + "reward_std": 0.09627838134765625, + "rewards/accuracy_reward/mean": 0.9375, + "rewards/accuracy_reward/std": 0.08880758583545685, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01145833358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01602174937725067, + "step": 3110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.4, + "completions/max_terminated_length": 473.4, + "completions/mean_length": 374.3875, + "completions/mean_terminated_length": 374.3875, + "completions/min_length": 270.2, + "completions/min_terminated_length": 270.2, + "epoch": 0.649458784346378, + "grad_norm": 4.299687592517207, + "kl": 0.0558349609375, + "learning_rate": 2.740541965278674e-07, + "loss": 0.0022, + "num_tokens": 81222465.0, + "reward": 1.9568055629730225, + "reward_std": 0.09925851821899415, + "rewards/accuracy_reward/mean": 0.925555557012558, + "rewards/accuracy_reward/std": 0.05345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04580627083778381, + "step": 3120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.6, + "completions/max_terminated_length": 499.6, + "completions/mean_length": 371.25, + "completions/mean_terminated_length": 371.25, + "completions/min_length": 261.7, + "completions/min_terminated_length": 261.7, + "epoch": 0.6515403830141548, + "grad_norm": 0.15859439093876504, + "kl": 0.0567138671875, + "learning_rate": 2.711421777855697e-07, + "loss": 0.0023, + "num_tokens": 81472885.0, + "reward": 2.0004166841506956, + "reward_std": 0.18078695088624955, + "rewards/accuracy_reward/mean": 0.9375, + "rewards/accuracy_reward/std": 0.08880758583545685, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06291666850447655, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10276310220360756, + "step": 3130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.8, + "completions/max_terminated_length": 537.8, + "completions/mean_length": 401.4875, + "completions/mean_terminated_length": 401.4875, + "completions/min_length": 297.4, + "completions/min_terminated_length": 297.4, + "epoch": 0.6536219816819318, + "grad_norm": 0.11178847338664576, + "kl": 0.0556884765625, + "learning_rate": 2.682399462319581e-07, + "loss": 0.0022, + "num_tokens": 81735388.0, + "reward": 1.8754166841506958, + "reward_std": 0.24388935342431067, + "rewards/accuracy_reward/mean": 0.775, + "rewards/accuracy_reward/std": 0.09974325299263001, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.10041666850447654, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.15540080443024634, + "step": 3140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.6, + "completions/max_terminated_length": 545.6, + "completions/mean_length": 389.975, + "completions/mean_terminated_length": 389.975, + "completions/min_length": 262.5, + "completions/min_terminated_length": 262.5, + "epoch": 0.6557035803497085, + "grad_norm": 5.338136743789175, + "kl": 0.05400390625, + "learning_rate": 2.6534762598202924e-07, + "loss": 0.0022, + "num_tokens": 81997114.0, + "reward": 1.8625, + "reward_std": 0.12246559262275696, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.12246559858322144, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 3150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.4, + "completions/max_terminated_length": 488.4, + "completions/mean_length": 392.0, + "completions/mean_terminated_length": 392.0, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.6577851790174855, + "grad_norm": 4.662728227249034, + "kl": 0.0537109375, + "learning_rate": 2.624653407269192e-07, + "loss": 0.0021, + "num_tokens": 82271194.0, + "reward": 1.8606499552726745, + "reward_std": 0.12930927574634551, + "rewards/accuracy_reward/mean": 0.8189832538366317, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.041666668653488156, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08301825821399689, + "step": 3160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.6, + "completions/max_terminated_length": 503.6, + "completions/mean_length": 390.125, + "completions/mean_terminated_length": 390.125, + "completions/min_length": 278.4, + "completions/min_terminated_length": 278.4, + "epoch": 0.6598667776852623, + "grad_norm": 0.13954351267435355, + "kl": 0.053955078125, + "learning_rate": 2.595932137286138e-07, + "loss": 0.0022, + "num_tokens": 82518412.0, + "reward": 1.9166666746139527, + "reward_std": 0.09428090751171112, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02916666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05892556607723236, + "step": 3170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 403.7375, + "completions/mean_terminated_length": 403.7375, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.6619483763530392, + "grad_norm": 4.888555596115262, + "kl": 0.0542724609375, + "learning_rate": 2.567313678146771e-07, + "loss": 0.0022, + "num_tokens": 82771527.0, + "reward": 1.965625023841858, + "reward_std": 0.2425983279943466, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.12793734967708587, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07812500149011611, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.14271903932094573, + "step": 3180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.3, + "completions/max_terminated_length": 574.3, + "completions/mean_length": 430.6875, + "completions/mean_terminated_length": 430.6875, + "completions/min_length": 298.8, + "completions/min_terminated_length": 298.8, + "epoch": 0.664029975020816, + "grad_norm": 5.0577996446349625, + "kl": 0.05341796875, + "learning_rate": 2.5387992537299963e-07, + "loss": 0.0021, + "num_tokens": 83018006.0, + "reward": 1.9958333492279052, + "reward_std": 0.11996905207633972, + "rewards/accuracy_reward/mean": 0.975, + "rewards/accuracy_reward/std": 0.07071067690849304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02083333395421505, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.049258365482091906, + "step": 3190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.2, + "completions/max_terminated_length": 497.2, + "completions/mean_length": 382.7125, + "completions/mean_terminated_length": 382.7125, + "completions/min_length": 289.5, + "completions/min_terminated_length": 289.5, + "epoch": 0.6661115736885929, + "grad_norm": 5.030928942983225, + "kl": 0.0546875, + "learning_rate": 2.510390083465621e-07, + "loss": 0.0022, + "num_tokens": 83286223.0, + "reward": 2.031250023841858, + "reward_std": 0.14403526857495308, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.13125000409781934, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.14403527304530145, + "step": 3200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.6, + "completions/max_terminated_length": 549.6, + "completions/mean_length": 404.025, + "completions/mean_terminated_length": 404.025, + "completions/min_length": 284.5, + "completions/min_terminated_length": 284.5, + "epoch": 0.6681931723563697, + "grad_norm": 4.124145506186141, + "kl": 0.056201171875, + "learning_rate": 2.482087382282238e-07, + "loss": 0.0022, + "num_tokens": 83532273.0, + "reward": 1.8395833492279052, + "reward_std": 0.06592325270175933, + "rewards/accuracy_reward/mean": 0.8, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.039583335444331166, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0659232459962368, + "step": 3210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.7, + "completions/max_terminated_length": 486.7, + "completions/mean_length": 373.9375, + "completions/mean_terminated_length": 373.9375, + "completions/min_length": 283.7, + "completions/min_terminated_length": 283.7, + "epoch": 0.6702747710241466, + "grad_norm": 0.1474085241752631, + "kl": 0.0574462890625, + "learning_rate": 2.453892360555233e-07, + "loss": 0.0023, + "num_tokens": 83769724.0, + "reward": 1.8802083492279054, + "reward_std": 0.14919540733098985, + "rewards/accuracy_reward/mean": 0.8375, + "rewards/accuracy_reward/std": 0.08880758583545685, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04270833432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07760776579380035, + "step": 3220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.7, + "completions/max_terminated_length": 503.7, + "completions/mean_length": 400.4375, + "completions/mean_terminated_length": 400.4375, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.6723563696919234, + "grad_norm": 4.803506917401603, + "kl": 0.05517578125, + "learning_rate": 2.425806224055055e-07, + "loss": 0.0022, + "num_tokens": 84031639.0, + "reward": 1.9375, + "reward_std": 0.0816463440656662, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, + "step": 3230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 626.1, + "completions/max_terminated_length": 579.7, + "completions/mean_length": 410.5, + "completions/mean_terminated_length": 402.5732147216797, + "completions/min_length": 285.4, + "completions/min_terminated_length": 285.4, + "epoch": 0.6744379683597003, + "grad_norm": 4.429967274639083, + "kl": 0.0574951171875, + "learning_rate": 2.3978301738956287e-07, + "loss": 0.0023, + "num_tokens": 84303743.0, + "reward": 1.70625, + "reward_std": 0.2609692007303238, + "rewards/accuracy_reward/mean": 0.7, + "rewards/accuracy_reward/std": 0.2205115258693695, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05303300768136978, + "step": 3240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.6, + "completions/max_terminated_length": 541.6, + "completions/mean_length": 409.725, + "completions/mean_terminated_length": 409.725, + "completions/min_length": 309.1, + "completions/min_terminated_length": 309.1, + "epoch": 0.6765195670274771, + "grad_norm": 0.2112119584367916, + "kl": 0.052734375, + "learning_rate": 2.369965406482996e-07, + "loss": 0.0021, + "num_tokens": 84533249.0, + "reward": 1.9125, + "reward_std": 0.17943965792655944, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.14433693289756774, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.046291005611419675, + "step": 3250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.4, + "completions/max_terminated_length": 519.4, + "completions/mean_length": 392.3125, + "completions/mean_terminated_length": 392.3125, + "completions/min_length": 289.2, + "completions/min_terminated_length": 289.2, + "epoch": 0.678601165695254, + "grad_norm": 0.1459616982889113, + "kl": 0.0552490234375, + "learning_rate": 2.342213113464155e-07, + "loss": 0.0022, + "num_tokens": 84815362.0, + "reward": 1.926360011100769, + "reward_std": 0.0947591558098793, + "rewards/accuracy_reward/mean": 0.9013600140810013, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07071067690849304, + "step": 3260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.7, + "completions/max_terminated_length": 527.7, + "completions/mean_length": 401.0125, + "completions/mean_terminated_length": 401.0125, + "completions/min_length": 303.3, + "completions/min_terminated_length": 303.3, + "epoch": 0.6806827643630308, + "grad_norm": 0.15128101407838257, + "kl": 0.0541748046875, + "learning_rate": 2.3145744816760915e-07, + "loss": 0.0022, + "num_tokens": 85090907.0, + "reward": 1.9145833492279052, + "reward_std": 0.03310801088809967, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01458333358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.033108004927635194, + "step": 3270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.5, + "completions/max_terminated_length": 533.5, + "completions/mean_length": 425.7125, + "completions/mean_terminated_length": 425.7125, + "completions/min_length": 311.1, + "completions/min_terminated_length": 311.1, + "epoch": 0.6827643630308077, + "grad_norm": 4.464712293634347, + "kl": 0.0533447265625, + "learning_rate": 2.287050693095028e-07, + "loss": 0.0021, + "num_tokens": 85364564.0, + "reward": 1.8083333492279052, + "reward_std": 0.27998869568109513, + "rewards/accuracy_reward/mean": 0.775, + "rewards/accuracy_reward/std": 0.2205115258693695, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03333333395421505, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08461370393633842, + "step": 3280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.5, + "completions/max_terminated_length": 579.5, + "completions/mean_length": 445.3375, + "completions/mean_terminated_length": 445.3375, + "completions/min_length": 323.6, + "completions/min_terminated_length": 323.6, + "epoch": 0.6848459616985845, + "grad_norm": 0.12651191382442867, + "kl": 0.0567626953125, + "learning_rate": 2.25964292478588e-07, + "loss": 0.0023, + "num_tokens": 85614319.0, + "reward": 1.9358333587646483, + "reward_std": 0.15191497951745986, + "rewards/accuracy_reward/mean": 0.875, + "rewards/accuracy_reward/std": 0.07071067690849304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.060833333805203435, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08951258435845375, + "step": 3290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.5, + "completions/max_terminated_length": 497.5, + "completions/mean_length": 379.4375, + "completions/mean_terminated_length": 379.4375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.6869275603663614, + "grad_norm": 0.15966499931168038, + "kl": 0.058203125, + "learning_rate": 2.2323523488519035e-07, + "loss": 0.0023, + "num_tokens": 85889306.0, + "reward": 1.940625, + "reward_std": 0.21795205026865005, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.15782093703746797, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.040625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06013111919164658, + "step": 3300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.1, + "completions/max_terminated_length": 580.1, + "completions/mean_length": 430.5375, + "completions/mean_terminated_length": 430.5375, + "completions/min_length": 316.3, + "completions/min_terminated_length": 316.3, + "epoch": 0.6890091590341382, + "grad_norm": 5.482841175278612, + "kl": 0.0480712890625, + "learning_rate": 2.2051801323845898e-07, + "loss": 0.0019, + "num_tokens": 86165773.0, + "reward": 1.89375, + "reward_std": 0.05303300768136978, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 3310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.2, + "completions/max_terminated_length": 495.2, + "completions/mean_length": 395.0625, + "completions/mean_terminated_length": 395.0625, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.6910907577019151, + "grad_norm": 4.34674602323066, + "kl": 0.0560791015625, + "learning_rate": 2.178127437413738e-07, + "loss": 0.0022, + "num_tokens": 86430674.0, + "reward": 1.9328063368797301, + "reward_std": 0.18096388429403304, + "rewards/accuracy_reward/mean": 0.8786396577954292, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05416666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11062439307570457, + "step": 3320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.6, + "completions/max_terminated_length": 518.6, + "completions/mean_length": 395.825, + "completions/mean_terminated_length": 395.825, + "completions/min_length": 284.9, + "completions/min_terminated_length": 284.9, + "epoch": 0.6931723563696919, + "grad_norm": 4.461438229384425, + "kl": 0.056396484375, + "learning_rate": 2.1511954208577687e-07, + "loss": 0.0023, + "num_tokens": 86678692.0, + "reward": 1.8700993537902832, + "reward_std": 0.10521658658981323, + "rewards/accuracy_reward/mean": 0.8492660000920296, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02083333358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05892556756734848, + "step": 3330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.6, + "completions/max_terminated_length": 467.6, + "completions/mean_length": 359.1625, + "completions/mean_terminated_length": 359.1625, + "completions/min_length": 271.4, + "completions/min_terminated_length": 271.4, + "epoch": 0.6952539550374688, + "grad_norm": 0.1587957283713179, + "kl": 0.05439453125, + "learning_rate": 2.1243852344742456e-07, + "loss": 0.0022, + "num_tokens": 86955689.0, + "reward": 1.933035707473755, + "reward_std": 0.14633138179779054, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.07071067690849304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03303571343421936, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07562070488929748, + "step": 3340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.4, + "completions/max_terminated_length": 528.4, + "completions/mean_length": 403.3875, + "completions/mean_terminated_length": 403.3875, + "completions/min_length": 299.8, + "completions/min_terminated_length": 299.8, + "epoch": 0.6973355537052456, + "grad_norm": 0.16912400051864426, + "kl": 0.0544921875, + "learning_rate": 2.0976980248106207e-07, + "loss": 0.0022, + "num_tokens": 87199200.0, + "reward": 1.7490820646286012, + "reward_std": 0.14865545853972434, + "rewards/accuracy_reward/mean": 0.6969987243413925, + "rewards/accuracy_reward/std": 0.09258201122283935, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05208333432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08596487566828728, + "step": 3350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.3, + "completions/max_terminated_length": 476.3, + "completions/mean_length": 364.2625, + "completions/mean_terminated_length": 364.2625, + "completions/min_length": 250.6, + "completions/min_terminated_length": 250.6, + "epoch": 0.6994171523730225, + "grad_norm": 0.13838109852259403, + "kl": 0.0562255859375, + "learning_rate": 2.071134933155198e-07, + "loss": 0.0022, + "num_tokens": 87448773.0, + "reward": 1.866287887096405, + "reward_std": 0.153206467628479, + "rewards/accuracy_reward/mean": 0.8621212124824524, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0471404530107975, + "step": 3360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.7, + "completions/max_terminated_length": 534.7, + "completions/mean_length": 399.6125, + "completions/mean_terminated_length": 399.6125, + "completions/min_length": 272.4, + "completions/min_terminated_length": 272.4, + "epoch": 0.7014987510407993, + "grad_norm": 5.846906136391809, + "kl": 0.0546630859375, + "learning_rate": 2.0446970954883397e-07, + "loss": 0.0022, + "num_tokens": 87726590.0, + "reward": 1.8574824571609496, + "reward_std": 0.12330644056200982, + "rewards/accuracy_reward/mean": 0.8253991156816483, + "rewards/accuracy_reward/std": 0.008560963720083237, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.044583334028720854, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0793901264667511, + "step": 3370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.4, + "completions/max_terminated_length": 525.4, + "completions/mean_length": 409.4125, + "completions/mean_terminated_length": 409.4125, + "completions/min_length": 288.9, + "completions/min_terminated_length": 288.9, + "epoch": 0.7035803497085762, + "grad_norm": 0.12400396976068362, + "kl": 0.052490234375, + "learning_rate": 2.018385642433859e-07, + "loss": 0.0021, + "num_tokens": 87991615.0, + "reward": 1.8695168137550353, + "reward_std": 0.09889537543058395, + "rewards/accuracy_reward/mean": 0.8528501406311989, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0471404530107975, + "step": 3380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 392.4875, + "completions/mean_terminated_length": 392.4875, + "completions/min_length": 276.5, + "completions/min_terminated_length": 276.5, + "epoch": 0.705661948376353, + "grad_norm": 0.1491549885780442, + "kl": 0.0510986328125, + "learning_rate": 1.9922016992107004e-07, + "loss": 0.002, + "num_tokens": 88234326.0, + "reward": 1.8433712124824524, + "reward_std": 0.14787373542785645, + "rewards/accuracy_reward/mean": 0.7996212124824524, + "rewards/accuracy_reward/std": 0.10520716905593872, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04375, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04266657531261444, + "step": 3390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.5, + "completions/max_terminated_length": 522.5, + "completions/mean_length": 392.575, + "completions/mean_terminated_length": 392.575, + "completions/min_length": 272.3, + "completions/min_terminated_length": 272.3, + "epoch": 0.7077435470441299, + "grad_norm": 0.14457577960469306, + "kl": 0.05, + "learning_rate": 1.9661463855847953e-07, + "loss": 0.002, + "num_tokens": 88502596.0, + "reward": 2.0260416746139525, + "reward_std": 0.1515728861093521, + "rewards/accuracy_reward/mean": 0.975, + "rewards/accuracy_reward/std": 0.07071067690849304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05104166716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08086220920085907, + "step": 3400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.2, + "completions/max_terminated_length": 523.2, + "completions/mean_length": 421.3625, + "completions/mean_terminated_length": 421.3625, + "completions/min_length": 318.8, + "completions/min_terminated_length": 318.8, + "epoch": 0.7098251457119067, + "grad_norm": 0.16354366705945111, + "kl": 0.0482666015625, + "learning_rate": 1.9402208158211846e-07, + "loss": 0.0019, + "num_tokens": 88766889.0, + "reward": 1.8375, + "reward_std": 0.12288875579833984, + "rewards/accuracy_reward/mean": 0.825, + "rewards/accuracy_reward/std": 0.09974325299263001, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.023145502805709837, + "step": 3410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.3, + "completions/max_terminated_length": 512.3, + "completions/mean_length": 407.425, + "completions/mean_terminated_length": 407.425, + "completions/min_length": 316.9, + "completions/min_terminated_length": 316.9, + "epoch": 0.7119067443796836, + "grad_norm": 0.15199017450102612, + "kl": 0.050537109375, + "learning_rate": 1.9144260986363663e-07, + "loss": 0.002, + "num_tokens": 89049659.0, + "reward": 2.031250023841858, + "reward_std": 0.14454624205827712, + "rewards/accuracy_reward/mean": 0.9625, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06875000074505806, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0927913174033165, + "step": 3420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.8, + "completions/max_terminated_length": 474.8, + "completions/mean_length": 375.9875, + "completions/mean_terminated_length": 375.9875, + "completions/min_length": 281.8, + "completions/min_terminated_length": 281.8, + "epoch": 0.7139883430474604, + "grad_norm": 0.18400026901801167, + "kl": 0.0502197265625, + "learning_rate": 1.888763337150877e-07, + "loss": 0.002, + "num_tokens": 89307282.0, + "reward": 1.9940972447395324, + "reward_std": 0.13174496293067933, + "rewards/accuracy_reward/mean": 0.9305555552244187, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06354166828095913, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.13174496218562126, + "step": 3430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.8, + "completions/max_terminated_length": 539.8, + "completions/mean_length": 406.5, + "completions/mean_terminated_length": 406.5, + "completions/min_length": 295.3, + "completions/min_terminated_length": 295.3, + "epoch": 0.7160699417152373, + "grad_norm": 0.15960429674340176, + "kl": 0.0587646484375, + "learning_rate": 1.8632336288421275e-07, + "loss": 0.0024, + "num_tokens": 89514746.0, + "reward": 1.9389583468437195, + "reward_std": 0.23162795454263688, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.20411194264888763, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03895833436399698, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07336622476577759, + "step": 3440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.9, + "completions/max_terminated_length": 524.9, + "completions/mean_length": 408.7875, + "completions/mean_terminated_length": 408.7875, + "completions/min_length": 305.4, + "completions/min_terminated_length": 305.4, + "epoch": 0.7181515403830142, + "grad_norm": 0.15281608095911914, + "kl": 0.052734375, + "learning_rate": 1.837838065497448e-07, + "loss": 0.0021, + "num_tokens": 89773265.0, + "reward": 1.9989583492279053, + "reward_std": 0.08846957683563232, + "rewards/accuracy_reward/mean": 0.9625, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03645833432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03671466112136841, + "step": 3450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.4, + "completions/max_terminated_length": 515.4, + "completions/mean_length": 383.75, + "completions/mean_terminated_length": 383.75, + "completions/min_length": 280.2, + "completions/min_terminated_length": 280.2, + "epoch": 0.720233139050791, + "grad_norm": 0.166348730186093, + "kl": 0.0537109375, + "learning_rate": 1.8125777331674224e-07, + "loss": 0.0021, + "num_tokens": 90026781.0, + "reward": 2.0, + "reward_std": 0.09258201122283935, + "rewards/accuracy_reward/mean": 0.975, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.046291005611419675, + "step": 3460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.3, + "completions/max_terminated_length": 514.3, + "completions/mean_length": 385.7, + "completions/mean_terminated_length": 385.7, + "completions/min_length": 262.9, + "completions/min_terminated_length": 262.9, + "epoch": 0.7223147377185679, + "grad_norm": 5.263532396427543, + "kl": 0.0520263671875, + "learning_rate": 1.7874537121194233e-07, + "loss": 0.0021, + "num_tokens": 90303189.0, + "reward": 2.007232141494751, + "reward_std": 0.15389785990118982, + "rewards/accuracy_reward/mean": 0.95, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0572321429848671, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07758329436182976, + "step": 3470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.9, + "completions/max_terminated_length": 546.9, + "completions/mean_length": 430.1875, + "completions/mean_terminated_length": 430.1875, + "completions/min_length": 326.8, + "completions/min_terminated_length": 326.8, + "epoch": 0.7243963363863447, + "grad_norm": 4.1697785482475815, + "kl": 0.052490234375, + "learning_rate": 1.7624670767914241e-07, + "loss": 0.0021, + "num_tokens": 90550892.0, + "reward": 1.869166660308838, + "reward_std": 0.11857073605060578, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05666666626930237, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0832154020667076, + "step": 3480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.7, + "completions/max_terminated_length": 530.7, + "completions/mean_length": 395.625, + "completions/mean_terminated_length": 395.625, + "completions/min_length": 286.3, + "completions/min_terminated_length": 286.3, + "epoch": 0.7264779350541216, + "grad_norm": 4.735500880156546, + "kl": 0.0527099609375, + "learning_rate": 1.7376188957460464e-07, + "loss": 0.0021, + "num_tokens": 90796390.0, + "reward": 1.964305579662323, + "reward_std": 0.14060550779104233, + "rewards/accuracy_reward/mean": 0.9055555552244187, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05875000134110451, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11017159223556519, + "step": 3490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.8, + "completions/max_terminated_length": 492.8, + "completions/mean_length": 375.3875, + "completions/mean_terminated_length": 375.3875, + "completions/min_length": 280.2, + "completions/min_terminated_length": 280.2, + "epoch": 0.7285595337218984, + "grad_norm": 4.588985089684356, + "kl": 0.048681640625, + "learning_rate": 1.7129102316248644e-07, + "loss": 0.0019, + "num_tokens": 91058349.0, + "reward": 1.8958333492279054, + "reward_std": 0.20283454060554504, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.12793734967708587, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03333333395421505, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08277528658509255, + "step": 3500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.3, + "completions/max_terminated_length": 519.3, + "completions/mean_length": 394.225, + "completions/mean_terminated_length": 394.225, + "completions/min_length": 268.7, + "completions/min_terminated_length": 268.7, + "epoch": 0.7306411323896753, + "grad_norm": 5.032917532131076, + "kl": 0.0522216796875, + "learning_rate": 1.688342141102958e-07, + "loss": 0.0021, + "num_tokens": 91316167.0, + "reward": 1.6864583492279053, + "reward_std": 0.19432369619607925, + "rewards/accuracy_reward/mean": 0.675, + "rewards/accuracy_reward/std": 0.17422052025794982, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01145833358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03240906372666359, + "step": 3510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 381.075, + "completions/mean_terminated_length": 381.075, + "completions/min_length": 286.4, + "completions/min_terminated_length": 286.4, + "epoch": 0.7327227310574521, + "grad_norm": 4.764262378414541, + "kl": 0.0566162109375, + "learning_rate": 1.6639156748437316e-07, + "loss": 0.0023, + "num_tokens": 91570893.0, + "reward": 1.9031250119209289, + "reward_std": 0.14553493782877922, + "rewards/accuracy_reward/mean": 0.825, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07812500074505806, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10751301869750023, + "step": 3520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.7, + "completions/max_terminated_length": 476.7, + "completions/mean_length": 366.8875, + "completions/mean_terminated_length": 366.8875, + "completions/min_length": 263.1, + "completions/min_terminated_length": 263.1, + "epoch": 0.734804329725229, + "grad_norm": 5.087437731335537, + "kl": 0.058740234375, + "learning_rate": 1.6396318774539658e-07, + "loss": 0.0024, + "num_tokens": 91840780.0, + "reward": 1.8559027910232544, + "reward_std": 0.15110048055648803, + "rewards/accuracy_reward/mean": 0.841944444179535, + "rewards/accuracy_reward/std": 0.1334012657403946, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.013958334363996983, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02309281751513481, + "step": 3530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.4, + "completions/max_terminated_length": 477.4, + "completions/mean_length": 367.7125, + "completions/mean_terminated_length": 367.7125, + "completions/min_length": 255.8, + "completions/min_terminated_length": 255.8, + "epoch": 0.7368859283930058, + "grad_norm": 5.792350963783144, + "kl": 0.0564453125, + "learning_rate": 1.6154917874391642e-07, + "loss": 0.0023, + "num_tokens": 92082925.0, + "reward": 1.8333333492279054, + "reward_std": 0.22360757291316985, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 0.185156187415123, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.020833334326744078, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03845139443874359, + "step": 3540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.8, + "completions/max_terminated_length": 494.8, + "completions/mean_length": 377.575, + "completions/mean_terminated_length": 377.575, + "completions/min_length": 278.2, + "completions/min_terminated_length": 278.2, + "epoch": 0.7389675270607827, + "grad_norm": 4.6887900541716405, + "kl": 0.064013671875, + "learning_rate": 1.5914964371591282e-07, + "loss": 0.0026, + "num_tokens": 92359075.0, + "reward": 1.8440972328186036, + "reward_std": 0.18152772933244704, + "rewards/accuracy_reward/mean": 0.7847222223877907, + "rewards/accuracy_reward/std": 0.12093005329370499, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.059375, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07280750945210457, + "step": 3550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 345.2875, + "completions/mean_terminated_length": 345.2875, + "completions/min_length": 262.8, + "completions/min_terminated_length": 262.8, + "epoch": 0.7410491257285595, + "grad_norm": 0.1742682978076689, + "kl": 0.0579833984375, + "learning_rate": 1.56764685278381e-07, + "loss": 0.0023, + "num_tokens": 92629834.0, + "reward": 1.8572916746139527, + "reward_std": 0.18208086043596267, + "rewards/accuracy_reward/mean": 0.8375, + "rewards/accuracy_reward/std": 0.13509859144687653, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01979166679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.046982265263795855, + "step": 3560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.1, + "completions/max_terminated_length": 448.1, + "completions/mean_length": 337.6625, + "completions/mean_terminated_length": 337.6625, + "completions/min_length": 246.8, + "completions/min_terminated_length": 246.8, + "epoch": 0.7431307243963364, + "grad_norm": 5.953804007612613, + "kl": 0.0570068359375, + "learning_rate": 1.5439440542494315e-07, + "loss": 0.0023, + "num_tokens": 92876415.0, + "reward": 1.7963137030601501, + "reward_std": 0.1349079929292202, + "rewards/accuracy_reward/mean": 0.796313701570034, + "rewards/accuracy_reward/std": 0.1349079929292202, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 3570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.4, + "completions/max_terminated_length": 465.4, + "completions/mean_length": 363.775, + "completions/mean_terminated_length": 363.775, + "completions/min_length": 256.4, + "completions/min_terminated_length": 256.4, + "epoch": 0.7452123230641132, + "grad_norm": 5.446112817760046, + "kl": 0.0608154296875, + "learning_rate": 1.5203890552148624e-07, + "loss": 0.0024, + "num_tokens": 93138461.0, + "reward": 2.0, + "reward_std": 0.08408745229244233, + "rewards/accuracy_reward/mean": 0.975, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.037796446681022645, + "step": 3580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.7, + "completions/max_terminated_length": 510.7, + "completions/mean_length": 382.1375, + "completions/mean_terminated_length": 382.1375, + "completions/min_length": 275.3, + "completions/min_terminated_length": 275.3, + "epoch": 0.7472939217318901, + "grad_norm": 5.55625074684671, + "kl": 0.0570556640625, + "learning_rate": 1.496982863018275e-07, + "loss": 0.0023, + "num_tokens": 93372936.0, + "reward": 1.9674107313156128, + "reward_std": 0.1747075505554676, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04241071492433548, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0875972904264927, + "step": 3590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.4, + "completions/max_terminated_length": 495.4, + "completions/mean_length": 385.175, + "completions/mean_terminated_length": 385.175, + "completions/min_length": 295.8, + "completions/min_terminated_length": 295.8, + "epoch": 0.7493755203996669, + "grad_norm": 0.15568327909789098, + "kl": 0.0562744140625, + "learning_rate": 1.473726478634061e-07, + "loss": 0.0023, + "num_tokens": 93635342.0, + "reward": 1.8378610372543336, + "reward_std": 0.07071067690849304, + "rewards/accuracy_reward/mean": 0.8253610402345657, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03535533845424652, + "step": 3600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.5, + "completions/max_terminated_length": 467.5, + "completions/mean_length": 359.9625, + "completions/mean_terminated_length": 359.9625, + "completions/min_length": 254.2, + "completions/min_terminated_length": 254.2, + "epoch": 0.7514571190674438, + "grad_norm": 4.720343495448124, + "kl": 0.058642578125, + "learning_rate": 1.4506208966300248e-07, + "loss": 0.0023, + "num_tokens": 93907019.0, + "reward": 2.0114583492279055, + "reward_std": 0.1769111342728138, + "rewards/accuracy_reward/mean": 0.9625, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04895833395421505, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10314287841320038, + "step": 3610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.9, + "completions/max_terminated_length": 497.9, + "completions/mean_length": 392.3125, + "completions/mean_terminated_length": 392.3125, + "completions/min_length": 279.9, + "completions/min_terminated_length": 279.9, + "epoch": 0.7535387177352206, + "grad_norm": 0.12252307448017402, + "kl": 0.060302734375, + "learning_rate": 1.4276671051248572e-07, + "loss": 0.0024, + "num_tokens": 94168604.0, + "reward": 2.0208333492279054, + "reward_std": 0.16390654146671296, + "rewards/accuracy_reward/mean": 0.9625, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05833333432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11761552840471268, + "step": 3620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.1, + "completions/max_terminated_length": 473.1, + "completions/mean_length": 364.425, + "completions/mean_terminated_length": 364.425, + "completions/min_length": 269.3, + "completions/min_terminated_length": 269.3, + "epoch": 0.7556203164029975, + "grad_norm": 5.028606701234733, + "kl": 0.06298828125, + "learning_rate": 1.4048660857458637e-07, + "loss": 0.0025, + "num_tokens": 94429414.0, + "reward": 2.0260416984558107, + "reward_std": 0.1510403722524643, + "rewards/accuracy_reward/mean": 0.95, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0760416690260172, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06393011137843133, + "step": 3630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.6, + "completions/max_terminated_length": 423.6, + "completions/mean_length": 347.525, + "completions/mean_terminated_length": 347.525, + "completions/min_length": 261.4, + "completions/min_terminated_length": 261.4, + "epoch": 0.7577019150707743, + "grad_norm": 4.9837741307547825, + "kl": 0.0615234375, + "learning_rate": 1.3822188135870034e-07, + "loss": 0.0025, + "num_tokens": 94693016.0, + "reward": 1.9754166841506957, + "reward_std": 0.10829881578683853, + "rewards/accuracy_reward/mean": 0.95, + "rewards/accuracy_reward/std": 0.05345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025416667759418487, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05484656691551208, + "step": 3640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.2, + "completions/max_terminated_length": 508.2, + "completions/mean_length": 392.175, + "completions/mean_terminated_length": 392.175, + "completions/min_length": 287.5, + "completions/min_terminated_length": 287.5, + "epoch": 0.7597835137385512, + "grad_norm": 0.15479196965058603, + "kl": 0.054833984375, + "learning_rate": 1.359726257167172e-07, + "loss": 0.0022, + "num_tokens": 94951070.0, + "reward": 1.7885703325271607, + "reward_std": 0.08867817372083664, + "rewards/accuracy_reward/mean": 0.7698203206062317, + "rewards/accuracy_reward/std": 0.05345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875000074505806, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03522591739892959, + "step": 3650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.6, + "completions/max_terminated_length": 534.6, + "completions/mean_length": 413.35, + "completions/mean_terminated_length": 413.35, + "completions/min_length": 298.3, + "completions/min_terminated_length": 298.3, + "epoch": 0.761865112406328, + "grad_norm": 0.12937673677540254, + "kl": 0.0532470703125, + "learning_rate": 1.3373893783887934e-07, + "loss": 0.0021, + "num_tokens": 95189962.0, + "reward": 2.0135416746139527, + "reward_std": 0.03136168122291565, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01354166716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03136167526245117, + "step": 3660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.6, + "completions/max_terminated_length": 452.6, + "completions/mean_length": 346.75, + "completions/mean_terminated_length": 346.75, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.763946711074105, + "grad_norm": 4.407489858408879, + "kl": 0.0561767578125, + "learning_rate": 1.3152091324966797e-07, + "loss": 0.0022, + "num_tokens": 95462542.0, + "reward": 1.9885416746139526, + "reward_std": 0.16378697603940964, + "rewards/accuracy_reward/mean": 0.95, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03854166679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07667671665549278, + "step": 3670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.8, + "completions/max_terminated_length": 495.8, + "completions/mean_length": 379.0, + "completions/mean_terminated_length": 379.0, + "completions/min_length": 265.4, + "completions/min_terminated_length": 265.4, + "epoch": 0.7660283097418817, + "grad_norm": 0.14732401576120982, + "kl": 0.0573486328125, + "learning_rate": 1.2931864680371783e-07, + "loss": 0.0023, + "num_tokens": 95730854.0, + "reward": 1.9910416722297668, + "reward_std": 0.16076218485832214, + "rewards/accuracy_reward/mean": 0.875, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.11604166850447654, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.12301851361989975, + "step": 3680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.1, + "completions/max_terminated_length": 438.1, + "completions/mean_length": 353.7375, + "completions/mean_terminated_length": 353.7375, + "completions/min_length": 266.4, + "completions/min_terminated_length": 266.4, + "epoch": 0.7681099084096586, + "grad_norm": 0.16920726378100856, + "kl": 0.0602294921875, + "learning_rate": 1.27132232681761e-07, + "loss": 0.0024, + "num_tokens": 95991633.0, + "reward": 1.984730076789856, + "reward_std": 0.08931128680706024, + "rewards/accuracy_reward/mean": 0.962855052947998, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02187500037252903, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.053955940157175065, + "step": 3690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.9, + "completions/max_terminated_length": 502.9, + "completions/mean_length": 400.55, + "completions/mean_terminated_length": 400.55, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.7701915070774354, + "grad_norm": 0.15757015594848228, + "kl": 0.05205078125, + "learning_rate": 1.2496176438659944e-07, + "loss": 0.0021, + "num_tokens": 96244909.0, + "reward": 1.984375, + "reward_std": 0.05585952997207642, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.084375, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.055859526991844176, + "step": 3700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 390.425, + "completions/mean_terminated_length": 390.425, + "completions/min_length": 284.6, + "completions/min_terminated_length": 284.6, + "epoch": 0.7722731057452124, + "grad_norm": 0.13615266257106715, + "kl": 0.052587890625, + "learning_rate": 1.2280733473910527e-07, + "loss": 0.0021, + "num_tokens": 96508287.0, + "reward": 1.8875, + "reward_std": 0.03535533845424652, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 3710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.6, + "completions/max_terminated_length": 507.6, + "completions/mean_length": 372.5125, + "completions/mean_terminated_length": 372.5125, + "completions/min_length": 272.9, + "completions/min_terminated_length": 272.9, + "epoch": 0.7743547044129891, + "grad_norm": 0.14070110173334624, + "kl": 0.0517578125, + "learning_rate": 1.2066903587425264e-07, + "loss": 0.0021, + "num_tokens": 96783880.0, + "reward": 1.8479166746139526, + "reward_std": 0.0954555444419384, + "rewards/accuracy_reward/mean": 0.825, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.057878179103136064, + "step": 3720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.1, + "completions/max_terminated_length": 494.1, + "completions/mean_length": 368.1125, + "completions/mean_terminated_length": 368.1125, + "completions/min_length": 251.1, + "completions/min_terminated_length": 251.1, + "epoch": 0.776436303080766, + "grad_norm": 0.22048760698173916, + "kl": 0.0575927734375, + "learning_rate": 1.1854695923717656e-07, + "loss": 0.0023, + "num_tokens": 97046017.0, + "reward": 1.9729166746139526, + "reward_std": 0.07117186933755874, + "rewards/accuracy_reward/mean": 0.9625, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01041666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02946278378367424, + "step": 3730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.3, + "completions/max_terminated_length": 498.3, + "completions/mean_length": 374.3625, + "completions/mean_terminated_length": 374.3625, + "completions/min_length": 275.8, + "completions/min_terminated_length": 275.8, + "epoch": 0.7785179017485429, + "grad_norm": 4.826237703846462, + "kl": 0.051904296875, + "learning_rate": 1.1644119557926247e-07, + "loss": 0.0021, + "num_tokens": 97296238.0, + "reward": 2.005625009536743, + "reward_std": 0.07153735160827637, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.018124999850988387, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03618201315402984, + "step": 3740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.2, + "completions/max_terminated_length": 447.2, + "completions/mean_length": 344.9625, + "completions/mean_terminated_length": 344.9625, + "completions/min_length": 258.7, + "completions/min_terminated_length": 258.7, + "epoch": 0.7805995004163198, + "grad_norm": 4.434990243764791, + "kl": 0.0589599609375, + "learning_rate": 1.1435183495426542e-07, + "loss": 0.0024, + "num_tokens": 97540299.0, + "reward": 2.059313750267029, + "reward_std": 0.16124168485403062, + "rewards/accuracy_reward/mean": 0.9759803950786591, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.08333333358168601, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.16124166548252106, + "step": 3750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.6, + "completions/max_terminated_length": 453.6, + "completions/mean_length": 361.3, + "completions/mean_terminated_length": 361.3, + "completions/min_length": 281.2, + "completions/min_terminated_length": 281.2, + "epoch": 0.7826810990840966, + "grad_norm": 0.1697122880636255, + "kl": 0.0545654296875, + "learning_rate": 1.1227896671445864e-07, + "loss": 0.0022, + "num_tokens": 97791299.0, + "reward": 2.0145833492279053, + "reward_std": 0.08838834911584854, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02708333432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05303300768136978, + "step": 3760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.1, + "completions/max_terminated_length": 465.1, + "completions/mean_length": 356.1625, + "completions/mean_terminated_length": 356.1625, + "completions/min_length": 261.5, + "completions/min_terminated_length": 261.5, + "epoch": 0.7847626977518735, + "grad_norm": 4.72204657136494, + "kl": 0.0535888671875, + "learning_rate": 1.1022267950681247e-07, + "loss": 0.0021, + "num_tokens": 98050592.0, + "reward": 1.8523929595947266, + "reward_std": 0.1882556490600109, + "rewards/accuracy_reward/mean": 0.8294762820005417, + "rewards/accuracy_reward/std": 0.1349431467242539, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02291666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.053312502801418304, + "step": 3770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 416.4875, + "completions/mean_terminated_length": 416.4875, + "completions/min_length": 296.6, + "completions/min_terminated_length": 296.6, + "epoch": 0.7868442964196503, + "grad_norm": 0.13573945644742816, + "kl": 0.0557861328125, + "learning_rate": 1.0818306126920346e-07, + "loss": 0.0022, + "num_tokens": 98276623.0, + "reward": 1.9375, + "reward_std": 0.2537449184805155, + "rewards/accuracy_reward/mean": 0.8375, + "rewards/accuracy_reward/std": 0.1687566041946411, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.10000000204890966, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10358962155878544, + "step": 3780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.9, + "completions/max_terminated_length": 510.9, + "completions/mean_length": 385.95, + "completions/mean_terminated_length": 385.95, + "completions/min_length": 274.2, + "completions/min_terminated_length": 274.2, + "epoch": 0.7889258950874272, + "grad_norm": 0.15732118591221703, + "kl": 0.054443359375, + "learning_rate": 1.061601992266532e-07, + "loss": 0.0022, + "num_tokens": 98548259.0, + "reward": 1.8893749952316283, + "reward_std": 0.19129744172096252, + "rewards/accuracy_reward/mean": 0.8375, + "rewards/accuracy_reward/std": 0.08880758583545685, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05187499970197677, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1024898573756218, + "step": 3790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.9, + "completions/max_terminated_length": 477.9, + "completions/mean_length": 369.2, + "completions/mean_terminated_length": 369.2, + "completions/min_length": 283.5, + "completions/min_terminated_length": 283.5, + "epoch": 0.791007493755204, + "grad_norm": 0.12779789300769948, + "kl": 0.05419921875, + "learning_rate": 1.0415417988759916e-07, + "loss": 0.0022, + "num_tokens": 98811235.0, + "reward": 1.9833333492279053, + "reward_std": 0.17567494064569472, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.12416292428970337, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05833333488553762, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05609594918787479, + "step": 3800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.4, + "completions/max_terminated_length": 509.4, + "completions/mean_length": 397.2625, + "completions/mean_terminated_length": 397.2625, + "completions/min_length": 295.9, + "completions/min_terminated_length": 295.9, + "epoch": 0.7930890924229809, + "grad_norm": 0.12194587886020418, + "kl": 0.0489990234375, + "learning_rate": 1.0216508904019339e-07, + "loss": 0.002, + "num_tokens": 99083496.0, + "reward": 1.9875, + "reward_std": 0.03535533845424652, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 3810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.9, + "completions/max_terminated_length": 433.9, + "completions/mean_length": 349.325, + "completions/mean_terminated_length": 349.325, + "completions/min_length": 257.7, + "completions/min_terminated_length": 257.7, + "epoch": 0.7951706910907577, + "grad_norm": 5.718242192993079, + "kl": 0.052880859375, + "learning_rate": 1.0019301174863582e-07, + "loss": 0.0021, + "num_tokens": 99355562.0, + "reward": 1.9729166746139526, + "reward_std": 0.11110913455486297, + "rewards/accuracy_reward/mean": 0.9625, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01041666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02946278378367424, + "step": 3820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.1, + "completions/max_terminated_length": 471.1, + "completions/mean_length": 379.6, + "completions/mean_terminated_length": 379.6, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.7972522897585346, + "grad_norm": 0.14164906888614892, + "kl": 0.058984375, + "learning_rate": 9.82380323495347e-08, + "loss": 0.0024, + "num_tokens": 99614210.0, + "reward": 1.7272916793823243, + "reward_std": 0.22196788042783738, + "rewards/accuracy_reward/mean": 0.675, + "rewards/accuracy_reward/std": 0.12416292428970337, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05229166727513075, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.12513141110539436, + "step": 3830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.3, + "completions/max_terminated_length": 489.3, + "completions/mean_length": 370.675, + "completions/mean_terminated_length": 370.675, + "completions/min_length": 280.6, + "completions/min_terminated_length": 280.6, + "epoch": 0.7993338884263114, + "grad_norm": 0.1499050279028627, + "kl": 0.05205078125, + "learning_rate": 9.630023444830104e-08, + "loss": 0.0021, + "num_tokens": 99891040.0, + "reward": 1.9086726307868958, + "reward_std": 0.005892548337578773, + "rewards/accuracy_reward/mean": 0.9065893024206162, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.002083333395421505, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.00589255727827549, + "step": 3840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.9, + "completions/max_terminated_length": 468.9, + "completions/mean_length": 369.65, + "completions/mean_terminated_length": 369.65, + "completions/min_length": 284.8, + "completions/min_terminated_length": 284.8, + "epoch": 0.8014154870940883, + "grad_norm": 6.713483614256585, + "kl": 0.0563232421875, + "learning_rate": 9.437970091557251e-08, + "loss": 0.0023, + "num_tokens": 100145740.0, + "reward": 1.915625, + "reward_std": 0.11031491830945014, + "rewards/accuracy_reward/mean": 0.85, + "rewards/accuracy_reward/std": 0.05345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.065625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09391534104943275, + "step": 3850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.2, + "completions/max_terminated_length": 468.2, + "completions/mean_length": 368.1625, + "completions/mean_terminated_length": 368.1625, + "completions/min_length": 279.3, + "completions/min_terminated_length": 279.3, + "epoch": 0.8034970857618651, + "grad_norm": 5.113616133939523, + "kl": 0.0548828125, + "learning_rate": 9.247651388367e-08, + "loss": 0.0022, + "num_tokens": 100417977.0, + "reward": 1.8708333492279052, + "reward_std": 0.143629489839077, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.13509859144687653, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00833333358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02357022911310196, + "step": 3860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.5, + "completions/max_terminated_length": 485.5, + "completions/mean_length": 370.7875, + "completions/mean_terminated_length": 370.7875, + "completions/min_length": 284.7, + "completions/min_terminated_length": 284.7, + "epoch": 0.805578684429642, + "grad_norm": 0.11922368664970306, + "kl": 0.0568603515625, + "learning_rate": 9.059075474308459e-08, + "loss": 0.0023, + "num_tokens": 100693856.0, + "reward": 1.9239583492279053, + "reward_std": 0.16635380387306214, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.11700168251991272, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02395833395421505, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.049352110177278516, + "step": 3870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.8, + "completions/max_terminated_length": 540.8, + "completions/mean_length": 390.1875, + "completions/mean_terminated_length": 390.1875, + "completions/min_length": 284.9, + "completions/min_terminated_length": 284.9, + "epoch": 0.8076602830974188, + "grad_norm": 0.1291060533060879, + "kl": 0.0534423828125, + "learning_rate": 8.872250413899785e-08, + "loss": 0.0021, + "num_tokens": 100965423.0, + "reward": 1.828125, + "reward_std": 0.18752237744629383, + "rewards/accuracy_reward/mean": 0.8, + "rewards/accuracy_reward/std": 0.1334012657403946, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.028125000186264515, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.054121119901537895, + "step": 3880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.1, + "completions/max_terminated_length": 484.1, + "completions/mean_length": 380.1375, + "completions/mean_terminated_length": 380.1375, + "completions/min_length": 279.2, + "completions/min_terminated_length": 279.2, + "epoch": 0.8097418817651957, + "grad_norm": 4.878083007917422, + "kl": 0.0498291015625, + "learning_rate": 8.687184196783138e-08, + "loss": 0.002, + "num_tokens": 101216602.0, + "reward": 1.9375, + "reward_std": 0.12793734967708587, + "rewards/accuracy_reward/mean": 0.9375, + "rewards/accuracy_reward/std": 0.12793734967708587, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 3890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.4, + "completions/max_terminated_length": 473.4, + "completions/mean_length": 374.0, + "completions/mean_terminated_length": 374.0, + "completions/min_length": 280.6, + "completions/min_terminated_length": 280.6, + "epoch": 0.8118234804329725, + "grad_norm": 0.1329960691761566, + "kl": 0.0533447265625, + "learning_rate": 8.503884737383188e-08, + "loss": 0.0021, + "num_tokens": 101477834.0, + "reward": 1.9260416746139526, + "reward_std": 0.14048119634389877, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.10520716905593872, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03854166716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.055533173680305484, + "step": 3900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.2, + "completions/max_terminated_length": 478.2, + "completions/mean_length": 384.25, + "completions/mean_terminated_length": 384.25, + "completions/min_length": 282.7, + "completions/min_terminated_length": 282.7, + "epoch": 0.8139050791007494, + "grad_norm": 0.12332734924314498, + "kl": 0.046728515625, + "learning_rate": 8.32235987456853e-08, + "loss": 0.0019, + "num_tokens": 101753350.0, + "reward": 1.7395833492279054, + "reward_std": 0.10045296251773835, + "rewards/accuracy_reward/mean": 0.7, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03958333395421505, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1004529558122158, + "step": 3910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.8, + "completions/max_terminated_length": 497.8, + "completions/mean_length": 373.0875, + "completions/mean_terminated_length": 373.0875, + "completions/min_length": 272.1, + "completions/min_terminated_length": 272.1, + "epoch": 0.8159866777685262, + "grad_norm": 0.1412123156702961, + "kl": 0.051416015625, + "learning_rate": 8.142617371316473e-08, + "loss": 0.0021, + "num_tokens": 102031781.0, + "reward": 1.9260416746139526, + "reward_std": 0.10721644759178162, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.07071067690849304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02604166716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.054992732405662534, + "step": 3920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.8, + "completions/max_terminated_length": 496.8, + "completions/mean_length": 389.35, + "completions/mean_terminated_length": 389.35, + "completions/min_length": 311.6, + "completions/min_terminated_length": 311.6, + "epoch": 0.8180682764363031, + "grad_norm": 0.21222767791413055, + "kl": 0.0527099609375, + "learning_rate": 7.964664914381086e-08, + "loss": 0.0021, + "num_tokens": 102307233.0, + "reward": 1.9333333492279052, + "reward_std": 0.16807903349399567, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.10350984334945679, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03333333432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06456920504570007, + "step": 3930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.4, + "completions/max_terminated_length": 505.4, + "completions/mean_length": 372.7625, + "completions/mean_terminated_length": 372.7625, + "completions/min_length": 265.9, + "completions/min_terminated_length": 265.9, + "epoch": 0.8201498751040799, + "grad_norm": 4.518923590747868, + "kl": 0.052734375, + "learning_rate": 7.788510113964436e-08, + "loss": 0.0021, + "num_tokens": 102566030.0, + "reward": 1.7533788442611695, + "reward_std": 0.12879444248974323, + "rewards/accuracy_reward/mean": 0.7203431375324726, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03303571436554194, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0934391088783741, + "step": 3940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.3, + "completions/max_terminated_length": 509.3, + "completions/mean_length": 386.6125, + "completions/mean_terminated_length": 386.6125, + "completions/min_length": 279.7, + "completions/min_terminated_length": 279.7, + "epoch": 0.8222314737718568, + "grad_norm": 4.401170932607157, + "kl": 0.052294921875, + "learning_rate": 7.614160503391159e-08, + "loss": 0.0021, + "num_tokens": 102811719.0, + "reward": 1.6110849499702453, + "reward_std": 0.2531979136168957, + "rewards/accuracy_reward/mean": 0.580876623466611, + "rewards/accuracy_reward/std": 0.2234456790611148, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03020833432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.055492518842220305, + "step": 3950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.4, + "completions/max_terminated_length": 514.4, + "completions/mean_length": 400.0375, + "completions/mean_terminated_length": 400.0375, + "completions/min_length": 301.6, + "completions/min_terminated_length": 301.6, + "epoch": 0.8243130724396336, + "grad_norm": 5.5389473156402955, + "kl": 0.052783203125, + "learning_rate": 7.441623538786267e-08, + "loss": 0.0021, + "num_tokens": 103049338.0, + "reward": 1.8135416746139525, + "reward_std": 0.19410984218120575, + "rewards/accuracy_reward/mean": 0.7875, + "rewards/accuracy_reward/std": 0.1388651818037033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02604166716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0552446648478508, + "step": 3960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 379.9875, + "completions/mean_terminated_length": 379.9875, + "completions/min_length": 303.7, + "completions/min_terminated_length": 303.7, + "epoch": 0.8263946711074105, + "grad_norm": 0.18763065293743852, + "kl": 0.05126953125, + "learning_rate": 7.270906598756354e-08, + "loss": 0.002, + "num_tokens": 103302593.0, + "reward": 1.8629722237586974, + "reward_std": 0.134679351747036, + "rewards/accuracy_reward/mean": 0.8567222222685814, + "rewards/accuracy_reward/std": 0.11700168251991272, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 3970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.5, + "completions/max_terminated_length": 479.5, + "completions/mean_length": 364.2375, + "completions/mean_terminated_length": 364.2375, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.8284762697751873, + "grad_norm": 4.675829318699885, + "kl": 0.0544189453125, + "learning_rate": 7.102016984073939e-08, + "loss": 0.0022, + "num_tokens": 103572132.0, + "reward": 2.0260416984558107, + "reward_std": 0.15645610243082048, + "rewards/accuracy_reward/mean": 0.95, + "rewards/accuracy_reward/std": 0.05345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07604166828095912, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10300384685397149, + "step": 3980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.6, + "completions/max_terminated_length": 512.6, + "completions/mean_length": 390.25, + "completions/mean_terminated_length": 390.25, + "completions/min_length": 280.6, + "completions/min_terminated_length": 280.6, + "epoch": 0.8305578684429642, + "grad_norm": 4.376198711871789, + "kl": 0.0482421875, + "learning_rate": 6.934961917365323e-08, + "loss": 0.0019, + "num_tokens": 103836776.0, + "reward": 1.919166672229767, + "reward_std": 0.04006379246711731, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.019166667386889456, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0400637723505497, + "step": 3990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.6, + "completions/max_terminated_length": 484.6, + "completions/mean_length": 385.4125, + "completions/mean_terminated_length": 385.4125, + "completions/min_length": 285.9, + "completions/min_terminated_length": 285.9, + "epoch": 0.832639467110741, + "grad_norm": 0.17280059406529635, + "kl": 0.052587890625, + "learning_rate": 6.769748542801696e-08, + "loss": 0.0021, + "num_tokens": 104067641.0, + "reward": 2.00625, + "reward_std": 0.01767766922712326, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 4000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 407.825, + "completions/mean_terminated_length": 407.825, + "completions/min_length": 311.8, + "completions/min_terminated_length": 311.8, + "epoch": 0.8347210657785179, + "grad_norm": 0.12326887803289459, + "kl": 0.0458984375, + "learning_rate": 6.606383925793596e-08, + "loss": 0.0018, + "num_tokens": 104344899.0, + "reward": 1.9331209182739257, + "reward_std": 0.08897026628255844, + "rewards/accuracy_reward/mean": 0.9143708974123002, + "rewards/accuracy_reward/std": 0.0612799197435379, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.018750001117587088, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.027690339833498, + "step": 4010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.2, + "completions/max_terminated_length": 474.2, + "completions/mean_length": 365.825, + "completions/mean_terminated_length": 365.825, + "completions/min_length": 271.2, + "completions/min_terminated_length": 271.2, + "epoch": 0.8368026644462948, + "grad_norm": 5.814823012738687, + "kl": 0.051123046875, + "learning_rate": 6.444875052688764e-08, + "loss": 0.002, + "num_tokens": 104612149.0, + "reward": 1.6837563395500184, + "reward_std": 0.1435942307114601, + "rewards/accuracy_reward/mean": 0.6403039485216141, + "rewards/accuracy_reward/std": 0.08880758583545685, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.043452383019030094, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05773126855492592, + "step": 4020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.5, + "completions/max_terminated_length": 458.5, + "completions/mean_length": 355.375, + "completions/mean_terminated_length": 355.375, + "completions/min_length": 262.6, + "completions/min_terminated_length": 262.6, + "epoch": 0.8388842631140716, + "grad_norm": 0.09901890393356158, + "kl": 0.04814453125, + "learning_rate": 6.285228830473421e-08, + "loss": 0.0019, + "num_tokens": 104885179.0, + "reward": 1.890000009536743, + "reward_std": 0.06549679189920425, + "rewards/accuracy_reward/mean": 0.875, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.014999999850988387, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02910144701600075, + "step": 4030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.3, + "completions/max_terminated_length": 495.3, + "completions/mean_length": 384.3375, + "completions/mean_terminated_length": 384.3375, + "completions/min_length": 280.3, + "completions/min_terminated_length": 280.3, + "epoch": 0.8409658617818485, + "grad_norm": 0.14455133341003648, + "kl": 0.0484130859375, + "learning_rate": 6.127452086476748e-08, + "loss": 0.0019, + "num_tokens": 105137526.0, + "reward": 2.01875, + "reward_std": 0.044035982340574265, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.044035985320806506, + "step": 4040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.1, + "completions/max_terminated_length": 498.1, + "completions/mean_length": 391.0, + "completions/mean_terminated_length": 391.0, + "completions/min_length": 293.5, + "completions/min_terminated_length": 293.5, + "epoch": 0.8430474604496253, + "grad_norm": 0.1210642036264325, + "kl": 0.0546875, + "learning_rate": 5.971551568079097e-08, + "loss": 0.0022, + "num_tokens": 105381174.0, + "reward": 1.9229166746139525, + "reward_std": 0.1885521873831749, + "rewards/accuracy_reward/mean": 0.9125, + "rewards/accuracy_reward/std": 0.1687566041946411, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010416667163372039, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019795581698417664, + "step": 4050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.8, + "completions/max_terminated_length": 507.8, + "completions/mean_length": 384.9375, + "completions/mean_terminated_length": 384.9375, + "completions/min_length": 279.5, + "completions/min_terminated_length": 279.5, + "epoch": 0.8451290591174022, + "grad_norm": 4.399537829335464, + "kl": 0.050439453125, + "learning_rate": 5.817533942423286e-08, + "loss": 0.002, + "num_tokens": 105646977.0, + "reward": 2.0375, + "reward_std": 0.06348394006490707, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0375, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06348394006490707, + "step": 4060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.4, + "completions/max_terminated_length": 509.4, + "completions/mean_length": 384.5625, + "completions/mean_terminated_length": 384.5625, + "completions/min_length": 289.5, + "completions/min_terminated_length": 289.5, + "epoch": 0.847210657785179, + "grad_norm": 0.13733316919087182, + "kl": 0.0467529296875, + "learning_rate": 5.665405796129552e-08, + "loss": 0.0019, + "num_tokens": 105904598.0, + "reward": 1.9947916746139527, + "reward_std": 0.09533035308122635, + "rewards/accuracy_reward/mean": 0.975, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01979166716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.049039344489574435, + "step": 4070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.6, + "completions/max_terminated_length": 522.6, + "completions/mean_length": 388.0875, + "completions/mean_terminated_length": 388.0875, + "completions/min_length": 283.2, + "completions/min_terminated_length": 283.2, + "epoch": 0.8492922564529559, + "grad_norm": 0.11794495477424068, + "kl": 0.049267578125, + "learning_rate": 5.515173635013859e-08, + "loss": 0.002, + "num_tokens": 106185101.0, + "reward": 1.996875, + "reward_std": 0.06187184229493141, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.009375, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02651650384068489, + "step": 4080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.1, + "completions/max_terminated_length": 474.1, + "completions/mean_length": 363.825, + "completions/mean_terminated_length": 363.825, + "completions/min_length": 269.1, + "completions/min_terminated_length": 269.1, + "epoch": 0.8513738551207327, + "grad_norm": 4.565056845803448, + "kl": 0.060888671875, + "learning_rate": 5.3668438838096685e-08, + "loss": 0.0024, + "num_tokens": 106455751.0, + "reward": 1.994494080543518, + "reward_std": 0.12382127717137337, + "rewards/accuracy_reward/mean": 0.9625, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03199404887855053, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05725074335932732, + "step": 4090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.6, + "completions/max_terminated_length": 520.6, + "completions/mean_length": 399.1625, + "completions/mean_terminated_length": 399.1625, + "completions/min_length": 299.4, + "completions/min_terminated_length": 299.4, + "epoch": 0.8534554537885096, + "grad_norm": 0.17276149318032222, + "kl": 0.047607421875, + "learning_rate": 5.2204228858931664e-08, + "loss": 0.0019, + "num_tokens": 106710260.0, + "reward": 1.9436021447181702, + "reward_std": 0.06134198904037476, + "rewards/accuracy_reward/mean": 0.9061021506786346, + "rewards/accuracy_reward/std": 0.0015966927632689476, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0375, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06094194948673248, + "step": 4100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.2, + "completions/max_terminated_length": 491.2, + "completions/mean_length": 363.0875, + "completions/mean_terminated_length": 363.0875, + "completions/min_length": 269.9, + "completions/min_terminated_length": 269.9, + "epoch": 0.8555370524562864, + "grad_norm": 0.1553358408491195, + "kl": 0.0528076171875, + "learning_rate": 5.0759169030120454e-08, + "loss": 0.0021, + "num_tokens": 106952595.0, + "reward": 1.9416666746139526, + "reward_std": 0.1220408782362938, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.034930617362260816, + "step": 4110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.1, + "completions/max_terminated_length": 503.1, + "completions/mean_length": 376.4375, + "completions/mean_terminated_length": 376.4375, + "completions/min_length": 282.8, + "completions/min_terminated_length": 282.8, + "epoch": 0.8576186511240633, + "grad_norm": 0.12064916321985647, + "kl": 0.05244140625, + "learning_rate": 4.933332115017619e-08, + "loss": 0.0021, + "num_tokens": 107223014.0, + "reward": 1.8802083492279054, + "reward_std": 0.1497805744409561, + "rewards/accuracy_reward/mean": 0.85, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03020833395421505, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06267031356692314, + "step": 4120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.3, + "completions/max_terminated_length": 499.3, + "completions/mean_length": 362.0, + "completions/mean_terminated_length": 362.0, + "completions/min_length": 266.8, + "completions/min_terminated_length": 266.8, + "epoch": 0.8597002497918401, + "grad_norm": 0.15994207017756168, + "kl": 0.0513671875, + "learning_rate": 4.7926746196006675e-08, + "loss": 0.0021, + "num_tokens": 107485078.0, + "reward": 1.9061742424964905, + "reward_std": 0.09082945436239243, + "rewards/accuracy_reward/mean": 0.8624242424964905, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04375, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09082945436239243, + "step": 4130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.3, + "completions/max_terminated_length": 477.3, + "completions/mean_length": 370.5875, + "completions/mean_terminated_length": 370.5875, + "completions/min_length": 269.4, + "completions/min_terminated_length": 269.4, + "epoch": 0.861781848459617, + "grad_norm": 5.466497474038428, + "kl": 0.0529296875, + "learning_rate": 4.653950432030518e-08, + "loss": 0.0021, + "num_tokens": 107733645.0, + "reward": 1.9285416841506957, + "reward_std": 0.2662425719201565, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.19317627549171448, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04104166775941849, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08089874014258384, + "step": 4140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.8, + "completions/max_terminated_length": 453.8, + "completions/mean_length": 361.725, + "completions/mean_terminated_length": 361.725, + "completions/min_length": 260.3, + "completions/min_terminated_length": 260.3, + "epoch": 0.8638634471273938, + "grad_norm": 0.15802322397669666, + "kl": 0.048291015625, + "learning_rate": 4.51716548489795e-08, + "loss": 0.0019, + "num_tokens": 107984727.0, + "reward": 1.6659722208976746, + "reward_std": 0.13679726421833038, + "rewards/accuracy_reward/mean": 0.6555555552244187, + "rewards/accuracy_reward/std": 0.11700168251991272, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010416667163372039, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019795581698417664, + "step": 4150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.8, + "completions/max_terminated_length": 501.8, + "completions/mean_length": 390.4625, + "completions/mean_terminated_length": 390.4625, + "completions/min_length": 291.4, + "completions/min_terminated_length": 291.4, + "epoch": 0.8659450457951707, + "grad_norm": 5.800661850820499, + "kl": 0.0455322265625, + "learning_rate": 4.382325627861383e-08, + "loss": 0.0018, + "num_tokens": 108246356.0, + "reward": 1.927916669845581, + "reward_std": 0.21918051093816757, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.1388651818037033, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.052916666865348815, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09879994541406631, + "step": 4160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.8, + "completions/max_terminated_length": 493.8, + "completions/mean_length": 373.4, + "completions/mean_terminated_length": 373.4, + "completions/min_length": 294.4, + "completions/min_terminated_length": 294.4, + "epoch": 0.8680266444629475, + "grad_norm": 5.428300158144188, + "kl": 0.046630859375, + "learning_rate": 4.2494366273967355e-08, + "loss": 0.0019, + "num_tokens": 108514156.0, + "reward": 1.971875, + "reward_std": 0.10024693906307221, + "rewards/accuracy_reward/mean": 0.975, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 0.9875, + "rewards/format_reward/std": 0.03535533845424652, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.009375, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.018600596487522124, + "step": 4170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.5, + "completions/max_terminated_length": 469.5, + "completions/mean_length": 370.7, + "completions/mean_terminated_length": 370.7, + "completions/min_length": 269.8, + "completions/min_terminated_length": 269.8, + "epoch": 0.8701082431307244, + "grad_norm": 0.121734992020979, + "kl": 0.050244140625, + "learning_rate": 4.118504166550846e-08, + "loss": 0.002, + "num_tokens": 108791212.0, + "reward": 1.69375, + "reward_std": 0.13467935025691985, + "rewards/accuracy_reward/mean": 0.6875, + "rewards/accuracy_reward/std": 0.12793734967708587, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 4180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.1, + "completions/max_terminated_length": 513.1, + "completions/mean_length": 378.575, + "completions/mean_terminated_length": 378.575, + "completions/min_length": 251.3, + "completions/min_terminated_length": 251.3, + "epoch": 0.8721898417985012, + "grad_norm": 0.1551012228146172, + "kl": 0.052099609375, + "learning_rate": 3.989533844698412e-08, + "loss": 0.0021, + "num_tokens": 109063354.0, + "reward": 1.7762298107147216, + "reward_std": 0.10243196031078697, + "rewards/accuracy_reward/mean": 0.7595631256699562, + "rewards/accuracy_reward/std": 0.08766918759793044, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03563483357429505, + "step": 4190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.1, + "completions/max_terminated_length": 479.1, + "completions/mean_length": 377.55, + "completions/mean_terminated_length": 377.55, + "completions/min_length": 288.3, + "completions/min_terminated_length": 288.3, + "epoch": 0.8742714404662781, + "grad_norm": 5.053903913258786, + "kl": 0.0511474609375, + "learning_rate": 3.862531177302536e-08, + "loss": 0.002, + "num_tokens": 109324198.0, + "reward": 2.0260416746139525, + "reward_std": 0.13661691546440125, + "rewards/accuracy_reward/mean": 0.975, + "rewards/accuracy_reward/std": 0.07071067690849304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05104166679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10126157030463219, + "step": 4200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.6, + "completions/max_terminated_length": 514.6, + "completions/mean_length": 400.325, + "completions/mean_terminated_length": 400.325, + "completions/min_length": 302.7, + "completions/min_terminated_length": 302.7, + "epoch": 0.8763530391340549, + "grad_norm": 0.14070777923623243, + "kl": 0.051708984375, + "learning_rate": 3.737501595678877e-08, + "loss": 0.0021, + "num_tokens": 109582328.0, + "reward": 1.8760416746139525, + "reward_std": 0.12483179420232773, + "rewards/accuracy_reward/mean": 0.8416666686534882, + "rewards/accuracy_reward/std": 0.07071067690849304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.034375, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05412111729383469, + "step": 4210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.6, + "completions/max_terminated_length": 504.6, + "completions/mean_length": 371.075, + "completions/mean_terminated_length": 371.075, + "completions/min_length": 272.1, + "completions/min_terminated_length": 272.1, + "epoch": 0.8784346378018318, + "grad_norm": 0.14022921478443767, + "kl": 0.0544189453125, + "learning_rate": 3.6144504467633177e-08, + "loss": 0.0022, + "num_tokens": 109838110.0, + "reward": 1.8385416865348816, + "reward_std": 0.12297167479991913, + "rewards/accuracy_reward/mean": 0.7875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05104166828095913, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08761632815003395, + "step": 4220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.7, + "completions/max_terminated_length": 468.7, + "completions/mean_length": 349.05, + "completions/mean_terminated_length": 349.05, + "completions/min_length": 232.4, + "completions/min_terminated_length": 232.4, + "epoch": 0.8805162364696086, + "grad_norm": 0.17054655274531946, + "kl": 0.0545166015625, + "learning_rate": 3.493382992883376e-08, + "loss": 0.0022, + "num_tokens": 110089850.0, + "reward": 2.0010416746139525, + "reward_std": 0.20167978554964067, + "rewards/accuracy_reward/mean": 0.9375, + "rewards/accuracy_reward/std": 0.12246559858322144, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06354166679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10278442576527595, + "step": 4230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.7, + "completions/max_terminated_length": 511.7, + "completions/mean_length": 408.425, + "completions/mean_terminated_length": 408.425, + "completions/min_length": 302.5, + "completions/min_terminated_length": 302.5, + "epoch": 0.8825978351373855, + "grad_norm": 3.487135276663561, + "kl": 0.04794921875, + "learning_rate": 3.3743044115331074e-08, + "loss": 0.0019, + "num_tokens": 110337772.0, + "reward": 1.9349905014038087, + "reward_std": 0.07198155298829079, + "rewards/accuracy_reward/mean": 0.9112405106425285, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.023750000074505805, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03662621006369591, + "step": 4240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.3, + "completions/max_terminated_length": 506.3, + "completions/mean_length": 377.6125, + "completions/mean_terminated_length": 377.6125, + "completions/min_length": 267.9, + "completions/min_terminated_length": 267.9, + "epoch": 0.8846794338051623, + "grad_norm": 5.377135867331848, + "kl": 0.050146484375, + "learning_rate": 3.257219795151706e-08, + "loss": 0.002, + "num_tokens": 110570869.0, + "reward": 1.9635416746139527, + "reward_std": 0.13173307850956917, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03854166716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08544206991791725, + "step": 4250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.5, + "completions/max_terminated_length": 515.5, + "completions/mean_length": 396.6875, + "completions/mean_terminated_length": 396.6875, + "completions/min_length": 289.1, + "completions/min_terminated_length": 289.1, + "epoch": 0.8867610324729392, + "grad_norm": 0.13949685317961896, + "kl": 0.0470703125, + "learning_rate": 3.1421341509057286e-08, + "loss": 0.0019, + "num_tokens": 110818524.0, + "reward": 1.9802773118019104, + "reward_std": 0.08302186951041221, + "rewards/accuracy_reward/mean": 0.9177773147821426, + "rewards/accuracy_reward/std": 0.009221436083316803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07380043268203736, + "step": 4260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.5, + "completions/max_terminated_length": 467.5, + "completions/mean_length": 376.3, + "completions/mean_terminated_length": 376.3, + "completions/min_length": 285.1, + "completions/min_terminated_length": 285.1, + "epoch": 0.888842631140716, + "grad_norm": 5.148213381792115, + "kl": 0.050048828125, + "learning_rate": 3.029052400474946e-08, + "loss": 0.002, + "num_tokens": 111091996.0, + "reward": 1.8268019556999207, + "reward_std": 0.05303300768136978, + "rewards/accuracy_reward/mean": 0.808051960915327, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05303300768136978, + "step": 4270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.9, + "completions/max_terminated_length": 506.9, + "completions/mean_length": 394.7375, + "completions/mean_terminated_length": 394.7375, + "completions/min_length": 295.7, + "completions/min_terminated_length": 295.7, + "epoch": 0.890924229808493, + "grad_norm": 5.657416392972012, + "kl": 0.04599609375, + "learning_rate": 2.917979379841884e-08, + "loss": 0.0018, + "num_tokens": 111356711.0, + "reward": 1.96245219707489, + "reward_std": 0.0872782051563263, + "rewards/accuracy_reward/mean": 0.8841188423335552, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.07833333555608987, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06079131290316582, + "step": 4280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.6, + "completions/max_terminated_length": 499.6, + "completions/mean_length": 390.7625, + "completions/mean_terminated_length": 390.7625, + "completions/min_length": 297.9, + "completions/min_terminated_length": 297.9, + "epoch": 0.8930058284762697, + "grad_norm": 0.12242317928805745, + "kl": 0.0466796875, + "learning_rate": 2.8089198390850054e-08, + "loss": 0.0019, + "num_tokens": 111639116.0, + "reward": 1.9114583492279054, + "reward_std": 0.025469133257865907, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01145833395421505, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02546912059187889, + "step": 4290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.9, + "completions/max_terminated_length": 488.9, + "completions/mean_length": 364.4, + "completions/mean_terminated_length": 364.4, + "completions/min_length": 255.2, + "completions/min_terminated_length": 255.2, + "epoch": 0.8950874271440467, + "grad_norm": 5.237753278009117, + "kl": 0.052197265625, + "learning_rate": 2.701878442175548e-08, + "loss": 0.0021, + "num_tokens": 111890132.0, + "reward": 2.075000023841858, + "reward_std": 0.12417701482772828, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.08750000447034836, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08882166296243668, + "step": 4300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.6, + "completions/max_terminated_length": 478.6, + "completions/mean_length": 384.3875, + "completions/mean_terminated_length": 384.3875, + "completions/min_length": 279.2, + "completions/min_terminated_length": 279.2, + "epoch": 0.8971690258118235, + "grad_norm": 0.1476525406524929, + "kl": 0.04580078125, + "learning_rate": 2.59685976677812e-08, + "loss": 0.0018, + "num_tokens": 112171523.0, + "reward": 1.825, + "reward_std": 0.11700168251991272, + "rewards/accuracy_reward/mean": 0.825, + "rewards/accuracy_reward/std": 0.11700168251991272, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 4310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.1, + "completions/max_terminated_length": 482.1, + "completions/mean_length": 372.5, + "completions/mean_terminated_length": 372.5, + "completions/min_length": 268.7, + "completions/min_terminated_length": 268.7, + "epoch": 0.8992506244796004, + "grad_norm": 4.272435531356291, + "kl": 0.0462158203125, + "learning_rate": 2.493868304054858e-08, + "loss": 0.0018, + "num_tokens": 112429075.0, + "reward": 2.0447916746139527, + "reward_std": 0.2241403728723526, + "rewards/accuracy_reward/mean": 0.9375, + "rewards/accuracy_reward/std": 0.12246559858322144, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.10729166865348816, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11172061711549759, + "step": 4320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.2, + "completions/max_terminated_length": 517.2, + "completions/mean_length": 405.9875, + "completions/mean_terminated_length": 405.9875, + "completions/min_length": 321.7, + "completions/min_terminated_length": 321.7, + "epoch": 0.9013322231473772, + "grad_norm": 0.10872554480013581, + "kl": 0.049560546875, + "learning_rate": 2.3929084584734583e-08, + "loss": 0.002, + "num_tokens": 112662970.0, + "reward": 1.8549999952316285, + "reward_std": 0.17398233264684676, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 0.10606601536273956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.042500002309679985, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07622460052371025, + "step": 4330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.7, + "completions/max_terminated_length": 478.7, + "completions/mean_length": 389.0625, + "completions/mean_terminated_length": 389.0625, + "completions/min_length": 282.1, + "completions/min_terminated_length": 282.1, + "epoch": 0.9034138218151541, + "grad_norm": 0.14089223495537415, + "kl": 0.049951171875, + "learning_rate": 2.293984547618716e-08, + "loss": 0.002, + "num_tokens": 112917615.0, + "reward": 2.0250000238418577, + "reward_std": 0.05036096572875977, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02500000037252903, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05036095306277275, + "step": 4340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.4, + "completions/max_terminated_length": 507.4, + "completions/mean_length": 394.7375, + "completions/mean_terminated_length": 394.7375, + "completions/min_length": 280.5, + "completions/min_terminated_length": 280.5, + "epoch": 0.9054954204829309, + "grad_norm": 4.912712839875028, + "kl": 0.0505859375, + "learning_rate": 2.197100802007967e-08, + "loss": 0.002, + "num_tokens": 113192594.0, + "reward": 1.78125, + "reward_std": 0.18097035735845565, + "rewards/accuracy_reward/mean": 0.75, + "rewards/accuracy_reward/std": 0.09258201122283935, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0883883461356163, + "step": 4350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.5, + "completions/max_terminated_length": 472.5, + "completions/mean_length": 366.4375, + "completions/mean_terminated_length": 366.4375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.9075770191507078, + "grad_norm": 5.1684488225695855, + "kl": 0.053271484375, + "learning_rate": 2.102261364910113e-08, + "loss": 0.0021, + "num_tokens": 113466493.0, + "reward": 1.95, + "reward_std": 0.11700168251991272, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07071067690849304, + "step": 4360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.3, + "completions/max_terminated_length": 504.3, + "completions/mean_length": 391.375, + "completions/mean_terminated_length": 391.375, + "completions/min_length": 294.6, + "completions/min_terminated_length": 294.6, + "epoch": 0.9096586178184846, + "grad_norm": 4.333634506581281, + "kl": 0.049365234375, + "learning_rate": 2.009470292168458e-08, + "loss": 0.002, + "num_tokens": 113730683.0, + "reward": 1.6584088206291199, + "reward_std": 0.07365696355700493, + "rewards/accuracy_reward/mean": 0.6448671489953994, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01354166679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03830161839723587, + "step": 4370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.1, + "completions/max_terminated_length": 468.1, + "completions/mean_length": 361.65, + "completions/mean_terminated_length": 361.65, + "completions/min_length": 264.7, + "completions/min_terminated_length": 264.7, + "epoch": 0.9117402164862615, + "grad_norm": 0.7362022148606795, + "kl": 0.0502685546875, + "learning_rate": 1.9187315520272474e-08, + "loss": 0.002, + "num_tokens": 113970759.0, + "reward": 1.9650000095367433, + "reward_std": 0.18890444338321685, + "rewards/accuracy_reward/mean": 0.9125, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.052500000596046446, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.10725810527801513, + "step": 4380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.7, + "completions/max_terminated_length": 485.7, + "completions/mean_length": 369.4, + "completions/mean_terminated_length": 369.4, + "completions/min_length": 264.4, + "completions/min_terminated_length": 264.4, + "epoch": 0.9138218151540383, + "grad_norm": 0.12468607156671294, + "kl": 0.0517578125, + "learning_rate": 1.8300490249619937e-08, + "loss": 0.0021, + "num_tokens": 114219847.0, + "reward": 2.00625, + "reward_std": 0.01767766922712326, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 4390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.3, + "completions/max_terminated_length": 476.3, + "completions/mean_length": 357.1625, + "completions/mean_terminated_length": 357.1625, + "completions/min_length": 256.5, + "completions/min_terminated_length": 256.5, + "epoch": 0.9159034138218152, + "grad_norm": 4.715800520092029, + "kl": 0.0498046875, + "learning_rate": 1.743426503513462e-08, + "loss": 0.002, + "num_tokens": 114492356.0, + "reward": 2.035416674613953, + "reward_std": 0.09050626158714295, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03541666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09050625860691071, + "step": 4400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.6, + "completions/max_terminated_length": 479.6, + "completions/mean_length": 379.25, + "completions/mean_terminated_length": 379.25, + "completions/min_length": 296.2, + "completions/min_terminated_length": 296.2, + "epoch": 0.917985012489592, + "grad_norm": 0.11666760088151248, + "kl": 0.0504638671875, + "learning_rate": 1.6588676921255595e-08, + "loss": 0.002, + "num_tokens": 114754464.0, + "reward": 1.9166666746139527, + "reward_std": 0.03747325390577316, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01666666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03747325092554092, + "step": 4410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.5, + "completions/max_terminated_length": 459.5, + "completions/mean_length": 347.1125, + "completions/mean_terminated_length": 347.1125, + "completions/min_length": 246.7, + "completions/min_terminated_length": 246.7, + "epoch": 0.9200666111573689, + "grad_norm": 5.083016362521967, + "kl": 0.0513427734375, + "learning_rate": 1.5763762069868626e-08, + "loss": 0.0021, + "num_tokens": 115024633.0, + "reward": 1.9514627933502198, + "reward_std": 0.21071887612342835, + "rewards/accuracy_reward/mean": 0.8639627665281295, + "rewards/accuracy_reward/std": 0.05345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.08750000260770321, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.15726661458611488, + "step": 4420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.2, + "completions/max_terminated_length": 488.2, + "completions/mean_length": 383.3625, + "completions/mean_terminated_length": 383.3625, + "completions/min_length": 282.2, + "completions/min_terminated_length": 282.2, + "epoch": 0.9221482098251457, + "grad_norm": 0.12144909342997057, + "kl": 0.0490234375, + "learning_rate": 1.495955575875979e-08, + "loss": 0.002, + "num_tokens": 115292630.0, + "reward": 1.804674792289734, + "reward_std": 0.1397224634885788, + "rewards/accuracy_reward/mean": 0.8005081295967102, + "rewards/accuracy_reward/std": 0.12793734967708587, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00416666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01178511455655098, + "step": 4430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.3, + "completions/max_terminated_length": 478.3, + "completions/mean_length": 355.2875, + "completions/mean_terminated_length": 355.2875, + "completions/min_length": 251.4, + "completions/min_terminated_length": 251.4, + "epoch": 0.9242298084929226, + "grad_norm": 4.635990166700676, + "kl": 0.0520751953125, + "learning_rate": 1.4176092380106862e-08, + "loss": 0.0021, + "num_tokens": 115569397.0, + "reward": 1.8619230747222901, + "reward_std": 0.10850712358951568, + "rewards/accuracy_reward/mean": 0.8244230777025223, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0375, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07315178513526917, + "step": 4440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.4, + "completions/max_terminated_length": 492.4, + "completions/mean_length": 399.45, + "completions/mean_terminated_length": 399.45, + "completions/min_length": 303.8, + "completions/min_terminated_length": 303.8, + "epoch": 0.9263114071606994, + "grad_norm": 0.14106025786661242, + "kl": 0.048828125, + "learning_rate": 1.3413405439008485e-08, + "loss": 0.002, + "num_tokens": 115844865.0, + "reward": 1.9447916746139526, + "reward_std": 0.05844042226672173, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04479166865348816, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05844042524695396, + "step": 4450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.4, + "completions/max_terminated_length": 484.4, + "completions/mean_length": 369.9125, + "completions/mean_terminated_length": 369.9125, + "completions/min_length": 275.3, + "completions/min_terminated_length": 275.3, + "epoch": 0.9283930058284763, + "grad_norm": 0.14363075656265512, + "kl": 0.0481689453125, + "learning_rate": 1.2671527552051476e-08, + "loss": 0.0019, + "num_tokens": 116098690.0, + "reward": 2.037500023841858, + "reward_std": 0.07791186273097991, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05000000074505806, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09743538349866868, + "step": 4460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.9, + "completions/max_terminated_length": 504.9, + "completions/mean_length": 386.75, + "completions/mean_terminated_length": 386.75, + "completions/min_length": 273.1, + "completions/min_terminated_length": 273.1, + "epoch": 0.9304746044962531, + "grad_norm": 0.12277997686559079, + "kl": 0.0497802734375, + "learning_rate": 1.1950490445915562e-08, + "loss": 0.002, + "num_tokens": 116348966.0, + "reward": 1.60625, + "reward_std": 0.1352471113204956, + "rewards/accuracy_reward/mean": 0.5875, + "rewards/accuracy_reward/std": 0.09804592728614807, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.03720119297504425, + "step": 4470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.6, + "completions/max_terminated_length": 466.6, + "completions/mean_length": 368.3625, + "completions/mean_terminated_length": 368.3625, + "completions/min_length": 283.7, + "completions/min_terminated_length": 283.7, + "epoch": 0.93255620316403, + "grad_norm": 4.998444783741019, + "kl": 0.053369140625, + "learning_rate": 1.1250324956017021e-08, + "loss": 0.0021, + "num_tokens": 116571507.0, + "reward": 1.9235119104385376, + "reward_std": 0.13009742498397828, + "rewards/accuracy_reward/mean": 0.9047619044780731, + "rewards/accuracy_reward/std": 0.08711026012897491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05303300768136978, + "step": 4480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.7, + "completions/max_terminated_length": 476.7, + "completions/mean_length": 377.8875, + "completions/mean_terminated_length": 377.8875, + "completions/min_length": 295.8, + "completions/min_terminated_length": 295.8, + "epoch": 0.9346378018318068, + "grad_norm": 0.14538984556042098, + "kl": 0.04931640625, + "learning_rate": 1.0571061025189898e-08, + "loss": 0.002, + "num_tokens": 116851034.0, + "reward": 1.9, + "reward_std": 0.14603425860404967, + "rewards/accuracy_reward/mean": 0.875, + "rewards/accuracy_reward/std": 0.09974325299263001, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.025, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.046291005611419675, + "step": 4490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.5, + "completions/max_terminated_length": 483.5, + "completions/mean_length": 379.5875, + "completions/mean_terminated_length": 379.5875, + "completions/min_length": 273.1, + "completions/min_terminated_length": 273.1, + "epoch": 0.9367194004995837, + "grad_norm": 0.15471626446744527, + "kl": 0.0471435546875, + "learning_rate": 9.912727702405089e-09, + "loss": 0.0019, + "num_tokens": 117130249.0, + "reward": 1.7438988089561462, + "reward_std": 0.13729031383991241, + "rewards/accuracy_reward/mean": 0.6720238097012043, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.071875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.1263546496629715, + "step": 4500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.2, + "completions/max_terminated_length": 474.2, + "completions/mean_length": 348.65, + "completions/mean_terminated_length": 348.65, + "completions/min_length": 258.1, + "completions/min_terminated_length": 258.1, + "epoch": 0.9388009991673605, + "grad_norm": 0.13552586888561896, + "kl": 0.05390625, + "learning_rate": 9.275353141528719e-09, + "loss": 0.0022, + "num_tokens": 117376645.0, + "reward": 2.06875, + "reward_std": 0.09932401329278946, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.08125, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06396867483854293, + "step": 4510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.1, + "completions/max_terminated_length": 445.1, + "completions/mean_length": 345.2375, + "completions/mean_terminated_length": 345.2375, + "completions/min_length": 248.9, + "completions/min_terminated_length": 248.9, + "epoch": 0.9408825978351374, + "grad_norm": 5.084349277875754, + "kl": 0.054052734375, + "learning_rate": 8.658964600117447e-09, + "loss": 0.0022, + "num_tokens": 117625392.0, + "reward": 1.76875, + "reward_std": 0.11572359055280686, + "rewards/accuracy_reward/mean": 0.7625, + "rewards/accuracy_reward/std": 0.09804592728614807, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.01767766922712326, + "step": 4520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 373.4, + "completions/mean_terminated_length": 373.4, + "completions/min_length": 270.1, + "completions/min_terminated_length": 270.1, + "epoch": 0.9429641965029142, + "grad_norm": 0.14517700306479756, + "kl": 0.054833984375, + "learning_rate": 8.063588438253333e-09, + "loss": 0.0022, + "num_tokens": 117882216.0, + "reward": 1.7432638883590699, + "reward_std": 0.0941609364002943, + "rewards/accuracy_reward/mean": 0.7213888883590698, + "rewards/accuracy_reward/std": 0.07419009134173393, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.021875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06187184229493141, + "step": 4530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.7, + "completions/max_terminated_length": 547.7, + "completions/mean_length": 412.8125, + "completions/mean_terminated_length": 412.8125, + "completions/min_length": 308.7, + "completions/min_terminated_length": 308.7, + "epoch": 0.9450457951706911, + "grad_norm": 4.660371503146494, + "kl": 0.0468017578125, + "learning_rate": 7.489250117416301e-09, + "loss": 0.0019, + "num_tokens": 118157841.0, + "reward": 1.9556250333786012, + "reward_std": 0.18665989637374877, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.0816463440656662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06812500152736903, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.11871083304286004, + "step": 4540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.8, + "completions/max_terminated_length": 464.8, + "completions/mean_length": 356.1625, + "completions/mean_terminated_length": 356.1625, + "completions/min_length": 260.9, + "completions/min_terminated_length": 260.9, + "epoch": 0.9471273938384679, + "grad_norm": 0.1227115198824497, + "kl": 0.0503173828125, + "learning_rate": 6.935974199395123e-09, + "loss": 0.002, + "num_tokens": 118433734.0, + "reward": 1.9291666746139526, + "reward_std": 0.0709901750087738, + "rewards/accuracy_reward/mean": 0.9, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02916666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07099017202854156, + "step": 4550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.6, + "completions/max_terminated_length": 482.6, + "completions/mean_length": 380.225, + "completions/mean_terminated_length": 380.225, + "completions/min_length": 284.2, + "completions/min_terminated_length": 284.2, + "epoch": 0.9492089925062448, + "grad_norm": 0.17771861834744349, + "kl": 0.0508544921875, + "learning_rate": 6.403784345237473e-09, + "loss": 0.002, + "num_tokens": 118691128.0, + "reward": 2.040000009536743, + "reward_std": 0.04670769646763802, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.039999999850988385, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.046707694232463834, + "step": 4560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 381.675, + "completions/mean_terminated_length": 381.675, + "completions/min_length": 275.8, + "completions/min_terminated_length": 275.8, + "epoch": 0.9512905911740216, + "grad_norm": 4.884031037119129, + "kl": 0.0503662109375, + "learning_rate": 5.892703314237468e-09, + "loss": 0.002, + "num_tokens": 118937798.0, + "reward": 1.871875, + "reward_std": 0.0782714195549488, + "rewards/accuracy_reward/mean": 0.8625, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.009375, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02651650384068489, + "step": 4570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 383.35, + "completions/mean_terminated_length": 383.35, + "completions/min_length": 274.2, + "completions/min_terminated_length": 274.2, + "epoch": 0.9533721898417985, + "grad_norm": 4.1868205565680325, + "kl": 0.04984130859375, + "learning_rate": 5.402752962962887e-09, + "loss": 0.002, + "num_tokens": 119217538.0, + "reward": 1.8479166746139526, + "reward_std": 0.10860317498445511, + "rewards/accuracy_reward/mean": 0.8375, + "rewards/accuracy_reward/std": 0.08880758583545685, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.010416667163372039, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.019795581698417664, + "step": 4580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 354.875, + "completions/mean_terminated_length": 354.875, + "completions/min_length": 269.5, + "completions/min_terminated_length": 269.5, + "epoch": 0.9554537885095754, + "grad_norm": 5.821238368131713, + "kl": 0.0527587890625, + "learning_rate": 4.933954244320138e-09, + "loss": 0.0021, + "num_tokens": 119476040.0, + "reward": 1.9512932300567627, + "reward_std": 0.14620633274316788, + "rewards/accuracy_reward/mean": 0.9023348808288574, + "rewards/accuracy_reward/std": 0.051754921674728394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04895833395421505, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09445140585303306, + "step": 4590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.8, + "completions/max_terminated_length": 494.8, + "completions/mean_length": 379.55, + "completions/mean_terminated_length": 379.55, + "completions/min_length": 275.4, + "completions/min_terminated_length": 275.4, + "epoch": 0.9575353871773522, + "grad_norm": 4.872306940707049, + "kl": 0.0545166015625, + "learning_rate": 4.486327206658314e-09, + "loss": 0.0022, + "num_tokens": 119739980.0, + "reward": 1.9541666746139525, + "reward_std": 0.11295498609542846, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02916666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06666397675871849, + "step": 4600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.7, + "completions/max_terminated_length": 493.7, + "completions/mean_length": 383.775, + "completions/mean_terminated_length": 383.775, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.9596169858451291, + "grad_norm": 0.14209871790245543, + "kl": 0.0483154296875, + "learning_rate": 4.059890992911819e-09, + "loss": 0.0019, + "num_tokens": 120000802.0, + "reward": 1.9552083492279053, + "reward_std": 0.08975056856870652, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03020833432674408, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.043459554016590116, + "step": 4610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.7, + "completions/max_terminated_length": 499.7, + "completions/mean_length": 382.0375, + "completions/mean_terminated_length": 382.0375, + "completions/min_length": 269.8, + "completions/min_terminated_length": 269.8, + "epoch": 0.9616985845129059, + "grad_norm": 5.254881507252644, + "kl": 0.0486572265625, + "learning_rate": 3.6546638397817463e-09, + "loss": 0.0019, + "num_tokens": 120273181.0, + "reward": 2.0375, + "reward_std": 0.2587729513645172, + "rewards/accuracy_reward/mean": 0.9375, + "rewards/accuracy_reward/std": 0.08880758583545685, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.1, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.16996537446975707, + "step": 4620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.6, + "completions/max_terminated_length": 461.6, + "completions/mean_length": 377.8375, + "completions/mean_terminated_length": 377.8375, + "completions/min_length": 284.2, + "completions/min_terminated_length": 284.2, + "epoch": 0.9637801831806828, + "grad_norm": 0.13488627147120524, + "kl": 0.0506103515625, + "learning_rate": 3.2706630769558372e-09, + "loss": 0.002, + "num_tokens": 120528808.0, + "reward": 1.843750011920929, + "reward_std": 0.08866784423589706, + "rewards/accuracy_reward/mean": 0.808333333581686, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03541666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08866784125566482, + "step": 4630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.7, + "completions/max_terminated_length": 454.7, + "completions/mean_length": 358.6875, + "completions/mean_terminated_length": 358.6875, + "completions/min_length": 260.3, + "completions/min_terminated_length": 260.3, + "epoch": 0.9658617818484596, + "grad_norm": 5.075488307300403, + "kl": 0.0509033203125, + "learning_rate": 2.9079051263675713e-09, + "loss": 0.002, + "num_tokens": 120792255.0, + "reward": 2.015625, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward/mean": 1.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.015625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04419417306780815, + "step": 4640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.5, + "completions/max_terminated_length": 477.5, + "completions/mean_length": 391.5375, + "completions/mean_terminated_length": 391.5375, + "completions/min_length": 272.2, + "completions/min_terminated_length": 272.2, + "epoch": 0.9679433805162365, + "grad_norm": 0.1276972947224501, + "kl": 0.053271484375, + "learning_rate": 2.5664055014936738e-09, + "loss": 0.0021, + "num_tokens": 121067802.0, + "reward": 1.83125, + "reward_std": 0.19529284089803695, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 0.1595182627439499, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01875, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05303300768136978, + "step": 4650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.3, + "completions/max_terminated_length": 453.3, + "completions/mean_length": 347.075, + "completions/mean_terminated_length": 347.075, + "completions/min_length": 253.3, + "completions/min_terminated_length": 253.3, + "epoch": 0.9700249791840133, + "grad_norm": 0.16208269923991245, + "kl": 0.054052734375, + "learning_rate": 2.2461788066908127e-09, + "loss": 0.0022, + "num_tokens": 121329352.0, + "reward": 1.8862499952316285, + "reward_std": 0.2228688657283783, + "rewards/accuracy_reward/mean": 0.8375, + "rewards/accuracy_reward/std": 0.15865941643714904, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0487500011920929, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08947228491306305, + "step": 4660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.6, + "completions/max_terminated_length": 502.6, + "completions/mean_length": 395.3125, + "completions/mean_terminated_length": 395.3125, + "completions/min_length": 307.1, + "completions/min_terminated_length": 307.1, + "epoch": 0.9721065778517902, + "grad_norm": 0.13886386977374376, + "kl": 0.050390625, + "learning_rate": 1.9472387365710995e-09, + "loss": 0.002, + "num_tokens": 121598809.0, + "reward": 1.614305555820465, + "reward_std": 0.0757537841796875, + "rewards/accuracy_reward/mean": 0.6038888901472091, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.01041666679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02946278378367424, + "step": 4670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.7, + "completions/max_terminated_length": 482.7, + "completions/mean_length": 353.9875, + "completions/mean_terminated_length": 353.9875, + "completions/min_length": 245.7, + "completions/min_terminated_length": 245.7, + "epoch": 0.974188176519567, + "grad_norm": 0.17509149030292293, + "kl": 0.050244140625, + "learning_rate": 1.6695980754162231e-09, + "loss": 0.002, + "num_tokens": 121863576.0, + "reward": 2.008333349227905, + "reward_std": 0.08614101856946946, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02083333358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.05078567415475845, + "step": 4680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.1, + "completions/max_terminated_length": 503.1, + "completions/mean_length": 382.1625, + "completions/mean_terminated_length": 382.1625, + "completions/min_length": 287.6, + "completions/min_terminated_length": 287.6, + "epoch": 0.9762697751873439, + "grad_norm": 0.13843234084015008, + "kl": 0.05654296875, + "learning_rate": 1.4132686966307761e-09, + "loss": 0.0023, + "num_tokens": 122105629.0, + "reward": 1.9668092727661133, + "reward_std": 0.09535977803170681, + "rewards/accuracy_reward/mean": 0.9126425817608833, + "rewards/accuracy_reward/std": 0.003666847199201584, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05416666716337204, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.09169291257858277, + "step": 4690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.2, + "completions/max_terminated_length": 531.2, + "completions/mean_length": 380.175, + "completions/mean_terminated_length": 380.175, + "completions/min_length": 246.6, + "completions/min_terminated_length": 246.6, + "epoch": 0.9783513738551207, + "grad_norm": 0.1318637209902438, + "kl": 0.0500732421875, + "learning_rate": 1.1782615622347169e-09, + "loss": 0.002, + "num_tokens": 122371067.0, + "reward": 1.691041684150696, + "reward_std": 0.18236968517303467, + "rewards/accuracy_reward/mean": 0.6375, + "rewards/accuracy_reward/std": 0.12246559858322144, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.05354166775941849, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.059904086589813235, + "step": 4700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.6, + "completions/max_terminated_length": 488.6, + "completions/mean_length": 381.2875, + "completions/mean_terminated_length": 381.2875, + "completions/min_length": 286.8, + "completions/min_terminated_length": 286.8, + "epoch": 0.9804329725228976, + "grad_norm": 0.13333367658511264, + "kl": 0.0505615234375, + "learning_rate": 9.64586722394356e-10, + "loss": 0.002, + "num_tokens": 122644978.0, + "reward": 1.7597916722297668, + "reward_std": 0.21156217083334922, + "rewards/accuracy_reward/mean": 0.725, + "rewards/accuracy_reward/std": 0.15782093703746797, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.03479166869074106, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06111666113138199, + "step": 4710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.3, + "completions/max_terminated_length": 470.3, + "completions/mean_length": 385.225, + "completions/mean_terminated_length": 385.225, + "completions/min_length": 262.8, + "completions/min_terminated_length": 262.8, + "epoch": 0.9825145711906744, + "grad_norm": 0.13765351771426185, + "kl": 0.0531005859375, + "learning_rate": 7.722533149924771e-10, + "loss": 0.0021, + "num_tokens": 122896092.0, + "reward": 1.8212500095367432, + "reward_std": 0.12279465645551682, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 0.09804592728614807, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.008750000037252903, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0247487373650074, + "step": 4720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.1, + "completions/max_terminated_length": 524.1, + "completions/mean_length": 382.8375, + "completions/mean_terminated_length": 382.8375, + "completions/min_length": 276.8, + "completions/min_terminated_length": 276.8, + "epoch": 0.9845961698584513, + "grad_norm": 0.1636076030531112, + "kl": 0.05126953125, + "learning_rate": 6.012695652378163e-10, + "loss": 0.0021, + "num_tokens": 123137623.0, + "reward": 2.0483333349227903, + "reward_std": 0.11749013215303421, + "rewards/accuracy_reward/mean": 0.9875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0608333345502615, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.08213478401303291, + "step": 4730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.9, + "completions/max_terminated_length": 515.9, + "completions/mean_length": 371.475, + "completions/mean_terminated_length": 371.475, + "completions/min_length": 234.8, + "completions/min_terminated_length": 234.8, + "epoch": 0.9866777685262281, + "grad_norm": 0.12834150880736073, + "kl": 0.05224609375, + "learning_rate": 4.5164278531312214e-10, + "loss": 0.0021, + "num_tokens": 123391573.0, + "reward": 1.9947916746139527, + "reward_std": 0.0983980879187584, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.06979166865348815, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.07026949226856231, + "step": 4740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.4, + "completions/max_terminated_length": 484.4, + "completions/mean_length": 364.5875, + "completions/mean_terminated_length": 364.5875, + "completions/min_length": 257.5, + "completions/min_terminated_length": 257.5, + "epoch": 0.988759367194005, + "grad_norm": 4.537840716568053, + "kl": 0.05213623046875, + "learning_rate": 3.233793740625157e-10, + "loss": 0.0021, + "num_tokens": 123664276.0, + "reward": 1.9135416746139526, + "reward_std": 0.0927880972623825, + "rewards/accuracy_reward/mean": 0.8875, + "rewards/accuracy_reward/std": 0.03535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.02604166679084301, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06574104949831963, + "step": 4750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.3, + "completions/max_terminated_length": 477.3, + "completions/mean_length": 376.2, + "completions/mean_terminated_length": 376.2, + "completions/min_length": 289.8, + "completions/min_terminated_length": 289.8, + "epoch": 0.9908409658617818, + "grad_norm": 0.14229700398876058, + "kl": 0.0517578125, + "learning_rate": 2.1648481671787679e-10, + "loss": 0.0021, + "num_tokens": 123922900.0, + "reward": 1.8545833587646485, + "reward_std": 0.18718414306640624, + "rewards/accuracy_reward/mean": 0.8375, + "rewards/accuracy_reward/std": 0.1388651818037033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.017083333618938924, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04831896647810936, + "step": 4760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.6, + "completions/max_terminated_length": 466.6, + "completions/mean_length": 369.1875, + "completions/mean_terminated_length": 369.1875, + "completions/min_length": 265.5, + "completions/min_terminated_length": 265.5, + "epoch": 0.9929225645295587, + "grad_norm": 4.5997180217266225, + "kl": 0.0514404296875, + "learning_rate": 1.309636846639761e-10, + "loss": 0.0021, + "num_tokens": 124164635.0, + "reward": 1.8562393307685852, + "reward_std": 0.12708007395267487, + "rewards/accuracy_reward/mean": 0.847905983030796, + "rewards/accuracy_reward/std": 0.10350984334945679, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.00833333358168602, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.02357022911310196, + "step": 4770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.3, + "completions/max_terminated_length": 522.3, + "completions/mean_length": 375.0, + "completions/mean_terminated_length": 375.0, + "completions/min_length": 273.6, + "completions/min_terminated_length": 273.6, + "epoch": 0.9950041631973355, + "grad_norm": 0.1592605584163004, + "kl": 0.0489990234375, + "learning_rate": 6.68196352435757e-11, + "loss": 0.002, + "num_tokens": 124430099.0, + "reward": 1.9211574077606202, + "reward_std": 0.11545017808675766, + "rewards/accuracy_reward/mean": 0.8774074077606201, + "rewards/accuracy_reward/std": 0.05345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04375, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06199793368577957, + "step": 4780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.4, + "completions/max_terminated_length": 502.4, + "completions/mean_length": 394.6875, + "completions/mean_terminated_length": 394.6875, + "completions/min_length": 295.4, + "completions/min_terminated_length": 295.4, + "epoch": 0.9970857618651124, + "grad_norm": 0.1317878118742546, + "kl": 0.048974609375, + "learning_rate": 2.4055411600332197e-11, + "loss": 0.002, + "num_tokens": 124675202.0, + "reward": 1.8478713989257813, + "reward_std": 0.11516052857041359, + "rewards/accuracy_reward/mean": 0.7989130437374115, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.04895833283662796, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.06886952444911003, + "step": 4790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 356.8, + "completions/mean_terminated_length": 356.8, + "completions/min_length": 261.9, + "completions/min_terminated_length": 261.9, + "epoch": 0.9991673605328892, + "grad_norm": 0.12595486365913136, + "kl": 0.0521484375, + "learning_rate": 2.6728425620015094e-12, + "loss": 0.0021, + "num_tokens": 124950954.0, + "reward": 1.940625, + "reward_std": 0.09048517867922783, + "rewards/accuracy_reward/mean": 0.925, + "rewards/accuracy_reward/std": 0.046291005611419675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.015625, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.04419417306780815, + "step": 4800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.75, + "completions/max_terminated_length": 457.75, + "completions/mean_length": 377.71875, + "completions/mean_terminated_length": 377.71875, + "completions/min_length": 287.25, + "completions/min_terminated_length": 287.25, + "epoch": 1.0, + "kl": 0.05474853515625, + "num_tokens": 125036766.0, + "reward": 1.75, + "reward_std": 0.0, + "rewards/accuracy_reward/mean": 0.75, + "rewards/accuracy_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/mean": 0.0, + "rewards/temporal_grounding_sentence_embedding_consistency_reward/std": 0.0, + "step": 4804, + "total_flos": 0.0, + "train_loss": 0.002239806222792197, + "train_runtime": 195994.6659, + "train_samples_per_second": 0.025, + "train_steps_per_second": 0.025 + } + ], + "logging_steps": 10, + "max_steps": 4804, + "num_input_tokens_seen": 125036766, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}