Reacherx commited on
Commit
f4de988
·
verified ·
1 Parent(s): 0e4e9ad

Training in progress, step 900, checkpoint

Browse files
Files changed (28) hide show
  1. last-checkpoint/global_step900/zero_pp_rank_0_mp_rank_00_model_states.pt +3 -0
  2. last-checkpoint/global_step900/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  3. last-checkpoint/global_step900/zero_pp_rank_1_mp_rank_00_model_states.pt +3 -0
  4. last-checkpoint/global_step900/zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  5. last-checkpoint/global_step900/zero_pp_rank_2_mp_rank_00_model_states.pt +3 -0
  6. last-checkpoint/global_step900/zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  7. last-checkpoint/global_step900/zero_pp_rank_3_mp_rank_00_model_states.pt +3 -0
  8. last-checkpoint/global_step900/zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  9. last-checkpoint/global_step900/zero_pp_rank_4_mp_rank_00_model_states.pt +3 -0
  10. last-checkpoint/global_step900/zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  11. last-checkpoint/global_step900/zero_pp_rank_5_mp_rank_00_model_states.pt +3 -0
  12. last-checkpoint/global_step900/zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  13. last-checkpoint/global_step900/zero_pp_rank_6_mp_rank_00_model_states.pt +3 -0
  14. last-checkpoint/global_step900/zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  15. last-checkpoint/latest +1 -1
  16. last-checkpoint/model-00001-of-00004.safetensors +1 -1
  17. last-checkpoint/model-00002-of-00004.safetensors +1 -1
  18. last-checkpoint/model-00003-of-00004.safetensors +1 -1
  19. last-checkpoint/model-00004-of-00004.safetensors +1 -1
  20. last-checkpoint/rng_state_0.pth +1 -1
  21. last-checkpoint/rng_state_1.pth +1 -1
  22. last-checkpoint/rng_state_2.pth +1 -1
  23. last-checkpoint/rng_state_3.pth +1 -1
  24. last-checkpoint/rng_state_4.pth +1 -1
  25. last-checkpoint/rng_state_5.pth +1 -1
  26. last-checkpoint/rng_state_6.pth +1 -1
  27. last-checkpoint/scheduler.pt +1 -1
  28. last-checkpoint/trainer_state.json +1602 -2
last-checkpoint/global_step900/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05fe1b88528bc46c4cda49bc4a01e42631384a5ea55da387a14f37bb9f785760
3
+ size 349379
last-checkpoint/global_step900/zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:119684ca6eaa3304b994cb0ca6c2c609e538093f85761f264540a302c687be22
3
+ size 14215152302
last-checkpoint/global_step900/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:413314a6a3536e2f73fe9573223a0cbb1e8f9cdd371e04a01a424d018c570781
3
+ size 349379
last-checkpoint/global_step900/zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b92cf0ad31e096c86ab41dd988952e16a54bc7f14e74a94f660354a7eee1914
3
+ size 14215152302
last-checkpoint/global_step900/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9b322a9d34c580b99dca38dbfa503d05445f05e0c8bff821677bd7cddc1a9e6
3
+ size 349379
last-checkpoint/global_step900/zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bb8a8779ed7b2dc721eeb785d5c61873aaf0e9ed94ba9ff998093c74752f20d
3
+ size 14215152302
last-checkpoint/global_step900/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87a017205025f69240e8cc591a021675de1f51493494e1a1c697aa668c064cab
3
+ size 349379
last-checkpoint/global_step900/zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:384c0989a10a126c5c7d7a7b9f72ef35dfbc398c32123846e2e07689adb315ed
3
+ size 14215152302
last-checkpoint/global_step900/zero_pp_rank_4_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0213f55026cc7c7de26d71c0bb2024ddec98ebc52ac67317eccef8c445e0966
3
+ size 349379
last-checkpoint/global_step900/zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:734e259c2a11627b96a7376440289a0348721cd16e144afe1cb7adff22cdb64c
3
+ size 14215152302
last-checkpoint/global_step900/zero_pp_rank_5_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36c4ead9b8b685f4c23f1b9a5bbc53944f33083843794dc22b7d5be6fc0e4658
3
+ size 349379
last-checkpoint/global_step900/zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f60d5aabd902cfd76d5ba9511081474460690487134a6d64ffa3925cf0e289f3
3
+ size 14215152302
last-checkpoint/global_step900/zero_pp_rank_6_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e336a3fc477735f2e6b4433383835db995837cbe62f524c6bd77f33cd0809b55
3
+ size 349379
last-checkpoint/global_step900/zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8da30a63edbc2ef896c289a68bd85a2ba5fa2d18271b4b4308d28e04462d33d2
3
+ size 14215152302
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step800
 
1
+ global_step900
last-checkpoint/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88b65313b7da3a700baa760ac5ad8add99bf390086f8c5154512d7f1da169a9d
3
  size 4968242840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d5584d7d8ab8fce5f0e749132bbd3603c179d400f9070f72b405c26541a0715
3
  size 4968242840
last-checkpoint/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e901aeec62b6b83bb95a7940a5886b722fadf06414d94b0b8a379200b022b62b
3
  size 4991495688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bd51885a17a33d56a018bb7e738b8b0c7a07ac55b6bd73b2b6ed36b2e7574a5
3
  size 4991495688
last-checkpoint/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e6857ecc27e007eaee9d8929ecb5ad6d623203cdd2d0366bfff04bb5884cffbd
3
  size 4932750920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8292604de962a9f1ec484afd5166e29c0f1f034160f26d3a7baaa899b90a1470
3
  size 4932750920
last-checkpoint/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bcb835b11499a67cbf19802321b68ea3e3ee7ca21c9d1b5fe90cd6a20076083e
3
  size 1691924368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:088ee4e5f47e8e54e48083a414d467ece0362d7407c886cb087a8147a723d99a
3
  size 1691924368
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:675484fa00046dc2cfc706e877a51328ad54a9e32d261ee56392e9fd31660503
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbd53c58df8917500ea3a32f627b80f3093bc83d5395e17696aff262f3065ed1
3
  size 15920
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:786c9e6af7c37c4359662349def8af02e5383f25fcb70587757cd07292496180
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d2c713e5d5a3e9e920fa84d9b959311a0590a9d39ac1eb57d11c7092870f80c
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f489e1aa25e7abfdb7ee5b7c42c2024a2c130b51238f90ea60b89beba32e7255
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:275171564de734632c71fef1b2e29ca6bf24c1b021438853efcba459a4fe3f11
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a2071fd54ef3a300d73f7c38e490c9b8dc45eb00aa992f5bd451002c8a0715d
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38d4d671511a83b2975532f867cdacdb632409e481223d65aab40d4a541d5c1c
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1aec1ccbad836bfadbe6f7022031a9ba302619cc19a2f0e2f40dfd6545649f79
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4964e7fca7c3842d7e50a180f381cc988728d1d3219dbac2badaa370566a6ce6
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2c61b099807b3bc5138fdd05f9c2bc743ce34640dc17724698924b5f742c39d
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:545620e1f74e91eaf4d0ab187a660a53712e5d0ab01ed4b4923841a0ae812429
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2713a28e3b13111ad1726481a868b28e9c89fbcc86eb856b1e45c849c3e0b5e6
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a36705df35ce87e1a2f3b1395aa333e8f140488116fb59129918c2657e4b071
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:173695d232d5cd26ad7781b9459c96a820f64aeb38a786dcb3286d0a8ab79468
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5af295ce1b3dcf465f79b8946f0d8fa680faeb4b08aa2de4700def074a628db
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.24479804161566707,
5
  "eval_steps": 500,
6
- "global_step": 800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -12807,6 +12807,1606 @@
12807
  "rewards/format_reward": 1.0,
12808
  "step": 800,
12809
  "temporal_rewards": 1.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12810
  }
12811
  ],
12812
  "logging_steps": 1.0,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.27539779681762544,
5
  "eval_steps": 500,
6
+ "global_step": 900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
12807
  "rewards/format_reward": 1.0,
12808
  "step": 800,
12809
  "temporal_rewards": 1.0
12810
+ },
12811
+ {
12812
+ "all_correct": 0.42857142857142855,
12813
+ "all_wrong": 0.0,
12814
+ "completion_length": 207.0357208251953,
12815
+ "epoch": 0.24510403916768667,
12816
+ "grad_norm": 2.9281723699853845,
12817
+ "kl": 0.0657958984375,
12818
+ "learning_rate": 8.622788645990524e-07,
12819
+ "loss": 0.0026,
12820
+ "reward": 2.047179937362671,
12821
+ "reward_std": 0.2067583054304123,
12822
+ "rewards/accuracy_reward": 0.7293230295181274,
12823
+ "rewards/format_reward": 1.0,
12824
+ "step": 801,
12825
+ "temporal_rewards": 1.0
12826
+ },
12827
+ {
12828
+ "all_correct": 0.14285714285714285,
12829
+ "all_wrong": 0.42857142857142855,
12830
+ "completion_length": 179.07144165039062,
12831
+ "epoch": 0.24541003671970624,
12832
+ "grad_norm": 4.134309138458019,
12833
+ "kl": 0.06689453125,
12834
+ "learning_rate": 8.619474197126057e-07,
12835
+ "loss": 0.0027,
12836
+ "reward": 1.4867311716079712,
12837
+ "reward_std": 0.10263003408908844,
12838
+ "rewards/accuracy_reward": 0.32958826422691345,
12839
+ "rewards/format_reward": 1.0,
12840
+ "step": 802,
12841
+ "temporal_rewards": 0.8571428656578064
12842
+ },
12843
+ {
12844
+ "all_correct": 0.14285714285714285,
12845
+ "all_wrong": 0.0,
12846
+ "completion_length": 148.30357360839844,
12847
+ "epoch": 0.24571603427172584,
12848
+ "grad_norm": 4.751302590383965,
12849
+ "kl": 0.07086181640625,
12850
+ "learning_rate": 8.616156403377282e-07,
12851
+ "loss": 0.0028,
12852
+ "reward": 1.7162678241729736,
12853
+ "reward_std": 0.32455798983573914,
12854
+ "rewards/accuracy_reward": 0.5037676692008972,
12855
+ "rewards/format_reward": 1.0,
12856
+ "step": 803,
12857
+ "temporal_rewards": 0.8571428656578064
12858
+ },
12859
+ {
12860
+ "all_correct": 0.14285714285714285,
12861
+ "all_wrong": 0.14285714285714285,
12862
+ "completion_length": 227.0535888671875,
12863
+ "epoch": 0.2460220318237454,
12864
+ "grad_norm": 5.991111880083119,
12865
+ "kl": 0.0748291015625,
12866
+ "learning_rate": 8.612835267810286e-07,
12867
+ "loss": 0.003,
12868
+ "reward": 1.684024453163147,
12869
+ "reward_std": 0.1964358240365982,
12870
+ "rewards/accuracy_reward": 0.4233100712299347,
12871
+ "rewards/format_reward": 0.9821429252624512,
12872
+ "step": 804,
12873
+ "temporal_rewards": 1.0
12874
+ },
12875
+ {
12876
+ "all_correct": 0.14285714285714285,
12877
+ "all_wrong": 0.0,
12878
+ "completion_length": 168.83929443359375,
12879
+ "epoch": 0.246328029375765,
12880
+ "grad_norm": 9.316771002327046,
12881
+ "kl": 0.12939453125,
12882
+ "learning_rate": 8.609510793494254e-07,
12883
+ "loss": 0.0052,
12884
+ "reward": 1.8607145547866821,
12885
+ "reward_std": 0.22949722409248352,
12886
+ "rewards/accuracy_reward": 0.594643235206604,
12887
+ "rewards/format_reward": 0.9821429252624512,
12888
+ "step": 805,
12889
+ "temporal_rewards": 1.0
12890
+ },
12891
+ {
12892
+ "all_correct": 0.14285714285714285,
12893
+ "all_wrong": 0.2857142857142857,
12894
+ "completion_length": 148.375,
12895
+ "epoch": 0.24663402692778458,
12896
+ "grad_norm": 2.536873948888703,
12897
+ "kl": 0.07196044921875,
12898
+ "learning_rate": 8.606182983501446e-07,
12899
+ "loss": 0.0029,
12900
+ "reward": 1.4161813259124756,
12901
+ "reward_std": 0.2212689369916916,
12902
+ "rewards/accuracy_reward": 0.31260988116264343,
12903
+ "rewards/format_reward": 1.0,
12904
+ "step": 806,
12905
+ "temporal_rewards": 0.8571428656578064
12906
+ },
12907
+ {
12908
+ "all_correct": 0.14285714285714285,
12909
+ "all_wrong": 0.2857142857142857,
12910
+ "completion_length": 239.0357208251953,
12911
+ "epoch": 0.24694002447980415,
12912
+ "grad_norm": 2.2992367339007482,
12913
+ "kl": 0.08148193359375,
12914
+ "learning_rate": 8.602851840907212e-07,
12915
+ "loss": 0.0033,
12916
+ "reward": 1.5467685461044312,
12917
+ "reward_std": 0.09967747330665588,
12918
+ "rewards/accuracy_reward": 0.2967683970928192,
12919
+ "rewards/format_reward": 0.9821429252624512,
12920
+ "step": 807,
12921
+ "temporal_rewards": 0.8571428656578064
12922
+ },
12923
+ {
12924
+ "all_correct": 0.14285714285714285,
12925
+ "all_wrong": 0.2857142857142857,
12926
+ "completion_length": 261.6785888671875,
12927
+ "epoch": 0.24724602203182375,
12928
+ "grad_norm": 2.6163619560973426,
12929
+ "kl": 0.06671142578125,
12930
+ "learning_rate": 8.599517368789979e-07,
12931
+ "loss": 0.0027,
12932
+ "reward": 1.467833399772644,
12933
+ "reward_std": 0.15854676067829132,
12934
+ "rewards/accuracy_reward": 0.267833411693573,
12935
+ "rewards/format_reward": 1.0,
12936
+ "step": 808,
12937
+ "temporal_rewards": 0.714285671710968
12938
+ },
12939
+ {
12940
+ "all_correct": 0.42857142857142855,
12941
+ "all_wrong": 0.0,
12942
+ "completion_length": 245.71429443359375,
12943
+ "epoch": 0.24755201958384332,
12944
+ "grad_norm": 1.876303446324673,
12945
+ "kl": 0.0562744140625,
12946
+ "learning_rate": 8.596179570231248e-07,
12947
+ "loss": 0.0023,
12948
+ "reward": 1.8534244298934937,
12949
+ "reward_std": 0.17602498829364777,
12950
+ "rewards/accuracy_reward": 0.5623528361320496,
12951
+ "rewards/format_reward": 1.0,
12952
+ "step": 809,
12953
+ "temporal_rewards": 0.8571428656578064
12954
+ },
12955
+ {
12956
+ "all_correct": 0.42857142857142855,
12957
+ "all_wrong": 0.2857142857142857,
12958
+ "completion_length": 160.96429443359375,
12959
+ "epoch": 0.24785801713586292,
12960
+ "grad_norm": 1.983593074435128,
12961
+ "kl": 0.0673828125,
12962
+ "learning_rate": 8.592838448315599e-07,
12963
+ "loss": 0.0027,
12964
+ "reward": 1.7673360109329224,
12965
+ "reward_std": 0.03303138539195061,
12966
+ "rewards/accuracy_reward": 0.5459073185920715,
12967
+ "rewards/format_reward": 1.0,
12968
+ "step": 810,
12969
+ "temporal_rewards": 0.8571428656578064
12970
+ },
12971
+ {
12972
+ "all_correct": 0.14285714285714285,
12973
+ "all_wrong": 0.14285714285714285,
12974
+ "completion_length": 232.12501525878906,
12975
+ "epoch": 0.2481640146878825,
12976
+ "grad_norm": 3.0190110438589124,
12977
+ "kl": 0.06878662109375,
12978
+ "learning_rate": 8.589494006130679e-07,
12979
+ "loss": 0.0028,
12980
+ "reward": 1.5021908283233643,
12981
+ "reward_std": 0.12059634923934937,
12982
+ "rewards/accuracy_reward": 0.3129049837589264,
12983
+ "rewards/format_reward": 1.0,
12984
+ "step": 811,
12985
+ "temporal_rewards": 0.714285671710968
12986
+ },
12987
+ {
12988
+ "all_correct": 0.42857142857142855,
12989
+ "all_wrong": 0.0,
12990
+ "completion_length": 286.26788330078125,
12991
+ "epoch": 0.2484700122399021,
12992
+ "grad_norm": 1.7801210792367876,
12993
+ "kl": 0.08331298828125,
12994
+ "learning_rate": 8.58614624676721e-07,
12995
+ "loss": 0.0033,
12996
+ "reward": 1.9375736713409424,
12997
+ "reward_std": 0.1051332876086235,
12998
+ "rewards/accuracy_reward": 0.6411451101303101,
12999
+ "rewards/format_reward": 1.0,
13000
+ "step": 812,
13001
+ "temporal_rewards": 0.714285671710968
13002
+ },
13003
+ {
13004
+ "all_correct": 0.2857142857142857,
13005
+ "all_wrong": 0.14285714285714285,
13006
+ "completion_length": 252.19644165039062,
13007
+ "epoch": 0.24877600979192166,
13008
+ "grad_norm": 2.9742718067292033,
13009
+ "kl": 0.062042236328125,
13010
+ "learning_rate": 8.582795173318969e-07,
13011
+ "loss": 0.0025,
13012
+ "reward": 1.7696127891540527,
13013
+ "reward_std": 0.12452299147844315,
13014
+ "rewards/accuracy_reward": 0.4981841444969177,
13015
+ "rewards/format_reward": 1.0,
13016
+ "step": 813,
13017
+ "temporal_rewards": 0.8571428656578064
13018
+ },
13019
+ {
13020
+ "all_correct": 0.14285714285714285,
13021
+ "all_wrong": 0.2857142857142857,
13022
+ "completion_length": 256.75,
13023
+ "epoch": 0.24908200734394126,
13024
+ "grad_norm": 2.2704735528170277,
13025
+ "kl": 0.07305908203125,
13026
+ "learning_rate": 8.579440788882806e-07,
13027
+ "loss": 0.0029,
13028
+ "reward": 1.5697615146636963,
13029
+ "reward_std": 0.06264042109251022,
13030
+ "rewards/accuracy_reward": 0.3036900758743286,
13031
+ "rewards/format_reward": 1.0,
13032
+ "step": 814,
13033
+ "temporal_rewards": 1.0
13034
+ },
13035
+ {
13036
+ "all_correct": 0.5714285714285714,
13037
+ "all_wrong": 0.0,
13038
+ "completion_length": 221.60714721679688,
13039
+ "epoch": 0.24938800489596083,
13040
+ "grad_norm": 1.623507735144211,
13041
+ "kl": 0.062255859375,
13042
+ "learning_rate": 8.576083096558624e-07,
13043
+ "loss": 0.0025,
13044
+ "reward": 1.9773122072219849,
13045
+ "reward_std": 0.08393806964159012,
13046
+ "rewards/accuracy_reward": 0.6844549775123596,
13047
+ "rewards/format_reward": 1.0,
13048
+ "step": 815,
13049
+ "temporal_rewards": 0.8571428656578064
13050
+ },
13051
+ {
13052
+ "all_correct": 0.2857142857142857,
13053
+ "all_wrong": 0.0,
13054
+ "completion_length": 206.00001525878906,
13055
+ "epoch": 0.24969400244798043,
13056
+ "grad_norm": 2.1482537052968547,
13057
+ "kl": 0.0755615234375,
13058
+ "learning_rate": 8.572722099449388e-07,
13059
+ "loss": 0.003,
13060
+ "reward": 1.7175023555755615,
13061
+ "reward_std": 0.11727368831634521,
13062
+ "rewards/accuracy_reward": 0.4639308750629425,
13063
+ "rewards/format_reward": 0.9821429252624512,
13064
+ "step": 816,
13065
+ "temporal_rewards": 0.714285671710968
13066
+ },
13067
+ {
13068
+ "all_correct": 0.2857142857142857,
13069
+ "all_wrong": 0.0,
13070
+ "completion_length": 222.7857208251953,
13071
+ "epoch": 0.25,
13072
+ "grad_norm": 4.595475850035951,
13073
+ "kl": 0.0740966796875,
13074
+ "learning_rate": 8.569357800661111e-07,
13075
+ "loss": 0.003,
13076
+ "reward": 1.8679395914077759,
13077
+ "reward_std": 0.12349120527505875,
13078
+ "rewards/accuracy_reward": 0.5572252869606018,
13079
+ "rewards/format_reward": 1.0,
13080
+ "step": 817,
13081
+ "temporal_rewards": 0.8571428656578064
13082
+ },
13083
+ {
13084
+ "all_correct": 0.2857142857142857,
13085
+ "all_wrong": 0.0,
13086
+ "completion_length": 183.33929443359375,
13087
+ "epoch": 0.2503059975520196,
13088
+ "grad_norm": 3.5483816862668784,
13089
+ "kl": 0.1090087890625,
13090
+ "learning_rate": 8.565990203302864e-07,
13091
+ "loss": 0.0044,
13092
+ "reward": 1.993505597114563,
13093
+ "reward_std": 0.18882004916667938,
13094
+ "rewards/accuracy_reward": 0.7095768451690674,
13095
+ "rewards/format_reward": 1.0,
13096
+ "step": 818,
13097
+ "temporal_rewards": 0.8571428656578064
13098
+ },
13099
+ {
13100
+ "all_correct": 0.14285714285714285,
13101
+ "all_wrong": 0.0,
13102
+ "completion_length": 366.2321472167969,
13103
+ "epoch": 0.25061199510403914,
13104
+ "grad_norm": 4.026645332394576,
13105
+ "kl": 0.0811767578125,
13106
+ "learning_rate": 8.562619310486763e-07,
13107
+ "loss": 0.0032,
13108
+ "reward": 1.833251714706421,
13109
+ "reward_std": 0.04854978993535042,
13110
+ "rewards/accuracy_reward": 0.3868231177330017,
13111
+ "rewards/format_reward": 1.0,
13112
+ "step": 819,
13113
+ "temporal_rewards": 1.0
13114
+ },
13115
+ {
13116
+ "all_correct": 0.2857142857142857,
13117
+ "all_wrong": 0.0,
13118
+ "completion_length": 243.7857208251953,
13119
+ "epoch": 0.25091799265605874,
13120
+ "grad_norm": 4.76954015263035,
13121
+ "kl": 0.06488037109375,
13122
+ "learning_rate": 8.559245125327965e-07,
13123
+ "loss": 0.0026,
13124
+ "reward": 1.9217417240142822,
13125
+ "reward_std": 0.25500017404556274,
13126
+ "rewards/accuracy_reward": 0.6449559330940247,
13127
+ "rewards/format_reward": 1.0,
13128
+ "step": 820,
13129
+ "temporal_rewards": 0.8571428656578064
13130
+ },
13131
+ {
13132
+ "all_correct": 0.2857142857142857,
13133
+ "all_wrong": 0.2857142857142857,
13134
+ "completion_length": 213.19644165039062,
13135
+ "epoch": 0.25122399020807834,
13136
+ "grad_norm": 1.8782121986183875,
13137
+ "kl": 0.07269287109375,
13138
+ "learning_rate": 8.55586765094468e-07,
13139
+ "loss": 0.0029,
13140
+ "reward": 1.7066212892532349,
13141
+ "reward_std": 0.1408018320798874,
13142
+ "rewards/accuracy_reward": 0.48697829246520996,
13143
+ "rewards/format_reward": 1.0,
13144
+ "step": 821,
13145
+ "temporal_rewards": 1.0
13146
+ },
13147
+ {
13148
+ "all_correct": 0.42857142857142855,
13149
+ "all_wrong": 0.0,
13150
+ "completion_length": 154.46429443359375,
13151
+ "epoch": 0.25152998776009794,
13152
+ "grad_norm": 2.802553457575392,
13153
+ "kl": 0.059600830078125,
13154
+ "learning_rate": 8.552486890458146e-07,
13155
+ "loss": 0.0024,
13156
+ "reward": 1.9007517099380493,
13157
+ "reward_std": 0.17006878554821014,
13158
+ "rewards/accuracy_reward": 0.6811087727546692,
13159
+ "rewards/format_reward": 1.0,
13160
+ "step": 822,
13161
+ "temporal_rewards": 0.8571428656578064
13162
+ },
13163
+ {
13164
+ "all_correct": 0.42857142857142855,
13165
+ "all_wrong": 0.2857142857142857,
13166
+ "completion_length": 254.5535888671875,
13167
+ "epoch": 0.2518359853121175,
13168
+ "grad_norm": 2.2104544083520765,
13169
+ "kl": 0.06500244140625,
13170
+ "learning_rate": 8.549102846992649e-07,
13171
+ "loss": 0.0026,
13172
+ "reward": 1.7576632499694824,
13173
+ "reward_std": 0.028187856078147888,
13174
+ "rewards/accuracy_reward": 0.4862346649169922,
13175
+ "rewards/format_reward": 1.0,
13176
+ "step": 823,
13177
+ "temporal_rewards": 1.0
13178
+ },
13179
+ {
13180
+ "all_correct": 0.42857142857142855,
13181
+ "all_wrong": 0.0,
13182
+ "completion_length": 139.6607208251953,
13183
+ "epoch": 0.2521419828641371,
13184
+ "grad_norm": 3.899024552852272,
13185
+ "kl": 0.0740966796875,
13186
+ "learning_rate": 8.5457155236755e-07,
13187
+ "loss": 0.003,
13188
+ "reward": 1.9015222787857056,
13189
+ "reward_std": 0.16740469634532928,
13190
+ "rewards/accuracy_reward": 0.7265222072601318,
13191
+ "rewards/format_reward": 1.0,
13192
+ "step": 824,
13193
+ "temporal_rewards": 0.5714285373687744
13194
+ },
13195
+ {
13196
+ "all_correct": 0.14285714285714285,
13197
+ "all_wrong": 0.2857142857142857,
13198
+ "completion_length": 182.2678680419922,
13199
+ "epoch": 0.2524479804161567,
13200
+ "grad_norm": 3.813733151425986,
13201
+ "kl": 0.080322265625,
13202
+ "learning_rate": 8.542324923637045e-07,
13203
+ "loss": 0.0032,
13204
+ "reward": 1.6101853847503662,
13205
+ "reward_std": 0.07268591970205307,
13206
+ "rewards/accuracy_reward": 0.41732820868492126,
13207
+ "rewards/format_reward": 1.0,
13208
+ "step": 825,
13209
+ "temporal_rewards": 0.8571428656578064
13210
+ },
13211
+ {
13212
+ "all_correct": 0.0,
13213
+ "all_wrong": 0.14285714285714285,
13214
+ "completion_length": 193.80357360839844,
13215
+ "epoch": 0.2527539779681763,
13216
+ "grad_norm": 2.3299012963911707,
13217
+ "kl": 0.06707763671875,
13218
+ "learning_rate": 8.538931050010659e-07,
13219
+ "loss": 0.0027,
13220
+ "reward": 1.664027452468872,
13221
+ "reward_std": 0.2882187068462372,
13222
+ "rewards/accuracy_reward": 0.5425989031791687,
13223
+ "rewards/format_reward": 1.0,
13224
+ "step": 826,
13225
+ "temporal_rewards": 0.5714285373687744
13226
+ },
13227
+ {
13228
+ "all_correct": 0.2857142857142857,
13229
+ "all_wrong": 0.2857142857142857,
13230
+ "completion_length": 156.67857360839844,
13231
+ "epoch": 0.2530599755201958,
13232
+ "grad_norm": 2.555255650663871,
13233
+ "kl": 0.06451416015625,
13234
+ "learning_rate": 8.535533905932737e-07,
13235
+ "loss": 0.0026,
13236
+ "reward": 1.6659547090530396,
13237
+ "reward_std": 0.1807582676410675,
13238
+ "rewards/accuracy_reward": 0.5177403688430786,
13239
+ "rewards/format_reward": 1.0,
13240
+ "step": 827,
13241
+ "temporal_rewards": 0.8571428656578064
13242
+ },
13243
+ {
13244
+ "all_correct": 0.5714285714285714,
13245
+ "all_wrong": 0.14285714285714285,
13246
+ "completion_length": 120.48214721679688,
13247
+ "epoch": 0.2533659730722154,
13248
+ "grad_norm": 5.737746204248841,
13249
+ "kl": 0.079833984375,
13250
+ "learning_rate": 8.532133494542705e-07,
13251
+ "loss": 0.0032,
13252
+ "reward": 1.9454082250595093,
13253
+ "reward_std": 0.00841815210878849,
13254
+ "rewards/accuracy_reward": 0.6882654428482056,
13255
+ "rewards/format_reward": 1.0,
13256
+ "step": 828,
13257
+ "temporal_rewards": 0.8571428656578064
13258
+ },
13259
+ {
13260
+ "all_correct": 0.2857142857142857,
13261
+ "all_wrong": 0.0,
13262
+ "completion_length": 211.7678680419922,
13263
+ "epoch": 0.253671970624235,
13264
+ "grad_norm": 3.114541528996617,
13265
+ "kl": 0.06292724609375,
13266
+ "learning_rate": 8.528729818983e-07,
13267
+ "loss": 0.0025,
13268
+ "reward": 1.7419655323028564,
13269
+ "reward_std": 0.3375326693058014,
13270
+ "rewards/accuracy_reward": 0.5526796579360962,
13271
+ "rewards/format_reward": 0.9642857313156128,
13272
+ "step": 829,
13273
+ "temporal_rewards": 0.5714285373687744
13274
+ },
13275
+ {
13276
+ "all_correct": 0.2857142857142857,
13277
+ "all_wrong": 0.2857142857142857,
13278
+ "completion_length": 173.35714721679688,
13279
+ "epoch": 0.25397796817625456,
13280
+ "grad_norm": 67.6969972123076,
13281
+ "kl": 0.068115234375,
13282
+ "learning_rate": 8.525322882399082e-07,
13283
+ "loss": 0.0027,
13284
+ "reward": 1.6851218938827515,
13285
+ "reward_std": 0.09688737243413925,
13286
+ "rewards/accuracy_reward": 0.48512178659439087,
13287
+ "rewards/format_reward": 1.0,
13288
+ "step": 830,
13289
+ "temporal_rewards": 0.714285671710968
13290
+ },
13291
+ {
13292
+ "all_correct": 0.42857142857142855,
13293
+ "all_wrong": 0.14285714285714285,
13294
+ "completion_length": 178.55357360839844,
13295
+ "epoch": 0.25428396572827416,
13296
+ "grad_norm": 1.4363689395781298,
13297
+ "kl": 0.0780029296875,
13298
+ "learning_rate": 8.52191268793942e-07,
13299
+ "loss": 0.0031,
13300
+ "reward": 1.9075700044631958,
13301
+ "reward_std": 0.028506727889180183,
13302
+ "rewards/accuracy_reward": 0.6254270672798157,
13303
+ "rewards/format_reward": 1.0,
13304
+ "step": 831,
13305
+ "temporal_rewards": 1.0
13306
+ },
13307
+ {
13308
+ "all_correct": 0.2857142857142857,
13309
+ "all_wrong": 0.14285714285714285,
13310
+ "completion_length": 191.42857360839844,
13311
+ "epoch": 0.25458996328029376,
13312
+ "grad_norm": 3.293920291757542,
13313
+ "kl": 0.08331298828125,
13314
+ "learning_rate": 8.518499238755496e-07,
13315
+ "loss": 0.0033,
13316
+ "reward": 1.807334542274475,
13317
+ "reward_std": 0.17562764883041382,
13318
+ "rewards/accuracy_reward": 0.54662024974823,
13319
+ "rewards/format_reward": 0.9821429252624512,
13320
+ "step": 832,
13321
+ "temporal_rewards": 1.0
13322
+ },
13323
+ {
13324
+ "all_correct": 0.14285714285714285,
13325
+ "all_wrong": 0.14285714285714285,
13326
+ "completion_length": 261.51788330078125,
13327
+ "epoch": 0.25489596083231336,
13328
+ "grad_norm": 2.5882653330593115,
13329
+ "kl": 0.076171875,
13330
+ "learning_rate": 8.515082538001798e-07,
13331
+ "loss": 0.003,
13332
+ "reward": 1.561861276626587,
13333
+ "reward_std": 0.2398105412721634,
13334
+ "rewards/accuracy_reward": 0.3225754499435425,
13335
+ "rewards/format_reward": 0.9642857313156128,
13336
+ "step": 833,
13337
+ "temporal_rewards": 0.714285671710968
13338
+ },
13339
+ {
13340
+ "all_correct": 0.14285714285714285,
13341
+ "all_wrong": 0.2857142857142857,
13342
+ "completion_length": 179.7678680419922,
13343
+ "epoch": 0.2552019583843329,
13344
+ "grad_norm": 2.0572353102018313,
13345
+ "kl": 0.0810546875,
13346
+ "learning_rate": 8.511662588835823e-07,
13347
+ "loss": 0.0032,
13348
+ "reward": 1.509498953819275,
13349
+ "reward_std": 0.19672635197639465,
13350
+ "rewards/accuracy_reward": 0.32199880480766296,
13351
+ "rewards/format_reward": 0.9821429252624512,
13352
+ "step": 834,
13353
+ "temporal_rewards": 1.0
13354
+ },
13355
+ {
13356
+ "all_correct": 0.42857142857142855,
13357
+ "all_wrong": 0.2857142857142857,
13358
+ "completion_length": 183.33929443359375,
13359
+ "epoch": 0.2555079559363525,
13360
+ "grad_norm": 1.9340557459916523,
13361
+ "kl": 0.0640869140625,
13362
+ "learning_rate": 8.508239394418064e-07,
13363
+ "loss": 0.0026,
13364
+ "reward": 1.706633448600769,
13365
+ "reward_std": 0.06095283478498459,
13366
+ "rewards/accuracy_reward": 0.4923476576805115,
13367
+ "rewards/format_reward": 0.9821429252624512,
13368
+ "step": 835,
13369
+ "temporal_rewards": 1.0
13370
+ },
13371
+ {
13372
+ "all_correct": 0.5714285714285714,
13373
+ "all_wrong": 0.0,
13374
+ "completion_length": 137.07144165039062,
13375
+ "epoch": 0.2558139534883721,
13376
+ "grad_norm": 3.9407094375455114,
13377
+ "kl": 0.054473876953125,
13378
+ "learning_rate": 8.504812957912018e-07,
13379
+ "loss": 0.0022,
13380
+ "reward": 1.9000000953674316,
13381
+ "reward_std": 0.23574510216712952,
13382
+ "rewards/accuracy_reward": 0.6964285969734192,
13383
+ "rewards/format_reward": 1.0,
13384
+ "step": 836,
13385
+ "temporal_rewards": 0.8571428656578064
13386
+ },
13387
+ {
13388
+ "all_correct": 0.14285714285714285,
13389
+ "all_wrong": 0.0,
13390
+ "completion_length": 188.94644165039062,
13391
+ "epoch": 0.2561199510403917,
13392
+ "grad_norm": 4.029147261922757,
13393
+ "kl": 0.0792236328125,
13394
+ "learning_rate": 8.501383282484176e-07,
13395
+ "loss": 0.0032,
13396
+ "reward": 1.6178863048553467,
13397
+ "reward_std": 0.11477590352296829,
13398
+ "rewards/accuracy_reward": 0.3321720063686371,
13399
+ "rewards/format_reward": 1.0,
13400
+ "step": 837,
13401
+ "temporal_rewards": 1.0
13402
+ },
13403
+ {
13404
+ "all_correct": 0.42857142857142855,
13405
+ "all_wrong": 0.0,
13406
+ "completion_length": 205.98214721679688,
13407
+ "epoch": 0.25642594859241125,
13408
+ "grad_norm": 11.476990775610078,
13409
+ "kl": 0.071044921875,
13410
+ "learning_rate": 8.497950371304023e-07,
13411
+ "loss": 0.0028,
13412
+ "reward": 2.0091044902801514,
13413
+ "reward_std": 0.22341029345989227,
13414
+ "rewards/accuracy_reward": 0.7216044068336487,
13415
+ "rewards/format_reward": 1.0,
13416
+ "step": 838,
13417
+ "temporal_rewards": 1.0
13418
+ },
13419
+ {
13420
+ "all_correct": 0.2857142857142857,
13421
+ "all_wrong": 0.42857142857142855,
13422
+ "completion_length": 125.21429443359375,
13423
+ "epoch": 0.25673194614443084,
13424
+ "grad_norm": 1.7000676273670414,
13425
+ "kl": 0.07147216796875,
13426
+ "learning_rate": 8.494514227544034e-07,
13427
+ "loss": 0.0029,
13428
+ "reward": 1.5309561491012573,
13429
+ "reward_std": 0.07276370376348495,
13430
+ "rewards/accuracy_reward": 0.37917035818099976,
13431
+ "rewards/format_reward": 0.9821429252624512,
13432
+ "step": 839,
13433
+ "temporal_rewards": 0.8571428656578064
13434
+ },
13435
+ {
13436
+ "all_correct": 0.42857142857142855,
13437
+ "all_wrong": 0.0,
13438
+ "completion_length": 249.60714721679688,
13439
+ "epoch": 0.25703794369645044,
13440
+ "grad_norm": 2.175181403729273,
13441
+ "kl": 0.1136474609375,
13442
+ "learning_rate": 8.491074854379671e-07,
13443
+ "loss": 0.0045,
13444
+ "reward": 1.8630489110946655,
13445
+ "reward_std": 0.2111286222934723,
13446
+ "rewards/accuracy_reward": 0.586263120174408,
13447
+ "rewards/format_reward": 0.9285714626312256,
13448
+ "step": 840,
13449
+ "temporal_rewards": 1.0
13450
+ },
13451
+ {
13452
+ "all_correct": 0.5714285714285714,
13453
+ "all_wrong": 0.0,
13454
+ "completion_length": 171.73214721679688,
13455
+ "epoch": 0.25734394124847,
13456
+ "grad_norm": 3.892248304212202,
13457
+ "kl": 0.07586669921875,
13458
+ "learning_rate": 8.487632254989379e-07,
13459
+ "loss": 0.003,
13460
+ "reward": 1.962437391281128,
13461
+ "reward_std": 0.15512388944625854,
13462
+ "rewards/accuracy_reward": 0.6874372959136963,
13463
+ "rewards/format_reward": 0.9821429252624512,
13464
+ "step": 841,
13465
+ "temporal_rewards": 1.0
13466
+ },
13467
+ {
13468
+ "all_correct": 0.14285714285714285,
13469
+ "all_wrong": 0.14285714285714285,
13470
+ "completion_length": 166.9107208251953,
13471
+ "epoch": 0.2576499388004896,
13472
+ "grad_norm": 2.675161584719784,
13473
+ "kl": 0.08343505859375,
13474
+ "learning_rate": 8.484186432554586e-07,
13475
+ "loss": 0.0033,
13476
+ "reward": 1.5791218280792236,
13477
+ "reward_std": 0.13287338614463806,
13478
+ "rewards/accuracy_reward": 0.36126458644866943,
13479
+ "rewards/format_reward": 0.9821429252624512,
13480
+ "step": 842,
13481
+ "temporal_rewards": 0.8571428656578064
13482
+ },
13483
+ {
13484
+ "all_correct": 0.14285714285714285,
13485
+ "all_wrong": 0.2857142857142857,
13486
+ "completion_length": 221.23214721679688,
13487
+ "epoch": 0.2579559363525092,
13488
+ "grad_norm": 2.02453164520589,
13489
+ "kl": 0.086669921875,
13490
+ "learning_rate": 8.480737390259702e-07,
13491
+ "loss": 0.0035,
13492
+ "reward": 1.5743305683135986,
13493
+ "reward_std": 0.09532348066568375,
13494
+ "rewards/accuracy_reward": 0.32433053851127625,
13495
+ "rewards/format_reward": 0.9821429252624512,
13496
+ "step": 843,
13497
+ "temporal_rewards": 1.0
13498
+ },
13499
+ {
13500
+ "all_correct": 0.42857142857142855,
13501
+ "all_wrong": 0.0,
13502
+ "completion_length": 114.85714721679688,
13503
+ "epoch": 0.2582619339045288,
13504
+ "grad_norm": 7.990069827061059,
13505
+ "kl": 0.06591796875,
13506
+ "learning_rate": 8.477285131292107e-07,
13507
+ "loss": 0.0026,
13508
+ "reward": 1.89658522605896,
13509
+ "reward_std": 0.1812773495912552,
13510
+ "rewards/accuracy_reward": 0.6501566767692566,
13511
+ "rewards/format_reward": 1.0,
13512
+ "step": 844,
13513
+ "temporal_rewards": 1.0
13514
+ },
13515
+ {
13516
+ "all_correct": 0.2857142857142857,
13517
+ "all_wrong": 0.0,
13518
+ "completion_length": 275.3571472167969,
13519
+ "epoch": 0.25856793145654833,
13520
+ "grad_norm": 4.670054056493445,
13521
+ "kl": 0.06304931640625,
13522
+ "learning_rate": 8.473829658842153e-07,
13523
+ "loss": 0.0025,
13524
+ "reward": 1.863919734954834,
13525
+ "reward_std": 0.10707426816225052,
13526
+ "rewards/accuracy_reward": 0.5067769289016724,
13527
+ "rewards/format_reward": 0.9821429252624512,
13528
+ "step": 845,
13529
+ "temporal_rewards": 1.0
13530
+ },
13531
+ {
13532
+ "all_correct": 0.14285714285714285,
13533
+ "all_wrong": 0.0,
13534
+ "completion_length": 190.7857208251953,
13535
+ "epoch": 0.2588739290085679,
13536
+ "grad_norm": 3.696883423375172,
13537
+ "kl": 0.09039306640625,
13538
+ "learning_rate": 8.47037097610317e-07,
13539
+ "loss": 0.0036,
13540
+ "reward": 1.8291937112808228,
13541
+ "reward_std": 0.30012595653533936,
13542
+ "rewards/accuracy_reward": 0.5559793710708618,
13543
+ "rewards/format_reward": 0.9821429252624512,
13544
+ "step": 846,
13545
+ "temporal_rewards": 1.0
13546
+ },
13547
+ {
13548
+ "all_correct": 0.14285714285714285,
13549
+ "all_wrong": 0.0,
13550
+ "completion_length": 205.21429443359375,
13551
+ "epoch": 0.2591799265605875,
13552
+ "grad_norm": 2.92257295296219,
13553
+ "kl": 0.09735107421875,
13554
+ "learning_rate": 8.466909086271443e-07,
13555
+ "loss": 0.0039,
13556
+ "reward": 1.9195644855499268,
13557
+ "reward_std": 0.17363081872463226,
13558
+ "rewards/accuracy_reward": 0.6017073392868042,
13559
+ "rewards/format_reward": 1.0,
13560
+ "step": 847,
13561
+ "temporal_rewards": 1.0
13562
+ },
13563
+ {
13564
+ "all_correct": 0.2857142857142857,
13565
+ "all_wrong": 0.14285714285714285,
13566
+ "completion_length": 166.2857208251953,
13567
+ "epoch": 0.2594859241126071,
13568
+ "grad_norm": 4.839827480327019,
13569
+ "kl": 0.06756591796875,
13570
+ "learning_rate": 8.463443992546234e-07,
13571
+ "loss": 0.0027,
13572
+ "reward": 1.7144334316253662,
13573
+ "reward_std": 0.2880959212779999,
13574
+ "rewards/accuracy_reward": 0.5072907209396362,
13575
+ "rewards/format_reward": 1.0,
13576
+ "step": 848,
13577
+ "temporal_rewards": 1.0
13578
+ },
13579
+ {
13580
+ "all_correct": 0.0,
13581
+ "all_wrong": 0.14285714285714285,
13582
+ "completion_length": 135.55357360839844,
13583
+ "epoch": 0.25979192166462667,
13584
+ "grad_norm": 5.993414281952189,
13585
+ "kl": 0.06500244140625,
13586
+ "learning_rate": 8.459975698129753e-07,
13587
+ "loss": 0.0026,
13588
+ "reward": 1.5761994123458862,
13589
+ "reward_std": 0.23083187639713287,
13590
+ "rewards/accuracy_reward": 0.39941367506980896,
13591
+ "rewards/format_reward": 1.0,
13592
+ "step": 849,
13593
+ "temporal_rewards": 1.0
13594
+ },
13595
+ {
13596
+ "all_correct": 0.7142857142857143,
13597
+ "all_wrong": 0.0,
13598
+ "completion_length": 209.3928680419922,
13599
+ "epoch": 0.26009791921664627,
13600
+ "grad_norm": 1.1340119659986048,
13601
+ "kl": 0.081298828125,
13602
+ "learning_rate": 8.456504206227177e-07,
13603
+ "loss": 0.0033,
13604
+ "reward": 2.096266746520996,
13605
+ "reward_std": 0.08216449618339539,
13606
+ "rewards/accuracy_reward": 0.7694810032844543,
13607
+ "rewards/format_reward": 0.9821429252624512,
13608
+ "step": 850,
13609
+ "temporal_rewards": 1.0
13610
+ },
13611
+ {
13612
+ "all_correct": 0.2857142857142857,
13613
+ "all_wrong": 0.0,
13614
+ "completion_length": 109.80357360839844,
13615
+ "epoch": 0.26040391676866587,
13616
+ "grad_norm": 4.222691355310761,
13617
+ "kl": 0.0751953125,
13618
+ "learning_rate": 8.453029520046634e-07,
13619
+ "loss": 0.003,
13620
+ "reward": 1.9611623287200928,
13621
+ "reward_std": 0.14106816053390503,
13622
+ "rewards/accuracy_reward": 0.677233874797821,
13623
+ "rewards/format_reward": 1.0,
13624
+ "step": 851,
13625
+ "temporal_rewards": 1.0
13626
+ },
13627
+ {
13628
+ "all_correct": 0.5714285714285714,
13629
+ "all_wrong": 0.0,
13630
+ "completion_length": 138.8928680419922,
13631
+ "epoch": 0.2607099143206854,
13632
+ "grad_norm": 2.9274501393456176,
13633
+ "kl": 0.06640625,
13634
+ "learning_rate": 8.449551642799204e-07,
13635
+ "loss": 0.0027,
13636
+ "reward": 2.114285707473755,
13637
+ "reward_std": 0.27120646834373474,
13638
+ "rewards/accuracy_reward": 0.8571429252624512,
13639
+ "rewards/format_reward": 1.0,
13640
+ "step": 852,
13641
+ "temporal_rewards": 1.0
13642
+ },
13643
+ {
13644
+ "all_correct": 0.2857142857142857,
13645
+ "all_wrong": 0.2857142857142857,
13646
+ "completion_length": 217.67857360839844,
13647
+ "epoch": 0.261015911872705,
13648
+ "grad_norm": 1.926086918566909,
13649
+ "kl": 0.078369140625,
13650
+ "learning_rate": 8.446070577698915e-07,
13651
+ "loss": 0.0031,
13652
+ "reward": 1.6957752704620361,
13653
+ "reward_std": 0.04128192737698555,
13654
+ "rewards/accuracy_reward": 0.42791807651519775,
13655
+ "rewards/format_reward": 1.0,
13656
+ "step": 853,
13657
+ "temporal_rewards": 1.0
13658
+ },
13659
+ {
13660
+ "all_correct": 0.14285714285714285,
13661
+ "all_wrong": 0.14285714285714285,
13662
+ "completion_length": 218.55357360839844,
13663
+ "epoch": 0.2613219094247246,
13664
+ "grad_norm": 3.7384806739684255,
13665
+ "kl": 0.075927734375,
13666
+ "learning_rate": 8.442586327962746e-07,
13667
+ "loss": 0.003,
13668
+ "reward": 1.6708444356918335,
13669
+ "reward_std": 0.19737550616264343,
13670
+ "rewards/accuracy_reward": 0.4119158089160919,
13671
+ "rewards/format_reward": 0.9821429252624512,
13672
+ "step": 854,
13673
+ "temporal_rewards": 0.8571428656578064
13674
+ },
13675
+ {
13676
+ "all_correct": 0.14285714285714285,
13677
+ "all_wrong": 0.14285714285714285,
13678
+ "completion_length": 190.7678680419922,
13679
+ "epoch": 0.2616279069767442,
13680
+ "grad_norm": 4.253646327546919,
13681
+ "kl": 0.07208251953125,
13682
+ "learning_rate": 8.439098896810614e-07,
13683
+ "loss": 0.0029,
13684
+ "reward": 1.5890089273452759,
13685
+ "reward_std": 0.2665659487247467,
13686
+ "rewards/accuracy_reward": 0.4193660020828247,
13687
+ "rewards/format_reward": 1.0,
13688
+ "step": 855,
13689
+ "temporal_rewards": 0.714285671710968
13690
+ },
13691
+ {
13692
+ "all_correct": 0.42857142857142855,
13693
+ "all_wrong": 0.14285714285714285,
13694
+ "completion_length": 209.08929443359375,
13695
+ "epoch": 0.26193390452876375,
13696
+ "grad_norm": 7.368057295625036,
13697
+ "kl": 0.09344482421875,
13698
+ "learning_rate": 8.435608287465376e-07,
13699
+ "loss": 0.0037,
13700
+ "reward": 1.756385087966919,
13701
+ "reward_std": 0.22209163010120392,
13702
+ "rewards/accuracy_reward": 0.5438850522041321,
13703
+ "rewards/format_reward": 0.9464285969734192,
13704
+ "step": 856,
13705
+ "temporal_rewards": 1.0
13706
+ },
13707
+ {
13708
+ "all_correct": 0.0,
13709
+ "all_wrong": 0.14285714285714285,
13710
+ "completion_length": 275.625,
13711
+ "epoch": 0.26223990208078335,
13712
+ "grad_norm": 2.9501313048424405,
13713
+ "kl": 0.0986328125,
13714
+ "learning_rate": 8.43211450315283e-07,
13715
+ "loss": 0.0039,
13716
+ "reward": 1.6245818138122559,
13717
+ "reward_std": 0.288144052028656,
13718
+ "rewards/accuracy_reward": 0.38529595732688904,
13719
+ "rewards/format_reward": 0.9642857313156128,
13720
+ "step": 857,
13721
+ "temporal_rewards": 0.8571428656578064
13722
+ },
13723
+ {
13724
+ "all_correct": 0.14285714285714285,
13725
+ "all_wrong": 0.0,
13726
+ "completion_length": 256.8214416503906,
13727
+ "epoch": 0.26254589963280295,
13728
+ "grad_norm": 4.252341850089602,
13729
+ "kl": 0.0767822265625,
13730
+ "learning_rate": 8.428617547101705e-07,
13731
+ "loss": 0.0031,
13732
+ "reward": 1.735701560974121,
13733
+ "reward_std": 0.12338480353355408,
13734
+ "rewards/accuracy_reward": 0.444629967212677,
13735
+ "rewards/format_reward": 1.0,
13736
+ "step": 858,
13737
+ "temporal_rewards": 0.8571428656578064
13738
+ },
13739
+ {
13740
+ "all_correct": 0.42857142857142855,
13741
+ "all_wrong": 0.0,
13742
+ "completion_length": 260.14288330078125,
13743
+ "epoch": 0.26285189718482255,
13744
+ "grad_norm": 2.0754843149035613,
13745
+ "kl": 0.09075927734375,
13746
+ "learning_rate": 8.425117422543662e-07,
13747
+ "loss": 0.0036,
13748
+ "reward": 1.9669040441513062,
13749
+ "reward_std": 0.21213708817958832,
13750
+ "rewards/accuracy_reward": 0.6383326649665833,
13751
+ "rewards/format_reward": 0.9821429252624512,
13752
+ "step": 859,
13753
+ "temporal_rewards": 1.0
13754
+ },
13755
+ {
13756
+ "all_correct": 0.42857142857142855,
13757
+ "all_wrong": 0.14285714285714285,
13758
+ "completion_length": 209.7857208251953,
13759
+ "epoch": 0.2631578947368421,
13760
+ "grad_norm": 2.1579992600990177,
13761
+ "kl": 0.07696533203125,
13762
+ "learning_rate": 8.421614132713291e-07,
13763
+ "loss": 0.0031,
13764
+ "reward": 1.8583227396011353,
13765
+ "reward_std": 0.029287781566381454,
13766
+ "rewards/accuracy_reward": 0.5511797666549683,
13767
+ "rewards/format_reward": 1.0,
13768
+ "step": 860,
13769
+ "temporal_rewards": 1.0
13770
+ },
13771
+ {
13772
+ "all_correct": 0.14285714285714285,
13773
+ "all_wrong": 0.2857142857142857,
13774
+ "completion_length": 175.25001525878906,
13775
+ "epoch": 0.2634638922888617,
13776
+ "grad_norm": 2.271360656952098,
13777
+ "kl": 0.064208984375,
13778
+ "learning_rate": 8.418107680848106e-07,
13779
+ "loss": 0.0026,
13780
+ "reward": 1.578763723373413,
13781
+ "reward_std": 0.08131464570760727,
13782
+ "rewards/accuracy_reward": 0.34840652346611023,
13783
+ "rewards/format_reward": 1.0,
13784
+ "step": 861,
13785
+ "temporal_rewards": 1.0
13786
+ },
13787
+ {
13788
+ "all_correct": 0.2857142857142857,
13789
+ "all_wrong": 0.0,
13790
+ "completion_length": 181.87501525878906,
13791
+ "epoch": 0.2637698898408813,
13792
+ "grad_norm": 3.4384761464367695,
13793
+ "kl": 0.0611572265625,
13794
+ "learning_rate": 8.414598070188541e-07,
13795
+ "loss": 0.0024,
13796
+ "reward": 1.7330061197280884,
13797
+ "reward_std": 0.16418743133544922,
13798
+ "rewards/accuracy_reward": 0.488363116979599,
13799
+ "rewards/format_reward": 1.0,
13800
+ "step": 862,
13801
+ "temporal_rewards": 0.8571428656578064
13802
+ },
13803
+ {
13804
+ "all_correct": 0.2857142857142857,
13805
+ "all_wrong": 0.0,
13806
+ "completion_length": 242.6607208251953,
13807
+ "epoch": 0.26407588739290083,
13808
+ "grad_norm": 5.694668293113178,
13809
+ "kl": 0.07659912109375,
13810
+ "learning_rate": 8.411085303977954e-07,
13811
+ "loss": 0.0031,
13812
+ "reward": 1.8662446737289429,
13813
+ "reward_std": 0.2061271071434021,
13814
+ "rewards/accuracy_reward": 0.5251731872558594,
13815
+ "rewards/format_reward": 0.9821429252624512,
13816
+ "step": 863,
13817
+ "temporal_rewards": 1.0
13818
+ },
13819
+ {
13820
+ "all_correct": 0.14285714285714285,
13821
+ "all_wrong": 0.14285714285714285,
13822
+ "completion_length": 208.46429443359375,
13823
+ "epoch": 0.26438188494492043,
13824
+ "grad_norm": 3.3670758880528613,
13825
+ "kl": 0.08154296875,
13826
+ "learning_rate": 8.407569385462614e-07,
13827
+ "loss": 0.0033,
13828
+ "reward": 1.8434805870056152,
13829
+ "reward_std": 0.27712368965148926,
13830
+ "rewards/accuracy_reward": 0.5720521807670593,
13831
+ "rewards/format_reward": 0.9821429252624512,
13832
+ "step": 864,
13833
+ "temporal_rewards": 1.0
13834
+ },
13835
+ {
13836
+ "all_correct": 0.2857142857142857,
13837
+ "all_wrong": 0.0,
13838
+ "completion_length": 238.94644165039062,
13839
+ "epoch": 0.26468788249694003,
13840
+ "grad_norm": 3.0431561776514036,
13841
+ "kl": 0.08380126953125,
13842
+ "learning_rate": 8.40405031789171e-07,
13843
+ "loss": 0.0034,
13844
+ "reward": 1.9124431610107422,
13845
+ "reward_std": 0.06434178352355957,
13846
+ "rewards/accuracy_reward": 0.5338715314865112,
13847
+ "rewards/format_reward": 1.0,
13848
+ "step": 865,
13849
+ "temporal_rewards": 1.0
13850
+ },
13851
+ {
13852
+ "all_correct": 0.42857142857142855,
13853
+ "all_wrong": 0.0,
13854
+ "completion_length": 121.92857360839844,
13855
+ "epoch": 0.26499388004895963,
13856
+ "grad_norm": 3.4914420316550565,
13857
+ "kl": 0.07379150390625,
13858
+ "learning_rate": 8.400528104517335e-07,
13859
+ "loss": 0.003,
13860
+ "reward": 1.9986642599105835,
13861
+ "reward_std": 0.189077690243721,
13862
+ "rewards/accuracy_reward": 0.7308071851730347,
13863
+ "rewards/format_reward": 1.0,
13864
+ "step": 866,
13865
+ "temporal_rewards": 1.0
13866
+ },
13867
+ {
13868
+ "all_correct": 0.2857142857142857,
13869
+ "all_wrong": 0.0,
13870
+ "completion_length": 198.8928680419922,
13871
+ "epoch": 0.2652998776009792,
13872
+ "grad_norm": 4.592100148789711,
13873
+ "kl": 0.073974609375,
13874
+ "learning_rate": 8.39700274859449e-07,
13875
+ "loss": 0.003,
13876
+ "reward": 1.7516790628433228,
13877
+ "reward_std": 0.18580719828605652,
13878
+ "rewards/accuracy_reward": 0.487393319606781,
13879
+ "rewards/format_reward": 1.0,
13880
+ "step": 867,
13881
+ "temporal_rewards": 0.8571428656578064
13882
+ },
13883
+ {
13884
+ "all_correct": 0.2857142857142857,
13885
+ "all_wrong": 0.14285714285714285,
13886
+ "completion_length": 131.33929443359375,
13887
+ "epoch": 0.26560587515299877,
13888
+ "grad_norm": 3.07398773389383,
13889
+ "kl": 0.06939697265625,
13890
+ "learning_rate": 8.393474253381081e-07,
13891
+ "loss": 0.0028,
13892
+ "reward": 1.7859539985656738,
13893
+ "reward_std": 0.10873201489448547,
13894
+ "rewards/accuracy_reward": 0.571668267250061,
13895
+ "rewards/format_reward": 1.0,
13896
+ "step": 868,
13897
+ "temporal_rewards": 0.8571428656578064
13898
+ },
13899
+ {
13900
+ "all_correct": 0.14285714285714285,
13901
+ "all_wrong": 0.14285714285714285,
13902
+ "completion_length": 252.7857208251953,
13903
+ "epoch": 0.26591187270501837,
13904
+ "grad_norm": 5.434152016124052,
13905
+ "kl": 0.074462890625,
13906
+ "learning_rate": 8.389942622137917e-07,
13907
+ "loss": 0.003,
13908
+ "reward": 1.732815146446228,
13909
+ "reward_std": 0.04574719816446304,
13910
+ "rewards/accuracy_reward": 0.3971007168292999,
13911
+ "rewards/format_reward": 1.0,
13912
+ "step": 869,
13913
+ "temporal_rewards": 0.8571428656578064
13914
+ },
13915
+ {
13916
+ "all_correct": 0.42857142857142855,
13917
+ "all_wrong": 0.14285714285714285,
13918
+ "completion_length": 163.5178680419922,
13919
+ "epoch": 0.26621787025703797,
13920
+ "grad_norm": 2.5700748516679117,
13921
+ "kl": 0.07745361328125,
13922
+ "learning_rate": 8.386407858128706e-07,
13923
+ "loss": 0.0031,
13924
+ "reward": 1.822379231452942,
13925
+ "reward_std": 0.09795420616865158,
13926
+ "rewards/accuracy_reward": 0.5830934643745422,
13927
+ "rewards/format_reward": 1.0,
13928
+ "step": 870,
13929
+ "temporal_rewards": 0.714285671710968
13930
+ },
13931
+ {
13932
+ "all_correct": 0.0,
13933
+ "all_wrong": 0.14285714285714285,
13934
+ "completion_length": 308.71429443359375,
13935
+ "epoch": 0.2665238678090575,
13936
+ "grad_norm": 11.736614064127435,
13937
+ "kl": 0.373779296875,
13938
+ "learning_rate": 8.382869964620043e-07,
13939
+ "loss": 0.015,
13940
+ "reward": 1.428949236869812,
13941
+ "reward_std": 0.24033333361148834,
13942
+ "rewards/accuracy_reward": 0.17359207570552826,
13943
+ "rewards/format_reward": 0.9642857313156128,
13944
+ "step": 871,
13945
+ "temporal_rewards": 0.8571428656578064
13946
+ },
13947
+ {
13948
+ "all_correct": 0.0,
13949
+ "all_wrong": 0.0,
13950
+ "completion_length": 295.6607360839844,
13951
+ "epoch": 0.2668298653610771,
13952
+ "grad_norm": 3.7904993786293204,
13953
+ "kl": 0.09088134765625,
13954
+ "learning_rate": 8.379328944881423e-07,
13955
+ "loss": 0.0036,
13956
+ "reward": 1.7479736804962158,
13957
+ "reward_std": 0.1676667034626007,
13958
+ "rewards/accuracy_reward": 0.3872593343257904,
13959
+ "rewards/format_reward": 0.9642857313156128,
13960
+ "step": 872,
13961
+ "temporal_rewards": 1.0
13962
+ },
13963
+ {
13964
+ "all_correct": 0.0,
13965
+ "all_wrong": 0.0,
13966
+ "completion_length": 348.8571472167969,
13967
+ "epoch": 0.2671358629130967,
13968
+ "grad_norm": 2.2910548072438606,
13969
+ "kl": 0.081787109375,
13970
+ "learning_rate": 8.375784802185231e-07,
13971
+ "loss": 0.0033,
13972
+ "reward": 1.61097252368927,
13973
+ "reward_std": 0.27649515867233276,
13974
+ "rewards/accuracy_reward": 0.3288295865058899,
13975
+ "rewards/format_reward": 0.9642857313156128,
13976
+ "step": 873,
13977
+ "temporal_rewards": 0.8571428656578064
13978
+ },
13979
+ {
13980
+ "all_correct": 0.2857142857142857,
13981
+ "all_wrong": 0.2857142857142857,
13982
+ "completion_length": 137.33929443359375,
13983
+ "epoch": 0.26744186046511625,
13984
+ "grad_norm": 9.477587189657482,
13985
+ "kl": 0.1707763671875,
13986
+ "learning_rate": 8.372237539806729e-07,
13987
+ "loss": 0.0068,
13988
+ "reward": 1.7725310325622559,
13989
+ "reward_std": 0.10993125289678574,
13990
+ "rewards/accuracy_reward": 0.5653882026672363,
13991
+ "rewards/format_reward": 1.0,
13992
+ "step": 874,
13993
+ "temporal_rewards": 1.0
13994
+ },
13995
+ {
13996
+ "all_correct": 0.42857142857142855,
13997
+ "all_wrong": 0.14285714285714285,
13998
+ "completion_length": 139.67857360839844,
13999
+ "epoch": 0.26774785801713585,
14000
+ "grad_norm": 3.0317667233515655,
14001
+ "kl": 0.072021484375,
14002
+ "learning_rate": 8.36868716102407e-07,
14003
+ "loss": 0.0029,
14004
+ "reward": 1.7030521631240845,
14005
+ "reward_std": 0.12748095393180847,
14006
+ "rewards/accuracy_reward": 0.5262664556503296,
14007
+ "rewards/format_reward": 1.0,
14008
+ "step": 875,
14009
+ "temporal_rewards": 0.8571428656578064
14010
+ },
14011
+ {
14012
+ "all_correct": 0.14285714285714285,
14013
+ "all_wrong": 0.14285714285714285,
14014
+ "completion_length": 209.87501525878906,
14015
+ "epoch": 0.26805385556915545,
14016
+ "grad_norm": 2.3417260897973846,
14017
+ "kl": 0.09869384765625,
14018
+ "learning_rate": 8.365133669118289e-07,
14019
+ "loss": 0.0039,
14020
+ "reward": 1.5527209043502808,
14021
+ "reward_std": 0.14238658547401428,
14022
+ "rewards/accuracy_reward": 0.302720844745636,
14023
+ "rewards/format_reward": 0.9821429252624512,
14024
+ "step": 876,
14025
+ "temporal_rewards": 0.714285671710968
14026
+ },
14027
+ {
14028
+ "all_correct": 0.5714285714285714,
14029
+ "all_wrong": 0.0,
14030
+ "completion_length": 221.87501525878906,
14031
+ "epoch": 0.26835985312117505,
14032
+ "grad_norm": 1.4544164125883434,
14033
+ "kl": 0.0880126953125,
14034
+ "learning_rate": 8.361577067373289e-07,
14035
+ "loss": 0.0035,
14036
+ "reward": 1.9879584312438965,
14037
+ "reward_std": 0.06715244054794312,
14038
+ "rewards/accuracy_reward": 0.6915298700332642,
14039
+ "rewards/format_reward": 0.9821429252624512,
14040
+ "step": 877,
14041
+ "temporal_rewards": 0.8571428656578064
14042
+ },
14043
+ {
14044
+ "all_correct": 0.0,
14045
+ "all_wrong": 0.0,
14046
+ "completion_length": 137.6428680419922,
14047
+ "epoch": 0.2686658506731946,
14048
+ "grad_norm": 4.041225889721134,
14049
+ "kl": 0.07952880859375,
14050
+ "learning_rate": 8.358017359075853e-07,
14051
+ "loss": 0.0032,
14052
+ "reward": 1.634163737297058,
14053
+ "reward_std": 0.3102668523788452,
14054
+ "rewards/accuracy_reward": 0.4466637969017029,
14055
+ "rewards/format_reward": 1.0,
14056
+ "step": 878,
14057
+ "temporal_rewards": 1.0
14058
+ },
14059
+ {
14060
+ "all_correct": 0.42857142857142855,
14061
+ "all_wrong": 0.0,
14062
+ "completion_length": 202.58929443359375,
14063
+ "epoch": 0.2689718482252142,
14064
+ "grad_norm": 2.865476731408168,
14065
+ "kl": 0.071044921875,
14066
+ "learning_rate": 8.354454547515632e-07,
14067
+ "loss": 0.0028,
14068
+ "reward": 1.960200309753418,
14069
+ "reward_std": 0.12427835166454315,
14070
+ "rewards/accuracy_reward": 0.6244859099388123,
14071
+ "rewards/format_reward": 1.0,
14072
+ "step": 879,
14073
+ "temporal_rewards": 1.0
14074
+ },
14075
+ {
14076
+ "all_correct": 0.14285714285714285,
14077
+ "all_wrong": 0.14285714285714285,
14078
+ "completion_length": 230.96429443359375,
14079
+ "epoch": 0.2692778457772338,
14080
+ "grad_norm": 4.795281271108613,
14081
+ "kl": 0.07501220703125,
14082
+ "learning_rate": 8.35088863598515e-07,
14083
+ "loss": 0.003,
14084
+ "reward": 1.6301624774932861,
14085
+ "reward_std": 0.20454014837741852,
14086
+ "rewards/accuracy_reward": 0.42301949858665466,
14087
+ "rewards/format_reward": 1.0,
14088
+ "step": 880,
14089
+ "temporal_rewards": 0.8571428656578064
14090
+ },
14091
+ {
14092
+ "all_correct": 0.2857142857142857,
14093
+ "all_wrong": 0.0,
14094
+ "completion_length": 174.5178680419922,
14095
+ "epoch": 0.2695838433292534,
14096
+ "grad_norm": 22.401908979877813,
14097
+ "kl": 1.875,
14098
+ "learning_rate": 8.347319627779788e-07,
14099
+ "loss": 0.075,
14100
+ "reward": 1.7476712465286255,
14101
+ "reward_std": 0.35592812299728394,
14102
+ "rewards/accuracy_reward": 0.5530281662940979,
14103
+ "rewards/format_reward": 0.9642857313156128,
14104
+ "step": 881,
14105
+ "temporal_rewards": 0.8571428656578064
14106
+ },
14107
+ {
14108
+ "all_correct": 0.14285714285714285,
14109
+ "all_wrong": 0.0,
14110
+ "completion_length": 189.82144165039062,
14111
+ "epoch": 0.26988984088127294,
14112
+ "grad_norm": 10.071232157212238,
14113
+ "kl": 0.0775146484375,
14114
+ "learning_rate": 8.343747526197796e-07,
14115
+ "loss": 0.0031,
14116
+ "reward": 1.7986595630645752,
14117
+ "reward_std": 0.3468095064163208,
14118
+ "rewards/accuracy_reward": 0.545087993144989,
14119
+ "rewards/format_reward": 1.0,
14120
+ "step": 882,
14121
+ "temporal_rewards": 1.0
14122
+ },
14123
+ {
14124
+ "all_correct": 0.2857142857142857,
14125
+ "all_wrong": 0.14285714285714285,
14126
+ "completion_length": 136.7857208251953,
14127
+ "epoch": 0.27019583843329253,
14128
+ "grad_norm": 5.112804545427567,
14129
+ "kl": 0.07208251953125,
14130
+ "learning_rate": 8.340172334540279e-07,
14131
+ "loss": 0.0029,
14132
+ "reward": 1.667857050895691,
14133
+ "reward_std": 0.29825282096862793,
14134
+ "rewards/accuracy_reward": 0.5178571939468384,
14135
+ "rewards/format_reward": 1.0,
14136
+ "step": 883,
14137
+ "temporal_rewards": 0.8571428656578064
14138
+ },
14139
+ {
14140
+ "all_correct": 0.14285714285714285,
14141
+ "all_wrong": 0.2857142857142857,
14142
+ "completion_length": 232.73214721679688,
14143
+ "epoch": 0.27050183598531213,
14144
+ "grad_norm": 3.5504006872403204,
14145
+ "kl": 0.072998046875,
14146
+ "learning_rate": 8.336594056111197e-07,
14147
+ "loss": 0.0029,
14148
+ "reward": 1.5460340976715088,
14149
+ "reward_std": 0.10997889935970306,
14150
+ "rewards/accuracy_reward": 0.3103196620941162,
14151
+ "rewards/format_reward": 1.0,
14152
+ "step": 884,
14153
+ "temporal_rewards": 0.8571428656578064
14154
+ },
14155
+ {
14156
+ "all_correct": 0.2857142857142857,
14157
+ "all_wrong": 0.14285714285714285,
14158
+ "completion_length": 153.32144165039062,
14159
+ "epoch": 0.2708078335373317,
14160
+ "grad_norm": 5.335782717652437,
14161
+ "kl": 0.0709228515625,
14162
+ "learning_rate": 8.333012694217365e-07,
14163
+ "loss": 0.0028,
14164
+ "reward": 1.7492895126342773,
14165
+ "reward_std": 0.12363868951797485,
14166
+ "rewards/accuracy_reward": 0.5189323425292969,
14167
+ "rewards/format_reward": 1.0,
14168
+ "step": 885,
14169
+ "temporal_rewards": 0.8571428656578064
14170
+ },
14171
+ {
14172
+ "all_correct": 0.42857142857142855,
14173
+ "all_wrong": 0.0,
14174
+ "completion_length": 184.60714721679688,
14175
+ "epoch": 0.2711138310893513,
14176
+ "grad_norm": 3.8542094040952666,
14177
+ "kl": 0.0665283203125,
14178
+ "learning_rate": 8.329428252168445e-07,
14179
+ "loss": 0.0027,
14180
+ "reward": 1.9230225086212158,
14181
+ "reward_std": 0.23322181403636932,
14182
+ "rewards/accuracy_reward": 0.6712366342544556,
14183
+ "rewards/format_reward": 0.9821429252624512,
14184
+ "step": 886,
14185
+ "temporal_rewards": 0.8571428656578064
14186
+ },
14187
+ {
14188
+ "all_correct": 0.2857142857142857,
14189
+ "all_wrong": 0.14285714285714285,
14190
+ "completion_length": 255.35714721679688,
14191
+ "epoch": 0.2714198286413709,
14192
+ "grad_norm": 6.692207465796444,
14193
+ "kl": 0.083740234375,
14194
+ "learning_rate": 8.325840733276947e-07,
14195
+ "loss": 0.0033,
14196
+ "reward": 1.7804242372512817,
14197
+ "reward_std": 0.03548278659582138,
14198
+ "rewards/accuracy_reward": 0.43756699562072754,
14199
+ "rewards/format_reward": 1.0,
14200
+ "step": 887,
14201
+ "temporal_rewards": 0.8571428656578064
14202
+ },
14203
+ {
14204
+ "all_correct": 0.14285714285714285,
14205
+ "all_wrong": 0.0,
14206
+ "completion_length": 231.6428680419922,
14207
+ "epoch": 0.2717258261933905,
14208
+ "grad_norm": 2.5721424140087072,
14209
+ "kl": 0.0772705078125,
14210
+ "learning_rate": 8.322250140858228e-07,
14211
+ "loss": 0.0031,
14212
+ "reward": 1.7938824892044067,
14213
+ "reward_std": 0.15584063529968262,
14214
+ "rewards/accuracy_reward": 0.47959670424461365,
14215
+ "rewards/format_reward": 1.0,
14216
+ "step": 888,
14217
+ "temporal_rewards": 1.0
14218
+ },
14219
+ {
14220
+ "all_correct": 0.2857142857142857,
14221
+ "all_wrong": 0.14285714285714285,
14222
+ "completion_length": 142.1607208251953,
14223
+ "epoch": 0.27203182374541,
14224
+ "grad_norm": 4.045289295991584,
14225
+ "kl": 0.0728759765625,
14226
+ "learning_rate": 8.318656478230477e-07,
14227
+ "loss": 0.0029,
14228
+ "reward": 1.7569411993026733,
14229
+ "reward_std": 0.26805686950683594,
14230
+ "rewards/accuracy_reward": 0.5855125188827515,
14231
+ "rewards/format_reward": 1.0,
14232
+ "step": 889,
14233
+ "temporal_rewards": 0.8571428656578064
14234
+ },
14235
+ {
14236
+ "all_correct": 0.2857142857142857,
14237
+ "all_wrong": 0.0,
14238
+ "completion_length": 157.42857360839844,
14239
+ "epoch": 0.2723378212974296,
14240
+ "grad_norm": 5.628418656090372,
14241
+ "kl": 0.0838623046875,
14242
+ "learning_rate": 8.315059748714728e-07,
14243
+ "loss": 0.0034,
14244
+ "reward": 1.9179658889770508,
14245
+ "reward_std": 0.310136616230011,
14246
+ "rewards/accuracy_reward": 0.6751086711883545,
14247
+ "rewards/format_reward": 0.9821429252624512,
14248
+ "step": 890,
14249
+ "temporal_rewards": 0.8571428656578064
14250
+ },
14251
+ {
14252
+ "all_correct": 0.42857142857142855,
14253
+ "all_wrong": 0.14285714285714285,
14254
+ "completion_length": 144.21429443359375,
14255
+ "epoch": 0.2726438188494492,
14256
+ "grad_norm": 2.9699381051247133,
14257
+ "kl": 0.072998046875,
14258
+ "learning_rate": 8.311459955634843e-07,
14259
+ "loss": 0.0029,
14260
+ "reward": 1.8272987604141235,
14261
+ "reward_std": 0.03353741019964218,
14262
+ "rewards/accuracy_reward": 0.5665844678878784,
14263
+ "rewards/format_reward": 1.0,
14264
+ "step": 891,
14265
+ "temporal_rewards": 1.0
14266
+ },
14267
+ {
14268
+ "all_correct": 0.2857142857142857,
14269
+ "all_wrong": 0.2857142857142857,
14270
+ "completion_length": 172.32144165039062,
14271
+ "epoch": 0.2729498164014688,
14272
+ "grad_norm": 2.5078210503509757,
14273
+ "kl": 0.07110595703125,
14274
+ "learning_rate": 8.307857102317521e-07,
14275
+ "loss": 0.0028,
14276
+ "reward": 1.698065996170044,
14277
+ "reward_std": 0.03687385469675064,
14278
+ "rewards/accuracy_reward": 0.4587802290916443,
14279
+ "rewards/format_reward": 1.0,
14280
+ "step": 892,
14281
+ "temporal_rewards": 0.8571428656578064
14282
+ },
14283
+ {
14284
+ "all_correct": 0.14285714285714285,
14285
+ "all_wrong": 0.14285714285714285,
14286
+ "completion_length": 226.8035888671875,
14287
+ "epoch": 0.27325581395348836,
14288
+ "grad_norm": 2.847612355217105,
14289
+ "kl": 0.0662841796875,
14290
+ "learning_rate": 8.304251192092284e-07,
14291
+ "loss": 0.0027,
14292
+ "reward": 1.7976547479629517,
14293
+ "reward_std": 0.20810642838478088,
14294
+ "rewards/accuracy_reward": 0.5190832614898682,
14295
+ "rewards/format_reward": 0.9821429252624512,
14296
+ "step": 893,
14297
+ "temporal_rewards": 1.0
14298
+ },
14299
+ {
14300
+ "all_correct": 0.5714285714285714,
14301
+ "all_wrong": 0.0,
14302
+ "completion_length": 215.75001525878906,
14303
+ "epoch": 0.27356181150550796,
14304
+ "grad_norm": 7.342437663901097,
14305
+ "kl": 0.071044921875,
14306
+ "learning_rate": 8.300642228291484e-07,
14307
+ "loss": 0.0028,
14308
+ "reward": 2.001753568649292,
14309
+ "reward_std": 0.07981517910957336,
14310
+ "rewards/accuracy_reward": 0.6910392642021179,
14311
+ "rewards/format_reward": 1.0,
14312
+ "step": 894,
14313
+ "temporal_rewards": 0.8571428656578064
14314
+ },
14315
+ {
14316
+ "all_correct": 0.2857142857142857,
14317
+ "all_wrong": 0.14285714285714285,
14318
+ "completion_length": 245.8035888671875,
14319
+ "epoch": 0.27386780905752756,
14320
+ "grad_norm": 2.8686174982836996,
14321
+ "kl": 0.1077880859375,
14322
+ "learning_rate": 8.297030214250291e-07,
14323
+ "loss": 0.0043,
14324
+ "reward": 1.7279855012893677,
14325
+ "reward_std": 0.20766517519950867,
14326
+ "rewards/accuracy_reward": 0.46548548340797424,
14327
+ "rewards/format_reward": 0.9821429252624512,
14328
+ "step": 895,
14329
+ "temporal_rewards": 1.0
14330
+ },
14331
+ {
14332
+ "all_correct": 0.14285714285714285,
14333
+ "all_wrong": 0.14285714285714285,
14334
+ "completion_length": 180.87501525878906,
14335
+ "epoch": 0.2741738066095471,
14336
+ "grad_norm": 3.844679218873974,
14337
+ "kl": 0.06439208984375,
14338
+ "learning_rate": 8.293415153306697e-07,
14339
+ "loss": 0.0026,
14340
+ "reward": 1.570464015007019,
14341
+ "reward_std": 0.27158409357070923,
14342
+ "rewards/accuracy_reward": 0.44546398520469666,
14343
+ "rewards/format_reward": 1.0,
14344
+ "step": 896,
14345
+ "temporal_rewards": 0.5714285373687744
14346
+ },
14347
+ {
14348
+ "all_correct": 0.14285714285714285,
14349
+ "all_wrong": 0.0,
14350
+ "completion_length": 244.7857208251953,
14351
+ "epoch": 0.2744798041615667,
14352
+ "grad_norm": 5.30272222373688,
14353
+ "kl": 0.0797119140625,
14354
+ "learning_rate": 8.289797048801503e-07,
14355
+ "loss": 0.0032,
14356
+ "reward": 1.8444569110870361,
14357
+ "reward_std": 0.18049336969852448,
14358
+ "rewards/accuracy_reward": 0.5033854246139526,
14359
+ "rewards/format_reward": 1.0,
14360
+ "step": 897,
14361
+ "temporal_rewards": 1.0
14362
+ },
14363
+ {
14364
+ "all_correct": 0.2857142857142857,
14365
+ "all_wrong": 0.0,
14366
+ "completion_length": 219.85714721679688,
14367
+ "epoch": 0.2747858017135863,
14368
+ "grad_norm": 4.238591988137467,
14369
+ "kl": 0.0887451171875,
14370
+ "learning_rate": 8.286175904078332e-07,
14371
+ "loss": 0.0035,
14372
+ "reward": 1.8598953485488892,
14373
+ "reward_std": 0.446624755859375,
14374
+ "rewards/accuracy_reward": 0.6420382261276245,
14375
+ "rewards/format_reward": 0.9285714626312256,
14376
+ "step": 898,
14377
+ "temporal_rewards": 1.0
14378
+ },
14379
+ {
14380
+ "all_correct": 0.14285714285714285,
14381
+ "all_wrong": 0.0,
14382
+ "completion_length": 179.73214721679688,
14383
+ "epoch": 0.2750917992656059,
14384
+ "grad_norm": 3.3254751036491594,
14385
+ "kl": 0.07159423828125,
14386
+ "learning_rate": 8.282551722483611e-07,
14387
+ "loss": 0.0029,
14388
+ "reward": 1.7057297229766846,
14389
+ "reward_std": 0.18852263689041138,
14390
+ "rewards/accuracy_reward": 0.4735867381095886,
14391
+ "rewards/format_reward": 1.0,
14392
+ "step": 899,
14393
+ "temporal_rewards": 0.8571428656578064
14394
+ },
14395
+ {
14396
+ "all_correct": 0.14285714285714285,
14397
+ "all_wrong": 0.14285714285714285,
14398
+ "completion_length": 187.19644165039062,
14399
+ "epoch": 0.27539779681762544,
14400
+ "grad_norm": 4.612353811363961,
14401
+ "kl": 0.08416748046875,
14402
+ "learning_rate": 8.278924507366573e-07,
14403
+ "loss": 0.0034,
14404
+ "reward": 1.6086270809173584,
14405
+ "reward_std": 0.11791487783193588,
14406
+ "rewards/accuracy_reward": 0.38184139132499695,
14407
+ "rewards/format_reward": 0.9821429252624512,
14408
+ "step": 900,
14409
+ "temporal_rewards": 0.714285671710968
14410
  }
14411
  ],
14412
  "logging_steps": 1.0,