Training in progress, step 900, checkpoint
- last-checkpoint/global_step900/zero_pp_rank_0_mp_rank_00_model_states.pt +3 -0
- last-checkpoint/global_step900/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step900/zero_pp_rank_1_mp_rank_00_model_states.pt +3 -0
- last-checkpoint/global_step900/zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step900/zero_pp_rank_2_mp_rank_00_model_states.pt +3 -0
- last-checkpoint/global_step900/zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step900/zero_pp_rank_3_mp_rank_00_model_states.pt +3 -0
- last-checkpoint/global_step900/zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step900/zero_pp_rank_4_mp_rank_00_model_states.pt +3 -0
- last-checkpoint/global_step900/zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step900/zero_pp_rank_5_mp_rank_00_model_states.pt +3 -0
- last-checkpoint/global_step900/zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/global_step900/zero_pp_rank_6_mp_rank_00_model_states.pt +3 -0
- last-checkpoint/global_step900/zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
- last-checkpoint/latest +1 -1
- last-checkpoint/model-00001-of-00004.safetensors +1 -1
- last-checkpoint/model-00002-of-00004.safetensors +1 -1
- last-checkpoint/model-00003-of-00004.safetensors +1 -1
- last-checkpoint/model-00004-of-00004.safetensors +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/rng_state_4.pth +1 -1
- last-checkpoint/rng_state_5.pth +1 -1
- last-checkpoint/rng_state_6.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +1602 -2
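
The listing above is consistent with a standard Hugging Face Trainer + DeepSpeed ZeRO checkpoint: per-rank ZeRO model/optimizer partitions under global_step900/, sharded safetensors weights, per-rank RNG states, the LR scheduler, and trainer_state.json. As a minimal sketch only, here is how such a checkpoint directory is typically inspected or resumed; the local path "last-checkpoint" and the `trainer` object are illustrative assumptions, not part of this commit.

```python
import json
from pathlib import Path

# Hypothetical local copy of this repository's checkpoint folder.
ckpt = Path("last-checkpoint")

# trainer_state.json carries the step counter, epoch, and per-step log history.
state = json.loads((ckpt / "trainer_state.json").read_text())
print(state["global_step"], state["epoch"])   # 900 and ~0.275 after this commit
print(state["log_history"][-1])               # most recently logged metrics dict

# The "latest" file simply names the DeepSpeed sub-directory to resume from.
print((ckpt / "latest").read_text().strip())  # "global_step900"

# Resuming is normally delegated to the Trainer rather than loading files by hand:
#   trainer.train(resume_from_checkpoint=str(ckpt))
```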
last-checkpoint/global_step900/zero_pp_rank_0_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05fe1b88528bc46c4cda49bc4a01e42631384a5ea55da387a14f37bb9f785760
+size 349379

last-checkpoint/global_step900/zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:119684ca6eaa3304b994cb0ca6c2c609e538093f85761f264540a302c687be22
+size 14215152302

last-checkpoint/global_step900/zero_pp_rank_1_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:413314a6a3536e2f73fe9573223a0cbb1e8f9cdd371e04a01a424d018c570781
+size 349379

last-checkpoint/global_step900/zero_pp_rank_1_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b92cf0ad31e096c86ab41dd988952e16a54bc7f14e74a94f660354a7eee1914
+size 14215152302

last-checkpoint/global_step900/zero_pp_rank_2_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9b322a9d34c580b99dca38dbfa503d05445f05e0c8bff821677bd7cddc1a9e6
+size 349379

last-checkpoint/global_step900/zero_pp_rank_2_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7bb8a8779ed7b2dc721eeb785d5c61873aaf0e9ed94ba9ff998093c74752f20d
+size 14215152302

last-checkpoint/global_step900/zero_pp_rank_3_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87a017205025f69240e8cc591a021675de1f51493494e1a1c697aa668c064cab
+size 349379

last-checkpoint/global_step900/zero_pp_rank_3_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:384c0989a10a126c5c7d7a7b9f72ef35dfbc398c32123846e2e07689adb315ed
+size 14215152302

last-checkpoint/global_step900/zero_pp_rank_4_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0213f55026cc7c7de26d71c0bb2024ddec98ebc52ac67317eccef8c445e0966
+size 349379

last-checkpoint/global_step900/zero_pp_rank_4_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:734e259c2a11627b96a7376440289a0348721cd16e144afe1cb7adff22cdb64c
+size 14215152302

last-checkpoint/global_step900/zero_pp_rank_5_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36c4ead9b8b685f4c23f1b9a5bbc53944f33083843794dc22b7d5be6fc0e4658
+size 349379

last-checkpoint/global_step900/zero_pp_rank_5_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f60d5aabd902cfd76d5ba9511081474460690487134a6d64ffa3925cf0e289f3
+size 14215152302

last-checkpoint/global_step900/zero_pp_rank_6_mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e336a3fc477735f2e6b4433383835db995837cbe62f524c6bd77f33cd0809b55
+size 349379

last-checkpoint/global_step900/zero_pp_rank_6_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8da30a63edbc2ef896c289a68bd85a2ba5fa2d18271b4b4308d28e04462d33d2
+size 14215152302
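
Each ADDED entry above stores only a Git LFS pointer (version / oid / size), not the tensor payload itself. A small illustrative sketch of reading one such pointer; the parse_lfs_pointer helper is hypothetical and not part of this repository.

```python
def parse_lfs_pointer(text: str) -> dict:
    """Parse a Git LFS pointer file of the form shown above
    (version / oid sha256:<hash> / size <bytes>)."""
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:05fe1b88528bc46c4cda49bc4a01e42631384a5ea55da387a14f37bb9f785760
size 349379"""
print(parse_lfs_pointer(pointer))
# {'version': 'https://git-lfs.github.com/spec/v1',
#  'oid': 'sha256:05fe1b885...',
#  'size': '349379'}
```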
last-checkpoint/latest
CHANGED
@@ -1 +1 @@
-
+global_step900

last-checkpoint/model-00001-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:9d5584d7d8ab8fce5f0e749132bbd3603c179d400f9070f72b405c26541a0715
 size 4968242840

last-checkpoint/model-00002-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:9bd51885a17a33d56a018bb7e738b8b0c7a07ac55b6bd73b2b6ed36b2e7574a5
 size 4991495688

last-checkpoint/model-00003-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:8292604de962a9f1ec484afd5166e29c0f1f034160f26d3a7baaa899b90a1470
 size 4932750920

last-checkpoint/model-00004-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:088ee4e5f47e8e54e48083a414d467ece0362d7407c886cb087a8147a723d99a
 size 1691924368

last-checkpoint/rng_state_0.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:fbd53c58df8917500ea3a32f627b80f3093bc83d5395e17696aff262f3065ed1
 size 15920

last-checkpoint/rng_state_1.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:5d2c713e5d5a3e9e920fa84d9b959311a0590a9d39ac1eb57d11c7092870f80c
 size 15984

last-checkpoint/rng_state_2.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:275171564de734632c71fef1b2e29ca6bf24c1b021438853efcba459a4fe3f11
 size 15984

last-checkpoint/rng_state_3.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:38d4d671511a83b2975532f867cdacdb632409e481223d65aab40d4a541d5c1c
 size 15984

last-checkpoint/rng_state_4.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:4964e7fca7c3842d7e50a180f381cc988728d1d3219dbac2badaa370566a6ce6
 size 15984

last-checkpoint/rng_state_5.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:545620e1f74e91eaf4d0ab187a660a53712e5d0ab01ed4b4923841a0ae812429
 size 15984

last-checkpoint/rng_state_6.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:2a36705df35ce87e1a2f3b1395aa333e8f140488116fb59129918c2657e4b071
 size 15984

last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:d5af295ce1b3dcf465f79b8946f0d8fa680faeb4b08aa2de4700def074a628db
 size 1064
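
Because every binary file above is tracked as an LFS pointer, the checkpoint's on-disk footprint can be read directly off the size fields. A short worked sketch using the sizes from the diffs above (rng_state and scheduler files are omitted as negligible):

```python
# Sizes in bytes, taken from the LFS pointers above: 7 ZeRO model-state shards,
# 7 ZeRO optimizer-state shards, and 4 safetensors weight shards.
zero_model  = 7 * 349_379
zero_optim  = 7 * 14_215_152_302
safetensors = 4_968_242_840 + 4_991_495_688 + 4_932_750_920 + 1_691_924_368

total = zero_model + zero_optim + safetensors
print(f"{total / 1e9:.1f} GB")   # roughly 116 GB for this checkpoint
```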
last-checkpoint/trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
 {
 "best_metric": null,
 "best_model_checkpoint": null,
-"epoch": 0.
+"epoch": 0.27539779681762544,
 "eval_steps": 500,
-"global_step":
+"global_step": 900,
 "is_hyper_param_search": false,
 "is_local_process_zero": true,
 "is_world_process_zero": true,

@@ -12807,6 +12807,1606 @@
 "rewards/format_reward": 1.0,
 "step": 800,
 "temporal_rewards": 1.0
 }
 ],
 "logging_steps": 1.0,
+},
+{
+"all_correct": 0.42857142857142855,
+"all_wrong": 0.0,
+"completion_length": 207.0357208251953,
+"epoch": 0.24510403916768667,
+"grad_norm": 2.9281723699853845,
+"kl": 0.0657958984375,
+"learning_rate": 8.622788645990524e-07,
+"loss": 0.0026,
+"reward": 2.047179937362671,
+"reward_std": 0.2067583054304123,
+"rewards/accuracy_reward": 0.7293230295181274,
+"rewards/format_reward": 1.0,
+"step": 801,
+"temporal_rewards": 1.0
+},
[... the remaining added log_history entries, for steps 802 through 860, continue in the same format (all_correct, all_wrong, completion_length, epoch, grad_norm, kl, learning_rate, loss, reward, reward_std, rewards/accuracy_reward, rewards/format_reward, step, temporal_rewards); the diff view is truncated partway through the entry following step 860 ...]
|
| 13780 |
+
"reward": 1.578763723373413,
|
| 13781 |
+
"reward_std": 0.08131464570760727,
|
| 13782 |
+
"rewards/accuracy_reward": 0.34840652346611023,
|
| 13783 |
+
"rewards/format_reward": 1.0,
|
| 13784 |
+
"step": 861,
|
| 13785 |
+
"temporal_rewards": 1.0
|
| 13786 |
+
},
|
| 13787 |
+
{
|
| 13788 |
+
"all_correct": 0.2857142857142857,
|
| 13789 |
+
"all_wrong": 0.0,
|
| 13790 |
+
"completion_length": 181.87501525878906,
|
| 13791 |
+
"epoch": 0.2637698898408813,
|
| 13792 |
+
"grad_norm": 3.4384761464367695,
|
| 13793 |
+
"kl": 0.0611572265625,
|
| 13794 |
+
"learning_rate": 8.414598070188541e-07,
|
| 13795 |
+
"loss": 0.0024,
|
| 13796 |
+
"reward": 1.7330061197280884,
|
| 13797 |
+
"reward_std": 0.16418743133544922,
|
| 13798 |
+
"rewards/accuracy_reward": 0.488363116979599,
|
| 13799 |
+
"rewards/format_reward": 1.0,
|
| 13800 |
+
"step": 862,
|
| 13801 |
+
"temporal_rewards": 0.8571428656578064
|
| 13802 |
+
},
|
| 13803 |
+
{
|
| 13804 |
+
"all_correct": 0.2857142857142857,
|
| 13805 |
+
"all_wrong": 0.0,
|
| 13806 |
+
"completion_length": 242.6607208251953,
|
| 13807 |
+
"epoch": 0.26407588739290083,
|
| 13808 |
+
"grad_norm": 5.694668293113178,
|
| 13809 |
+
"kl": 0.07659912109375,
|
| 13810 |
+
"learning_rate": 8.411085303977954e-07,
|
| 13811 |
+
"loss": 0.0031,
|
| 13812 |
+
"reward": 1.8662446737289429,
|
| 13813 |
+
"reward_std": 0.2061271071434021,
|
| 13814 |
+
"rewards/accuracy_reward": 0.5251731872558594,
|
| 13815 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 13816 |
+
"step": 863,
|
| 13817 |
+
"temporal_rewards": 1.0
|
| 13818 |
+
},
|
| 13819 |
+
{
|
| 13820 |
+
"all_correct": 0.14285714285714285,
|
| 13821 |
+
"all_wrong": 0.14285714285714285,
|
| 13822 |
+
"completion_length": 208.46429443359375,
|
| 13823 |
+
"epoch": 0.26438188494492043,
|
| 13824 |
+
"grad_norm": 3.3670758880528613,
|
| 13825 |
+
"kl": 0.08154296875,
|
| 13826 |
+
"learning_rate": 8.407569385462614e-07,
|
| 13827 |
+
"loss": 0.0033,
|
| 13828 |
+
"reward": 1.8434805870056152,
|
| 13829 |
+
"reward_std": 0.27712368965148926,
|
| 13830 |
+
"rewards/accuracy_reward": 0.5720521807670593,
|
| 13831 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 13832 |
+
"step": 864,
|
| 13833 |
+
"temporal_rewards": 1.0
|
| 13834 |
+
},
|
| 13835 |
+
{
|
| 13836 |
+
"all_correct": 0.2857142857142857,
|
| 13837 |
+
"all_wrong": 0.0,
|
| 13838 |
+
"completion_length": 238.94644165039062,
|
| 13839 |
+
"epoch": 0.26468788249694003,
|
| 13840 |
+
"grad_norm": 3.0431561776514036,
|
| 13841 |
+
"kl": 0.08380126953125,
|
| 13842 |
+
"learning_rate": 8.40405031789171e-07,
|
| 13843 |
+
"loss": 0.0034,
|
| 13844 |
+
"reward": 1.9124431610107422,
|
| 13845 |
+
"reward_std": 0.06434178352355957,
|
| 13846 |
+
"rewards/accuracy_reward": 0.5338715314865112,
|
| 13847 |
+
"rewards/format_reward": 1.0,
|
| 13848 |
+
"step": 865,
|
| 13849 |
+
"temporal_rewards": 1.0
|
| 13850 |
+
},
|
| 13851 |
+
{
|
| 13852 |
+
"all_correct": 0.42857142857142855,
|
| 13853 |
+
"all_wrong": 0.0,
|
| 13854 |
+
"completion_length": 121.92857360839844,
|
| 13855 |
+
"epoch": 0.26499388004895963,
|
| 13856 |
+
"grad_norm": 3.4914420316550565,
|
| 13857 |
+
"kl": 0.07379150390625,
|
| 13858 |
+
"learning_rate": 8.400528104517335e-07,
|
| 13859 |
+
"loss": 0.003,
|
| 13860 |
+
"reward": 1.9986642599105835,
|
| 13861 |
+
"reward_std": 0.189077690243721,
|
| 13862 |
+
"rewards/accuracy_reward": 0.7308071851730347,
|
| 13863 |
+
"rewards/format_reward": 1.0,
|
| 13864 |
+
"step": 866,
|
| 13865 |
+
"temporal_rewards": 1.0
|
| 13866 |
+
},
|
| 13867 |
+
{
|
| 13868 |
+
"all_correct": 0.2857142857142857,
|
| 13869 |
+
"all_wrong": 0.0,
|
| 13870 |
+
"completion_length": 198.8928680419922,
|
| 13871 |
+
"epoch": 0.2652998776009792,
|
| 13872 |
+
"grad_norm": 4.592100148789711,
|
| 13873 |
+
"kl": 0.073974609375,
|
| 13874 |
+
"learning_rate": 8.39700274859449e-07,
|
| 13875 |
+
"loss": 0.003,
|
| 13876 |
+
"reward": 1.7516790628433228,
|
| 13877 |
+
"reward_std": 0.18580719828605652,
|
| 13878 |
+
"rewards/accuracy_reward": 0.487393319606781,
|
| 13879 |
+
"rewards/format_reward": 1.0,
|
| 13880 |
+
"step": 867,
|
| 13881 |
+
"temporal_rewards": 0.8571428656578064
|
| 13882 |
+
},
|
| 13883 |
+
{
|
| 13884 |
+
"all_correct": 0.2857142857142857,
|
| 13885 |
+
"all_wrong": 0.14285714285714285,
|
| 13886 |
+
"completion_length": 131.33929443359375,
|
| 13887 |
+
"epoch": 0.26560587515299877,
|
| 13888 |
+
"grad_norm": 3.07398773389383,
|
| 13889 |
+
"kl": 0.06939697265625,
|
| 13890 |
+
"learning_rate": 8.393474253381081e-07,
|
| 13891 |
+
"loss": 0.0028,
|
| 13892 |
+
"reward": 1.7859539985656738,
|
| 13893 |
+
"reward_std": 0.10873201489448547,
|
| 13894 |
+
"rewards/accuracy_reward": 0.571668267250061,
|
| 13895 |
+
"rewards/format_reward": 1.0,
|
| 13896 |
+
"step": 868,
|
| 13897 |
+
"temporal_rewards": 0.8571428656578064
|
| 13898 |
+
},
|
| 13899 |
+
{
|
| 13900 |
+
"all_correct": 0.14285714285714285,
|
| 13901 |
+
"all_wrong": 0.14285714285714285,
|
| 13902 |
+
"completion_length": 252.7857208251953,
|
| 13903 |
+
"epoch": 0.26591187270501837,
|
| 13904 |
+
"grad_norm": 5.434152016124052,
|
| 13905 |
+
"kl": 0.074462890625,
|
| 13906 |
+
"learning_rate": 8.389942622137917e-07,
|
| 13907 |
+
"loss": 0.003,
|
| 13908 |
+
"reward": 1.732815146446228,
|
| 13909 |
+
"reward_std": 0.04574719816446304,
|
| 13910 |
+
"rewards/accuracy_reward": 0.3971007168292999,
|
| 13911 |
+
"rewards/format_reward": 1.0,
|
| 13912 |
+
"step": 869,
|
| 13913 |
+
"temporal_rewards": 0.8571428656578064
|
| 13914 |
+
},
|
| 13915 |
+
{
|
| 13916 |
+
"all_correct": 0.42857142857142855,
|
| 13917 |
+
"all_wrong": 0.14285714285714285,
|
| 13918 |
+
"completion_length": 163.5178680419922,
|
| 13919 |
+
"epoch": 0.26621787025703797,
|
| 13920 |
+
"grad_norm": 2.5700748516679117,
|
| 13921 |
+
"kl": 0.07745361328125,
|
| 13922 |
+
"learning_rate": 8.386407858128706e-07,
|
| 13923 |
+
"loss": 0.0031,
|
| 13924 |
+
"reward": 1.822379231452942,
|
| 13925 |
+
"reward_std": 0.09795420616865158,
|
| 13926 |
+
"rewards/accuracy_reward": 0.5830934643745422,
|
| 13927 |
+
"rewards/format_reward": 1.0,
|
| 13928 |
+
"step": 870,
|
| 13929 |
+
"temporal_rewards": 0.714285671710968
|
| 13930 |
+
},
|
| 13931 |
+
{
|
| 13932 |
+
"all_correct": 0.0,
|
| 13933 |
+
"all_wrong": 0.14285714285714285,
|
| 13934 |
+
"completion_length": 308.71429443359375,
|
| 13935 |
+
"epoch": 0.2665238678090575,
|
| 13936 |
+
"grad_norm": 11.736614064127435,
|
| 13937 |
+
"kl": 0.373779296875,
|
| 13938 |
+
"learning_rate": 8.382869964620043e-07,
|
| 13939 |
+
"loss": 0.015,
|
| 13940 |
+
"reward": 1.428949236869812,
|
| 13941 |
+
"reward_std": 0.24033333361148834,
|
| 13942 |
+
"rewards/accuracy_reward": 0.17359207570552826,
|
| 13943 |
+
"rewards/format_reward": 0.9642857313156128,
|
| 13944 |
+
"step": 871,
|
| 13945 |
+
"temporal_rewards": 0.8571428656578064
|
| 13946 |
+
},
|
| 13947 |
+
{
|
| 13948 |
+
"all_correct": 0.0,
|
| 13949 |
+
"all_wrong": 0.0,
|
| 13950 |
+
"completion_length": 295.6607360839844,
|
| 13951 |
+
"epoch": 0.2668298653610771,
|
| 13952 |
+
"grad_norm": 3.7904993786293204,
|
| 13953 |
+
"kl": 0.09088134765625,
|
| 13954 |
+
"learning_rate": 8.379328944881423e-07,
|
| 13955 |
+
"loss": 0.0036,
|
| 13956 |
+
"reward": 1.7479736804962158,
|
| 13957 |
+
"reward_std": 0.1676667034626007,
|
| 13958 |
+
"rewards/accuracy_reward": 0.3872593343257904,
|
| 13959 |
+
"rewards/format_reward": 0.9642857313156128,
|
| 13960 |
+
"step": 872,
|
| 13961 |
+
"temporal_rewards": 1.0
|
| 13962 |
+
},
|
| 13963 |
+
{
|
| 13964 |
+
"all_correct": 0.0,
|
| 13965 |
+
"all_wrong": 0.0,
|
| 13966 |
+
"completion_length": 348.8571472167969,
|
| 13967 |
+
"epoch": 0.2671358629130967,
|
| 13968 |
+
"grad_norm": 2.2910548072438606,
|
| 13969 |
+
"kl": 0.081787109375,
|
| 13970 |
+
"learning_rate": 8.375784802185231e-07,
|
| 13971 |
+
"loss": 0.0033,
|
| 13972 |
+
"reward": 1.61097252368927,
|
| 13973 |
+
"reward_std": 0.27649515867233276,
|
| 13974 |
+
"rewards/accuracy_reward": 0.3288295865058899,
|
| 13975 |
+
"rewards/format_reward": 0.9642857313156128,
|
| 13976 |
+
"step": 873,
|
| 13977 |
+
"temporal_rewards": 0.8571428656578064
|
| 13978 |
+
},
|
| 13979 |
+
{
|
| 13980 |
+
"all_correct": 0.2857142857142857,
|
| 13981 |
+
"all_wrong": 0.2857142857142857,
|
| 13982 |
+
"completion_length": 137.33929443359375,
|
| 13983 |
+
"epoch": 0.26744186046511625,
|
| 13984 |
+
"grad_norm": 9.477587189657482,
|
| 13985 |
+
"kl": 0.1707763671875,
|
| 13986 |
+
"learning_rate": 8.372237539806729e-07,
|
| 13987 |
+
"loss": 0.0068,
|
| 13988 |
+
"reward": 1.7725310325622559,
|
| 13989 |
+
"reward_std": 0.10993125289678574,
|
| 13990 |
+
"rewards/accuracy_reward": 0.5653882026672363,
|
| 13991 |
+
"rewards/format_reward": 1.0,
|
| 13992 |
+
"step": 874,
|
| 13993 |
+
"temporal_rewards": 1.0
|
| 13994 |
+
},
|
| 13995 |
+
{
|
| 13996 |
+
"all_correct": 0.42857142857142855,
|
| 13997 |
+
"all_wrong": 0.14285714285714285,
|
| 13998 |
+
"completion_length": 139.67857360839844,
|
| 13999 |
+
"epoch": 0.26774785801713585,
|
| 14000 |
+
"grad_norm": 3.0317667233515655,
|
| 14001 |
+
"kl": 0.072021484375,
|
| 14002 |
+
"learning_rate": 8.36868716102407e-07,
|
| 14003 |
+
"loss": 0.0029,
|
| 14004 |
+
"reward": 1.7030521631240845,
|
| 14005 |
+
"reward_std": 0.12748095393180847,
|
| 14006 |
+
"rewards/accuracy_reward": 0.5262664556503296,
|
| 14007 |
+
"rewards/format_reward": 1.0,
|
| 14008 |
+
"step": 875,
|
| 14009 |
+
"temporal_rewards": 0.8571428656578064
|
| 14010 |
+
},
|
| 14011 |
+
{
|
| 14012 |
+
"all_correct": 0.14285714285714285,
|
| 14013 |
+
"all_wrong": 0.14285714285714285,
|
| 14014 |
+
"completion_length": 209.87501525878906,
|
| 14015 |
+
"epoch": 0.26805385556915545,
|
| 14016 |
+
"grad_norm": 2.3417260897973846,
|
| 14017 |
+
"kl": 0.09869384765625,
|
| 14018 |
+
"learning_rate": 8.365133669118289e-07,
|
| 14019 |
+
"loss": 0.0039,
|
| 14020 |
+
"reward": 1.5527209043502808,
|
| 14021 |
+
"reward_std": 0.14238658547401428,
|
| 14022 |
+
"rewards/accuracy_reward": 0.302720844745636,
|
| 14023 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 14024 |
+
"step": 876,
|
| 14025 |
+
"temporal_rewards": 0.714285671710968
|
| 14026 |
+
},
|
| 14027 |
+
{
|
| 14028 |
+
"all_correct": 0.5714285714285714,
|
| 14029 |
+
"all_wrong": 0.0,
|
| 14030 |
+
"completion_length": 221.87501525878906,
|
| 14031 |
+
"epoch": 0.26835985312117505,
|
| 14032 |
+
"grad_norm": 1.4544164125883434,
|
| 14033 |
+
"kl": 0.0880126953125,
|
| 14034 |
+
"learning_rate": 8.361577067373289e-07,
|
| 14035 |
+
"loss": 0.0035,
|
| 14036 |
+
"reward": 1.9879584312438965,
|
| 14037 |
+
"reward_std": 0.06715244054794312,
|
| 14038 |
+
"rewards/accuracy_reward": 0.6915298700332642,
|
| 14039 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 14040 |
+
"step": 877,
|
| 14041 |
+
"temporal_rewards": 0.8571428656578064
|
| 14042 |
+
},
|
| 14043 |
+
{
|
| 14044 |
+
"all_correct": 0.0,
|
| 14045 |
+
"all_wrong": 0.0,
|
| 14046 |
+
"completion_length": 137.6428680419922,
|
| 14047 |
+
"epoch": 0.2686658506731946,
|
| 14048 |
+
"grad_norm": 4.041225889721134,
|
| 14049 |
+
"kl": 0.07952880859375,
|
| 14050 |
+
"learning_rate": 8.358017359075853e-07,
|
| 14051 |
+
"loss": 0.0032,
|
| 14052 |
+
"reward": 1.634163737297058,
|
| 14053 |
+
"reward_std": 0.3102668523788452,
|
| 14054 |
+
"rewards/accuracy_reward": 0.4466637969017029,
|
| 14055 |
+
"rewards/format_reward": 1.0,
|
| 14056 |
+
"step": 878,
|
| 14057 |
+
"temporal_rewards": 1.0
|
| 14058 |
+
},
|
| 14059 |
+
{
|
| 14060 |
+
"all_correct": 0.42857142857142855,
|
| 14061 |
+
"all_wrong": 0.0,
|
| 14062 |
+
"completion_length": 202.58929443359375,
|
| 14063 |
+
"epoch": 0.2689718482252142,
|
| 14064 |
+
"grad_norm": 2.865476731408168,
|
| 14065 |
+
"kl": 0.071044921875,
|
| 14066 |
+
"learning_rate": 8.354454547515632e-07,
|
| 14067 |
+
"loss": 0.0028,
|
| 14068 |
+
"reward": 1.960200309753418,
|
| 14069 |
+
"reward_std": 0.12427835166454315,
|
| 14070 |
+
"rewards/accuracy_reward": 0.6244859099388123,
|
| 14071 |
+
"rewards/format_reward": 1.0,
|
| 14072 |
+
"step": 879,
|
| 14073 |
+
"temporal_rewards": 1.0
|
| 14074 |
+
},
|
| 14075 |
+
{
|
| 14076 |
+
"all_correct": 0.14285714285714285,
|
| 14077 |
+
"all_wrong": 0.14285714285714285,
|
| 14078 |
+
"completion_length": 230.96429443359375,
|
| 14079 |
+
"epoch": 0.2692778457772338,
|
| 14080 |
+
"grad_norm": 4.795281271108613,
|
| 14081 |
+
"kl": 0.07501220703125,
|
| 14082 |
+
"learning_rate": 8.35088863598515e-07,
|
| 14083 |
+
"loss": 0.003,
|
| 14084 |
+
"reward": 1.6301624774932861,
|
| 14085 |
+
"reward_std": 0.20454014837741852,
|
| 14086 |
+
"rewards/accuracy_reward": 0.42301949858665466,
|
| 14087 |
+
"rewards/format_reward": 1.0,
|
| 14088 |
+
"step": 880,
|
| 14089 |
+
"temporal_rewards": 0.8571428656578064
|
| 14090 |
+
},
|
| 14091 |
+
{
|
| 14092 |
+
"all_correct": 0.2857142857142857,
|
| 14093 |
+
"all_wrong": 0.0,
|
| 14094 |
+
"completion_length": 174.5178680419922,
|
| 14095 |
+
"epoch": 0.2695838433292534,
|
| 14096 |
+
"grad_norm": 22.401908979877813,
|
| 14097 |
+
"kl": 1.875,
|
| 14098 |
+
"learning_rate": 8.347319627779788e-07,
|
| 14099 |
+
"loss": 0.075,
|
| 14100 |
+
"reward": 1.7476712465286255,
|
| 14101 |
+
"reward_std": 0.35592812299728394,
|
| 14102 |
+
"rewards/accuracy_reward": 0.5530281662940979,
|
| 14103 |
+
"rewards/format_reward": 0.9642857313156128,
|
| 14104 |
+
"step": 881,
|
| 14105 |
+
"temporal_rewards": 0.8571428656578064
|
| 14106 |
+
},
|
| 14107 |
+
{
|
| 14108 |
+
"all_correct": 0.14285714285714285,
|
| 14109 |
+
"all_wrong": 0.0,
|
| 14110 |
+
"completion_length": 189.82144165039062,
|
| 14111 |
+
"epoch": 0.26988984088127294,
|
| 14112 |
+
"grad_norm": 10.071232157212238,
|
| 14113 |
+
"kl": 0.0775146484375,
|
| 14114 |
+
"learning_rate": 8.343747526197796e-07,
|
| 14115 |
+
"loss": 0.0031,
|
| 14116 |
+
"reward": 1.7986595630645752,
|
| 14117 |
+
"reward_std": 0.3468095064163208,
|
| 14118 |
+
"rewards/accuracy_reward": 0.545087993144989,
|
| 14119 |
+
"rewards/format_reward": 1.0,
|
| 14120 |
+
"step": 882,
|
| 14121 |
+
"temporal_rewards": 1.0
|
| 14122 |
+
},
|
| 14123 |
+
{
|
| 14124 |
+
"all_correct": 0.2857142857142857,
|
| 14125 |
+
"all_wrong": 0.14285714285714285,
|
| 14126 |
+
"completion_length": 136.7857208251953,
|
| 14127 |
+
"epoch": 0.27019583843329253,
|
| 14128 |
+
"grad_norm": 5.112804545427567,
|
| 14129 |
+
"kl": 0.07208251953125,
|
| 14130 |
+
"learning_rate": 8.340172334540279e-07,
|
| 14131 |
+
"loss": 0.0029,
|
| 14132 |
+
"reward": 1.667857050895691,
|
| 14133 |
+
"reward_std": 0.29825282096862793,
|
| 14134 |
+
"rewards/accuracy_reward": 0.5178571939468384,
|
| 14135 |
+
"rewards/format_reward": 1.0,
|
| 14136 |
+
"step": 883,
|
| 14137 |
+
"temporal_rewards": 0.8571428656578064
|
| 14138 |
+
},
|
| 14139 |
+
{
|
| 14140 |
+
"all_correct": 0.14285714285714285,
|
| 14141 |
+
"all_wrong": 0.2857142857142857,
|
| 14142 |
+
"completion_length": 232.73214721679688,
|
| 14143 |
+
"epoch": 0.27050183598531213,
|
| 14144 |
+
"grad_norm": 3.5504006872403204,
|
| 14145 |
+
"kl": 0.072998046875,
|
| 14146 |
+
"learning_rate": 8.336594056111197e-07,
|
| 14147 |
+
"loss": 0.0029,
|
| 14148 |
+
"reward": 1.5460340976715088,
|
| 14149 |
+
"reward_std": 0.10997889935970306,
|
| 14150 |
+
"rewards/accuracy_reward": 0.3103196620941162,
|
| 14151 |
+
"rewards/format_reward": 1.0,
|
| 14152 |
+
"step": 884,
|
| 14153 |
+
"temporal_rewards": 0.8571428656578064
|
| 14154 |
+
},
|
| 14155 |
+
{
|
| 14156 |
+
"all_correct": 0.2857142857142857,
|
| 14157 |
+
"all_wrong": 0.14285714285714285,
|
| 14158 |
+
"completion_length": 153.32144165039062,
|
| 14159 |
+
"epoch": 0.2708078335373317,
|
| 14160 |
+
"grad_norm": 5.335782717652437,
|
| 14161 |
+
"kl": 0.0709228515625,
|
| 14162 |
+
"learning_rate": 8.333012694217365e-07,
|
| 14163 |
+
"loss": 0.0028,
|
| 14164 |
+
"reward": 1.7492895126342773,
|
| 14165 |
+
"reward_std": 0.12363868951797485,
|
| 14166 |
+
"rewards/accuracy_reward": 0.5189323425292969,
|
| 14167 |
+
"rewards/format_reward": 1.0,
|
| 14168 |
+
"step": 885,
|
| 14169 |
+
"temporal_rewards": 0.8571428656578064
|
| 14170 |
+
},
|
| 14171 |
+
{
|
| 14172 |
+
"all_correct": 0.42857142857142855,
|
| 14173 |
+
"all_wrong": 0.0,
|
| 14174 |
+
"completion_length": 184.60714721679688,
|
| 14175 |
+
"epoch": 0.2711138310893513,
|
| 14176 |
+
"grad_norm": 3.8542094040952666,
|
| 14177 |
+
"kl": 0.0665283203125,
|
| 14178 |
+
"learning_rate": 8.329428252168445e-07,
|
| 14179 |
+
"loss": 0.0027,
|
| 14180 |
+
"reward": 1.9230225086212158,
|
| 14181 |
+
"reward_std": 0.23322181403636932,
|
| 14182 |
+
"rewards/accuracy_reward": 0.6712366342544556,
|
| 14183 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 14184 |
+
"step": 886,
|
| 14185 |
+
"temporal_rewards": 0.8571428656578064
|
| 14186 |
+
},
|
| 14187 |
+
{
|
| 14188 |
+
"all_correct": 0.2857142857142857,
|
| 14189 |
+
"all_wrong": 0.14285714285714285,
|
| 14190 |
+
"completion_length": 255.35714721679688,
|
| 14191 |
+
"epoch": 0.2714198286413709,
|
| 14192 |
+
"grad_norm": 6.692207465796444,
|
| 14193 |
+
"kl": 0.083740234375,
|
| 14194 |
+
"learning_rate": 8.325840733276947e-07,
|
| 14195 |
+
"loss": 0.0033,
|
| 14196 |
+
"reward": 1.7804242372512817,
|
| 14197 |
+
"reward_std": 0.03548278659582138,
|
| 14198 |
+
"rewards/accuracy_reward": 0.43756699562072754,
|
| 14199 |
+
"rewards/format_reward": 1.0,
|
| 14200 |
+
"step": 887,
|
| 14201 |
+
"temporal_rewards": 0.8571428656578064
|
| 14202 |
+
},
|
| 14203 |
+
{
|
| 14204 |
+
"all_correct": 0.14285714285714285,
|
| 14205 |
+
"all_wrong": 0.0,
|
| 14206 |
+
"completion_length": 231.6428680419922,
|
| 14207 |
+
"epoch": 0.2717258261933905,
|
| 14208 |
+
"grad_norm": 2.5721424140087072,
|
| 14209 |
+
"kl": 0.0772705078125,
|
| 14210 |
+
"learning_rate": 8.322250140858228e-07,
|
| 14211 |
+
"loss": 0.0031,
|
| 14212 |
+
"reward": 1.7938824892044067,
|
| 14213 |
+
"reward_std": 0.15584063529968262,
|
| 14214 |
+
"rewards/accuracy_reward": 0.47959670424461365,
|
| 14215 |
+
"rewards/format_reward": 1.0,
|
| 14216 |
+
"step": 888,
|
| 14217 |
+
"temporal_rewards": 1.0
|
| 14218 |
+
},
|
| 14219 |
+
{
|
| 14220 |
+
"all_correct": 0.2857142857142857,
|
| 14221 |
+
"all_wrong": 0.14285714285714285,
|
| 14222 |
+
"completion_length": 142.1607208251953,
|
| 14223 |
+
"epoch": 0.27203182374541,
|
| 14224 |
+
"grad_norm": 4.045289295991584,
|
| 14225 |
+
"kl": 0.0728759765625,
|
| 14226 |
+
"learning_rate": 8.318656478230477e-07,
|
| 14227 |
+
"loss": 0.0029,
|
| 14228 |
+
"reward": 1.7569411993026733,
|
| 14229 |
+
"reward_std": 0.26805686950683594,
|
| 14230 |
+
"rewards/accuracy_reward": 0.5855125188827515,
|
| 14231 |
+
"rewards/format_reward": 1.0,
|
| 14232 |
+
"step": 889,
|
| 14233 |
+
"temporal_rewards": 0.8571428656578064
|
| 14234 |
+
},
|
| 14235 |
+
{
|
| 14236 |
+
"all_correct": 0.2857142857142857,
|
| 14237 |
+
"all_wrong": 0.0,
|
| 14238 |
+
"completion_length": 157.42857360839844,
|
| 14239 |
+
"epoch": 0.2723378212974296,
|
| 14240 |
+
"grad_norm": 5.628418656090372,
|
| 14241 |
+
"kl": 0.0838623046875,
|
| 14242 |
+
"learning_rate": 8.315059748714728e-07,
|
| 14243 |
+
"loss": 0.0034,
|
| 14244 |
+
"reward": 1.9179658889770508,
|
| 14245 |
+
"reward_std": 0.310136616230011,
|
| 14246 |
+
"rewards/accuracy_reward": 0.6751086711883545,
|
| 14247 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 14248 |
+
"step": 890,
|
| 14249 |
+
"temporal_rewards": 0.8571428656578064
|
| 14250 |
+
},
|
| 14251 |
+
{
|
| 14252 |
+
"all_correct": 0.42857142857142855,
|
| 14253 |
+
"all_wrong": 0.14285714285714285,
|
| 14254 |
+
"completion_length": 144.21429443359375,
|
| 14255 |
+
"epoch": 0.2726438188494492,
|
| 14256 |
+
"grad_norm": 2.9699381051247133,
|
| 14257 |
+
"kl": 0.072998046875,
|
| 14258 |
+
"learning_rate": 8.311459955634843e-07,
|
| 14259 |
+
"loss": 0.0029,
|
| 14260 |
+
"reward": 1.8272987604141235,
|
| 14261 |
+
"reward_std": 0.03353741019964218,
|
| 14262 |
+
"rewards/accuracy_reward": 0.5665844678878784,
|
| 14263 |
+
"rewards/format_reward": 1.0,
|
| 14264 |
+
"step": 891,
|
| 14265 |
+
"temporal_rewards": 1.0
|
| 14266 |
+
},
|
| 14267 |
+
{
|
| 14268 |
+
"all_correct": 0.2857142857142857,
|
| 14269 |
+
"all_wrong": 0.2857142857142857,
|
| 14270 |
+
"completion_length": 172.32144165039062,
|
| 14271 |
+
"epoch": 0.2729498164014688,
|
| 14272 |
+
"grad_norm": 2.5078210503509757,
|
| 14273 |
+
"kl": 0.07110595703125,
|
| 14274 |
+
"learning_rate": 8.307857102317521e-07,
|
| 14275 |
+
"loss": 0.0028,
|
| 14276 |
+
"reward": 1.698065996170044,
|
| 14277 |
+
"reward_std": 0.03687385469675064,
|
| 14278 |
+
"rewards/accuracy_reward": 0.4587802290916443,
|
| 14279 |
+
"rewards/format_reward": 1.0,
|
| 14280 |
+
"step": 892,
|
| 14281 |
+
"temporal_rewards": 0.8571428656578064
|
| 14282 |
+
},
|
| 14283 |
+
{
|
| 14284 |
+
"all_correct": 0.14285714285714285,
|
| 14285 |
+
"all_wrong": 0.14285714285714285,
|
| 14286 |
+
"completion_length": 226.8035888671875,
|
| 14287 |
+
"epoch": 0.27325581395348836,
|
| 14288 |
+
"grad_norm": 2.847612355217105,
|
| 14289 |
+
"kl": 0.0662841796875,
|
| 14290 |
+
"learning_rate": 8.304251192092284e-07,
|
| 14291 |
+
"loss": 0.0027,
|
| 14292 |
+
"reward": 1.7976547479629517,
|
| 14293 |
+
"reward_std": 0.20810642838478088,
|
| 14294 |
+
"rewards/accuracy_reward": 0.5190832614898682,
|
| 14295 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 14296 |
+
"step": 893,
|
| 14297 |
+
"temporal_rewards": 1.0
|
| 14298 |
+
},
|
| 14299 |
+
{
|
| 14300 |
+
"all_correct": 0.5714285714285714,
|
| 14301 |
+
"all_wrong": 0.0,
|
| 14302 |
+
"completion_length": 215.75001525878906,
|
| 14303 |
+
"epoch": 0.27356181150550796,
|
| 14304 |
+
"grad_norm": 7.342437663901097,
|
| 14305 |
+
"kl": 0.071044921875,
|
| 14306 |
+
"learning_rate": 8.300642228291484e-07,
|
| 14307 |
+
"loss": 0.0028,
|
| 14308 |
+
"reward": 2.001753568649292,
|
| 14309 |
+
"reward_std": 0.07981517910957336,
|
| 14310 |
+
"rewards/accuracy_reward": 0.6910392642021179,
|
| 14311 |
+
"rewards/format_reward": 1.0,
|
| 14312 |
+
"step": 894,
|
| 14313 |
+
"temporal_rewards": 0.8571428656578064
|
| 14314 |
+
},
|
| 14315 |
+
{
|
| 14316 |
+
"all_correct": 0.2857142857142857,
|
| 14317 |
+
"all_wrong": 0.14285714285714285,
|
| 14318 |
+
"completion_length": 245.8035888671875,
|
| 14319 |
+
"epoch": 0.27386780905752756,
|
| 14320 |
+
"grad_norm": 2.8686174982836996,
|
| 14321 |
+
"kl": 0.1077880859375,
|
| 14322 |
+
"learning_rate": 8.297030214250291e-07,
|
| 14323 |
+
"loss": 0.0043,
|
| 14324 |
+
"reward": 1.7279855012893677,
|
| 14325 |
+
"reward_std": 0.20766517519950867,
|
| 14326 |
+
"rewards/accuracy_reward": 0.46548548340797424,
|
| 14327 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 14328 |
+
"step": 895,
|
| 14329 |
+
"temporal_rewards": 1.0
|
| 14330 |
+
},
|
| 14331 |
+
{
|
| 14332 |
+
"all_correct": 0.14285714285714285,
|
| 14333 |
+
"all_wrong": 0.14285714285714285,
|
| 14334 |
+
"completion_length": 180.87501525878906,
|
| 14335 |
+
"epoch": 0.2741738066095471,
|
| 14336 |
+
"grad_norm": 3.844679218873974,
|
| 14337 |
+
"kl": 0.06439208984375,
|
| 14338 |
+
"learning_rate": 8.293415153306697e-07,
|
| 14339 |
+
"loss": 0.0026,
|
| 14340 |
+
"reward": 1.570464015007019,
|
| 14341 |
+
"reward_std": 0.27158409357070923,
|
| 14342 |
+
"rewards/accuracy_reward": 0.44546398520469666,
|
| 14343 |
+
"rewards/format_reward": 1.0,
|
| 14344 |
+
"step": 896,
|
| 14345 |
+
"temporal_rewards": 0.5714285373687744
|
| 14346 |
+
},
|
| 14347 |
+
{
|
| 14348 |
+
"all_correct": 0.14285714285714285,
|
| 14349 |
+
"all_wrong": 0.0,
|
| 14350 |
+
"completion_length": 244.7857208251953,
|
| 14351 |
+
"epoch": 0.2744798041615667,
|
| 14352 |
+
"grad_norm": 5.30272222373688,
|
| 14353 |
+
"kl": 0.0797119140625,
|
| 14354 |
+
"learning_rate": 8.289797048801503e-07,
|
| 14355 |
+
"loss": 0.0032,
|
| 14356 |
+
"reward": 1.8444569110870361,
|
| 14357 |
+
"reward_std": 0.18049336969852448,
|
| 14358 |
+
"rewards/accuracy_reward": 0.5033854246139526,
|
| 14359 |
+
"rewards/format_reward": 1.0,
|
| 14360 |
+
"step": 897,
|
| 14361 |
+
"temporal_rewards": 1.0
|
| 14362 |
+
},
|
| 14363 |
+
{
|
| 14364 |
+
"all_correct": 0.2857142857142857,
|
| 14365 |
+
"all_wrong": 0.0,
|
| 14366 |
+
"completion_length": 219.85714721679688,
|
| 14367 |
+
"epoch": 0.2747858017135863,
|
| 14368 |
+
"grad_norm": 4.238591988137467,
|
| 14369 |
+
"kl": 0.0887451171875,
|
| 14370 |
+
"learning_rate": 8.286175904078332e-07,
|
| 14371 |
+
"loss": 0.0035,
|
| 14372 |
+
"reward": 1.8598953485488892,
|
| 14373 |
+
"reward_std": 0.446624755859375,
|
| 14374 |
+
"rewards/accuracy_reward": 0.6420382261276245,
|
| 14375 |
+
"rewards/format_reward": 0.9285714626312256,
|
| 14376 |
+
"step": 898,
|
| 14377 |
+
"temporal_rewards": 1.0
|
| 14378 |
+
},
|
| 14379 |
+
{
|
| 14380 |
+
"all_correct": 0.14285714285714285,
|
| 14381 |
+
"all_wrong": 0.0,
|
| 14382 |
+
"completion_length": 179.73214721679688,
|
| 14383 |
+
"epoch": 0.2750917992656059,
|
| 14384 |
+
"grad_norm": 3.3254751036491594,
|
| 14385 |
+
"kl": 0.07159423828125,
|
| 14386 |
+
"learning_rate": 8.282551722483611e-07,
|
| 14387 |
+
"loss": 0.0029,
|
| 14388 |
+
"reward": 1.7057297229766846,
|
| 14389 |
+
"reward_std": 0.18852263689041138,
|
| 14390 |
+
"rewards/accuracy_reward": 0.4735867381095886,
|
| 14391 |
+
"rewards/format_reward": 1.0,
|
| 14392 |
+
"step": 899,
|
| 14393 |
+
"temporal_rewards": 0.8571428656578064
|
| 14394 |
+
},
|
| 14395 |
+
{
|
| 14396 |
+
"all_correct": 0.14285714285714285,
|
| 14397 |
+
"all_wrong": 0.14285714285714285,
|
| 14398 |
+
"completion_length": 187.19644165039062,
|
| 14399 |
+
"epoch": 0.27539779681762544,
|
| 14400 |
+
"grad_norm": 4.612353811363961,
|
| 14401 |
+
"kl": 0.08416748046875,
|
| 14402 |
+
"learning_rate": 8.278924507366573e-07,
|
| 14403 |
+
"loss": 0.0034,
|
| 14404 |
+
"reward": 1.6086270809173584,
|
| 14405 |
+
"reward_std": 0.11791487783193588,
|
| 14406 |
+
"rewards/accuracy_reward": 0.38184139132499695,
|
| 14407 |
+
"rewards/format_reward": 0.9821429252624512,
|
| 14408 |
+
"step": 900,
|
| 14409 |
+
"temporal_rewards": 0.714285671710968
|
| 14410 |
}
|
| 14411 |
],
|
| 14412 |
"logging_steps": 1.0,
|