diff --git a/.gitattributes b/.gitattributes index dbfbbb502d24b2e02a0f621a44e344a69f066987..178ed2d33b5604c444688a170cfb1cd1d33c0d5b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -154,3 +154,35 @@ modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000 modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/model.safetensors filter=lfs diff=lfs merge=lfs -text modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt filter=lfs diff=lfs merge=lfs -text modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/model.safetensors filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/a5/07/a50785f196369c06420598b40db7fe0178a90b0c3f804948751f4ad50381d84f filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/7a/46/7a46354b82536b5fd3b84c627a781ab5dd486257d6ccaec684ef58965d14c8ba filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/5a/82/5a8268831a8f08bfce731da9b4d1a69338adf279352c89832f2a6b9ec3400203 filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/b0/4a/b04a7b7e9b10265cc9c1a6bd5bfb4bb4239a5da7a393588fc17c0f703daf9db7 filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/73/ca/73caeb604d1153a7db8dd73b2fc37b9e826fec045c5ee26810e16966b749c507 filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/51/c3/51c392d991a187c7fb9c677c9f7f3dc089284791c082912cbcad51d1534a37b0 filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/d2/46/d246c14f659e3528a5e9698dbc9236bc6bb9eb18c55b627e0292e5e33a6f0d46 filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/3a/b1/3ab12d2fdb52b6310449cb6653cf879090689802313feaa7917477e798206d38 filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/4e/50/4e5081c229000fd604fd5b2c8852a910dc1525539d70ab47b202247f261ace45 filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/29/d7/29d7a8844c0ec5618ca98b651b06c5d47a350267ffbc46cd92ee978330ed7107 filter=lfs diff=lfs merge=lfs -text +modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/model.safetensors filter=lfs diff=lfs merge=lfs -text +modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/optimizer.pt filter=lfs diff=lfs merge=lfs -text +modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/model.safetensors filter=lfs diff=lfs merge=lfs -text +modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/optimizer.pt filter=lfs diff=lfs merge=lfs -text +modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/model.safetensors filter=lfs diff=lfs merge=lfs -text +modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/optimizer.pt filter=lfs diff=lfs merge=lfs -text +modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/model.safetensors filter=lfs diff=lfs merge=lfs -text +modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/optimizer.pt filter=lfs diff=lfs merge=lfs -text +modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/model.safetensors filter=lfs diff=lfs merge=lfs -text +modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt filter=lfs diff=lfs merge=lfs -text +modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/model.safetensors filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/bf/f7/bff7e107bf5d1efad55fa123a28edf876fc0a79e6504a35e8436b491f3bce835 filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/f3/53/f3539d0188328679d6b168c95640b0e9bb61c47eb0294171660930064b47ef32 filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/b0/66/b066839ff96a64f33d23d29e6842f0e37aa831105dd531efd8b7a2278350651e filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/1b/ba/1bbab43b76cdbed4ee5364e337787e088ac7a5b381ebe2f680cc9ee3fbf04b17 filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/1b/3a/1b3a448b206afc6dc52858a4a6b015ce6b78d98fd496ec76352773962792dc26 filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/f9/d7/f9d7d24b303024c8f8b32f821406abda528e58a29cb179cc9cb74d27b9bb1bc6 filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/b3/f9/b3f9b4d5bda7f8dcca46b398b07a5d34c51df8b1719cca216ea5360ff0f57115 filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/40/f9/40f9a2ce28306aa7f03882afc781b4b2a7070bc09f5cb5b454593ccab7e320cd filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/c9/29/c9292f8084df08f44633c95111d71cf5055e02e10995567cc733f2bef07d3113 filter=lfs diff=lfs merge=lfs -text +.git/lfs/objects/7e/97/7e973922297f0c493940a90076a687d8de67be2aa6db71915dd86d5c61411430 filter=lfs diff=lfs merge=lfs -text +modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/optimizer.pt filter=lfs diff=lfs merge=lfs -text diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/model.safetensors b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/model.safetensors deleted file mode 100644 index dcbd5404291cd2cac08ade7043e00f97ede84290..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b04a7b7e9b10265cc9c1a6bd5bfb4bb4239a5da7a393588fc17c0f703daf9db7 -size 298041696 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/optimizer.pt b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/optimizer.pt deleted file mode 100644 index 034f62f520a3a5aec58994ff229d61648b418080..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a50785f196369c06420598b40db7fe0178a90b0c3f804948751f4ad50381d84f -size 596170443 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_0.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_0.pth deleted file mode 100644 index 9e16bede47d21f5245866ab4b230d8f1399e5509..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:74e7e89cbe7a70edf66e8968948906fb1a820f09a6a8809481256cb4f59eaf10 -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_1.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_1.pth deleted file mode 100644 index bffbdb35d4ab6331ed972f24d7c02a72a79ef5c7..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d5d3e5bce55f160ed0c87b1cbef42754767ac243615cafe6fe597c6c56abe221 -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_2.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_2.pth deleted file mode 100644 index 8d209d9e5db157f7dcc05a4e4c301c0607f9aa40..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f6004cc10346f251377bb583f9d9cb6fb19ba248f20a8ca5df932990f0b69313 -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_3.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_3.pth deleted file mode 100644 index bb50879d0349d0b0753eda096223fd563c6ca505..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:69bd5f01bd9ce43daeab69ea4b44d0bce391a11f8b9d1d80a742012fb4f66a87 -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/training_args.bin b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/training_args.bin deleted file mode 100644 index af2dac6567b500317449ced253b06ad07cb0d85a..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c73ab29bf2f573eaec3dd814ee64e6931ca0f3da63f7433c8289d0b0a84a938b -size 6161 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/model.safetensors b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/model.safetensors deleted file mode 100644 index f56812f0e476ea9d43bd40d7ff7261866f6fa2d2..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7a46354b82536b5fd3b84c627a781ab5dd486257d6ccaec684ef58965d14c8ba -size 298041696 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/optimizer.pt b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/optimizer.pt deleted file mode 100644 index 34bfd225368f331a463e212e2fc50317a9860f8a..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:29d7a8844c0ec5618ca98b651b06c5d47a350267ffbc46cd92ee978330ed7107 -size 596170443 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_0.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_0.pth deleted file mode 100644 index 2630ace0a6ca2e8b94dec507a517c8e51da1a062..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:38d7ef359718523590df9e682a92cb56fe7401ac013eeb40af3d6ce9eb52db3f -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_1.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_1.pth deleted file mode 100644 index b45cb1afbe06794de974cd0c7ca6c334e35a9b85..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e4e02846c018997fb2a63437b79039e6ffd03d4a1b2388956f198df7d435db23 -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_2.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_2.pth deleted file mode 100644 index 50237828da6508abb5ec3674175c9963b0893cef..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5345a9c5559eca496b295ecccec4ba0d714c05b192234ffe0bb22d9fb9f9fa65 -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_3.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_3.pth deleted file mode 100644 index c6312916d7136a31d3c4d139d9733d6c3b0947af..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f87ecc6acc3b0104c6b332cf9236660b108318e9bcba0d080d4a3915cd3eca90 -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/training_args.bin b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/training_args.bin deleted file mode 100644 index af2dac6567b500317449ced253b06ad07cb0d85a..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c73ab29bf2f573eaec3dd814ee64e6931ca0f3da63f7433c8289d0b0a84a938b -size 6161 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/model.safetensors b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/model.safetensors deleted file mode 100644 index 286856c23b266935ded44a693277c23e5c991b78..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4e5081c229000fd604fd5b2c8852a910dc1525539d70ab47b202247f261ace45 -size 298041696 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/optimizer.pt b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/optimizer.pt deleted file mode 100644 index bec908c2f2d95d7f1ff9e72f7feb641524b87d65..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ab12d2fdb52b6310449cb6653cf879090689802313feaa7917477e798206d38 -size 596170443 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_0.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_0.pth deleted file mode 100644 index 08b7274a3171f570743d4934a217751b3607a641..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8833c00aebc0e619e587f9b710f631c27b0f144c194509f4b71fbc2b817fe73b -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_1.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_1.pth deleted file mode 100644 index 3a3caf230e38900de7e29992b3f5e1fbe9e4f747..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:10e93038fca3c85b4bd66cf943246af72046fa052f77329dadcf03b484882631 -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_2.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_2.pth deleted file mode 100644 index 26c651a76e58ac61aa2d50bd6e347551ed7cc74d..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:893995e08f7db4d1bd2fa7b61362d2ca2a6c5936eb6e9af8051c007c0afcd24a -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_3.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_3.pth deleted file mode 100644 index 4a827a79035aa8f1230df212ad059f1d41dbeb39..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6cbed03005b56a2cba542949acbe7f890ada7074d8c41dbf04128640c3459be0 -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/training_args.bin b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/training_args.bin deleted file mode 100644 index af2dac6567b500317449ced253b06ad07cb0d85a..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c73ab29bf2f573eaec3dd814ee64e6931ca0f3da63f7433c8289d0b0a84a938b -size 6161 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/model.safetensors b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/model.safetensors deleted file mode 100644 index cc0802f8ac21d136637937e10ef465a86f102db2..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5a8268831a8f08bfce731da9b4d1a69338adf279352c89832f2a6b9ec3400203 -size 298041696 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/optimizer.pt b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/optimizer.pt deleted file mode 100644 index fce92b2d65e376da96ce4a6f96f20c32c9655fcb..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:51c392d991a187c7fb9c677c9f7f3dc089284791c082912cbcad51d1534a37b0 -size 596170443 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_0.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_0.pth deleted file mode 100644 index f093f8b15e64cdb3afbcfd8580646da768f1a225..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b7c3317de65b30d603f92fe6e96f6799b60ade22ae5df6aac7a9339d5943f7f1 -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_1.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_1.pth deleted file mode 100644 index aaf8cba328ef41f729bacf4259542a278b886d21..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:970a742489b7a9284135996af739d3ba3335d58e54026a7f786d5bfb4f0dff69 -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_2.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_2.pth deleted file mode 100644 index ef6b8c474ded1685ab642a2123382fb2995028d6..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fc9ee43db00b894b802bd59ba7b8b86295da75ac768fa84976d017cfadd8c106 -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_3.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_3.pth deleted file mode 100644 index 0c05a403378092d38901f30aa4bd12e0b96ebe64..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:98094ade0b37e212a3a763c172efa6d586516480838a3f8fafb9403a87fb9492 -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/training_args.bin b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/training_args.bin deleted file mode 100644 index af2dac6567b500317449ced253b06ad07cb0d85a..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c73ab29bf2f573eaec3dd814ee64e6931ca0f3da63f7433c8289d0b0a84a938b -size 6161 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/model.safetensors b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/model.safetensors deleted file mode 100644 index 37e4dc9d03e5ca5ac817e681c1990663f5f93b13..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d246c14f659e3528a5e9698dbc9236bc6bb9eb18c55b627e0292e5e33a6f0d46 -size 298041696 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt deleted file mode 100644 index 5bc4a386894587edcc0b673b6d56a9319890521c..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:73caeb604d1153a7db8dd73b2fc37b9e826fec045c5ee26810e16966b749c507 -size 596170443 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_0.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_0.pth deleted file mode 100644 index a0d9acbaec581c8f3e295832d287d737e9103575..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:676e3c962f8f33d9a16826c7fe7dd98a8b3bfb774ddb934acc5fb734b106b59d -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_1.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_1.pth deleted file mode 100644 index c6bf0fb0c2b195f8041f33d2a9f672a6a2cedeb1..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d374a9c193c317a1dab9c9052e0ce5250f98dbb111c85aa423a009c121e1fc49 -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_2.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_2.pth deleted file mode 100644 index 7640be24b9de8ed53fbbe9f2814857f497d20521..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fe84239abd2d7794d2c95c6c196b6450efef11198f055cb008f1ad56b35e4dbc -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_3.pth b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_3.pth deleted file mode 100644 index 4b84b163ec49f65ec248af895feb77becaa37d9e..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:358b89bc71f93b52249a9889b255e6ef55fedc24db1a6a3e29b8f70d82acf972 -size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/training_args.bin b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/training_args.bin deleted file mode 100644 index af2dac6567b500317449ced253b06ad07cb0d85a..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c73ab29bf2f573eaec3dd814ee64e6931ca0f3da63f7433c8289d0b0a84a938b -size 6161 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/model.safetensors b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/model.safetensors deleted file mode 100644 index cc0802f8ac21d136637937e10ef465a86f102db2..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5a8268831a8f08bfce731da9b4d1a69338adf279352c89832f2a6b9ec3400203 -size 298041696 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/runs/Nov24_23-18-07_nid005102/events.out.tfevents.1764019358.nid005102.70116.0 b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/runs/Nov24_23-18-07_nid005102/events.out.tfevents.1764019358.nid005102.70116.0 deleted file mode 100644 index 30f506bc5a35283537e1daaafb57e01fbb507a02..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/runs/Nov24_23-18-07_nid005102/events.out.tfevents.1764019358.nid005102.70116.0 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ac9a9290de7c1c26521db0bc385ab85d3984ceea0f11b5146e7a2cf9a6b37cdd -size 631145 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/training_args.bin b/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/training_args.bin deleted file mode 100644 index af2dac6567b500317449ced253b06ad07cb0d85a..0000000000000000000000000000000000000000 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c73ab29bf2f573eaec3dd814ee64e6931ca0f3da63f7433c8289d0b0a84a938b -size 6161 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/config.json b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/config.json similarity index 100% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/config.json rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/config.json diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/model.safetensors b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9a24c4dd28e9eaffe8fd58d6eb3276eeb3506da8 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e973922297f0c493940a90076a687d8de67be2aa6db71915dd86d5c61411430 +size 298041696 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/optimizer.pt b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ffa7a0de37729549ecbd00fe90e8b89446df9e3 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40f9a2ce28306aa7f03882afc781b4b2a7070bc09f5cb5b454593ccab7e320cd +size 596170443 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/rng_state_0.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..b8b84e5a5b6d9d950b41129ca967536bbff0837a --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cc940a8d556e924e63165a153482b7202c426188f75c20c24e12d424fd66480 +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/rng_state_1.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf92ab4253c6618d25c886d3c51d7c46960d0c94 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ffdae51287c04c7c26dda1477c56be872f0a72d6ef4c91eed718fc9815f5cd0 +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/rng_state_2.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..e291239b9a39b10101e6347a2bf1dfc5a0ad661d --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a259d84200d732d2a50300986ac2bfeae195d6cd6b8a5965f25beb1168967c93 +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/rng_state_3.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..41a50dd95aa3f74e5055320ac3bead24dac50036 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be124d0a4382f6650d248da735663a65eb35d7e6648a020fa3373d621111b9f4 +size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/scheduler.pt b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/scheduler.pt similarity index 100% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/scheduler.pt rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/scheduler.pt diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/trainer_state.json b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/trainer_state.json similarity index 54% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/trainer_state.json rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/trainer_state.json index a48350a4722b23d78a7c7fa725364a5f90f55505..77642a17e467a2d88fb94486cf4c7c71232d6742 100644 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/trainer_state.json +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/trainer_state.json @@ -2,7 +2,7 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.2039489525644114, + "epoch": 7.132667617689016, "eval_steps": 100, "global_step": 10000, "is_hyper_param_search": false, @@ -10,8310 +10,8310 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0012039489525644113, - "grad_norm": 29.25, + "epoch": 0.007132667617689016, + "grad_norm": 19.75, "learning_rate": 3.6e-07, - "loss": 5.6475, + "loss": 5.6319, "step": 10 }, { - "epoch": 0.0024078979051288226, - "grad_norm": 13.6875, + "epoch": 0.014265335235378032, + "grad_norm": 19.375, "learning_rate": 7.6e-07, - "loss": 5.6394, + "loss": 5.5914, "step": 20 }, { - "epoch": 0.003611846857693234, - "grad_norm": 36.0, + "epoch": 0.021398002853067047, + "grad_norm": 51.25, "learning_rate": 1.16e-06, - "loss": 5.6168, + "loss": 5.6495, "step": 30 }, { - "epoch": 0.004815795810257645, - "grad_norm": 17.0, + "epoch": 0.028530670470756064, + "grad_norm": 19.0, "learning_rate": 1.56e-06, - "loss": 5.6346, + "loss": 5.6581, "step": 40 }, { - "epoch": 0.006019744762822056, - "grad_norm": 16.5, + "epoch": 0.03566333808844508, + "grad_norm": 23.75, "learning_rate": 1.96e-06, - "loss": 5.6391, + "loss": 5.6366, "step": 50 }, { - "epoch": 0.007223693715386468, - "grad_norm": 16.5, + "epoch": 0.042796005706134094, + "grad_norm": 18.0, "learning_rate": 2.36e-06, - "loss": 5.6272, + "loss": 5.6411, "step": 60 }, { - "epoch": 0.00842764266795088, - "grad_norm": 14.8125, + "epoch": 0.04992867332382311, + "grad_norm": 14.4375, "learning_rate": 2.7600000000000003e-06, - "loss": 5.5979, + "loss": 5.5919, "step": 70 }, { - "epoch": 0.00963159162051529, - "grad_norm": 22.375, + "epoch": 0.05706134094151213, + "grad_norm": 24.125, "learning_rate": 3.1600000000000007e-06, - "loss": 5.6515, + "loss": 5.6083, "step": 80 }, { - "epoch": 0.010835540573079701, - "grad_norm": 17.125, + "epoch": 0.06419400855920114, + "grad_norm": 18.25, "learning_rate": 3.5600000000000002e-06, - "loss": 5.6018, + "loss": 5.6599, "step": 90 }, { - "epoch": 0.012039489525644112, - "grad_norm": 14.9375, + "epoch": 0.07132667617689016, + "grad_norm": 18.25, "learning_rate": 3.96e-06, - "loss": 5.6342, + "loss": 5.6652, "step": 100 }, { - "epoch": 0.012039489525644112, - "eval/acc": 3.4883720874786377, + "epoch": 0.07132667617689016, + "eval/acc": 2.3255813121795654, "step": 100 }, { - "epoch": 0.012039489525644112, - "eval_loss": 5.140806198120117, - "eval_runtime": 2.4165, - "eval_samples_per_second": 17.794, - "eval_steps_per_second": 0.414, + "epoch": 0.07132667617689016, + "eval_loss": 5.090479850769043, + "eval_runtime": 2.3284, + "eval_samples_per_second": 18.467, + "eval_steps_per_second": 0.429, "step": 100 }, { - "epoch": 0.013243438478208525, - "grad_norm": 13.0, + "epoch": 0.07845934379457917, + "grad_norm": 21.0, "learning_rate": 4.360000000000001e-06, - "loss": 5.6124, + "loss": 5.6402, "step": 110 }, { - "epoch": 0.014447387430772935, - "grad_norm": 18.625, + "epoch": 0.08559201141226819, + "grad_norm": 16.875, "learning_rate": 4.76e-06, - "loss": 5.6127, + "loss": 5.6535, "step": 120 }, { - "epoch": 0.015651336383337346, - "grad_norm": 14.375, + "epoch": 0.09272467902995721, + "grad_norm": 21.5, "learning_rate": 5.1600000000000006e-06, - "loss": 5.5663, + "loss": 5.5821, "step": 130 }, { - "epoch": 0.01685528533590176, - "grad_norm": 11.9375, + "epoch": 0.09985734664764621, + "grad_norm": 18.5, "learning_rate": 5.56e-06, - "loss": 5.55, + "loss": 5.6184, "step": 140 }, { - "epoch": 0.018059234288466168, - "grad_norm": 14.5, + "epoch": 0.10699001426533523, + "grad_norm": 14.9375, "learning_rate": 5.9600000000000005e-06, - "loss": 5.5839, + "loss": 5.5743, "step": 150 }, { - "epoch": 0.01926318324103058, - "grad_norm": 15.0625, + "epoch": 0.11412268188302425, + "grad_norm": 16.875, "learning_rate": 6.360000000000001e-06, - "loss": 5.5259, + "loss": 5.5684, "step": 160 }, { - "epoch": 0.020467132193594993, - "grad_norm": 14.8125, + "epoch": 0.12125534950071326, + "grad_norm": 22.125, "learning_rate": 6.76e-06, - "loss": 5.4812, + "loss": 5.535, "step": 170 }, { - "epoch": 0.021671081146159402, - "grad_norm": 15.375, + "epoch": 0.12838801711840228, + "grad_norm": 15.9375, "learning_rate": 7.16e-06, - "loss": 5.4964, + "loss": 5.4357, "step": 180 }, { - "epoch": 0.022875030098723815, - "grad_norm": 14.0625, + "epoch": 0.1355206847360913, + "grad_norm": 16.375, "learning_rate": 7.5600000000000005e-06, - "loss": 5.4023, + "loss": 5.3766, "step": 190 }, { - "epoch": 0.024078979051288224, - "grad_norm": 18.625, + "epoch": 0.14265335235378032, + "grad_norm": 15.3125, "learning_rate": 7.96e-06, - "loss": 5.3778, + "loss": 5.4437, "step": 200 }, { - "epoch": 0.024078979051288224, - "eval/acc": 5.232558250427246, + "epoch": 0.14265335235378032, + "eval/acc": 2.3255813121795654, "step": 200 }, { - "epoch": 0.024078979051288224, - "eval_loss": 4.991551399230957, - "eval_runtime": 0.2363, - "eval_samples_per_second": 181.988, - "eval_steps_per_second": 4.232, + "epoch": 0.14265335235378032, + "eval_loss": 4.956757545471191, + "eval_runtime": 0.2941, + "eval_samples_per_second": 146.185, + "eval_steps_per_second": 3.4, "step": 200 }, { - "epoch": 0.025282928003852637, - "grad_norm": 16.25, + "epoch": 0.14978601997146934, + "grad_norm": 16.75, "learning_rate": 8.36e-06, - "loss": 5.3983, + "loss": 5.4744, "step": 210 }, { - "epoch": 0.02648687695641705, - "grad_norm": 17.25, + "epoch": 0.15691868758915833, + "grad_norm": 43.25, "learning_rate": 8.76e-06, - "loss": 5.2953, + "loss": 5.381, "step": 220 }, { - "epoch": 0.02769082590898146, - "grad_norm": 15.9375, + "epoch": 0.16405135520684735, + "grad_norm": 21.0, "learning_rate": 9.16e-06, - "loss": 5.2266, + "loss": 5.3092, "step": 230 }, { - "epoch": 0.02889477486154587, - "grad_norm": 21.875, + "epoch": 0.17118402282453637, + "grad_norm": 26.75, "learning_rate": 9.560000000000002e-06, - "loss": 5.139, + "loss": 5.2752, "step": 240 }, { - "epoch": 0.03009872381411028, - "grad_norm": 17.875, + "epoch": 0.1783166904422254, + "grad_norm": 26.875, "learning_rate": 9.96e-06, - "loss": 5.0639, + "loss": 5.2194, "step": 250 }, { - "epoch": 0.03130267276667469, - "grad_norm": 18.875, + "epoch": 0.18544935805991442, + "grad_norm": 20.875, "learning_rate": 1.036e-05, - "loss": 5.0118, + "loss": 5.0657, "step": 260 }, { - "epoch": 0.032506621719239105, - "grad_norm": 26.0, + "epoch": 0.19258202567760344, + "grad_norm": 25.125, "learning_rate": 1.076e-05, - "loss": 4.8959, + "loss": 4.967, "step": 270 }, { - "epoch": 0.03371057067180352, - "grad_norm": 18.5, + "epoch": 0.19971469329529243, + "grad_norm": 30.125, "learning_rate": 1.1160000000000002e-05, - "loss": 4.8454, + "loss": 4.9544, "step": 280 }, { - "epoch": 0.03491451962436792, - "grad_norm": 28.0, + "epoch": 0.20684736091298145, + "grad_norm": 24.625, "learning_rate": 1.156e-05, - "loss": 4.6846, + "loss": 4.7585, "step": 290 }, { - "epoch": 0.036118468576932336, - "grad_norm": 25.5, + "epoch": 0.21398002853067047, + "grad_norm": 21.375, "learning_rate": 1.196e-05, - "loss": 4.5211, + "loss": 4.635, "step": 300 }, { - "epoch": 0.036118468576932336, - "eval/acc": 6.395349025726318, + "epoch": 0.21398002853067047, + "eval/acc": 9.302325248718262, "step": 300 }, { - "epoch": 0.036118468576932336, - "eval_loss": 4.604515075683594, - "eval_runtime": 0.2156, - "eval_samples_per_second": 199.428, - "eval_steps_per_second": 4.638, + "epoch": 0.21398002853067047, + "eval_loss": 4.364280700683594, + "eval_runtime": 0.2145, + "eval_samples_per_second": 200.42, + "eval_steps_per_second": 4.661, "step": 300 }, { - "epoch": 0.03732241752949675, - "grad_norm": 28.0, + "epoch": 0.2211126961483595, + "grad_norm": 30.125, "learning_rate": 1.236e-05, - "loss": 4.3466, + "loss": 4.5333, "step": 310 }, { - "epoch": 0.03852636648206116, - "grad_norm": 27.125, + "epoch": 0.2282453637660485, + "grad_norm": 28.125, "learning_rate": 1.276e-05, - "loss": 4.1005, + "loss": 4.2888, "step": 320 }, { - "epoch": 0.039730315434625574, - "grad_norm": 33.0, + "epoch": 0.23537803138373753, + "grad_norm": 30.5, "learning_rate": 1.316e-05, - "loss": 3.7904, + "loss": 4.1744, "step": 330 }, { - "epoch": 0.040934264387189986, - "grad_norm": 32.75, + "epoch": 0.24251069900142652, + "grad_norm": 35.0, "learning_rate": 1.356e-05, - "loss": 3.4061, + "loss": 3.8812, "step": 340 }, { - "epoch": 0.04213821333975439, - "grad_norm": 31.125, + "epoch": 0.24964336661911554, + "grad_norm": 30.75, "learning_rate": 1.396e-05, - "loss": 3.2838, + "loss": 3.6772, "step": 350 }, { - "epoch": 0.043342162292318805, - "grad_norm": 23.75, + "epoch": 0.25677603423680456, + "grad_norm": 25.875, "learning_rate": 1.4360000000000001e-05, - "loss": 2.9101, + "loss": 3.3797, "step": 360 }, { - "epoch": 0.04454611124488322, - "grad_norm": 44.75, + "epoch": 0.26390870185449355, + "grad_norm": 31.375, "learning_rate": 1.4760000000000001e-05, - "loss": 2.6306, + "loss": 3.2338, "step": 370 }, { - "epoch": 0.04575006019744763, - "grad_norm": 33.25, + "epoch": 0.2710413694721826, + "grad_norm": 72.0, "learning_rate": 1.5160000000000002e-05, - "loss": 2.5454, + "loss": 2.976, "step": 380 }, { - "epoch": 0.04695400915001204, - "grad_norm": 31.375, + "epoch": 0.2781740370898716, + "grad_norm": 22.375, "learning_rate": 1.556e-05, - "loss": 2.5867, + "loss": 2.8207, "step": 390 }, { - "epoch": 0.04815795810257645, - "grad_norm": 18.5, + "epoch": 0.28530670470756064, + "grad_norm": 21.25, "learning_rate": 1.596e-05, - "loss": 2.3251, + "loss": 2.8341, "step": 400 }, { - "epoch": 0.04815795810257645, - "eval/acc": 12.209301948547363, + "epoch": 0.28530670470756064, + "eval/acc": 9.302325248718262, "step": 400 }, { - "epoch": 0.04815795810257645, - "eval_loss": 3.941906452178955, - "eval_runtime": 0.2265, - "eval_samples_per_second": 189.814, - "eval_steps_per_second": 4.414, + "epoch": 0.28530670470756064, + "eval_loss": 3.6011083126068115, + "eval_runtime": 0.2115, + "eval_samples_per_second": 203.312, + "eval_steps_per_second": 4.728, "step": 400 }, { - "epoch": 0.04936190705514086, - "grad_norm": 18.0, + "epoch": 0.29243937232524964, + "grad_norm": 21.0, "learning_rate": 1.636e-05, - "loss": 2.394, + "loss": 2.6431, "step": 410 }, { - "epoch": 0.05056585600770527, - "grad_norm": 22.375, + "epoch": 0.2995720399429387, + "grad_norm": 20.875, "learning_rate": 1.6760000000000002e-05, - "loss": 2.2856, + "loss": 2.6506, "step": 420 }, { - "epoch": 0.051769804960269686, - "grad_norm": 17.25, + "epoch": 0.3067047075606277, + "grad_norm": 21.125, "learning_rate": 1.7160000000000002e-05, - "loss": 2.3414, + "loss": 2.491, "step": 430 }, { - "epoch": 0.0529737539128341, - "grad_norm": 15.25, + "epoch": 0.31383737517831667, + "grad_norm": 31.75, "learning_rate": 1.756e-05, - "loss": 2.156, + "loss": 2.423, "step": 440 }, { - "epoch": 0.054177702865398504, - "grad_norm": 15.75, + "epoch": 0.3209700427960057, + "grad_norm": 19.375, "learning_rate": 1.796e-05, - "loss": 2.0164, + "loss": 2.5108, "step": 450 }, { - "epoch": 0.05538165181796292, - "grad_norm": 28.5, + "epoch": 0.3281027104136947, + "grad_norm": 17.375, "learning_rate": 1.8360000000000004e-05, - "loss": 1.9555, + "loss": 2.4584, "step": 460 }, { - "epoch": 0.05658560077052733, - "grad_norm": 19.25, + "epoch": 0.33523537803138376, + "grad_norm": 22.625, "learning_rate": 1.876e-05, - "loss": 2.0277, + "loss": 2.3526, "step": 470 }, { - "epoch": 0.05778954972309174, - "grad_norm": 15.375, + "epoch": 0.34236804564907275, + "grad_norm": 30.25, "learning_rate": 1.916e-05, - "loss": 2.1719, + "loss": 2.3634, "step": 480 }, { - "epoch": 0.058993498675656154, - "grad_norm": 18.875, + "epoch": 0.34950071326676174, + "grad_norm": 15.5625, "learning_rate": 1.956e-05, - "loss": 2.013, + "loss": 2.3339, "step": 490 }, { - "epoch": 0.06019744762822056, - "grad_norm": 18.625, + "epoch": 0.3566333808844508, + "grad_norm": 19.5, "learning_rate": 1.9960000000000002e-05, - "loss": 1.8574, + "loss": 2.268, "step": 500 }, { - "epoch": 0.06019744762822056, - "eval/acc": 20.930233001708984, + "epoch": 0.3566333808844508, + "eval/acc": 16.279069900512695, "step": 500 }, { - "epoch": 0.06019744762822056, - "eval_loss": 3.6547293663024902, - "eval_runtime": 0.2139, - "eval_samples_per_second": 201.002, - "eval_steps_per_second": 4.674, + "epoch": 0.3566333808844508, + "eval_loss": 3.2489778995513916, + "eval_runtime": 0.2558, + "eval_samples_per_second": 168.115, + "eval_steps_per_second": 3.91, "step": 500 }, { - "epoch": 0.06140139658078497, - "grad_norm": 19.875, + "epoch": 0.3637660485021398, + "grad_norm": 29.375, "learning_rate": 2.036e-05, - "loss": 1.9431, + "loss": 2.2728, "step": 510 }, { - "epoch": 0.06260534553334939, - "grad_norm": 14.625, + "epoch": 0.37089871611982883, + "grad_norm": 21.25, "learning_rate": 2.076e-05, - "loss": 1.8311, + "loss": 2.1346, "step": 520 }, { - "epoch": 0.0638092944859138, - "grad_norm": 20.0, + "epoch": 0.3780313837375178, + "grad_norm": 14.8125, "learning_rate": 2.116e-05, - "loss": 2.0005, + "loss": 2.2719, "step": 530 }, { - "epoch": 0.06501324343847821, - "grad_norm": 16.0, + "epoch": 0.38516405135520687, + "grad_norm": 27.75, "learning_rate": 2.1560000000000004e-05, - "loss": 1.7374, + "loss": 2.145, "step": 540 }, { - "epoch": 0.06621719239104262, - "grad_norm": 13.0625, + "epoch": 0.39229671897289586, + "grad_norm": 16.125, "learning_rate": 2.196e-05, - "loss": 1.7838, + "loss": 2.0912, "step": 550 }, { - "epoch": 0.06742114134360704, - "grad_norm": 16.5, + "epoch": 0.39942938659058486, + "grad_norm": 20.25, "learning_rate": 2.236e-05, - "loss": 1.8264, + "loss": 2.0302, "step": 560 }, { - "epoch": 0.06862509029617145, - "grad_norm": 20.5, + "epoch": 0.4065620542082739, + "grad_norm": 17.75, "learning_rate": 2.2760000000000002e-05, - "loss": 1.658, + "loss": 2.1832, "step": 570 }, { - "epoch": 0.06982903924873585, - "grad_norm": 25.75, + "epoch": 0.4136947218259629, + "grad_norm": 14.5, "learning_rate": 2.3160000000000002e-05, - "loss": 1.7826, + "loss": 1.9652, "step": 580 }, { - "epoch": 0.07103298820130026, - "grad_norm": 19.375, + "epoch": 0.42082738944365194, + "grad_norm": 17.0, "learning_rate": 2.356e-05, - "loss": 1.6539, + "loss": 1.8911, "step": 590 }, { - "epoch": 0.07223693715386467, - "grad_norm": 19.25, + "epoch": 0.42796005706134094, + "grad_norm": 20.0, "learning_rate": 2.396e-05, - "loss": 1.6278, + "loss": 2.0266, "step": 600 }, { - "epoch": 0.07223693715386467, - "eval/acc": 20.930233001708984, + "epoch": 0.42796005706134094, + "eval/acc": 30.23255729675293, "step": 600 }, { - "epoch": 0.07223693715386467, - "eval_loss": 3.387899398803711, - "eval_runtime": 0.2536, - "eval_samples_per_second": 169.572, - "eval_steps_per_second": 3.944, + "epoch": 0.42796005706134094, + "eval_loss": 2.946028470993042, + "eval_runtime": 0.2139, + "eval_samples_per_second": 201.065, + "eval_steps_per_second": 4.676, "step": 600 }, { - "epoch": 0.07344088610642908, - "grad_norm": 12.0625, + "epoch": 0.43509272467902993, + "grad_norm": 25.5, "learning_rate": 2.4360000000000004e-05, - "loss": 1.5342, + "loss": 1.9116, "step": 610 }, { - "epoch": 0.0746448350589935, - "grad_norm": 15.625, + "epoch": 0.442225392296719, + "grad_norm": 25.375, "learning_rate": 2.476e-05, - "loss": 1.5919, + "loss": 1.7644, "step": 620 }, { - "epoch": 0.07584878401155791, - "grad_norm": 25.5, + "epoch": 0.44935805991440797, + "grad_norm": 15.5, "learning_rate": 2.516e-05, - "loss": 1.5713, + "loss": 1.9008, "step": 630 }, { - "epoch": 0.07705273296412232, - "grad_norm": 14.8125, + "epoch": 0.456490727532097, + "grad_norm": 16.875, "learning_rate": 2.556e-05, - "loss": 1.4714, + "loss": 1.619, "step": 640 }, { - "epoch": 0.07825668191668674, - "grad_norm": 21.5, + "epoch": 0.463623395149786, + "grad_norm": 37.25, "learning_rate": 2.5960000000000002e-05, - "loss": 1.5835, + "loss": 1.7725, "step": 650 }, { - "epoch": 0.07946063086925115, - "grad_norm": 58.0, + "epoch": 0.47075606276747506, + "grad_norm": 16.5, "learning_rate": 2.6360000000000002e-05, - "loss": 1.5369, + "loss": 1.7405, "step": 660 }, { - "epoch": 0.08066457982181556, - "grad_norm": 45.0, + "epoch": 0.47788873038516405, + "grad_norm": 16.25, "learning_rate": 2.676e-05, - "loss": 1.4629, + "loss": 1.5825, "step": 670 }, { - "epoch": 0.08186852877437997, - "grad_norm": 14.1875, + "epoch": 0.48502139800285304, + "grad_norm": 68.5, "learning_rate": 2.716e-05, - "loss": 1.4288, + "loss": 1.8379, "step": 680 }, { - "epoch": 0.08307247772694437, - "grad_norm": 40.25, + "epoch": 0.4921540656205421, + "grad_norm": 50.0, "learning_rate": 2.7560000000000004e-05, - "loss": 1.4729, + "loss": 1.7989, "step": 690 }, { - "epoch": 0.08427642667950878, - "grad_norm": 13.625, + "epoch": 0.4992867332382311, + "grad_norm": 16.25, "learning_rate": 2.7960000000000003e-05, - "loss": 1.4883, + "loss": 1.7058, "step": 700 }, { - "epoch": 0.08427642667950878, - "eval/acc": 23.255813598632812, + "epoch": 0.4992867332382311, + "eval/acc": 30.23255729675293, "step": 700 }, { - "epoch": 0.08427642667950878, - "eval_loss": 3.206946611404419, - "eval_runtime": 0.4188, - "eval_samples_per_second": 102.684, - "eval_steps_per_second": 2.388, + "epoch": 0.4992867332382311, + "eval_loss": 2.8809142112731934, + "eval_runtime": 0.2095, + "eval_samples_per_second": 205.208, + "eval_steps_per_second": 4.772, "step": 700 }, { - "epoch": 0.0854803756320732, - "grad_norm": 15.75, + "epoch": 0.5064194008559201, + "grad_norm": 14.625, "learning_rate": 2.8360000000000003e-05, - "loss": 1.5656, + "loss": 1.6542, "step": 710 }, { - "epoch": 0.08668432458463761, - "grad_norm": 22.25, + "epoch": 0.5135520684736091, + "grad_norm": 71.0, "learning_rate": 2.8760000000000002e-05, - "loss": 1.6742, + "loss": 1.6763, "step": 720 }, { - "epoch": 0.08788827353720202, - "grad_norm": 12.3125, + "epoch": 0.5206847360912982, + "grad_norm": 17.125, "learning_rate": 2.9160000000000005e-05, - "loss": 1.35, + "loss": 1.6858, "step": 730 }, { - "epoch": 0.08909222248976643, - "grad_norm": 13.8125, + "epoch": 0.5278174037089871, + "grad_norm": 19.75, "learning_rate": 2.9559999999999998e-05, - "loss": 1.4435, + "loss": 1.6718, "step": 740 }, { - "epoch": 0.09029617144233085, - "grad_norm": 13.1875, + "epoch": 0.5349500713266762, + "grad_norm": 13.375, "learning_rate": 2.9959999999999998e-05, - "loss": 1.3843, + "loss": 1.6164, "step": 750 }, { - "epoch": 0.09150012039489526, - "grad_norm": 13.3125, + "epoch": 0.5420827389443652, + "grad_norm": 14.1875, "learning_rate": 3.036e-05, - "loss": 1.3327, + "loss": 1.6049, "step": 760 }, { - "epoch": 0.09270406934745967, - "grad_norm": 18.875, + "epoch": 0.5492154065620543, + "grad_norm": 35.75, "learning_rate": 3.076e-05, - "loss": 1.4628, + "loss": 1.5453, "step": 770 }, { - "epoch": 0.09390801830002408, - "grad_norm": 14.5625, + "epoch": 0.5563480741797432, + "grad_norm": 28.75, "learning_rate": 3.116e-05, - "loss": 1.3306, + "loss": 1.4818, "step": 780 }, { - "epoch": 0.09511196725258848, - "grad_norm": 18.75, + "epoch": 0.5634807417974322, + "grad_norm": 17.375, "learning_rate": 3.156e-05, - "loss": 1.4936, + "loss": 1.5647, "step": 790 }, { - "epoch": 0.0963159162051529, - "grad_norm": 11.5, + "epoch": 0.5706134094151213, + "grad_norm": 13.6875, "learning_rate": 3.196e-05, - "loss": 1.3515, + "loss": 1.5206, "step": 800 }, { - "epoch": 0.0963159162051529, - "eval/acc": 22.674419403076172, + "epoch": 0.5706134094151213, + "eval/acc": 30.23255729675293, "step": 800 }, { - "epoch": 0.0963159162051529, - "eval_loss": 3.1510462760925293, - "eval_runtime": 0.2676, - "eval_samples_per_second": 160.701, - "eval_steps_per_second": 3.737, + "epoch": 0.5706134094151213, + "eval_loss": 2.8829538822174072, + "eval_runtime": 0.2122, + "eval_samples_per_second": 202.686, + "eval_steps_per_second": 4.714, "step": 800 }, { - "epoch": 0.09751986515771731, - "grad_norm": 11.6875, + "epoch": 0.5777460770328102, + "grad_norm": 17.125, "learning_rate": 3.236e-05, - "loss": 1.4593, + "loss": 1.6124, "step": 810 }, { - "epoch": 0.09872381411028172, - "grad_norm": 10.5625, + "epoch": 0.5848787446504993, + "grad_norm": 14.8125, "learning_rate": 3.2760000000000005e-05, - "loss": 1.3453, + "loss": 1.4254, "step": 820 }, { - "epoch": 0.09992776306284613, - "grad_norm": 11.625, + "epoch": 0.5920114122681883, + "grad_norm": 15.0, "learning_rate": 3.316e-05, - "loss": 1.4041, + "loss": 1.7124, "step": 830 }, { - "epoch": 0.10113171201541055, - "grad_norm": 13.0, + "epoch": 0.5991440798858774, + "grad_norm": 14.75, "learning_rate": 3.3560000000000004e-05, - "loss": 1.2766, + "loss": 1.5384, "step": 840 }, { - "epoch": 0.10233566096797496, - "grad_norm": 40.0, + "epoch": 0.6062767475035663, + "grad_norm": 31.5, "learning_rate": 3.396e-05, - "loss": 1.2678, + "loss": 1.4899, "step": 850 }, { - "epoch": 0.10353960992053937, - "grad_norm": 13.75, + "epoch": 0.6134094151212554, + "grad_norm": 13.875, "learning_rate": 3.436e-05, - "loss": 1.2514, + "loss": 1.5377, "step": 860 }, { - "epoch": 0.10474355887310378, - "grad_norm": 11.75, + "epoch": 0.6205420827389444, + "grad_norm": 14.9375, "learning_rate": 3.4760000000000006e-05, - "loss": 1.3518, + "loss": 1.4892, "step": 870 }, { - "epoch": 0.1059475078256682, - "grad_norm": 11.875, + "epoch": 0.6276747503566333, + "grad_norm": 37.25, "learning_rate": 3.516e-05, - "loss": 1.2675, + "loss": 1.4872, "step": 880 }, { - "epoch": 0.10715145677823261, - "grad_norm": 13.0, + "epoch": 0.6348074179743224, + "grad_norm": 18.875, "learning_rate": 3.5560000000000005e-05, - "loss": 1.294, + "loss": 1.536, "step": 890 }, { - "epoch": 0.10835540573079701, - "grad_norm": 13.0, + "epoch": 0.6419400855920114, + "grad_norm": 18.625, "learning_rate": 3.596e-05, - "loss": 1.1209, + "loss": 1.5208, "step": 900 }, { - "epoch": 0.10835540573079701, - "eval/acc": 25.581396102905273, + "epoch": 0.6419400855920114, + "eval/acc": 30.23255729675293, "step": 900 }, { - "epoch": 0.10835540573079701, - "eval_loss": 3.0571491718292236, - "eval_runtime": 0.3097, - "eval_samples_per_second": 138.846, - "eval_steps_per_second": 3.229, + "epoch": 0.6419400855920114, + "eval_loss": 2.8081133365631104, + "eval_runtime": 0.2319, + "eval_samples_per_second": 185.434, + "eval_steps_per_second": 4.312, "step": 900 }, { - "epoch": 0.10955935468336142, - "grad_norm": 12.75, + "epoch": 0.6490727532097005, + "grad_norm": 19.875, "learning_rate": 3.636e-05, - "loss": 1.2681, + "loss": 1.4606, "step": 910 }, { - "epoch": 0.11076330363592583, - "grad_norm": 17.0, + "epoch": 0.6562054208273894, + "grad_norm": 12.625, "learning_rate": 3.676e-05, - "loss": 1.2606, + "loss": 1.4728, "step": 920 }, { - "epoch": 0.11196725258849025, - "grad_norm": 11.375, + "epoch": 0.6633380884450785, + "grad_norm": 15.0, "learning_rate": 3.716e-05, - "loss": 1.2194, + "loss": 1.449, "step": 930 }, { - "epoch": 0.11317120154105466, - "grad_norm": 12.125, + "epoch": 0.6704707560627675, + "grad_norm": 19.0, "learning_rate": 3.756e-05, - "loss": 1.2905, + "loss": 1.5292, "step": 940 }, { - "epoch": 0.11437515049361907, - "grad_norm": 18.125, + "epoch": 0.6776034236804565, + "grad_norm": 111.5, "learning_rate": 3.796e-05, - "loss": 1.2563, + "loss": 1.4891, "step": 950 }, { - "epoch": 0.11557909944618348, - "grad_norm": 17.125, + "epoch": 0.6847360912981455, + "grad_norm": 14.75, "learning_rate": 3.836e-05, - "loss": 1.1894, + "loss": 1.4202, "step": 960 }, { - "epoch": 0.1167830483987479, - "grad_norm": 11.875, + "epoch": 0.6918687589158345, + "grad_norm": 20.25, "learning_rate": 3.876e-05, - "loss": 1.2441, + "loss": 1.5258, "step": 970 }, { - "epoch": 0.11798699735131231, - "grad_norm": 15.8125, + "epoch": 0.6990014265335235, + "grad_norm": 48.0, "learning_rate": 3.9160000000000005e-05, - "loss": 1.2627, + "loss": 1.3912, "step": 980 }, { - "epoch": 0.11919094630387672, - "grad_norm": 17.375, + "epoch": 0.7061340941512125, + "grad_norm": 13.0, "learning_rate": 3.956e-05, - "loss": 1.3929, + "loss": 1.4859, "step": 990 }, { - "epoch": 0.12039489525644112, - "grad_norm": 11.125, + "epoch": 0.7132667617689016, + "grad_norm": 15.5625, "learning_rate": 3.9960000000000004e-05, - "loss": 1.1332, + "loss": 1.4614, "step": 1000 }, { - "epoch": 0.12039489525644112, - "eval/acc": 26.162790298461914, + "epoch": 0.7132667617689016, + "eval/acc": 37.20930099487305, "step": 1000 }, { - "epoch": 0.12039489525644112, - "eval_loss": 2.9910976886749268, - "eval_runtime": 0.2826, - "eval_samples_per_second": 152.17, - "eval_steps_per_second": 3.539, + "epoch": 0.7132667617689016, + "eval_loss": 2.743621587753296, + "eval_runtime": 0.5811, + "eval_samples_per_second": 73.997, + "eval_steps_per_second": 1.721, "step": 1000 }, { - "epoch": 0.12159884420900553, - "grad_norm": 13.75, + "epoch": 0.7203994293865906, + "grad_norm": 16.625, "learning_rate": 4.0360000000000007e-05, - "loss": 1.2314, + "loss": 1.56, "step": 1010 }, { - "epoch": 0.12280279316156995, - "grad_norm": 11.875, + "epoch": 0.7275320970042796, + "grad_norm": 15.5625, "learning_rate": 4.076e-05, - "loss": 1.2654, + "loss": 1.4469, "step": 1020 }, { - "epoch": 0.12400674211413436, - "grad_norm": 12.8125, + "epoch": 0.7346647646219686, + "grad_norm": 15.0, "learning_rate": 4.1160000000000006e-05, - "loss": 1.1432, + "loss": 1.381, "step": 1030 }, { - "epoch": 0.12521069106669877, - "grad_norm": 13.9375, + "epoch": 0.7417974322396577, + "grad_norm": 13.625, "learning_rate": 4.156e-05, - "loss": 1.1669, + "loss": 1.3749, "step": 1040 }, { - "epoch": 0.1264146400192632, - "grad_norm": 19.25, + "epoch": 0.7489300998573466, + "grad_norm": 12.0625, "learning_rate": 4.196e-05, - "loss": 1.1836, + "loss": 1.3919, "step": 1050 }, { - "epoch": 0.1276185889718276, - "grad_norm": 11.375, + "epoch": 0.7560627674750356, + "grad_norm": 16.25, "learning_rate": 4.236e-05, - "loss": 1.2449, + "loss": 1.4208, "step": 1060 }, { - "epoch": 0.128822537924392, - "grad_norm": 10.6875, + "epoch": 0.7631954350927247, + "grad_norm": 27.75, "learning_rate": 4.276e-05, - "loss": 1.1361, + "loss": 1.3714, "step": 1070 }, { - "epoch": 0.13002648687695642, - "grad_norm": 11.5, + "epoch": 0.7703281027104137, + "grad_norm": 13.125, "learning_rate": 4.316e-05, - "loss": 1.1989, + "loss": 1.3344, "step": 1080 }, { - "epoch": 0.13123043582952082, - "grad_norm": 13.0, + "epoch": 0.7774607703281027, + "grad_norm": 11.9375, "learning_rate": 4.356e-05, - "loss": 1.1004, + "loss": 1.3291, "step": 1090 }, { - "epoch": 0.13243438478208525, - "grad_norm": 10.125, + "epoch": 0.7845934379457917, + "grad_norm": 17.125, "learning_rate": 4.396e-05, - "loss": 1.1308, + "loss": 1.3536, "step": 1100 }, { - "epoch": 0.13243438478208525, + "epoch": 0.7845934379457917, "eval/acc": 27.9069766998291, "step": 1100 }, { - "epoch": 0.13243438478208525, - "eval_loss": 3.0177316665649414, - "eval_runtime": 0.2801, - "eval_samples_per_second": 153.54, - "eval_steps_per_second": 3.571, + "epoch": 0.7845934379457917, + "eval_loss": 2.8128697872161865, + "eval_runtime": 0.484, + "eval_samples_per_second": 88.836, + "eval_steps_per_second": 2.066, "step": 1100 }, { - "epoch": 0.13363833373464964, - "grad_norm": 9.5, + "epoch": 0.7917261055634808, + "grad_norm": 13.3125, "learning_rate": 4.436e-05, - "loss": 1.1862, + "loss": 1.4598, "step": 1110 }, { - "epoch": 0.13484228268721407, - "grad_norm": 13.75, + "epoch": 0.7988587731811697, + "grad_norm": 15.25, "learning_rate": 4.4760000000000005e-05, - "loss": 1.1764, + "loss": 1.3795, "step": 1120 }, { - "epoch": 0.13604623163977847, - "grad_norm": 30.625, + "epoch": 0.8059914407988588, + "grad_norm": 12.0625, "learning_rate": 4.516e-05, - "loss": 1.0422, + "loss": 1.2518, "step": 1130 }, { - "epoch": 0.1372501805923429, - "grad_norm": 9.875, + "epoch": 0.8131241084165478, + "grad_norm": 16.625, "learning_rate": 4.5560000000000004e-05, - "loss": 1.1796, + "loss": 1.3104, "step": 1140 }, { - "epoch": 0.1384541295449073, - "grad_norm": 13.1875, + "epoch": 0.8202567760342369, + "grad_norm": 11.875, "learning_rate": 4.596e-05, - "loss": 1.0483, + "loss": 1.2996, "step": 1150 }, { - "epoch": 0.1396580784974717, - "grad_norm": 11.75, + "epoch": 0.8273894436519258, + "grad_norm": 24.125, "learning_rate": 4.636e-05, - "loss": 1.1647, + "loss": 1.2067, "step": 1160 }, { - "epoch": 0.14086202745003612, - "grad_norm": 13.375, + "epoch": 0.8345221112696148, + "grad_norm": 11.0, "learning_rate": 4.6760000000000006e-05, - "loss": 1.2839, + "loss": 1.3035, "step": 1170 }, { - "epoch": 0.14206597640260052, - "grad_norm": 42.0, + "epoch": 0.8416547788873039, + "grad_norm": 13.125, "learning_rate": 4.716e-05, - "loss": 1.1594, + "loss": 1.2859, "step": 1180 }, { - "epoch": 0.14326992535516495, - "grad_norm": 15.625, + "epoch": 0.8487874465049928, + "grad_norm": 11.0, "learning_rate": 4.7560000000000005e-05, - "loss": 1.1073, + "loss": 1.3982, "step": 1190 }, { - "epoch": 0.14447387430772934, - "grad_norm": 11.5, + "epoch": 0.8559201141226819, + "grad_norm": 12.875, "learning_rate": 4.796e-05, - "loss": 1.1593, + "loss": 1.299, "step": 1200 }, { - "epoch": 0.14447387430772934, - "eval/acc": 26.162790298461914, + "epoch": 0.8559201141226819, + "eval/acc": 34.88372039794922, "step": 1200 }, { - "epoch": 0.14447387430772934, - "eval_loss": 3.0329606533050537, - "eval_runtime": 0.2185, - "eval_samples_per_second": 196.829, - "eval_steps_per_second": 4.577, + "epoch": 0.8559201141226819, + "eval_loss": 2.7250428199768066, + "eval_runtime": 0.3522, + "eval_samples_per_second": 122.084, + "eval_steps_per_second": 2.839, "step": 1200 }, { - "epoch": 0.14567782326029377, - "grad_norm": 12.5625, + "epoch": 0.8630527817403709, + "grad_norm": 11.25, "learning_rate": 4.836e-05, - "loss": 1.1088, + "loss": 1.3549, "step": 1210 }, { - "epoch": 0.14688177221285817, - "grad_norm": 10.4375, + "epoch": 0.8701854493580599, + "grad_norm": 15.25, "learning_rate": 4.876e-05, - "loss": 1.1565, + "loss": 1.3649, "step": 1220 }, { - "epoch": 0.1480857211654226, - "grad_norm": 11.3125, + "epoch": 0.8773181169757489, + "grad_norm": 22.0, "learning_rate": 4.9160000000000004e-05, - "loss": 1.0596, + "loss": 1.2441, "step": 1230 }, { - "epoch": 0.149289670117987, - "grad_norm": 11.375, + "epoch": 0.884450784593438, + "grad_norm": 12.375, "learning_rate": 4.956e-05, - "loss": 1.2416, + "loss": 1.2196, "step": 1240 }, { - "epoch": 0.15049361907055142, - "grad_norm": 10.3125, + "epoch": 0.891583452211127, + "grad_norm": 14.25, "learning_rate": 4.996e-05, - "loss": 1.0492, + "loss": 1.3274, "step": 1250 }, { - "epoch": 0.15169756802311582, - "grad_norm": 10.9375, + "epoch": 0.8987161198288159, + "grad_norm": 10.0625, "learning_rate": 5.0360000000000006e-05, - "loss": 1.0263, + "loss": 1.2896, "step": 1260 }, { - "epoch": 0.15290151697568022, - "grad_norm": 11.0625, + "epoch": 0.905848787446505, + "grad_norm": 16.875, "learning_rate": 5.076000000000001e-05, - "loss": 1.1197, + "loss": 1.3019, "step": 1270 }, { - "epoch": 0.15410546592824464, - "grad_norm": 33.25, + "epoch": 0.912981455064194, + "grad_norm": 26.375, "learning_rate": 5.1160000000000005e-05, - "loss": 1.0614, + "loss": 1.3756, "step": 1280 }, { - "epoch": 0.15530941488080904, - "grad_norm": 11.3125, + "epoch": 0.920114122681883, + "grad_norm": 18.25, "learning_rate": 5.1559999999999994e-05, - "loss": 1.0948, + "loss": 1.327, "step": 1290 }, { - "epoch": 0.15651336383337347, - "grad_norm": 24.5, + "epoch": 0.927246790299572, + "grad_norm": 11.3125, "learning_rate": 5.196e-05, - "loss": 1.1113, + "loss": 1.3237, "step": 1300 }, { - "epoch": 0.15651336383337347, - "eval/acc": 25.581396102905273, + "epoch": 0.927246790299572, + "eval/acc": 39.53488540649414, "step": 1300 }, { - "epoch": 0.15651336383337347, - "eval_loss": 2.944797992706299, - "eval_runtime": 0.3019, - "eval_samples_per_second": 142.434, - "eval_steps_per_second": 3.312, + "epoch": 0.927246790299572, + "eval_loss": 2.733259916305542, + "eval_runtime": 0.51, + "eval_samples_per_second": 84.32, + "eval_steps_per_second": 1.961, "step": 1300 }, { - "epoch": 0.15771731278593787, - "grad_norm": 12.4375, + "epoch": 0.9343794579172611, + "grad_norm": 18.125, "learning_rate": 5.236e-05, - "loss": 0.9531, + "loss": 1.256, "step": 1310 }, { - "epoch": 0.1589212617385023, - "grad_norm": 12.3125, + "epoch": 0.9415121255349501, + "grad_norm": 10.25, "learning_rate": 5.2759999999999996e-05, - "loss": 1.0079, + "loss": 1.1386, "step": 1320 }, { - "epoch": 0.1601252106910667, - "grad_norm": 13.1875, + "epoch": 0.948644793152639, + "grad_norm": 11.1875, "learning_rate": 5.316e-05, - "loss": 1.0674, + "loss": 1.3115, "step": 1330 }, { - "epoch": 0.16132915964363112, - "grad_norm": 16.875, + "epoch": 0.9557774607703281, + "grad_norm": 10.875, "learning_rate": 5.356e-05, - "loss": 1.1194, + "loss": 1.2315, "step": 1340 }, { - "epoch": 0.16253310859619552, - "grad_norm": 10.625, + "epoch": 0.9629101283880172, + "grad_norm": 12.0, "learning_rate": 5.396e-05, - "loss": 1.0057, + "loss": 1.3327, "step": 1350 }, { - "epoch": 0.16373705754875995, - "grad_norm": 9.125, + "epoch": 0.9700427960057061, + "grad_norm": 11.75, "learning_rate": 5.436e-05, - "loss": 1.1257, + "loss": 1.4052, "step": 1360 }, { - "epoch": 0.16494100650132434, - "grad_norm": 8.5, + "epoch": 0.9771754636233951, + "grad_norm": 11.4375, "learning_rate": 5.476e-05, - "loss": 0.9545, + "loss": 1.1349, "step": 1370 }, { - "epoch": 0.16614495545388874, - "grad_norm": 10.25, + "epoch": 0.9843081312410842, + "grad_norm": 15.125, "learning_rate": 5.516e-05, - "loss": 1.0648, + "loss": 1.3803, "step": 1380 }, { - "epoch": 0.16734890440645317, - "grad_norm": 14.9375, + "epoch": 0.9914407988587732, + "grad_norm": 16.75, "learning_rate": 5.556e-05, - "loss": 1.0364, + "loss": 1.3536, "step": 1390 }, { - "epoch": 0.16855285335901757, - "grad_norm": 138.0, + "epoch": 0.9985734664764622, + "grad_norm": 10.625, "learning_rate": 5.596e-05, - "loss": 1.0255, + "loss": 1.2981, "step": 1400 }, { - "epoch": 0.16855285335901757, - "eval/acc": 27.9069766998291, + "epoch": 0.9985734664764622, + "eval/acc": 39.53488540649414, "step": 1400 }, { - "epoch": 0.16855285335901757, - "eval_loss": 2.763101100921631, - "eval_runtime": 0.2759, - "eval_samples_per_second": 155.826, - "eval_steps_per_second": 3.624, + "epoch": 0.9985734664764622, + "eval_loss": 2.597245693206787, + "eval_runtime": 0.2116, + "eval_samples_per_second": 203.214, + "eval_steps_per_second": 4.726, "step": 1400 }, { - "epoch": 0.169756802311582, - "grad_norm": 11.8125, + "epoch": 1.005706134094151, + "grad_norm": 15.0, "learning_rate": 5.636e-05, - "loss": 0.9813, + "loss": 1.2173, "step": 1410 }, { - "epoch": 0.1709607512641464, - "grad_norm": 9.1875, + "epoch": 1.0128388017118402, + "grad_norm": 15.4375, "learning_rate": 5.6760000000000005e-05, - "loss": 0.9929, + "loss": 1.1965, "step": 1420 }, { - "epoch": 0.17216470021671082, - "grad_norm": 10.875, + "epoch": 1.0199714693295292, + "grad_norm": 21.625, "learning_rate": 5.716e-05, - "loss": 0.9113, + "loss": 1.2494, "step": 1430 }, { - "epoch": 0.17336864916927522, - "grad_norm": 19.375, + "epoch": 1.0271041369472182, + "grad_norm": 13.0, "learning_rate": 5.7560000000000005e-05, - "loss": 1.0711, + "loss": 1.1948, "step": 1440 }, { - "epoch": 0.17457259812183964, - "grad_norm": 9.8125, + "epoch": 1.0342368045649073, + "grad_norm": 11.0, "learning_rate": 5.796e-05, - "loss": 0.9322, + "loss": 1.2641, "step": 1450 }, { - "epoch": 0.17577654707440404, - "grad_norm": 10.5, + "epoch": 1.0413694721825963, + "grad_norm": 13.1875, "learning_rate": 5.8360000000000004e-05, - "loss": 1.0316, + "loss": 1.2526, "step": 1460 }, { - "epoch": 0.17698049602696847, - "grad_norm": 10.25, + "epoch": 1.0485021398002854, + "grad_norm": 46.0, "learning_rate": 5.876000000000001e-05, - "loss": 1.0165, + "loss": 1.0786, "step": 1470 }, { - "epoch": 0.17818444497953287, - "grad_norm": 10.4375, + "epoch": 1.0556348074179742, + "grad_norm": 11.0, "learning_rate": 5.916e-05, - "loss": 1.0229, + "loss": 1.3154, "step": 1480 }, { - "epoch": 0.17938839393209727, - "grad_norm": 14.4375, + "epoch": 1.0627674750356633, + "grad_norm": 18.75, "learning_rate": 5.9560000000000006e-05, - "loss": 0.9684, + "loss": 1.257, "step": 1490 }, { - "epoch": 0.1805923428846617, - "grad_norm": 8.375, + "epoch": 1.0699001426533523, + "grad_norm": 11.5625, "learning_rate": 5.996e-05, - "loss": 0.9948, + "loss": 1.2636, "step": 1500 }, { - "epoch": 0.1805923428846617, - "eval/acc": 34.88372039794922, + "epoch": 1.0699001426533523, + "eval/acc": 32.55813980102539, "step": 1500 }, { - "epoch": 0.1805923428846617, - "eval_loss": 2.8177433013916016, - "eval_runtime": 0.208, - "eval_samples_per_second": 206.732, - "eval_steps_per_second": 4.808, + "epoch": 1.0699001426533523, + "eval_loss": 2.9036648273468018, + "eval_runtime": 0.4893, + "eval_samples_per_second": 87.889, + "eval_steps_per_second": 2.044, "step": 1500 }, { - "epoch": 0.1817962918372261, - "grad_norm": 19.25, + "epoch": 1.0770328102710414, + "grad_norm": 13.75, "learning_rate": 6.0360000000000005e-05, - "loss": 0.9897, + "loss": 1.2602, "step": 1510 }, { - "epoch": 0.18300024078979052, - "grad_norm": 32.5, + "epoch": 1.0841654778887304, + "grad_norm": 11.625, "learning_rate": 6.076000000000001e-05, - "loss": 0.9217, + "loss": 1.0823, "step": 1520 }, { - "epoch": 0.18420418974235492, - "grad_norm": 9.5, + "epoch": 1.0912981455064195, + "grad_norm": 9.0, "learning_rate": 6.116e-05, - "loss": 1.0494, + "loss": 1.3059, "step": 1530 }, { - "epoch": 0.18540813869491934, - "grad_norm": 9.25, + "epoch": 1.0984308131241085, + "grad_norm": 10.4375, "learning_rate": 6.156e-05, - "loss": 0.9359, + "loss": 1.2006, "step": 1540 }, { - "epoch": 0.18661208764748374, - "grad_norm": 11.375, + "epoch": 1.1055634807417973, + "grad_norm": 15.75, "learning_rate": 6.196000000000001e-05, - "loss": 0.9112, + "loss": 1.3731, "step": 1550 }, { - "epoch": 0.18781603660004817, - "grad_norm": 12.6875, + "epoch": 1.1126961483594864, + "grad_norm": 9.5, "learning_rate": 6.236e-05, - "loss": 1.07, + "loss": 1.1925, "step": 1560 }, { - "epoch": 0.18901998555261257, - "grad_norm": 11.1875, + "epoch": 1.1198288159771754, + "grad_norm": 9.3125, "learning_rate": 6.276e-05, - "loss": 0.9853, + "loss": 1.1554, "step": 1570 }, { - "epoch": 0.19022393450517697, - "grad_norm": 8.375, + "epoch": 1.1269614835948645, + "grad_norm": 12.0625, "learning_rate": 6.316000000000001e-05, - "loss": 0.9579, + "loss": 1.0875, "step": 1580 }, { - "epoch": 0.1914278834577414, - "grad_norm": 20.875, + "epoch": 1.1340941512125535, + "grad_norm": 10.875, "learning_rate": 6.356000000000001e-05, - "loss": 0.9401, + "loss": 1.1895, "step": 1590 }, { - "epoch": 0.1926318324103058, - "grad_norm": 8.9375, + "epoch": 1.1412268188302426, + "grad_norm": 12.0625, "learning_rate": 6.396e-05, - "loss": 1.0279, + "loss": 1.2354, "step": 1600 }, { - "epoch": 0.1926318324103058, - "eval/acc": 30.23255729675293, + "epoch": 1.1412268188302426, + "eval/acc": 34.88372039794922, "step": 1600 }, { - "epoch": 0.1926318324103058, - "eval_loss": 2.8526248931884766, - "eval_runtime": 0.3114, - "eval_samples_per_second": 138.103, - "eval_steps_per_second": 3.212, + "epoch": 1.1412268188302426, + "eval_loss": 2.9267771244049072, + "eval_runtime": 0.2334, + "eval_samples_per_second": 184.258, + "eval_steps_per_second": 4.285, "step": 1600 }, { - "epoch": 0.19383578136287022, - "grad_norm": 7.78125, + "epoch": 1.1483594864479316, + "grad_norm": 12.375, "learning_rate": 6.436e-05, - "loss": 0.8743, + "loss": 1.2167, "step": 1610 }, { - "epoch": 0.19503973031543462, - "grad_norm": 9.8125, + "epoch": 1.1554921540656204, + "grad_norm": 10.375, "learning_rate": 6.476e-05, - "loss": 0.8702, + "loss": 1.1638, "step": 1620 }, { - "epoch": 0.19624367926799904, - "grad_norm": 12.4375, + "epoch": 1.1626248216833095, + "grad_norm": 9.8125, "learning_rate": 6.515999999999999e-05, - "loss": 1.0028, + "loss": 1.1666, "step": 1630 }, { - "epoch": 0.19744762822056344, - "grad_norm": 10.125, + "epoch": 1.1697574893009985, + "grad_norm": 12.6875, "learning_rate": 6.556e-05, - "loss": 0.9377, + "loss": 1.1961, "step": 1640 }, { - "epoch": 0.19865157717312787, - "grad_norm": 8.9375, + "epoch": 1.1768901569186876, + "grad_norm": 9.875, "learning_rate": 6.596e-05, - "loss": 1.031, + "loss": 1.2558, "step": 1650 }, { - "epoch": 0.19985552612569227, - "grad_norm": 8.5625, + "epoch": 1.1840228245363766, + "grad_norm": 10.375, "learning_rate": 6.636e-05, - "loss": 1.0162, + "loss": 1.1728, "step": 1660 }, { - "epoch": 0.2010594750782567, - "grad_norm": 33.75, + "epoch": 1.1911554921540657, + "grad_norm": 10.1875, "learning_rate": 6.676e-05, - "loss": 0.9448, + "loss": 1.2947, "step": 1670 }, { - "epoch": 0.2022634240308211, - "grad_norm": 9.625, + "epoch": 1.1982881597717547, + "grad_norm": 11.3125, "learning_rate": 6.716e-05, - "loss": 1.0077, + "loss": 1.2151, "step": 1680 }, { - "epoch": 0.2034673729833855, - "grad_norm": 8.6875, + "epoch": 1.2054208273894436, + "grad_norm": 10.5, "learning_rate": 6.756e-05, - "loss": 0.9654, + "loss": 1.0612, "step": 1690 }, { - "epoch": 0.20467132193594992, - "grad_norm": 12.625, + "epoch": 1.2125534950071326, + "grad_norm": 11.9375, "learning_rate": 6.796e-05, - "loss": 0.8899, + "loss": 1.1079, "step": 1700 }, { - "epoch": 0.20467132193594992, - "eval/acc": 32.55813980102539, + "epoch": 1.2125534950071326, + "eval/acc": 37.20930099487305, "step": 1700 }, { - "epoch": 0.20467132193594992, - "eval_loss": 2.7813549041748047, - "eval_runtime": 0.2132, - "eval_samples_per_second": 201.701, - "eval_steps_per_second": 4.691, + "epoch": 1.2125534950071326, + "eval_loss": 2.9951517581939697, + "eval_runtime": 0.2214, + "eval_samples_per_second": 194.192, + "eval_steps_per_second": 4.516, "step": 1700 }, { - "epoch": 0.20587527088851432, - "grad_norm": 12.0, + "epoch": 1.2196861626248217, + "grad_norm": 11.25, "learning_rate": 6.836e-05, - "loss": 1.0412, + "loss": 1.1541, "step": 1710 }, { - "epoch": 0.20707921984107874, - "grad_norm": 11.75, + "epoch": 1.2268188302425107, + "grad_norm": 8.125, "learning_rate": 6.876e-05, - "loss": 0.9239, + "loss": 1.0772, "step": 1720 }, { - "epoch": 0.20828316879364314, - "grad_norm": 11.375, + "epoch": 1.2339514978601998, + "grad_norm": 18.125, "learning_rate": 6.916000000000001e-05, - "loss": 0.9243, + "loss": 1.1623, "step": 1730 }, { - "epoch": 0.20948711774620757, - "grad_norm": 12.0, + "epoch": 1.2410841654778888, + "grad_norm": 10.125, "learning_rate": 6.956e-05, - "loss": 1.0204, + "loss": 1.182, "step": 1740 }, { - "epoch": 0.21069106669877197, - "grad_norm": 13.0625, + "epoch": 1.2482168330955776, + "grad_norm": 9.75, "learning_rate": 6.996e-05, - "loss": 0.8811, + "loss": 1.0796, "step": 1750 }, { - "epoch": 0.2118950156513364, - "grad_norm": 17.0, + "epoch": 1.2553495007132667, + "grad_norm": 10.5, "learning_rate": 7.036e-05, - "loss": 0.8755, + "loss": 1.2374, "step": 1760 }, { - "epoch": 0.2130989646039008, - "grad_norm": 11.25, + "epoch": 1.2624821683309557, + "grad_norm": 20.875, "learning_rate": 7.076000000000001e-05, - "loss": 0.858, + "loss": 1.2718, "step": 1770 }, { - "epoch": 0.21430291355646522, - "grad_norm": 9.625, + "epoch": 1.2696148359486448, + "grad_norm": 10.3125, "learning_rate": 7.116e-05, - "loss": 0.9076, + "loss": 1.0922, "step": 1780 }, { - "epoch": 0.21550686250902962, - "grad_norm": 10.4375, + "epoch": 1.2767475035663338, + "grad_norm": 8.6875, "learning_rate": 7.156e-05, - "loss": 0.8817, + "loss": 1.0637, "step": 1790 }, { - "epoch": 0.21671081146159402, - "grad_norm": 12.8125, + "epoch": 1.2838801711840229, + "grad_norm": 9.5, "learning_rate": 7.196000000000001e-05, - "loss": 0.9121, + "loss": 1.1661, "step": 1800 }, { - "epoch": 0.21671081146159402, - "eval/acc": 30.813953399658203, + "epoch": 1.2838801711840229, + "eval/acc": 39.53488540649414, "step": 1800 }, { - "epoch": 0.21671081146159402, - "eval_loss": 2.6508796215057373, - "eval_runtime": 0.2185, - "eval_samples_per_second": 196.798, - "eval_steps_per_second": 4.577, + "epoch": 1.2838801711840229, + "eval_loss": 2.763897180557251, + "eval_runtime": 0.2111, + "eval_samples_per_second": 203.712, + "eval_steps_per_second": 4.737, "step": 1800 }, { - "epoch": 0.21791476041415844, - "grad_norm": 16.5, + "epoch": 1.291012838801712, + "grad_norm": 14.3125, "learning_rate": 7.236e-05, - "loss": 0.9044, + "loss": 1.1139, "step": 1810 }, { - "epoch": 0.21911870936672284, - "grad_norm": 15.1875, + "epoch": 1.298145506419401, + "grad_norm": 41.5, "learning_rate": 7.276e-05, - "loss": 0.9552, + "loss": 1.0817, "step": 1820 }, { - "epoch": 0.22032265831928727, - "grad_norm": 11.375, + "epoch": 1.3052781740370898, + "grad_norm": 15.125, "learning_rate": 7.316000000000001e-05, - "loss": 0.9264, + "loss": 1.2462, "step": 1830 }, { - "epoch": 0.22152660727185167, - "grad_norm": 8.8125, + "epoch": 1.3124108416547788, + "grad_norm": 33.25, "learning_rate": 7.356000000000001e-05, - "loss": 0.8928, + "loss": 1.1143, "step": 1840 }, { - "epoch": 0.2227305562244161, - "grad_norm": 9.625, + "epoch": 1.3195435092724679, + "grad_norm": 13.625, "learning_rate": 7.396e-05, - "loss": 0.9515, + "loss": 1.1783, "step": 1850 }, { - "epoch": 0.2239345051769805, - "grad_norm": 31.0, + "epoch": 1.326676176890157, + "grad_norm": 18.375, "learning_rate": 7.436000000000001e-05, - "loss": 0.8989, + "loss": 1.2101, "step": 1860 }, { - "epoch": 0.22513845412954492, - "grad_norm": 9.5, + "epoch": 1.333808844507846, + "grad_norm": 13.875, "learning_rate": 7.476000000000001e-05, - "loss": 1.0206, + "loss": 1.1348, "step": 1870 }, { - "epoch": 0.22634240308210932, - "grad_norm": 8.625, + "epoch": 1.340941512125535, + "grad_norm": 13.9375, "learning_rate": 7.516e-05, - "loss": 0.8961, + "loss": 1.0747, "step": 1880 }, { - "epoch": 0.22754635203467374, - "grad_norm": 9.0, + "epoch": 1.3480741797432239, + "grad_norm": 29.75, "learning_rate": 7.556000000000002e-05, - "loss": 0.9421, + "loss": 1.1895, "step": 1890 }, { - "epoch": 0.22875030098723814, - "grad_norm": 12.0625, + "epoch": 1.355206847360913, + "grad_norm": 17.25, "learning_rate": 7.596000000000001e-05, - "loss": 0.9049, + "loss": 1.2512, "step": 1900 }, { - "epoch": 0.22875030098723814, - "eval/acc": 36.046512603759766, + "epoch": 1.355206847360913, + "eval/acc": 39.53488540649414, "step": 1900 }, { - "epoch": 0.22875030098723814, - "eval_loss": 2.636018753051758, - "eval_runtime": 0.2084, - "eval_samples_per_second": 206.343, - "eval_steps_per_second": 4.799, + "epoch": 1.355206847360913, + "eval_loss": 2.9442026615142822, + "eval_runtime": 0.2199, + "eval_samples_per_second": 195.51, + "eval_steps_per_second": 4.547, "step": 1900 }, { - "epoch": 0.22995424993980254, - "grad_norm": 8.0625, + "epoch": 1.362339514978602, + "grad_norm": 21.125, "learning_rate": 7.636e-05, - "loss": 0.8983, + "loss": 1.1306, "step": 1910 }, { - "epoch": 0.23115819889236697, - "grad_norm": 11.875, + "epoch": 1.369472182596291, + "grad_norm": 9.0625, "learning_rate": 7.676e-05, - "loss": 0.9293, + "loss": 1.1139, "step": 1920 }, { - "epoch": 0.23236214784493137, - "grad_norm": 11.75, + "epoch": 1.37660485021398, + "grad_norm": 30.25, "learning_rate": 7.716e-05, - "loss": 0.8602, + "loss": 1.1595, "step": 1930 }, { - "epoch": 0.2335660967974958, - "grad_norm": 11.5625, + "epoch": 1.383737517831669, + "grad_norm": 13.6875, "learning_rate": 7.756e-05, - "loss": 0.8078, + "loss": 1.2437, "step": 1940 }, { - "epoch": 0.2347700457500602, - "grad_norm": 9.125, + "epoch": 1.3908701854493581, + "grad_norm": 12.3125, "learning_rate": 7.796e-05, - "loss": 0.8773, + "loss": 1.1005, "step": 1950 }, { - "epoch": 0.23597399470262462, - "grad_norm": 10.6875, + "epoch": 1.3980028530670472, + "grad_norm": 9.8125, "learning_rate": 7.836e-05, - "loss": 0.8464, + "loss": 1.0748, "step": 1960 }, { - "epoch": 0.23717794365518902, - "grad_norm": 18.25, + "epoch": 1.405135520684736, + "grad_norm": 9.125, "learning_rate": 7.876e-05, - "loss": 0.8779, + "loss": 1.1576, "step": 1970 }, { - "epoch": 0.23838189260775344, - "grad_norm": 10.875, + "epoch": 1.412268188302425, + "grad_norm": 11.375, "learning_rate": 7.916e-05, - "loss": 0.9351, + "loss": 1.0982, "step": 1980 }, { - "epoch": 0.23958584156031784, - "grad_norm": 11.0, + "epoch": 1.4194008559201141, + "grad_norm": 10.375, "learning_rate": 7.956e-05, - "loss": 0.8581, + "loss": 1.132, "step": 1990 }, { - "epoch": 0.24078979051288224, - "grad_norm": 8.875, + "epoch": 1.4265335235378032, + "grad_norm": 16.375, "learning_rate": 7.996e-05, - "loss": 0.9799, + "loss": 1.121, "step": 2000 }, { - "epoch": 0.24078979051288224, - "eval/acc": 36.046512603759766, + "epoch": 1.4265335235378032, + "eval/acc": 39.53488540649414, "step": 2000 }, { - "epoch": 0.24078979051288224, - "eval_loss": 2.716654062271118, - "eval_runtime": 0.21, - "eval_samples_per_second": 204.721, - "eval_steps_per_second": 4.761, + "epoch": 1.4265335235378032, + "eval_loss": 2.900298595428467, + "eval_runtime": 0.2112, + "eval_samples_per_second": 203.622, + "eval_steps_per_second": 4.735, "step": 2000 }, { - "epoch": 0.24199373946544667, - "grad_norm": 11.0625, + "epoch": 1.4336661911554922, + "grad_norm": 9.125, "learning_rate": 8.036e-05, - "loss": 0.8678, + "loss": 1.2079, "step": 2010 }, { - "epoch": 0.24319768841801107, + "epoch": 1.440798858773181, "grad_norm": 12.125, "learning_rate": 8.076e-05, - "loss": 0.8832, + "loss": 1.1098, "step": 2020 }, { - "epoch": 0.2444016373705755, - "grad_norm": 8.25, + "epoch": 1.44793152639087, + "grad_norm": 8.8125, "learning_rate": 8.116e-05, - "loss": 0.8689, + "loss": 0.9849, "step": 2030 }, { - "epoch": 0.2456055863231399, - "grad_norm": 6.53125, + "epoch": 1.4550641940085591, + "grad_norm": 9.0, "learning_rate": 8.156e-05, - "loss": 0.8829, + "loss": 1.0905, "step": 2040 }, { - "epoch": 0.24680953527570432, - "grad_norm": 9.5625, + "epoch": 1.4621968616262482, + "grad_norm": 15.4375, "learning_rate": 8.196000000000001e-05, - "loss": 0.9181, + "loss": 1.2211, "step": 2050 }, { - "epoch": 0.24801348422826872, - "grad_norm": 22.875, + "epoch": 1.4693295292439372, + "grad_norm": 9.4375, "learning_rate": 8.236e-05, - "loss": 0.8011, + "loss": 1.0968, "step": 2060 }, { - "epoch": 0.24921743318083314, - "grad_norm": 14.4375, + "epoch": 1.4764621968616263, + "grad_norm": 9.0, "learning_rate": 8.276e-05, - "loss": 0.9163, + "loss": 1.0973, "step": 2070 }, { - "epoch": 0.25042138213339754, - "grad_norm": 10.625, + "epoch": 1.4835948644793153, + "grad_norm": 8.0625, "learning_rate": 8.316000000000001e-05, - "loss": 0.7869, + "loss": 1.1012, "step": 2080 }, { - "epoch": 0.25162533108596197, - "grad_norm": 11.0, + "epoch": 1.4907275320970044, + "grad_norm": 31.0, "learning_rate": 8.356e-05, - "loss": 0.8779, + "loss": 1.0437, "step": 2090 }, { - "epoch": 0.2528292800385264, - "grad_norm": 12.625, + "epoch": 1.4978601997146934, + "grad_norm": 10.8125, "learning_rate": 8.396e-05, - "loss": 0.889, + "loss": 1.0934, "step": 2100 }, { - "epoch": 0.2528292800385264, - "eval/acc": 37.20930099487305, + "epoch": 1.4978601997146934, + "eval/acc": 41.86046600341797, "step": 2100 }, { - "epoch": 0.2528292800385264, - "eval_loss": 2.626293182373047, - "eval_runtime": 0.2735, - "eval_samples_per_second": 157.235, - "eval_steps_per_second": 3.657, + "epoch": 1.4978601997146934, + "eval_loss": 2.9042038917541504, + "eval_runtime": 0.2146, + "eval_samples_per_second": 200.363, + "eval_steps_per_second": 4.66, "step": 2100 }, { - "epoch": 0.25403322899109076, - "grad_norm": 8.3125, + "epoch": 1.5049928673323825, + "grad_norm": 15.3125, "learning_rate": 8.436000000000001e-05, - "loss": 0.8363, + "loss": 1.0862, "step": 2110 }, { - "epoch": 0.2552371779436552, - "grad_norm": 8.625, + "epoch": 1.5121255349500713, + "grad_norm": 10.1875, "learning_rate": 8.476000000000001e-05, - "loss": 0.8762, + "loss": 1.0786, "step": 2120 }, { - "epoch": 0.2564411268962196, - "grad_norm": 7.4375, + "epoch": 1.5192582025677603, + "grad_norm": 8.25, "learning_rate": 8.516e-05, - "loss": 0.7925, + "loss": 1.1496, "step": 2130 }, { - "epoch": 0.257645075848784, - "grad_norm": 9.1875, + "epoch": 1.5263908701854494, + "grad_norm": 12.6875, "learning_rate": 8.556e-05, - "loss": 0.9575, + "loss": 1.1132, "step": 2140 }, { - "epoch": 0.2588490248013484, - "grad_norm": 9.8125, + "epoch": 1.5335235378031382, + "grad_norm": 21.375, "learning_rate": 8.596000000000001e-05, - "loss": 0.7551, + "loss": 1.1043, "step": 2150 }, { - "epoch": 0.26005297375391284, - "grad_norm": 7.15625, + "epoch": 1.5406562054208273, + "grad_norm": 8.5625, "learning_rate": 8.636e-05, - "loss": 0.808, + "loss": 1.2549, "step": 2160 }, { - "epoch": 0.26125692270647727, - "grad_norm": 8.3125, + "epoch": 1.5477888730385163, + "grad_norm": 8.6875, "learning_rate": 8.676e-05, - "loss": 0.9449, + "loss": 1.115, "step": 2170 }, { - "epoch": 0.26246087165904164, - "grad_norm": 11.5, + "epoch": 1.5549215406562054, + "grad_norm": 8.375, "learning_rate": 8.716000000000001e-05, - "loss": 0.8712, + "loss": 1.1963, "step": 2180 }, { - "epoch": 0.26366482061160607, - "grad_norm": 8.0, + "epoch": 1.5620542082738944, + "grad_norm": 8.3125, "learning_rate": 8.756000000000001e-05, - "loss": 0.9389, + "loss": 1.1697, "step": 2190 }, { - "epoch": 0.2648687695641705, - "grad_norm": 13.5, + "epoch": 1.5691868758915835, + "grad_norm": 7.40625, "learning_rate": 8.796e-05, - "loss": 0.7875, + "loss": 0.9716, "step": 2200 }, { - "epoch": 0.2648687695641705, - "eval/acc": 35.46511459350586, + "epoch": 1.5691868758915835, + "eval/acc": 39.53488540649414, "step": 2200 }, { - "epoch": 0.2648687695641705, - "eval_loss": 2.5862526893615723, - "eval_runtime": 0.2151, - "eval_samples_per_second": 199.927, - "eval_steps_per_second": 4.649, + "epoch": 1.5691868758915835, + "eval_loss": 3.021289587020874, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.958, + "eval_steps_per_second": 4.72, "step": 2200 }, { - "epoch": 0.26607271851673486, - "grad_norm": 11.5625, + "epoch": 1.5763195435092725, + "grad_norm": 10.0, "learning_rate": 8.836000000000001e-05, - "loss": 0.9947, + "loss": 1.0254, "step": 2210 }, { - "epoch": 0.2672766674692993, - "grad_norm": 8.25, + "epoch": 1.5834522111269616, + "grad_norm": 12.625, "learning_rate": 8.876e-05, - "loss": 0.717, + "loss": 1.1672, "step": 2220 }, { - "epoch": 0.2684806164218637, - "grad_norm": 26.25, + "epoch": 1.5905848787446506, + "grad_norm": 11.5, "learning_rate": 8.916e-05, - "loss": 0.8688, + "loss": 1.0656, "step": 2230 }, { - "epoch": 0.26968456537442814, - "grad_norm": 11.5, + "epoch": 1.5977175463623396, + "grad_norm": 8.8125, "learning_rate": 8.956e-05, - "loss": 0.9134, + "loss": 1.035, "step": 2240 }, { - "epoch": 0.2708885143269925, - "grad_norm": 6.875, + "epoch": 1.6048502139800287, + "grad_norm": 9.25, "learning_rate": 8.996e-05, - "loss": 0.8592, + "loss": 1.0972, "step": 2250 }, { - "epoch": 0.27209246327955694, - "grad_norm": 7.21875, + "epoch": 1.6119828815977175, + "grad_norm": 7.71875, "learning_rate": 9.036e-05, - "loss": 0.6548, + "loss": 1.0148, "step": 2260 }, { - "epoch": 0.27329641223212137, - "grad_norm": 12.25, + "epoch": 1.6191155492154066, + "grad_norm": 13.5, "learning_rate": 9.076e-05, - "loss": 0.8613, + "loss": 1.1202, "step": 2270 }, { - "epoch": 0.2745003611846858, - "grad_norm": 8.875, + "epoch": 1.6262482168330956, + "grad_norm": 9.125, "learning_rate": 9.116e-05, - "loss": 0.7455, + "loss": 1.1134, "step": 2280 }, { - "epoch": 0.27570431013725016, - "grad_norm": 12.5625, + "epoch": 1.6333808844507844, + "grad_norm": 15.25, "learning_rate": 9.156e-05, - "loss": 0.8458, + "loss": 1.0373, "step": 2290 }, { - "epoch": 0.2769082590898146, - "grad_norm": 8.8125, + "epoch": 1.6405135520684735, + "grad_norm": 9.125, "learning_rate": 9.196000000000001e-05, - "loss": 0.8003, + "loss": 1.0654, "step": 2300 }, { - "epoch": 0.2769082590898146, - "eval/acc": 32.55813980102539, + "epoch": 1.6405135520684735, + "eval/acc": 37.20930099487305, "step": 2300 }, { - "epoch": 0.2769082590898146, - "eval_loss": 2.6594340801239014, - "eval_runtime": 0.2129, - "eval_samples_per_second": 201.965, - "eval_steps_per_second": 4.697, + "epoch": 1.6405135520684735, + "eval_loss": 2.929560422897339, + "eval_runtime": 0.2167, + "eval_samples_per_second": 198.467, + "eval_steps_per_second": 4.616, "step": 2300 }, { - "epoch": 0.278112208042379, - "grad_norm": 10.6875, + "epoch": 1.6476462196861625, + "grad_norm": 8.25, "learning_rate": 9.236e-05, - "loss": 0.812, + "loss": 1.0218, "step": 2310 }, { - "epoch": 0.2793161569949434, - "grad_norm": 12.1875, + "epoch": 1.6547788873038516, + "grad_norm": 9.5625, "learning_rate": 9.276e-05, - "loss": 0.781, + "loss": 1.106, "step": 2320 }, { - "epoch": 0.2805201059475078, - "grad_norm": 8.125, + "epoch": 1.6619115549215406, + "grad_norm": 8.25, "learning_rate": 9.316000000000001e-05, - "loss": 0.9682, + "loss": 1.0558, "step": 2330 }, { - "epoch": 0.28172405490007224, - "grad_norm": 8.8125, + "epoch": 1.6690442225392297, + "grad_norm": 8.5625, "learning_rate": 9.356e-05, - "loss": 0.7531, + "loss": 0.9931, "step": 2340 }, { - "epoch": 0.28292800385263667, - "grad_norm": 7.375, + "epoch": 1.6761768901569187, + "grad_norm": 11.5625, "learning_rate": 9.396e-05, - "loss": 0.7235, + "loss": 1.0683, "step": 2350 }, { - "epoch": 0.28413195280520104, - "grad_norm": 7.8125, + "epoch": 1.6833095577746078, + "grad_norm": 10.0625, "learning_rate": 9.436e-05, - "loss": 0.9204, + "loss": 1.0631, "step": 2360 }, { - "epoch": 0.28533590175776546, - "grad_norm": 6.65625, + "epoch": 1.6904422253922968, + "grad_norm": 9.5625, "learning_rate": 9.476000000000001e-05, - "loss": 0.7636, + "loss": 1.049, "step": 2370 }, { - "epoch": 0.2865398507103299, - "grad_norm": 9.625, + "epoch": 1.6975748930099859, + "grad_norm": 12.8125, "learning_rate": 9.516e-05, - "loss": 0.855, + "loss": 1.0259, "step": 2380 }, { - "epoch": 0.2877437996628943, - "grad_norm": 9.6875, + "epoch": 1.7047075606276747, + "grad_norm": 9.0625, "learning_rate": 9.556e-05, - "loss": 0.8643, + "loss": 1.0085, "step": 2390 }, { - "epoch": 0.2889477486154587, - "grad_norm": 7.1875, + "epoch": 1.7118402282453637, + "grad_norm": 131.0, "learning_rate": 9.596000000000001e-05, - "loss": 0.8258, + "loss": 0.944, "step": 2400 }, { - "epoch": 0.2889477486154587, - "eval/acc": 36.627906799316406, + "epoch": 1.7118402282453637, + "eval/acc": 37.20930099487305, "step": 2400 }, { - "epoch": 0.2889477486154587, - "eval_loss": 2.7174084186553955, - "eval_runtime": 0.2111, - "eval_samples_per_second": 203.672, - "eval_steps_per_second": 4.737, + "epoch": 1.7118402282453637, + "eval_loss": 3.0231707096099854, + "eval_runtime": 0.2075, + "eval_samples_per_second": 207.206, + "eval_steps_per_second": 4.819, "step": 2400 }, { - "epoch": 0.2901516975680231, - "grad_norm": 7.65625, + "epoch": 1.7189728958630528, + "grad_norm": 8.375, "learning_rate": 9.636e-05, - "loss": 0.8752, + "loss": 1.0069, "step": 2410 }, { - "epoch": 0.29135564652058754, - "grad_norm": 8.75, + "epoch": 1.7261055634807418, + "grad_norm": 8.3125, "learning_rate": 9.676e-05, - "loss": 0.8082, + "loss": 1.0648, "step": 2420 }, { - "epoch": 0.2925595954731519, - "grad_norm": 10.4375, + "epoch": 1.7332382310984307, + "grad_norm": 11.0625, "learning_rate": 9.716000000000001e-05, - "loss": 0.7538, + "loss": 1.0594, "step": 2430 }, { - "epoch": 0.29376354442571634, - "grad_norm": 6.4375, + "epoch": 1.7403708987161197, + "grad_norm": 8.75, "learning_rate": 9.756000000000001e-05, - "loss": 0.7766, + "loss": 1.2082, "step": 2440 }, { - "epoch": 0.29496749337828077, - "grad_norm": 7.96875, + "epoch": 1.7475035663338088, + "grad_norm": 9.875, "learning_rate": 9.796e-05, - "loss": 0.844, + "loss": 1.0225, "step": 2450 }, { - "epoch": 0.2961714423308452, - "grad_norm": 7.75, + "epoch": 1.7546362339514978, + "grad_norm": 9.5625, "learning_rate": 9.836000000000001e-05, - "loss": 0.7127, + "loss": 0.9975, "step": 2460 }, { - "epoch": 0.29737539128340956, - "grad_norm": 11.5, + "epoch": 1.7617689015691869, + "grad_norm": 21.0, "learning_rate": 9.876000000000001e-05, - "loss": 0.8363, + "loss": 0.9533, "step": 2470 }, { - "epoch": 0.298579340235974, - "grad_norm": 6.4375, + "epoch": 1.768901569186876, + "grad_norm": 7.65625, "learning_rate": 9.916e-05, - "loss": 0.7429, + "loss": 0.9619, "step": 2480 }, { - "epoch": 0.2997832891885384, - "grad_norm": 11.5, + "epoch": 1.776034236804565, + "grad_norm": 13.625, "learning_rate": 9.956e-05, - "loss": 0.736, + "loss": 0.9425, "step": 2490 }, { - "epoch": 0.30098723814110284, - "grad_norm": 9.25, + "epoch": 1.783166904422254, + "grad_norm": 12.375, "learning_rate": 9.996000000000001e-05, - "loss": 0.8365, + "loss": 0.9893, "step": 2500 }, { - "epoch": 0.30098723814110284, + "epoch": 1.783166904422254, "eval/acc": 39.53488540649414, "step": 2500 }, { - "epoch": 0.30098723814110284, - "eval_loss": 2.713433027267456, - "eval_runtime": 0.2088, - "eval_samples_per_second": 205.919, - "eval_steps_per_second": 4.789, + "epoch": 1.783166904422254, + "eval_loss": 2.8728344440460205, + "eval_runtime": 0.2215, + "eval_samples_per_second": 194.132, + "eval_steps_per_second": 4.515, "step": 2500 }, { - "epoch": 0.3021911870936672, - "grad_norm": 7.03125, + "epoch": 1.790299572039943, + "grad_norm": 10.0, "learning_rate": 9.996000000000001e-05, - "loss": 0.7664, + "loss": 1.0137, "step": 2510 }, { - "epoch": 0.30339513604623164, - "grad_norm": 7.75, + "epoch": 1.797432239657632, + "grad_norm": 10.125, "learning_rate": 9.991555555555556e-05, - "loss": 0.9128, + "loss": 1.059, "step": 2520 }, { - "epoch": 0.30459908499879607, - "grad_norm": 9.0, + "epoch": 1.804564907275321, + "grad_norm": 32.0, "learning_rate": 9.987111111111111e-05, - "loss": 0.8045, + "loss": 1.0498, "step": 2530 }, { - "epoch": 0.30580303395136044, - "grad_norm": 8.9375, + "epoch": 1.81169757489301, + "grad_norm": 10.125, "learning_rate": 9.982666666666667e-05, - "loss": 0.8292, + "loss": 1.1431, "step": 2540 }, { - "epoch": 0.30700698290392486, - "grad_norm": 7.40625, + "epoch": 1.818830242510699, + "grad_norm": 7.90625, "learning_rate": 9.978222222222223e-05, - "loss": 0.7557, + "loss": 1.0715, "step": 2550 }, { - "epoch": 0.3082109318564893, - "grad_norm": 7.625, + "epoch": 1.825962910128388, + "grad_norm": 10.9375, "learning_rate": 9.973777777777778e-05, - "loss": 0.683, + "loss": 1.0446, "step": 2560 }, { - "epoch": 0.3094148808090537, - "grad_norm": 8.1875, + "epoch": 1.833095577746077, + "grad_norm": 13.0, "learning_rate": 9.969333333333334e-05, - "loss": 0.8052, + "loss": 1.0291, "step": 2570 }, { - "epoch": 0.3106188297616181, - "grad_norm": 8.4375, + "epoch": 1.840228245363766, + "grad_norm": 9.75, "learning_rate": 9.964888888888889e-05, - "loss": 0.7819, + "loss": 0.9713, "step": 2580 }, { - "epoch": 0.3118227787141825, - "grad_norm": 10.8125, + "epoch": 1.847360912981455, + "grad_norm": 10.5625, "learning_rate": 9.960444444444444e-05, - "loss": 0.8452, + "loss": 1.2157, "step": 2590 }, { - "epoch": 0.31302672766674694, - "grad_norm": 6.21875, + "epoch": 1.854493580599144, + "grad_norm": 9.3125, "learning_rate": 9.956e-05, - "loss": 0.7478, + "loss": 1.0455, "step": 2600 }, { - "epoch": 0.31302672766674694, - "eval/acc": 34.88372039794922, + "epoch": 1.854493580599144, + "eval/acc": 37.20930099487305, "step": 2600 }, { - "epoch": 0.31302672766674694, - "eval_loss": 2.6625020503997803, - "eval_runtime": 0.2061, - "eval_samples_per_second": 208.644, - "eval_steps_per_second": 4.852, + "epoch": 1.854493580599144, + "eval_loss": 2.951470375061035, + "eval_runtime": 0.2899, + "eval_samples_per_second": 148.338, + "eval_steps_per_second": 3.45, "step": 2600 }, { - "epoch": 0.31423067661931137, - "grad_norm": 7.375, + "epoch": 1.861626248216833, + "grad_norm": 10.5, "learning_rate": 9.951555555555556e-05, - "loss": 0.7623, + "loss": 1.0604, "step": 2610 }, { - "epoch": 0.31543462557187574, - "grad_norm": 9.0, + "epoch": 1.8687589158345221, + "grad_norm": 9.375, "learning_rate": 9.947111111111111e-05, - "loss": 0.8223, + "loss": 0.8715, "step": 2620 }, { - "epoch": 0.31663857452444016, - "grad_norm": 6.75, + "epoch": 1.8758915834522112, + "grad_norm": 10.4375, "learning_rate": 9.942666666666667e-05, - "loss": 0.7797, + "loss": 1.0034, "step": 2630 }, { - "epoch": 0.3178425234770046, - "grad_norm": 9.125, + "epoch": 1.8830242510699002, + "grad_norm": 8.0625, "learning_rate": 9.938222222222224e-05, - "loss": 0.6746, + "loss": 1.0557, "step": 2640 }, { - "epoch": 0.31904647242956896, - "grad_norm": 8.5, + "epoch": 1.8901569186875893, + "grad_norm": 7.21875, "learning_rate": 9.933777777777779e-05, - "loss": 0.8434, + "loss": 0.974, "step": 2650 }, { - "epoch": 0.3202504213821334, - "grad_norm": 10.3125, + "epoch": 1.8972895863052783, + "grad_norm": 10.875, "learning_rate": 9.929333333333333e-05, - "loss": 0.8625, + "loss": 1.1366, "step": 2660 }, { - "epoch": 0.3214543703346978, - "grad_norm": 8.125, + "epoch": 1.9044222539229672, + "grad_norm": 28.75, "learning_rate": 9.92488888888889e-05, - "loss": 0.8003, + "loss": 1.0135, "step": 2670 }, { - "epoch": 0.32265831928726224, - "grad_norm": 8.5625, + "epoch": 1.9115549215406562, + "grad_norm": 10.5625, "learning_rate": 9.920444444444444e-05, - "loss": 0.8145, + "loss": 1.0263, "step": 2680 }, { - "epoch": 0.3238622682398266, - "grad_norm": 8.0, + "epoch": 1.9186875891583453, + "grad_norm": 6.65625, "learning_rate": 9.916e-05, - "loss": 0.6519, + "loss": 0.9952, "step": 2690 }, { - "epoch": 0.32506621719239104, - "grad_norm": 8.5625, + "epoch": 1.925820256776034, + "grad_norm": 8.8125, "learning_rate": 9.911555555555557e-05, - "loss": 0.7627, + "loss": 1.0438, "step": 2700 }, { - "epoch": 0.32506621719239104, - "eval/acc": 38.953487396240234, + "epoch": 1.925820256776034, + "eval/acc": 39.53488540649414, "step": 2700 }, { - "epoch": 0.32506621719239104, - "eval_loss": 2.629239082336426, - "eval_runtime": 0.2162, - "eval_samples_per_second": 198.931, - "eval_steps_per_second": 4.626, + "epoch": 1.925820256776034, + "eval_loss": 2.8668925762176514, + "eval_runtime": 0.2154, + "eval_samples_per_second": 199.649, + "eval_steps_per_second": 4.643, "step": 2700 }, { - "epoch": 0.32627016614495546, - "grad_norm": 7.625, + "epoch": 1.9329529243937231, + "grad_norm": 7.1875, "learning_rate": 9.907111111111112e-05, - "loss": 0.7265, + "loss": 0.9522, "step": 2710 }, { - "epoch": 0.3274741150975199, - "grad_norm": 7.15625, + "epoch": 1.9400855920114122, + "grad_norm": 8.6875, "learning_rate": 9.902666666666666e-05, - "loss": 0.7468, + "loss": 0.9729, "step": 2720 }, { - "epoch": 0.32867806405008426, - "grad_norm": 8.5, + "epoch": 1.9472182596291012, + "grad_norm": 10.0625, "learning_rate": 9.898222222222223e-05, - "loss": 0.7816, + "loss": 1.0528, "step": 2730 }, { - "epoch": 0.3298820130026487, - "grad_norm": 6.8125, + "epoch": 1.9543509272467903, + "grad_norm": 8.8125, "learning_rate": 9.893777777777779e-05, - "loss": 0.7828, + "loss": 1.1212, "step": 2740 }, { - "epoch": 0.3310859619552131, - "grad_norm": 8.5625, + "epoch": 1.9614835948644793, + "grad_norm": 9.1875, "learning_rate": 9.889333333333334e-05, - "loss": 0.8273, + "loss": 0.9866, "step": 2750 }, { - "epoch": 0.3322899109077775, - "grad_norm": 7.28125, + "epoch": 1.9686162624821684, + "grad_norm": 8.25, "learning_rate": 9.884888888888889e-05, - "loss": 0.6265, + "loss": 0.8616, "step": 2760 }, { - "epoch": 0.3334938598603419, - "grad_norm": 7.78125, + "epoch": 1.9757489300998574, + "grad_norm": 8.5625, "learning_rate": 9.880444444444445e-05, - "loss": 0.8716, + "loss": 0.9972, "step": 2770 }, { - "epoch": 0.33469780881290634, - "grad_norm": 6.0, + "epoch": 1.9828815977175465, + "grad_norm": 10.0625, "learning_rate": 9.876000000000001e-05, - "loss": 0.7587, + "loss": 0.9781, "step": 2780 }, { - "epoch": 0.33590175776547077, - "grad_norm": 11.8125, + "epoch": 1.9900142653352355, + "grad_norm": 10.75, "learning_rate": 9.871555555555556e-05, - "loss": 0.836, + "loss": 1.0579, "step": 2790 }, { - "epoch": 0.33710570671803514, - "grad_norm": 8.3125, + "epoch": 1.9971469329529246, + "grad_norm": 8.25, "learning_rate": 9.867111111111112e-05, - "loss": 0.7196, + "loss": 1.0323, "step": 2800 }, { - "epoch": 0.33710570671803514, - "eval/acc": 34.88372039794922, + "epoch": 1.9971469329529246, + "eval/acc": 37.20930099487305, "step": 2800 }, { - "epoch": 0.33710570671803514, - "eval_loss": 2.5979089736938477, - "eval_runtime": 0.212, - "eval_samples_per_second": 202.843, - "eval_steps_per_second": 4.717, + "epoch": 1.9971469329529246, + "eval_loss": 2.9081883430480957, + "eval_runtime": 0.2058, + "eval_samples_per_second": 208.905, + "eval_steps_per_second": 4.858, "step": 2800 }, { - "epoch": 0.33830965567059956, - "grad_norm": 8.125, + "epoch": 2.0042796005706136, + "grad_norm": 10.25, "learning_rate": 9.862666666666667e-05, - "loss": 0.7128, + "loss": 1.0597, "step": 2810 }, { - "epoch": 0.339513604623164, - "grad_norm": 7.0, + "epoch": 2.011412268188302, + "grad_norm": 7.0625, "learning_rate": 9.858222222222223e-05, - "loss": 0.8709, + "loss": 0.9582, "step": 2820 }, { - "epoch": 0.3407175535757284, - "grad_norm": 10.875, + "epoch": 2.0185449358059913, + "grad_norm": 7.0625, "learning_rate": 9.853777777777778e-05, - "loss": 0.6885, + "loss": 1.0058, "step": 2830 }, { - "epoch": 0.3419215025282928, - "grad_norm": 6.625, + "epoch": 2.0256776034236803, + "grad_norm": 7.09375, "learning_rate": 9.849333333333334e-05, - "loss": 0.8262, + "loss": 1.009, "step": 2840 }, { - "epoch": 0.3431254514808572, - "grad_norm": 9.0625, + "epoch": 2.0328102710413694, + "grad_norm": 8.9375, "learning_rate": 9.844888888888889e-05, - "loss": 0.6365, + "loss": 0.93, "step": 2850 }, { - "epoch": 0.34432940043342164, - "grad_norm": 7.96875, + "epoch": 2.0399429386590584, + "grad_norm": 8.1875, "learning_rate": 9.840444444444445e-05, - "loss": 0.8177, + "loss": 1.0953, "step": 2860 }, { - "epoch": 0.345533349385986, - "grad_norm": 6.71875, + "epoch": 2.0470756062767475, + "grad_norm": 7.78125, "learning_rate": 9.836000000000001e-05, - "loss": 0.7043, + "loss": 1.0437, "step": 2870 }, { - "epoch": 0.34673729833855044, - "grad_norm": 10.4375, + "epoch": 2.0542082738944365, + "grad_norm": 8.75, "learning_rate": 9.831555555555556e-05, - "loss": 0.7503, + "loss": 0.9873, "step": 2880 }, { - "epoch": 0.34794124729111486, - "grad_norm": 7.375, + "epoch": 2.0613409415121255, + "grad_norm": 8.375, "learning_rate": 9.827111111111111e-05, - "loss": 0.7532, + "loss": 0.9414, "step": 2890 }, { - "epoch": 0.3491451962436793, - "grad_norm": 7.65625, + "epoch": 2.0684736091298146, + "grad_norm": 9.0, "learning_rate": 9.822666666666667e-05, - "loss": 0.6942, + "loss": 0.9625, "step": 2900 }, { - "epoch": 0.3491451962436793, - "eval/acc": 37.79069900512695, + "epoch": 2.0684736091298146, + "eval/acc": 51.16279220581055, "step": 2900 }, { - "epoch": 0.3491451962436793, - "eval_loss": 2.698911190032959, - "eval_runtime": 1.2554, - "eval_samples_per_second": 34.253, - "eval_steps_per_second": 0.797, + "epoch": 2.0684736091298146, + "eval_loss": 1.884637713432312, + "eval_runtime": 1.4017, + "eval_samples_per_second": 30.678, + "eval_steps_per_second": 0.713, "step": 2900 }, { - "epoch": 0.35034914519624366, - "grad_norm": 7.1875, + "epoch": 2.0756062767475036, + "grad_norm": 9.5625, "learning_rate": 9.818222222222223e-05, - "loss": 0.7651, + "loss": 1.0246, "step": 2910 }, { - "epoch": 0.3515530941488081, - "grad_norm": 6.0, + "epoch": 2.0827389443651927, + "grad_norm": 8.125, "learning_rate": 9.813777777777778e-05, - "loss": 0.7786, + "loss": 0.9646, "step": 2920 }, { - "epoch": 0.3527570431013725, - "grad_norm": 9.375, + "epoch": 2.0898716119828817, + "grad_norm": 8.3125, "learning_rate": 9.809333333333333e-05, - "loss": 0.8285, + "loss": 1.0022, "step": 2930 }, { - "epoch": 0.35396099205393694, - "grad_norm": 6.4375, + "epoch": 2.097004279600571, + "grad_norm": 8.625, "learning_rate": 9.80488888888889e-05, - "loss": 0.7339, + "loss": 0.9834, "step": 2940 }, { - "epoch": 0.3551649410065013, - "grad_norm": 8.8125, + "epoch": 2.10413694721826, + "grad_norm": 45.25, "learning_rate": 9.800444444444446e-05, - "loss": 0.6948, + "loss": 0.9159, "step": 2950 }, { - "epoch": 0.35636888995906574, - "grad_norm": 11.4375, + "epoch": 2.1112696148359484, + "grad_norm": 9.375, "learning_rate": 9.796e-05, - "loss": 0.8455, + "loss": 1.0598, "step": 2960 }, { - "epoch": 0.35757283891163016, - "grad_norm": 8.5625, + "epoch": 2.1184022824536375, + "grad_norm": 6.90625, "learning_rate": 9.791555555555557e-05, - "loss": 0.791, + "loss": 0.8848, "step": 2970 }, { - "epoch": 0.35877678786419454, - "grad_norm": 7.84375, + "epoch": 2.1255349500713265, + "grad_norm": 7.5625, "learning_rate": 9.787111111111111e-05, - "loss": 0.8574, + "loss": 0.942, "step": 2980 }, { - "epoch": 0.35998073681675896, - "grad_norm": 9.4375, + "epoch": 2.1326676176890156, + "grad_norm": 8.6875, "learning_rate": 9.782666666666666e-05, - "loss": 0.7923, + "loss": 0.9583, "step": 2990 }, { - "epoch": 0.3611846857693234, - "grad_norm": 8.0625, + "epoch": 2.1398002853067046, + "grad_norm": 9.0, "learning_rate": 9.778222222222222e-05, - "loss": 0.863, + "loss": 0.9836, "step": 3000 }, { - "epoch": 0.3611846857693234, - "eval/acc": 41.86046600341797, + "epoch": 2.1398002853067046, + "eval/acc": 48.83720779418945, "step": 3000 }, { - "epoch": 0.3611846857693234, - "eval_loss": 2.5240559577941895, - "eval_runtime": 0.2105, - "eval_samples_per_second": 204.269, - "eval_steps_per_second": 4.75, + "epoch": 2.1398002853067046, + "eval_loss": 1.9625970125198364, + "eval_runtime": 0.2298, + "eval_samples_per_second": 187.09, + "eval_steps_per_second": 4.351, "step": 3000 }, { - "epoch": 0.3623886347218878, - "grad_norm": 6.71875, + "epoch": 2.1469329529243937, + "grad_norm": 8.4375, "learning_rate": 9.773777777777779e-05, - "loss": 0.7726, + "loss": 1.028, "step": 3010 }, { - "epoch": 0.3635925836744522, - "grad_norm": 8.125, + "epoch": 2.1540656205420827, + "grad_norm": 10.4375, "learning_rate": 9.769333333333334e-05, - "loss": 0.8234, + "loss": 0.9209, "step": 3020 }, { - "epoch": 0.3647965326270166, - "grad_norm": 7.90625, + "epoch": 2.1611982881597718, + "grad_norm": 9.3125, "learning_rate": 9.764888888888888e-05, - "loss": 0.8125, + "loss": 0.9999, "step": 3030 }, { - "epoch": 0.36600048157958104, - "grad_norm": 5.875, + "epoch": 2.168330955777461, + "grad_norm": 8.375, "learning_rate": 9.760444444444446e-05, - "loss": 0.739, + "loss": 0.9576, "step": 3040 }, { - "epoch": 0.3672044305321454, - "grad_norm": 32.75, + "epoch": 2.17546362339515, + "grad_norm": 7.4375, "learning_rate": 9.756000000000001e-05, - "loss": 0.8773, + "loss": 0.8832, "step": 3050 }, { - "epoch": 0.36840837948470984, - "grad_norm": 8.625, + "epoch": 2.182596291012839, + "grad_norm": 8.125, "learning_rate": 9.751555555555556e-05, - "loss": 0.6411, + "loss": 0.933, "step": 3060 }, { - "epoch": 0.36961232843727426, - "grad_norm": 10.0625, + "epoch": 2.189728958630528, + "grad_norm": 8.9375, "learning_rate": 9.747111111111112e-05, - "loss": 0.7757, + "loss": 0.9962, "step": 3070 }, { - "epoch": 0.3708162773898387, - "grad_norm": 7.78125, + "epoch": 2.196861626248217, + "grad_norm": 7.1875, "learning_rate": 9.742666666666667e-05, - "loss": 0.8144, + "loss": 1.003, "step": 3080 }, { - "epoch": 0.37202022634240306, - "grad_norm": 8.25, + "epoch": 2.2039942938659056, + "grad_norm": 8.1875, "learning_rate": 9.738222222222223e-05, - "loss": 0.7915, + "loss": 0.9441, "step": 3090 }, { - "epoch": 0.3732241752949675, - "grad_norm": 9.5, + "epoch": 2.2111269614835947, + "grad_norm": 7.21875, "learning_rate": 9.733777777777778e-05, - "loss": 0.7808, + "loss": 1.0335, "step": 3100 }, { - "epoch": 0.3732241752949675, - "eval/acc": 39.53488540649414, + "epoch": 2.2111269614835947, + "eval/acc": 51.16279220581055, "step": 3100 }, { - "epoch": 0.3732241752949675, - "eval_loss": 2.6263325214385986, - "eval_runtime": 0.2107, - "eval_samples_per_second": 204.065, - "eval_steps_per_second": 4.746, + "epoch": 2.2111269614835947, + "eval_loss": 1.8829365968704224, + "eval_runtime": 0.248, + "eval_samples_per_second": 173.381, + "eval_steps_per_second": 4.032, "step": 3100 }, { - "epoch": 0.3744281242475319, - "grad_norm": 7.34375, + "epoch": 2.2182596291012837, + "grad_norm": 9.1875, "learning_rate": 9.729333333333334e-05, - "loss": 0.6467, + "loss": 0.9694, "step": 3110 }, { - "epoch": 0.37563207320009634, - "grad_norm": 10.5625, + "epoch": 2.2253922967189728, + "grad_norm": 6.9375, "learning_rate": 9.724888888888889e-05, - "loss": 0.7271, + "loss": 1.0386, "step": 3120 }, { - "epoch": 0.3768360221526607, - "grad_norm": 19.375, + "epoch": 2.232524964336662, + "grad_norm": 8.6875, "learning_rate": 9.720444444444445e-05, - "loss": 0.8248, + "loss": 0.9614, "step": 3130 }, { - "epoch": 0.37803997110522514, - "grad_norm": 11.6875, + "epoch": 2.239657631954351, + "grad_norm": 8.3125, "learning_rate": 9.716000000000001e-05, - "loss": 0.7468, + "loss": 1.0643, "step": 3140 }, { - "epoch": 0.37924392005778956, - "grad_norm": 6.71875, + "epoch": 2.24679029957204, + "grad_norm": 8.125, "learning_rate": 9.711555555555556e-05, - "loss": 0.8189, + "loss": 0.9243, "step": 3150 }, { - "epoch": 0.38044786901035393, - "grad_norm": 7.15625, + "epoch": 2.253922967189729, + "grad_norm": 9.125, "learning_rate": 9.707111111111111e-05, - "loss": 0.7265, + "loss": 0.8419, "step": 3160 }, { - "epoch": 0.38165181796291836, - "grad_norm": 11.9375, + "epoch": 2.261055634807418, + "grad_norm": 9.125, "learning_rate": 9.702666666666667e-05, - "loss": 0.7502, + "loss": 0.9961, "step": 3170 }, { - "epoch": 0.3828557669154828, - "grad_norm": 7.78125, + "epoch": 2.268188302425107, + "grad_norm": 6.3125, "learning_rate": 9.698222222222223e-05, - "loss": 0.8412, + "loss": 0.8931, "step": 3180 }, { - "epoch": 0.3840597158680472, - "grad_norm": 6.75, + "epoch": 2.275320970042796, + "grad_norm": 7.875, "learning_rate": 9.693777777777778e-05, - "loss": 0.8689, + "loss": 1.0057, "step": 3190 }, { - "epoch": 0.3852636648206116, - "grad_norm": 7.6875, + "epoch": 2.282453637660485, + "grad_norm": 6.90625, "learning_rate": 9.689333333333333e-05, - "loss": 0.8053, + "loss": 0.9606, "step": 3200 }, { - "epoch": 0.3852636648206116, - "eval/acc": 39.53488540649414, + "epoch": 2.282453637660485, + "eval/acc": 48.83720779418945, "step": 3200 }, { - "epoch": 0.3852636648206116, - "eval_loss": 2.6145706176757812, - "eval_runtime": 0.2093, - "eval_samples_per_second": 205.398, - "eval_steps_per_second": 4.777, + "epoch": 2.282453637660485, + "eval_loss": 1.823419451713562, + "eval_runtime": 0.2149, + "eval_samples_per_second": 200.139, + "eval_steps_per_second": 4.654, "step": 3200 }, { - "epoch": 0.386467613773176, - "grad_norm": 7.65625, + "epoch": 2.289586305278174, + "grad_norm": 11.8125, "learning_rate": 9.684888888888889e-05, - "loss": 0.7601, + "loss": 0.9218, "step": 3210 }, { - "epoch": 0.38767156272574044, - "grad_norm": 19.25, + "epoch": 2.2967189728958632, + "grad_norm": 8.9375, "learning_rate": 9.680444444444445e-05, - "loss": 0.7944, + "loss": 1.0111, "step": 3220 }, { - "epoch": 0.38887551167830486, - "grad_norm": 9.375, + "epoch": 2.3038516405135523, + "grad_norm": 8.625, "learning_rate": 9.676e-05, - "loss": 0.839, + "loss": 1.0968, "step": 3230 }, { - "epoch": 0.39007946063086923, - "grad_norm": 8.5, + "epoch": 2.310984308131241, + "grad_norm": 7.1875, "learning_rate": 9.671555555555556e-05, - "loss": 0.7794, + "loss": 1.0236, "step": 3240 }, { - "epoch": 0.39128340958343366, - "grad_norm": 7.78125, + "epoch": 2.31811697574893, + "grad_norm": 6.84375, "learning_rate": 9.667111111111111e-05, - "loss": 0.753, + "loss": 0.92, "step": 3250 }, { - "epoch": 0.3924873585359981, - "grad_norm": 7.15625, + "epoch": 2.325249643366619, + "grad_norm": 8.75, "learning_rate": 9.662666666666667e-05, - "loss": 0.7326, + "loss": 0.8205, "step": 3260 }, { - "epoch": 0.39369130748856246, - "grad_norm": 13.4375, + "epoch": 2.332382310984308, + "grad_norm": 30.75, "learning_rate": 9.658222222222222e-05, - "loss": 0.6754, + "loss": 0.9676, "step": 3270 }, { - "epoch": 0.3948952564411269, - "grad_norm": 6.71875, + "epoch": 2.339514978601997, + "grad_norm": 13.0, "learning_rate": 9.653777777777778e-05, - "loss": 0.757, + "loss": 0.9086, "step": 3280 }, { - "epoch": 0.3960992053936913, - "grad_norm": 7.5625, + "epoch": 2.346647646219686, + "grad_norm": 9.375, "learning_rate": 9.649333333333333e-05, - "loss": 0.9203, + "loss": 1.0504, "step": 3290 }, { - "epoch": 0.39730315434625574, - "grad_norm": 8.375, + "epoch": 2.353780313837375, + "grad_norm": 39.0, "learning_rate": 9.64488888888889e-05, - "loss": 0.8552, + "loss": 0.9481, "step": 3300 }, { - "epoch": 0.39730315434625574, - "eval/acc": 44.1860466003418, + "epoch": 2.353780313837375, + "eval/acc": 46.511627197265625, "step": 3300 }, { - "epoch": 0.39730315434625574, - "eval_loss": 2.571866273880005, - "eval_runtime": 0.2083, - "eval_samples_per_second": 206.479, - "eval_steps_per_second": 4.802, + "epoch": 2.353780313837375, + "eval_loss": 1.9555243253707886, + "eval_runtime": 0.6637, + "eval_samples_per_second": 64.792, + "eval_steps_per_second": 1.507, "step": 3300 }, { - "epoch": 0.3985071032988201, - "grad_norm": 7.5625, + "epoch": 2.3609129814550642, + "grad_norm": 11.4375, "learning_rate": 9.640444444444446e-05, - "loss": 0.7811, + "loss": 0.9641, "step": 3310 }, { - "epoch": 0.39971105225138454, - "grad_norm": 11.75, + "epoch": 2.3680456490727533, + "grad_norm": 9.0625, "learning_rate": 9.636e-05, - "loss": 0.6717, + "loss": 0.9624, "step": 3320 }, { - "epoch": 0.40091500120394896, - "grad_norm": 8.1875, + "epoch": 2.3751783166904423, + "grad_norm": 12.625, "learning_rate": 9.631555555555555e-05, - "loss": 0.838, + "loss": 1.0082, "step": 3330 }, { - "epoch": 0.4021189501565134, - "grad_norm": 6.40625, + "epoch": 2.3823109843081314, + "grad_norm": 7.25, "learning_rate": 9.627111111111112e-05, - "loss": 0.8568, + "loss": 1.0249, "step": 3340 }, { - "epoch": 0.40332289910907776, - "grad_norm": 7.3125, + "epoch": 2.3894436519258204, + "grad_norm": 13.375, "learning_rate": 9.622666666666668e-05, - "loss": 0.6742, + "loss": 1.0153, "step": 3350 }, { - "epoch": 0.4045268480616422, - "grad_norm": 7.875, + "epoch": 2.3965763195435095, + "grad_norm": 6.6875, "learning_rate": 9.618222222222223e-05, - "loss": 0.7849, + "loss": 0.9533, "step": 3360 }, { - "epoch": 0.4057307970142066, - "grad_norm": 8.5625, + "epoch": 2.403708987161198, + "grad_norm": 9.25, "learning_rate": 9.613777777777779e-05, - "loss": 0.7537, + "loss": 1.1051, "step": 3370 }, { - "epoch": 0.406934745966771, - "grad_norm": 8.5625, + "epoch": 2.410841654778887, + "grad_norm": 9.5625, "learning_rate": 9.609333333333334e-05, - "loss": 0.6935, + "loss": 1.0551, "step": 3380 }, { - "epoch": 0.4081386949193354, - "grad_norm": 6.3125, + "epoch": 2.417974322396576, + "grad_norm": 7.21875, "learning_rate": 9.604888888888889e-05, - "loss": 0.8065, + "loss": 0.9032, "step": 3390 }, { - "epoch": 0.40934264387189984, - "grad_norm": 26.25, + "epoch": 2.425106990014265, + "grad_norm": 8.5625, "learning_rate": 9.600444444444445e-05, - "loss": 0.6558, + "loss": 1.1008, "step": 3400 }, { - "epoch": 0.40934264387189984, - "eval/acc": 37.20930099487305, + "epoch": 2.425106990014265, + "eval/acc": 51.16279220581055, "step": 3400 }, { - "epoch": 0.40934264387189984, - "eval_loss": 2.7212982177734375, - "eval_runtime": 0.2094, - "eval_samples_per_second": 205.345, - "eval_steps_per_second": 4.775, + "epoch": 2.425106990014265, + "eval_loss": 1.7766540050506592, + "eval_runtime": 0.2273, + "eval_samples_per_second": 189.16, + "eval_steps_per_second": 4.399, "step": 3400 }, { - "epoch": 0.41054659282446426, - "grad_norm": 6.84375, + "epoch": 2.4322396576319543, + "grad_norm": 10.375, "learning_rate": 9.596000000000001e-05, - "loss": 0.7642, + "loss": 0.9562, "step": 3410 }, { - "epoch": 0.41175054177702863, - "grad_norm": 7.0625, + "epoch": 2.4393723252496433, + "grad_norm": 8.9375, "learning_rate": 9.591555555555556e-05, - "loss": 0.7185, + "loss": 1.0756, "step": 3420 }, { - "epoch": 0.41295449072959306, - "grad_norm": 7.15625, + "epoch": 2.4465049928673324, + "grad_norm": 9.125, "learning_rate": 9.58711111111111e-05, - "loss": 0.6634, + "loss": 0.9554, "step": 3430 }, { - "epoch": 0.4141584396821575, - "grad_norm": 4.96875, + "epoch": 2.4536376604850214, + "grad_norm": 8.9375, "learning_rate": 9.582666666666668e-05, - "loss": 0.6383, + "loss": 0.9122, "step": 3440 }, { - "epoch": 0.4153623886347219, - "grad_norm": 7.15625, + "epoch": 2.4607703281027105, + "grad_norm": 8.625, "learning_rate": 9.578222222222223e-05, - "loss": 0.8032, + "loss": 0.9311, "step": 3450 }, { - "epoch": 0.4165663375872863, - "grad_norm": 9.0625, + "epoch": 2.4679029957203995, + "grad_norm": 6.65625, "learning_rate": 9.573777777777778e-05, - "loss": 0.7294, + "loss": 1.0023, "step": 3460 }, { - "epoch": 0.4177702865398507, - "grad_norm": 9.5, + "epoch": 2.4750356633380886, + "grad_norm": 8.125, "learning_rate": 9.569333333333334e-05, - "loss": 0.802, + "loss": 0.9172, "step": 3470 }, { - "epoch": 0.41897423549241514, - "grad_norm": 7.0, + "epoch": 2.4821683309557776, + "grad_norm": 7.375, "learning_rate": 9.56488888888889e-05, - "loss": 0.7307, + "loss": 0.9407, "step": 3480 }, { - "epoch": 0.4201781844449795, - "grad_norm": 6.34375, + "epoch": 2.4893009985734667, + "grad_norm": 10.25, "learning_rate": 9.560444444444445e-05, - "loss": 0.7239, + "loss": 0.9433, "step": 3490 }, { - "epoch": 0.42138213339754393, - "grad_norm": 6.5, + "epoch": 2.4964336661911553, + "grad_norm": 8.625, "learning_rate": 9.556e-05, - "loss": 0.6711, + "loss": 0.9934, "step": 3500 }, { - "epoch": 0.42138213339754393, - "eval/acc": 39.53488540649414, + "epoch": 2.4964336661911553, + "eval/acc": 53.488372802734375, "step": 3500 }, { - "epoch": 0.42138213339754393, - "eval_loss": 2.569326400756836, - "eval_runtime": 0.2066, - "eval_samples_per_second": 208.137, - "eval_steps_per_second": 4.84, + "epoch": 2.4964336661911553, + "eval_loss": 1.9511696100234985, + "eval_runtime": 0.2213, + "eval_samples_per_second": 194.329, + "eval_steps_per_second": 4.519, "step": 3500 }, { - "epoch": 0.42258608235010836, - "grad_norm": 8.125, + "epoch": 2.5035663338088447, + "grad_norm": 7.625, "learning_rate": 9.551555555555556e-05, - "loss": 0.695, + "loss": 0.9157, "step": 3510 }, { - "epoch": 0.4237900313026728, - "grad_norm": 8.3125, + "epoch": 2.5106990014265333, + "grad_norm": 7.4375, "learning_rate": 9.547111111111111e-05, - "loss": 0.8691, + "loss": 0.9202, "step": 3520 }, { - "epoch": 0.42499398025523716, - "grad_norm": 8.6875, + "epoch": 2.5178316690442224, + "grad_norm": 9.25, "learning_rate": 9.542666666666667e-05, - "loss": 0.7582, + "loss": 0.8526, "step": 3530 }, { - "epoch": 0.4261979292078016, - "grad_norm": 7.25, + "epoch": 2.5249643366619114, + "grad_norm": 7.71875, "learning_rate": 9.538222222222223e-05, - "loss": 0.7143, + "loss": 0.9562, "step": 3540 }, { - "epoch": 0.427401878160366, - "grad_norm": 8.6875, + "epoch": 2.5320970042796005, + "grad_norm": 9.75, "learning_rate": 9.533777777777778e-05, - "loss": 0.6754, + "loss": 0.9927, "step": 3550 }, { - "epoch": 0.42860582711293044, - "grad_norm": 7.8125, + "epoch": 2.5392296718972895, + "grad_norm": 8.1875, "learning_rate": 9.529333333333333e-05, - "loss": 0.7153, + "loss": 0.9263, "step": 3560 }, { - "epoch": 0.4298097760654948, - "grad_norm": 7.5625, + "epoch": 2.5463623395149786, + "grad_norm": 6.9375, "learning_rate": 9.52488888888889e-05, - "loss": 0.7293, + "loss": 0.9367, "step": 3570 }, { - "epoch": 0.43101372501805923, - "grad_norm": 7.5625, + "epoch": 2.5534950071326676, + "grad_norm": 9.5625, "learning_rate": 9.520444444444446e-05, - "loss": 0.7066, + "loss": 0.9284, "step": 3580 }, { - "epoch": 0.43221767397062366, - "grad_norm": 8.1875, + "epoch": 2.5606276747503567, + "grad_norm": 8.5625, "learning_rate": 9.516e-05, - "loss": 0.691, + "loss": 0.8394, "step": 3590 }, { - "epoch": 0.43342162292318803, - "grad_norm": 7.125, + "epoch": 2.5677603423680457, + "grad_norm": 10.25, "learning_rate": 9.511555555555555e-05, - "loss": 0.8239, + "loss": 0.9336, "step": 3600 }, { - "epoch": 0.43342162292318803, - "eval/acc": 44.1860466003418, + "epoch": 2.5677603423680457, + "eval/acc": 46.511627197265625, "step": 3600 }, { - "epoch": 0.43342162292318803, - "eval_loss": 2.4877374172210693, - "eval_runtime": 0.3957, - "eval_samples_per_second": 108.658, - "eval_steps_per_second": 2.527, + "epoch": 2.5677603423680457, + "eval_loss": 1.9759221076965332, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.935, + "eval_steps_per_second": 4.719, "step": 3600 }, { - "epoch": 0.43462557187575246, - "grad_norm": 6.375, + "epoch": 2.574893009985735, + "grad_norm": 10.0625, "learning_rate": 9.507111111111111e-05, - "loss": 0.6782, + "loss": 1.0005, "step": 3610 }, { - "epoch": 0.4358295208283169, - "grad_norm": 7.1875, + "epoch": 2.582025677603424, + "grad_norm": 8.375, "learning_rate": 9.502666666666668e-05, - "loss": 0.7602, + "loss": 0.9319, "step": 3620 }, { - "epoch": 0.4370334697808813, - "grad_norm": 8.125, + "epoch": 2.5891583452211124, + "grad_norm": 8.5, "learning_rate": 9.498222222222222e-05, - "loss": 0.7232, + "loss": 0.9125, "step": 3630 }, { - "epoch": 0.4382374187334457, - "grad_norm": 7.84375, + "epoch": 2.596291012838802, + "grad_norm": 7.71875, "learning_rate": 9.493777777777779e-05, - "loss": 0.729, + "loss": 0.9279, "step": 3640 }, { - "epoch": 0.4394413676860101, - "grad_norm": 8.375, + "epoch": 2.6034236804564905, + "grad_norm": 11.875, "learning_rate": 9.489333333333334e-05, - "loss": 0.8222, + "loss": 0.952, "step": 3650 }, { - "epoch": 0.44064531663857454, - "grad_norm": 8.125, + "epoch": 2.6105563480741796, + "grad_norm": 7.5625, "learning_rate": 9.48488888888889e-05, - "loss": 0.6918, + "loss": 1.0043, "step": 3660 }, { - "epoch": 0.44184926559113896, - "grad_norm": 8.1875, + "epoch": 2.6176890156918686, + "grad_norm": 11.5625, "learning_rate": 9.480444444444445e-05, - "loss": 0.6761, + "loss": 0.8932, "step": 3670 }, { - "epoch": 0.44305321454370333, - "grad_norm": 5.65625, + "epoch": 2.6248216833095577, + "grad_norm": 7.84375, "learning_rate": 9.476000000000001e-05, - "loss": 0.7532, + "loss": 0.8775, "step": 3680 }, { - "epoch": 0.44425716349626776, - "grad_norm": 8.8125, + "epoch": 2.6319543509272467, + "grad_norm": 9.0, "learning_rate": 9.471555555555556e-05, - "loss": 0.7072, + "loss": 0.9756, "step": 3690 }, { - "epoch": 0.4454611124488322, - "grad_norm": 6.5625, + "epoch": 2.6390870185449358, + "grad_norm": 7.375, "learning_rate": 9.46711111111111e-05, - "loss": 0.8405, + "loss": 0.9345, "step": 3700 }, { - "epoch": 0.4454611124488322, - "eval/acc": 39.53488540649414, + "epoch": 2.6390870185449358, + "eval/acc": 51.16279220581055, "step": 3700 }, { - "epoch": 0.4454611124488322, - "eval_loss": 2.615053176879883, - "eval_runtime": 4.8304, - "eval_samples_per_second": 8.902, - "eval_steps_per_second": 0.207, + "epoch": 2.6390870185449358, + "eval_loss": 1.9134849309921265, + "eval_runtime": 0.2322, + "eval_samples_per_second": 185.194, + "eval_steps_per_second": 4.307, "step": 3700 }, { - "epoch": 0.44666506140139656, - "grad_norm": 8.6875, + "epoch": 2.646219686162625, + "grad_norm": 8.4375, "learning_rate": 9.462666666666668e-05, - "loss": 0.7249, + "loss": 0.9851, "step": 3710 }, { - "epoch": 0.447869010353961, - "grad_norm": 8.4375, + "epoch": 2.653352353780314, + "grad_norm": 31.75, "learning_rate": 9.458222222222223e-05, - "loss": 0.8561, + "loss": 0.9712, "step": 3720 }, { - "epoch": 0.4490729593065254, - "grad_norm": 7.3125, + "epoch": 2.660485021398003, + "grad_norm": 6.75, "learning_rate": 9.453777777777778e-05, - "loss": 0.7884, + "loss": 0.8641, "step": 3730 }, { - "epoch": 0.45027690825908984, - "grad_norm": 7.34375, + "epoch": 2.667617689015692, + "grad_norm": 6.5625, "learning_rate": 9.449333333333334e-05, - "loss": 0.7169, + "loss": 0.945, "step": 3740 }, { - "epoch": 0.4514808572116542, - "grad_norm": 5.5, + "epoch": 2.674750356633381, + "grad_norm": 6.0625, "learning_rate": 9.44488888888889e-05, - "loss": 0.7542, + "loss": 0.9535, "step": 3750 }, { - "epoch": 0.45268480616421863, - "grad_norm": 6.09375, + "epoch": 2.68188302425107, + "grad_norm": 7.90625, "learning_rate": 9.440444444444445e-05, - "loss": 0.6292, + "loss": 0.8844, "step": 3760 }, { - "epoch": 0.45388875511678306, - "grad_norm": 8.9375, + "epoch": 2.689015691868759, + "grad_norm": 9.8125, "learning_rate": 9.436e-05, - "loss": 0.6682, + "loss": 0.9064, "step": 3770 }, { - "epoch": 0.4550927040693475, - "grad_norm": 5.09375, + "epoch": 2.6961483594864477, + "grad_norm": 8.4375, "learning_rate": 9.431555555555556e-05, - "loss": 0.6499, + "loss": 1.0119, "step": 3780 }, { - "epoch": 0.45629665302191186, - "grad_norm": 8.5, + "epoch": 2.703281027104137, + "grad_norm": 7.15625, "learning_rate": 9.427111111111112e-05, - "loss": 0.7859, + "loss": 0.9655, "step": 3790 }, { - "epoch": 0.4575006019744763, - "grad_norm": 14.5, + "epoch": 2.710413694721826, + "grad_norm": 9.4375, "learning_rate": 9.422666666666667e-05, - "loss": 0.7987, + "loss": 0.9187, "step": 3800 }, { - "epoch": 0.4575006019744763, - "eval/acc": 39.53488540649414, + "epoch": 2.710413694721826, + "eval/acc": 51.16279220581055, "step": 3800 }, { - "epoch": 0.4575006019744763, - "eval_loss": 2.645066022872925, - "eval_runtime": 0.6165, - "eval_samples_per_second": 69.745, - "eval_steps_per_second": 1.622, + "epoch": 2.710413694721826, + "eval_loss": 1.9277268648147583, + "eval_runtime": 0.2166, + "eval_samples_per_second": 198.481, + "eval_steps_per_second": 4.616, "step": 3800 }, { - "epoch": 0.4587045509270407, - "grad_norm": 6.25, + "epoch": 2.717546362339515, + "grad_norm": 9.25, "learning_rate": 9.418222222222223e-05, - "loss": 0.7035, + "loss": 0.8689, "step": 3810 }, { - "epoch": 0.4599084998796051, - "grad_norm": 6.46875, + "epoch": 2.724679029957204, + "grad_norm": 8.0625, "learning_rate": 9.413777777777778e-05, - "loss": 0.6329, + "loss": 0.9138, "step": 3820 }, { - "epoch": 0.4611124488321695, - "grad_norm": 8.875, + "epoch": 2.731811697574893, + "grad_norm": 14.3125, "learning_rate": 9.409333333333333e-05, - "loss": 0.7553, + "loss": 0.9129, "step": 3830 }, { - "epoch": 0.46231639778473393, - "grad_norm": 9.3125, + "epoch": 2.738944365192582, + "grad_norm": 6.78125, "learning_rate": 9.404888888888889e-05, - "loss": 0.6551, + "loss": 0.8666, "step": 3840 }, { - "epoch": 0.46352034673729836, - "grad_norm": 11.0625, + "epoch": 2.746077032810271, + "grad_norm": 7.4375, "learning_rate": 9.400444444444445e-05, - "loss": 0.6634, + "loss": 0.9474, "step": 3850 }, { - "epoch": 0.46472429568986273, - "grad_norm": 6.71875, + "epoch": 2.75320970042796, + "grad_norm": 7.46875, "learning_rate": 9.396e-05, - "loss": 0.6527, + "loss": 0.9312, "step": 3860 }, { - "epoch": 0.46592824464242716, - "grad_norm": 6.75, + "epoch": 2.760342368045649, + "grad_norm": 7.84375, "learning_rate": 9.391555555555555e-05, - "loss": 0.8268, + "loss": 0.943, "step": 3870 }, { - "epoch": 0.4671321935949916, - "grad_norm": 7.78125, + "epoch": 2.767475035663338, + "grad_norm": 8.125, "learning_rate": 9.387111111111113e-05, - "loss": 0.742, + "loss": 0.9471, "step": 3880 }, { - "epoch": 0.468336142547556, - "grad_norm": 6.53125, + "epoch": 2.7746077032810272, + "grad_norm": 10.5625, "learning_rate": 9.382666666666667e-05, - "loss": 0.7446, + "loss": 0.9785, "step": 3890 }, { - "epoch": 0.4695400915001204, - "grad_norm": 7.0625, + "epoch": 2.7817403708987163, + "grad_norm": 10.5, "learning_rate": 9.378222222222222e-05, - "loss": 0.7764, + "loss": 1.0151, "step": 3900 }, { - "epoch": 0.4695400915001204, - "eval/acc": 37.79069900512695, + "epoch": 2.7817403708987163, + "eval/acc": 44.1860466003418, "step": 3900 }, { - "epoch": 0.4695400915001204, - "eval_loss": 2.6463897228240967, - "eval_runtime": 1.4145, - "eval_samples_per_second": 30.4, - "eval_steps_per_second": 0.707, + "epoch": 2.7817403708987163, + "eval_loss": 1.970198154449463, + "eval_runtime": 0.2139, + "eval_samples_per_second": 201.028, + "eval_steps_per_second": 4.675, "step": 3900 }, { - "epoch": 0.4707440404526848, - "grad_norm": 5.625, + "epoch": 2.788873038516405, + "grad_norm": 9.75, "learning_rate": 9.373777777777778e-05, - "loss": 0.7248, + "loss": 0.9148, "step": 3910 }, { - "epoch": 0.47194798940524924, - "grad_norm": 7.09375, + "epoch": 2.7960057061340944, + "grad_norm": 9.1875, "learning_rate": 9.369333333333333e-05, - "loss": 0.6977, + "loss": 1.0314, "step": 3920 }, { - "epoch": 0.4731519383578136, - "grad_norm": 7.53125, + "epoch": 2.803138373751783, + "grad_norm": 8.375, "learning_rate": 9.36488888888889e-05, - "loss": 0.6496, + "loss": 0.9076, "step": 3930 }, { - "epoch": 0.47435588731037803, - "grad_norm": 11.0, + "epoch": 2.810271041369472, + "grad_norm": 6.46875, "learning_rate": 9.360444444444444e-05, - "loss": 0.7309, + "loss": 0.8218, "step": 3940 }, { - "epoch": 0.47555983626294246, - "grad_norm": 10.5625, + "epoch": 2.817403708987161, + "grad_norm": 7.96875, "learning_rate": 9.356e-05, - "loss": 0.7837, + "loss": 0.9415, "step": 3950 }, { - "epoch": 0.4767637852155069, - "grad_norm": 6.9375, + "epoch": 2.82453637660485, + "grad_norm": 7.53125, "learning_rate": 9.351555555555555e-05, - "loss": 0.6769, + "loss": 0.9593, "step": 3960 }, { - "epoch": 0.47796773416807126, - "grad_norm": 6.84375, + "epoch": 2.831669044222539, + "grad_norm": 5.96875, "learning_rate": 9.347111111111112e-05, - "loss": 0.642, + "loss": 0.9134, "step": 3970 }, { - "epoch": 0.4791716831206357, - "grad_norm": 9.125, + "epoch": 2.8388017118402282, + "grad_norm": 8.25, "learning_rate": 9.342666666666668e-05, - "loss": 0.6947, + "loss": 0.9339, "step": 3980 }, { - "epoch": 0.4803756320732001, - "grad_norm": 7.4375, + "epoch": 2.8459343794579173, + "grad_norm": 9.625, "learning_rate": 9.338222222222223e-05, - "loss": 0.5902, + "loss": 1.0018, "step": 3990 }, { - "epoch": 0.4815795810257645, - "grad_norm": 8.1875, + "epoch": 2.8530670470756063, + "grad_norm": 7.8125, "learning_rate": 9.333777777777777e-05, - "loss": 0.6075, + "loss": 0.9302, "step": 4000 }, { - "epoch": 0.4815795810257645, - "eval/acc": 34.88372039794922, + "epoch": 2.8530670470756063, + "eval/acc": 46.511627197265625, "step": 4000 }, { - "epoch": 0.4815795810257645, - "eval_loss": 2.6985960006713867, - "eval_runtime": 0.2767, - "eval_samples_per_second": 155.399, - "eval_steps_per_second": 3.614, + "epoch": 2.8530670470756063, + "eval_loss": 1.830549716949463, + "eval_runtime": 0.2202, + "eval_samples_per_second": 195.244, + "eval_steps_per_second": 4.541, "step": 4000 }, { - "epoch": 0.4827835299783289, - "grad_norm": 6.8125, + "epoch": 2.8601997146932954, + "grad_norm": 6.65625, "learning_rate": 9.329333333333334e-05, - "loss": 0.7166, + "loss": 0.9375, "step": 4010 }, { - "epoch": 0.48398747893089333, - "grad_norm": 6.375, + "epoch": 2.8673323823109844, + "grad_norm": 11.875, "learning_rate": 9.32488888888889e-05, - "loss": 0.6136, + "loss": 0.8406, "step": 4020 }, { - "epoch": 0.48519142788345776, - "grad_norm": 6.09375, + "epoch": 2.8744650499286735, + "grad_norm": 8.1875, "learning_rate": 9.320444444444445e-05, - "loss": 0.7948, + "loss": 0.8863, "step": 4030 }, { - "epoch": 0.48639537683602213, - "grad_norm": 7.5625, + "epoch": 2.881597717546362, + "grad_norm": 6.9375, "learning_rate": 9.316000000000001e-05, - "loss": 0.7253, + "loss": 0.9546, "step": 4040 }, { - "epoch": 0.48759932578858656, - "grad_norm": 7.1875, + "epoch": 2.8887303851640516, + "grad_norm": 8.625, "learning_rate": 9.311555555555556e-05, - "loss": 0.7386, + "loss": 1.0175, "step": 4050 }, { - "epoch": 0.488803274741151, - "grad_norm": 7.71875, + "epoch": 2.89586305278174, + "grad_norm": 45.0, "learning_rate": 9.307111111111112e-05, - "loss": 0.7222, + "loss": 0.9058, "step": 4060 }, { - "epoch": 0.4900072236937154, - "grad_norm": 10.8125, + "epoch": 2.9029957203994297, + "grad_norm": 13.625, "learning_rate": 9.302666666666667e-05, - "loss": 0.6298, + "loss": 0.9137, "step": 4070 }, { - "epoch": 0.4912111726462798, - "grad_norm": 14.25, + "epoch": 2.9101283880171183, + "grad_norm": 6.8125, "learning_rate": 9.298222222222223e-05, - "loss": 0.6551, + "loss": 0.8862, "step": 4080 }, { - "epoch": 0.4924151215988442, - "grad_norm": 7.75, + "epoch": 2.9172610556348073, + "grad_norm": 13.8125, "learning_rate": 9.293777777777778e-05, - "loss": 0.7201, + "loss": 0.9152, "step": 4090 }, { - "epoch": 0.49361907055140863, - "grad_norm": 9.0625, + "epoch": 2.9243937232524964, + "grad_norm": 13.3125, "learning_rate": 9.289333333333334e-05, - "loss": 0.708, + "loss": 0.9623, "step": 4100 }, { - "epoch": 0.49361907055140863, - "eval/acc": 34.88372039794922, + "epoch": 2.9243937232524964, + "eval/acc": 46.511627197265625, "step": 4100 }, { - "epoch": 0.49361907055140863, - "eval_loss": 2.7673676013946533, - "eval_runtime": 0.3468, - "eval_samples_per_second": 124.003, - "eval_steps_per_second": 2.884, + "epoch": 2.9243937232524964, + "eval_loss": 1.9800893068313599, + "eval_runtime": 0.2162, + "eval_samples_per_second": 198.931, + "eval_steps_per_second": 4.626, "step": 4100 }, { - "epoch": 0.494823019503973, - "grad_norm": 7.9375, + "epoch": 2.9315263908701854, + "grad_norm": 7.1875, "learning_rate": 9.28488888888889e-05, - "loss": 0.6997, + "loss": 0.9088, "step": 4110 }, { - "epoch": 0.49602696845653743, - "grad_norm": 6.84375, + "epoch": 2.9386590584878745, + "grad_norm": 8.3125, "learning_rate": 9.280444444444445e-05, - "loss": 0.6195, + "loss": 0.9927, "step": 4120 }, { - "epoch": 0.49723091740910186, - "grad_norm": 7.40625, + "epoch": 2.9457917261055635, + "grad_norm": 75.0, "learning_rate": 9.276e-05, - "loss": 0.765, + "loss": 0.912, "step": 4130 }, { - "epoch": 0.4984348663616663, - "grad_norm": 7.8125, + "epoch": 2.9529243937232525, + "grad_norm": 9.125, "learning_rate": 9.271555555555556e-05, - "loss": 0.7097, + "loss": 0.9878, "step": 4140 }, { - "epoch": 0.49963881531423066, - "grad_norm": 7.75, + "epoch": 2.9600570613409416, + "grad_norm": 7.125, "learning_rate": 9.267111111111112e-05, - "loss": 0.7067, + "loss": 0.8785, "step": 4150 }, { - "epoch": 0.5008427642667951, - "grad_norm": 27.875, + "epoch": 2.9671897289586306, + "grad_norm": 8.25, "learning_rate": 9.262666666666667e-05, - "loss": 0.7989, + "loss": 0.9296, "step": 4160 }, { - "epoch": 0.5020467132193595, - "grad_norm": 8.0, + "epoch": 2.9743223965763197, + "grad_norm": 8.75, "learning_rate": 9.258222222222222e-05, - "loss": 0.6744, + "loss": 0.9284, "step": 4170 }, { - "epoch": 0.5032506621719239, - "grad_norm": 7.96875, + "epoch": 2.9814550641940087, + "grad_norm": 8.8125, "learning_rate": 9.253777777777778e-05, - "loss": 0.738, + "loss": 0.9566, "step": 4180 }, { - "epoch": 0.5044546111244883, - "grad_norm": 7.21875, + "epoch": 2.9885877318116973, + "grad_norm": 6.90625, "learning_rate": 9.249333333333334e-05, - "loss": 0.7021, + "loss": 0.8368, "step": 4190 }, { - "epoch": 0.5056585600770528, - "grad_norm": 9.6875, + "epoch": 2.995720399429387, + "grad_norm": 9.875, "learning_rate": 9.244888888888889e-05, - "loss": 0.7133, + "loss": 1.0306, "step": 4200 }, { - "epoch": 0.5056585600770528, - "eval/acc": 32.55813980102539, + "epoch": 2.995720399429387, + "eval/acc": 51.16279220581055, "step": 4200 }, { - "epoch": 0.5056585600770528, - "eval_loss": 2.7288577556610107, - "eval_runtime": 0.2266, - "eval_samples_per_second": 189.803, - "eval_steps_per_second": 4.414, + "epoch": 2.995720399429387, + "eval_loss": 1.8740426301956177, + "eval_runtime": 0.2157, + "eval_samples_per_second": 199.368, + "eval_steps_per_second": 4.636, "step": 4200 }, { - "epoch": 0.5068625090296172, - "grad_norm": 10.5, + "epoch": 3.0028530670470754, + "grad_norm": 9.5625, "learning_rate": 9.240444444444445e-05, - "loss": 0.6886, + "loss": 0.957, "step": 4210 }, { - "epoch": 0.5080664579821815, - "grad_norm": 9.0625, + "epoch": 3.0099857346647645, + "grad_norm": 13.5625, "learning_rate": 9.236e-05, - "loss": 0.7944, + "loss": 0.884, "step": 4220 }, { - "epoch": 0.509270406934746, - "grad_norm": 7.78125, + "epoch": 3.0171184022824535, + "grad_norm": 8.0625, "learning_rate": 9.231555555555555e-05, - "loss": 0.7869, + "loss": 0.9064, "step": 4230 }, { - "epoch": 0.5104743558873104, - "grad_norm": 6.375, + "epoch": 3.0242510699001426, + "grad_norm": 8.0, "learning_rate": 9.227111111111111e-05, - "loss": 0.6245, + "loss": 0.9164, "step": 4240 }, { - "epoch": 0.5116783048398748, - "grad_norm": 9.9375, + "epoch": 3.0313837375178316, + "grad_norm": 8.4375, "learning_rate": 9.222666666666668e-05, - "loss": 0.7006, + "loss": 0.9787, "step": 4250 }, { - "epoch": 0.5128822537924392, - "grad_norm": 6.1875, + "epoch": 3.0385164051355207, + "grad_norm": 7.6875, "learning_rate": 9.218222222222222e-05, - "loss": 0.7588, + "loss": 0.8852, "step": 4260 }, { - "epoch": 0.5140862027450036, - "grad_norm": 10.6875, + "epoch": 3.0456490727532097, + "grad_norm": 7.15625, "learning_rate": 9.213777777777777e-05, - "loss": 0.737, + "loss": 1.0092, "step": 4270 }, { - "epoch": 0.515290151697568, - "grad_norm": 6.15625, + "epoch": 3.0527817403708988, + "grad_norm": 6.65625, "learning_rate": 9.209333333333335e-05, - "loss": 0.6774, + "loss": 0.9972, "step": 4280 }, { - "epoch": 0.5164941006501325, - "grad_norm": 8.8125, + "epoch": 3.059914407988588, + "grad_norm": 7.25, "learning_rate": 9.20488888888889e-05, - "loss": 0.6972, + "loss": 0.9237, "step": 4290 }, { - "epoch": 0.5176980496026968, - "grad_norm": 6.40625, + "epoch": 3.067047075606277, + "grad_norm": 6.4375, "learning_rate": 9.200444444444445e-05, - "loss": 0.6423, + "loss": 0.9096, "step": 4300 }, { - "epoch": 0.5176980496026968, - "eval/acc": 38.953487396240234, + "epoch": 3.067047075606277, + "eval/acc": 39.53488540649414, "step": 4300 }, { - "epoch": 0.5176980496026968, - "eval_loss": 2.7444300651550293, - "eval_runtime": 0.2708, - "eval_samples_per_second": 158.776, - "eval_steps_per_second": 3.692, + "epoch": 3.067047075606277, + "eval_loss": 2.0920908451080322, + "eval_runtime": 1.7384, + "eval_samples_per_second": 24.736, + "eval_steps_per_second": 0.575, "step": 4300 }, { - "epoch": 0.5189019985552613, - "grad_norm": 6.8125, + "epoch": 3.074179743223966, + "grad_norm": 8.4375, "learning_rate": 9.196000000000001e-05, - "loss": 0.7705, + "loss": 0.9697, "step": 4310 }, { - "epoch": 0.5201059475078257, - "grad_norm": 5.90625, + "epoch": 3.081312410841655, + "grad_norm": 8.4375, "learning_rate": 9.191555555555556e-05, - "loss": 0.7534, + "loss": 0.8379, "step": 4320 }, { - "epoch": 0.52130989646039, - "grad_norm": 9.25, + "epoch": 3.088445078459344, + "grad_norm": 8.125, "learning_rate": 9.187111111111112e-05, - "loss": 0.6586, + "loss": 0.8576, "step": 4330 }, { - "epoch": 0.5225138454129545, - "grad_norm": 7.53125, + "epoch": 3.0955777460770326, + "grad_norm": 10.75, "learning_rate": 9.182666666666667e-05, - "loss": 0.7459, + "loss": 0.9616, "step": 4340 }, { - "epoch": 0.5237177943655189, - "grad_norm": 6.09375, + "epoch": 3.1027104136947217, + "grad_norm": 6.84375, "learning_rate": 9.178222222222223e-05, - "loss": 0.7088, + "loss": 0.7674, "step": 4350 }, { - "epoch": 0.5249217433180833, - "grad_norm": 8.5, + "epoch": 3.1098430813124107, + "grad_norm": 8.375, "learning_rate": 9.173777777777778e-05, - "loss": 0.7313, + "loss": 0.8712, "step": 4360 }, { - "epoch": 0.5261256922706478, - "grad_norm": 8.8125, + "epoch": 3.1169757489300998, + "grad_norm": 8.375, "learning_rate": 9.169333333333334e-05, - "loss": 0.7364, + "loss": 0.8599, "step": 4370 }, { - "epoch": 0.5273296412232121, - "grad_norm": 7.09375, + "epoch": 3.124108416547789, + "grad_norm": 7.1875, "learning_rate": 9.16488888888889e-05, - "loss": 0.6962, + "loss": 0.9736, "step": 4380 }, { - "epoch": 0.5285335901757765, - "grad_norm": 6.28125, + "epoch": 3.131241084165478, + "grad_norm": 7.75, "learning_rate": 9.160444444444445e-05, - "loss": 0.6817, + "loss": 0.8663, "step": 4390 }, { - "epoch": 0.529737539128341, - "grad_norm": 8.25, + "epoch": 3.138373751783167, + "grad_norm": 7.53125, "learning_rate": 9.156e-05, - "loss": 0.6786, + "loss": 0.9221, "step": 4400 }, { - "epoch": 0.529737539128341, - "eval/acc": 34.88372039794922, + "epoch": 3.138373751783167, + "eval/acc": 39.53488540649414, "step": 4400 }, { - "epoch": 0.529737539128341, - "eval_loss": 2.728501081466675, - "eval_runtime": 0.3599, - "eval_samples_per_second": 119.474, - "eval_steps_per_second": 2.778, + "epoch": 3.138373751783167, + "eval_loss": 2.127244710922241, + "eval_runtime": 0.2239, + "eval_samples_per_second": 192.022, + "eval_steps_per_second": 4.466, "step": 4400 }, { - "epoch": 0.5309414880809054, - "grad_norm": 7.59375, + "epoch": 3.145506419400856, + "grad_norm": 8.125, "learning_rate": 9.151555555555556e-05, - "loss": 0.6744, + "loss": 0.9144, "step": 4410 }, { - "epoch": 0.5321454370334697, - "grad_norm": 8.0625, + "epoch": 3.152639087018545, + "grad_norm": 7.46875, "learning_rate": 9.147111111111112e-05, - "loss": 0.8287, + "loss": 0.9445, "step": 4420 }, { - "epoch": 0.5333493859860342, - "grad_norm": 8.1875, + "epoch": 3.159771754636234, + "grad_norm": 6.9375, "learning_rate": 9.142666666666667e-05, - "loss": 0.7069, + "loss": 0.8308, "step": 4430 }, { - "epoch": 0.5345533349385986, - "grad_norm": 8.125, + "epoch": 3.166904422253923, + "grad_norm": 7.53125, "learning_rate": 9.138222222222222e-05, - "loss": 0.662, + "loss": 0.8428, "step": 4440 }, { - "epoch": 0.5357572838911631, - "grad_norm": 7.46875, + "epoch": 3.174037089871612, + "grad_norm": 7.96875, "learning_rate": 9.133777777777778e-05, - "loss": 0.7424, + "loss": 0.9022, "step": 4450 }, { - "epoch": 0.5369612328437274, - "grad_norm": 6.96875, + "epoch": 3.181169757489301, + "grad_norm": 6.875, "learning_rate": 9.129333333333334e-05, - "loss": 0.7308, + "loss": 0.9955, "step": 4460 }, { - "epoch": 0.5381651817962918, - "grad_norm": 8.3125, + "epoch": 3.18830242510699, + "grad_norm": 9.5625, "learning_rate": 9.124888888888889e-05, - "loss": 0.7524, + "loss": 0.9493, "step": 4470 }, { - "epoch": 0.5393691307488563, - "grad_norm": 6.40625, + "epoch": 3.195435092724679, + "grad_norm": 9.0625, "learning_rate": 9.120444444444445e-05, - "loss": 0.7523, + "loss": 0.9608, "step": 4480 }, { - "epoch": 0.5405730797014207, - "grad_norm": 7.65625, + "epoch": 3.202567760342368, + "grad_norm": 8.625, "learning_rate": 9.116e-05, - "loss": 0.647, + "loss": 0.821, "step": 4490 }, { - "epoch": 0.541777028653985, - "grad_norm": 6.875, + "epoch": 3.209700427960057, + "grad_norm": 8.125, "learning_rate": 9.111555555555556e-05, - "loss": 0.6547, + "loss": 0.9175, "step": 4500 }, { - "epoch": 0.541777028653985, - "eval/acc": 37.20930099487305, + "epoch": 3.209700427960057, + "eval/acc": 39.53488540649414, "step": 4500 }, { - "epoch": 0.541777028653985, - "eval_loss": 2.8390543460845947, - "eval_runtime": 0.2096, - "eval_samples_per_second": 205.2, - "eval_steps_per_second": 4.772, + "epoch": 3.209700427960057, + "eval_loss": 2.062082529067993, + "eval_runtime": 0.2236, + "eval_samples_per_second": 192.267, + "eval_steps_per_second": 4.471, "step": 4500 }, { - "epoch": 0.5429809776065495, - "grad_norm": 9.375, + "epoch": 3.216833095577746, + "grad_norm": 8.0625, "learning_rate": 9.107111111111111e-05, - "loss": 0.6773, + "loss": 0.9169, "step": 4510 }, { - "epoch": 0.5441849265591139, - "grad_norm": 10.1875, + "epoch": 3.223965763195435, + "grad_norm": 8.3125, "learning_rate": 9.102666666666667e-05, - "loss": 0.704, + "loss": 0.8001, "step": 4520 }, { - "epoch": 0.5453888755116783, - "grad_norm": 5.0625, + "epoch": 3.231098430813124, + "grad_norm": 7.3125, "learning_rate": 9.098222222222222e-05, - "loss": 0.6303, + "loss": 0.8513, "step": 4530 }, { - "epoch": 0.5465928244642427, - "grad_norm": 8.25, + "epoch": 3.238231098430813, + "grad_norm": 7.625, "learning_rate": 9.093777777777777e-05, - "loss": 0.7469, + "loss": 0.912, "step": 4540 }, { - "epoch": 0.5477967734168071, - "grad_norm": 7.375, + "epoch": 3.245363766048502, + "grad_norm": 6.46875, "learning_rate": 9.089333333333335e-05, - "loss": 0.6995, + "loss": 0.9418, "step": 4550 }, { - "epoch": 0.5490007223693716, - "grad_norm": 7.78125, + "epoch": 3.2524964336661912, + "grad_norm": 11.9375, "learning_rate": 9.08488888888889e-05, - "loss": 0.6965, + "loss": 0.871, "step": 4560 }, { - "epoch": 0.550204671321936, - "grad_norm": 13.625, + "epoch": 3.2596291012838803, + "grad_norm": 7.03125, "learning_rate": 9.080444444444444e-05, - "loss": 0.759, + "loss": 0.8507, "step": 4570 }, { - "epoch": 0.5514086202745003, - "grad_norm": 6.875, + "epoch": 3.2667617689015693, + "grad_norm": 7.21875, "learning_rate": 9.076e-05, - "loss": 0.7284, + "loss": 0.8058, "step": 4580 }, { - "epoch": 0.5526125692270648, - "grad_norm": 5.875, + "epoch": 3.2738944365192584, + "grad_norm": 8.4375, "learning_rate": 9.071555555555557e-05, - "loss": 0.6721, + "loss": 0.7959, "step": 4590 }, { - "epoch": 0.5538165181796292, - "grad_norm": 5.46875, + "epoch": 3.281027104136947, + "grad_norm": 6.375, "learning_rate": 9.067111111111112e-05, - "loss": 0.6522, + "loss": 0.9206, "step": 4600 }, { - "epoch": 0.5538165181796292, - "eval/acc": 39.53488540649414, + "epoch": 3.281027104136947, + "eval/acc": 37.20930099487305, "step": 4600 }, { - "epoch": 0.5538165181796292, - "eval_loss": 2.801618814468384, - "eval_runtime": 0.2155, - "eval_samples_per_second": 199.501, - "eval_steps_per_second": 4.64, + "epoch": 3.281027104136947, + "eval_loss": 2.1272616386413574, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.747, + "eval_steps_per_second": 4.529, "step": 4600 }, { - "epoch": 0.5550204671321936, - "grad_norm": 8.5625, + "epoch": 3.2881597717546365, + "grad_norm": 7.65625, "learning_rate": 9.062666666666666e-05, - "loss": 0.6399, + "loss": 0.8306, "step": 4610 }, { - "epoch": 0.556224416084758, - "grad_norm": 7.40625, + "epoch": 3.295292439372325, + "grad_norm": 6.9375, "learning_rate": 9.058222222222223e-05, - "loss": 0.7303, + "loss": 0.8958, "step": 4620 }, { - "epoch": 0.5574283650373224, - "grad_norm": 6.96875, + "epoch": 3.302425106990014, + "grad_norm": 7.96875, "learning_rate": 9.053777777777777e-05, - "loss": 0.7126, + "loss": 0.8919, "step": 4630 }, { - "epoch": 0.5586323139898868, - "grad_norm": 7.15625, + "epoch": 3.309557774607703, + "grad_norm": 6.9375, "learning_rate": 9.049333333333334e-05, - "loss": 0.702, + "loss": 0.8844, "step": 4640 }, { - "epoch": 0.5598362629424513, - "grad_norm": 6.625, + "epoch": 3.316690442225392, + "grad_norm": 7.21875, "learning_rate": 9.04488888888889e-05, - "loss": 0.6957, + "loss": 0.8335, "step": 4650 }, { - "epoch": 0.5610402118950156, - "grad_norm": 7.90625, + "epoch": 3.3238231098430813, + "grad_norm": 7.6875, "learning_rate": 9.040444444444445e-05, - "loss": 0.703, + "loss": 0.9337, "step": 4660 }, { - "epoch": 0.5622441608475801, - "grad_norm": 7.75, + "epoch": 3.3309557774607703, + "grad_norm": 9.25, "learning_rate": 9.036e-05, - "loss": 0.7195, + "loss": 1.0282, "step": 4670 }, { - "epoch": 0.5634481098001445, - "grad_norm": 6.59375, + "epoch": 3.3380884450784594, + "grad_norm": 7.96875, "learning_rate": 9.031555555555557e-05, - "loss": 0.6445, + "loss": 0.9401, "step": 4680 }, { - "epoch": 0.5646520587527089, - "grad_norm": 25.125, + "epoch": 3.3452211126961484, + "grad_norm": 7.25, "learning_rate": 9.027111111111112e-05, - "loss": 0.699, + "loss": 0.908, "step": 4690 }, { - "epoch": 0.5658560077052733, - "grad_norm": 8.125, + "epoch": 3.3523537803138375, + "grad_norm": 7.8125, "learning_rate": 9.022666666666667e-05, - "loss": 0.716, + "loss": 0.9262, "step": 4700 }, { - "epoch": 0.5658560077052733, - "eval/acc": 34.88372039794922, + "epoch": 3.3523537803138375, + "eval/acc": 37.20930099487305, "step": 4700 }, { - "epoch": 0.5658560077052733, - "eval_loss": 2.777444839477539, - "eval_runtime": 0.218, - "eval_samples_per_second": 197.287, - "eval_steps_per_second": 4.588, + "epoch": 3.3523537803138375, + "eval_loss": 2.07535719871521, + "eval_runtime": 0.2246, + "eval_samples_per_second": 191.478, + "eval_steps_per_second": 4.453, "step": 4700 }, { - "epoch": 0.5670599566578377, - "grad_norm": 7.0, + "epoch": 3.3594864479315265, + "grad_norm": 13.0, "learning_rate": 9.018222222222223e-05, - "loss": 0.693, + "loss": 0.9692, "step": 4710 }, { - "epoch": 0.5682639056104021, - "grad_norm": 8.8125, + "epoch": 3.3666191155492156, + "grad_norm": 5.875, "learning_rate": 9.013777777777779e-05, - "loss": 0.7, + "loss": 0.9071, "step": 4720 }, { - "epoch": 0.5694678545629666, - "grad_norm": 7.0, + "epoch": 3.3737517831669046, + "grad_norm": 7.71875, "learning_rate": 9.009333333333334e-05, - "loss": 0.6616, + "loss": 0.8528, "step": 4730 }, { - "epoch": 0.5706718035155309, - "grad_norm": 7.75, + "epoch": 3.3808844507845937, + "grad_norm": 7.46875, "learning_rate": 9.004888888888889e-05, - "loss": 0.7987, + "loss": 0.9408, "step": 4740 }, { - "epoch": 0.5718757524680953, - "grad_norm": 6.53125, + "epoch": 3.3880171184022823, + "grad_norm": 7.8125, "learning_rate": 9.000444444444445e-05, - "loss": 0.7162, + "loss": 1.0017, "step": 4750 }, { - "epoch": 0.5730797014206598, - "grad_norm": 8.6875, + "epoch": 3.3951497860199713, + "grad_norm": 6.15625, "learning_rate": 8.996e-05, - "loss": 0.673, + "loss": 0.9107, "step": 4760 }, { - "epoch": 0.5742836503732242, - "grad_norm": 6.5625, + "epoch": 3.4022824536376604, + "grad_norm": 6.8125, "learning_rate": 8.991555555555556e-05, - "loss": 0.7389, + "loss": 0.9387, "step": 4770 }, { - "epoch": 0.5754875993257886, - "grad_norm": 7.25, + "epoch": 3.4094151212553494, + "grad_norm": 8.8125, "learning_rate": 8.987111111111112e-05, - "loss": 0.6674, + "loss": 0.9775, "step": 4780 }, { - "epoch": 0.576691548278353, - "grad_norm": 8.8125, + "epoch": 3.4165477888730384, + "grad_norm": 8.375, "learning_rate": 8.982666666666667e-05, - "loss": 0.7464, + "loss": 0.8173, "step": 4790 }, { - "epoch": 0.5778954972309174, - "grad_norm": 7.65625, + "epoch": 3.4236804564907275, + "grad_norm": 11.3125, "learning_rate": 8.978222222222222e-05, - "loss": 0.6979, + "loss": 0.9068, "step": 4800 }, { - "epoch": 0.5778954972309174, - "eval/acc": 37.20930099487305, + "epoch": 3.4236804564907275, + "eval/acc": 39.53488540649414, "step": 4800 }, { - "epoch": 0.5778954972309174, - "eval_loss": 2.7990331649780273, - "eval_runtime": 0.207, - "eval_samples_per_second": 207.72, - "eval_steps_per_second": 4.831, + "epoch": 3.4236804564907275, + "eval_loss": 2.123344898223877, + "eval_runtime": 0.2238, + "eval_samples_per_second": 192.17, + "eval_steps_per_second": 4.469, "step": 4800 }, { - "epoch": 0.5790994461834819, - "grad_norm": 6.90625, + "epoch": 3.4308131241084165, + "grad_norm": 6.65625, "learning_rate": 8.973777777777778e-05, - "loss": 0.7292, + "loss": 0.8262, "step": 4810 }, { - "epoch": 0.5803033951360462, - "grad_norm": 7.34375, + "epoch": 3.4379457917261056, + "grad_norm": 9.125, "learning_rate": 8.969333333333334e-05, - "loss": 0.6484, + "loss": 0.9207, "step": 4820 }, { - "epoch": 0.5815073440886106, - "grad_norm": 7.96875, + "epoch": 3.4450784593437946, + "grad_norm": 6.78125, "learning_rate": 8.964888888888889e-05, - "loss": 0.6246, + "loss": 1.0115, "step": 4830 }, { - "epoch": 0.5827112930411751, - "grad_norm": 5.4375, + "epoch": 3.4522111269614837, + "grad_norm": 7.5625, "learning_rate": 8.960444444444444e-05, - "loss": 0.6978, + "loss": 0.9031, "step": 4840 }, { - "epoch": 0.5839152419937395, - "grad_norm": 7.25, + "epoch": 3.4593437945791727, + "grad_norm": 7.875, "learning_rate": 8.956e-05, - "loss": 0.6848, + "loss": 0.9626, "step": 4850 }, { - "epoch": 0.5851191909463038, - "grad_norm": 8.9375, + "epoch": 3.466476462196862, + "grad_norm": 4.625, "learning_rate": 8.951555555555557e-05, - "loss": 0.7541, + "loss": 0.7793, "step": 4860 }, { - "epoch": 0.5863231398988683, - "grad_norm": 8.6875, + "epoch": 3.473609129814551, + "grad_norm": 7.40625, "learning_rate": 8.947111111111111e-05, - "loss": 0.6872, + "loss": 0.8733, "step": 4870 }, { - "epoch": 0.5875270888514327, - "grad_norm": 6.375, + "epoch": 3.4807417974322394, + "grad_norm": 9.6875, "learning_rate": 8.942666666666668e-05, - "loss": 0.7521, + "loss": 0.8448, "step": 4880 }, { - "epoch": 0.5887310378039972, - "grad_norm": 7.34375, + "epoch": 3.4878744650499285, + "grad_norm": 8.625, "learning_rate": 8.938222222222222e-05, - "loss": 0.6741, + "loss": 0.815, "step": 4890 }, { - "epoch": 0.5899349867565615, - "grad_norm": 9.25, + "epoch": 3.4950071326676175, + "grad_norm": 8.6875, "learning_rate": 8.933777777777779e-05, - "loss": 0.7085, + "loss": 0.7837, "step": 4900 }, { - "epoch": 0.5899349867565615, - "eval/acc": 32.55813980102539, + "epoch": 3.4950071326676175, + "eval/acc": 37.20930099487305, "step": 4900 }, { - "epoch": 0.5899349867565615, - "eval_loss": 2.822793483734131, - "eval_runtime": 0.2077, - "eval_samples_per_second": 206.985, - "eval_steps_per_second": 4.814, + "epoch": 3.4950071326676175, + "eval_loss": 2.080425262451172, + "eval_runtime": 0.2476, + "eval_samples_per_second": 173.654, + "eval_steps_per_second": 4.038, "step": 4900 }, { - "epoch": 0.5911389357091259, - "grad_norm": 6.75, + "epoch": 3.5021398002853066, + "grad_norm": 7.25, "learning_rate": 8.929333333333333e-05, - "loss": 0.6908, + "loss": 0.9082, "step": 4910 }, { - "epoch": 0.5923428846616904, - "grad_norm": 14.3125, + "epoch": 3.5092724679029956, + "grad_norm": 9.0, "learning_rate": 8.92488888888889e-05, - "loss": 0.6954, + "loss": 0.8041, "step": 4920 }, { - "epoch": 0.5935468336142548, - "grad_norm": 5.03125, + "epoch": 3.5164051355206847, + "grad_norm": 7.5625, "learning_rate": 8.920444444444444e-05, - "loss": 0.6255, + "loss": 0.878, "step": 4930 }, { - "epoch": 0.5947507825668191, - "grad_norm": 7.3125, + "epoch": 3.5235378031383737, + "grad_norm": 8.3125, "learning_rate": 8.916e-05, - "loss": 0.6094, + "loss": 0.8609, "step": 4940 }, { - "epoch": 0.5959547315193836, - "grad_norm": 6.875, + "epoch": 3.5306704707560628, + "grad_norm": 6.9375, "learning_rate": 8.911555555555557e-05, - "loss": 0.6488, + "loss": 0.8203, "step": 4950 }, { - "epoch": 0.597158680471948, - "grad_norm": 6.90625, + "epoch": 3.537803138373752, + "grad_norm": 6.4375, "learning_rate": 8.907111111111112e-05, - "loss": 0.6333, + "loss": 0.8976, "step": 4960 }, { - "epoch": 0.5983626294245123, - "grad_norm": 7.0, + "epoch": 3.544935805991441, + "grad_norm": 15.0, "learning_rate": 8.902666666666667e-05, - "loss": 0.6687, + "loss": 0.8585, "step": 4970 }, { - "epoch": 0.5995665783770768, - "grad_norm": 8.9375, + "epoch": 3.55206847360913, + "grad_norm": 6.21875, "learning_rate": 8.898222222222223e-05, - "loss": 0.6762, + "loss": 0.9642, "step": 4980 }, { - "epoch": 0.6007705273296412, - "grad_norm": 7.53125, + "epoch": 3.559201141226819, + "grad_norm": 9.8125, "learning_rate": 8.893777777777779e-05, - "loss": 0.6007, + "loss": 0.9241, "step": 4990 }, { - "epoch": 0.6019744762822057, - "grad_norm": 5.78125, + "epoch": 3.566333808844508, + "grad_norm": 9.25, "learning_rate": 8.889333333333334e-05, - "loss": 0.682, + "loss": 0.7841, "step": 5000 }, { - "epoch": 0.6019744762822057, - "eval/acc": 32.55813980102539, + "epoch": 3.566333808844508, + "eval/acc": 37.20930099487305, "step": 5000 }, { - "epoch": 0.6019744762822057, - "eval_loss": 2.827073097229004, - "eval_runtime": 0.2073, - "eval_samples_per_second": 207.385, - "eval_steps_per_second": 4.823, + "epoch": 3.566333808844508, + "eval_loss": 2.0360865592956543, + "eval_runtime": 0.225, + "eval_samples_per_second": 191.102, + "eval_steps_per_second": 4.444, "step": 5000 }, { - "epoch": 0.60317842523477, - "grad_norm": 8.25, + "epoch": 3.5734664764621966, + "grad_norm": 7.53125, "learning_rate": 8.884888888888889e-05, - "loss": 0.6711, + "loss": 0.8513, "step": 5010 }, { - "epoch": 0.6043823741873344, - "grad_norm": 7.34375, + "epoch": 3.580599144079886, + "grad_norm": 7.3125, "learning_rate": 8.880444444444445e-05, - "loss": 0.6916, + "loss": 0.9502, "step": 5020 }, { - "epoch": 0.6055863231398989, - "grad_norm": 6.6875, + "epoch": 3.5877318116975747, + "grad_norm": 7.375, "learning_rate": 8.876e-05, - "loss": 0.6601, + "loss": 0.9329, "step": 5030 }, { - "epoch": 0.6067902720924633, - "grad_norm": 6.34375, + "epoch": 3.5948644793152638, + "grad_norm": 7.3125, "learning_rate": 8.871555555555556e-05, - "loss": 0.6945, + "loss": 0.8648, "step": 5040 }, { - "epoch": 0.6079942210450276, - "grad_norm": 6.9375, + "epoch": 3.601997146932953, + "grad_norm": 6.5, "learning_rate": 8.867111111111112e-05, - "loss": 0.6492, + "loss": 0.8019, "step": 5050 }, { - "epoch": 0.6091981699975921, - "grad_norm": 7.1875, + "epoch": 3.609129814550642, + "grad_norm": 9.0, "learning_rate": 8.862666666666667e-05, - "loss": 0.5963, + "loss": 0.8829, "step": 5060 }, { - "epoch": 0.6104021189501565, - "grad_norm": 7.1875, + "epoch": 3.616262482168331, + "grad_norm": 6.46875, "learning_rate": 8.858222222222222e-05, - "loss": 0.6715, + "loss": 0.8419, "step": 5070 }, { - "epoch": 0.6116060679027209, - "grad_norm": 9.25, + "epoch": 3.62339514978602, + "grad_norm": 8.9375, "learning_rate": 8.853777777777778e-05, - "loss": 0.7572, + "loss": 0.9345, "step": 5080 }, { - "epoch": 0.6128100168552854, - "grad_norm": 6.3125, + "epoch": 3.630527817403709, + "grad_norm": 7.09375, "learning_rate": 8.849333333333334e-05, - "loss": 0.7521, + "loss": 0.8204, "step": 5090 }, { - "epoch": 0.6140139658078497, - "grad_norm": 6.9375, + "epoch": 3.637660485021398, + "grad_norm": 7.71875, "learning_rate": 8.844888888888889e-05, - "loss": 0.6313, + "loss": 0.9305, "step": 5100 }, { - "epoch": 0.6140139658078497, - "eval/acc": 34.88372039794922, + "epoch": 3.637660485021398, + "eval/acc": 39.53488540649414, "step": 5100 }, { - "epoch": 0.6140139658078497, - "eval_loss": 2.9495913982391357, - "eval_runtime": 0.2063, - "eval_samples_per_second": 208.439, - "eval_steps_per_second": 4.847, + "epoch": 3.637660485021398, + "eval_loss": 2.0034291744232178, + "eval_runtime": 0.2213, + "eval_samples_per_second": 194.329, + "eval_steps_per_second": 4.519, "step": 5100 }, { - "epoch": 0.6152179147604142, - "grad_norm": 9.0, + "epoch": 3.644793152639087, + "grad_norm": 6.09375, "learning_rate": 8.840444444444444e-05, - "loss": 0.7974, + "loss": 0.9168, "step": 5110 }, { - "epoch": 0.6164218637129786, - "grad_norm": 5.46875, + "epoch": 3.651925820256776, + "grad_norm": 8.25, "learning_rate": 8.836000000000001e-05, - "loss": 0.6245, + "loss": 0.8155, "step": 5120 }, { - "epoch": 0.617625812665543, - "grad_norm": 9.4375, + "epoch": 3.659058487874465, + "grad_norm": 7.84375, "learning_rate": 8.831555555555556e-05, - "loss": 0.7513, + "loss": 0.8641, "step": 5130 }, { - "epoch": 0.6188297616181074, - "grad_norm": 8.125, + "epoch": 3.666191155492154, + "grad_norm": 6.5, "learning_rate": 8.827111111111111e-05, - "loss": 0.6427, + "loss": 0.8623, "step": 5140 }, { - "epoch": 0.6200337105706718, - "grad_norm": 5.78125, + "epoch": 3.6733238231098433, + "grad_norm": 21.125, "learning_rate": 8.822666666666667e-05, - "loss": 0.6801, + "loss": 0.8205, "step": 5150 }, { - "epoch": 0.6212376595232362, - "grad_norm": 8.8125, + "epoch": 3.680456490727532, + "grad_norm": 7.28125, "learning_rate": 8.818222222222222e-05, - "loss": 0.5978, + "loss": 0.7993, "step": 5160 }, { - "epoch": 0.6224416084758007, - "grad_norm": 8.0, + "epoch": 3.6875891583452214, + "grad_norm": 36.0, "learning_rate": 8.813777777777778e-05, - "loss": 0.6697, + "loss": 0.9083, "step": 5170 }, { - "epoch": 0.623645557428365, - "grad_norm": 8.1875, + "epoch": 3.69472182596291, + "grad_norm": 8.125, "learning_rate": 8.809333333333333e-05, - "loss": 0.7621, + "loss": 0.9264, "step": 5180 }, { - "epoch": 0.6248495063809294, - "grad_norm": 6.4375, + "epoch": 3.701854493580599, + "grad_norm": 10.75, "learning_rate": 8.80488888888889e-05, - "loss": 0.6934, + "loss": 0.8496, "step": 5190 }, { - "epoch": 0.6260534553334939, - "grad_norm": 7.8125, + "epoch": 3.708987161198288, + "grad_norm": 7.78125, "learning_rate": 8.800444444444444e-05, - "loss": 0.7008, + "loss": 0.8718, "step": 5200 }, { - "epoch": 0.6260534553334939, - "eval/acc": 34.88372039794922, + "epoch": 3.708987161198288, + "eval/acc": 39.53488540649414, "step": 5200 }, { - "epoch": 0.6260534553334939, - "eval_loss": 2.8201522827148438, - "eval_runtime": 0.2114, - "eval_samples_per_second": 203.368, - "eval_steps_per_second": 4.729, + "epoch": 3.708987161198288, + "eval_loss": 2.0305864810943604, + "eval_runtime": 0.222, + "eval_samples_per_second": 193.683, + "eval_steps_per_second": 4.504, "step": 5200 }, { - "epoch": 0.6272574042860583, - "grad_norm": 5.78125, + "epoch": 3.716119828815977, + "grad_norm": 9.3125, "learning_rate": 8.796e-05, - "loss": 0.7211, + "loss": 1.0077, "step": 5210 }, { - "epoch": 0.6284613532386227, - "grad_norm": 6.25, + "epoch": 3.723252496433666, + "grad_norm": 11.4375, "learning_rate": 8.791555555555557e-05, - "loss": 0.654, + "loss": 0.8364, "step": 5220 }, { - "epoch": 0.6296653021911871, - "grad_norm": 9.0625, + "epoch": 3.7303851640513552, + "grad_norm": 15.125, "learning_rate": 8.787111111111112e-05, - "loss": 0.6348, + "loss": 0.8557, "step": 5230 }, { - "epoch": 0.6308692511437515, - "grad_norm": 7.59375, + "epoch": 3.7375178316690443, + "grad_norm": 7.875, "learning_rate": 8.782666666666666e-05, - "loss": 0.6363, + "loss": 0.8674, "step": 5240 }, { - "epoch": 0.632073200096316, - "grad_norm": 6.25, + "epoch": 3.7446504992867333, + "grad_norm": 7.84375, "learning_rate": 8.778222222222223e-05, - "loss": 0.629, + "loss": 0.8788, "step": 5250 }, { - "epoch": 0.6332771490488803, - "grad_norm": 12.375, + "epoch": 3.7517831669044224, + "grad_norm": 7.59375, "learning_rate": 8.773777777777779e-05, - "loss": 0.771, + "loss": 0.8098, "step": 5260 }, { - "epoch": 0.6344810980014447, - "grad_norm": 5.96875, + "epoch": 3.7589158345221114, + "grad_norm": 7.40625, "learning_rate": 8.769333333333334e-05, - "loss": 0.589, + "loss": 0.8895, "step": 5270 }, { - "epoch": 0.6356850469540092, - "grad_norm": 7.1875, + "epoch": 3.7660485021398005, + "grad_norm": 6.78125, "learning_rate": 8.76488888888889e-05, - "loss": 0.5794, + "loss": 0.823, "step": 5280 }, { - "epoch": 0.6368889959065736, - "grad_norm": 7.09375, + "epoch": 3.773181169757489, + "grad_norm": 8.125, "learning_rate": 8.760444444444445e-05, - "loss": 0.6449, + "loss": 0.8418, "step": 5290 }, { - "epoch": 0.6380929448591379, - "grad_norm": 11.1875, + "epoch": 3.7803138373751786, + "grad_norm": 8.4375, "learning_rate": 8.756000000000001e-05, - "loss": 0.6708, + "loss": 0.8202, "step": 5300 }, { - "epoch": 0.6380929448591379, - "eval/acc": 36.627906799316406, + "epoch": 3.7803138373751786, + "eval/acc": 41.86046600341797, "step": 5300 }, { - "epoch": 0.6380929448591379, - "eval_loss": 2.902387857437134, - "eval_runtime": 0.2113, - "eval_samples_per_second": 203.456, - "eval_steps_per_second": 4.732, + "epoch": 3.7803138373751786, + "eval_loss": 2.100001811981201, + "eval_runtime": 0.2237, + "eval_samples_per_second": 192.218, + "eval_steps_per_second": 4.47, "step": 5300 }, { - "epoch": 0.6392968938117024, - "grad_norm": 8.625, + "epoch": 3.787446504992867, + "grad_norm": 7.78125, "learning_rate": 8.751555555555556e-05, - "loss": 0.5895, + "loss": 0.9786, "step": 5310 }, { - "epoch": 0.6405008427642668, - "grad_norm": 8.625, + "epoch": 3.794579172610556, + "grad_norm": 14.125, "learning_rate": 8.747111111111112e-05, - "loss": 0.6012, + "loss": 1.0893, "step": 5320 }, { - "epoch": 0.6417047917168313, - "grad_norm": 5.25, + "epoch": 3.8017118402282453, + "grad_norm": 6.71875, "learning_rate": 8.742666666666667e-05, - "loss": 0.6262, + "loss": 0.8484, "step": 5330 }, { - "epoch": 0.6429087406693956, - "grad_norm": 8.5625, + "epoch": 3.8088445078459343, + "grad_norm": 7.53125, "learning_rate": 8.738222222222222e-05, - "loss": 0.7584, + "loss": 0.922, "step": 5340 }, { - "epoch": 0.64411268962196, - "grad_norm": 7.53125, + "epoch": 3.8159771754636234, + "grad_norm": 6.9375, "learning_rate": 8.733777777777779e-05, - "loss": 0.6793, + "loss": 0.87, "step": 5350 }, { - "epoch": 0.6453166385745245, - "grad_norm": 9.625, + "epoch": 3.8231098430813124, + "grad_norm": 6.75, "learning_rate": 8.729333333333334e-05, - "loss": 0.6166, + "loss": 0.9272, "step": 5360 }, { - "epoch": 0.6465205875270889, - "grad_norm": 7.0625, + "epoch": 3.8302425106990015, + "grad_norm": 6.875, "learning_rate": 8.724888888888889e-05, - "loss": 0.667, + "loss": 0.8358, "step": 5370 }, { - "epoch": 0.6477245364796532, - "grad_norm": 6.90625, + "epoch": 3.8373751783166905, + "grad_norm": 7.53125, "learning_rate": 8.720444444444445e-05, - "loss": 0.6427, + "loss": 0.8764, "step": 5380 }, { - "epoch": 0.6489284854322177, + "epoch": 3.8445078459343796, "grad_norm": 7.96875, "learning_rate": 8.716000000000001e-05, - "loss": 0.7689, + "loss": 0.9348, "step": 5390 }, { - "epoch": 0.6501324343847821, - "grad_norm": 8.9375, + "epoch": 3.8516405135520686, + "grad_norm": 7.5625, "learning_rate": 8.711555555555556e-05, - "loss": 0.6957, + "loss": 0.9033, "step": 5400 }, { - "epoch": 0.6501324343847821, - "eval/acc": 34.88372039794922, + "epoch": 3.8516405135520686, + "eval/acc": 39.53488540649414, "step": 5400 }, { - "epoch": 0.6501324343847821, - "eval_loss": 2.8916988372802734, - "eval_runtime": 0.2068, - "eval_samples_per_second": 207.976, - "eval_steps_per_second": 4.837, + "epoch": 3.8516405135520686, + "eval_loss": 2.0633187294006348, + "eval_runtime": 0.2247, + "eval_samples_per_second": 191.324, + "eval_steps_per_second": 4.449, "step": 5400 }, { - "epoch": 0.6513363833373464, - "grad_norm": 6.34375, + "epoch": 3.8587731811697576, + "grad_norm": 6.90625, "learning_rate": 8.707111111111111e-05, - "loss": 0.6811, + "loss": 0.9344, "step": 5410 }, { - "epoch": 0.6525403322899109, - "grad_norm": 6.71875, + "epoch": 3.8659058487874463, + "grad_norm": 7.5, "learning_rate": 8.702666666666667e-05, - "loss": 0.6849, + "loss": 0.9346, "step": 5420 }, { - "epoch": 0.6537442812424753, - "grad_norm": 6.46875, + "epoch": 3.8730385164051357, + "grad_norm": 7.03125, "learning_rate": 8.698222222222223e-05, - "loss": 0.6134, + "loss": 0.8835, "step": 5430 }, { - "epoch": 0.6549482301950398, - "grad_norm": 10.5, + "epoch": 3.8801711840228243, + "grad_norm": 6.3125, "learning_rate": 8.693777777777778e-05, - "loss": 0.6213, + "loss": 0.8434, "step": 5440 }, { - "epoch": 0.6561521791476042, - "grad_norm": 6.25, + "epoch": 3.8873038516405134, + "grad_norm": 7.03125, "learning_rate": 8.689333333333334e-05, - "loss": 0.6892, + "loss": 0.8555, "step": 5450 }, { - "epoch": 0.6573561281001685, - "grad_norm": 7.0, + "epoch": 3.8944365192582024, + "grad_norm": 8.0, "learning_rate": 8.684888888888889e-05, - "loss": 0.6003, + "loss": 0.9287, "step": 5460 }, { - "epoch": 0.658560077052733, - "grad_norm": 7.46875, + "epoch": 3.9015691868758915, + "grad_norm": 8.1875, "learning_rate": 8.680444444444444e-05, - "loss": 0.726, + "loss": 0.8738, "step": 5470 }, { - "epoch": 0.6597640260052974, - "grad_norm": 6.0, + "epoch": 3.9087018544935805, + "grad_norm": 7.96875, "learning_rate": 8.676e-05, - "loss": 0.7526, + "loss": 0.8189, "step": 5480 }, { - "epoch": 0.6609679749578617, - "grad_norm": 9.875, + "epoch": 3.9158345221112696, + "grad_norm": 10.1875, "learning_rate": 8.671555555555556e-05, - "loss": 0.603, + "loss": 0.8983, "step": 5490 }, { - "epoch": 0.6621719239104262, - "grad_norm": 13.6875, + "epoch": 3.9229671897289586, + "grad_norm": 10.375, "learning_rate": 8.667111111111111e-05, - "loss": 0.6759, + "loss": 0.8083, "step": 5500 }, { - "epoch": 0.6621719239104262, - "eval/acc": 34.88372039794922, + "epoch": 3.9229671897289586, + "eval/acc": 39.53488540649414, "step": 5500 }, { - "epoch": 0.6621719239104262, - "eval_loss": 2.915025234222412, - "eval_runtime": 0.2074, - "eval_samples_per_second": 207.294, - "eval_steps_per_second": 4.821, + "epoch": 3.9229671897289586, + "eval_loss": 2.089243173599243, + "eval_runtime": 0.2203, + "eval_samples_per_second": 195.23, + "eval_steps_per_second": 4.54, "step": 5500 }, { - "epoch": 0.6633758728629906, - "grad_norm": 8.8125, + "epoch": 3.9300998573466477, + "grad_norm": 13.125, "learning_rate": 8.662666666666666e-05, - "loss": 0.6582, + "loss": 0.8747, "step": 5510 }, { - "epoch": 0.664579821815555, - "grad_norm": 7.6875, + "epoch": 3.9372325249643367, + "grad_norm": 8.25, "learning_rate": 8.658222222222224e-05, - "loss": 0.6219, + "loss": 0.8609, "step": 5520 }, { - "epoch": 0.6657837707681195, - "grad_norm": 9.25, + "epoch": 3.944365192582026, + "grad_norm": 6.75, "learning_rate": 8.653777777777779e-05, - "loss": 0.742, + "loss": 0.8563, "step": 5530 }, { - "epoch": 0.6669877197206838, - "grad_norm": 6.59375, + "epoch": 3.951497860199715, + "grad_norm": 7.75, "learning_rate": 8.649333333333333e-05, - "loss": 0.653, + "loss": 0.8912, "step": 5540 }, { - "epoch": 0.6681916686732483, - "grad_norm": 9.25, + "epoch": 3.9586305278174034, + "grad_norm": 6.40625, "learning_rate": 8.64488888888889e-05, - "loss": 0.67, + "loss": 0.7477, "step": 5550 }, { - "epoch": 0.6693956176258127, - "grad_norm": 7.59375, + "epoch": 3.965763195435093, + "grad_norm": 7.0, "learning_rate": 8.640444444444444e-05, - "loss": 0.7448, + "loss": 0.8185, "step": 5560 }, { - "epoch": 0.670599566578377, - "grad_norm": 7.125, + "epoch": 3.9728958630527815, + "grad_norm": 5.6875, "learning_rate": 8.636e-05, - "loss": 0.607, + "loss": 0.9497, "step": 5570 }, { - "epoch": 0.6718035155309415, - "grad_norm": 6.59375, + "epoch": 3.980028530670471, + "grad_norm": 8.0, "learning_rate": 8.631555555555556e-05, - "loss": 0.6398, + "loss": 0.8117, "step": 5580 }, { - "epoch": 0.6730074644835059, - "grad_norm": 6.21875, + "epoch": 3.9871611982881596, + "grad_norm": 6.625, "learning_rate": 8.627111111111112e-05, - "loss": 0.6334, + "loss": 0.8245, "step": 5590 }, { - "epoch": 0.6742114134360703, - "grad_norm": 7.0625, + "epoch": 3.9942938659058487, + "grad_norm": 6.96875, "learning_rate": 8.622666666666667e-05, - "loss": 0.6878, + "loss": 0.902, "step": 5600 }, { - "epoch": 0.6742114134360703, - "eval/acc": 32.55813980102539, + "epoch": 3.9942938659058487, + "eval/acc": 39.53488540649414, "step": 5600 }, { - "epoch": 0.6742114134360703, - "eval_loss": 2.8182010650634766, - "eval_runtime": 0.207, - "eval_samples_per_second": 207.724, - "eval_steps_per_second": 4.831, + "epoch": 3.9942938659058487, + "eval_loss": 2.186225652694702, + "eval_runtime": 0.2194, + "eval_samples_per_second": 196.001, + "eval_steps_per_second": 4.558, "step": 5600 }, { - "epoch": 0.6754153623886348, - "grad_norm": 7.9375, + "epoch": 4.001426533523538, + "grad_norm": 6.78125, "learning_rate": 8.618222222222223e-05, - "loss": 0.6577, + "loss": 0.8757, "step": 5610 }, { - "epoch": 0.6766193113411991, - "grad_norm": 7.34375, + "epoch": 4.008559201141227, + "grad_norm": 11.0625, "learning_rate": 8.613777777777779e-05, - "loss": 0.7787, + "loss": 0.885, "step": 5620 }, { - "epoch": 0.6778232602937635, - "grad_norm": 6.96875, + "epoch": 4.015691868758916, + "grad_norm": 6.4375, "learning_rate": 8.609333333333334e-05, - "loss": 0.7849, + "loss": 0.8611, "step": 5630 }, { - "epoch": 0.679027209246328, - "grad_norm": 16.125, + "epoch": 4.022824536376604, + "grad_norm": 14.8125, "learning_rate": 8.604888888888889e-05, - "loss": 0.8503, + "loss": 0.8262, "step": 5640 }, { - "epoch": 0.6802311581988923, - "grad_norm": 7.625, + "epoch": 4.029957203994294, + "grad_norm": 8.0625, "learning_rate": 8.600444444444445e-05, - "loss": 0.6215, + "loss": 0.7549, "step": 5650 }, { - "epoch": 0.6814351071514568, - "grad_norm": 7.28125, + "epoch": 4.0370898716119825, + "grad_norm": 6.84375, "learning_rate": 8.596000000000001e-05, - "loss": 0.6894, + "loss": 0.8725, "step": 5660 }, { - "epoch": 0.6826390561040212, - "grad_norm": 6.28125, + "epoch": 4.044222539229672, + "grad_norm": 8.0, "learning_rate": 8.591555555555556e-05, - "loss": 0.616, + "loss": 0.8846, "step": 5670 }, { - "epoch": 0.6838430050565856, - "grad_norm": 6.125, + "epoch": 4.051355206847361, + "grad_norm": 7.84375, "learning_rate": 8.587111111111111e-05, - "loss": 0.6417, + "loss": 0.9373, "step": 5680 }, { - "epoch": 0.68504695400915, - "grad_norm": 7.78125, + "epoch": 4.05848787446505, + "grad_norm": 6.84375, "learning_rate": 8.582666666666667e-05, - "loss": 0.7842, + "loss": 0.7823, "step": 5690 }, { - "epoch": 0.6862509029617144, - "grad_norm": 9.4375, + "epoch": 4.065620542082739, + "grad_norm": 11.4375, "learning_rate": 8.578222222222223e-05, - "loss": 0.6562, + "loss": 0.9588, "step": 5700 }, { - "epoch": 0.6862509029617144, - "eval/acc": 32.55813980102539, + "epoch": 4.065620542082739, + "eval/acc": 37.20930099487305, "step": 5700 }, { - "epoch": 0.6862509029617144, - "eval_loss": 2.861806869506836, - "eval_runtime": 0.2083, - "eval_samples_per_second": 206.449, - "eval_steps_per_second": 4.801, + "epoch": 4.065620542082739, + "eval_loss": 2.841008424758911, + "eval_runtime": 1.3984, + "eval_samples_per_second": 30.749, + "eval_steps_per_second": 0.715, "step": 5700 }, { - "epoch": 0.6874548519142788, - "grad_norm": 6.46875, + "epoch": 4.072753209700428, + "grad_norm": 5.5625, "learning_rate": 8.573777777777778e-05, - "loss": 0.6165, + "loss": 0.8014, "step": 5710 }, { - "epoch": 0.6886588008668433, - "grad_norm": 7.0625, + "epoch": 4.079885877318117, + "grad_norm": 6.90625, "learning_rate": 8.569333333333334e-05, - "loss": 0.7014, + "loss": 0.818, "step": 5720 }, { - "epoch": 0.6898627498194077, - "grad_norm": 8.0625, + "epoch": 4.087018544935806, + "grad_norm": 8.4375, "learning_rate": 8.564888888888889e-05, - "loss": 0.7459, + "loss": 0.8142, "step": 5730 }, { - "epoch": 0.691066698771972, - "grad_norm": 5.84375, + "epoch": 4.094151212553495, + "grad_norm": 7.75, "learning_rate": 8.560444444444445e-05, - "loss": 0.6708, + "loss": 0.863, "step": 5740 }, { - "epoch": 0.6922706477245365, - "grad_norm": 7.9375, + "epoch": 4.101283880171184, + "grad_norm": 6.90625, "learning_rate": 8.556e-05, - "loss": 0.6487, + "loss": 0.8501, "step": 5750 }, { - "epoch": 0.6934745966771009, - "grad_norm": 8.125, + "epoch": 4.108416547788873, + "grad_norm": 7.15625, "learning_rate": 8.551555555555556e-05, - "loss": 0.6634, + "loss": 0.8293, "step": 5760 }, { - "epoch": 0.6946785456296654, - "grad_norm": 5.0, + "epoch": 4.1155492154065625, + "grad_norm": 8.125, "learning_rate": 8.547111111111111e-05, - "loss": 0.6575, + "loss": 0.8655, "step": 5770 }, { - "epoch": 0.6958824945822297, - "grad_norm": 6.28125, + "epoch": 4.122681883024251, + "grad_norm": 7.75, "learning_rate": 8.542666666666666e-05, - "loss": 0.6661, + "loss": 0.7958, "step": 5780 }, { - "epoch": 0.6970864435347941, - "grad_norm": 6.5, + "epoch": 4.12981455064194, + "grad_norm": 8.3125, "learning_rate": 8.538222222222224e-05, - "loss": 0.6922, + "loss": 0.9186, "step": 5790 }, { - "epoch": 0.6982903924873586, - "grad_norm": 9.0625, + "epoch": 4.136947218259629, + "grad_norm": 7.0625, "learning_rate": 8.533777777777778e-05, - "loss": 0.687, + "loss": 0.9135, "step": 5800 }, { - "epoch": 0.6982903924873586, - "eval/acc": 37.79069900512695, + "epoch": 4.136947218259629, + "eval/acc": 37.20930099487305, "step": 5800 }, { - "epoch": 0.6982903924873586, - "eval_loss": 2.878754138946533, - "eval_runtime": 0.2107, - "eval_samples_per_second": 204.039, - "eval_steps_per_second": 4.745, + "epoch": 4.136947218259629, + "eval_loss": 2.8186914920806885, + "eval_runtime": 0.2153, + "eval_samples_per_second": 199.722, + "eval_steps_per_second": 4.645, "step": 5800 }, { - "epoch": 0.699494341439923, - "grad_norm": 8.875, + "epoch": 4.144079885877318, + "grad_norm": 8.125, "learning_rate": 8.529333333333333e-05, - "loss": 0.7106, + "loss": 0.8248, "step": 5810 }, { - "epoch": 0.7006982903924873, - "grad_norm": 8.3125, + "epoch": 4.151212553495007, + "grad_norm": 7.65625, "learning_rate": 8.52488888888889e-05, - "loss": 0.5969, + "loss": 0.9186, "step": 5820 }, { - "epoch": 0.7019022393450518, - "grad_norm": 6.40625, + "epoch": 4.158345221112696, + "grad_norm": 7.6875, "learning_rate": 8.520444444444446e-05, - "loss": 0.6795, + "loss": 0.8367, "step": 5830 }, { - "epoch": 0.7031061882976162, - "grad_norm": 8.5625, + "epoch": 4.165477888730385, + "grad_norm": 9.75, "learning_rate": 8.516e-05, - "loss": 0.7621, + "loss": 0.8898, "step": 5840 }, { - "epoch": 0.7043101372501805, - "grad_norm": 9.5625, + "epoch": 4.172610556348074, + "grad_norm": 8.5625, "learning_rate": 8.511555555555555e-05, - "loss": 0.7035, + "loss": 0.9218, "step": 5850 }, { - "epoch": 0.705514086202745, - "grad_norm": 11.3125, + "epoch": 4.1797432239657635, + "grad_norm": 6.0, "learning_rate": 8.507111111111112e-05, - "loss": 0.8043, + "loss": 0.8784, "step": 5860 }, { - "epoch": 0.7067180351553094, - "grad_norm": 7.4375, + "epoch": 4.186875891583452, + "grad_norm": 8.5625, "learning_rate": 8.502666666666666e-05, - "loss": 0.6349, + "loss": 0.8361, "step": 5870 }, { - "epoch": 0.7079219841078739, - "grad_norm": 6.28125, + "epoch": 4.194008559201142, + "grad_norm": 7.40625, "learning_rate": 8.498222222222223e-05, - "loss": 0.6593, + "loss": 0.816, "step": 5880 }, { - "epoch": 0.7091259330604383, - "grad_norm": 6.4375, + "epoch": 4.20114122681883, + "grad_norm": 7.84375, "learning_rate": 8.493777777777779e-05, - "loss": 0.6236, + "loss": 0.897, "step": 5890 }, { - "epoch": 0.7103298820130026, - "grad_norm": 7.84375, + "epoch": 4.20827389443652, + "grad_norm": 10.0625, "learning_rate": 8.489333333333334e-05, - "loss": 0.6134, + "loss": 0.7807, "step": 5900 }, { - "epoch": 0.7103298820130026, - "eval/acc": 34.88372039794922, + "epoch": 4.20827389443652, + "eval/acc": 37.20930099487305, "step": 5900 }, { - "epoch": 0.7103298820130026, - "eval_loss": 2.918956756591797, - "eval_runtime": 0.2129, - "eval_samples_per_second": 201.945, - "eval_steps_per_second": 4.696, + "epoch": 4.20827389443652, + "eval_loss": 2.890333890914917, + "eval_runtime": 0.2187, + "eval_samples_per_second": 196.595, + "eval_steps_per_second": 4.572, "step": 5900 }, { - "epoch": 0.7115338309655671, - "grad_norm": 7.40625, + "epoch": 4.215406562054208, + "grad_norm": 7.6875, "learning_rate": 8.484888888888888e-05, - "loss": 0.5883, + "loss": 0.8786, "step": 5910 }, { - "epoch": 0.7127377799181315, - "grad_norm": 7.0625, + "epoch": 4.222539229671897, + "grad_norm": 7.46875, "learning_rate": 8.480444444444445e-05, - "loss": 0.6805, + "loss": 0.8689, "step": 5920 }, { - "epoch": 0.7139417288706958, - "grad_norm": 5.25, + "epoch": 4.229671897289586, + "grad_norm": 14.125, "learning_rate": 8.476000000000001e-05, - "loss": 0.5638, + "loss": 0.83, "step": 5930 }, { - "epoch": 0.7151456778232603, - "grad_norm": 5.84375, + "epoch": 4.236804564907275, + "grad_norm": 6.09375, "learning_rate": 8.471555555555556e-05, - "loss": 0.6112, + "loss": 0.8921, "step": 5940 }, { - "epoch": 0.7163496267758247, - "grad_norm": 6.5625, + "epoch": 4.2439372325249645, + "grad_norm": 8.875, "learning_rate": 8.467111111111112e-05, - "loss": 0.6147, + "loss": 0.9293, "step": 5950 }, { - "epoch": 0.7175535757283891, - "grad_norm": 6.15625, + "epoch": 4.251069900142653, + "grad_norm": 10.5625, "learning_rate": 8.462666666666667e-05, - "loss": 0.7292, + "loss": 0.7955, "step": 5960 }, { - "epoch": 0.7187575246809536, - "grad_norm": 8.25, + "epoch": 4.258202567760343, + "grad_norm": 15.25, "learning_rate": 8.458222222222223e-05, - "loss": 0.6048, + "loss": 0.9267, "step": 5970 }, { - "epoch": 0.7199614736335179, - "grad_norm": 8.0625, + "epoch": 4.265335235378031, + "grad_norm": 8.0, "learning_rate": 8.453777777777778e-05, - "loss": 0.581, + "loss": 0.7665, "step": 5980 }, { - "epoch": 0.7211654225860824, - "grad_norm": 7.90625, + "epoch": 4.272467902995721, + "grad_norm": 6.4375, "learning_rate": 8.449333333333334e-05, - "loss": 0.6918, + "loss": 0.8212, "step": 5990 }, { - "epoch": 0.7223693715386468, - "grad_norm": 5.65625, + "epoch": 4.279600570613409, + "grad_norm": 8.0625, "learning_rate": 8.444888888888889e-05, - "loss": 0.6774, + "loss": 0.8294, "step": 6000 }, { - "epoch": 0.7223693715386468, - "eval/acc": 36.627906799316406, + "epoch": 4.279600570613409, + "eval/acc": 34.88372039794922, "step": 6000 }, { - "epoch": 0.7223693715386468, - "eval_loss": 2.936192512512207, - "eval_runtime": 0.2113, - "eval_samples_per_second": 203.531, - "eval_steps_per_second": 4.733, + "epoch": 4.279600570613409, + "eval_loss": 2.8812708854675293, + "eval_runtime": 0.2262, + "eval_samples_per_second": 190.082, + "eval_steps_per_second": 4.421, "step": 6000 }, { - "epoch": 0.7235733204912111, - "grad_norm": 7.59375, + "epoch": 4.286733238231099, + "grad_norm": 5.625, "learning_rate": 8.440444444444445e-05, - "loss": 0.5982, + "loss": 0.8813, "step": 6010 }, { - "epoch": 0.7247772694437756, - "grad_norm": 9.0625, + "epoch": 4.293865905848787, + "grad_norm": 8.375, "learning_rate": 8.436000000000001e-05, - "loss": 0.6048, + "loss": 0.8792, "step": 6020 }, { - "epoch": 0.72598121839634, - "grad_norm": 7.46875, + "epoch": 4.300998573466477, + "grad_norm": 9.125, "learning_rate": 8.431555555555556e-05, - "loss": 0.7024, + "loss": 0.9509, "step": 6030 }, { - "epoch": 0.7271851673489044, - "grad_norm": 8.0625, + "epoch": 4.3081312410841655, + "grad_norm": 7.34375, "learning_rate": 8.427111111111111e-05, - "loss": 0.7556, + "loss": 0.9452, "step": 6040 }, { - "epoch": 0.7283891163014689, - "grad_norm": 6.78125, + "epoch": 4.315263908701855, + "grad_norm": 8.25, "learning_rate": 8.422666666666667e-05, - "loss": 0.7187, + "loss": 0.8801, "step": 6050 }, { - "epoch": 0.7295930652540332, - "grad_norm": 6.8125, + "epoch": 4.3223965763195435, + "grad_norm": 6.75, "learning_rate": 8.418222222222223e-05, - "loss": 0.5774, + "loss": 0.805, "step": 6060 }, { - "epoch": 0.7307970142065976, - "grad_norm": 6.9375, + "epoch": 4.329529243937232, + "grad_norm": 8.375, "learning_rate": 8.413777777777778e-05, - "loss": 0.6724, + "loss": 0.8176, "step": 6070 }, { - "epoch": 0.7320009631591621, + "epoch": 4.336661911554922, "grad_norm": 6.1875, "learning_rate": 8.409333333333333e-05, - "loss": 0.6109, + "loss": 0.8662, "step": 6080 }, { - "epoch": 0.7332049121117264, - "grad_norm": 5.84375, + "epoch": 4.34379457917261, + "grad_norm": 6.03125, "learning_rate": 8.404888888888889e-05, - "loss": 0.6251, + "loss": 0.9121, "step": 6090 }, { - "epoch": 0.7344088610642908, - "grad_norm": 6.78125, + "epoch": 4.3509272467903, + "grad_norm": 5.6875, "learning_rate": 8.400444444444445e-05, - "loss": 0.6916, + "loss": 0.8697, "step": 6100 }, { - "epoch": 0.7344088610642908, - "eval/acc": 32.55813980102539, + "epoch": 4.3509272467903, + "eval/acc": 39.53488540649414, "step": 6100 }, { - "epoch": 0.7344088610642908, - "eval_loss": 2.947686195373535, - "eval_runtime": 0.2088, - "eval_samples_per_second": 205.91, - "eval_steps_per_second": 4.789, + "epoch": 4.3509272467903, + "eval_loss": 2.7605249881744385, + "eval_runtime": 0.2226, + "eval_samples_per_second": 193.191, + "eval_steps_per_second": 4.493, "step": 6100 }, { - "epoch": 0.7356128100168553, - "grad_norm": 6.96875, + "epoch": 4.358059914407988, + "grad_norm": 8.125, "learning_rate": 8.396e-05, - "loss": 0.6525, + "loss": 0.783, "step": 6110 }, { - "epoch": 0.7368167589694197, - "grad_norm": 9.625, + "epoch": 4.365192582025678, + "grad_norm": 6.71875, "learning_rate": 8.391555555555556e-05, - "loss": 0.6107, + "loss": 0.7273, "step": 6120 }, { - "epoch": 0.7380207079219842, - "grad_norm": 5.84375, + "epoch": 4.372325249643366, + "grad_norm": 7.625, "learning_rate": 8.387111111111111e-05, - "loss": 0.6339, + "loss": 0.9497, "step": 6130 }, { - "epoch": 0.7392246568745485, - "grad_norm": 8.0, + "epoch": 4.379457917261056, + "grad_norm": 7.625, "learning_rate": 8.382666666666667e-05, - "loss": 0.6243, + "loss": 0.9318, "step": 6140 }, { - "epoch": 0.7404286058271129, - "grad_norm": 7.9375, + "epoch": 4.3865905848787445, + "grad_norm": 7.5625, "learning_rate": 8.378222222222222e-05, - "loss": 0.6644, + "loss": 0.7827, "step": 6150 }, { - "epoch": 0.7416325547796774, + "epoch": 4.393723252496434, "grad_norm": 7.4375, "learning_rate": 8.373777777777779e-05, - "loss": 0.6117, + "loss": 0.8471, "step": 6160 }, { - "epoch": 0.7428365037322417, - "grad_norm": 7.28125, + "epoch": 4.400855920114123, + "grad_norm": 5.59375, "learning_rate": 8.369333333333333e-05, - "loss": 0.6253, + "loss": 0.866, "step": 6170 }, { - "epoch": 0.7440404526848061, - "grad_norm": 6.59375, + "epoch": 4.407988587731811, + "grad_norm": 5.34375, "learning_rate": 8.364888888888888e-05, - "loss": 0.5973, + "loss": 0.8237, "step": 6180 }, { - "epoch": 0.7452444016373706, - "grad_norm": 8.5, + "epoch": 4.415121255349501, + "grad_norm": 9.375, "learning_rate": 8.360444444444446e-05, - "loss": 0.5938, + "loss": 0.896, "step": 6190 }, { - "epoch": 0.746448350589935, - "grad_norm": 6.40625, + "epoch": 4.422253922967189, + "grad_norm": 7.78125, "learning_rate": 8.356e-05, - "loss": 0.7276, + "loss": 0.8402, "step": 6200 }, { - "epoch": 0.746448350589935, - "eval/acc": 34.88372039794922, + "epoch": 4.422253922967189, + "eval/acc": 37.20930099487305, "step": 6200 }, { - "epoch": 0.746448350589935, - "eval_loss": 3.0573887825012207, - "eval_runtime": 0.2067, - "eval_samples_per_second": 208.014, - "eval_steps_per_second": 4.838, + "epoch": 4.422253922967189, + "eval_loss": 2.8444175720214844, + "eval_runtime": 0.2217, + "eval_samples_per_second": 193.997, + "eval_steps_per_second": 4.512, "step": 6200 }, { - "epoch": 0.7476522995424993, - "grad_norm": 6.75, + "epoch": 4.429386590584879, + "grad_norm": 7.625, "learning_rate": 8.351555555555555e-05, - "loss": 0.6518, + "loss": 0.8708, "step": 6210 }, { - "epoch": 0.7488562484950638, - "grad_norm": 6.5, + "epoch": 4.436519258202567, + "grad_norm": 7.28125, "learning_rate": 8.347111111111112e-05, - "loss": 0.5737, + "loss": 0.8505, "step": 6220 }, { - "epoch": 0.7500601974476282, - "grad_norm": 7.96875, + "epoch": 4.443651925820257, + "grad_norm": 7.28125, "learning_rate": 8.342666666666668e-05, - "loss": 0.743, + "loss": 0.878, "step": 6230 }, { - "epoch": 0.7512641464001927, - "grad_norm": 8.375, + "epoch": 4.4507845934379455, + "grad_norm": 8.0, "learning_rate": 8.338222222222223e-05, - "loss": 0.6803, + "loss": 0.7568, "step": 6240 }, { - "epoch": 0.752468095352757, - "grad_norm": 10.9375, + "epoch": 4.457917261055635, + "grad_norm": 7.28125, "learning_rate": 8.333777777777778e-05, - "loss": 0.8047, + "loss": 0.7909, "step": 6250 }, { - "epoch": 0.7536720443053214, - "grad_norm": 6.21875, + "epoch": 4.465049928673324, + "grad_norm": 10.625, "learning_rate": 8.329333333333334e-05, - "loss": 0.5941, + "loss": 0.8732, "step": 6260 }, { - "epoch": 0.7548759932578859, - "grad_norm": 7.0, + "epoch": 4.472182596291013, + "grad_norm": 7.40625, "learning_rate": 8.324888888888889e-05, - "loss": 0.673, + "loss": 0.8827, "step": 6270 }, { - "epoch": 0.7560799422104503, - "grad_norm": 5.6875, + "epoch": 4.479315263908702, + "grad_norm": 11.25, "learning_rate": 8.320444444444445e-05, - "loss": 0.6869, + "loss": 0.7889, "step": 6280 }, { - "epoch": 0.7572838911630146, - "grad_norm": 7.46875, + "epoch": 4.486447931526391, + "grad_norm": 7.59375, "learning_rate": 8.316000000000001e-05, - "loss": 0.7399, + "loss": 0.7808, "step": 6290 }, { - "epoch": 0.7584878401155791, - "grad_norm": 7.21875, + "epoch": 4.49358059914408, + "grad_norm": 5.40625, "learning_rate": 8.311555555555556e-05, - "loss": 0.6582, + "loss": 0.8223, "step": 6300 }, { - "epoch": 0.7584878401155791, - "eval/acc": 34.88372039794922, + "epoch": 4.49358059914408, + "eval/acc": 37.20930099487305, "step": 6300 }, { - "epoch": 0.7584878401155791, - "eval_loss": 2.991325616836548, - "eval_runtime": 0.2058, - "eval_samples_per_second": 208.93, - "eval_steps_per_second": 4.859, + "epoch": 4.49358059914408, + "eval_loss": 2.798743963241577, + "eval_runtime": 0.2178, + "eval_samples_per_second": 197.44, + "eval_steps_per_second": 4.592, "step": 6300 }, { - "epoch": 0.7596917890681435, - "grad_norm": 7.5625, + "epoch": 4.500713266761769, + "grad_norm": 7.9375, "learning_rate": 8.307111111111111e-05, - "loss": 0.6455, + "loss": 0.8588, "step": 6310 }, { - "epoch": 0.7608957380207079, - "grad_norm": 5.0625, + "epoch": 4.507845934379458, + "grad_norm": 8.0625, "learning_rate": 8.302666666666667e-05, - "loss": 0.6269, + "loss": 0.9003, "step": 6320 }, { - "epoch": 0.7620996869732723, - "grad_norm": 7.15625, + "epoch": 4.5149786019971465, + "grad_norm": 7.21875, "learning_rate": 8.298222222222223e-05, - "loss": 0.6453, + "loss": 0.8942, "step": 6330 }, { - "epoch": 0.7633036359258367, - "grad_norm": 6.34375, + "epoch": 4.522111269614836, + "grad_norm": 7.625, "learning_rate": 8.293777777777778e-05, - "loss": 0.6721, + "loss": 0.8622, "step": 6340 }, { - "epoch": 0.7645075848784012, - "grad_norm": 7.59375, + "epoch": 4.529243937232525, + "grad_norm": 5.53125, "learning_rate": 8.289333333333333e-05, - "loss": 0.569, + "loss": 0.8048, "step": 6350 }, { - "epoch": 0.7657115338309656, - "grad_norm": 6.78125, + "epoch": 4.536376604850214, + "grad_norm": 9.125, "learning_rate": 8.28488888888889e-05, - "loss": 0.6221, + "loss": 0.8506, "step": 6360 }, { - "epoch": 0.76691548278353, - "grad_norm": 9.875, + "epoch": 4.543509272467903, + "grad_norm": 6.125, "learning_rate": 8.280444444444445e-05, - "loss": 0.6623, + "loss": 0.7767, "step": 6370 }, { - "epoch": 0.7681194317360944, - "grad_norm": 7.125, + "epoch": 4.550641940085592, + "grad_norm": 6.90625, "learning_rate": 8.276e-05, - "loss": 0.7166, + "loss": 0.9143, "step": 6380 }, { - "epoch": 0.7693233806886588, - "grad_norm": 7.59375, + "epoch": 4.557774607703281, + "grad_norm": 5.84375, "learning_rate": 8.271555555555556e-05, - "loss": 0.6984, + "loss": 0.8641, "step": 6390 }, { - "epoch": 0.7705273296412232, - "grad_norm": 9.4375, + "epoch": 4.56490727532097, + "grad_norm": 6.3125, "learning_rate": 8.267111111111111e-05, - "loss": 0.7095, + "loss": 0.8297, "step": 6400 }, { - "epoch": 0.7705273296412232, - "eval/acc": 34.88372039794922, + "epoch": 4.56490727532097, + "eval/acc": 37.20930099487305, "step": 6400 }, { - "epoch": 0.7705273296412232, - "eval_loss": 3.0461771488189697, - "eval_runtime": 0.208, - "eval_samples_per_second": 206.746, - "eval_steps_per_second": 4.808, + "epoch": 4.56490727532097, + "eval_loss": 2.804457426071167, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.742, + "eval_steps_per_second": 4.529, "step": 6400 }, { - "epoch": 0.7717312785937877, - "grad_norm": 9.375, + "epoch": 4.572039942938659, + "grad_norm": 7.15625, "learning_rate": 8.262666666666667e-05, - "loss": 0.6975, + "loss": 0.7398, "step": 6410 }, { - "epoch": 0.772935227546352, - "grad_norm": 6.75, + "epoch": 4.579172610556348, + "grad_norm": 6.125, "learning_rate": 8.258222222222222e-05, - "loss": 0.5826, + "loss": 0.8443, "step": 6420 }, { - "epoch": 0.7741391764989164, - "grad_norm": 8.25, + "epoch": 4.586305278174037, + "grad_norm": 9.25, "learning_rate": 8.253777777777778e-05, - "loss": 0.6596, + "loss": 0.7983, "step": 6430 }, { - "epoch": 0.7753431254514809, - "grad_norm": 6.375, + "epoch": 4.5934379457917265, + "grad_norm": 7.3125, "learning_rate": 8.249333333333333e-05, - "loss": 0.6624, + "loss": 0.9705, "step": 6440 }, { - "epoch": 0.7765470744040452, - "grad_norm": 7.375, + "epoch": 4.600570613409415, + "grad_norm": 7.34375, "learning_rate": 8.24488888888889e-05, - "loss": 0.6221, + "loss": 1.0079, "step": 6450 }, { - "epoch": 0.7777510233566097, - "grad_norm": 8.125, + "epoch": 4.607703281027105, + "grad_norm": 8.875, "learning_rate": 8.240444444444446e-05, - "loss": 0.6819, + "loss": 0.8982, "step": 6460 }, { - "epoch": 0.7789549723091741, - "grad_norm": 4.375, + "epoch": 4.614835948644793, + "grad_norm": 8.375, "learning_rate": 8.236e-05, - "loss": 0.588, + "loss": 0.8417, "step": 6470 }, { - "epoch": 0.7801589212617385, - "grad_norm": 8.875, + "epoch": 4.621968616262482, + "grad_norm": 7.78125, "learning_rate": 8.231555555555555e-05, - "loss": 0.7451, + "loss": 0.8566, "step": 6480 }, { - "epoch": 0.781362870214303, - "grad_norm": 8.5, + "epoch": 4.629101283880171, + "grad_norm": 6.5625, "learning_rate": 8.227111111111111e-05, - "loss": 0.64, + "loss": 0.8155, "step": 6490 }, { - "epoch": 0.7825668191668673, - "grad_norm": 6.59375, + "epoch": 4.63623395149786, + "grad_norm": 5.875, "learning_rate": 8.222666666666668e-05, - "loss": 0.6879, + "loss": 0.9449, "step": 6500 }, { - "epoch": 0.7825668191668673, - "eval/acc": 32.55813980102539, + "epoch": 4.63623395149786, + "eval/acc": 41.86046600341797, "step": 6500 }, { - "epoch": 0.7825668191668673, - "eval_loss": 2.970376491546631, - "eval_runtime": 0.2075, - "eval_samples_per_second": 207.198, - "eval_steps_per_second": 4.819, + "epoch": 4.63623395149786, + "eval_loss": 2.761596918106079, + "eval_runtime": 0.2144, + "eval_samples_per_second": 200.549, + "eval_steps_per_second": 4.664, "step": 6500 }, { - "epoch": 0.7837707681194317, - "grad_norm": 6.96875, + "epoch": 4.643366619115549, + "grad_norm": 7.5, "learning_rate": 8.218222222222223e-05, - "loss": 0.6584, + "loss": 0.8549, "step": 6510 }, { - "epoch": 0.7849747170719962, - "grad_norm": 7.3125, + "epoch": 4.650499286733238, + "grad_norm": 7.0625, "learning_rate": 8.213777777777777e-05, - "loss": 0.6892, + "loss": 0.8473, "step": 6520 }, { - "epoch": 0.7861786660245605, - "grad_norm": 6.28125, + "epoch": 4.6576319543509275, + "grad_norm": 7.1875, "learning_rate": 8.209333333333334e-05, - "loss": 0.6658, + "loss": 0.8773, "step": 6530 }, { - "epoch": 0.7873826149771249, - "grad_norm": 7.3125, + "epoch": 4.664764621968616, + "grad_norm": 7.25, "learning_rate": 8.20488888888889e-05, - "loss": 0.6379, + "loss": 0.789, "step": 6540 }, { - "epoch": 0.7885865639296894, - "grad_norm": 6.09375, + "epoch": 4.671897289586306, + "grad_norm": 7.34375, "learning_rate": 8.200444444444445e-05, - "loss": 0.5797, + "loss": 0.852, "step": 6550 }, { - "epoch": 0.7897905128822538, - "grad_norm": 7.03125, + "epoch": 4.679029957203994, + "grad_norm": 5.65625, "learning_rate": 8.196000000000001e-05, - "loss": 0.6778, + "loss": 0.8291, "step": 6560 }, { - "epoch": 0.7909944618348183, - "grad_norm": 7.46875, + "epoch": 4.686162624821684, + "grad_norm": 5.5625, "learning_rate": 8.191555555555556e-05, - "loss": 0.669, + "loss": 0.7943, "step": 6570 }, { - "epoch": 0.7921984107873826, - "grad_norm": 7.46875, + "epoch": 4.693295292439372, + "grad_norm": 9.25, "learning_rate": 8.18711111111111e-05, - "loss": 0.7272, + "loss": 0.8418, "step": 6580 }, { - "epoch": 0.793402359739947, - "grad_norm": 6.3125, + "epoch": 4.700427960057061, + "grad_norm": 6.75, "learning_rate": 8.182666666666667e-05, - "loss": 0.5767, + "loss": 0.8661, "step": 6590 }, { - "epoch": 0.7946063086925115, - "grad_norm": 7.28125, + "epoch": 4.70756062767475, + "grad_norm": 7.40625, "learning_rate": 8.178222222222223e-05, - "loss": 0.6776, + "loss": 0.768, "step": 6600 }, { - "epoch": 0.7946063086925115, - "eval/acc": 34.88372039794922, + "epoch": 4.70756062767475, + "eval/acc": 41.86046600341797, "step": 6600 }, { - "epoch": 0.7946063086925115, - "eval_loss": 2.941105842590332, - "eval_runtime": 0.2071, - "eval_samples_per_second": 207.595, - "eval_steps_per_second": 4.828, + "epoch": 4.70756062767475, + "eval_loss": 2.8003947734832764, + "eval_runtime": 0.2209, + "eval_samples_per_second": 194.665, + "eval_steps_per_second": 4.527, "step": 6600 }, { - "epoch": 0.7958102576450758, - "grad_norm": 7.125, + "epoch": 4.71469329529244, + "grad_norm": 7.1875, "learning_rate": 8.173777777777778e-05, - "loss": 0.6368, + "loss": 0.9038, "step": 6610 }, { - "epoch": 0.7970142065976402, - "grad_norm": 6.34375, + "epoch": 4.7218259629101285, + "grad_norm": 6.46875, "learning_rate": 8.169333333333334e-05, - "loss": 0.6504, + "loss": 0.7185, "step": 6620 }, { - "epoch": 0.7982181555502047, - "grad_norm": 5.46875, + "epoch": 4.728958630527817, + "grad_norm": 6.3125, "learning_rate": 8.16488888888889e-05, - "loss": 0.6305, + "loss": 0.9515, "step": 6630 }, { - "epoch": 0.7994221045027691, - "grad_norm": 6.3125, + "epoch": 4.736091298145507, + "grad_norm": 6.46875, "learning_rate": 8.160444444444445e-05, - "loss": 0.6538, + "loss": 0.8127, "step": 6640 }, { - "epoch": 0.8006260534553334, - "grad_norm": 9.0625, + "epoch": 4.743223965763195, + "grad_norm": 6.4375, "learning_rate": 8.156e-05, - "loss": 0.6747, + "loss": 0.8914, "step": 6650 }, { - "epoch": 0.8018300024078979, - "grad_norm": 13.0, + "epoch": 4.750356633380885, + "grad_norm": 6.8125, "learning_rate": 8.151555555555556e-05, - "loss": 0.6412, + "loss": 0.8545, "step": 6660 }, { - "epoch": 0.8030339513604623, - "grad_norm": 7.0, + "epoch": 4.757489300998573, + "grad_norm": 7.21875, "learning_rate": 8.147111111111112e-05, - "loss": 0.6479, + "loss": 0.6783, "step": 6670 }, { - "epoch": 0.8042379003130268, - "grad_norm": 7.375, + "epoch": 4.764621968616263, + "grad_norm": 7.03125, "learning_rate": 8.142666666666667e-05, - "loss": 0.6577, + "loss": 0.9337, "step": 6680 }, { - "epoch": 0.8054418492655911, - "grad_norm": 7.625, + "epoch": 4.771754636233951, + "grad_norm": 10.5625, "learning_rate": 8.138222222222223e-05, - "loss": 0.7217, + "loss": 0.8181, "step": 6690 }, { - "epoch": 0.8066457982181555, - "grad_norm": 5.625, + "epoch": 4.778887303851641, + "grad_norm": 7.375, "learning_rate": 8.133777777777778e-05, - "loss": 0.6363, + "loss": 0.8639, "step": 6700 }, { - "epoch": 0.8066457982181555, - "eval/acc": 34.88372039794922, + "epoch": 4.778887303851641, + "eval/acc": 37.20930099487305, "step": 6700 }, { - "epoch": 0.8066457982181555, - "eval_loss": 2.8945717811584473, - "eval_runtime": 0.2054, - "eval_samples_per_second": 209.381, - "eval_steps_per_second": 4.869, + "epoch": 4.778887303851641, + "eval_loss": 2.8262782096862793, + "eval_runtime": 0.2194, + "eval_samples_per_second": 195.949, + "eval_steps_per_second": 4.557, "step": 6700 }, { - "epoch": 0.80784974717072, - "grad_norm": 8.0625, + "epoch": 4.7860199714693294, + "grad_norm": 10.8125, "learning_rate": 8.129333333333333e-05, - "loss": 0.6784, + "loss": 0.8742, "step": 6710 }, { - "epoch": 0.8090536961232844, - "grad_norm": 9.1875, + "epoch": 4.793152639087019, + "grad_norm": 5.53125, "learning_rate": 8.124888888888889e-05, - "loss": 0.6187, + "loss": 0.7438, "step": 6720 }, { - "epoch": 0.8102576450758487, - "grad_norm": 9.1875, + "epoch": 4.8002853067047075, + "grad_norm": 6.65625, "learning_rate": 8.120444444444445e-05, - "loss": 0.6461, + "loss": 0.7859, "step": 6730 }, { - "epoch": 0.8114615940284132, - "grad_norm": 7.375, + "epoch": 4.807417974322396, + "grad_norm": 6.78125, "learning_rate": 8.116e-05, - "loss": 0.7325, + "loss": 0.8942, "step": 6740 }, { - "epoch": 0.8126655429809776, - "grad_norm": 7.71875, + "epoch": 4.814550641940086, + "grad_norm": 8.4375, "learning_rate": 8.111555555555555e-05, - "loss": 0.6758, + "loss": 0.8483, "step": 6750 }, { - "epoch": 0.813869491933542, - "grad_norm": 10.125, + "epoch": 4.821683309557774, + "grad_norm": 6.40625, "learning_rate": 8.107111111111113e-05, - "loss": 0.6223, + "loss": 0.8284, "step": 6760 }, { - "epoch": 0.8150734408861064, - "grad_norm": 7.90625, + "epoch": 4.828815977175464, + "grad_norm": 6.84375, "learning_rate": 8.102666666666667e-05, - "loss": 0.6115, + "loss": 0.8887, "step": 6770 }, { - "epoch": 0.8162773898386708, - "grad_norm": 5.375, + "epoch": 4.835948644793152, + "grad_norm": 8.875, "learning_rate": 8.098222222222222e-05, - "loss": 0.5747, + "loss": 0.8431, "step": 6780 }, { - "epoch": 0.8174813387912353, - "grad_norm": 7.375, + "epoch": 4.843081312410842, + "grad_norm": 6.90625, "learning_rate": 8.093777777777779e-05, - "loss": 0.618, + "loss": 0.8325, "step": 6790 }, { - "epoch": 0.8186852877437997, - "grad_norm": 7.125, + "epoch": 4.85021398002853, + "grad_norm": 7.0, "learning_rate": 8.089333333333333e-05, - "loss": 0.6603, + "loss": 0.7742, "step": 6800 }, { - "epoch": 0.8186852877437997, - "eval/acc": 34.88372039794922, + "epoch": 4.85021398002853, + "eval/acc": 39.53488540649414, "step": 6800 }, { - "epoch": 0.8186852877437997, - "eval_loss": 2.9451656341552734, - "eval_runtime": 1.2476, - "eval_samples_per_second": 34.466, - "eval_steps_per_second": 0.802, + "epoch": 4.85021398002853, + "eval_loss": 2.7403292655944824, + "eval_runtime": 0.5509, + "eval_samples_per_second": 78.059, + "eval_steps_per_second": 1.815, "step": 6800 }, { - "epoch": 0.819889236696364, - "grad_norm": 6.28125, + "epoch": 4.85734664764622, + "grad_norm": 6.625, "learning_rate": 8.08488888888889e-05, - "loss": 0.5918, + "loss": 0.8418, "step": 6810 }, { - "epoch": 0.8210931856489285, - "grad_norm": 8.6875, + "epoch": 4.8644793152639085, + "grad_norm": 7.65625, "learning_rate": 8.080444444444444e-05, - "loss": 0.5911, + "loss": 0.9022, "step": 6820 }, { - "epoch": 0.8222971346014929, - "grad_norm": 6.75, + "epoch": 4.871611982881598, + "grad_norm": 7.75, "learning_rate": 8.076e-05, - "loss": 0.6648, + "loss": 0.8201, "step": 6830 }, { - "epoch": 0.8235010835540573, - "grad_norm": 6.78125, + "epoch": 4.878744650499287, + "grad_norm": 7.84375, "learning_rate": 8.071555555555555e-05, - "loss": 0.6044, + "loss": 0.8144, "step": 6840 }, { - "epoch": 0.8247050325066217, - "grad_norm": 15.1875, + "epoch": 4.885877318116976, + "grad_norm": 8.3125, "learning_rate": 8.067111111111112e-05, - "loss": 0.6896, + "loss": 0.8821, "step": 6850 }, { - "epoch": 0.8259089814591861, - "grad_norm": 7.6875, + "epoch": 4.893009985734665, + "grad_norm": 9.0, "learning_rate": 8.062666666666668e-05, - "loss": 0.5829, + "loss": 0.8572, "step": 6860 }, { - "epoch": 0.8271129304117505, - "grad_norm": 5.21875, + "epoch": 4.900142653352354, + "grad_norm": 10.0, "learning_rate": 8.058222222222223e-05, - "loss": 0.6934, + "loss": 0.7498, "step": 6870 }, { - "epoch": 0.828316879364315, - "grad_norm": 10.375, + "epoch": 4.907275320970043, + "grad_norm": 6.09375, "learning_rate": 8.053777777777778e-05, - "loss": 0.7309, + "loss": 0.8709, "step": 6880 }, { - "epoch": 0.8295208283168793, - "grad_norm": 8.1875, + "epoch": 4.914407988587731, + "grad_norm": 7.84375, "learning_rate": 8.049333333333334e-05, - "loss": 0.7213, + "loss": 0.8045, "step": 6890 }, { - "epoch": 0.8307247772694438, - "grad_norm": 5.15625, + "epoch": 4.921540656205421, + "grad_norm": 7.0625, "learning_rate": 8.04488888888889e-05, - "loss": 0.6034, + "loss": 0.8919, "step": 6900 }, { - "epoch": 0.8307247772694438, - "eval/acc": 32.55813980102539, + "epoch": 4.921540656205421, + "eval/acc": 34.88372039794922, "step": 6900 }, { - "epoch": 0.8307247772694438, - "eval_loss": 2.8601129055023193, - "eval_runtime": 0.2074, - "eval_samples_per_second": 207.302, - "eval_steps_per_second": 4.821, + "epoch": 4.921540656205421, + "eval_loss": 2.8702921867370605, + "eval_runtime": 0.2215, + "eval_samples_per_second": 194.143, + "eval_steps_per_second": 4.515, "step": 6900 }, { - "epoch": 0.8319287262220082, - "grad_norm": 7.25, + "epoch": 4.9286733238231095, + "grad_norm": 18.125, "learning_rate": 8.040444444444445e-05, - "loss": 0.5585, + "loss": 0.8407, "step": 6910 }, { - "epoch": 0.8331326751745726, - "grad_norm": 5.9375, + "epoch": 4.935805991440799, + "grad_norm": 7.8125, "learning_rate": 8.036e-05, - "loss": 0.7539, + "loss": 0.9023, "step": 6920 }, { - "epoch": 0.834336624127137, - "grad_norm": 8.0, + "epoch": 4.942938659058488, + "grad_norm": 6.53125, "learning_rate": 8.031555555555556e-05, - "loss": 0.6104, + "loss": 0.7747, "step": 6930 }, { - "epoch": 0.8355405730797014, - "grad_norm": 7.4375, + "epoch": 4.950071326676177, + "grad_norm": 7.3125, "learning_rate": 8.027111111111112e-05, - "loss": 0.613, + "loss": 0.7357, "step": 6940 }, { - "epoch": 0.8367445220322658, - "grad_norm": 8.1875, + "epoch": 4.957203994293866, + "grad_norm": 5.71875, "learning_rate": 8.022666666666667e-05, - "loss": 0.6647, + "loss": 0.8914, "step": 6950 }, { - "epoch": 0.8379484709848303, - "grad_norm": 7.4375, + "epoch": 4.964336661911555, + "grad_norm": 7.9375, "learning_rate": 8.018222222222223e-05, - "loss": 0.7037, + "loss": 0.8626, "step": 6960 }, { - "epoch": 0.8391524199373946, - "grad_norm": 7.25, + "epoch": 4.971469329529244, + "grad_norm": 6.9375, "learning_rate": 8.013777777777778e-05, - "loss": 0.5853, + "loss": 0.8388, "step": 6970 }, { - "epoch": 0.840356368889959, - "grad_norm": 8.75, + "epoch": 4.978601997146933, + "grad_norm": 6.5, "learning_rate": 8.009333333333334e-05, - "loss": 0.6264, + "loss": 0.8321, "step": 6980 }, { - "epoch": 0.8415603178425235, - "grad_norm": 8.4375, + "epoch": 4.985734664764622, + "grad_norm": 6.6875, "learning_rate": 8.004888888888889e-05, - "loss": 0.6221, + "loss": 0.8276, "step": 6990 }, { - "epoch": 0.8427642667950879, - "grad_norm": 8.3125, + "epoch": 4.9928673323823105, + "grad_norm": 10.5625, "learning_rate": 8.000444444444445e-05, - "loss": 0.6408, + "loss": 0.8847, "step": 7000 }, { - "epoch": 0.8427642667950879, - "eval/acc": 33.72093200683594, + "epoch": 4.9928673323823105, + "eval/acc": 39.53488540649414, "step": 7000 }, { - "epoch": 0.8427642667950879, - "eval_loss": 2.9269802570343018, - "eval_runtime": 0.2045, - "eval_samples_per_second": 210.301, - "eval_steps_per_second": 4.891, + "epoch": 4.9928673323823105, + "eval_loss": 2.7940218448638916, + "eval_runtime": 0.2239, + "eval_samples_per_second": 192.063, + "eval_steps_per_second": 4.467, "step": 7000 }, { - "epoch": 0.8439682157476524, - "grad_norm": 9.125, + "epoch": 5.0, + "grad_norm": 7.1875, "learning_rate": 7.996e-05, - "loss": 0.6321, + "loss": 0.9472, "step": 7010 }, { - "epoch": 0.8451721647002167, - "grad_norm": 7.125, + "epoch": 5.007132667617689, + "grad_norm": 7.25, "learning_rate": 7.991555555555555e-05, - "loss": 0.5927, + "loss": 0.9009, "step": 7020 }, { - "epoch": 0.8463761136527811, - "grad_norm": 7.65625, + "epoch": 5.014265335235378, + "grad_norm": 7.34375, "learning_rate": 7.987111111111112e-05, - "loss": 0.6574, + "loss": 0.8805, "step": 7030 }, { - "epoch": 0.8475800626053456, - "grad_norm": 7.0, + "epoch": 5.021398002853067, + "grad_norm": 5.78125, "learning_rate": 7.982666666666667e-05, - "loss": 0.7185, + "loss": 0.8475, "step": 7040 }, { - "epoch": 0.84878401155791, - "grad_norm": 7.3125, + "epoch": 5.028530670470756, + "grad_norm": 5.53125, "learning_rate": 7.978222222222222e-05, - "loss": 0.7157, + "loss": 0.7598, "step": 7050 }, { - "epoch": 0.8499879605104743, - "grad_norm": 5.6875, + "epoch": 5.035663338088445, + "grad_norm": 6.25, "learning_rate": 7.973777777777778e-05, - "loss": 0.606, + "loss": 0.8605, "step": 7060 }, { - "epoch": 0.8511919094630388, - "grad_norm": 6.28125, + "epoch": 5.042796005706134, + "grad_norm": 7.46875, "learning_rate": 7.969333333333335e-05, - "loss": 0.6493, + "loss": 0.9293, "step": 7070 }, { - "epoch": 0.8523958584156032, - "grad_norm": 7.8125, + "epoch": 5.049928673323823, + "grad_norm": 5.9375, "learning_rate": 7.96488888888889e-05, - "loss": 0.6123, + "loss": 0.7984, "step": 7080 }, { - "epoch": 0.8535998073681675, + "epoch": 5.057061340941512, "grad_norm": 8.375, "learning_rate": 7.960444444444444e-05, - "loss": 0.6035, + "loss": 0.8222, "step": 7090 }, { - "epoch": 0.854803756320732, - "grad_norm": 7.78125, + "epoch": 5.064194008559201, + "grad_norm": 6.9375, "learning_rate": 7.956e-05, - "loss": 0.5902, + "loss": 0.8535, "step": 7100 }, { - "epoch": 0.854803756320732, - "eval/acc": 37.20930099487305, + "epoch": 5.064194008559201, + "eval/acc": 41.86046600341797, "step": 7100 }, { - "epoch": 0.854803756320732, - "eval_loss": 2.926543712615967, - "eval_runtime": 0.2114, - "eval_samples_per_second": 203.37, - "eval_steps_per_second": 4.73, + "epoch": 5.064194008559201, + "eval_loss": 2.631981134414673, + "eval_runtime": 2.5832, + "eval_samples_per_second": 16.646, + "eval_steps_per_second": 0.387, "step": 7100 }, { - "epoch": 0.8560077052732964, - "grad_norm": 6.0625, + "epoch": 5.0713266761768905, + "grad_norm": 6.5625, "learning_rate": 7.951555555555555e-05, - "loss": 0.6464, + "loss": 0.8668, "step": 7110 }, { - "epoch": 0.8572116542258609, + "epoch": 5.078459343794579, "grad_norm": 9.0, "learning_rate": 7.947111111111111e-05, - "loss": 0.7656, + "loss": 0.8142, "step": 7120 }, { - "epoch": 0.8584156031784252, - "grad_norm": 7.21875, + "epoch": 5.085592011412269, + "grad_norm": 8.3125, "learning_rate": 7.942666666666668e-05, - "loss": 0.5546, + "loss": 0.9271, "step": 7130 }, { - "epoch": 0.8596195521309896, - "grad_norm": 8.5, + "epoch": 5.092724679029957, + "grad_norm": 7.875, "learning_rate": 7.938222222222222e-05, - "loss": 0.6538, + "loss": 0.8213, "step": 7140 }, { - "epoch": 0.8608235010835541, - "grad_norm": 8.0625, + "epoch": 5.099857346647646, + "grad_norm": 6.8125, "learning_rate": 7.933777777777777e-05, - "loss": 0.7057, + "loss": 0.8511, "step": 7150 }, { - "epoch": 0.8620274500361185, - "grad_norm": 7.34375, + "epoch": 5.106990014265335, + "grad_norm": 7.53125, "learning_rate": 7.929333333333334e-05, - "loss": 0.6287, + "loss": 0.8525, "step": 7160 }, { - "epoch": 0.8632313989886828, - "grad_norm": 6.53125, + "epoch": 5.114122681883024, + "grad_norm": 7.21875, "learning_rate": 7.92488888888889e-05, - "loss": 0.6231, + "loss": 0.8554, "step": 7170 }, { - "epoch": 0.8644353479412473, - "grad_norm": 18.5, + "epoch": 5.121255349500713, + "grad_norm": 6.84375, "learning_rate": 7.920444444444445e-05, - "loss": 0.664, + "loss": 0.8128, "step": 7180 }, { - "epoch": 0.8656392968938117, - "grad_norm": 8.875, + "epoch": 5.128388017118402, + "grad_norm": 7.84375, "learning_rate": 7.916e-05, - "loss": 0.6286, + "loss": 0.7726, "step": 7190 }, { - "epoch": 0.8668432458463761, - "grad_norm": 6.0625, + "epoch": 5.1355206847360915, + "grad_norm": 7.78125, "learning_rate": 7.911555555555556e-05, - "loss": 0.6808, + "loss": 0.8902, "step": 7200 }, { - "epoch": 0.8668432458463761, + "epoch": 5.1355206847360915, "eval/acc": 37.20930099487305, "step": 7200 }, { - "epoch": 0.8668432458463761, - "eval_loss": 2.9467363357543945, - "eval_runtime": 0.2052, - "eval_samples_per_second": 209.502, - "eval_steps_per_second": 4.872, + "epoch": 5.1355206847360915, + "eval_loss": 2.5633885860443115, + "eval_runtime": 0.2541, + "eval_samples_per_second": 169.248, + "eval_steps_per_second": 3.936, "step": 7200 }, { - "epoch": 0.8680471947989405, - "grad_norm": 7.9375, + "epoch": 5.14265335235378, + "grad_norm": 6.8125, "learning_rate": 7.907111111111112e-05, - "loss": 0.6626, + "loss": 0.7482, "step": 7210 }, { - "epoch": 0.8692511437515049, - "grad_norm": 7.15625, + "epoch": 5.14978601997147, + "grad_norm": 42.0, "learning_rate": 7.902666666666667e-05, - "loss": 0.7685, + "loss": 0.9007, "step": 7220 }, { - "epoch": 0.8704550927040694, - "grad_norm": 10.3125, + "epoch": 5.156918687589158, + "grad_norm": 6.0625, "learning_rate": 7.898222222222223e-05, - "loss": 0.6848, + "loss": 0.8643, "step": 7230 }, { - "epoch": 0.8716590416566338, - "grad_norm": 7.21875, + "epoch": 5.164051355206848, + "grad_norm": 7.03125, "learning_rate": 7.893777777777778e-05, - "loss": 0.6433, + "loss": 0.8899, "step": 7240 }, { - "epoch": 0.8728629906091981, - "grad_norm": 6.34375, + "epoch": 5.171184022824536, + "grad_norm": 7.53125, "learning_rate": 7.889333333333334e-05, - "loss": 0.6121, + "loss": 0.7462, "step": 7250 }, { - "epoch": 0.8740669395617626, - "grad_norm": 7.40625, + "epoch": 5.178316690442226, + "grad_norm": 7.21875, "learning_rate": 7.884888888888889e-05, - "loss": 0.6391, + "loss": 0.9199, "step": 7260 }, { - "epoch": 0.875270888514327, - "grad_norm": 7.96875, + "epoch": 5.185449358059914, + "grad_norm": 8.1875, "learning_rate": 7.880444444444445e-05, - "loss": 0.638, + "loss": 0.7966, "step": 7270 }, { - "epoch": 0.8764748374668914, - "grad_norm": 6.28125, + "epoch": 5.192582025677604, + "grad_norm": 8.0, "learning_rate": 7.876e-05, - "loss": 0.6214, + "loss": 0.9086, "step": 7280 }, { - "epoch": 0.8776787864194558, - "grad_norm": 9.125, + "epoch": 5.1997146932952925, + "grad_norm": 7.46875, "learning_rate": 7.871555555555556e-05, - "loss": 0.7473, + "loss": 0.9184, "step": 7290 }, { - "epoch": 0.8788827353720202, - "grad_norm": 7.5, + "epoch": 5.206847360912981, + "grad_norm": 7.28125, "learning_rate": 7.867111111111112e-05, - "loss": 0.68, + "loss": 0.742, "step": 7300 }, { - "epoch": 0.8788827353720202, - "eval/acc": 34.88372039794922, + "epoch": 5.206847360912981, + "eval/acc": 39.53488540649414, "step": 7300 }, { - "epoch": 0.8788827353720202, - "eval_loss": 2.999979257583618, - "eval_runtime": 0.2126, - "eval_samples_per_second": 202.25, - "eval_steps_per_second": 4.703, + "epoch": 5.206847360912981, + "eval_loss": 2.5178542137145996, + "eval_runtime": 0.2274, + "eval_samples_per_second": 189.112, + "eval_steps_per_second": 4.398, "step": 7300 }, { - "epoch": 0.8800866843245846, - "grad_norm": 7.03125, + "epoch": 5.2139800285306706, + "grad_norm": 10.4375, "learning_rate": 7.862666666666667e-05, - "loss": 0.4952, + "loss": 0.8737, "step": 7310 }, { - "epoch": 0.8812906332771491, - "grad_norm": 7.65625, + "epoch": 5.221112696148359, + "grad_norm": 6.8125, "learning_rate": 7.858222222222222e-05, - "loss": 0.7879, + "loss": 0.8197, "step": 7320 }, { - "epoch": 0.8824945822297134, - "grad_norm": 7.71875, + "epoch": 5.228245363766049, + "grad_norm": 8.125, "learning_rate": 7.853777777777778e-05, - "loss": 0.6093, + "loss": 0.9561, "step": 7330 }, { - "epoch": 0.8836985311822779, - "grad_norm": 8.125, + "epoch": 5.235378031383737, + "grad_norm": 9.5, "learning_rate": 7.849333333333334e-05, - "loss": 0.6522, + "loss": 0.9066, "step": 7340 }, { - "epoch": 0.8849024801348423, - "grad_norm": 8.9375, + "epoch": 5.242510699001427, + "grad_norm": 6.09375, "learning_rate": 7.844888888888889e-05, - "loss": 0.6861, + "loss": 0.839, "step": 7350 }, { - "epoch": 0.8861064290874067, - "grad_norm": 6.9375, + "epoch": 5.249643366619115, + "grad_norm": 8.0625, "learning_rate": 7.840444444444445e-05, - "loss": 0.6023, + "loss": 0.8996, "step": 7360 }, { - "epoch": 0.8873103780399711, - "grad_norm": 8.1875, + "epoch": 5.256776034236805, + "grad_norm": 6.3125, "learning_rate": 7.836e-05, - "loss": 0.5156, + "loss": 0.8253, "step": 7370 }, { - "epoch": 0.8885143269925355, - "grad_norm": 7.125, + "epoch": 5.263908701854493, + "grad_norm": 6.15625, "learning_rate": 7.831555555555556e-05, - "loss": 0.6841, + "loss": 0.7275, "step": 7380 }, { - "epoch": 0.8897182759450999, - "grad_norm": 8.0625, + "epoch": 5.271041369472183, + "grad_norm": 6.375, "learning_rate": 7.827111111111111e-05, - "loss": 0.5521, + "loss": 0.8548, "step": 7390 }, { - "epoch": 0.8909222248976644, - "grad_norm": 7.03125, + "epoch": 5.2781740370898715, + "grad_norm": 8.0625, "learning_rate": 7.822666666666667e-05, - "loss": 0.7556, + "loss": 0.8754, "step": 7400 }, { - "epoch": 0.8909222248976644, - "eval/acc": 32.55813980102539, + "epoch": 5.2781740370898715, + "eval/acc": 39.53488540649414, "step": 7400 }, { - "epoch": 0.8909222248976644, - "eval_loss": 2.882596015930176, - "eval_runtime": 0.2414, - "eval_samples_per_second": 178.131, - "eval_steps_per_second": 4.143, + "epoch": 5.2781740370898715, + "eval_loss": 2.599212408065796, + "eval_runtime": 0.2355, + "eval_samples_per_second": 182.56, + "eval_steps_per_second": 4.246, "step": 7400 }, { - "epoch": 0.8921261738502287, - "grad_norm": 6.8125, + "epoch": 5.285306704707561, + "grad_norm": 8.875, "learning_rate": 7.818222222222222e-05, - "loss": 0.6252, + "loss": 0.8725, "step": 7410 }, { - "epoch": 0.8933301228027931, - "grad_norm": 6.5, + "epoch": 5.29243937232525, + "grad_norm": 8.0625, "learning_rate": 7.813777777777777e-05, - "loss": 0.6405, + "loss": 0.8689, "step": 7420 }, { - "epoch": 0.8945340717553576, - "grad_norm": 7.25, + "epoch": 5.299572039942939, + "grad_norm": 7.59375, "learning_rate": 7.809333333333335e-05, - "loss": 0.5753, + "loss": 0.7615, "step": 7430 }, { - "epoch": 0.895738020707922, - "grad_norm": 8.4375, + "epoch": 5.306704707560628, + "grad_norm": 6.3125, "learning_rate": 7.80488888888889e-05, - "loss": 0.5782, + "loss": 0.8141, "step": 7440 }, { - "epoch": 0.8969419696604864, - "grad_norm": 7.875, + "epoch": 5.313837375178316, + "grad_norm": 6.84375, "learning_rate": 7.800444444444444e-05, - "loss": 0.6364, + "loss": 0.8328, "step": 7450 }, { - "epoch": 0.8981459186130508, - "grad_norm": 6.15625, + "epoch": 5.320970042796006, + "grad_norm": 7.71875, "learning_rate": 7.796e-05, - "loss": 0.6243, + "loss": 0.8158, "step": 7460 }, { - "epoch": 0.8993498675656152, - "grad_norm": 7.5, + "epoch": 5.328102710413694, + "grad_norm": 7.0625, "learning_rate": 7.791555555555557e-05, - "loss": 0.6401, + "loss": 0.7663, "step": 7470 }, { - "epoch": 0.9005538165181797, - "grad_norm": 6.03125, + "epoch": 5.335235378031384, + "grad_norm": 8.1875, "learning_rate": 7.787111111111112e-05, - "loss": 0.5183, + "loss": 0.7704, "step": 7480 }, { - "epoch": 0.901757765470744, - "grad_norm": 6.5, + "epoch": 5.3423680456490725, + "grad_norm": 8.0, "learning_rate": 7.782666666666666e-05, - "loss": 0.6057, + "loss": 0.8511, "step": 7490 }, { - "epoch": 0.9029617144233084, - "grad_norm": 9.0, + "epoch": 5.349500713266762, + "grad_norm": 5.15625, "learning_rate": 7.778222222222223e-05, - "loss": 0.6341, + "loss": 0.783, "step": 7500 }, { - "epoch": 0.9029617144233084, - "eval/acc": 34.30232620239258, + "epoch": 5.349500713266762, + "eval/acc": 39.53488540649414, "step": 7500 }, { - "epoch": 0.9029617144233084, - "eval_loss": 2.997713804244995, - "eval_runtime": 1.0811, - "eval_samples_per_second": 39.775, - "eval_steps_per_second": 0.925, + "epoch": 5.349500713266762, + "eval_loss": 2.6000046730041504, + "eval_runtime": 0.2277, + "eval_samples_per_second": 188.876, + "eval_steps_per_second": 4.392, "step": 7500 }, { - "epoch": 0.9041656633758729, - "grad_norm": 7.03125, + "epoch": 5.356633380884451, + "grad_norm": 7.6875, "learning_rate": 7.773777777777778e-05, - "loss": 0.6595, + "loss": 0.7674, "step": 7510 }, { - "epoch": 0.9053696123284373, - "grad_norm": 7.84375, + "epoch": 5.36376604850214, + "grad_norm": 6.53125, "learning_rate": 7.769333333333334e-05, - "loss": 0.7769, + "loss": 0.8338, "step": 7520 }, { - "epoch": 0.9065735612810016, - "grad_norm": 6.78125, + "epoch": 5.370898716119829, + "grad_norm": 5.8125, "learning_rate": 7.76488888888889e-05, - "loss": 0.6876, + "loss": 0.8279, "step": 7530 }, { - "epoch": 0.9077775102335661, - "grad_norm": 9.375, + "epoch": 5.378031383737518, + "grad_norm": 7.0625, "learning_rate": 7.760444444444445e-05, - "loss": 0.6271, + "loss": 0.7954, "step": 7540 }, { - "epoch": 0.9089814591861305, - "grad_norm": 6.96875, + "epoch": 5.385164051355207, + "grad_norm": 8.0, "learning_rate": 7.756e-05, - "loss": 0.6117, + "loss": 0.8632, "step": 7550 }, { - "epoch": 0.910185408138695, - "grad_norm": 6.28125, + "epoch": 5.392296718972895, + "grad_norm": 6.84375, "learning_rate": 7.751555555555556e-05, - "loss": 0.6461, + "loss": 0.8191, "step": 7560 }, { - "epoch": 0.9113893570912593, - "grad_norm": 7.96875, + "epoch": 5.399429386590585, + "grad_norm": 7.375, "learning_rate": 7.747111111111112e-05, - "loss": 0.6543, + "loss": 0.708, "step": 7570 }, { - "epoch": 0.9125933060438237, - "grad_norm": 10.0, + "epoch": 5.4065620542082735, + "grad_norm": 7.15625, "learning_rate": 7.742666666666667e-05, - "loss": 0.686, + "loss": 0.6851, "step": 7580 }, { - "epoch": 0.9137972549963882, - "grad_norm": 7.90625, + "epoch": 5.413694721825963, + "grad_norm": 7.25, "learning_rate": 7.738222222222222e-05, - "loss": 0.6634, + "loss": 0.8769, "step": 7590 }, { - "epoch": 0.9150012039489526, - "grad_norm": 11.5625, + "epoch": 5.420827389443652, + "grad_norm": 7.6875, "learning_rate": 7.733777777777779e-05, - "loss": 0.6627, + "loss": 0.8316, "step": 7600 }, { - "epoch": 0.9150012039489526, - "eval/acc": 37.20930099487305, + "epoch": 5.420827389443652, + "eval/acc": 39.53488540649414, "step": 7600 }, { - "epoch": 0.9150012039489526, - "eval_loss": 2.908363103866577, - "eval_runtime": 2.6366, - "eval_samples_per_second": 16.309, - "eval_steps_per_second": 0.379, + "epoch": 5.420827389443652, + "eval_loss": 2.583944797515869, + "eval_runtime": 0.2212, + "eval_samples_per_second": 194.433, + "eval_steps_per_second": 4.522, "step": 7600 }, { - "epoch": 0.9162051529015169, - "grad_norm": 5.65625, + "epoch": 5.427960057061341, + "grad_norm": 7.625, "learning_rate": 7.729333333333334e-05, - "loss": 0.5503, + "loss": 0.8444, "step": 7610 }, { - "epoch": 0.9174091018540814, - "grad_norm": 7.15625, + "epoch": 5.43509272467903, + "grad_norm": 6.6875, "learning_rate": 7.724888888888889e-05, - "loss": 0.5263, + "loss": 0.8101, "step": 7620 }, { - "epoch": 0.9186130508066458, - "grad_norm": 5.96875, + "epoch": 5.442225392296719, + "grad_norm": 6.375, "learning_rate": 7.720444444444445e-05, - "loss": 0.6969, + "loss": 0.8094, "step": 7630 }, { - "epoch": 0.9198169997592102, - "grad_norm": 8.0625, + "epoch": 5.449358059914408, + "grad_norm": 7.09375, "learning_rate": 7.716e-05, - "loss": 0.6371, + "loss": 0.9292, "step": 7640 }, { - "epoch": 0.9210209487117746, - "grad_norm": 7.5625, + "epoch": 5.456490727532097, + "grad_norm": 8.0, "learning_rate": 7.711555555555556e-05, - "loss": 0.6406, + "loss": 0.8544, "step": 7650 }, { - "epoch": 0.922224897664339, - "grad_norm": 10.6875, + "epoch": 5.463623395149786, + "grad_norm": 5.625, "learning_rate": 7.707111111111111e-05, - "loss": 0.7058, + "loss": 0.787, "step": 7660 }, { - "epoch": 0.9234288466169035, - "grad_norm": 12.5625, + "epoch": 5.470756062767475, + "grad_norm": 8.375, "learning_rate": 7.702666666666667e-05, - "loss": 0.7067, + "loss": 0.8763, "step": 7670 }, { - "epoch": 0.9246327955694679, - "grad_norm": 7.21875, + "epoch": 5.477888730385164, + "grad_norm": 12.9375, "learning_rate": 7.698222222222222e-05, - "loss": 0.5543, + "loss": 0.8317, "step": 7680 }, { - "epoch": 0.9258367445220322, - "grad_norm": 10.125, + "epoch": 5.4850213980028535, + "grad_norm": 8.125, "learning_rate": 7.693777777777778e-05, - "loss": 0.6719, + "loss": 0.8156, "step": 7690 }, { - "epoch": 0.9270406934745967, - "grad_norm": 7.03125, + "epoch": 5.492154065620542, + "grad_norm": 6.96875, "learning_rate": 7.689333333333334e-05, - "loss": 0.5764, + "loss": 0.8998, "step": 7700 }, { - "epoch": 0.9270406934745967, - "eval/acc": 35.46511459350586, + "epoch": 5.492154065620542, + "eval/acc": 39.53488540649414, "step": 7700 }, { - "epoch": 0.9270406934745967, - "eval_loss": 2.8986358642578125, - "eval_runtime": 4.4935, - "eval_samples_per_second": 9.569, - "eval_steps_per_second": 0.223, + "epoch": 5.492154065620542, + "eval_loss": 2.6069791316986084, + "eval_runtime": 0.2244, + "eval_samples_per_second": 191.632, + "eval_steps_per_second": 4.457, "step": 7700 }, { - "epoch": 0.9282446424271611, - "grad_norm": 10.25, + "epoch": 5.499286733238231, + "grad_norm": 7.5625, "learning_rate": 7.68488888888889e-05, - "loss": 0.6302, + "loss": 0.7881, "step": 7710 }, { - "epoch": 0.9294485913797255, - "grad_norm": 9.8125, + "epoch": 5.50641940085592, + "grad_norm": 6.65625, "learning_rate": 7.680444444444444e-05, - "loss": 0.6236, + "loss": 0.8379, "step": 7720 }, { - "epoch": 0.93065254033229, - "grad_norm": 6.5, + "epoch": 5.513552068473609, + "grad_norm": 6.34375, "learning_rate": 7.676e-05, - "loss": 0.7159, + "loss": 0.844, "step": 7730 }, { - "epoch": 0.9318564892848543, - "grad_norm": 7.1875, + "epoch": 5.520684736091298, + "grad_norm": 8.3125, "learning_rate": 7.671555555555557e-05, - "loss": 0.6257, + "loss": 0.8762, "step": 7740 }, { - "epoch": 0.9330604382374187, - "grad_norm": 7.3125, + "epoch": 5.527817403708987, + "grad_norm": 7.09375, "learning_rate": 7.667111111111111e-05, - "loss": 0.5247, + "loss": 0.8621, "step": 7750 }, { - "epoch": 0.9342643871899832, - "grad_norm": 5.0, + "epoch": 5.534950071326676, + "grad_norm": 8.5625, "learning_rate": 7.662666666666666e-05, - "loss": 0.5185, + "loss": 1.0092, "step": 7760 }, { - "epoch": 0.9354683361425475, - "grad_norm": 13.375, + "epoch": 5.542082738944365, + "grad_norm": 6.3125, "learning_rate": 7.658222222222222e-05, - "loss": 0.8069, + "loss": 0.8743, "step": 7770 }, { - "epoch": 0.936672285095112, - "grad_norm": 10.3125, + "epoch": 5.5492154065620545, + "grad_norm": 6.0625, "learning_rate": 7.653777777777779e-05, - "loss": 0.6619, + "loss": 0.754, "step": 7780 }, { - "epoch": 0.9378762340476764, - "grad_norm": 7.1875, + "epoch": 5.556348074179743, + "grad_norm": 7.6875, "learning_rate": 7.649333333333334e-05, - "loss": 0.785, + "loss": 0.8504, "step": 7790 }, { - "epoch": 0.9390801830002408, - "grad_norm": 6.0625, + "epoch": 5.563480741797433, + "grad_norm": 8.3125, "learning_rate": 7.64488888888889e-05, - "loss": 0.6064, + "loss": 0.7512, "step": 7800 }, { - "epoch": 0.9390801830002408, - "eval/acc": 40.69767379760742, + "epoch": 5.563480741797433, + "eval/acc": 37.20930099487305, "step": 7800 }, { - "epoch": 0.9390801830002408, - "eval_loss": 2.9625086784362793, - "eval_runtime": 1.0058, - "eval_samples_per_second": 42.753, - "eval_steps_per_second": 0.994, + "epoch": 5.563480741797433, + "eval_loss": 2.610304594039917, + "eval_runtime": 0.2338, + "eval_samples_per_second": 183.899, + "eval_steps_per_second": 4.277, "step": 7800 }, { - "epoch": 0.9402841319528052, - "grad_norm": 7.0, + "epoch": 5.570613409415121, + "grad_norm": 6.71875, "learning_rate": 7.640444444444445e-05, - "loss": 0.5744, + "loss": 0.8204, "step": 7810 }, { - "epoch": 0.9414880809053696, - "grad_norm": 7.78125, + "epoch": 5.57774607703281, + "grad_norm": 6.625, "learning_rate": 7.636e-05, - "loss": 0.6294, + "loss": 0.734, "step": 7820 }, { - "epoch": 0.942692029857934, - "grad_norm": 6.46875, + "epoch": 5.584878744650499, + "grad_norm": 5.65625, "learning_rate": 7.631555555555556e-05, - "loss": 0.7608, + "loss": 0.8047, "step": 7830 }, { - "epoch": 0.9438959788104985, - "grad_norm": 6.71875, + "epoch": 5.592011412268189, + "grad_norm": 6.40625, "learning_rate": 7.627111111111112e-05, - "loss": 0.6084, + "loss": 0.7179, "step": 7840 }, { - "epoch": 0.9450999277630628, - "grad_norm": 7.15625, + "epoch": 5.599144079885877, + "grad_norm": 6.78125, "learning_rate": 7.622666666666667e-05, - "loss": 0.5791, + "loss": 0.849, "step": 7850 }, { - "epoch": 0.9463038767156272, - "grad_norm": 10.1875, + "epoch": 5.606276747503566, + "grad_norm": 8.8125, "learning_rate": 7.618222222222221e-05, - "loss": 0.683, + "loss": 0.8817, "step": 7860 }, { - "epoch": 0.9475078256681917, - "grad_norm": 7.59375, + "epoch": 5.6134094151212555, + "grad_norm": 6.375, "learning_rate": 7.613777777777779e-05, - "loss": 0.6413, + "loss": 0.8812, "step": 7870 }, { - "epoch": 0.9487117746207561, - "grad_norm": 5.71875, + "epoch": 5.620542082738944, + "grad_norm": 13.125, "learning_rate": 7.609333333333334e-05, - "loss": 0.5985, + "loss": 0.8522, "step": 7880 }, { - "epoch": 0.9499157235733204, - "grad_norm": 8.625, + "epoch": 5.627674750356634, + "grad_norm": 7.0625, "learning_rate": 7.604888888888889e-05, - "loss": 0.572, + "loss": 0.731, "step": 7890 }, { - "epoch": 0.9511196725258849, - "grad_norm": 15.75, + "epoch": 5.634807417974322, + "grad_norm": 7.21875, "learning_rate": 7.600444444444445e-05, - "loss": 0.674, + "loss": 0.8841, "step": 7900 }, { - "epoch": 0.9511196725258849, - "eval/acc": 34.88372039794922, + "epoch": 5.634807417974322, + "eval/acc": 39.53488540649414, "step": 7900 }, { - "epoch": 0.9511196725258849, - "eval_loss": 2.9794013500213623, - "eval_runtime": 0.209, - "eval_samples_per_second": 205.705, - "eval_steps_per_second": 4.784, + "epoch": 5.634807417974322, + "eval_loss": 2.6105217933654785, + "eval_runtime": 0.2306, + "eval_samples_per_second": 186.447, + "eval_steps_per_second": 4.336, "step": 7900 }, { - "epoch": 0.9523236214784493, - "grad_norm": 8.25, + "epoch": 5.641940085592012, + "grad_norm": 7.625, "learning_rate": 7.596000000000001e-05, - "loss": 0.7455, + "loss": 0.8654, "step": 7910 }, { - "epoch": 0.9535275704310138, - "grad_norm": 7.40625, + "epoch": 5.6490727532097, + "grad_norm": 26.75, "learning_rate": 7.591555555555556e-05, - "loss": 0.5746, + "loss": 0.8103, "step": 7920 }, { - "epoch": 0.9547315193835781, - "grad_norm": 7.78125, + "epoch": 5.65620542082739, + "grad_norm": 7.375, "learning_rate": 7.587111111111112e-05, - "loss": 0.6232, + "loss": 0.7461, "step": 7930 }, { - "epoch": 0.9559354683361425, - "grad_norm": 10.9375, + "epoch": 5.663338088445078, + "grad_norm": 6.09375, "learning_rate": 7.582666666666667e-05, - "loss": 0.7393, + "loss": 0.9693, "step": 7940 }, { - "epoch": 0.957139417288707, - "grad_norm": 8.6875, + "epoch": 5.670470756062768, + "grad_norm": 7.09375, "learning_rate": 7.578222222222222e-05, - "loss": 0.6138, + "loss": 0.8595, "step": 7950 }, { - "epoch": 0.9583433662412714, - "grad_norm": 7.625, + "epoch": 5.6776034236804565, + "grad_norm": 7.3125, "learning_rate": 7.573777777777778e-05, - "loss": 0.637, + "loss": 0.8541, "step": 7960 }, { - "epoch": 0.9595473151938357, - "grad_norm": 6.90625, + "epoch": 5.684736091298145, + "grad_norm": 7.90625, "learning_rate": 7.569333333333334e-05, - "loss": 0.606, + "loss": 0.8774, "step": 7970 }, { - "epoch": 0.9607512641464002, - "grad_norm": 8.8125, + "epoch": 5.6918687589158345, + "grad_norm": 9.0, "learning_rate": 7.564888888888889e-05, - "loss": 0.7135, + "loss": 0.8823, "step": 7980 }, { - "epoch": 0.9619552130989646, - "grad_norm": 6.84375, + "epoch": 5.699001426533523, + "grad_norm": 6.09375, "learning_rate": 7.560444444444444e-05, - "loss": 0.6138, + "loss": 0.7302, "step": 7990 }, { - "epoch": 0.963159162051529, - "grad_norm": 8.25, + "epoch": 5.706134094151213, + "grad_norm": 7.21875, "learning_rate": 7.556000000000002e-05, - "loss": 0.7128, + "loss": 0.8339, "step": 8000 }, { - "epoch": 0.963159162051529, - "eval/acc": 34.88372039794922, + "epoch": 5.706134094151213, + "eval/acc": 37.20930099487305, "step": 8000 }, { - "epoch": 0.963159162051529, - "eval_loss": 2.9879119396209717, - "eval_runtime": 0.2124, - "eval_samples_per_second": 202.451, - "eval_steps_per_second": 4.708, + "epoch": 5.706134094151213, + "eval_loss": 2.576781988143921, + "eval_runtime": 0.2231, + "eval_samples_per_second": 192.779, + "eval_steps_per_second": 4.483, "step": 8000 }, { - "epoch": 0.9643631110040934, - "grad_norm": 9.875, + "epoch": 5.713266761768901, + "grad_norm": 7.75, "learning_rate": 7.551555555555556e-05, - "loss": 0.5835, + "loss": 0.7642, "step": 8010 }, { - "epoch": 0.9655670599566578, - "grad_norm": 8.8125, + "epoch": 5.720399429386591, + "grad_norm": 7.8125, "learning_rate": 7.547111111111111e-05, - "loss": 0.6138, + "loss": 0.9188, "step": 8020 }, { - "epoch": 0.9667710089092223, - "grad_norm": 8.3125, + "epoch": 5.727532097004279, + "grad_norm": 7.28125, "learning_rate": 7.542666666666667e-05, - "loss": 0.6638, + "loss": 0.8202, "step": 8030 }, { - "epoch": 0.9679749578617867, - "grad_norm": 7.0, + "epoch": 5.734664764621969, + "grad_norm": 9.0, "learning_rate": 7.538222222222222e-05, - "loss": 0.6484, + "loss": 0.8286, "step": 8040 }, { - "epoch": 0.969178906814351, - "grad_norm": 8.25, + "epoch": 5.741797432239657, + "grad_norm": 7.25, "learning_rate": 7.533777777777778e-05, - "loss": 0.6291, + "loss": 0.7856, "step": 8050 }, { - "epoch": 0.9703828557669155, - "grad_norm": 9.75, + "epoch": 5.748930099857347, + "grad_norm": 6.90625, "learning_rate": 7.529333333333333e-05, - "loss": 0.71, + "loss": 0.8832, "step": 8060 }, { - "epoch": 0.9715868047194799, - "grad_norm": 6.375, + "epoch": 5.7560627674750355, + "grad_norm": 6.09375, "learning_rate": 7.52488888888889e-05, - "loss": 0.5791, + "loss": 0.7606, "step": 8070 }, { - "epoch": 0.9727907536720443, - "grad_norm": 7.40625, + "epoch": 5.763195435092725, + "grad_norm": 6.625, "learning_rate": 7.520444444444444e-05, - "loss": 0.6359, + "loss": 0.8706, "step": 8080 }, { - "epoch": 0.9739947026246087, - "grad_norm": 8.125, + "epoch": 5.770328102710414, + "grad_norm": 7.25, "learning_rate": 7.516e-05, - "loss": 0.5274, + "loss": 0.8542, "step": 8090 }, { - "epoch": 0.9751986515771731, - "grad_norm": 8.6875, + "epoch": 5.777460770328103, + "grad_norm": 6.84375, "learning_rate": 7.511555555555557e-05, - "loss": 0.5887, + "loss": 0.7988, "step": 8100 }, { - "epoch": 0.9751986515771731, + "epoch": 5.777460770328103, "eval/acc": 37.20930099487305, "step": 8100 }, { - "epoch": 0.9751986515771731, - "eval_loss": 3.0165836811065674, - "eval_runtime": 0.2158, - "eval_samples_per_second": 199.215, - "eval_steps_per_second": 4.633, + "epoch": 5.777460770328103, + "eval_loss": 2.598762273788452, + "eval_runtime": 0.2251, + "eval_samples_per_second": 191.04, + "eval_steps_per_second": 4.443, "step": 8100 }, { - "epoch": 0.9764026005297375, - "grad_norm": 9.125, + "epoch": 5.784593437945792, + "grad_norm": 8.875, "learning_rate": 7.507111111111112e-05, - "loss": 0.6646, + "loss": 0.8825, "step": 8110 }, { - "epoch": 0.977606549482302, - "grad_norm": 6.4375, + "epoch": 5.79172610556348, + "grad_norm": 7.375, "learning_rate": 7.502666666666666e-05, - "loss": 0.6757, + "loss": 0.8316, "step": 8120 }, { - "epoch": 0.9788104984348663, - "grad_norm": 6.90625, + "epoch": 5.79885877318117, + "grad_norm": 8.125, "learning_rate": 7.498222222222223e-05, - "loss": 0.6722, + "loss": 0.8567, "step": 8130 }, { - "epoch": 0.9800144473874308, - "grad_norm": 5.34375, + "epoch": 5.805991440798858, + "grad_norm": 6.3125, "learning_rate": 7.493777777777779e-05, - "loss": 0.5574, + "loss": 0.8415, "step": 8140 }, { - "epoch": 0.9812183963399952, - "grad_norm": 12.25, + "epoch": 5.813124108416548, + "grad_norm": 8.5, "learning_rate": 7.489333333333334e-05, - "loss": 0.5701, + "loss": 0.8369, "step": 8150 }, { - "epoch": 0.9824223452925596, - "grad_norm": 5.09375, + "epoch": 5.8202567760342365, + "grad_norm": 13.25, "learning_rate": 7.484888888888889e-05, - "loss": 0.7311, + "loss": 0.8692, "step": 8160 }, { - "epoch": 0.983626294245124, - "grad_norm": 9.6875, + "epoch": 5.827389443651926, + "grad_norm": 7.71875, "learning_rate": 7.480444444444445e-05, - "loss": 0.6314, + "loss": 0.8535, "step": 8170 }, { - "epoch": 0.9848302431976884, - "grad_norm": 7.46875, + "epoch": 5.834522111269615, + "grad_norm": 7.6875, "learning_rate": 7.476000000000001e-05, - "loss": 0.6023, + "loss": 0.8701, "step": 8180 }, { - "epoch": 0.9860341921502528, - "grad_norm": 4.53125, + "epoch": 5.841654778887304, + "grad_norm": 5.46875, "learning_rate": 7.471555555555556e-05, - "loss": 0.5998, + "loss": 0.7843, "step": 8190 }, { - "epoch": 0.9872381411028173, - "grad_norm": 7.3125, + "epoch": 5.848787446504993, + "grad_norm": 7.46875, "learning_rate": 7.467111111111112e-05, - "loss": 0.6607, + "loss": 0.7914, "step": 8200 }, { - "epoch": 0.9872381411028173, - "eval/acc": 34.88372039794922, + "epoch": 5.848787446504993, + "eval/acc": 37.20930099487305, "step": 8200 }, { - "epoch": 0.9872381411028173, - "eval_loss": 2.9665606021881104, - "eval_runtime": 0.2143, - "eval_samples_per_second": 200.607, - "eval_steps_per_second": 4.665, + "epoch": 5.848787446504993, + "eval_loss": 2.566337823867798, + "eval_runtime": 0.3566, + "eval_samples_per_second": 120.59, + "eval_steps_per_second": 2.804, "step": 8200 }, { - "epoch": 0.9884420900553816, - "grad_norm": 7.96875, + "epoch": 5.855920114122682, + "grad_norm": 7.03125, "learning_rate": 7.462666666666667e-05, - "loss": 0.7103, + "loss": 0.849, "step": 8210 }, { - "epoch": 0.989646039007946, - "grad_norm": 10.3125, + "epoch": 5.863052781740371, + "grad_norm": 7.5625, "learning_rate": 7.458222222222223e-05, - "loss": 0.5721, + "loss": 0.8066, "step": 8220 }, { - "epoch": 0.9908499879605105, - "grad_norm": 8.5, + "epoch": 5.870185449358059, + "grad_norm": 6.875, "learning_rate": 7.453777777777778e-05, - "loss": 0.7032, + "loss": 0.8556, "step": 8230 }, { - "epoch": 0.9920539369130749, - "grad_norm": 6.21875, + "epoch": 5.877318116975749, + "grad_norm": 8.0, "learning_rate": 7.449333333333334e-05, - "loss": 0.6547, + "loss": 0.9098, "step": 8240 }, { - "epoch": 0.9932578858656393, - "grad_norm": 7.84375, + "epoch": 5.884450784593438, + "grad_norm": 8.375, "learning_rate": 7.444888888888889e-05, - "loss": 0.6587, + "loss": 0.8183, "step": 8250 }, { - "epoch": 0.9944618348182037, - "grad_norm": 6.53125, + "epoch": 5.891583452211127, + "grad_norm": 13.9375, "learning_rate": 7.440444444444444e-05, - "loss": 0.5486, + "loss": 0.8316, "step": 8260 }, { - "epoch": 0.9956657837707681, - "grad_norm": 8.1875, + "epoch": 5.898716119828816, + "grad_norm": 7.25, "learning_rate": 7.436000000000001e-05, - "loss": 0.6284, + "loss": 0.8563, "step": 8270 }, { - "epoch": 0.9968697327233326, - "grad_norm": 7.59375, + "epoch": 5.905848787446505, + "grad_norm": 10.75, "learning_rate": 7.431555555555556e-05, - "loss": 0.7033, + "loss": 0.8473, "step": 8280 }, { - "epoch": 0.9980736816758969, - "grad_norm": 9.0625, + "epoch": 5.912981455064194, + "grad_norm": 14.1875, "learning_rate": 7.427111111111111e-05, - "loss": 0.6621, + "loss": 0.774, "step": 8290 }, { - "epoch": 0.9992776306284613, - "grad_norm": 8.1875, + "epoch": 5.920114122681883, + "grad_norm": 6.8125, "learning_rate": 7.422666666666667e-05, - "loss": 0.6675, + "loss": 0.8783, "step": 8300 }, { - "epoch": 0.9992776306284613, - "eval/acc": 34.30232620239258, + "epoch": 5.920114122681883, + "eval/acc": 34.88372039794922, "step": 8300 }, { - "epoch": 0.9992776306284613, - "eval_loss": 2.9075520038604736, - "eval_runtime": 0.7142, - "eval_samples_per_second": 60.205, - "eval_steps_per_second": 1.4, + "epoch": 5.920114122681883, + "eval_loss": 2.6135735511779785, + "eval_runtime": 0.2367, + "eval_samples_per_second": 181.665, + "eval_steps_per_second": 4.225, "step": 8300 }, { - "epoch": 1.0004815795810258, - "grad_norm": 6.34375, + "epoch": 5.927246790299572, + "grad_norm": 7.5625, "learning_rate": 7.418222222222223e-05, - "loss": 0.6534, + "loss": 0.9057, "step": 8310 }, { - "epoch": 1.0016855285335902, - "grad_norm": 7.5, + "epoch": 5.934379457917261, + "grad_norm": 7.875, "learning_rate": 7.413777777777778e-05, - "loss": 0.5778, + "loss": 0.8854, "step": 8320 }, { - "epoch": 1.0028894774861545, - "grad_norm": 8.25, + "epoch": 5.94151212553495, + "grad_norm": 8.1875, "learning_rate": 7.409333333333333e-05, - "loss": 0.6143, + "loss": 0.8049, "step": 8330 }, { - "epoch": 1.004093426438719, - "grad_norm": 6.40625, + "epoch": 5.948644793152639, + "grad_norm": 6.90625, "learning_rate": 7.404888888888889e-05, - "loss": 0.5399, + "loss": 0.7738, "step": 8340 }, { - "epoch": 1.0052973753912835, - "grad_norm": 8.6875, + "epoch": 5.955777460770328, + "grad_norm": 7.90625, "learning_rate": 7.400444444444444e-05, - "loss": 0.6422, + "loss": 0.8268, "step": 8350 }, { - "epoch": 1.0065013243438479, - "grad_norm": 6.5625, + "epoch": 5.9629101283880175, + "grad_norm": 8.3125, "learning_rate": 7.396e-05, - "loss": 0.5578, + "loss": 0.8336, "step": 8360 }, { - "epoch": 1.0077052732964122, - "grad_norm": 6.15625, + "epoch": 5.970042796005706, + "grad_norm": 7.375, "learning_rate": 7.391555555555557e-05, - "loss": 0.6529, + "loss": 0.8282, "step": 8370 }, { - "epoch": 1.0089092222489766, - "grad_norm": 8.875, + "epoch": 5.977175463623395, + "grad_norm": 6.8125, "learning_rate": 7.387111111111111e-05, - "loss": 0.7195, + "loss": 0.8234, "step": 8380 }, { - "epoch": 1.010113171201541, - "grad_norm": 14.0, + "epoch": 5.984308131241084, + "grad_norm": 7.15625, "learning_rate": 7.382666666666666e-05, - "loss": 0.6301, + "loss": 0.8771, "step": 8390 }, { - "epoch": 1.0113171201541056, - "grad_norm": 7.46875, + "epoch": 5.991440798858774, + "grad_norm": 8.5, "learning_rate": 7.378222222222222e-05, - "loss": 0.6439, + "loss": 0.8572, "step": 8400 }, { - "epoch": 1.0113171201541056, - "eval/acc": 44.1860466003418, + "epoch": 5.991440798858774, + "eval/acc": 34.88372039794922, "step": 8400 }, { - "epoch": 1.0113171201541056, - "eval_loss": 2.8671419620513916, - "eval_runtime": 7.1371, - "eval_samples_per_second": 6.025, - "eval_steps_per_second": 0.14, + "epoch": 5.991440798858774, + "eval_loss": 2.5367989540100098, + "eval_runtime": 0.224, + "eval_samples_per_second": 191.97, + "eval_steps_per_second": 4.464, "step": 8400 }, { - "epoch": 1.01252106910667, - "grad_norm": 11.5625, + "epoch": 5.998573466476462, + "grad_norm": 7.0, "learning_rate": 7.373777777777779e-05, - "loss": 0.618, + "loss": 0.7468, "step": 8410 }, { - "epoch": 1.0137250180592343, - "grad_norm": 6.1875, + "epoch": 6.005706134094151, + "grad_norm": 7.78125, "learning_rate": 7.369333333333333e-05, - "loss": 0.666, + "loss": 0.7882, "step": 8420 }, { - "epoch": 1.0149289670117987, - "grad_norm": 7.34375, + "epoch": 6.01283880171184, + "grad_norm": 9.1875, "learning_rate": 7.364888888888888e-05, - "loss": 0.6237, + "loss": 0.9419, "step": 8430 }, { - "epoch": 1.016132915964363, - "grad_norm": 7.21875, + "epoch": 6.019971469329529, + "grad_norm": 17.625, "learning_rate": 7.360444444444445e-05, - "loss": 0.5974, + "loss": 0.7904, "step": 8440 }, { - "epoch": 1.0173368649169274, - "grad_norm": 8.625, + "epoch": 6.0271041369472185, + "grad_norm": 8.0625, "learning_rate": 7.356000000000001e-05, - "loss": 0.5766, + "loss": 0.8125, "step": 8450 }, { - "epoch": 1.018540813869492, - "grad_norm": 7.71875, + "epoch": 6.034236804564907, + "grad_norm": 7.4375, "learning_rate": 7.351555555555556e-05, - "loss": 0.6754, + "loss": 0.8002, "step": 8460 }, { - "epoch": 1.0197447628220564, - "grad_norm": 6.8125, + "epoch": 6.041369472182597, + "grad_norm": 5.6875, "learning_rate": 7.347111111111112e-05, - "loss": 0.6515, + "loss": 0.7719, "step": 8470 }, { - "epoch": 1.0209487117746208, - "grad_norm": 7.40625, + "epoch": 6.048502139800285, + "grad_norm": 8.9375, "learning_rate": 7.342666666666667e-05, - "loss": 0.6191, + "loss": 0.8122, "step": 8480 }, { - "epoch": 1.0221526607271851, - "grad_norm": 7.34375, + "epoch": 6.055634807417975, + "grad_norm": 9.875, "learning_rate": 7.338222222222223e-05, - "loss": 0.5703, + "loss": 0.8052, "step": 8490 }, { - "epoch": 1.0233566096797495, - "grad_norm": 8.125, + "epoch": 6.062767475035663, + "grad_norm": 9.125, "learning_rate": 7.333777777777778e-05, - "loss": 0.585, + "loss": 0.8171, "step": 8500 }, { - "epoch": 1.0233566096797495, - "eval/acc": 44.1860466003418, + "epoch": 6.062767475035663, + "eval/acc": 46.511627197265625, "step": 8500 }, { - "epoch": 1.0233566096797495, - "eval_loss": 2.8172407150268555, - "eval_runtime": 0.2115, - "eval_samples_per_second": 203.34, - "eval_steps_per_second": 4.729, + "epoch": 6.062767475035663, + "eval_loss": 2.4180805683135986, + "eval_runtime": 1.182, + "eval_samples_per_second": 36.38, + "eval_steps_per_second": 0.846, "step": 8500 }, { - "epoch": 1.0245605586323139, - "grad_norm": 8.375, + "epoch": 6.069900142653353, + "grad_norm": 6.84375, "learning_rate": 7.329333333333334e-05, - "loss": 0.5365, + "loss": 0.9028, "step": 8510 }, { - "epoch": 1.0257645075848785, - "grad_norm": 8.0, + "epoch": 6.077032810271041, + "grad_norm": 23.625, "learning_rate": 7.324888888888889e-05, - "loss": 0.5976, + "loss": 0.8576, "step": 8520 }, { - "epoch": 1.0269684565374428, - "grad_norm": 6.59375, + "epoch": 6.08416547788873, + "grad_norm": 6.96875, "learning_rate": 7.320444444444445e-05, - "loss": 0.6249, + "loss": 0.8407, "step": 8530 }, { - "epoch": 1.0281724054900072, - "grad_norm": 8.625, + "epoch": 6.0912981455064195, + "grad_norm": 8.6875, "learning_rate": 7.316000000000001e-05, - "loss": 0.5953, + "loss": 0.8419, "step": 8540 }, { - "epoch": 1.0293763544425716, - "grad_norm": 4.875, + "epoch": 6.098430813124108, + "grad_norm": 6.90625, "learning_rate": 7.311555555555556e-05, - "loss": 0.5528, + "loss": 0.7802, "step": 8550 }, { - "epoch": 1.030580303395136, - "grad_norm": 5.28125, + "epoch": 6.1055634807417976, + "grad_norm": 6.34375, "learning_rate": 7.307111111111111e-05, - "loss": 0.5181, + "loss": 0.7716, "step": 8560 }, { - "epoch": 1.0317842523477005, - "grad_norm": 9.9375, + "epoch": 6.112696148359486, + "grad_norm": 13.5, "learning_rate": 7.302666666666667e-05, - "loss": 0.5991, + "loss": 0.8538, "step": 8570 }, { - "epoch": 1.032988201300265, - "grad_norm": 5.78125, + "epoch": 6.119828815977176, + "grad_norm": 6.59375, "learning_rate": 7.298222222222223e-05, - "loss": 0.6822, + "loss": 0.6951, "step": 8580 }, { - "epoch": 1.0341921502528293, - "grad_norm": 7.84375, + "epoch": 6.126961483594864, + "grad_norm": 7.0625, "learning_rate": 7.293777777777778e-05, - "loss": 0.671, + "loss": 0.794, "step": 8590 }, { - "epoch": 1.0353960992053937, - "grad_norm": 8.4375, + "epoch": 6.134094151212554, + "grad_norm": 7.15625, "learning_rate": 7.289333333333334e-05, - "loss": 0.6266, + "loss": 0.8058, "step": 8600 }, { - "epoch": 1.0353960992053937, - "eval/acc": 41.27906799316406, + "epoch": 6.134094151212554, + "eval/acc": 46.511627197265625, "step": 8600 }, { - "epoch": 1.0353960992053937, - "eval_loss": 2.89090895652771, - "eval_runtime": 0.2168, - "eval_samples_per_second": 198.358, - "eval_steps_per_second": 4.613, + "epoch": 6.134094151212554, + "eval_loss": 2.5194764137268066, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.736, + "eval_steps_per_second": 4.529, "step": 8600 }, { - "epoch": 1.036600048157958, - "grad_norm": 6.84375, + "epoch": 6.141226818830242, + "grad_norm": 8.0625, "learning_rate": 7.284888888888889e-05, - "loss": 0.5829, + "loss": 0.8754, "step": 8610 }, { - "epoch": 1.0378039971105224, - "grad_norm": 12.6875, + "epoch": 6.148359486447932, + "grad_norm": 4.875, "learning_rate": 7.280444444444445e-05, - "loss": 0.6336, + "loss": 0.7852, "step": 8620 }, { - "epoch": 1.039007946063087, - "grad_norm": 5.78125, + "epoch": 6.1554921540656204, + "grad_norm": 8.0, "learning_rate": 7.276e-05, - "loss": 0.5621, + "loss": 0.8064, "step": 8630 }, { - "epoch": 1.0402118950156514, - "grad_norm": 6.78125, + "epoch": 6.16262482168331, + "grad_norm": 6.3125, "learning_rate": 7.271555555555556e-05, - "loss": 0.5822, + "loss": 0.7643, "step": 8640 }, { - "epoch": 1.0414158439682157, - "grad_norm": 5.40625, + "epoch": 6.1697574893009985, + "grad_norm": 8.875, "learning_rate": 7.267111111111111e-05, - "loss": 0.6402, + "loss": 0.7702, "step": 8650 }, { - "epoch": 1.04261979292078, - "grad_norm": 5.84375, + "epoch": 6.176890156918688, + "grad_norm": 18.5, "learning_rate": 7.262666666666666e-05, - "loss": 0.5793, + "loss": 0.903, "step": 8660 }, { - "epoch": 1.0438237418733445, - "grad_norm": 9.375, + "epoch": 6.184022824536377, + "grad_norm": 9.875, "learning_rate": 7.258222222222224e-05, - "loss": 0.6447, + "loss": 0.788, "step": 8670 }, { - "epoch": 1.045027690825909, - "grad_norm": 8.4375, + "epoch": 6.191155492154065, + "grad_norm": 7.71875, "learning_rate": 7.253777777777778e-05, - "loss": 0.6428, + "loss": 0.7504, "step": 8680 }, { - "epoch": 1.0462316397784734, - "grad_norm": 8.5, + "epoch": 6.198288159771755, + "grad_norm": 7.5, "learning_rate": 7.249333333333333e-05, - "loss": 0.6219, + "loss": 0.8821, "step": 8690 }, { - "epoch": 1.0474355887310378, - "grad_norm": 8.0625, + "epoch": 6.205420827389443, + "grad_norm": 6.71875, "learning_rate": 7.24488888888889e-05, - "loss": 0.5728, + "loss": 0.9166, "step": 8700 }, { - "epoch": 1.0474355887310378, - "eval/acc": 41.86046600341797, + "epoch": 6.205420827389443, + "eval/acc": 48.83720779418945, "step": 8700 }, { - "epoch": 1.0474355887310378, - "eval_loss": 2.881147861480713, - "eval_runtime": 0.2167, - "eval_samples_per_second": 198.476, - "eval_steps_per_second": 4.616, + "epoch": 6.205420827389443, + "eval_loss": 2.488805055618286, + "eval_runtime": 0.2195, + "eval_samples_per_second": 195.91, + "eval_steps_per_second": 4.556, "step": 8700 }, { - "epoch": 1.0486395376836022, - "grad_norm": 7.09375, + "epoch": 6.212553495007133, + "grad_norm": 8.3125, "learning_rate": 7.240444444444446e-05, - "loss": 0.6532, + "loss": 0.7724, "step": 8710 }, { - "epoch": 1.0498434866361666, - "grad_norm": 7.0625, + "epoch": 6.219686162624821, + "grad_norm": 7.84375, "learning_rate": 7.236e-05, - "loss": 0.5758, + "loss": 0.8881, "step": 8720 }, { - "epoch": 1.051047435588731, - "grad_norm": 8.375, + "epoch": 6.226818830242511, + "grad_norm": 7.21875, "learning_rate": 7.231555555555555e-05, - "loss": 0.6071, + "loss": 0.8538, "step": 8730 }, { - "epoch": 1.0522513845412955, - "grad_norm": 7.34375, + "epoch": 6.2339514978601995, + "grad_norm": 7.5, "learning_rate": 7.227111111111112e-05, - "loss": 0.6905, + "loss": 0.8909, "step": 8740 }, { - "epoch": 1.05345533349386, - "grad_norm": 6.59375, + "epoch": 6.241084165477889, + "grad_norm": 7.25, "learning_rate": 7.222666666666666e-05, - "loss": 0.584, + "loss": 0.7965, "step": 8750 }, { - "epoch": 1.0546592824464243, - "grad_norm": 7.4375, + "epoch": 6.248216833095578, + "grad_norm": 7.46875, "learning_rate": 7.218222222222223e-05, - "loss": 0.6222, + "loss": 0.8547, "step": 8760 }, { - "epoch": 1.0558632313989886, - "grad_norm": 7.1875, + "epoch": 6.255349500713267, + "grad_norm": 6.1875, "learning_rate": 7.213777777777779e-05, - "loss": 0.6167, + "loss": 0.7528, "step": 8770 }, { - "epoch": 1.057067180351553, - "grad_norm": 7.875, + "epoch": 6.262482168330956, + "grad_norm": 7.03125, "learning_rate": 7.209333333333334e-05, - "loss": 0.5766, + "loss": 0.8632, "step": 8780 }, { - "epoch": 1.0582711293041176, - "grad_norm": 7.96875, + "epoch": 6.269614835948644, + "grad_norm": 8.375, "learning_rate": 7.204888888888888e-05, - "loss": 0.5747, + "loss": 0.7832, "step": 8790 }, { - "epoch": 1.059475078256682, - "grad_norm": 7.5, + "epoch": 6.276747503566334, + "grad_norm": 8.125, "learning_rate": 7.200444444444445e-05, - "loss": 0.5361, + "loss": 0.7659, "step": 8800 }, { - "epoch": 1.059475078256682, - "eval/acc": 44.1860466003418, + "epoch": 6.276747503566334, + "eval/acc": 48.83720779418945, "step": 8800 }, { - "epoch": 1.059475078256682, - "eval_loss": 2.9378437995910645, - "eval_runtime": 0.2159, - "eval_samples_per_second": 199.123, - "eval_steps_per_second": 4.631, + "epoch": 6.276747503566334, + "eval_loss": 2.4990618228912354, + "eval_runtime": 0.2586, + "eval_samples_per_second": 166.3, + "eval_steps_per_second": 3.867, "step": 8800 }, { - "epoch": 1.0606790272092463, - "grad_norm": 6.46875, + "epoch": 6.283880171184022, + "grad_norm": 7.375, "learning_rate": 7.196000000000001e-05, - "loss": 0.5386, + "loss": 0.7402, "step": 8810 }, { - "epoch": 1.0618829761618107, - "grad_norm": 5.75, + "epoch": 6.291012838801712, + "grad_norm": 7.0, "learning_rate": 7.191555555555556e-05, - "loss": 0.5701, + "loss": 0.8381, "step": 8820 }, { - "epoch": 1.063086925114375, - "grad_norm": 10.5625, + "epoch": 6.2981455064194005, + "grad_norm": 15.75, "learning_rate": 7.18711111111111e-05, - "loss": 0.6061, + "loss": 0.8837, "step": 8830 }, { - "epoch": 1.0642908740669395, - "grad_norm": 6.75, + "epoch": 6.30527817403709, + "grad_norm": 5.46875, "learning_rate": 7.182666666666668e-05, - "loss": 0.6201, + "loss": 0.8638, "step": 8840 }, { - "epoch": 1.065494823019504, - "grad_norm": 9.625, + "epoch": 6.312410841654779, + "grad_norm": 5.46875, "learning_rate": 7.178222222222223e-05, - "loss": 0.6315, + "loss": 0.8348, "step": 8850 }, { - "epoch": 1.0666987719720684, - "grad_norm": 6.15625, + "epoch": 6.319543509272468, + "grad_norm": 7.9375, "learning_rate": 7.173777777777778e-05, - "loss": 0.6142, + "loss": 0.8598, "step": 8860 }, { - "epoch": 1.0679027209246328, - "grad_norm": 8.875, + "epoch": 6.326676176890157, + "grad_norm": 7.15625, "learning_rate": 7.169333333333334e-05, - "loss": 0.6545, + "loss": 0.8124, "step": 8870 }, { - "epoch": 1.0691066698771972, - "grad_norm": 6.5, + "epoch": 6.333808844507846, + "grad_norm": 6.28125, "learning_rate": 7.164888888888889e-05, - "loss": 0.6305, + "loss": 0.8184, "step": 8880 }, { - "epoch": 1.0703106188297615, - "grad_norm": 12.5, + "epoch": 6.340941512125535, + "grad_norm": 7.25, "learning_rate": 7.160444444444445e-05, - "loss": 0.6451, + "loss": 0.8522, "step": 8890 }, { - "epoch": 1.0715145677823261, - "grad_norm": 6.28125, + "epoch": 6.348074179743224, + "grad_norm": 8.4375, "learning_rate": 7.156e-05, - "loss": 0.5406, + "loss": 0.894, "step": 8900 }, { - "epoch": 1.0715145677823261, + "epoch": 6.348074179743224, "eval/acc": 46.511627197265625, "step": 8900 }, { - "epoch": 1.0715145677823261, - "eval_loss": 2.895603656768799, - "eval_runtime": 0.2115, - "eval_samples_per_second": 203.356, - "eval_steps_per_second": 4.729, + "epoch": 6.348074179743224, + "eval_loss": 2.4536728858947754, + "eval_runtime": 0.2168, + "eval_samples_per_second": 198.347, + "eval_steps_per_second": 4.613, "step": 8900 }, { - "epoch": 1.0727185167348905, - "grad_norm": 5.8125, + "epoch": 6.355206847360913, + "grad_norm": 8.3125, "learning_rate": 7.151555555555556e-05, - "loss": 0.6598, + "loss": 0.8331, "step": 8910 }, { - "epoch": 1.0739224656874549, - "grad_norm": 6.9375, + "epoch": 6.362339514978602, + "grad_norm": 13.1875, "learning_rate": 7.147111111111111e-05, - "loss": 0.5678, + "loss": 0.8107, "step": 8920 }, { - "epoch": 1.0751264146400192, - "grad_norm": 7.25, + "epoch": 6.369472182596291, + "grad_norm": 7.0, "learning_rate": 7.142666666666667e-05, - "loss": 0.6032, + "loss": 0.9504, "step": 8930 }, { - "epoch": 1.0763303635925836, - "grad_norm": 7.09375, + "epoch": 6.37660485021398, + "grad_norm": 9.5625, "learning_rate": 7.138222222222223e-05, - "loss": 0.5432, + "loss": 0.766, "step": 8940 }, { - "epoch": 1.077534312545148, - "grad_norm": 8.6875, + "epoch": 6.383737517831669, + "grad_norm": 13.4375, "learning_rate": 7.133777777777778e-05, - "loss": 0.6408, + "loss": 0.7923, "step": 8950 }, { - "epoch": 1.0787382614977126, - "grad_norm": 8.1875, + "epoch": 6.390870185449358, + "grad_norm": 6.6875, "learning_rate": 7.129333333333333e-05, - "loss": 0.5834, + "loss": 0.7777, "step": 8960 }, { - "epoch": 1.079942210450277, - "grad_norm": 8.75, + "epoch": 6.398002853067047, + "grad_norm": 6.09375, "learning_rate": 7.124888888888889e-05, - "loss": 0.5956, + "loss": 0.7729, "step": 8970 }, { - "epoch": 1.0811461594028413, - "grad_norm": 6.90625, + "epoch": 6.405135520684736, + "grad_norm": 6.46875, "learning_rate": 7.120444444444445e-05, - "loss": 0.6124, + "loss": 0.8118, "step": 8980 }, { - "epoch": 1.0823501083554057, - "grad_norm": 9.5625, + "epoch": 6.412268188302425, + "grad_norm": 6.21875, "learning_rate": 7.116e-05, - "loss": 0.6513, + "loss": 0.9006, "step": 8990 }, { - "epoch": 1.08355405730797, - "grad_norm": 8.0, + "epoch": 6.419400855920114, + "grad_norm": 6.5625, "learning_rate": 7.111555555555555e-05, - "loss": 0.6044, + "loss": 0.7092, "step": 9000 }, { - "epoch": 1.08355405730797, + "epoch": 6.419400855920114, "eval/acc": 44.1860466003418, "step": 9000 }, { - "epoch": 1.08355405730797, - "eval_loss": 2.894747257232666, - "eval_runtime": 0.2236, - "eval_samples_per_second": 192.288, - "eval_steps_per_second": 4.472, + "epoch": 6.419400855920114, + "eval_loss": 2.533996343612671, + "eval_runtime": 0.3418, + "eval_samples_per_second": 125.802, + "eval_steps_per_second": 2.926, "step": 9000 }, { - "epoch": 1.0847580062605346, - "grad_norm": 7.1875, + "epoch": 6.426533523537803, + "grad_norm": 7.59375, "learning_rate": 7.107111111111111e-05, - "loss": 0.4939, + "loss": 0.7684, "step": 9010 }, { - "epoch": 1.085961955213099, - "grad_norm": 8.25, + "epoch": 6.433666191155492, + "grad_norm": 6.8125, "learning_rate": 7.102666666666668e-05, - "loss": 0.7751, + "loss": 0.7654, "step": 9020 }, { - "epoch": 1.0871659041656634, - "grad_norm": 6.875, + "epoch": 6.4407988587731815, + "grad_norm": 7.5625, "learning_rate": 7.098222222222222e-05, - "loss": 0.593, + "loss": 0.8404, "step": 9030 }, { - "epoch": 1.0883698531182278, - "grad_norm": 7.5625, + "epoch": 6.44793152639087, + "grad_norm": 8.5, "learning_rate": 7.093777777777779e-05, - "loss": 0.587, + "loss": 0.8519, "step": 9040 }, { - "epoch": 1.0895738020707921, - "grad_norm": 9.5625, + "epoch": 6.45506419400856, + "grad_norm": 6.53125, "learning_rate": 7.089333333333333e-05, - "loss": 0.639, + "loss": 0.8487, "step": 9050 }, { - "epoch": 1.0907777510233565, - "grad_norm": 8.25, + "epoch": 6.462196861626248, + "grad_norm": 7.59375, "learning_rate": 7.084888888888888e-05, - "loss": 0.6537, + "loss": 0.8695, "step": 9060 }, { - "epoch": 1.091981699975921, - "grad_norm": 9.9375, + "epoch": 6.469329529243938, + "grad_norm": 8.4375, "learning_rate": 7.080444444444444e-05, - "loss": 0.6134, + "loss": 0.7864, "step": 9070 }, { - "epoch": 1.0931856489284855, - "grad_norm": 9.375, + "epoch": 6.476462196861626, + "grad_norm": 66.5, "learning_rate": 7.076000000000001e-05, - "loss": 0.5259, + "loss": 0.7726, "step": 9080 }, { - "epoch": 1.0943895978810498, - "grad_norm": 7.90625, + "epoch": 6.483594864479315, + "grad_norm": 6.96875, "learning_rate": 7.071555555555556e-05, - "loss": 0.7362, + "loss": 0.7832, "step": 9090 }, { - "epoch": 1.0955935468336142, - "grad_norm": 7.46875, + "epoch": 6.490727532097004, + "grad_norm": 7.40625, "learning_rate": 7.06711111111111e-05, - "loss": 0.6197, + "loss": 0.8063, "step": 9100 }, { - "epoch": 1.0955935468336142, - "eval/acc": 41.86046600341797, + "epoch": 6.490727532097004, + "eval/acc": 44.1860466003418, "step": 9100 }, { - "epoch": 1.0955935468336142, - "eval_loss": 2.920775890350342, - "eval_runtime": 0.2089, - "eval_samples_per_second": 205.889, - "eval_steps_per_second": 4.788, + "epoch": 6.490727532097004, + "eval_loss": 2.5438809394836426, + "eval_runtime": 0.2154, + "eval_samples_per_second": 199.594, + "eval_steps_per_second": 4.642, "step": 9100 }, { - "epoch": 1.0967974957861786, - "grad_norm": 9.6875, + "epoch": 6.497860199714693, + "grad_norm": 7.21875, "learning_rate": 7.062666666666668e-05, - "loss": 0.5682, + "loss": 0.7605, "step": 9110 }, { - "epoch": 1.0980014447387432, - "grad_norm": 6.5625, + "epoch": 6.5049928673323825, + "grad_norm": 7.90625, "learning_rate": 7.058222222222223e-05, - "loss": 0.572, + "loss": 0.8032, "step": 9120 }, { - "epoch": 1.0992053936913075, - "grad_norm": 7.5, + "epoch": 6.512125534950071, + "grad_norm": 6.9375, "learning_rate": 7.053777777777778e-05, - "loss": 0.5307, + "loss": 0.743, "step": 9130 }, { - "epoch": 1.100409342643872, - "grad_norm": 7.75, + "epoch": 6.519258202567761, + "grad_norm": 5.65625, "learning_rate": 7.049333333333334e-05, - "loss": 0.5432, + "loss": 0.8261, "step": 9140 }, { - "epoch": 1.1016132915964363, - "grad_norm": 6.84375, + "epoch": 6.526390870185449, + "grad_norm": 7.03125, "learning_rate": 7.04488888888889e-05, - "loss": 0.6012, + "loss": 0.8099, "step": 9150 }, { - "epoch": 1.1028172405490007, - "grad_norm": 6.84375, + "epoch": 6.533523537803139, + "grad_norm": 7.15625, "learning_rate": 7.040444444444445e-05, - "loss": 0.5776, + "loss": 0.817, "step": 9160 }, { - "epoch": 1.104021189501565, - "grad_norm": 8.0625, + "epoch": 6.540656205420827, + "grad_norm": 11.625, "learning_rate": 7.036e-05, - "loss": 0.5353, + "loss": 0.782, "step": 9170 }, { - "epoch": 1.1052251384541296, - "grad_norm": 5.65625, + "epoch": 6.547788873038517, + "grad_norm": 7.5625, "learning_rate": 7.031555555555556e-05, - "loss": 0.5664, + "loss": 0.8145, "step": 9180 }, { - "epoch": 1.106429087406694, - "grad_norm": 14.0, + "epoch": 6.554921540656205, + "grad_norm": 7.5625, "learning_rate": 7.027111111111111e-05, - "loss": 0.6547, + "loss": 0.8822, "step": 9190 }, { - "epoch": 1.1076330363592584, - "grad_norm": 7.9375, + "epoch": 6.562054208273894, + "grad_norm": 6.53125, "learning_rate": 7.022666666666667e-05, - "loss": 0.6063, + "loss": 0.8132, "step": 9200 }, { - "epoch": 1.1076330363592584, - "eval/acc": 41.86046600341797, + "epoch": 6.562054208273894, + "eval/acc": 44.1860466003418, "step": 9200 }, { - "epoch": 1.1076330363592584, - "eval_loss": 2.9192073345184326, - "eval_runtime": 0.2086, - "eval_samples_per_second": 206.15, - "eval_steps_per_second": 4.794, + "epoch": 6.562054208273894, + "eval_loss": 2.528564929962158, + "eval_runtime": 0.2169, + "eval_samples_per_second": 198.28, + "eval_steps_per_second": 4.611, "step": 9200 }, { - "epoch": 1.1088369853118227, - "grad_norm": 11.5625, + "epoch": 6.5691868758915835, + "grad_norm": 7.21875, "learning_rate": 7.018222222222223e-05, - "loss": 0.6251, + "loss": 0.7858, "step": 9210 }, { - "epoch": 1.110040934264387, - "grad_norm": 7.34375, + "epoch": 6.576319543509273, + "grad_norm": 6.4375, "learning_rate": 7.013777777777778e-05, - "loss": 0.5408, + "loss": 0.7098, "step": 9220 }, { - "epoch": 1.1112448832169517, - "grad_norm": 6.75, + "epoch": 6.5834522111269616, + "grad_norm": 7.125, "learning_rate": 7.009333333333333e-05, - "loss": 0.6217, + "loss": 0.8362, "step": 9230 }, { - "epoch": 1.112448832169516, - "grad_norm": 8.375, + "epoch": 6.59058487874465, + "grad_norm": 5.78125, "learning_rate": 7.004888888888889e-05, - "loss": 0.6792, + "loss": 0.7737, "step": 9240 }, { - "epoch": 1.1136527811220804, - "grad_norm": 8.375, + "epoch": 6.59771754636234, + "grad_norm": 9.0625, "learning_rate": 7.000444444444445e-05, - "loss": 0.5786, + "loss": 0.857, "step": 9250 }, { - "epoch": 1.1148567300746448, - "grad_norm": 11.0, + "epoch": 6.604850213980028, + "grad_norm": 9.125, "learning_rate": 6.996e-05, - "loss": 0.6588, + "loss": 0.7562, "step": 9260 }, { - "epoch": 1.1160606790272092, - "grad_norm": 6.75, + "epoch": 6.611982881597718, + "grad_norm": 8.3125, "learning_rate": 6.991555555555556e-05, - "loss": 0.6016, + "loss": 0.8619, "step": 9270 }, { - "epoch": 1.1172646279797736, - "grad_norm": 9.1875, + "epoch": 6.619115549215406, + "grad_norm": 6.78125, "learning_rate": 6.987111111111111e-05, - "loss": 0.5728, + "loss": 0.7212, "step": 9280 }, { - "epoch": 1.1184685769323381, - "grad_norm": 8.0625, + "epoch": 6.626248216833096, + "grad_norm": 26.125, "learning_rate": 6.982666666666667e-05, - "loss": 0.669, + "loss": 0.951, "step": 9290 }, { - "epoch": 1.1196725258849025, - "grad_norm": 9.6875, + "epoch": 6.633380884450784, + "grad_norm": 7.03125, "learning_rate": 6.978222222222222e-05, - "loss": 0.625, + "loss": 0.7791, "step": 9300 }, { - "epoch": 1.1196725258849025, + "epoch": 6.633380884450784, "eval/acc": 44.1860466003418, "step": 9300 }, { - "epoch": 1.1196725258849025, - "eval_loss": 2.8807859420776367, - "eval_runtime": 0.2269, - "eval_samples_per_second": 189.503, - "eval_steps_per_second": 4.407, + "epoch": 6.633380884450784, + "eval_loss": 2.587022304534912, + "eval_runtime": 0.2175, + "eval_samples_per_second": 197.663, + "eval_steps_per_second": 4.597, "step": 9300 }, { - "epoch": 1.1208764748374669, - "grad_norm": 7.09375, + "epoch": 6.640513552068474, + "grad_norm": 6.6875, "learning_rate": 6.973777777777778e-05, - "loss": 0.5112, + "loss": 0.8082, "step": 9310 }, { - "epoch": 1.1220804237900313, - "grad_norm": 19.375, + "epoch": 6.6476462196861625, + "grad_norm": 7.625, "learning_rate": 6.969333333333333e-05, - "loss": 0.7337, + "loss": 0.6863, "step": 9320 }, { - "epoch": 1.1232843727425956, - "grad_norm": 8.25, + "epoch": 6.654778887303852, + "grad_norm": 8.625, "learning_rate": 6.96488888888889e-05, - "loss": 0.6687, + "loss": 0.7921, "step": 9330 }, { - "epoch": 1.1244883216951602, - "grad_norm": 8.125, + "epoch": 6.661911554921541, + "grad_norm": 6.5, "learning_rate": 6.960444444444446e-05, - "loss": 0.5604, + "loss": 0.7762, "step": 9340 }, { - "epoch": 1.1256922706477246, - "grad_norm": 9.1875, + "epoch": 6.669044222539229, + "grad_norm": 12.6875, "learning_rate": 6.956e-05, - "loss": 0.6999, + "loss": 0.7977, "step": 9350 }, { - "epoch": 1.126896219600289, - "grad_norm": 8.5, + "epoch": 6.676176890156919, + "grad_norm": 6.84375, "learning_rate": 6.951555555555555e-05, - "loss": 0.5909, + "loss": 0.907, "step": 9360 }, { - "epoch": 1.1281001685528533, - "grad_norm": 7.21875, + "epoch": 6.683309557774607, + "grad_norm": 7.15625, "learning_rate": 6.947111111111112e-05, - "loss": 0.5857, + "loss": 0.792, "step": 9370 }, { - "epoch": 1.1293041175054177, - "grad_norm": 6.84375, + "epoch": 6.690442225392297, + "grad_norm": 8.5, "learning_rate": 6.942666666666668e-05, - "loss": 0.5965, + "loss": 0.7838, "step": 9380 }, { - "epoch": 1.130508066457982, - "grad_norm": 6.59375, + "epoch": 6.697574893009985, + "grad_norm": 8.1875, "learning_rate": 6.938222222222223e-05, - "loss": 0.6098, + "loss": 0.8141, "step": 9390 }, { - "epoch": 1.1317120154105467, - "grad_norm": 9.3125, + "epoch": 6.704707560627675, + "grad_norm": 7.875, "learning_rate": 6.933777777777777e-05, - "loss": 0.5917, + "loss": 0.8348, "step": 9400 }, { - "epoch": 1.1317120154105467, - "eval/acc": 44.1860466003418, + "epoch": 6.704707560627675, + "eval/acc": 39.53488540649414, "step": 9400 }, { - "epoch": 1.1317120154105467, - "eval_loss": 2.906259536743164, - "eval_runtime": 0.2551, - "eval_samples_per_second": 168.571, - "eval_steps_per_second": 3.92, + "epoch": 6.704707560627675, + "eval_loss": 2.6398463249206543, + "eval_runtime": 0.2181, + "eval_samples_per_second": 197.145, + "eval_steps_per_second": 4.585, "step": 9400 }, { - "epoch": 1.132915964363111, - "grad_norm": 8.3125, + "epoch": 6.7118402282453635, + "grad_norm": 6.625, "learning_rate": 6.929333333333334e-05, - "loss": 0.5629, + "loss": 0.889, "step": 9410 }, { - "epoch": 1.1341199133156754, - "grad_norm": 5.21875, + "epoch": 6.718972895863053, + "grad_norm": 7.3125, "learning_rate": 6.92488888888889e-05, - "loss": 0.4913, + "loss": 0.7913, "step": 9420 }, { - "epoch": 1.1353238622682398, - "grad_norm": 7.5625, + "epoch": 6.726105563480742, + "grad_norm": 10.875, "learning_rate": 6.920444444444445e-05, - "loss": 0.5868, + "loss": 0.8099, "step": 9430 }, { - "epoch": 1.1365278112208042, - "grad_norm": 8.8125, + "epoch": 6.733238231098431, + "grad_norm": 23.75, "learning_rate": 6.916000000000001e-05, - "loss": 0.6205, + "loss": 0.7098, "step": 9440 }, { - "epoch": 1.1377317601733687, - "grad_norm": 6.78125, + "epoch": 6.74037089871612, + "grad_norm": 6.625, "learning_rate": 6.911555555555556e-05, - "loss": 0.6569, + "loss": 0.7859, "step": 9450 }, { - "epoch": 1.1389357091259331, - "grad_norm": 7.9375, + "epoch": 6.747503566333809, + "grad_norm": 5.875, "learning_rate": 6.907111111111112e-05, - "loss": 0.5849, + "loss": 0.7947, "step": 9460 }, { - "epoch": 1.1401396580784975, - "grad_norm": 8.6875, + "epoch": 6.754636233951498, + "grad_norm": 7.25, "learning_rate": 6.902666666666667e-05, - "loss": 0.5997, + "loss": 0.927, "step": 9470 }, { - "epoch": 1.1413436070310619, - "grad_norm": 12.75, + "epoch": 6.761768901569187, + "grad_norm": 12.875, "learning_rate": 6.898222222222223e-05, - "loss": 0.6568, + "loss": 0.8474, "step": 9480 }, { - "epoch": 1.1425475559836262, - "grad_norm": 7.6875, + "epoch": 6.768901569186876, + "grad_norm": 6.8125, "learning_rate": 6.893777777777778e-05, - "loss": 0.6542, + "loss": 0.848, "step": 9490 }, { - "epoch": 1.1437515049361906, - "grad_norm": 6.59375, + "epoch": 6.7760342368045645, + "grad_norm": 7.96875, "learning_rate": 6.889333333333333e-05, - "loss": 0.4745, + "loss": 0.8081, "step": 9500 }, { - "epoch": 1.1437515049361906, - "eval/acc": 48.83720779418945, + "epoch": 6.7760342368045645, + "eval/acc": 41.86046600341797, "step": 9500 }, { - "epoch": 1.1437515049361906, - "eval_loss": 2.8177154064178467, - "eval_runtime": 0.2171, - "eval_samples_per_second": 198.092, - "eval_steps_per_second": 4.607, + "epoch": 6.7760342368045645, + "eval_loss": 2.6681759357452393, + "eval_runtime": 0.2247, + "eval_samples_per_second": 191.372, + "eval_steps_per_second": 4.451, "step": 9500 }, { - "epoch": 1.1449554538887552, - "grad_norm": 6.53125, + "epoch": 6.783166904422254, + "grad_norm": 9.0625, "learning_rate": 6.88488888888889e-05, - "loss": 0.664, + "loss": 0.8836, "step": 9510 }, { - "epoch": 1.1461594028413196, - "grad_norm": 7.6875, + "epoch": 6.790299572039943, + "grad_norm": 15.875, "learning_rate": 6.880444444444445e-05, - "loss": 0.5778, + "loss": 0.8696, "step": 9520 }, { - "epoch": 1.147363351793884, - "grad_norm": 6.84375, + "epoch": 6.797432239657632, + "grad_norm": 10.75, "learning_rate": 6.876e-05, - "loss": 0.6596, + "loss": 0.844, "step": 9530 }, { - "epoch": 1.1485673007464483, - "grad_norm": 8.75, + "epoch": 6.804564907275321, + "grad_norm": 23.875, "learning_rate": 6.871555555555556e-05, - "loss": 0.6422, + "loss": 0.823, "step": 9540 }, { - "epoch": 1.1497712496990127, - "grad_norm": 6.5625, + "epoch": 6.81169757489301, + "grad_norm": 7.75, "learning_rate": 6.867111111111112e-05, - "loss": 0.5794, + "loss": 0.8875, "step": 9550 }, { - "epoch": 1.1509751986515773, - "grad_norm": 8.625, + "epoch": 6.818830242510699, + "grad_norm": 6.46875, "learning_rate": 6.862666666666667e-05, - "loss": 0.6171, + "loss": 0.7703, "step": 9560 }, { - "epoch": 1.1521791476041416, - "grad_norm": 6.875, + "epoch": 6.825962910128388, + "grad_norm": 6.375, "learning_rate": 6.858222222222222e-05, - "loss": 0.58, + "loss": 0.8, "step": 9570 }, { - "epoch": 1.153383096556706, - "grad_norm": 14.375, + "epoch": 6.833095577746077, + "grad_norm": 7.96875, "learning_rate": 6.853777777777778e-05, - "loss": 0.6651, + "loss": 0.8139, "step": 9580 }, { - "epoch": 1.1545870455092704, - "grad_norm": 6.65625, + "epoch": 6.840228245363766, + "grad_norm": 11.625, "learning_rate": 6.849333333333333e-05, - "loss": 0.621, + "loss": 0.8042, "step": 9590 }, { - "epoch": 1.1557909944618348, - "grad_norm": 8.25, + "epoch": 6.847360912981455, + "grad_norm": 10.8125, "learning_rate": 6.844888888888889e-05, - "loss": 0.6578, + "loss": 0.8403, "step": 9600 }, { - "epoch": 1.1557909944618348, + "epoch": 6.847360912981455, "eval/acc": 44.1860466003418, "step": 9600 }, { - "epoch": 1.1557909944618348, - "eval_loss": 2.841442108154297, - "eval_runtime": 0.2144, - "eval_samples_per_second": 200.545, - "eval_steps_per_second": 4.664, + "epoch": 6.847360912981455, + "eval_loss": 2.6575427055358887, + "eval_runtime": 0.2186, + "eval_samples_per_second": 196.745, + "eval_steps_per_second": 4.575, "step": 9600 }, { - "epoch": 1.1569949434143991, - "grad_norm": 6.09375, + "epoch": 6.854493580599144, + "grad_norm": 14.6875, "learning_rate": 6.840444444444445e-05, - "loss": 0.5215, + "loss": 0.8426, "step": 9610 }, { - "epoch": 1.1581988923669637, - "grad_norm": 9.1875, + "epoch": 6.861626248216833, + "grad_norm": 7.84375, "learning_rate": 6.836e-05, - "loss": 0.6458, + "loss": 0.8874, "step": 9620 }, { - "epoch": 1.159402841319528, - "grad_norm": 8.8125, + "epoch": 6.868758915834523, + "grad_norm": 8.9375, "learning_rate": 6.831555555555555e-05, - "loss": 0.6037, + "loss": 0.78, "step": 9630 }, { - "epoch": 1.1606067902720925, - "grad_norm": 7.0, + "epoch": 6.875891583452211, + "grad_norm": 6.1875, "learning_rate": 6.827111111111111e-05, - "loss": 0.5408, + "loss": 0.7788, "step": 9640 }, { - "epoch": 1.1618107392246568, - "grad_norm": 5.78125, + "epoch": 6.8830242510699, + "grad_norm": 6.34375, "learning_rate": 6.822666666666668e-05, - "loss": 0.5832, + "loss": 0.7385, "step": 9650 }, { - "epoch": 1.1630146881772212, - "grad_norm": 6.84375, + "epoch": 6.890156918687589, + "grad_norm": 7.59375, "learning_rate": 6.818222222222222e-05, - "loss": 0.5802, + "loss": 0.8938, "step": 9660 }, { - "epoch": 1.1642186371297858, - "grad_norm": 5.75, + "epoch": 6.897289586305278, + "grad_norm": 10.8125, "learning_rate": 6.813777777777777e-05, - "loss": 0.5377, + "loss": 0.8154, "step": 9670 }, { - "epoch": 1.1654225860823502, - "grad_norm": 7.5625, + "epoch": 6.904422253922967, + "grad_norm": 6.90625, "learning_rate": 6.809333333333333e-05, - "loss": 0.5657, + "loss": 0.9273, "step": 9680 }, { - "epoch": 1.1666265350349145, - "grad_norm": 6.15625, + "epoch": 6.911554921540656, + "grad_norm": 8.3125, "learning_rate": 6.80488888888889e-05, - "loss": 0.5107, + "loss": 0.8595, "step": 9690 }, { - "epoch": 1.167830483987479, - "grad_norm": 5.28125, + "epoch": 6.9186875891583455, + "grad_norm": 10.75, "learning_rate": 6.800444444444444e-05, - "loss": 0.5898, + "loss": 0.8569, "step": 9700 }, { - "epoch": 1.167830483987479, - "eval/acc": 46.511627197265625, + "epoch": 6.9186875891583455, + "eval/acc": 39.53488540649414, "step": 9700 }, { - "epoch": 1.167830483987479, - "eval_loss": 2.8665220737457275, - "eval_runtime": 0.211, - "eval_samples_per_second": 203.771, - "eval_steps_per_second": 4.739, + "epoch": 6.9186875891583455, + "eval_loss": 2.6524884700775146, + "eval_runtime": 0.215, + "eval_samples_per_second": 200.025, + "eval_steps_per_second": 4.652, "step": 9700 }, { - "epoch": 1.1690344329400433, - "grad_norm": 7.0, + "epoch": 6.925820256776034, + "grad_norm": 7.90625, "learning_rate": 6.796e-05, - "loss": 0.6016, + "loss": 0.7726, "step": 9710 }, { - "epoch": 1.1702383818926076, - "grad_norm": 7.0, + "epoch": 6.932952924393724, + "grad_norm": 7.71875, "learning_rate": 6.791555555555556e-05, - "loss": 0.6048, + "loss": 0.789, "step": 9720 }, { - "epoch": 1.1714423308451722, - "grad_norm": 7.21875, + "epoch": 6.940085592011412, + "grad_norm": 7.4375, "learning_rate": 6.787111111111112e-05, - "loss": 0.5315, + "loss": 0.7525, "step": 9730 }, { - "epoch": 1.1726462797977366, - "grad_norm": 6.53125, + "epoch": 6.947218259629102, + "grad_norm": 6.96875, "learning_rate": 6.782666666666667e-05, - "loss": 0.5033, + "loss": 0.8183, "step": 9740 }, { - "epoch": 1.173850228750301, - "grad_norm": 6.34375, + "epoch": 6.95435092724679, + "grad_norm": 6.5625, "learning_rate": 6.778222222222223e-05, - "loss": 0.5615, + "loss": 0.8713, "step": 9750 }, { - "epoch": 1.1750541777028654, - "grad_norm": 6.34375, + "epoch": 6.961483594864479, + "grad_norm": 6.59375, "learning_rate": 6.773777777777778e-05, - "loss": 0.5494, + "loss": 0.8089, "step": 9760 }, { - "epoch": 1.1762581266554297, - "grad_norm": 7.3125, + "epoch": 6.968616262482168, + "grad_norm": 7.46875, "learning_rate": 6.769333333333334e-05, - "loss": 0.6047, + "loss": 0.8173, "step": 9770 }, { - "epoch": 1.1774620756079943, - "grad_norm": 6.53125, + "epoch": 6.975748930099857, + "grad_norm": 8.75, "learning_rate": 6.76488888888889e-05, - "loss": 0.6653, + "loss": 0.8359, "step": 9780 }, { - "epoch": 1.1786660245605587, - "grad_norm": 21.75, + "epoch": 6.9828815977175465, + "grad_norm": 6.96875, "learning_rate": 6.760444444444445e-05, - "loss": 0.5944, + "loss": 0.7308, "step": 9790 }, { - "epoch": 1.179869973513123, - "grad_norm": 17.25, + "epoch": 6.990014265335235, + "grad_norm": 8.6875, "learning_rate": 6.756e-05, - "loss": 0.6511, + "loss": 0.7651, "step": 9800 }, { - "epoch": 1.179869973513123, - "eval/acc": 46.511627197265625, + "epoch": 6.990014265335235, + "eval/acc": 44.1860466003418, "step": 9800 }, { - "epoch": 1.179869973513123, - "eval_loss": 2.8695812225341797, - "eval_runtime": 0.2142, - "eval_samples_per_second": 200.781, - "eval_steps_per_second": 4.669, + "epoch": 6.990014265335235, + "eval_loss": 2.581909418106079, + "eval_runtime": 0.217, + "eval_samples_per_second": 198.162, + "eval_steps_per_second": 4.608, "step": 9800 }, { - "epoch": 1.1810739224656874, - "grad_norm": 5.3125, + "epoch": 6.997146932952925, + "grad_norm": 7.6875, "learning_rate": 6.751555555555556e-05, - "loss": 0.6008, + "loss": 0.8653, "step": 9810 }, { - "epoch": 1.1822778714182518, - "grad_norm": 9.4375, + "epoch": 7.004279600570613, + "grad_norm": 8.5, "learning_rate": 6.747111111111112e-05, - "loss": 0.5898, + "loss": 0.8445, "step": 9820 }, { - "epoch": 1.1834818203708162, - "grad_norm": 6.6875, + "epoch": 7.011412268188303, + "grad_norm": 6.375, "learning_rate": 6.742666666666667e-05, - "loss": 0.5976, + "loss": 0.7759, "step": 9830 }, { - "epoch": 1.1846857693233808, - "grad_norm": 7.875, + "epoch": 7.018544935805991, + "grad_norm": 6.375, "learning_rate": 6.738222222222222e-05, - "loss": 0.5604, + "loss": 0.7709, "step": 9840 }, { - "epoch": 1.1858897182759451, - "grad_norm": 6.0625, + "epoch": 7.025677603423681, + "grad_norm": 7.8125, "learning_rate": 6.733777777777778e-05, - "loss": 0.736, + "loss": 0.768, "step": 9850 }, { - "epoch": 1.1870936672285095, - "grad_norm": 8.125, + "epoch": 7.032810271041369, + "grad_norm": 8.4375, "learning_rate": 6.729333333333334e-05, - "loss": 0.5235, + "loss": 0.8725, "step": 9860 }, { - "epoch": 1.1882976161810739, - "grad_norm": 6.46875, + "epoch": 7.039942938659059, + "grad_norm": 7.8125, "learning_rate": 6.724888888888889e-05, - "loss": 0.5716, + "loss": 0.8146, "step": 9870 }, { - "epoch": 1.1895015651336383, - "grad_norm": 6.21875, + "epoch": 7.0470756062767475, + "grad_norm": 70.0, "learning_rate": 6.720444444444445e-05, - "loss": 0.5337, + "loss": 0.8137, "step": 9880 }, { - "epoch": 1.1907055140862028, - "grad_norm": 7.28125, + "epoch": 7.054208273894437, + "grad_norm": 7.03125, "learning_rate": 6.716e-05, - "loss": 0.5203, + "loss": 0.8025, "step": 9890 }, { - "epoch": 1.1919094630387672, - "grad_norm": 8.1875, + "epoch": 7.0613409415121255, + "grad_norm": 7.15625, "learning_rate": 6.711555555555555e-05, - "loss": 0.5532, + "loss": 0.8237, "step": 9900 }, { - "epoch": 1.1919094630387672, - "eval/acc": 46.511627197265625, + "epoch": 7.0613409415121255, + "eval/acc": 62.79069900512695, "step": 9900 }, { - "epoch": 1.1919094630387672, - "eval_loss": 2.864424705505371, - "eval_runtime": 0.2199, - "eval_samples_per_second": 195.51, - "eval_steps_per_second": 4.547, + "epoch": 7.0613409415121255, + "eval_loss": 2.023484706878662, + "eval_runtime": 1.3641, + "eval_samples_per_second": 31.524, + "eval_steps_per_second": 0.733, "step": 9900 }, { - "epoch": 1.1931134119913316, - "grad_norm": 8.5625, + "epoch": 7.068473609129814, + "grad_norm": 10.375, "learning_rate": 6.707111111111111e-05, - "loss": 0.585, + "loss": 0.7141, "step": 9910 }, { - "epoch": 1.194317360943896, - "grad_norm": 8.25, + "epoch": 7.075606276747504, + "grad_norm": 9.25, "learning_rate": 6.702666666666667e-05, - "loss": 0.6533, + "loss": 0.7963, "step": 9920 }, { - "epoch": 1.1955213098964603, - "grad_norm": 8.8125, + "epoch": 7.082738944365192, + "grad_norm": 7.375, "learning_rate": 6.698222222222222e-05, - "loss": 0.5962, + "loss": 0.7935, "step": 9930 }, { - "epoch": 1.1967252588490247, - "grad_norm": 13.0625, + "epoch": 7.089871611982882, + "grad_norm": 6.8125, "learning_rate": 6.693777777777778e-05, - "loss": 0.6169, + "loss": 0.7882, "step": 9940 }, { - "epoch": 1.1979292078015893, - "grad_norm": 7.5625, + "epoch": 7.09700427960057, + "grad_norm": 7.0625, "learning_rate": 6.689333333333335e-05, - "loss": 0.5756, + "loss": 0.7698, "step": 9950 }, { - "epoch": 1.1991331567541537, - "grad_norm": 6.03125, + "epoch": 7.10413694721826, + "grad_norm": 6.9375, "learning_rate": 6.68488888888889e-05, - "loss": 0.5746, + "loss": 0.8595, "step": 9960 }, { - "epoch": 1.200337105706718, - "grad_norm": 4.875, + "epoch": 7.111269614835948, + "grad_norm": 9.5, "learning_rate": 6.680444444444444e-05, - "loss": 0.6586, + "loss": 0.8158, "step": 9970 }, { - "epoch": 1.2015410546592824, - "grad_norm": 7.375, + "epoch": 7.118402282453638, + "grad_norm": 8.375, "learning_rate": 6.676e-05, - "loss": 0.6928, + "loss": 0.7916, "step": 9980 }, { - "epoch": 1.2027450036118468, - "grad_norm": 8.875, + "epoch": 7.1255349500713265, + "grad_norm": 6.3125, "learning_rate": 6.671555555555555e-05, - "loss": 0.6166, + "loss": 0.7455, "step": 9990 }, { - "epoch": 1.2039489525644114, - "grad_norm": 7.96875, + "epoch": 7.132667617689016, + "grad_norm": 7.375, "learning_rate": 6.667111111111112e-05, - "loss": 0.6778, + "loss": 0.7398, "step": 10000 }, { - "epoch": 1.2039489525644114, - "eval/acc": 46.511627197265625, + "epoch": 7.132667617689016, + "eval/acc": 65.11627960205078, "step": 10000 }, { - "epoch": 1.2039489525644114, - "eval_loss": 2.8400421142578125, - "eval_runtime": 0.2085, - "eval_samples_per_second": 206.266, - "eval_steps_per_second": 4.797, + "epoch": 7.132667617689016, + "eval_loss": 2.0408403873443604, + "eval_runtime": 0.2184, + "eval_samples_per_second": 196.923, + "eval_steps_per_second": 4.58, "step": 10000 } ], "logging_steps": 10, "max_steps": 25000, "num_input_tokens_seen": 0, - "num_train_epochs": 4, + "num_train_epochs": 18, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/training_args.bin b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a1d680c060a6648e26d420ee8365324dd3d5fee7 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-10000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24358e453225b271a5d5308309aefca4acc94bf8e5e69d279887179f7a31b1b7 +size 6161 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/config.json b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/config.json similarity index 100% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/config.json rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/config.json diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/model.safetensors b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8be2779f506d415c7837cccadd704feafa92652e --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3f9b4d5bda7f8dcca46b398b07a5d34c51df8b1719cca216ea5360ff0f57115 +size 298041696 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/optimizer.pt b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..12cacf5d18b4a473742ceb6dfcc12b649e26d395 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9292f8084df08f44633c95111d71cf5055e02e10995567cc733f2bef07d3113 +size 596170443 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/rng_state_0.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d5aa6708634ed79bb6fd9134f461929114273332 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c51e5b3326777258165ef9fd6dbeda9d9f2e6b3d63aa88f25cd5e474c25dc3ad +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/rng_state_1.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..5c80e12e9df2abfc48987df398f65147f7b2077b --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3695e5780e3e6d3dada7f75ec329ee3a7c34e6a5026a0b08b408ba954cd73208 +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/rng_state_2.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..77f2083b25f4f248f622843de6076913c802e13f --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6df4d939779d8b5df4dd72653512b42a685cea2c253425fbec20974ba7e33b73 +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/rng_state_3.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..71f24a409d0df4116c1c362b711e7d84a3afbf5d --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e86ab796da437c185f316ffaa4fa4ad8b6d86e8c908196fb1990906cd0c40b0b +size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/scheduler.pt b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/scheduler.pt similarity index 100% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/scheduler.pt rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/scheduler.pt diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/trainer_state.json b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/trainer_state.json similarity index 56% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/trainer_state.json rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/trainer_state.json index 7c5305668e4570991ea60fac8c82af488511527a..f4033136041da2eccbe317ca472c8f9cabe60c36 100644 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/trainer_state.json +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/trainer_state.json @@ -2,7 +2,7 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.8059234288466168, + "epoch": 10.699001426533524, "eval_steps": 100, "global_step": 15000, "is_hyper_param_search": false, @@ -10,12460 +10,12460 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0012039489525644113, - "grad_norm": 29.25, + "epoch": 0.007132667617689016, + "grad_norm": 19.75, "learning_rate": 3.6e-07, - "loss": 5.6475, + "loss": 5.6319, "step": 10 }, { - "epoch": 0.0024078979051288226, - "grad_norm": 13.6875, + "epoch": 0.014265335235378032, + "grad_norm": 19.375, "learning_rate": 7.6e-07, - "loss": 5.6394, + "loss": 5.5914, "step": 20 }, { - "epoch": 0.003611846857693234, - "grad_norm": 36.0, + "epoch": 0.021398002853067047, + "grad_norm": 51.25, "learning_rate": 1.16e-06, - "loss": 5.6168, + "loss": 5.6495, "step": 30 }, { - "epoch": 0.004815795810257645, - "grad_norm": 17.0, + "epoch": 0.028530670470756064, + "grad_norm": 19.0, "learning_rate": 1.56e-06, - "loss": 5.6346, + "loss": 5.6581, "step": 40 }, { - "epoch": 0.006019744762822056, - "grad_norm": 16.5, + "epoch": 0.03566333808844508, + "grad_norm": 23.75, "learning_rate": 1.96e-06, - "loss": 5.6391, + "loss": 5.6366, "step": 50 }, { - "epoch": 0.007223693715386468, - "grad_norm": 16.5, + "epoch": 0.042796005706134094, + "grad_norm": 18.0, "learning_rate": 2.36e-06, - "loss": 5.6272, + "loss": 5.6411, "step": 60 }, { - "epoch": 0.00842764266795088, - "grad_norm": 14.8125, + "epoch": 0.04992867332382311, + "grad_norm": 14.4375, "learning_rate": 2.7600000000000003e-06, - "loss": 5.5979, + "loss": 5.5919, "step": 70 }, { - "epoch": 0.00963159162051529, - "grad_norm": 22.375, + "epoch": 0.05706134094151213, + "grad_norm": 24.125, "learning_rate": 3.1600000000000007e-06, - "loss": 5.6515, + "loss": 5.6083, "step": 80 }, { - "epoch": 0.010835540573079701, - "grad_norm": 17.125, + "epoch": 0.06419400855920114, + "grad_norm": 18.25, "learning_rate": 3.5600000000000002e-06, - "loss": 5.6018, + "loss": 5.6599, "step": 90 }, { - "epoch": 0.012039489525644112, - "grad_norm": 14.9375, + "epoch": 0.07132667617689016, + "grad_norm": 18.25, "learning_rate": 3.96e-06, - "loss": 5.6342, + "loss": 5.6652, "step": 100 }, { - "epoch": 0.012039489525644112, - "eval/acc": 3.4883720874786377, + "epoch": 0.07132667617689016, + "eval/acc": 2.3255813121795654, "step": 100 }, { - "epoch": 0.012039489525644112, - "eval_loss": 5.140806198120117, - "eval_runtime": 2.4165, - "eval_samples_per_second": 17.794, - "eval_steps_per_second": 0.414, + "epoch": 0.07132667617689016, + "eval_loss": 5.090479850769043, + "eval_runtime": 2.3284, + "eval_samples_per_second": 18.467, + "eval_steps_per_second": 0.429, "step": 100 }, { - "epoch": 0.013243438478208525, - "grad_norm": 13.0, + "epoch": 0.07845934379457917, + "grad_norm": 21.0, "learning_rate": 4.360000000000001e-06, - "loss": 5.6124, + "loss": 5.6402, "step": 110 }, { - "epoch": 0.014447387430772935, - "grad_norm": 18.625, + "epoch": 0.08559201141226819, + "grad_norm": 16.875, "learning_rate": 4.76e-06, - "loss": 5.6127, + "loss": 5.6535, "step": 120 }, { - "epoch": 0.015651336383337346, - "grad_norm": 14.375, + "epoch": 0.09272467902995721, + "grad_norm": 21.5, "learning_rate": 5.1600000000000006e-06, - "loss": 5.5663, + "loss": 5.5821, "step": 130 }, { - "epoch": 0.01685528533590176, - "grad_norm": 11.9375, + "epoch": 0.09985734664764621, + "grad_norm": 18.5, "learning_rate": 5.56e-06, - "loss": 5.55, + "loss": 5.6184, "step": 140 }, { - "epoch": 0.018059234288466168, - "grad_norm": 14.5, + "epoch": 0.10699001426533523, + "grad_norm": 14.9375, "learning_rate": 5.9600000000000005e-06, - "loss": 5.5839, + "loss": 5.5743, "step": 150 }, { - "epoch": 0.01926318324103058, - "grad_norm": 15.0625, + "epoch": 0.11412268188302425, + "grad_norm": 16.875, "learning_rate": 6.360000000000001e-06, - "loss": 5.5259, + "loss": 5.5684, "step": 160 }, { - "epoch": 0.020467132193594993, - "grad_norm": 14.8125, + "epoch": 0.12125534950071326, + "grad_norm": 22.125, "learning_rate": 6.76e-06, - "loss": 5.4812, + "loss": 5.535, "step": 170 }, { - "epoch": 0.021671081146159402, - "grad_norm": 15.375, + "epoch": 0.12838801711840228, + "grad_norm": 15.9375, "learning_rate": 7.16e-06, - "loss": 5.4964, + "loss": 5.4357, "step": 180 }, { - "epoch": 0.022875030098723815, - "grad_norm": 14.0625, + "epoch": 0.1355206847360913, + "grad_norm": 16.375, "learning_rate": 7.5600000000000005e-06, - "loss": 5.4023, + "loss": 5.3766, "step": 190 }, { - "epoch": 0.024078979051288224, - "grad_norm": 18.625, + "epoch": 0.14265335235378032, + "grad_norm": 15.3125, "learning_rate": 7.96e-06, - "loss": 5.3778, + "loss": 5.4437, "step": 200 }, { - "epoch": 0.024078979051288224, - "eval/acc": 5.232558250427246, + "epoch": 0.14265335235378032, + "eval/acc": 2.3255813121795654, "step": 200 }, { - "epoch": 0.024078979051288224, - "eval_loss": 4.991551399230957, - "eval_runtime": 0.2363, - "eval_samples_per_second": 181.988, - "eval_steps_per_second": 4.232, + "epoch": 0.14265335235378032, + "eval_loss": 4.956757545471191, + "eval_runtime": 0.2941, + "eval_samples_per_second": 146.185, + "eval_steps_per_second": 3.4, "step": 200 }, { - "epoch": 0.025282928003852637, - "grad_norm": 16.25, + "epoch": 0.14978601997146934, + "grad_norm": 16.75, "learning_rate": 8.36e-06, - "loss": 5.3983, + "loss": 5.4744, "step": 210 }, { - "epoch": 0.02648687695641705, - "grad_norm": 17.25, + "epoch": 0.15691868758915833, + "grad_norm": 43.25, "learning_rate": 8.76e-06, - "loss": 5.2953, + "loss": 5.381, "step": 220 }, { - "epoch": 0.02769082590898146, - "grad_norm": 15.9375, + "epoch": 0.16405135520684735, + "grad_norm": 21.0, "learning_rate": 9.16e-06, - "loss": 5.2266, + "loss": 5.3092, "step": 230 }, { - "epoch": 0.02889477486154587, - "grad_norm": 21.875, + "epoch": 0.17118402282453637, + "grad_norm": 26.75, "learning_rate": 9.560000000000002e-06, - "loss": 5.139, + "loss": 5.2752, "step": 240 }, { - "epoch": 0.03009872381411028, - "grad_norm": 17.875, + "epoch": 0.1783166904422254, + "grad_norm": 26.875, "learning_rate": 9.96e-06, - "loss": 5.0639, + "loss": 5.2194, "step": 250 }, { - "epoch": 0.03130267276667469, - "grad_norm": 18.875, + "epoch": 0.18544935805991442, + "grad_norm": 20.875, "learning_rate": 1.036e-05, - "loss": 5.0118, + "loss": 5.0657, "step": 260 }, { - "epoch": 0.032506621719239105, - "grad_norm": 26.0, + "epoch": 0.19258202567760344, + "grad_norm": 25.125, "learning_rate": 1.076e-05, - "loss": 4.8959, + "loss": 4.967, "step": 270 }, { - "epoch": 0.03371057067180352, - "grad_norm": 18.5, + "epoch": 0.19971469329529243, + "grad_norm": 30.125, "learning_rate": 1.1160000000000002e-05, - "loss": 4.8454, + "loss": 4.9544, "step": 280 }, { - "epoch": 0.03491451962436792, - "grad_norm": 28.0, + "epoch": 0.20684736091298145, + "grad_norm": 24.625, "learning_rate": 1.156e-05, - "loss": 4.6846, + "loss": 4.7585, "step": 290 }, { - "epoch": 0.036118468576932336, - "grad_norm": 25.5, + "epoch": 0.21398002853067047, + "grad_norm": 21.375, "learning_rate": 1.196e-05, - "loss": 4.5211, + "loss": 4.635, "step": 300 }, { - "epoch": 0.036118468576932336, - "eval/acc": 6.395349025726318, + "epoch": 0.21398002853067047, + "eval/acc": 9.302325248718262, "step": 300 }, { - "epoch": 0.036118468576932336, - "eval_loss": 4.604515075683594, - "eval_runtime": 0.2156, - "eval_samples_per_second": 199.428, - "eval_steps_per_second": 4.638, + "epoch": 0.21398002853067047, + "eval_loss": 4.364280700683594, + "eval_runtime": 0.2145, + "eval_samples_per_second": 200.42, + "eval_steps_per_second": 4.661, "step": 300 }, { - "epoch": 0.03732241752949675, - "grad_norm": 28.0, + "epoch": 0.2211126961483595, + "grad_norm": 30.125, "learning_rate": 1.236e-05, - "loss": 4.3466, + "loss": 4.5333, "step": 310 }, { - "epoch": 0.03852636648206116, - "grad_norm": 27.125, + "epoch": 0.2282453637660485, + "grad_norm": 28.125, "learning_rate": 1.276e-05, - "loss": 4.1005, + "loss": 4.2888, "step": 320 }, { - "epoch": 0.039730315434625574, - "grad_norm": 33.0, + "epoch": 0.23537803138373753, + "grad_norm": 30.5, "learning_rate": 1.316e-05, - "loss": 3.7904, + "loss": 4.1744, "step": 330 }, { - "epoch": 0.040934264387189986, - "grad_norm": 32.75, + "epoch": 0.24251069900142652, + "grad_norm": 35.0, "learning_rate": 1.356e-05, - "loss": 3.4061, + "loss": 3.8812, "step": 340 }, { - "epoch": 0.04213821333975439, - "grad_norm": 31.125, + "epoch": 0.24964336661911554, + "grad_norm": 30.75, "learning_rate": 1.396e-05, - "loss": 3.2838, + "loss": 3.6772, "step": 350 }, { - "epoch": 0.043342162292318805, - "grad_norm": 23.75, + "epoch": 0.25677603423680456, + "grad_norm": 25.875, "learning_rate": 1.4360000000000001e-05, - "loss": 2.9101, + "loss": 3.3797, "step": 360 }, { - "epoch": 0.04454611124488322, - "grad_norm": 44.75, + "epoch": 0.26390870185449355, + "grad_norm": 31.375, "learning_rate": 1.4760000000000001e-05, - "loss": 2.6306, + "loss": 3.2338, "step": 370 }, { - "epoch": 0.04575006019744763, - "grad_norm": 33.25, + "epoch": 0.2710413694721826, + "grad_norm": 72.0, "learning_rate": 1.5160000000000002e-05, - "loss": 2.5454, + "loss": 2.976, "step": 380 }, { - "epoch": 0.04695400915001204, - "grad_norm": 31.375, + "epoch": 0.2781740370898716, + "grad_norm": 22.375, "learning_rate": 1.556e-05, - "loss": 2.5867, + "loss": 2.8207, "step": 390 }, { - "epoch": 0.04815795810257645, - "grad_norm": 18.5, + "epoch": 0.28530670470756064, + "grad_norm": 21.25, "learning_rate": 1.596e-05, - "loss": 2.3251, + "loss": 2.8341, "step": 400 }, { - "epoch": 0.04815795810257645, - "eval/acc": 12.209301948547363, + "epoch": 0.28530670470756064, + "eval/acc": 9.302325248718262, "step": 400 }, { - "epoch": 0.04815795810257645, - "eval_loss": 3.941906452178955, - "eval_runtime": 0.2265, - "eval_samples_per_second": 189.814, - "eval_steps_per_second": 4.414, + "epoch": 0.28530670470756064, + "eval_loss": 3.6011083126068115, + "eval_runtime": 0.2115, + "eval_samples_per_second": 203.312, + "eval_steps_per_second": 4.728, "step": 400 }, { - "epoch": 0.04936190705514086, - "grad_norm": 18.0, + "epoch": 0.29243937232524964, + "grad_norm": 21.0, "learning_rate": 1.636e-05, - "loss": 2.394, + "loss": 2.6431, "step": 410 }, { - "epoch": 0.05056585600770527, - "grad_norm": 22.375, + "epoch": 0.2995720399429387, + "grad_norm": 20.875, "learning_rate": 1.6760000000000002e-05, - "loss": 2.2856, + "loss": 2.6506, "step": 420 }, { - "epoch": 0.051769804960269686, - "grad_norm": 17.25, + "epoch": 0.3067047075606277, + "grad_norm": 21.125, "learning_rate": 1.7160000000000002e-05, - "loss": 2.3414, + "loss": 2.491, "step": 430 }, { - "epoch": 0.0529737539128341, - "grad_norm": 15.25, + "epoch": 0.31383737517831667, + "grad_norm": 31.75, "learning_rate": 1.756e-05, - "loss": 2.156, + "loss": 2.423, "step": 440 }, { - "epoch": 0.054177702865398504, - "grad_norm": 15.75, + "epoch": 0.3209700427960057, + "grad_norm": 19.375, "learning_rate": 1.796e-05, - "loss": 2.0164, + "loss": 2.5108, "step": 450 }, { - "epoch": 0.05538165181796292, - "grad_norm": 28.5, + "epoch": 0.3281027104136947, + "grad_norm": 17.375, "learning_rate": 1.8360000000000004e-05, - "loss": 1.9555, + "loss": 2.4584, "step": 460 }, { - "epoch": 0.05658560077052733, - "grad_norm": 19.25, + "epoch": 0.33523537803138376, + "grad_norm": 22.625, "learning_rate": 1.876e-05, - "loss": 2.0277, + "loss": 2.3526, "step": 470 }, { - "epoch": 0.05778954972309174, - "grad_norm": 15.375, + "epoch": 0.34236804564907275, + "grad_norm": 30.25, "learning_rate": 1.916e-05, - "loss": 2.1719, + "loss": 2.3634, "step": 480 }, { - "epoch": 0.058993498675656154, - "grad_norm": 18.875, + "epoch": 0.34950071326676174, + "grad_norm": 15.5625, "learning_rate": 1.956e-05, - "loss": 2.013, + "loss": 2.3339, "step": 490 }, { - "epoch": 0.06019744762822056, - "grad_norm": 18.625, + "epoch": 0.3566333808844508, + "grad_norm": 19.5, "learning_rate": 1.9960000000000002e-05, - "loss": 1.8574, + "loss": 2.268, "step": 500 }, { - "epoch": 0.06019744762822056, - "eval/acc": 20.930233001708984, + "epoch": 0.3566333808844508, + "eval/acc": 16.279069900512695, "step": 500 }, { - "epoch": 0.06019744762822056, - "eval_loss": 3.6547293663024902, - "eval_runtime": 0.2139, - "eval_samples_per_second": 201.002, - "eval_steps_per_second": 4.674, + "epoch": 0.3566333808844508, + "eval_loss": 3.2489778995513916, + "eval_runtime": 0.2558, + "eval_samples_per_second": 168.115, + "eval_steps_per_second": 3.91, "step": 500 }, { - "epoch": 0.06140139658078497, - "grad_norm": 19.875, + "epoch": 0.3637660485021398, + "grad_norm": 29.375, "learning_rate": 2.036e-05, - "loss": 1.9431, + "loss": 2.2728, "step": 510 }, { - "epoch": 0.06260534553334939, - "grad_norm": 14.625, + "epoch": 0.37089871611982883, + "grad_norm": 21.25, "learning_rate": 2.076e-05, - "loss": 1.8311, + "loss": 2.1346, "step": 520 }, { - "epoch": 0.0638092944859138, - "grad_norm": 20.0, + "epoch": 0.3780313837375178, + "grad_norm": 14.8125, "learning_rate": 2.116e-05, - "loss": 2.0005, + "loss": 2.2719, "step": 530 }, { - "epoch": 0.06501324343847821, - "grad_norm": 16.0, + "epoch": 0.38516405135520687, + "grad_norm": 27.75, "learning_rate": 2.1560000000000004e-05, - "loss": 1.7374, + "loss": 2.145, "step": 540 }, { - "epoch": 0.06621719239104262, - "grad_norm": 13.0625, + "epoch": 0.39229671897289586, + "grad_norm": 16.125, "learning_rate": 2.196e-05, - "loss": 1.7838, + "loss": 2.0912, "step": 550 }, { - "epoch": 0.06742114134360704, - "grad_norm": 16.5, + "epoch": 0.39942938659058486, + "grad_norm": 20.25, "learning_rate": 2.236e-05, - "loss": 1.8264, + "loss": 2.0302, "step": 560 }, { - "epoch": 0.06862509029617145, - "grad_norm": 20.5, + "epoch": 0.4065620542082739, + "grad_norm": 17.75, "learning_rate": 2.2760000000000002e-05, - "loss": 1.658, + "loss": 2.1832, "step": 570 }, { - "epoch": 0.06982903924873585, - "grad_norm": 25.75, + "epoch": 0.4136947218259629, + "grad_norm": 14.5, "learning_rate": 2.3160000000000002e-05, - "loss": 1.7826, + "loss": 1.9652, "step": 580 }, { - "epoch": 0.07103298820130026, - "grad_norm": 19.375, + "epoch": 0.42082738944365194, + "grad_norm": 17.0, "learning_rate": 2.356e-05, - "loss": 1.6539, + "loss": 1.8911, "step": 590 }, { - "epoch": 0.07223693715386467, - "grad_norm": 19.25, + "epoch": 0.42796005706134094, + "grad_norm": 20.0, "learning_rate": 2.396e-05, - "loss": 1.6278, + "loss": 2.0266, "step": 600 }, { - "epoch": 0.07223693715386467, - "eval/acc": 20.930233001708984, + "epoch": 0.42796005706134094, + "eval/acc": 30.23255729675293, "step": 600 }, { - "epoch": 0.07223693715386467, - "eval_loss": 3.387899398803711, - "eval_runtime": 0.2536, - "eval_samples_per_second": 169.572, - "eval_steps_per_second": 3.944, + "epoch": 0.42796005706134094, + "eval_loss": 2.946028470993042, + "eval_runtime": 0.2139, + "eval_samples_per_second": 201.065, + "eval_steps_per_second": 4.676, "step": 600 }, { - "epoch": 0.07344088610642908, - "grad_norm": 12.0625, + "epoch": 0.43509272467902993, + "grad_norm": 25.5, "learning_rate": 2.4360000000000004e-05, - "loss": 1.5342, + "loss": 1.9116, "step": 610 }, { - "epoch": 0.0746448350589935, - "grad_norm": 15.625, + "epoch": 0.442225392296719, + "grad_norm": 25.375, "learning_rate": 2.476e-05, - "loss": 1.5919, + "loss": 1.7644, "step": 620 }, { - "epoch": 0.07584878401155791, - "grad_norm": 25.5, + "epoch": 0.44935805991440797, + "grad_norm": 15.5, "learning_rate": 2.516e-05, - "loss": 1.5713, + "loss": 1.9008, "step": 630 }, { - "epoch": 0.07705273296412232, - "grad_norm": 14.8125, + "epoch": 0.456490727532097, + "grad_norm": 16.875, "learning_rate": 2.556e-05, - "loss": 1.4714, + "loss": 1.619, "step": 640 }, { - "epoch": 0.07825668191668674, - "grad_norm": 21.5, + "epoch": 0.463623395149786, + "grad_norm": 37.25, "learning_rate": 2.5960000000000002e-05, - "loss": 1.5835, + "loss": 1.7725, "step": 650 }, { - "epoch": 0.07946063086925115, - "grad_norm": 58.0, + "epoch": 0.47075606276747506, + "grad_norm": 16.5, "learning_rate": 2.6360000000000002e-05, - "loss": 1.5369, + "loss": 1.7405, "step": 660 }, { - "epoch": 0.08066457982181556, - "grad_norm": 45.0, + "epoch": 0.47788873038516405, + "grad_norm": 16.25, "learning_rate": 2.676e-05, - "loss": 1.4629, + "loss": 1.5825, "step": 670 }, { - "epoch": 0.08186852877437997, - "grad_norm": 14.1875, + "epoch": 0.48502139800285304, + "grad_norm": 68.5, "learning_rate": 2.716e-05, - "loss": 1.4288, + "loss": 1.8379, "step": 680 }, { - "epoch": 0.08307247772694437, - "grad_norm": 40.25, + "epoch": 0.4921540656205421, + "grad_norm": 50.0, "learning_rate": 2.7560000000000004e-05, - "loss": 1.4729, + "loss": 1.7989, "step": 690 }, { - "epoch": 0.08427642667950878, - "grad_norm": 13.625, + "epoch": 0.4992867332382311, + "grad_norm": 16.25, "learning_rate": 2.7960000000000003e-05, - "loss": 1.4883, + "loss": 1.7058, "step": 700 }, { - "epoch": 0.08427642667950878, - "eval/acc": 23.255813598632812, + "epoch": 0.4992867332382311, + "eval/acc": 30.23255729675293, "step": 700 }, { - "epoch": 0.08427642667950878, - "eval_loss": 3.206946611404419, - "eval_runtime": 0.4188, - "eval_samples_per_second": 102.684, - "eval_steps_per_second": 2.388, + "epoch": 0.4992867332382311, + "eval_loss": 2.8809142112731934, + "eval_runtime": 0.2095, + "eval_samples_per_second": 205.208, + "eval_steps_per_second": 4.772, "step": 700 }, { - "epoch": 0.0854803756320732, - "grad_norm": 15.75, + "epoch": 0.5064194008559201, + "grad_norm": 14.625, "learning_rate": 2.8360000000000003e-05, - "loss": 1.5656, + "loss": 1.6542, "step": 710 }, { - "epoch": 0.08668432458463761, - "grad_norm": 22.25, + "epoch": 0.5135520684736091, + "grad_norm": 71.0, "learning_rate": 2.8760000000000002e-05, - "loss": 1.6742, + "loss": 1.6763, "step": 720 }, { - "epoch": 0.08788827353720202, - "grad_norm": 12.3125, + "epoch": 0.5206847360912982, + "grad_norm": 17.125, "learning_rate": 2.9160000000000005e-05, - "loss": 1.35, + "loss": 1.6858, "step": 730 }, { - "epoch": 0.08909222248976643, - "grad_norm": 13.8125, + "epoch": 0.5278174037089871, + "grad_norm": 19.75, "learning_rate": 2.9559999999999998e-05, - "loss": 1.4435, + "loss": 1.6718, "step": 740 }, { - "epoch": 0.09029617144233085, - "grad_norm": 13.1875, + "epoch": 0.5349500713266762, + "grad_norm": 13.375, "learning_rate": 2.9959999999999998e-05, - "loss": 1.3843, + "loss": 1.6164, "step": 750 }, { - "epoch": 0.09150012039489526, - "grad_norm": 13.3125, + "epoch": 0.5420827389443652, + "grad_norm": 14.1875, "learning_rate": 3.036e-05, - "loss": 1.3327, + "loss": 1.6049, "step": 760 }, { - "epoch": 0.09270406934745967, - "grad_norm": 18.875, + "epoch": 0.5492154065620543, + "grad_norm": 35.75, "learning_rate": 3.076e-05, - "loss": 1.4628, + "loss": 1.5453, "step": 770 }, { - "epoch": 0.09390801830002408, - "grad_norm": 14.5625, + "epoch": 0.5563480741797432, + "grad_norm": 28.75, "learning_rate": 3.116e-05, - "loss": 1.3306, + "loss": 1.4818, "step": 780 }, { - "epoch": 0.09511196725258848, - "grad_norm": 18.75, + "epoch": 0.5634807417974322, + "grad_norm": 17.375, "learning_rate": 3.156e-05, - "loss": 1.4936, + "loss": 1.5647, "step": 790 }, { - "epoch": 0.0963159162051529, - "grad_norm": 11.5, + "epoch": 0.5706134094151213, + "grad_norm": 13.6875, "learning_rate": 3.196e-05, - "loss": 1.3515, + "loss": 1.5206, "step": 800 }, { - "epoch": 0.0963159162051529, - "eval/acc": 22.674419403076172, + "epoch": 0.5706134094151213, + "eval/acc": 30.23255729675293, "step": 800 }, { - "epoch": 0.0963159162051529, - "eval_loss": 3.1510462760925293, - "eval_runtime": 0.2676, - "eval_samples_per_second": 160.701, - "eval_steps_per_second": 3.737, + "epoch": 0.5706134094151213, + "eval_loss": 2.8829538822174072, + "eval_runtime": 0.2122, + "eval_samples_per_second": 202.686, + "eval_steps_per_second": 4.714, "step": 800 }, { - "epoch": 0.09751986515771731, - "grad_norm": 11.6875, + "epoch": 0.5777460770328102, + "grad_norm": 17.125, "learning_rate": 3.236e-05, - "loss": 1.4593, + "loss": 1.6124, "step": 810 }, { - "epoch": 0.09872381411028172, - "grad_norm": 10.5625, + "epoch": 0.5848787446504993, + "grad_norm": 14.8125, "learning_rate": 3.2760000000000005e-05, - "loss": 1.3453, + "loss": 1.4254, "step": 820 }, { - "epoch": 0.09992776306284613, - "grad_norm": 11.625, + "epoch": 0.5920114122681883, + "grad_norm": 15.0, "learning_rate": 3.316e-05, - "loss": 1.4041, + "loss": 1.7124, "step": 830 }, { - "epoch": 0.10113171201541055, - "grad_norm": 13.0, + "epoch": 0.5991440798858774, + "grad_norm": 14.75, "learning_rate": 3.3560000000000004e-05, - "loss": 1.2766, + "loss": 1.5384, "step": 840 }, { - "epoch": 0.10233566096797496, - "grad_norm": 40.0, + "epoch": 0.6062767475035663, + "grad_norm": 31.5, "learning_rate": 3.396e-05, - "loss": 1.2678, + "loss": 1.4899, "step": 850 }, { - "epoch": 0.10353960992053937, - "grad_norm": 13.75, + "epoch": 0.6134094151212554, + "grad_norm": 13.875, "learning_rate": 3.436e-05, - "loss": 1.2514, + "loss": 1.5377, "step": 860 }, { - "epoch": 0.10474355887310378, - "grad_norm": 11.75, + "epoch": 0.6205420827389444, + "grad_norm": 14.9375, "learning_rate": 3.4760000000000006e-05, - "loss": 1.3518, + "loss": 1.4892, "step": 870 }, { - "epoch": 0.1059475078256682, - "grad_norm": 11.875, + "epoch": 0.6276747503566333, + "grad_norm": 37.25, "learning_rate": 3.516e-05, - "loss": 1.2675, + "loss": 1.4872, "step": 880 }, { - "epoch": 0.10715145677823261, - "grad_norm": 13.0, + "epoch": 0.6348074179743224, + "grad_norm": 18.875, "learning_rate": 3.5560000000000005e-05, - "loss": 1.294, + "loss": 1.536, "step": 890 }, { - "epoch": 0.10835540573079701, - "grad_norm": 13.0, + "epoch": 0.6419400855920114, + "grad_norm": 18.625, "learning_rate": 3.596e-05, - "loss": 1.1209, + "loss": 1.5208, "step": 900 }, { - "epoch": 0.10835540573079701, - "eval/acc": 25.581396102905273, + "epoch": 0.6419400855920114, + "eval/acc": 30.23255729675293, "step": 900 }, { - "epoch": 0.10835540573079701, - "eval_loss": 3.0571491718292236, - "eval_runtime": 0.3097, - "eval_samples_per_second": 138.846, - "eval_steps_per_second": 3.229, + "epoch": 0.6419400855920114, + "eval_loss": 2.8081133365631104, + "eval_runtime": 0.2319, + "eval_samples_per_second": 185.434, + "eval_steps_per_second": 4.312, "step": 900 }, { - "epoch": 0.10955935468336142, - "grad_norm": 12.75, + "epoch": 0.6490727532097005, + "grad_norm": 19.875, "learning_rate": 3.636e-05, - "loss": 1.2681, + "loss": 1.4606, "step": 910 }, { - "epoch": 0.11076330363592583, - "grad_norm": 17.0, + "epoch": 0.6562054208273894, + "grad_norm": 12.625, "learning_rate": 3.676e-05, - "loss": 1.2606, + "loss": 1.4728, "step": 920 }, { - "epoch": 0.11196725258849025, - "grad_norm": 11.375, + "epoch": 0.6633380884450785, + "grad_norm": 15.0, "learning_rate": 3.716e-05, - "loss": 1.2194, + "loss": 1.449, "step": 930 }, { - "epoch": 0.11317120154105466, - "grad_norm": 12.125, + "epoch": 0.6704707560627675, + "grad_norm": 19.0, "learning_rate": 3.756e-05, - "loss": 1.2905, + "loss": 1.5292, "step": 940 }, { - "epoch": 0.11437515049361907, - "grad_norm": 18.125, + "epoch": 0.6776034236804565, + "grad_norm": 111.5, "learning_rate": 3.796e-05, - "loss": 1.2563, + "loss": 1.4891, "step": 950 }, { - "epoch": 0.11557909944618348, - "grad_norm": 17.125, + "epoch": 0.6847360912981455, + "grad_norm": 14.75, "learning_rate": 3.836e-05, - "loss": 1.1894, + "loss": 1.4202, "step": 960 }, { - "epoch": 0.1167830483987479, - "grad_norm": 11.875, + "epoch": 0.6918687589158345, + "grad_norm": 20.25, "learning_rate": 3.876e-05, - "loss": 1.2441, + "loss": 1.5258, "step": 970 }, { - "epoch": 0.11798699735131231, - "grad_norm": 15.8125, + "epoch": 0.6990014265335235, + "grad_norm": 48.0, "learning_rate": 3.9160000000000005e-05, - "loss": 1.2627, + "loss": 1.3912, "step": 980 }, { - "epoch": 0.11919094630387672, - "grad_norm": 17.375, + "epoch": 0.7061340941512125, + "grad_norm": 13.0, "learning_rate": 3.956e-05, - "loss": 1.3929, + "loss": 1.4859, "step": 990 }, { - "epoch": 0.12039489525644112, - "grad_norm": 11.125, + "epoch": 0.7132667617689016, + "grad_norm": 15.5625, "learning_rate": 3.9960000000000004e-05, - "loss": 1.1332, + "loss": 1.4614, "step": 1000 }, { - "epoch": 0.12039489525644112, - "eval/acc": 26.162790298461914, + "epoch": 0.7132667617689016, + "eval/acc": 37.20930099487305, "step": 1000 }, { - "epoch": 0.12039489525644112, - "eval_loss": 2.9910976886749268, - "eval_runtime": 0.2826, - "eval_samples_per_second": 152.17, - "eval_steps_per_second": 3.539, + "epoch": 0.7132667617689016, + "eval_loss": 2.743621587753296, + "eval_runtime": 0.5811, + "eval_samples_per_second": 73.997, + "eval_steps_per_second": 1.721, "step": 1000 }, { - "epoch": 0.12159884420900553, - "grad_norm": 13.75, + "epoch": 0.7203994293865906, + "grad_norm": 16.625, "learning_rate": 4.0360000000000007e-05, - "loss": 1.2314, + "loss": 1.56, "step": 1010 }, { - "epoch": 0.12280279316156995, - "grad_norm": 11.875, + "epoch": 0.7275320970042796, + "grad_norm": 15.5625, "learning_rate": 4.076e-05, - "loss": 1.2654, + "loss": 1.4469, "step": 1020 }, { - "epoch": 0.12400674211413436, - "grad_norm": 12.8125, + "epoch": 0.7346647646219686, + "grad_norm": 15.0, "learning_rate": 4.1160000000000006e-05, - "loss": 1.1432, + "loss": 1.381, "step": 1030 }, { - "epoch": 0.12521069106669877, - "grad_norm": 13.9375, + "epoch": 0.7417974322396577, + "grad_norm": 13.625, "learning_rate": 4.156e-05, - "loss": 1.1669, + "loss": 1.3749, "step": 1040 }, { - "epoch": 0.1264146400192632, - "grad_norm": 19.25, + "epoch": 0.7489300998573466, + "grad_norm": 12.0625, "learning_rate": 4.196e-05, - "loss": 1.1836, + "loss": 1.3919, "step": 1050 }, { - "epoch": 0.1276185889718276, - "grad_norm": 11.375, + "epoch": 0.7560627674750356, + "grad_norm": 16.25, "learning_rate": 4.236e-05, - "loss": 1.2449, + "loss": 1.4208, "step": 1060 }, { - "epoch": 0.128822537924392, - "grad_norm": 10.6875, + "epoch": 0.7631954350927247, + "grad_norm": 27.75, "learning_rate": 4.276e-05, - "loss": 1.1361, + "loss": 1.3714, "step": 1070 }, { - "epoch": 0.13002648687695642, - "grad_norm": 11.5, + "epoch": 0.7703281027104137, + "grad_norm": 13.125, "learning_rate": 4.316e-05, - "loss": 1.1989, + "loss": 1.3344, "step": 1080 }, { - "epoch": 0.13123043582952082, - "grad_norm": 13.0, + "epoch": 0.7774607703281027, + "grad_norm": 11.9375, "learning_rate": 4.356e-05, - "loss": 1.1004, + "loss": 1.3291, "step": 1090 }, { - "epoch": 0.13243438478208525, - "grad_norm": 10.125, + "epoch": 0.7845934379457917, + "grad_norm": 17.125, "learning_rate": 4.396e-05, - "loss": 1.1308, + "loss": 1.3536, "step": 1100 }, { - "epoch": 0.13243438478208525, + "epoch": 0.7845934379457917, "eval/acc": 27.9069766998291, "step": 1100 }, { - "epoch": 0.13243438478208525, - "eval_loss": 3.0177316665649414, - "eval_runtime": 0.2801, - "eval_samples_per_second": 153.54, - "eval_steps_per_second": 3.571, + "epoch": 0.7845934379457917, + "eval_loss": 2.8128697872161865, + "eval_runtime": 0.484, + "eval_samples_per_second": 88.836, + "eval_steps_per_second": 2.066, "step": 1100 }, { - "epoch": 0.13363833373464964, - "grad_norm": 9.5, + "epoch": 0.7917261055634808, + "grad_norm": 13.3125, "learning_rate": 4.436e-05, - "loss": 1.1862, + "loss": 1.4598, "step": 1110 }, { - "epoch": 0.13484228268721407, - "grad_norm": 13.75, + "epoch": 0.7988587731811697, + "grad_norm": 15.25, "learning_rate": 4.4760000000000005e-05, - "loss": 1.1764, + "loss": 1.3795, "step": 1120 }, { - "epoch": 0.13604623163977847, - "grad_norm": 30.625, + "epoch": 0.8059914407988588, + "grad_norm": 12.0625, "learning_rate": 4.516e-05, - "loss": 1.0422, + "loss": 1.2518, "step": 1130 }, { - "epoch": 0.1372501805923429, - "grad_norm": 9.875, + "epoch": 0.8131241084165478, + "grad_norm": 16.625, "learning_rate": 4.5560000000000004e-05, - "loss": 1.1796, + "loss": 1.3104, "step": 1140 }, { - "epoch": 0.1384541295449073, - "grad_norm": 13.1875, + "epoch": 0.8202567760342369, + "grad_norm": 11.875, "learning_rate": 4.596e-05, - "loss": 1.0483, + "loss": 1.2996, "step": 1150 }, { - "epoch": 0.1396580784974717, - "grad_norm": 11.75, + "epoch": 0.8273894436519258, + "grad_norm": 24.125, "learning_rate": 4.636e-05, - "loss": 1.1647, + "loss": 1.2067, "step": 1160 }, { - "epoch": 0.14086202745003612, - "grad_norm": 13.375, + "epoch": 0.8345221112696148, + "grad_norm": 11.0, "learning_rate": 4.6760000000000006e-05, - "loss": 1.2839, + "loss": 1.3035, "step": 1170 }, { - "epoch": 0.14206597640260052, - "grad_norm": 42.0, + "epoch": 0.8416547788873039, + "grad_norm": 13.125, "learning_rate": 4.716e-05, - "loss": 1.1594, + "loss": 1.2859, "step": 1180 }, { - "epoch": 0.14326992535516495, - "grad_norm": 15.625, + "epoch": 0.8487874465049928, + "grad_norm": 11.0, "learning_rate": 4.7560000000000005e-05, - "loss": 1.1073, + "loss": 1.3982, "step": 1190 }, { - "epoch": 0.14447387430772934, - "grad_norm": 11.5, + "epoch": 0.8559201141226819, + "grad_norm": 12.875, "learning_rate": 4.796e-05, - "loss": 1.1593, + "loss": 1.299, "step": 1200 }, { - "epoch": 0.14447387430772934, - "eval/acc": 26.162790298461914, + "epoch": 0.8559201141226819, + "eval/acc": 34.88372039794922, "step": 1200 }, { - "epoch": 0.14447387430772934, - "eval_loss": 3.0329606533050537, - "eval_runtime": 0.2185, - "eval_samples_per_second": 196.829, - "eval_steps_per_second": 4.577, + "epoch": 0.8559201141226819, + "eval_loss": 2.7250428199768066, + "eval_runtime": 0.3522, + "eval_samples_per_second": 122.084, + "eval_steps_per_second": 2.839, "step": 1200 }, { - "epoch": 0.14567782326029377, - "grad_norm": 12.5625, + "epoch": 0.8630527817403709, + "grad_norm": 11.25, "learning_rate": 4.836e-05, - "loss": 1.1088, + "loss": 1.3549, "step": 1210 }, { - "epoch": 0.14688177221285817, - "grad_norm": 10.4375, + "epoch": 0.8701854493580599, + "grad_norm": 15.25, "learning_rate": 4.876e-05, - "loss": 1.1565, + "loss": 1.3649, "step": 1220 }, { - "epoch": 0.1480857211654226, - "grad_norm": 11.3125, + "epoch": 0.8773181169757489, + "grad_norm": 22.0, "learning_rate": 4.9160000000000004e-05, - "loss": 1.0596, + "loss": 1.2441, "step": 1230 }, { - "epoch": 0.149289670117987, - "grad_norm": 11.375, + "epoch": 0.884450784593438, + "grad_norm": 12.375, "learning_rate": 4.956e-05, - "loss": 1.2416, + "loss": 1.2196, "step": 1240 }, { - "epoch": 0.15049361907055142, - "grad_norm": 10.3125, + "epoch": 0.891583452211127, + "grad_norm": 14.25, "learning_rate": 4.996e-05, - "loss": 1.0492, + "loss": 1.3274, "step": 1250 }, { - "epoch": 0.15169756802311582, - "grad_norm": 10.9375, + "epoch": 0.8987161198288159, + "grad_norm": 10.0625, "learning_rate": 5.0360000000000006e-05, - "loss": 1.0263, + "loss": 1.2896, "step": 1260 }, { - "epoch": 0.15290151697568022, - "grad_norm": 11.0625, + "epoch": 0.905848787446505, + "grad_norm": 16.875, "learning_rate": 5.076000000000001e-05, - "loss": 1.1197, + "loss": 1.3019, "step": 1270 }, { - "epoch": 0.15410546592824464, - "grad_norm": 33.25, + "epoch": 0.912981455064194, + "grad_norm": 26.375, "learning_rate": 5.1160000000000005e-05, - "loss": 1.0614, + "loss": 1.3756, "step": 1280 }, { - "epoch": 0.15530941488080904, - "grad_norm": 11.3125, + "epoch": 0.920114122681883, + "grad_norm": 18.25, "learning_rate": 5.1559999999999994e-05, - "loss": 1.0948, + "loss": 1.327, "step": 1290 }, { - "epoch": 0.15651336383337347, - "grad_norm": 24.5, + "epoch": 0.927246790299572, + "grad_norm": 11.3125, "learning_rate": 5.196e-05, - "loss": 1.1113, + "loss": 1.3237, "step": 1300 }, { - "epoch": 0.15651336383337347, - "eval/acc": 25.581396102905273, + "epoch": 0.927246790299572, + "eval/acc": 39.53488540649414, "step": 1300 }, { - "epoch": 0.15651336383337347, - "eval_loss": 2.944797992706299, - "eval_runtime": 0.3019, - "eval_samples_per_second": 142.434, - "eval_steps_per_second": 3.312, + "epoch": 0.927246790299572, + "eval_loss": 2.733259916305542, + "eval_runtime": 0.51, + "eval_samples_per_second": 84.32, + "eval_steps_per_second": 1.961, "step": 1300 }, { - "epoch": 0.15771731278593787, - "grad_norm": 12.4375, + "epoch": 0.9343794579172611, + "grad_norm": 18.125, "learning_rate": 5.236e-05, - "loss": 0.9531, + "loss": 1.256, "step": 1310 }, { - "epoch": 0.1589212617385023, - "grad_norm": 12.3125, + "epoch": 0.9415121255349501, + "grad_norm": 10.25, "learning_rate": 5.2759999999999996e-05, - "loss": 1.0079, + "loss": 1.1386, "step": 1320 }, { - "epoch": 0.1601252106910667, - "grad_norm": 13.1875, + "epoch": 0.948644793152639, + "grad_norm": 11.1875, "learning_rate": 5.316e-05, - "loss": 1.0674, + "loss": 1.3115, "step": 1330 }, { - "epoch": 0.16132915964363112, - "grad_norm": 16.875, + "epoch": 0.9557774607703281, + "grad_norm": 10.875, "learning_rate": 5.356e-05, - "loss": 1.1194, + "loss": 1.2315, "step": 1340 }, { - "epoch": 0.16253310859619552, - "grad_norm": 10.625, + "epoch": 0.9629101283880172, + "grad_norm": 12.0, "learning_rate": 5.396e-05, - "loss": 1.0057, + "loss": 1.3327, "step": 1350 }, { - "epoch": 0.16373705754875995, - "grad_norm": 9.125, + "epoch": 0.9700427960057061, + "grad_norm": 11.75, "learning_rate": 5.436e-05, - "loss": 1.1257, + "loss": 1.4052, "step": 1360 }, { - "epoch": 0.16494100650132434, - "grad_norm": 8.5, + "epoch": 0.9771754636233951, + "grad_norm": 11.4375, "learning_rate": 5.476e-05, - "loss": 0.9545, + "loss": 1.1349, "step": 1370 }, { - "epoch": 0.16614495545388874, - "grad_norm": 10.25, + "epoch": 0.9843081312410842, + "grad_norm": 15.125, "learning_rate": 5.516e-05, - "loss": 1.0648, + "loss": 1.3803, "step": 1380 }, { - "epoch": 0.16734890440645317, - "grad_norm": 14.9375, + "epoch": 0.9914407988587732, + "grad_norm": 16.75, "learning_rate": 5.556e-05, - "loss": 1.0364, + "loss": 1.3536, "step": 1390 }, { - "epoch": 0.16855285335901757, - "grad_norm": 138.0, + "epoch": 0.9985734664764622, + "grad_norm": 10.625, "learning_rate": 5.596e-05, - "loss": 1.0255, + "loss": 1.2981, "step": 1400 }, { - "epoch": 0.16855285335901757, - "eval/acc": 27.9069766998291, + "epoch": 0.9985734664764622, + "eval/acc": 39.53488540649414, "step": 1400 }, { - "epoch": 0.16855285335901757, - "eval_loss": 2.763101100921631, - "eval_runtime": 0.2759, - "eval_samples_per_second": 155.826, - "eval_steps_per_second": 3.624, + "epoch": 0.9985734664764622, + "eval_loss": 2.597245693206787, + "eval_runtime": 0.2116, + "eval_samples_per_second": 203.214, + "eval_steps_per_second": 4.726, "step": 1400 }, { - "epoch": 0.169756802311582, - "grad_norm": 11.8125, + "epoch": 1.005706134094151, + "grad_norm": 15.0, "learning_rate": 5.636e-05, - "loss": 0.9813, + "loss": 1.2173, "step": 1410 }, { - "epoch": 0.1709607512641464, - "grad_norm": 9.1875, + "epoch": 1.0128388017118402, + "grad_norm": 15.4375, "learning_rate": 5.6760000000000005e-05, - "loss": 0.9929, + "loss": 1.1965, "step": 1420 }, { - "epoch": 0.17216470021671082, - "grad_norm": 10.875, + "epoch": 1.0199714693295292, + "grad_norm": 21.625, "learning_rate": 5.716e-05, - "loss": 0.9113, + "loss": 1.2494, "step": 1430 }, { - "epoch": 0.17336864916927522, - "grad_norm": 19.375, + "epoch": 1.0271041369472182, + "grad_norm": 13.0, "learning_rate": 5.7560000000000005e-05, - "loss": 1.0711, + "loss": 1.1948, "step": 1440 }, { - "epoch": 0.17457259812183964, - "grad_norm": 9.8125, + "epoch": 1.0342368045649073, + "grad_norm": 11.0, "learning_rate": 5.796e-05, - "loss": 0.9322, + "loss": 1.2641, "step": 1450 }, { - "epoch": 0.17577654707440404, - "grad_norm": 10.5, + "epoch": 1.0413694721825963, + "grad_norm": 13.1875, "learning_rate": 5.8360000000000004e-05, - "loss": 1.0316, + "loss": 1.2526, "step": 1460 }, { - "epoch": 0.17698049602696847, - "grad_norm": 10.25, + "epoch": 1.0485021398002854, + "grad_norm": 46.0, "learning_rate": 5.876000000000001e-05, - "loss": 1.0165, + "loss": 1.0786, "step": 1470 }, { - "epoch": 0.17818444497953287, - "grad_norm": 10.4375, + "epoch": 1.0556348074179742, + "grad_norm": 11.0, "learning_rate": 5.916e-05, - "loss": 1.0229, + "loss": 1.3154, "step": 1480 }, { - "epoch": 0.17938839393209727, - "grad_norm": 14.4375, + "epoch": 1.0627674750356633, + "grad_norm": 18.75, "learning_rate": 5.9560000000000006e-05, - "loss": 0.9684, + "loss": 1.257, "step": 1490 }, { - "epoch": 0.1805923428846617, - "grad_norm": 8.375, + "epoch": 1.0699001426533523, + "grad_norm": 11.5625, "learning_rate": 5.996e-05, - "loss": 0.9948, + "loss": 1.2636, "step": 1500 }, { - "epoch": 0.1805923428846617, - "eval/acc": 34.88372039794922, + "epoch": 1.0699001426533523, + "eval/acc": 32.55813980102539, "step": 1500 }, { - "epoch": 0.1805923428846617, - "eval_loss": 2.8177433013916016, - "eval_runtime": 0.208, - "eval_samples_per_second": 206.732, - "eval_steps_per_second": 4.808, + "epoch": 1.0699001426533523, + "eval_loss": 2.9036648273468018, + "eval_runtime": 0.4893, + "eval_samples_per_second": 87.889, + "eval_steps_per_second": 2.044, "step": 1500 }, { - "epoch": 0.1817962918372261, - "grad_norm": 19.25, + "epoch": 1.0770328102710414, + "grad_norm": 13.75, "learning_rate": 6.0360000000000005e-05, - "loss": 0.9897, + "loss": 1.2602, "step": 1510 }, { - "epoch": 0.18300024078979052, - "grad_norm": 32.5, + "epoch": 1.0841654778887304, + "grad_norm": 11.625, "learning_rate": 6.076000000000001e-05, - "loss": 0.9217, + "loss": 1.0823, "step": 1520 }, { - "epoch": 0.18420418974235492, - "grad_norm": 9.5, + "epoch": 1.0912981455064195, + "grad_norm": 9.0, "learning_rate": 6.116e-05, - "loss": 1.0494, + "loss": 1.3059, "step": 1530 }, { - "epoch": 0.18540813869491934, - "grad_norm": 9.25, + "epoch": 1.0984308131241085, + "grad_norm": 10.4375, "learning_rate": 6.156e-05, - "loss": 0.9359, + "loss": 1.2006, "step": 1540 }, { - "epoch": 0.18661208764748374, - "grad_norm": 11.375, + "epoch": 1.1055634807417973, + "grad_norm": 15.75, "learning_rate": 6.196000000000001e-05, - "loss": 0.9112, + "loss": 1.3731, "step": 1550 }, { - "epoch": 0.18781603660004817, - "grad_norm": 12.6875, + "epoch": 1.1126961483594864, + "grad_norm": 9.5, "learning_rate": 6.236e-05, - "loss": 1.07, + "loss": 1.1925, "step": 1560 }, { - "epoch": 0.18901998555261257, - "grad_norm": 11.1875, + "epoch": 1.1198288159771754, + "grad_norm": 9.3125, "learning_rate": 6.276e-05, - "loss": 0.9853, + "loss": 1.1554, "step": 1570 }, { - "epoch": 0.19022393450517697, - "grad_norm": 8.375, + "epoch": 1.1269614835948645, + "grad_norm": 12.0625, "learning_rate": 6.316000000000001e-05, - "loss": 0.9579, + "loss": 1.0875, "step": 1580 }, { - "epoch": 0.1914278834577414, - "grad_norm": 20.875, + "epoch": 1.1340941512125535, + "grad_norm": 10.875, "learning_rate": 6.356000000000001e-05, - "loss": 0.9401, + "loss": 1.1895, "step": 1590 }, { - "epoch": 0.1926318324103058, - "grad_norm": 8.9375, + "epoch": 1.1412268188302426, + "grad_norm": 12.0625, "learning_rate": 6.396e-05, - "loss": 1.0279, + "loss": 1.2354, "step": 1600 }, { - "epoch": 0.1926318324103058, - "eval/acc": 30.23255729675293, + "epoch": 1.1412268188302426, + "eval/acc": 34.88372039794922, "step": 1600 }, { - "epoch": 0.1926318324103058, - "eval_loss": 2.8526248931884766, - "eval_runtime": 0.3114, - "eval_samples_per_second": 138.103, - "eval_steps_per_second": 3.212, + "epoch": 1.1412268188302426, + "eval_loss": 2.9267771244049072, + "eval_runtime": 0.2334, + "eval_samples_per_second": 184.258, + "eval_steps_per_second": 4.285, "step": 1600 }, { - "epoch": 0.19383578136287022, - "grad_norm": 7.78125, + "epoch": 1.1483594864479316, + "grad_norm": 12.375, "learning_rate": 6.436e-05, - "loss": 0.8743, + "loss": 1.2167, "step": 1610 }, { - "epoch": 0.19503973031543462, - "grad_norm": 9.8125, + "epoch": 1.1554921540656204, + "grad_norm": 10.375, "learning_rate": 6.476e-05, - "loss": 0.8702, + "loss": 1.1638, "step": 1620 }, { - "epoch": 0.19624367926799904, - "grad_norm": 12.4375, + "epoch": 1.1626248216833095, + "grad_norm": 9.8125, "learning_rate": 6.515999999999999e-05, - "loss": 1.0028, + "loss": 1.1666, "step": 1630 }, { - "epoch": 0.19744762822056344, - "grad_norm": 10.125, + "epoch": 1.1697574893009985, + "grad_norm": 12.6875, "learning_rate": 6.556e-05, - "loss": 0.9377, + "loss": 1.1961, "step": 1640 }, { - "epoch": 0.19865157717312787, - "grad_norm": 8.9375, + "epoch": 1.1768901569186876, + "grad_norm": 9.875, "learning_rate": 6.596e-05, - "loss": 1.031, + "loss": 1.2558, "step": 1650 }, { - "epoch": 0.19985552612569227, - "grad_norm": 8.5625, + "epoch": 1.1840228245363766, + "grad_norm": 10.375, "learning_rate": 6.636e-05, - "loss": 1.0162, + "loss": 1.1728, "step": 1660 }, { - "epoch": 0.2010594750782567, - "grad_norm": 33.75, + "epoch": 1.1911554921540657, + "grad_norm": 10.1875, "learning_rate": 6.676e-05, - "loss": 0.9448, + "loss": 1.2947, "step": 1670 }, { - "epoch": 0.2022634240308211, - "grad_norm": 9.625, + "epoch": 1.1982881597717547, + "grad_norm": 11.3125, "learning_rate": 6.716e-05, - "loss": 1.0077, + "loss": 1.2151, "step": 1680 }, { - "epoch": 0.2034673729833855, - "grad_norm": 8.6875, + "epoch": 1.2054208273894436, + "grad_norm": 10.5, "learning_rate": 6.756e-05, - "loss": 0.9654, + "loss": 1.0612, "step": 1690 }, { - "epoch": 0.20467132193594992, - "grad_norm": 12.625, + "epoch": 1.2125534950071326, + "grad_norm": 11.9375, "learning_rate": 6.796e-05, - "loss": 0.8899, + "loss": 1.1079, "step": 1700 }, { - "epoch": 0.20467132193594992, - "eval/acc": 32.55813980102539, + "epoch": 1.2125534950071326, + "eval/acc": 37.20930099487305, "step": 1700 }, { - "epoch": 0.20467132193594992, - "eval_loss": 2.7813549041748047, - "eval_runtime": 0.2132, - "eval_samples_per_second": 201.701, - "eval_steps_per_second": 4.691, + "epoch": 1.2125534950071326, + "eval_loss": 2.9951517581939697, + "eval_runtime": 0.2214, + "eval_samples_per_second": 194.192, + "eval_steps_per_second": 4.516, "step": 1700 }, { - "epoch": 0.20587527088851432, - "grad_norm": 12.0, + "epoch": 1.2196861626248217, + "grad_norm": 11.25, "learning_rate": 6.836e-05, - "loss": 1.0412, + "loss": 1.1541, "step": 1710 }, { - "epoch": 0.20707921984107874, - "grad_norm": 11.75, + "epoch": 1.2268188302425107, + "grad_norm": 8.125, "learning_rate": 6.876e-05, - "loss": 0.9239, + "loss": 1.0772, "step": 1720 }, { - "epoch": 0.20828316879364314, - "grad_norm": 11.375, + "epoch": 1.2339514978601998, + "grad_norm": 18.125, "learning_rate": 6.916000000000001e-05, - "loss": 0.9243, + "loss": 1.1623, "step": 1730 }, { - "epoch": 0.20948711774620757, - "grad_norm": 12.0, + "epoch": 1.2410841654778888, + "grad_norm": 10.125, "learning_rate": 6.956e-05, - "loss": 1.0204, + "loss": 1.182, "step": 1740 }, { - "epoch": 0.21069106669877197, - "grad_norm": 13.0625, + "epoch": 1.2482168330955776, + "grad_norm": 9.75, "learning_rate": 6.996e-05, - "loss": 0.8811, + "loss": 1.0796, "step": 1750 }, { - "epoch": 0.2118950156513364, - "grad_norm": 17.0, + "epoch": 1.2553495007132667, + "grad_norm": 10.5, "learning_rate": 7.036e-05, - "loss": 0.8755, + "loss": 1.2374, "step": 1760 }, { - "epoch": 0.2130989646039008, - "grad_norm": 11.25, + "epoch": 1.2624821683309557, + "grad_norm": 20.875, "learning_rate": 7.076000000000001e-05, - "loss": 0.858, + "loss": 1.2718, "step": 1770 }, { - "epoch": 0.21430291355646522, - "grad_norm": 9.625, + "epoch": 1.2696148359486448, + "grad_norm": 10.3125, "learning_rate": 7.116e-05, - "loss": 0.9076, + "loss": 1.0922, "step": 1780 }, { - "epoch": 0.21550686250902962, - "grad_norm": 10.4375, + "epoch": 1.2767475035663338, + "grad_norm": 8.6875, "learning_rate": 7.156e-05, - "loss": 0.8817, + "loss": 1.0637, "step": 1790 }, { - "epoch": 0.21671081146159402, - "grad_norm": 12.8125, + "epoch": 1.2838801711840229, + "grad_norm": 9.5, "learning_rate": 7.196000000000001e-05, - "loss": 0.9121, + "loss": 1.1661, "step": 1800 }, { - "epoch": 0.21671081146159402, - "eval/acc": 30.813953399658203, + "epoch": 1.2838801711840229, + "eval/acc": 39.53488540649414, "step": 1800 }, { - "epoch": 0.21671081146159402, - "eval_loss": 2.6508796215057373, - "eval_runtime": 0.2185, - "eval_samples_per_second": 196.798, - "eval_steps_per_second": 4.577, + "epoch": 1.2838801711840229, + "eval_loss": 2.763897180557251, + "eval_runtime": 0.2111, + "eval_samples_per_second": 203.712, + "eval_steps_per_second": 4.737, "step": 1800 }, { - "epoch": 0.21791476041415844, - "grad_norm": 16.5, + "epoch": 1.291012838801712, + "grad_norm": 14.3125, "learning_rate": 7.236e-05, - "loss": 0.9044, + "loss": 1.1139, "step": 1810 }, { - "epoch": 0.21911870936672284, - "grad_norm": 15.1875, + "epoch": 1.298145506419401, + "grad_norm": 41.5, "learning_rate": 7.276e-05, - "loss": 0.9552, + "loss": 1.0817, "step": 1820 }, { - "epoch": 0.22032265831928727, - "grad_norm": 11.375, + "epoch": 1.3052781740370898, + "grad_norm": 15.125, "learning_rate": 7.316000000000001e-05, - "loss": 0.9264, + "loss": 1.2462, "step": 1830 }, { - "epoch": 0.22152660727185167, - "grad_norm": 8.8125, + "epoch": 1.3124108416547788, + "grad_norm": 33.25, "learning_rate": 7.356000000000001e-05, - "loss": 0.8928, + "loss": 1.1143, "step": 1840 }, { - "epoch": 0.2227305562244161, - "grad_norm": 9.625, + "epoch": 1.3195435092724679, + "grad_norm": 13.625, "learning_rate": 7.396e-05, - "loss": 0.9515, + "loss": 1.1783, "step": 1850 }, { - "epoch": 0.2239345051769805, - "grad_norm": 31.0, + "epoch": 1.326676176890157, + "grad_norm": 18.375, "learning_rate": 7.436000000000001e-05, - "loss": 0.8989, + "loss": 1.2101, "step": 1860 }, { - "epoch": 0.22513845412954492, - "grad_norm": 9.5, + "epoch": 1.333808844507846, + "grad_norm": 13.875, "learning_rate": 7.476000000000001e-05, - "loss": 1.0206, + "loss": 1.1348, "step": 1870 }, { - "epoch": 0.22634240308210932, - "grad_norm": 8.625, + "epoch": 1.340941512125535, + "grad_norm": 13.9375, "learning_rate": 7.516e-05, - "loss": 0.8961, + "loss": 1.0747, "step": 1880 }, { - "epoch": 0.22754635203467374, - "grad_norm": 9.0, + "epoch": 1.3480741797432239, + "grad_norm": 29.75, "learning_rate": 7.556000000000002e-05, - "loss": 0.9421, + "loss": 1.1895, "step": 1890 }, { - "epoch": 0.22875030098723814, - "grad_norm": 12.0625, + "epoch": 1.355206847360913, + "grad_norm": 17.25, "learning_rate": 7.596000000000001e-05, - "loss": 0.9049, + "loss": 1.2512, "step": 1900 }, { - "epoch": 0.22875030098723814, - "eval/acc": 36.046512603759766, + "epoch": 1.355206847360913, + "eval/acc": 39.53488540649414, "step": 1900 }, { - "epoch": 0.22875030098723814, - "eval_loss": 2.636018753051758, - "eval_runtime": 0.2084, - "eval_samples_per_second": 206.343, - "eval_steps_per_second": 4.799, + "epoch": 1.355206847360913, + "eval_loss": 2.9442026615142822, + "eval_runtime": 0.2199, + "eval_samples_per_second": 195.51, + "eval_steps_per_second": 4.547, "step": 1900 }, { - "epoch": 0.22995424993980254, - "grad_norm": 8.0625, + "epoch": 1.362339514978602, + "grad_norm": 21.125, "learning_rate": 7.636e-05, - "loss": 0.8983, + "loss": 1.1306, "step": 1910 }, { - "epoch": 0.23115819889236697, - "grad_norm": 11.875, + "epoch": 1.369472182596291, + "grad_norm": 9.0625, "learning_rate": 7.676e-05, - "loss": 0.9293, + "loss": 1.1139, "step": 1920 }, { - "epoch": 0.23236214784493137, - "grad_norm": 11.75, + "epoch": 1.37660485021398, + "grad_norm": 30.25, "learning_rate": 7.716e-05, - "loss": 0.8602, + "loss": 1.1595, "step": 1930 }, { - "epoch": 0.2335660967974958, - "grad_norm": 11.5625, + "epoch": 1.383737517831669, + "grad_norm": 13.6875, "learning_rate": 7.756e-05, - "loss": 0.8078, + "loss": 1.2437, "step": 1940 }, { - "epoch": 0.2347700457500602, - "grad_norm": 9.125, + "epoch": 1.3908701854493581, + "grad_norm": 12.3125, "learning_rate": 7.796e-05, - "loss": 0.8773, + "loss": 1.1005, "step": 1950 }, { - "epoch": 0.23597399470262462, - "grad_norm": 10.6875, + "epoch": 1.3980028530670472, + "grad_norm": 9.8125, "learning_rate": 7.836e-05, - "loss": 0.8464, + "loss": 1.0748, "step": 1960 }, { - "epoch": 0.23717794365518902, - "grad_norm": 18.25, + "epoch": 1.405135520684736, + "grad_norm": 9.125, "learning_rate": 7.876e-05, - "loss": 0.8779, + "loss": 1.1576, "step": 1970 }, { - "epoch": 0.23838189260775344, - "grad_norm": 10.875, + "epoch": 1.412268188302425, + "grad_norm": 11.375, "learning_rate": 7.916e-05, - "loss": 0.9351, + "loss": 1.0982, "step": 1980 }, { - "epoch": 0.23958584156031784, - "grad_norm": 11.0, + "epoch": 1.4194008559201141, + "grad_norm": 10.375, "learning_rate": 7.956e-05, - "loss": 0.8581, + "loss": 1.132, "step": 1990 }, { - "epoch": 0.24078979051288224, - "grad_norm": 8.875, + "epoch": 1.4265335235378032, + "grad_norm": 16.375, "learning_rate": 7.996e-05, - "loss": 0.9799, + "loss": 1.121, "step": 2000 }, { - "epoch": 0.24078979051288224, - "eval/acc": 36.046512603759766, + "epoch": 1.4265335235378032, + "eval/acc": 39.53488540649414, "step": 2000 }, { - "epoch": 0.24078979051288224, - "eval_loss": 2.716654062271118, - "eval_runtime": 0.21, - "eval_samples_per_second": 204.721, - "eval_steps_per_second": 4.761, + "epoch": 1.4265335235378032, + "eval_loss": 2.900298595428467, + "eval_runtime": 0.2112, + "eval_samples_per_second": 203.622, + "eval_steps_per_second": 4.735, "step": 2000 }, { - "epoch": 0.24199373946544667, - "grad_norm": 11.0625, + "epoch": 1.4336661911554922, + "grad_norm": 9.125, "learning_rate": 8.036e-05, - "loss": 0.8678, + "loss": 1.2079, "step": 2010 }, { - "epoch": 0.24319768841801107, + "epoch": 1.440798858773181, "grad_norm": 12.125, "learning_rate": 8.076e-05, - "loss": 0.8832, + "loss": 1.1098, "step": 2020 }, { - "epoch": 0.2444016373705755, - "grad_norm": 8.25, + "epoch": 1.44793152639087, + "grad_norm": 8.8125, "learning_rate": 8.116e-05, - "loss": 0.8689, + "loss": 0.9849, "step": 2030 }, { - "epoch": 0.2456055863231399, - "grad_norm": 6.53125, + "epoch": 1.4550641940085591, + "grad_norm": 9.0, "learning_rate": 8.156e-05, - "loss": 0.8829, + "loss": 1.0905, "step": 2040 }, { - "epoch": 0.24680953527570432, - "grad_norm": 9.5625, + "epoch": 1.4621968616262482, + "grad_norm": 15.4375, "learning_rate": 8.196000000000001e-05, - "loss": 0.9181, + "loss": 1.2211, "step": 2050 }, { - "epoch": 0.24801348422826872, - "grad_norm": 22.875, + "epoch": 1.4693295292439372, + "grad_norm": 9.4375, "learning_rate": 8.236e-05, - "loss": 0.8011, + "loss": 1.0968, "step": 2060 }, { - "epoch": 0.24921743318083314, - "grad_norm": 14.4375, + "epoch": 1.4764621968616263, + "grad_norm": 9.0, "learning_rate": 8.276e-05, - "loss": 0.9163, + "loss": 1.0973, "step": 2070 }, { - "epoch": 0.25042138213339754, - "grad_norm": 10.625, + "epoch": 1.4835948644793153, + "grad_norm": 8.0625, "learning_rate": 8.316000000000001e-05, - "loss": 0.7869, + "loss": 1.1012, "step": 2080 }, { - "epoch": 0.25162533108596197, - "grad_norm": 11.0, + "epoch": 1.4907275320970044, + "grad_norm": 31.0, "learning_rate": 8.356e-05, - "loss": 0.8779, + "loss": 1.0437, "step": 2090 }, { - "epoch": 0.2528292800385264, - "grad_norm": 12.625, + "epoch": 1.4978601997146934, + "grad_norm": 10.8125, "learning_rate": 8.396e-05, - "loss": 0.889, + "loss": 1.0934, "step": 2100 }, { - "epoch": 0.2528292800385264, - "eval/acc": 37.20930099487305, + "epoch": 1.4978601997146934, + "eval/acc": 41.86046600341797, "step": 2100 }, { - "epoch": 0.2528292800385264, - "eval_loss": 2.626293182373047, - "eval_runtime": 0.2735, - "eval_samples_per_second": 157.235, - "eval_steps_per_second": 3.657, + "epoch": 1.4978601997146934, + "eval_loss": 2.9042038917541504, + "eval_runtime": 0.2146, + "eval_samples_per_second": 200.363, + "eval_steps_per_second": 4.66, "step": 2100 }, { - "epoch": 0.25403322899109076, - "grad_norm": 8.3125, + "epoch": 1.5049928673323825, + "grad_norm": 15.3125, "learning_rate": 8.436000000000001e-05, - "loss": 0.8363, + "loss": 1.0862, "step": 2110 }, { - "epoch": 0.2552371779436552, - "grad_norm": 8.625, + "epoch": 1.5121255349500713, + "grad_norm": 10.1875, "learning_rate": 8.476000000000001e-05, - "loss": 0.8762, + "loss": 1.0786, "step": 2120 }, { - "epoch": 0.2564411268962196, - "grad_norm": 7.4375, + "epoch": 1.5192582025677603, + "grad_norm": 8.25, "learning_rate": 8.516e-05, - "loss": 0.7925, + "loss": 1.1496, "step": 2130 }, { - "epoch": 0.257645075848784, - "grad_norm": 9.1875, + "epoch": 1.5263908701854494, + "grad_norm": 12.6875, "learning_rate": 8.556e-05, - "loss": 0.9575, + "loss": 1.1132, "step": 2140 }, { - "epoch": 0.2588490248013484, - "grad_norm": 9.8125, + "epoch": 1.5335235378031382, + "grad_norm": 21.375, "learning_rate": 8.596000000000001e-05, - "loss": 0.7551, + "loss": 1.1043, "step": 2150 }, { - "epoch": 0.26005297375391284, - "grad_norm": 7.15625, + "epoch": 1.5406562054208273, + "grad_norm": 8.5625, "learning_rate": 8.636e-05, - "loss": 0.808, + "loss": 1.2549, "step": 2160 }, { - "epoch": 0.26125692270647727, - "grad_norm": 8.3125, + "epoch": 1.5477888730385163, + "grad_norm": 8.6875, "learning_rate": 8.676e-05, - "loss": 0.9449, + "loss": 1.115, "step": 2170 }, { - "epoch": 0.26246087165904164, - "grad_norm": 11.5, + "epoch": 1.5549215406562054, + "grad_norm": 8.375, "learning_rate": 8.716000000000001e-05, - "loss": 0.8712, + "loss": 1.1963, "step": 2180 }, { - "epoch": 0.26366482061160607, - "grad_norm": 8.0, + "epoch": 1.5620542082738944, + "grad_norm": 8.3125, "learning_rate": 8.756000000000001e-05, - "loss": 0.9389, + "loss": 1.1697, "step": 2190 }, { - "epoch": 0.2648687695641705, - "grad_norm": 13.5, + "epoch": 1.5691868758915835, + "grad_norm": 7.40625, "learning_rate": 8.796e-05, - "loss": 0.7875, + "loss": 0.9716, "step": 2200 }, { - "epoch": 0.2648687695641705, - "eval/acc": 35.46511459350586, + "epoch": 1.5691868758915835, + "eval/acc": 39.53488540649414, "step": 2200 }, { - "epoch": 0.2648687695641705, - "eval_loss": 2.5862526893615723, - "eval_runtime": 0.2151, - "eval_samples_per_second": 199.927, - "eval_steps_per_second": 4.649, + "epoch": 1.5691868758915835, + "eval_loss": 3.021289587020874, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.958, + "eval_steps_per_second": 4.72, "step": 2200 }, { - "epoch": 0.26607271851673486, - "grad_norm": 11.5625, + "epoch": 1.5763195435092725, + "grad_norm": 10.0, "learning_rate": 8.836000000000001e-05, - "loss": 0.9947, + "loss": 1.0254, "step": 2210 }, { - "epoch": 0.2672766674692993, - "grad_norm": 8.25, + "epoch": 1.5834522111269616, + "grad_norm": 12.625, "learning_rate": 8.876e-05, - "loss": 0.717, + "loss": 1.1672, "step": 2220 }, { - "epoch": 0.2684806164218637, - "grad_norm": 26.25, + "epoch": 1.5905848787446506, + "grad_norm": 11.5, "learning_rate": 8.916e-05, - "loss": 0.8688, + "loss": 1.0656, "step": 2230 }, { - "epoch": 0.26968456537442814, - "grad_norm": 11.5, + "epoch": 1.5977175463623396, + "grad_norm": 8.8125, "learning_rate": 8.956e-05, - "loss": 0.9134, + "loss": 1.035, "step": 2240 }, { - "epoch": 0.2708885143269925, - "grad_norm": 6.875, + "epoch": 1.6048502139800287, + "grad_norm": 9.25, "learning_rate": 8.996e-05, - "loss": 0.8592, + "loss": 1.0972, "step": 2250 }, { - "epoch": 0.27209246327955694, - "grad_norm": 7.21875, + "epoch": 1.6119828815977175, + "grad_norm": 7.71875, "learning_rate": 9.036e-05, - "loss": 0.6548, + "loss": 1.0148, "step": 2260 }, { - "epoch": 0.27329641223212137, - "grad_norm": 12.25, + "epoch": 1.6191155492154066, + "grad_norm": 13.5, "learning_rate": 9.076e-05, - "loss": 0.8613, + "loss": 1.1202, "step": 2270 }, { - "epoch": 0.2745003611846858, - "grad_norm": 8.875, + "epoch": 1.6262482168330956, + "grad_norm": 9.125, "learning_rate": 9.116e-05, - "loss": 0.7455, + "loss": 1.1134, "step": 2280 }, { - "epoch": 0.27570431013725016, - "grad_norm": 12.5625, + "epoch": 1.6333808844507844, + "grad_norm": 15.25, "learning_rate": 9.156e-05, - "loss": 0.8458, + "loss": 1.0373, "step": 2290 }, { - "epoch": 0.2769082590898146, - "grad_norm": 8.8125, + "epoch": 1.6405135520684735, + "grad_norm": 9.125, "learning_rate": 9.196000000000001e-05, - "loss": 0.8003, + "loss": 1.0654, "step": 2300 }, { - "epoch": 0.2769082590898146, - "eval/acc": 32.55813980102539, + "epoch": 1.6405135520684735, + "eval/acc": 37.20930099487305, "step": 2300 }, { - "epoch": 0.2769082590898146, - "eval_loss": 2.6594340801239014, - "eval_runtime": 0.2129, - "eval_samples_per_second": 201.965, - "eval_steps_per_second": 4.697, + "epoch": 1.6405135520684735, + "eval_loss": 2.929560422897339, + "eval_runtime": 0.2167, + "eval_samples_per_second": 198.467, + "eval_steps_per_second": 4.616, "step": 2300 }, { - "epoch": 0.278112208042379, - "grad_norm": 10.6875, + "epoch": 1.6476462196861625, + "grad_norm": 8.25, "learning_rate": 9.236e-05, - "loss": 0.812, + "loss": 1.0218, "step": 2310 }, { - "epoch": 0.2793161569949434, - "grad_norm": 12.1875, + "epoch": 1.6547788873038516, + "grad_norm": 9.5625, "learning_rate": 9.276e-05, - "loss": 0.781, + "loss": 1.106, "step": 2320 }, { - "epoch": 0.2805201059475078, - "grad_norm": 8.125, + "epoch": 1.6619115549215406, + "grad_norm": 8.25, "learning_rate": 9.316000000000001e-05, - "loss": 0.9682, + "loss": 1.0558, "step": 2330 }, { - "epoch": 0.28172405490007224, - "grad_norm": 8.8125, + "epoch": 1.6690442225392297, + "grad_norm": 8.5625, "learning_rate": 9.356e-05, - "loss": 0.7531, + "loss": 0.9931, "step": 2340 }, { - "epoch": 0.28292800385263667, - "grad_norm": 7.375, + "epoch": 1.6761768901569187, + "grad_norm": 11.5625, "learning_rate": 9.396e-05, - "loss": 0.7235, + "loss": 1.0683, "step": 2350 }, { - "epoch": 0.28413195280520104, - "grad_norm": 7.8125, + "epoch": 1.6833095577746078, + "grad_norm": 10.0625, "learning_rate": 9.436e-05, - "loss": 0.9204, + "loss": 1.0631, "step": 2360 }, { - "epoch": 0.28533590175776546, - "grad_norm": 6.65625, + "epoch": 1.6904422253922968, + "grad_norm": 9.5625, "learning_rate": 9.476000000000001e-05, - "loss": 0.7636, + "loss": 1.049, "step": 2370 }, { - "epoch": 0.2865398507103299, - "grad_norm": 9.625, + "epoch": 1.6975748930099859, + "grad_norm": 12.8125, "learning_rate": 9.516e-05, - "loss": 0.855, + "loss": 1.0259, "step": 2380 }, { - "epoch": 0.2877437996628943, - "grad_norm": 9.6875, + "epoch": 1.7047075606276747, + "grad_norm": 9.0625, "learning_rate": 9.556e-05, - "loss": 0.8643, + "loss": 1.0085, "step": 2390 }, { - "epoch": 0.2889477486154587, - "grad_norm": 7.1875, + "epoch": 1.7118402282453637, + "grad_norm": 131.0, "learning_rate": 9.596000000000001e-05, - "loss": 0.8258, + "loss": 0.944, "step": 2400 }, { - "epoch": 0.2889477486154587, - "eval/acc": 36.627906799316406, + "epoch": 1.7118402282453637, + "eval/acc": 37.20930099487305, "step": 2400 }, { - "epoch": 0.2889477486154587, - "eval_loss": 2.7174084186553955, - "eval_runtime": 0.2111, - "eval_samples_per_second": 203.672, - "eval_steps_per_second": 4.737, + "epoch": 1.7118402282453637, + "eval_loss": 3.0231707096099854, + "eval_runtime": 0.2075, + "eval_samples_per_second": 207.206, + "eval_steps_per_second": 4.819, "step": 2400 }, { - "epoch": 0.2901516975680231, - "grad_norm": 7.65625, + "epoch": 1.7189728958630528, + "grad_norm": 8.375, "learning_rate": 9.636e-05, - "loss": 0.8752, + "loss": 1.0069, "step": 2410 }, { - "epoch": 0.29135564652058754, - "grad_norm": 8.75, + "epoch": 1.7261055634807418, + "grad_norm": 8.3125, "learning_rate": 9.676e-05, - "loss": 0.8082, + "loss": 1.0648, "step": 2420 }, { - "epoch": 0.2925595954731519, - "grad_norm": 10.4375, + "epoch": 1.7332382310984307, + "grad_norm": 11.0625, "learning_rate": 9.716000000000001e-05, - "loss": 0.7538, + "loss": 1.0594, "step": 2430 }, { - "epoch": 0.29376354442571634, - "grad_norm": 6.4375, + "epoch": 1.7403708987161197, + "grad_norm": 8.75, "learning_rate": 9.756000000000001e-05, - "loss": 0.7766, + "loss": 1.2082, "step": 2440 }, { - "epoch": 0.29496749337828077, - "grad_norm": 7.96875, + "epoch": 1.7475035663338088, + "grad_norm": 9.875, "learning_rate": 9.796e-05, - "loss": 0.844, + "loss": 1.0225, "step": 2450 }, { - "epoch": 0.2961714423308452, - "grad_norm": 7.75, + "epoch": 1.7546362339514978, + "grad_norm": 9.5625, "learning_rate": 9.836000000000001e-05, - "loss": 0.7127, + "loss": 0.9975, "step": 2460 }, { - "epoch": 0.29737539128340956, - "grad_norm": 11.5, + "epoch": 1.7617689015691869, + "grad_norm": 21.0, "learning_rate": 9.876000000000001e-05, - "loss": 0.8363, + "loss": 0.9533, "step": 2470 }, { - "epoch": 0.298579340235974, - "grad_norm": 6.4375, + "epoch": 1.768901569186876, + "grad_norm": 7.65625, "learning_rate": 9.916e-05, - "loss": 0.7429, + "loss": 0.9619, "step": 2480 }, { - "epoch": 0.2997832891885384, - "grad_norm": 11.5, + "epoch": 1.776034236804565, + "grad_norm": 13.625, "learning_rate": 9.956e-05, - "loss": 0.736, + "loss": 0.9425, "step": 2490 }, { - "epoch": 0.30098723814110284, - "grad_norm": 9.25, + "epoch": 1.783166904422254, + "grad_norm": 12.375, "learning_rate": 9.996000000000001e-05, - "loss": 0.8365, + "loss": 0.9893, "step": 2500 }, { - "epoch": 0.30098723814110284, + "epoch": 1.783166904422254, "eval/acc": 39.53488540649414, "step": 2500 }, { - "epoch": 0.30098723814110284, - "eval_loss": 2.713433027267456, - "eval_runtime": 0.2088, - "eval_samples_per_second": 205.919, - "eval_steps_per_second": 4.789, + "epoch": 1.783166904422254, + "eval_loss": 2.8728344440460205, + "eval_runtime": 0.2215, + "eval_samples_per_second": 194.132, + "eval_steps_per_second": 4.515, "step": 2500 }, { - "epoch": 0.3021911870936672, - "grad_norm": 7.03125, + "epoch": 1.790299572039943, + "grad_norm": 10.0, "learning_rate": 9.996000000000001e-05, - "loss": 0.7664, + "loss": 1.0137, "step": 2510 }, { - "epoch": 0.30339513604623164, - "grad_norm": 7.75, + "epoch": 1.797432239657632, + "grad_norm": 10.125, "learning_rate": 9.991555555555556e-05, - "loss": 0.9128, + "loss": 1.059, "step": 2520 }, { - "epoch": 0.30459908499879607, - "grad_norm": 9.0, + "epoch": 1.804564907275321, + "grad_norm": 32.0, "learning_rate": 9.987111111111111e-05, - "loss": 0.8045, + "loss": 1.0498, "step": 2530 }, { - "epoch": 0.30580303395136044, - "grad_norm": 8.9375, + "epoch": 1.81169757489301, + "grad_norm": 10.125, "learning_rate": 9.982666666666667e-05, - "loss": 0.8292, + "loss": 1.1431, "step": 2540 }, { - "epoch": 0.30700698290392486, - "grad_norm": 7.40625, + "epoch": 1.818830242510699, + "grad_norm": 7.90625, "learning_rate": 9.978222222222223e-05, - "loss": 0.7557, + "loss": 1.0715, "step": 2550 }, { - "epoch": 0.3082109318564893, - "grad_norm": 7.625, + "epoch": 1.825962910128388, + "grad_norm": 10.9375, "learning_rate": 9.973777777777778e-05, - "loss": 0.683, + "loss": 1.0446, "step": 2560 }, { - "epoch": 0.3094148808090537, - "grad_norm": 8.1875, + "epoch": 1.833095577746077, + "grad_norm": 13.0, "learning_rate": 9.969333333333334e-05, - "loss": 0.8052, + "loss": 1.0291, "step": 2570 }, { - "epoch": 0.3106188297616181, - "grad_norm": 8.4375, + "epoch": 1.840228245363766, + "grad_norm": 9.75, "learning_rate": 9.964888888888889e-05, - "loss": 0.7819, + "loss": 0.9713, "step": 2580 }, { - "epoch": 0.3118227787141825, - "grad_norm": 10.8125, + "epoch": 1.847360912981455, + "grad_norm": 10.5625, "learning_rate": 9.960444444444444e-05, - "loss": 0.8452, + "loss": 1.2157, "step": 2590 }, { - "epoch": 0.31302672766674694, - "grad_norm": 6.21875, + "epoch": 1.854493580599144, + "grad_norm": 9.3125, "learning_rate": 9.956e-05, - "loss": 0.7478, + "loss": 1.0455, "step": 2600 }, { - "epoch": 0.31302672766674694, - "eval/acc": 34.88372039794922, + "epoch": 1.854493580599144, + "eval/acc": 37.20930099487305, "step": 2600 }, { - "epoch": 0.31302672766674694, - "eval_loss": 2.6625020503997803, - "eval_runtime": 0.2061, - "eval_samples_per_second": 208.644, - "eval_steps_per_second": 4.852, + "epoch": 1.854493580599144, + "eval_loss": 2.951470375061035, + "eval_runtime": 0.2899, + "eval_samples_per_second": 148.338, + "eval_steps_per_second": 3.45, "step": 2600 }, { - "epoch": 0.31423067661931137, - "grad_norm": 7.375, + "epoch": 1.861626248216833, + "grad_norm": 10.5, "learning_rate": 9.951555555555556e-05, - "loss": 0.7623, + "loss": 1.0604, "step": 2610 }, { - "epoch": 0.31543462557187574, - "grad_norm": 9.0, + "epoch": 1.8687589158345221, + "grad_norm": 9.375, "learning_rate": 9.947111111111111e-05, - "loss": 0.8223, + "loss": 0.8715, "step": 2620 }, { - "epoch": 0.31663857452444016, - "grad_norm": 6.75, + "epoch": 1.8758915834522112, + "grad_norm": 10.4375, "learning_rate": 9.942666666666667e-05, - "loss": 0.7797, + "loss": 1.0034, "step": 2630 }, { - "epoch": 0.3178425234770046, - "grad_norm": 9.125, + "epoch": 1.8830242510699002, + "grad_norm": 8.0625, "learning_rate": 9.938222222222224e-05, - "loss": 0.6746, + "loss": 1.0557, "step": 2640 }, { - "epoch": 0.31904647242956896, - "grad_norm": 8.5, + "epoch": 1.8901569186875893, + "grad_norm": 7.21875, "learning_rate": 9.933777777777779e-05, - "loss": 0.8434, + "loss": 0.974, "step": 2650 }, { - "epoch": 0.3202504213821334, - "grad_norm": 10.3125, + "epoch": 1.8972895863052783, + "grad_norm": 10.875, "learning_rate": 9.929333333333333e-05, - "loss": 0.8625, + "loss": 1.1366, "step": 2660 }, { - "epoch": 0.3214543703346978, - "grad_norm": 8.125, + "epoch": 1.9044222539229672, + "grad_norm": 28.75, "learning_rate": 9.92488888888889e-05, - "loss": 0.8003, + "loss": 1.0135, "step": 2670 }, { - "epoch": 0.32265831928726224, - "grad_norm": 8.5625, + "epoch": 1.9115549215406562, + "grad_norm": 10.5625, "learning_rate": 9.920444444444444e-05, - "loss": 0.8145, + "loss": 1.0263, "step": 2680 }, { - "epoch": 0.3238622682398266, - "grad_norm": 8.0, + "epoch": 1.9186875891583453, + "grad_norm": 6.65625, "learning_rate": 9.916e-05, - "loss": 0.6519, + "loss": 0.9952, "step": 2690 }, { - "epoch": 0.32506621719239104, - "grad_norm": 8.5625, + "epoch": 1.925820256776034, + "grad_norm": 8.8125, "learning_rate": 9.911555555555557e-05, - "loss": 0.7627, + "loss": 1.0438, "step": 2700 }, { - "epoch": 0.32506621719239104, - "eval/acc": 38.953487396240234, + "epoch": 1.925820256776034, + "eval/acc": 39.53488540649414, "step": 2700 }, { - "epoch": 0.32506621719239104, - "eval_loss": 2.629239082336426, - "eval_runtime": 0.2162, - "eval_samples_per_second": 198.931, - "eval_steps_per_second": 4.626, + "epoch": 1.925820256776034, + "eval_loss": 2.8668925762176514, + "eval_runtime": 0.2154, + "eval_samples_per_second": 199.649, + "eval_steps_per_second": 4.643, "step": 2700 }, { - "epoch": 0.32627016614495546, - "grad_norm": 7.625, + "epoch": 1.9329529243937231, + "grad_norm": 7.1875, "learning_rate": 9.907111111111112e-05, - "loss": 0.7265, + "loss": 0.9522, "step": 2710 }, { - "epoch": 0.3274741150975199, - "grad_norm": 7.15625, + "epoch": 1.9400855920114122, + "grad_norm": 8.6875, "learning_rate": 9.902666666666666e-05, - "loss": 0.7468, + "loss": 0.9729, "step": 2720 }, { - "epoch": 0.32867806405008426, - "grad_norm": 8.5, + "epoch": 1.9472182596291012, + "grad_norm": 10.0625, "learning_rate": 9.898222222222223e-05, - "loss": 0.7816, + "loss": 1.0528, "step": 2730 }, { - "epoch": 0.3298820130026487, - "grad_norm": 6.8125, + "epoch": 1.9543509272467903, + "grad_norm": 8.8125, "learning_rate": 9.893777777777779e-05, - "loss": 0.7828, + "loss": 1.1212, "step": 2740 }, { - "epoch": 0.3310859619552131, - "grad_norm": 8.5625, + "epoch": 1.9614835948644793, + "grad_norm": 9.1875, "learning_rate": 9.889333333333334e-05, - "loss": 0.8273, + "loss": 0.9866, "step": 2750 }, { - "epoch": 0.3322899109077775, - "grad_norm": 7.28125, + "epoch": 1.9686162624821684, + "grad_norm": 8.25, "learning_rate": 9.884888888888889e-05, - "loss": 0.6265, + "loss": 0.8616, "step": 2760 }, { - "epoch": 0.3334938598603419, - "grad_norm": 7.78125, + "epoch": 1.9757489300998574, + "grad_norm": 8.5625, "learning_rate": 9.880444444444445e-05, - "loss": 0.8716, + "loss": 0.9972, "step": 2770 }, { - "epoch": 0.33469780881290634, - "grad_norm": 6.0, + "epoch": 1.9828815977175465, + "grad_norm": 10.0625, "learning_rate": 9.876000000000001e-05, - "loss": 0.7587, + "loss": 0.9781, "step": 2780 }, { - "epoch": 0.33590175776547077, - "grad_norm": 11.8125, + "epoch": 1.9900142653352355, + "grad_norm": 10.75, "learning_rate": 9.871555555555556e-05, - "loss": 0.836, + "loss": 1.0579, "step": 2790 }, { - "epoch": 0.33710570671803514, - "grad_norm": 8.3125, + "epoch": 1.9971469329529246, + "grad_norm": 8.25, "learning_rate": 9.867111111111112e-05, - "loss": 0.7196, + "loss": 1.0323, "step": 2800 }, { - "epoch": 0.33710570671803514, - "eval/acc": 34.88372039794922, + "epoch": 1.9971469329529246, + "eval/acc": 37.20930099487305, "step": 2800 }, { - "epoch": 0.33710570671803514, - "eval_loss": 2.5979089736938477, - "eval_runtime": 0.212, - "eval_samples_per_second": 202.843, - "eval_steps_per_second": 4.717, + "epoch": 1.9971469329529246, + "eval_loss": 2.9081883430480957, + "eval_runtime": 0.2058, + "eval_samples_per_second": 208.905, + "eval_steps_per_second": 4.858, "step": 2800 }, { - "epoch": 0.33830965567059956, - "grad_norm": 8.125, + "epoch": 2.0042796005706136, + "grad_norm": 10.25, "learning_rate": 9.862666666666667e-05, - "loss": 0.7128, + "loss": 1.0597, "step": 2810 }, { - "epoch": 0.339513604623164, - "grad_norm": 7.0, + "epoch": 2.011412268188302, + "grad_norm": 7.0625, "learning_rate": 9.858222222222223e-05, - "loss": 0.8709, + "loss": 0.9582, "step": 2820 }, { - "epoch": 0.3407175535757284, - "grad_norm": 10.875, + "epoch": 2.0185449358059913, + "grad_norm": 7.0625, "learning_rate": 9.853777777777778e-05, - "loss": 0.6885, + "loss": 1.0058, "step": 2830 }, { - "epoch": 0.3419215025282928, - "grad_norm": 6.625, + "epoch": 2.0256776034236803, + "grad_norm": 7.09375, "learning_rate": 9.849333333333334e-05, - "loss": 0.8262, + "loss": 1.009, "step": 2840 }, { - "epoch": 0.3431254514808572, - "grad_norm": 9.0625, + "epoch": 2.0328102710413694, + "grad_norm": 8.9375, "learning_rate": 9.844888888888889e-05, - "loss": 0.6365, + "loss": 0.93, "step": 2850 }, { - "epoch": 0.34432940043342164, - "grad_norm": 7.96875, + "epoch": 2.0399429386590584, + "grad_norm": 8.1875, "learning_rate": 9.840444444444445e-05, - "loss": 0.8177, + "loss": 1.0953, "step": 2860 }, { - "epoch": 0.345533349385986, - "grad_norm": 6.71875, + "epoch": 2.0470756062767475, + "grad_norm": 7.78125, "learning_rate": 9.836000000000001e-05, - "loss": 0.7043, + "loss": 1.0437, "step": 2870 }, { - "epoch": 0.34673729833855044, - "grad_norm": 10.4375, + "epoch": 2.0542082738944365, + "grad_norm": 8.75, "learning_rate": 9.831555555555556e-05, - "loss": 0.7503, + "loss": 0.9873, "step": 2880 }, { - "epoch": 0.34794124729111486, - "grad_norm": 7.375, + "epoch": 2.0613409415121255, + "grad_norm": 8.375, "learning_rate": 9.827111111111111e-05, - "loss": 0.7532, + "loss": 0.9414, "step": 2890 }, { - "epoch": 0.3491451962436793, - "grad_norm": 7.65625, + "epoch": 2.0684736091298146, + "grad_norm": 9.0, "learning_rate": 9.822666666666667e-05, - "loss": 0.6942, + "loss": 0.9625, "step": 2900 }, { - "epoch": 0.3491451962436793, - "eval/acc": 37.79069900512695, + "epoch": 2.0684736091298146, + "eval/acc": 51.16279220581055, "step": 2900 }, { - "epoch": 0.3491451962436793, - "eval_loss": 2.698911190032959, - "eval_runtime": 1.2554, - "eval_samples_per_second": 34.253, - "eval_steps_per_second": 0.797, + "epoch": 2.0684736091298146, + "eval_loss": 1.884637713432312, + "eval_runtime": 1.4017, + "eval_samples_per_second": 30.678, + "eval_steps_per_second": 0.713, "step": 2900 }, { - "epoch": 0.35034914519624366, - "grad_norm": 7.1875, + "epoch": 2.0756062767475036, + "grad_norm": 9.5625, "learning_rate": 9.818222222222223e-05, - "loss": 0.7651, + "loss": 1.0246, "step": 2910 }, { - "epoch": 0.3515530941488081, - "grad_norm": 6.0, + "epoch": 2.0827389443651927, + "grad_norm": 8.125, "learning_rate": 9.813777777777778e-05, - "loss": 0.7786, + "loss": 0.9646, "step": 2920 }, { - "epoch": 0.3527570431013725, - "grad_norm": 9.375, + "epoch": 2.0898716119828817, + "grad_norm": 8.3125, "learning_rate": 9.809333333333333e-05, - "loss": 0.8285, + "loss": 1.0022, "step": 2930 }, { - "epoch": 0.35396099205393694, - "grad_norm": 6.4375, + "epoch": 2.097004279600571, + "grad_norm": 8.625, "learning_rate": 9.80488888888889e-05, - "loss": 0.7339, + "loss": 0.9834, "step": 2940 }, { - "epoch": 0.3551649410065013, - "grad_norm": 8.8125, + "epoch": 2.10413694721826, + "grad_norm": 45.25, "learning_rate": 9.800444444444446e-05, - "loss": 0.6948, + "loss": 0.9159, "step": 2950 }, { - "epoch": 0.35636888995906574, - "grad_norm": 11.4375, + "epoch": 2.1112696148359484, + "grad_norm": 9.375, "learning_rate": 9.796e-05, - "loss": 0.8455, + "loss": 1.0598, "step": 2960 }, { - "epoch": 0.35757283891163016, - "grad_norm": 8.5625, + "epoch": 2.1184022824536375, + "grad_norm": 6.90625, "learning_rate": 9.791555555555557e-05, - "loss": 0.791, + "loss": 0.8848, "step": 2970 }, { - "epoch": 0.35877678786419454, - "grad_norm": 7.84375, + "epoch": 2.1255349500713265, + "grad_norm": 7.5625, "learning_rate": 9.787111111111111e-05, - "loss": 0.8574, + "loss": 0.942, "step": 2980 }, { - "epoch": 0.35998073681675896, - "grad_norm": 9.4375, + "epoch": 2.1326676176890156, + "grad_norm": 8.6875, "learning_rate": 9.782666666666666e-05, - "loss": 0.7923, + "loss": 0.9583, "step": 2990 }, { - "epoch": 0.3611846857693234, - "grad_norm": 8.0625, + "epoch": 2.1398002853067046, + "grad_norm": 9.0, "learning_rate": 9.778222222222222e-05, - "loss": 0.863, + "loss": 0.9836, "step": 3000 }, { - "epoch": 0.3611846857693234, - "eval/acc": 41.86046600341797, + "epoch": 2.1398002853067046, + "eval/acc": 48.83720779418945, "step": 3000 }, { - "epoch": 0.3611846857693234, - "eval_loss": 2.5240559577941895, - "eval_runtime": 0.2105, - "eval_samples_per_second": 204.269, - "eval_steps_per_second": 4.75, + "epoch": 2.1398002853067046, + "eval_loss": 1.9625970125198364, + "eval_runtime": 0.2298, + "eval_samples_per_second": 187.09, + "eval_steps_per_second": 4.351, "step": 3000 }, { - "epoch": 0.3623886347218878, - "grad_norm": 6.71875, + "epoch": 2.1469329529243937, + "grad_norm": 8.4375, "learning_rate": 9.773777777777779e-05, - "loss": 0.7726, + "loss": 1.028, "step": 3010 }, { - "epoch": 0.3635925836744522, - "grad_norm": 8.125, + "epoch": 2.1540656205420827, + "grad_norm": 10.4375, "learning_rate": 9.769333333333334e-05, - "loss": 0.8234, + "loss": 0.9209, "step": 3020 }, { - "epoch": 0.3647965326270166, - "grad_norm": 7.90625, + "epoch": 2.1611982881597718, + "grad_norm": 9.3125, "learning_rate": 9.764888888888888e-05, - "loss": 0.8125, + "loss": 0.9999, "step": 3030 }, { - "epoch": 0.36600048157958104, - "grad_norm": 5.875, + "epoch": 2.168330955777461, + "grad_norm": 8.375, "learning_rate": 9.760444444444446e-05, - "loss": 0.739, + "loss": 0.9576, "step": 3040 }, { - "epoch": 0.3672044305321454, - "grad_norm": 32.75, + "epoch": 2.17546362339515, + "grad_norm": 7.4375, "learning_rate": 9.756000000000001e-05, - "loss": 0.8773, + "loss": 0.8832, "step": 3050 }, { - "epoch": 0.36840837948470984, - "grad_norm": 8.625, + "epoch": 2.182596291012839, + "grad_norm": 8.125, "learning_rate": 9.751555555555556e-05, - "loss": 0.6411, + "loss": 0.933, "step": 3060 }, { - "epoch": 0.36961232843727426, - "grad_norm": 10.0625, + "epoch": 2.189728958630528, + "grad_norm": 8.9375, "learning_rate": 9.747111111111112e-05, - "loss": 0.7757, + "loss": 0.9962, "step": 3070 }, { - "epoch": 0.3708162773898387, - "grad_norm": 7.78125, + "epoch": 2.196861626248217, + "grad_norm": 7.1875, "learning_rate": 9.742666666666667e-05, - "loss": 0.8144, + "loss": 1.003, "step": 3080 }, { - "epoch": 0.37202022634240306, - "grad_norm": 8.25, + "epoch": 2.2039942938659056, + "grad_norm": 8.1875, "learning_rate": 9.738222222222223e-05, - "loss": 0.7915, + "loss": 0.9441, "step": 3090 }, { - "epoch": 0.3732241752949675, - "grad_norm": 9.5, + "epoch": 2.2111269614835947, + "grad_norm": 7.21875, "learning_rate": 9.733777777777778e-05, - "loss": 0.7808, + "loss": 1.0335, "step": 3100 }, { - "epoch": 0.3732241752949675, - "eval/acc": 39.53488540649414, + "epoch": 2.2111269614835947, + "eval/acc": 51.16279220581055, "step": 3100 }, { - "epoch": 0.3732241752949675, - "eval_loss": 2.6263325214385986, - "eval_runtime": 0.2107, - "eval_samples_per_second": 204.065, - "eval_steps_per_second": 4.746, + "epoch": 2.2111269614835947, + "eval_loss": 1.8829365968704224, + "eval_runtime": 0.248, + "eval_samples_per_second": 173.381, + "eval_steps_per_second": 4.032, "step": 3100 }, { - "epoch": 0.3744281242475319, - "grad_norm": 7.34375, + "epoch": 2.2182596291012837, + "grad_norm": 9.1875, "learning_rate": 9.729333333333334e-05, - "loss": 0.6467, + "loss": 0.9694, "step": 3110 }, { - "epoch": 0.37563207320009634, - "grad_norm": 10.5625, + "epoch": 2.2253922967189728, + "grad_norm": 6.9375, "learning_rate": 9.724888888888889e-05, - "loss": 0.7271, + "loss": 1.0386, "step": 3120 }, { - "epoch": 0.3768360221526607, - "grad_norm": 19.375, + "epoch": 2.232524964336662, + "grad_norm": 8.6875, "learning_rate": 9.720444444444445e-05, - "loss": 0.8248, + "loss": 0.9614, "step": 3130 }, { - "epoch": 0.37803997110522514, - "grad_norm": 11.6875, + "epoch": 2.239657631954351, + "grad_norm": 8.3125, "learning_rate": 9.716000000000001e-05, - "loss": 0.7468, + "loss": 1.0643, "step": 3140 }, { - "epoch": 0.37924392005778956, - "grad_norm": 6.71875, + "epoch": 2.24679029957204, + "grad_norm": 8.125, "learning_rate": 9.711555555555556e-05, - "loss": 0.8189, + "loss": 0.9243, "step": 3150 }, { - "epoch": 0.38044786901035393, - "grad_norm": 7.15625, + "epoch": 2.253922967189729, + "grad_norm": 9.125, "learning_rate": 9.707111111111111e-05, - "loss": 0.7265, + "loss": 0.8419, "step": 3160 }, { - "epoch": 0.38165181796291836, - "grad_norm": 11.9375, + "epoch": 2.261055634807418, + "grad_norm": 9.125, "learning_rate": 9.702666666666667e-05, - "loss": 0.7502, + "loss": 0.9961, "step": 3170 }, { - "epoch": 0.3828557669154828, - "grad_norm": 7.78125, + "epoch": 2.268188302425107, + "grad_norm": 6.3125, "learning_rate": 9.698222222222223e-05, - "loss": 0.8412, + "loss": 0.8931, "step": 3180 }, { - "epoch": 0.3840597158680472, - "grad_norm": 6.75, + "epoch": 2.275320970042796, + "grad_norm": 7.875, "learning_rate": 9.693777777777778e-05, - "loss": 0.8689, + "loss": 1.0057, "step": 3190 }, { - "epoch": 0.3852636648206116, - "grad_norm": 7.6875, + "epoch": 2.282453637660485, + "grad_norm": 6.90625, "learning_rate": 9.689333333333333e-05, - "loss": 0.8053, + "loss": 0.9606, "step": 3200 }, { - "epoch": 0.3852636648206116, - "eval/acc": 39.53488540649414, + "epoch": 2.282453637660485, + "eval/acc": 48.83720779418945, "step": 3200 }, { - "epoch": 0.3852636648206116, - "eval_loss": 2.6145706176757812, - "eval_runtime": 0.2093, - "eval_samples_per_second": 205.398, - "eval_steps_per_second": 4.777, + "epoch": 2.282453637660485, + "eval_loss": 1.823419451713562, + "eval_runtime": 0.2149, + "eval_samples_per_second": 200.139, + "eval_steps_per_second": 4.654, "step": 3200 }, { - "epoch": 0.386467613773176, - "grad_norm": 7.65625, + "epoch": 2.289586305278174, + "grad_norm": 11.8125, "learning_rate": 9.684888888888889e-05, - "loss": 0.7601, + "loss": 0.9218, "step": 3210 }, { - "epoch": 0.38767156272574044, - "grad_norm": 19.25, + "epoch": 2.2967189728958632, + "grad_norm": 8.9375, "learning_rate": 9.680444444444445e-05, - "loss": 0.7944, + "loss": 1.0111, "step": 3220 }, { - "epoch": 0.38887551167830486, - "grad_norm": 9.375, + "epoch": 2.3038516405135523, + "grad_norm": 8.625, "learning_rate": 9.676e-05, - "loss": 0.839, + "loss": 1.0968, "step": 3230 }, { - "epoch": 0.39007946063086923, - "grad_norm": 8.5, + "epoch": 2.310984308131241, + "grad_norm": 7.1875, "learning_rate": 9.671555555555556e-05, - "loss": 0.7794, + "loss": 1.0236, "step": 3240 }, { - "epoch": 0.39128340958343366, - "grad_norm": 7.78125, + "epoch": 2.31811697574893, + "grad_norm": 6.84375, "learning_rate": 9.667111111111111e-05, - "loss": 0.753, + "loss": 0.92, "step": 3250 }, { - "epoch": 0.3924873585359981, - "grad_norm": 7.15625, + "epoch": 2.325249643366619, + "grad_norm": 8.75, "learning_rate": 9.662666666666667e-05, - "loss": 0.7326, + "loss": 0.8205, "step": 3260 }, { - "epoch": 0.39369130748856246, - "grad_norm": 13.4375, + "epoch": 2.332382310984308, + "grad_norm": 30.75, "learning_rate": 9.658222222222222e-05, - "loss": 0.6754, + "loss": 0.9676, "step": 3270 }, { - "epoch": 0.3948952564411269, - "grad_norm": 6.71875, + "epoch": 2.339514978601997, + "grad_norm": 13.0, "learning_rate": 9.653777777777778e-05, - "loss": 0.757, + "loss": 0.9086, "step": 3280 }, { - "epoch": 0.3960992053936913, - "grad_norm": 7.5625, + "epoch": 2.346647646219686, + "grad_norm": 9.375, "learning_rate": 9.649333333333333e-05, - "loss": 0.9203, + "loss": 1.0504, "step": 3290 }, { - "epoch": 0.39730315434625574, - "grad_norm": 8.375, + "epoch": 2.353780313837375, + "grad_norm": 39.0, "learning_rate": 9.64488888888889e-05, - "loss": 0.8552, + "loss": 0.9481, "step": 3300 }, { - "epoch": 0.39730315434625574, - "eval/acc": 44.1860466003418, + "epoch": 2.353780313837375, + "eval/acc": 46.511627197265625, "step": 3300 }, { - "epoch": 0.39730315434625574, - "eval_loss": 2.571866273880005, - "eval_runtime": 0.2083, - "eval_samples_per_second": 206.479, - "eval_steps_per_second": 4.802, + "epoch": 2.353780313837375, + "eval_loss": 1.9555243253707886, + "eval_runtime": 0.6637, + "eval_samples_per_second": 64.792, + "eval_steps_per_second": 1.507, "step": 3300 }, { - "epoch": 0.3985071032988201, - "grad_norm": 7.5625, + "epoch": 2.3609129814550642, + "grad_norm": 11.4375, "learning_rate": 9.640444444444446e-05, - "loss": 0.7811, + "loss": 0.9641, "step": 3310 }, { - "epoch": 0.39971105225138454, - "grad_norm": 11.75, + "epoch": 2.3680456490727533, + "grad_norm": 9.0625, "learning_rate": 9.636e-05, - "loss": 0.6717, + "loss": 0.9624, "step": 3320 }, { - "epoch": 0.40091500120394896, - "grad_norm": 8.1875, + "epoch": 2.3751783166904423, + "grad_norm": 12.625, "learning_rate": 9.631555555555555e-05, - "loss": 0.838, + "loss": 1.0082, "step": 3330 }, { - "epoch": 0.4021189501565134, - "grad_norm": 6.40625, + "epoch": 2.3823109843081314, + "grad_norm": 7.25, "learning_rate": 9.627111111111112e-05, - "loss": 0.8568, + "loss": 1.0249, "step": 3340 }, { - "epoch": 0.40332289910907776, - "grad_norm": 7.3125, + "epoch": 2.3894436519258204, + "grad_norm": 13.375, "learning_rate": 9.622666666666668e-05, - "loss": 0.6742, + "loss": 1.0153, "step": 3350 }, { - "epoch": 0.4045268480616422, - "grad_norm": 7.875, + "epoch": 2.3965763195435095, + "grad_norm": 6.6875, "learning_rate": 9.618222222222223e-05, - "loss": 0.7849, + "loss": 0.9533, "step": 3360 }, { - "epoch": 0.4057307970142066, - "grad_norm": 8.5625, + "epoch": 2.403708987161198, + "grad_norm": 9.25, "learning_rate": 9.613777777777779e-05, - "loss": 0.7537, + "loss": 1.1051, "step": 3370 }, { - "epoch": 0.406934745966771, - "grad_norm": 8.5625, + "epoch": 2.410841654778887, + "grad_norm": 9.5625, "learning_rate": 9.609333333333334e-05, - "loss": 0.6935, + "loss": 1.0551, "step": 3380 }, { - "epoch": 0.4081386949193354, - "grad_norm": 6.3125, + "epoch": 2.417974322396576, + "grad_norm": 7.21875, "learning_rate": 9.604888888888889e-05, - "loss": 0.8065, + "loss": 0.9032, "step": 3390 }, { - "epoch": 0.40934264387189984, - "grad_norm": 26.25, + "epoch": 2.425106990014265, + "grad_norm": 8.5625, "learning_rate": 9.600444444444445e-05, - "loss": 0.6558, + "loss": 1.1008, "step": 3400 }, { - "epoch": 0.40934264387189984, - "eval/acc": 37.20930099487305, + "epoch": 2.425106990014265, + "eval/acc": 51.16279220581055, "step": 3400 }, { - "epoch": 0.40934264387189984, - "eval_loss": 2.7212982177734375, - "eval_runtime": 0.2094, - "eval_samples_per_second": 205.345, - "eval_steps_per_second": 4.775, + "epoch": 2.425106990014265, + "eval_loss": 1.7766540050506592, + "eval_runtime": 0.2273, + "eval_samples_per_second": 189.16, + "eval_steps_per_second": 4.399, "step": 3400 }, { - "epoch": 0.41054659282446426, - "grad_norm": 6.84375, + "epoch": 2.4322396576319543, + "grad_norm": 10.375, "learning_rate": 9.596000000000001e-05, - "loss": 0.7642, + "loss": 0.9562, "step": 3410 }, { - "epoch": 0.41175054177702863, - "grad_norm": 7.0625, + "epoch": 2.4393723252496433, + "grad_norm": 8.9375, "learning_rate": 9.591555555555556e-05, - "loss": 0.7185, + "loss": 1.0756, "step": 3420 }, { - "epoch": 0.41295449072959306, - "grad_norm": 7.15625, + "epoch": 2.4465049928673324, + "grad_norm": 9.125, "learning_rate": 9.58711111111111e-05, - "loss": 0.6634, + "loss": 0.9554, "step": 3430 }, { - "epoch": 0.4141584396821575, - "grad_norm": 4.96875, + "epoch": 2.4536376604850214, + "grad_norm": 8.9375, "learning_rate": 9.582666666666668e-05, - "loss": 0.6383, + "loss": 0.9122, "step": 3440 }, { - "epoch": 0.4153623886347219, - "grad_norm": 7.15625, + "epoch": 2.4607703281027105, + "grad_norm": 8.625, "learning_rate": 9.578222222222223e-05, - "loss": 0.8032, + "loss": 0.9311, "step": 3450 }, { - "epoch": 0.4165663375872863, - "grad_norm": 9.0625, + "epoch": 2.4679029957203995, + "grad_norm": 6.65625, "learning_rate": 9.573777777777778e-05, - "loss": 0.7294, + "loss": 1.0023, "step": 3460 }, { - "epoch": 0.4177702865398507, - "grad_norm": 9.5, + "epoch": 2.4750356633380886, + "grad_norm": 8.125, "learning_rate": 9.569333333333334e-05, - "loss": 0.802, + "loss": 0.9172, "step": 3470 }, { - "epoch": 0.41897423549241514, - "grad_norm": 7.0, + "epoch": 2.4821683309557776, + "grad_norm": 7.375, "learning_rate": 9.56488888888889e-05, - "loss": 0.7307, + "loss": 0.9407, "step": 3480 }, { - "epoch": 0.4201781844449795, - "grad_norm": 6.34375, + "epoch": 2.4893009985734667, + "grad_norm": 10.25, "learning_rate": 9.560444444444445e-05, - "loss": 0.7239, + "loss": 0.9433, "step": 3490 }, { - "epoch": 0.42138213339754393, - "grad_norm": 6.5, + "epoch": 2.4964336661911553, + "grad_norm": 8.625, "learning_rate": 9.556e-05, - "loss": 0.6711, + "loss": 0.9934, "step": 3500 }, { - "epoch": 0.42138213339754393, - "eval/acc": 39.53488540649414, + "epoch": 2.4964336661911553, + "eval/acc": 53.488372802734375, "step": 3500 }, { - "epoch": 0.42138213339754393, - "eval_loss": 2.569326400756836, - "eval_runtime": 0.2066, - "eval_samples_per_second": 208.137, - "eval_steps_per_second": 4.84, + "epoch": 2.4964336661911553, + "eval_loss": 1.9511696100234985, + "eval_runtime": 0.2213, + "eval_samples_per_second": 194.329, + "eval_steps_per_second": 4.519, "step": 3500 }, { - "epoch": 0.42258608235010836, - "grad_norm": 8.125, + "epoch": 2.5035663338088447, + "grad_norm": 7.625, "learning_rate": 9.551555555555556e-05, - "loss": 0.695, + "loss": 0.9157, "step": 3510 }, { - "epoch": 0.4237900313026728, - "grad_norm": 8.3125, + "epoch": 2.5106990014265333, + "grad_norm": 7.4375, "learning_rate": 9.547111111111111e-05, - "loss": 0.8691, + "loss": 0.9202, "step": 3520 }, { - "epoch": 0.42499398025523716, - "grad_norm": 8.6875, + "epoch": 2.5178316690442224, + "grad_norm": 9.25, "learning_rate": 9.542666666666667e-05, - "loss": 0.7582, + "loss": 0.8526, "step": 3530 }, { - "epoch": 0.4261979292078016, - "grad_norm": 7.25, + "epoch": 2.5249643366619114, + "grad_norm": 7.71875, "learning_rate": 9.538222222222223e-05, - "loss": 0.7143, + "loss": 0.9562, "step": 3540 }, { - "epoch": 0.427401878160366, - "grad_norm": 8.6875, + "epoch": 2.5320970042796005, + "grad_norm": 9.75, "learning_rate": 9.533777777777778e-05, - "loss": 0.6754, + "loss": 0.9927, "step": 3550 }, { - "epoch": 0.42860582711293044, - "grad_norm": 7.8125, + "epoch": 2.5392296718972895, + "grad_norm": 8.1875, "learning_rate": 9.529333333333333e-05, - "loss": 0.7153, + "loss": 0.9263, "step": 3560 }, { - "epoch": 0.4298097760654948, - "grad_norm": 7.5625, + "epoch": 2.5463623395149786, + "grad_norm": 6.9375, "learning_rate": 9.52488888888889e-05, - "loss": 0.7293, + "loss": 0.9367, "step": 3570 }, { - "epoch": 0.43101372501805923, - "grad_norm": 7.5625, + "epoch": 2.5534950071326676, + "grad_norm": 9.5625, "learning_rate": 9.520444444444446e-05, - "loss": 0.7066, + "loss": 0.9284, "step": 3580 }, { - "epoch": 0.43221767397062366, - "grad_norm": 8.1875, + "epoch": 2.5606276747503567, + "grad_norm": 8.5625, "learning_rate": 9.516e-05, - "loss": 0.691, + "loss": 0.8394, "step": 3590 }, { - "epoch": 0.43342162292318803, - "grad_norm": 7.125, + "epoch": 2.5677603423680457, + "grad_norm": 10.25, "learning_rate": 9.511555555555555e-05, - "loss": 0.8239, + "loss": 0.9336, "step": 3600 }, { - "epoch": 0.43342162292318803, - "eval/acc": 44.1860466003418, + "epoch": 2.5677603423680457, + "eval/acc": 46.511627197265625, "step": 3600 }, { - "epoch": 0.43342162292318803, - "eval_loss": 2.4877374172210693, - "eval_runtime": 0.3957, - "eval_samples_per_second": 108.658, - "eval_steps_per_second": 2.527, + "epoch": 2.5677603423680457, + "eval_loss": 1.9759221076965332, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.935, + "eval_steps_per_second": 4.719, "step": 3600 }, { - "epoch": 0.43462557187575246, - "grad_norm": 6.375, + "epoch": 2.574893009985735, + "grad_norm": 10.0625, "learning_rate": 9.507111111111111e-05, - "loss": 0.6782, + "loss": 1.0005, "step": 3610 }, { - "epoch": 0.4358295208283169, - "grad_norm": 7.1875, + "epoch": 2.582025677603424, + "grad_norm": 8.375, "learning_rate": 9.502666666666668e-05, - "loss": 0.7602, + "loss": 0.9319, "step": 3620 }, { - "epoch": 0.4370334697808813, - "grad_norm": 8.125, + "epoch": 2.5891583452211124, + "grad_norm": 8.5, "learning_rate": 9.498222222222222e-05, - "loss": 0.7232, + "loss": 0.9125, "step": 3630 }, { - "epoch": 0.4382374187334457, - "grad_norm": 7.84375, + "epoch": 2.596291012838802, + "grad_norm": 7.71875, "learning_rate": 9.493777777777779e-05, - "loss": 0.729, + "loss": 0.9279, "step": 3640 }, { - "epoch": 0.4394413676860101, - "grad_norm": 8.375, + "epoch": 2.6034236804564905, + "grad_norm": 11.875, "learning_rate": 9.489333333333334e-05, - "loss": 0.8222, + "loss": 0.952, "step": 3650 }, { - "epoch": 0.44064531663857454, - "grad_norm": 8.125, + "epoch": 2.6105563480741796, + "grad_norm": 7.5625, "learning_rate": 9.48488888888889e-05, - "loss": 0.6918, + "loss": 1.0043, "step": 3660 }, { - "epoch": 0.44184926559113896, - "grad_norm": 8.1875, + "epoch": 2.6176890156918686, + "grad_norm": 11.5625, "learning_rate": 9.480444444444445e-05, - "loss": 0.6761, + "loss": 0.8932, "step": 3670 }, { - "epoch": 0.44305321454370333, - "grad_norm": 5.65625, + "epoch": 2.6248216833095577, + "grad_norm": 7.84375, "learning_rate": 9.476000000000001e-05, - "loss": 0.7532, + "loss": 0.8775, "step": 3680 }, { - "epoch": 0.44425716349626776, - "grad_norm": 8.8125, + "epoch": 2.6319543509272467, + "grad_norm": 9.0, "learning_rate": 9.471555555555556e-05, - "loss": 0.7072, + "loss": 0.9756, "step": 3690 }, { - "epoch": 0.4454611124488322, - "grad_norm": 6.5625, + "epoch": 2.6390870185449358, + "grad_norm": 7.375, "learning_rate": 9.46711111111111e-05, - "loss": 0.8405, + "loss": 0.9345, "step": 3700 }, { - "epoch": 0.4454611124488322, - "eval/acc": 39.53488540649414, + "epoch": 2.6390870185449358, + "eval/acc": 51.16279220581055, "step": 3700 }, { - "epoch": 0.4454611124488322, - "eval_loss": 2.615053176879883, - "eval_runtime": 4.8304, - "eval_samples_per_second": 8.902, - "eval_steps_per_second": 0.207, + "epoch": 2.6390870185449358, + "eval_loss": 1.9134849309921265, + "eval_runtime": 0.2322, + "eval_samples_per_second": 185.194, + "eval_steps_per_second": 4.307, "step": 3700 }, { - "epoch": 0.44666506140139656, - "grad_norm": 8.6875, + "epoch": 2.646219686162625, + "grad_norm": 8.4375, "learning_rate": 9.462666666666668e-05, - "loss": 0.7249, + "loss": 0.9851, "step": 3710 }, { - "epoch": 0.447869010353961, - "grad_norm": 8.4375, + "epoch": 2.653352353780314, + "grad_norm": 31.75, "learning_rate": 9.458222222222223e-05, - "loss": 0.8561, + "loss": 0.9712, "step": 3720 }, { - "epoch": 0.4490729593065254, - "grad_norm": 7.3125, + "epoch": 2.660485021398003, + "grad_norm": 6.75, "learning_rate": 9.453777777777778e-05, - "loss": 0.7884, + "loss": 0.8641, "step": 3730 }, { - "epoch": 0.45027690825908984, - "grad_norm": 7.34375, + "epoch": 2.667617689015692, + "grad_norm": 6.5625, "learning_rate": 9.449333333333334e-05, - "loss": 0.7169, + "loss": 0.945, "step": 3740 }, { - "epoch": 0.4514808572116542, - "grad_norm": 5.5, + "epoch": 2.674750356633381, + "grad_norm": 6.0625, "learning_rate": 9.44488888888889e-05, - "loss": 0.7542, + "loss": 0.9535, "step": 3750 }, { - "epoch": 0.45268480616421863, - "grad_norm": 6.09375, + "epoch": 2.68188302425107, + "grad_norm": 7.90625, "learning_rate": 9.440444444444445e-05, - "loss": 0.6292, + "loss": 0.8844, "step": 3760 }, { - "epoch": 0.45388875511678306, - "grad_norm": 8.9375, + "epoch": 2.689015691868759, + "grad_norm": 9.8125, "learning_rate": 9.436e-05, - "loss": 0.6682, + "loss": 0.9064, "step": 3770 }, { - "epoch": 0.4550927040693475, - "grad_norm": 5.09375, + "epoch": 2.6961483594864477, + "grad_norm": 8.4375, "learning_rate": 9.431555555555556e-05, - "loss": 0.6499, + "loss": 1.0119, "step": 3780 }, { - "epoch": 0.45629665302191186, - "grad_norm": 8.5, + "epoch": 2.703281027104137, + "grad_norm": 7.15625, "learning_rate": 9.427111111111112e-05, - "loss": 0.7859, + "loss": 0.9655, "step": 3790 }, { - "epoch": 0.4575006019744763, - "grad_norm": 14.5, + "epoch": 2.710413694721826, + "grad_norm": 9.4375, "learning_rate": 9.422666666666667e-05, - "loss": 0.7987, + "loss": 0.9187, "step": 3800 }, { - "epoch": 0.4575006019744763, - "eval/acc": 39.53488540649414, + "epoch": 2.710413694721826, + "eval/acc": 51.16279220581055, "step": 3800 }, { - "epoch": 0.4575006019744763, - "eval_loss": 2.645066022872925, - "eval_runtime": 0.6165, - "eval_samples_per_second": 69.745, - "eval_steps_per_second": 1.622, + "epoch": 2.710413694721826, + "eval_loss": 1.9277268648147583, + "eval_runtime": 0.2166, + "eval_samples_per_second": 198.481, + "eval_steps_per_second": 4.616, "step": 3800 }, { - "epoch": 0.4587045509270407, - "grad_norm": 6.25, + "epoch": 2.717546362339515, + "grad_norm": 9.25, "learning_rate": 9.418222222222223e-05, - "loss": 0.7035, + "loss": 0.8689, "step": 3810 }, { - "epoch": 0.4599084998796051, - "grad_norm": 6.46875, + "epoch": 2.724679029957204, + "grad_norm": 8.0625, "learning_rate": 9.413777777777778e-05, - "loss": 0.6329, + "loss": 0.9138, "step": 3820 }, { - "epoch": 0.4611124488321695, - "grad_norm": 8.875, + "epoch": 2.731811697574893, + "grad_norm": 14.3125, "learning_rate": 9.409333333333333e-05, - "loss": 0.7553, + "loss": 0.9129, "step": 3830 }, { - "epoch": 0.46231639778473393, - "grad_norm": 9.3125, + "epoch": 2.738944365192582, + "grad_norm": 6.78125, "learning_rate": 9.404888888888889e-05, - "loss": 0.6551, + "loss": 0.8666, "step": 3840 }, { - "epoch": 0.46352034673729836, - "grad_norm": 11.0625, + "epoch": 2.746077032810271, + "grad_norm": 7.4375, "learning_rate": 9.400444444444445e-05, - "loss": 0.6634, + "loss": 0.9474, "step": 3850 }, { - "epoch": 0.46472429568986273, - "grad_norm": 6.71875, + "epoch": 2.75320970042796, + "grad_norm": 7.46875, "learning_rate": 9.396e-05, - "loss": 0.6527, + "loss": 0.9312, "step": 3860 }, { - "epoch": 0.46592824464242716, - "grad_norm": 6.75, + "epoch": 2.760342368045649, + "grad_norm": 7.84375, "learning_rate": 9.391555555555555e-05, - "loss": 0.8268, + "loss": 0.943, "step": 3870 }, { - "epoch": 0.4671321935949916, - "grad_norm": 7.78125, + "epoch": 2.767475035663338, + "grad_norm": 8.125, "learning_rate": 9.387111111111113e-05, - "loss": 0.742, + "loss": 0.9471, "step": 3880 }, { - "epoch": 0.468336142547556, - "grad_norm": 6.53125, + "epoch": 2.7746077032810272, + "grad_norm": 10.5625, "learning_rate": 9.382666666666667e-05, - "loss": 0.7446, + "loss": 0.9785, "step": 3890 }, { - "epoch": 0.4695400915001204, - "grad_norm": 7.0625, + "epoch": 2.7817403708987163, + "grad_norm": 10.5, "learning_rate": 9.378222222222222e-05, - "loss": 0.7764, + "loss": 1.0151, "step": 3900 }, { - "epoch": 0.4695400915001204, - "eval/acc": 37.79069900512695, + "epoch": 2.7817403708987163, + "eval/acc": 44.1860466003418, "step": 3900 }, { - "epoch": 0.4695400915001204, - "eval_loss": 2.6463897228240967, - "eval_runtime": 1.4145, - "eval_samples_per_second": 30.4, - "eval_steps_per_second": 0.707, + "epoch": 2.7817403708987163, + "eval_loss": 1.970198154449463, + "eval_runtime": 0.2139, + "eval_samples_per_second": 201.028, + "eval_steps_per_second": 4.675, "step": 3900 }, { - "epoch": 0.4707440404526848, - "grad_norm": 5.625, + "epoch": 2.788873038516405, + "grad_norm": 9.75, "learning_rate": 9.373777777777778e-05, - "loss": 0.7248, + "loss": 0.9148, "step": 3910 }, { - "epoch": 0.47194798940524924, - "grad_norm": 7.09375, + "epoch": 2.7960057061340944, + "grad_norm": 9.1875, "learning_rate": 9.369333333333333e-05, - "loss": 0.6977, + "loss": 1.0314, "step": 3920 }, { - "epoch": 0.4731519383578136, - "grad_norm": 7.53125, + "epoch": 2.803138373751783, + "grad_norm": 8.375, "learning_rate": 9.36488888888889e-05, - "loss": 0.6496, + "loss": 0.9076, "step": 3930 }, { - "epoch": 0.47435588731037803, - "grad_norm": 11.0, + "epoch": 2.810271041369472, + "grad_norm": 6.46875, "learning_rate": 9.360444444444444e-05, - "loss": 0.7309, + "loss": 0.8218, "step": 3940 }, { - "epoch": 0.47555983626294246, - "grad_norm": 10.5625, + "epoch": 2.817403708987161, + "grad_norm": 7.96875, "learning_rate": 9.356e-05, - "loss": 0.7837, + "loss": 0.9415, "step": 3950 }, { - "epoch": 0.4767637852155069, - "grad_norm": 6.9375, + "epoch": 2.82453637660485, + "grad_norm": 7.53125, "learning_rate": 9.351555555555555e-05, - "loss": 0.6769, + "loss": 0.9593, "step": 3960 }, { - "epoch": 0.47796773416807126, - "grad_norm": 6.84375, + "epoch": 2.831669044222539, + "grad_norm": 5.96875, "learning_rate": 9.347111111111112e-05, - "loss": 0.642, + "loss": 0.9134, "step": 3970 }, { - "epoch": 0.4791716831206357, - "grad_norm": 9.125, + "epoch": 2.8388017118402282, + "grad_norm": 8.25, "learning_rate": 9.342666666666668e-05, - "loss": 0.6947, + "loss": 0.9339, "step": 3980 }, { - "epoch": 0.4803756320732001, - "grad_norm": 7.4375, + "epoch": 2.8459343794579173, + "grad_norm": 9.625, "learning_rate": 9.338222222222223e-05, - "loss": 0.5902, + "loss": 1.0018, "step": 3990 }, { - "epoch": 0.4815795810257645, - "grad_norm": 8.1875, + "epoch": 2.8530670470756063, + "grad_norm": 7.8125, "learning_rate": 9.333777777777777e-05, - "loss": 0.6075, + "loss": 0.9302, "step": 4000 }, { - "epoch": 0.4815795810257645, - "eval/acc": 34.88372039794922, + "epoch": 2.8530670470756063, + "eval/acc": 46.511627197265625, "step": 4000 }, { - "epoch": 0.4815795810257645, - "eval_loss": 2.6985960006713867, - "eval_runtime": 0.2767, - "eval_samples_per_second": 155.399, - "eval_steps_per_second": 3.614, + "epoch": 2.8530670470756063, + "eval_loss": 1.830549716949463, + "eval_runtime": 0.2202, + "eval_samples_per_second": 195.244, + "eval_steps_per_second": 4.541, "step": 4000 }, { - "epoch": 0.4827835299783289, - "grad_norm": 6.8125, + "epoch": 2.8601997146932954, + "grad_norm": 6.65625, "learning_rate": 9.329333333333334e-05, - "loss": 0.7166, + "loss": 0.9375, "step": 4010 }, { - "epoch": 0.48398747893089333, - "grad_norm": 6.375, + "epoch": 2.8673323823109844, + "grad_norm": 11.875, "learning_rate": 9.32488888888889e-05, - "loss": 0.6136, + "loss": 0.8406, "step": 4020 }, { - "epoch": 0.48519142788345776, - "grad_norm": 6.09375, + "epoch": 2.8744650499286735, + "grad_norm": 8.1875, "learning_rate": 9.320444444444445e-05, - "loss": 0.7948, + "loss": 0.8863, "step": 4030 }, { - "epoch": 0.48639537683602213, - "grad_norm": 7.5625, + "epoch": 2.881597717546362, + "grad_norm": 6.9375, "learning_rate": 9.316000000000001e-05, - "loss": 0.7253, + "loss": 0.9546, "step": 4040 }, { - "epoch": 0.48759932578858656, - "grad_norm": 7.1875, + "epoch": 2.8887303851640516, + "grad_norm": 8.625, "learning_rate": 9.311555555555556e-05, - "loss": 0.7386, + "loss": 1.0175, "step": 4050 }, { - "epoch": 0.488803274741151, - "grad_norm": 7.71875, + "epoch": 2.89586305278174, + "grad_norm": 45.0, "learning_rate": 9.307111111111112e-05, - "loss": 0.7222, + "loss": 0.9058, "step": 4060 }, { - "epoch": 0.4900072236937154, - "grad_norm": 10.8125, + "epoch": 2.9029957203994297, + "grad_norm": 13.625, "learning_rate": 9.302666666666667e-05, - "loss": 0.6298, + "loss": 0.9137, "step": 4070 }, { - "epoch": 0.4912111726462798, - "grad_norm": 14.25, + "epoch": 2.9101283880171183, + "grad_norm": 6.8125, "learning_rate": 9.298222222222223e-05, - "loss": 0.6551, + "loss": 0.8862, "step": 4080 }, { - "epoch": 0.4924151215988442, - "grad_norm": 7.75, + "epoch": 2.9172610556348073, + "grad_norm": 13.8125, "learning_rate": 9.293777777777778e-05, - "loss": 0.7201, + "loss": 0.9152, "step": 4090 }, { - "epoch": 0.49361907055140863, - "grad_norm": 9.0625, + "epoch": 2.9243937232524964, + "grad_norm": 13.3125, "learning_rate": 9.289333333333334e-05, - "loss": 0.708, + "loss": 0.9623, "step": 4100 }, { - "epoch": 0.49361907055140863, - "eval/acc": 34.88372039794922, + "epoch": 2.9243937232524964, + "eval/acc": 46.511627197265625, "step": 4100 }, { - "epoch": 0.49361907055140863, - "eval_loss": 2.7673676013946533, - "eval_runtime": 0.3468, - "eval_samples_per_second": 124.003, - "eval_steps_per_second": 2.884, + "epoch": 2.9243937232524964, + "eval_loss": 1.9800893068313599, + "eval_runtime": 0.2162, + "eval_samples_per_second": 198.931, + "eval_steps_per_second": 4.626, "step": 4100 }, { - "epoch": 0.494823019503973, - "grad_norm": 7.9375, + "epoch": 2.9315263908701854, + "grad_norm": 7.1875, "learning_rate": 9.28488888888889e-05, - "loss": 0.6997, + "loss": 0.9088, "step": 4110 }, { - "epoch": 0.49602696845653743, - "grad_norm": 6.84375, + "epoch": 2.9386590584878745, + "grad_norm": 8.3125, "learning_rate": 9.280444444444445e-05, - "loss": 0.6195, + "loss": 0.9927, "step": 4120 }, { - "epoch": 0.49723091740910186, - "grad_norm": 7.40625, + "epoch": 2.9457917261055635, + "grad_norm": 75.0, "learning_rate": 9.276e-05, - "loss": 0.765, + "loss": 0.912, "step": 4130 }, { - "epoch": 0.4984348663616663, - "grad_norm": 7.8125, + "epoch": 2.9529243937232525, + "grad_norm": 9.125, "learning_rate": 9.271555555555556e-05, - "loss": 0.7097, + "loss": 0.9878, "step": 4140 }, { - "epoch": 0.49963881531423066, - "grad_norm": 7.75, + "epoch": 2.9600570613409416, + "grad_norm": 7.125, "learning_rate": 9.267111111111112e-05, - "loss": 0.7067, + "loss": 0.8785, "step": 4150 }, { - "epoch": 0.5008427642667951, - "grad_norm": 27.875, + "epoch": 2.9671897289586306, + "grad_norm": 8.25, "learning_rate": 9.262666666666667e-05, - "loss": 0.7989, + "loss": 0.9296, "step": 4160 }, { - "epoch": 0.5020467132193595, - "grad_norm": 8.0, + "epoch": 2.9743223965763197, + "grad_norm": 8.75, "learning_rate": 9.258222222222222e-05, - "loss": 0.6744, + "loss": 0.9284, "step": 4170 }, { - "epoch": 0.5032506621719239, - "grad_norm": 7.96875, + "epoch": 2.9814550641940087, + "grad_norm": 8.8125, "learning_rate": 9.253777777777778e-05, - "loss": 0.738, + "loss": 0.9566, "step": 4180 }, { - "epoch": 0.5044546111244883, - "grad_norm": 7.21875, + "epoch": 2.9885877318116973, + "grad_norm": 6.90625, "learning_rate": 9.249333333333334e-05, - "loss": 0.7021, + "loss": 0.8368, "step": 4190 }, { - "epoch": 0.5056585600770528, - "grad_norm": 9.6875, + "epoch": 2.995720399429387, + "grad_norm": 9.875, "learning_rate": 9.244888888888889e-05, - "loss": 0.7133, + "loss": 1.0306, "step": 4200 }, { - "epoch": 0.5056585600770528, - "eval/acc": 32.55813980102539, + "epoch": 2.995720399429387, + "eval/acc": 51.16279220581055, "step": 4200 }, { - "epoch": 0.5056585600770528, - "eval_loss": 2.7288577556610107, - "eval_runtime": 0.2266, - "eval_samples_per_second": 189.803, - "eval_steps_per_second": 4.414, + "epoch": 2.995720399429387, + "eval_loss": 1.8740426301956177, + "eval_runtime": 0.2157, + "eval_samples_per_second": 199.368, + "eval_steps_per_second": 4.636, "step": 4200 }, { - "epoch": 0.5068625090296172, - "grad_norm": 10.5, + "epoch": 3.0028530670470754, + "grad_norm": 9.5625, "learning_rate": 9.240444444444445e-05, - "loss": 0.6886, + "loss": 0.957, "step": 4210 }, { - "epoch": 0.5080664579821815, - "grad_norm": 9.0625, + "epoch": 3.0099857346647645, + "grad_norm": 13.5625, "learning_rate": 9.236e-05, - "loss": 0.7944, + "loss": 0.884, "step": 4220 }, { - "epoch": 0.509270406934746, - "grad_norm": 7.78125, + "epoch": 3.0171184022824535, + "grad_norm": 8.0625, "learning_rate": 9.231555555555555e-05, - "loss": 0.7869, + "loss": 0.9064, "step": 4230 }, { - "epoch": 0.5104743558873104, - "grad_norm": 6.375, + "epoch": 3.0242510699001426, + "grad_norm": 8.0, "learning_rate": 9.227111111111111e-05, - "loss": 0.6245, + "loss": 0.9164, "step": 4240 }, { - "epoch": 0.5116783048398748, - "grad_norm": 9.9375, + "epoch": 3.0313837375178316, + "grad_norm": 8.4375, "learning_rate": 9.222666666666668e-05, - "loss": 0.7006, + "loss": 0.9787, "step": 4250 }, { - "epoch": 0.5128822537924392, - "grad_norm": 6.1875, + "epoch": 3.0385164051355207, + "grad_norm": 7.6875, "learning_rate": 9.218222222222222e-05, - "loss": 0.7588, + "loss": 0.8852, "step": 4260 }, { - "epoch": 0.5140862027450036, - "grad_norm": 10.6875, + "epoch": 3.0456490727532097, + "grad_norm": 7.15625, "learning_rate": 9.213777777777777e-05, - "loss": 0.737, + "loss": 1.0092, "step": 4270 }, { - "epoch": 0.515290151697568, - "grad_norm": 6.15625, + "epoch": 3.0527817403708988, + "grad_norm": 6.65625, "learning_rate": 9.209333333333335e-05, - "loss": 0.6774, + "loss": 0.9972, "step": 4280 }, { - "epoch": 0.5164941006501325, - "grad_norm": 8.8125, + "epoch": 3.059914407988588, + "grad_norm": 7.25, "learning_rate": 9.20488888888889e-05, - "loss": 0.6972, + "loss": 0.9237, "step": 4290 }, { - "epoch": 0.5176980496026968, - "grad_norm": 6.40625, + "epoch": 3.067047075606277, + "grad_norm": 6.4375, "learning_rate": 9.200444444444445e-05, - "loss": 0.6423, + "loss": 0.9096, "step": 4300 }, { - "epoch": 0.5176980496026968, - "eval/acc": 38.953487396240234, + "epoch": 3.067047075606277, + "eval/acc": 39.53488540649414, "step": 4300 }, { - "epoch": 0.5176980496026968, - "eval_loss": 2.7444300651550293, - "eval_runtime": 0.2708, - "eval_samples_per_second": 158.776, - "eval_steps_per_second": 3.692, + "epoch": 3.067047075606277, + "eval_loss": 2.0920908451080322, + "eval_runtime": 1.7384, + "eval_samples_per_second": 24.736, + "eval_steps_per_second": 0.575, "step": 4300 }, { - "epoch": 0.5189019985552613, - "grad_norm": 6.8125, + "epoch": 3.074179743223966, + "grad_norm": 8.4375, "learning_rate": 9.196000000000001e-05, - "loss": 0.7705, + "loss": 0.9697, "step": 4310 }, { - "epoch": 0.5201059475078257, - "grad_norm": 5.90625, + "epoch": 3.081312410841655, + "grad_norm": 8.4375, "learning_rate": 9.191555555555556e-05, - "loss": 0.7534, + "loss": 0.8379, "step": 4320 }, { - "epoch": 0.52130989646039, - "grad_norm": 9.25, + "epoch": 3.088445078459344, + "grad_norm": 8.125, "learning_rate": 9.187111111111112e-05, - "loss": 0.6586, + "loss": 0.8576, "step": 4330 }, { - "epoch": 0.5225138454129545, - "grad_norm": 7.53125, + "epoch": 3.0955777460770326, + "grad_norm": 10.75, "learning_rate": 9.182666666666667e-05, - "loss": 0.7459, + "loss": 0.9616, "step": 4340 }, { - "epoch": 0.5237177943655189, - "grad_norm": 6.09375, + "epoch": 3.1027104136947217, + "grad_norm": 6.84375, "learning_rate": 9.178222222222223e-05, - "loss": 0.7088, + "loss": 0.7674, "step": 4350 }, { - "epoch": 0.5249217433180833, - "grad_norm": 8.5, + "epoch": 3.1098430813124107, + "grad_norm": 8.375, "learning_rate": 9.173777777777778e-05, - "loss": 0.7313, + "loss": 0.8712, "step": 4360 }, { - "epoch": 0.5261256922706478, - "grad_norm": 8.8125, + "epoch": 3.1169757489300998, + "grad_norm": 8.375, "learning_rate": 9.169333333333334e-05, - "loss": 0.7364, + "loss": 0.8599, "step": 4370 }, { - "epoch": 0.5273296412232121, - "grad_norm": 7.09375, + "epoch": 3.124108416547789, + "grad_norm": 7.1875, "learning_rate": 9.16488888888889e-05, - "loss": 0.6962, + "loss": 0.9736, "step": 4380 }, { - "epoch": 0.5285335901757765, - "grad_norm": 6.28125, + "epoch": 3.131241084165478, + "grad_norm": 7.75, "learning_rate": 9.160444444444445e-05, - "loss": 0.6817, + "loss": 0.8663, "step": 4390 }, { - "epoch": 0.529737539128341, - "grad_norm": 8.25, + "epoch": 3.138373751783167, + "grad_norm": 7.53125, "learning_rate": 9.156e-05, - "loss": 0.6786, + "loss": 0.9221, "step": 4400 }, { - "epoch": 0.529737539128341, - "eval/acc": 34.88372039794922, + "epoch": 3.138373751783167, + "eval/acc": 39.53488540649414, "step": 4400 }, { - "epoch": 0.529737539128341, - "eval_loss": 2.728501081466675, - "eval_runtime": 0.3599, - "eval_samples_per_second": 119.474, - "eval_steps_per_second": 2.778, + "epoch": 3.138373751783167, + "eval_loss": 2.127244710922241, + "eval_runtime": 0.2239, + "eval_samples_per_second": 192.022, + "eval_steps_per_second": 4.466, "step": 4400 }, { - "epoch": 0.5309414880809054, - "grad_norm": 7.59375, + "epoch": 3.145506419400856, + "grad_norm": 8.125, "learning_rate": 9.151555555555556e-05, - "loss": 0.6744, + "loss": 0.9144, "step": 4410 }, { - "epoch": 0.5321454370334697, - "grad_norm": 8.0625, + "epoch": 3.152639087018545, + "grad_norm": 7.46875, "learning_rate": 9.147111111111112e-05, - "loss": 0.8287, + "loss": 0.9445, "step": 4420 }, { - "epoch": 0.5333493859860342, - "grad_norm": 8.1875, + "epoch": 3.159771754636234, + "grad_norm": 6.9375, "learning_rate": 9.142666666666667e-05, - "loss": 0.7069, + "loss": 0.8308, "step": 4430 }, { - "epoch": 0.5345533349385986, - "grad_norm": 8.125, + "epoch": 3.166904422253923, + "grad_norm": 7.53125, "learning_rate": 9.138222222222222e-05, - "loss": 0.662, + "loss": 0.8428, "step": 4440 }, { - "epoch": 0.5357572838911631, - "grad_norm": 7.46875, + "epoch": 3.174037089871612, + "grad_norm": 7.96875, "learning_rate": 9.133777777777778e-05, - "loss": 0.7424, + "loss": 0.9022, "step": 4450 }, { - "epoch": 0.5369612328437274, - "grad_norm": 6.96875, + "epoch": 3.181169757489301, + "grad_norm": 6.875, "learning_rate": 9.129333333333334e-05, - "loss": 0.7308, + "loss": 0.9955, "step": 4460 }, { - "epoch": 0.5381651817962918, - "grad_norm": 8.3125, + "epoch": 3.18830242510699, + "grad_norm": 9.5625, "learning_rate": 9.124888888888889e-05, - "loss": 0.7524, + "loss": 0.9493, "step": 4470 }, { - "epoch": 0.5393691307488563, - "grad_norm": 6.40625, + "epoch": 3.195435092724679, + "grad_norm": 9.0625, "learning_rate": 9.120444444444445e-05, - "loss": 0.7523, + "loss": 0.9608, "step": 4480 }, { - "epoch": 0.5405730797014207, - "grad_norm": 7.65625, + "epoch": 3.202567760342368, + "grad_norm": 8.625, "learning_rate": 9.116e-05, - "loss": 0.647, + "loss": 0.821, "step": 4490 }, { - "epoch": 0.541777028653985, - "grad_norm": 6.875, + "epoch": 3.209700427960057, + "grad_norm": 8.125, "learning_rate": 9.111555555555556e-05, - "loss": 0.6547, + "loss": 0.9175, "step": 4500 }, { - "epoch": 0.541777028653985, - "eval/acc": 37.20930099487305, + "epoch": 3.209700427960057, + "eval/acc": 39.53488540649414, "step": 4500 }, { - "epoch": 0.541777028653985, - "eval_loss": 2.8390543460845947, - "eval_runtime": 0.2096, - "eval_samples_per_second": 205.2, - "eval_steps_per_second": 4.772, + "epoch": 3.209700427960057, + "eval_loss": 2.062082529067993, + "eval_runtime": 0.2236, + "eval_samples_per_second": 192.267, + "eval_steps_per_second": 4.471, "step": 4500 }, { - "epoch": 0.5429809776065495, - "grad_norm": 9.375, + "epoch": 3.216833095577746, + "grad_norm": 8.0625, "learning_rate": 9.107111111111111e-05, - "loss": 0.6773, + "loss": 0.9169, "step": 4510 }, { - "epoch": 0.5441849265591139, - "grad_norm": 10.1875, + "epoch": 3.223965763195435, + "grad_norm": 8.3125, "learning_rate": 9.102666666666667e-05, - "loss": 0.704, + "loss": 0.8001, "step": 4520 }, { - "epoch": 0.5453888755116783, - "grad_norm": 5.0625, + "epoch": 3.231098430813124, + "grad_norm": 7.3125, "learning_rate": 9.098222222222222e-05, - "loss": 0.6303, + "loss": 0.8513, "step": 4530 }, { - "epoch": 0.5465928244642427, - "grad_norm": 8.25, + "epoch": 3.238231098430813, + "grad_norm": 7.625, "learning_rate": 9.093777777777777e-05, - "loss": 0.7469, + "loss": 0.912, "step": 4540 }, { - "epoch": 0.5477967734168071, - "grad_norm": 7.375, + "epoch": 3.245363766048502, + "grad_norm": 6.46875, "learning_rate": 9.089333333333335e-05, - "loss": 0.6995, + "loss": 0.9418, "step": 4550 }, { - "epoch": 0.5490007223693716, - "grad_norm": 7.78125, + "epoch": 3.2524964336661912, + "grad_norm": 11.9375, "learning_rate": 9.08488888888889e-05, - "loss": 0.6965, + "loss": 0.871, "step": 4560 }, { - "epoch": 0.550204671321936, - "grad_norm": 13.625, + "epoch": 3.2596291012838803, + "grad_norm": 7.03125, "learning_rate": 9.080444444444444e-05, - "loss": 0.759, + "loss": 0.8507, "step": 4570 }, { - "epoch": 0.5514086202745003, - "grad_norm": 6.875, + "epoch": 3.2667617689015693, + "grad_norm": 7.21875, "learning_rate": 9.076e-05, - "loss": 0.7284, + "loss": 0.8058, "step": 4580 }, { - "epoch": 0.5526125692270648, - "grad_norm": 5.875, + "epoch": 3.2738944365192584, + "grad_norm": 8.4375, "learning_rate": 9.071555555555557e-05, - "loss": 0.6721, + "loss": 0.7959, "step": 4590 }, { - "epoch": 0.5538165181796292, - "grad_norm": 5.46875, + "epoch": 3.281027104136947, + "grad_norm": 6.375, "learning_rate": 9.067111111111112e-05, - "loss": 0.6522, + "loss": 0.9206, "step": 4600 }, { - "epoch": 0.5538165181796292, - "eval/acc": 39.53488540649414, + "epoch": 3.281027104136947, + "eval/acc": 37.20930099487305, "step": 4600 }, { - "epoch": 0.5538165181796292, - "eval_loss": 2.801618814468384, - "eval_runtime": 0.2155, - "eval_samples_per_second": 199.501, - "eval_steps_per_second": 4.64, + "epoch": 3.281027104136947, + "eval_loss": 2.1272616386413574, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.747, + "eval_steps_per_second": 4.529, "step": 4600 }, { - "epoch": 0.5550204671321936, - "grad_norm": 8.5625, + "epoch": 3.2881597717546365, + "grad_norm": 7.65625, "learning_rate": 9.062666666666666e-05, - "loss": 0.6399, + "loss": 0.8306, "step": 4610 }, { - "epoch": 0.556224416084758, - "grad_norm": 7.40625, + "epoch": 3.295292439372325, + "grad_norm": 6.9375, "learning_rate": 9.058222222222223e-05, - "loss": 0.7303, + "loss": 0.8958, "step": 4620 }, { - "epoch": 0.5574283650373224, - "grad_norm": 6.96875, + "epoch": 3.302425106990014, + "grad_norm": 7.96875, "learning_rate": 9.053777777777777e-05, - "loss": 0.7126, + "loss": 0.8919, "step": 4630 }, { - "epoch": 0.5586323139898868, - "grad_norm": 7.15625, + "epoch": 3.309557774607703, + "grad_norm": 6.9375, "learning_rate": 9.049333333333334e-05, - "loss": 0.702, + "loss": 0.8844, "step": 4640 }, { - "epoch": 0.5598362629424513, - "grad_norm": 6.625, + "epoch": 3.316690442225392, + "grad_norm": 7.21875, "learning_rate": 9.04488888888889e-05, - "loss": 0.6957, + "loss": 0.8335, "step": 4650 }, { - "epoch": 0.5610402118950156, - "grad_norm": 7.90625, + "epoch": 3.3238231098430813, + "grad_norm": 7.6875, "learning_rate": 9.040444444444445e-05, - "loss": 0.703, + "loss": 0.9337, "step": 4660 }, { - "epoch": 0.5622441608475801, - "grad_norm": 7.75, + "epoch": 3.3309557774607703, + "grad_norm": 9.25, "learning_rate": 9.036e-05, - "loss": 0.7195, + "loss": 1.0282, "step": 4670 }, { - "epoch": 0.5634481098001445, - "grad_norm": 6.59375, + "epoch": 3.3380884450784594, + "grad_norm": 7.96875, "learning_rate": 9.031555555555557e-05, - "loss": 0.6445, + "loss": 0.9401, "step": 4680 }, { - "epoch": 0.5646520587527089, - "grad_norm": 25.125, + "epoch": 3.3452211126961484, + "grad_norm": 7.25, "learning_rate": 9.027111111111112e-05, - "loss": 0.699, + "loss": 0.908, "step": 4690 }, { - "epoch": 0.5658560077052733, - "grad_norm": 8.125, + "epoch": 3.3523537803138375, + "grad_norm": 7.8125, "learning_rate": 9.022666666666667e-05, - "loss": 0.716, + "loss": 0.9262, "step": 4700 }, { - "epoch": 0.5658560077052733, - "eval/acc": 34.88372039794922, + "epoch": 3.3523537803138375, + "eval/acc": 37.20930099487305, "step": 4700 }, { - "epoch": 0.5658560077052733, - "eval_loss": 2.777444839477539, - "eval_runtime": 0.218, - "eval_samples_per_second": 197.287, - "eval_steps_per_second": 4.588, + "epoch": 3.3523537803138375, + "eval_loss": 2.07535719871521, + "eval_runtime": 0.2246, + "eval_samples_per_second": 191.478, + "eval_steps_per_second": 4.453, "step": 4700 }, { - "epoch": 0.5670599566578377, - "grad_norm": 7.0, + "epoch": 3.3594864479315265, + "grad_norm": 13.0, "learning_rate": 9.018222222222223e-05, - "loss": 0.693, + "loss": 0.9692, "step": 4710 }, { - "epoch": 0.5682639056104021, - "grad_norm": 8.8125, + "epoch": 3.3666191155492156, + "grad_norm": 5.875, "learning_rate": 9.013777777777779e-05, - "loss": 0.7, + "loss": 0.9071, "step": 4720 }, { - "epoch": 0.5694678545629666, - "grad_norm": 7.0, + "epoch": 3.3737517831669046, + "grad_norm": 7.71875, "learning_rate": 9.009333333333334e-05, - "loss": 0.6616, + "loss": 0.8528, "step": 4730 }, { - "epoch": 0.5706718035155309, - "grad_norm": 7.75, + "epoch": 3.3808844507845937, + "grad_norm": 7.46875, "learning_rate": 9.004888888888889e-05, - "loss": 0.7987, + "loss": 0.9408, "step": 4740 }, { - "epoch": 0.5718757524680953, - "grad_norm": 6.53125, + "epoch": 3.3880171184022823, + "grad_norm": 7.8125, "learning_rate": 9.000444444444445e-05, - "loss": 0.7162, + "loss": 1.0017, "step": 4750 }, { - "epoch": 0.5730797014206598, - "grad_norm": 8.6875, + "epoch": 3.3951497860199713, + "grad_norm": 6.15625, "learning_rate": 8.996e-05, - "loss": 0.673, + "loss": 0.9107, "step": 4760 }, { - "epoch": 0.5742836503732242, - "grad_norm": 6.5625, + "epoch": 3.4022824536376604, + "grad_norm": 6.8125, "learning_rate": 8.991555555555556e-05, - "loss": 0.7389, + "loss": 0.9387, "step": 4770 }, { - "epoch": 0.5754875993257886, - "grad_norm": 7.25, + "epoch": 3.4094151212553494, + "grad_norm": 8.8125, "learning_rate": 8.987111111111112e-05, - "loss": 0.6674, + "loss": 0.9775, "step": 4780 }, { - "epoch": 0.576691548278353, - "grad_norm": 8.8125, + "epoch": 3.4165477888730384, + "grad_norm": 8.375, "learning_rate": 8.982666666666667e-05, - "loss": 0.7464, + "loss": 0.8173, "step": 4790 }, { - "epoch": 0.5778954972309174, - "grad_norm": 7.65625, + "epoch": 3.4236804564907275, + "grad_norm": 11.3125, "learning_rate": 8.978222222222222e-05, - "loss": 0.6979, + "loss": 0.9068, "step": 4800 }, { - "epoch": 0.5778954972309174, - "eval/acc": 37.20930099487305, + "epoch": 3.4236804564907275, + "eval/acc": 39.53488540649414, "step": 4800 }, { - "epoch": 0.5778954972309174, - "eval_loss": 2.7990331649780273, - "eval_runtime": 0.207, - "eval_samples_per_second": 207.72, - "eval_steps_per_second": 4.831, + "epoch": 3.4236804564907275, + "eval_loss": 2.123344898223877, + "eval_runtime": 0.2238, + "eval_samples_per_second": 192.17, + "eval_steps_per_second": 4.469, "step": 4800 }, { - "epoch": 0.5790994461834819, - "grad_norm": 6.90625, + "epoch": 3.4308131241084165, + "grad_norm": 6.65625, "learning_rate": 8.973777777777778e-05, - "loss": 0.7292, + "loss": 0.8262, "step": 4810 }, { - "epoch": 0.5803033951360462, - "grad_norm": 7.34375, + "epoch": 3.4379457917261056, + "grad_norm": 9.125, "learning_rate": 8.969333333333334e-05, - "loss": 0.6484, + "loss": 0.9207, "step": 4820 }, { - "epoch": 0.5815073440886106, - "grad_norm": 7.96875, + "epoch": 3.4450784593437946, + "grad_norm": 6.78125, "learning_rate": 8.964888888888889e-05, - "loss": 0.6246, + "loss": 1.0115, "step": 4830 }, { - "epoch": 0.5827112930411751, - "grad_norm": 5.4375, + "epoch": 3.4522111269614837, + "grad_norm": 7.5625, "learning_rate": 8.960444444444444e-05, - "loss": 0.6978, + "loss": 0.9031, "step": 4840 }, { - "epoch": 0.5839152419937395, - "grad_norm": 7.25, + "epoch": 3.4593437945791727, + "grad_norm": 7.875, "learning_rate": 8.956e-05, - "loss": 0.6848, + "loss": 0.9626, "step": 4850 }, { - "epoch": 0.5851191909463038, - "grad_norm": 8.9375, + "epoch": 3.466476462196862, + "grad_norm": 4.625, "learning_rate": 8.951555555555557e-05, - "loss": 0.7541, + "loss": 0.7793, "step": 4860 }, { - "epoch": 0.5863231398988683, - "grad_norm": 8.6875, + "epoch": 3.473609129814551, + "grad_norm": 7.40625, "learning_rate": 8.947111111111111e-05, - "loss": 0.6872, + "loss": 0.8733, "step": 4870 }, { - "epoch": 0.5875270888514327, - "grad_norm": 6.375, + "epoch": 3.4807417974322394, + "grad_norm": 9.6875, "learning_rate": 8.942666666666668e-05, - "loss": 0.7521, + "loss": 0.8448, "step": 4880 }, { - "epoch": 0.5887310378039972, - "grad_norm": 7.34375, + "epoch": 3.4878744650499285, + "grad_norm": 8.625, "learning_rate": 8.938222222222222e-05, - "loss": 0.6741, + "loss": 0.815, "step": 4890 }, { - "epoch": 0.5899349867565615, - "grad_norm": 9.25, + "epoch": 3.4950071326676175, + "grad_norm": 8.6875, "learning_rate": 8.933777777777779e-05, - "loss": 0.7085, + "loss": 0.7837, "step": 4900 }, { - "epoch": 0.5899349867565615, - "eval/acc": 32.55813980102539, + "epoch": 3.4950071326676175, + "eval/acc": 37.20930099487305, "step": 4900 }, { - "epoch": 0.5899349867565615, - "eval_loss": 2.822793483734131, - "eval_runtime": 0.2077, - "eval_samples_per_second": 206.985, - "eval_steps_per_second": 4.814, + "epoch": 3.4950071326676175, + "eval_loss": 2.080425262451172, + "eval_runtime": 0.2476, + "eval_samples_per_second": 173.654, + "eval_steps_per_second": 4.038, "step": 4900 }, { - "epoch": 0.5911389357091259, - "grad_norm": 6.75, + "epoch": 3.5021398002853066, + "grad_norm": 7.25, "learning_rate": 8.929333333333333e-05, - "loss": 0.6908, + "loss": 0.9082, "step": 4910 }, { - "epoch": 0.5923428846616904, - "grad_norm": 14.3125, + "epoch": 3.5092724679029956, + "grad_norm": 9.0, "learning_rate": 8.92488888888889e-05, - "loss": 0.6954, + "loss": 0.8041, "step": 4920 }, { - "epoch": 0.5935468336142548, - "grad_norm": 5.03125, + "epoch": 3.5164051355206847, + "grad_norm": 7.5625, "learning_rate": 8.920444444444444e-05, - "loss": 0.6255, + "loss": 0.878, "step": 4930 }, { - "epoch": 0.5947507825668191, - "grad_norm": 7.3125, + "epoch": 3.5235378031383737, + "grad_norm": 8.3125, "learning_rate": 8.916e-05, - "loss": 0.6094, + "loss": 0.8609, "step": 4940 }, { - "epoch": 0.5959547315193836, - "grad_norm": 6.875, + "epoch": 3.5306704707560628, + "grad_norm": 6.9375, "learning_rate": 8.911555555555557e-05, - "loss": 0.6488, + "loss": 0.8203, "step": 4950 }, { - "epoch": 0.597158680471948, - "grad_norm": 6.90625, + "epoch": 3.537803138373752, + "grad_norm": 6.4375, "learning_rate": 8.907111111111112e-05, - "loss": 0.6333, + "loss": 0.8976, "step": 4960 }, { - "epoch": 0.5983626294245123, - "grad_norm": 7.0, + "epoch": 3.544935805991441, + "grad_norm": 15.0, "learning_rate": 8.902666666666667e-05, - "loss": 0.6687, + "loss": 0.8585, "step": 4970 }, { - "epoch": 0.5995665783770768, - "grad_norm": 8.9375, + "epoch": 3.55206847360913, + "grad_norm": 6.21875, "learning_rate": 8.898222222222223e-05, - "loss": 0.6762, + "loss": 0.9642, "step": 4980 }, { - "epoch": 0.6007705273296412, - "grad_norm": 7.53125, + "epoch": 3.559201141226819, + "grad_norm": 9.8125, "learning_rate": 8.893777777777779e-05, - "loss": 0.6007, + "loss": 0.9241, "step": 4990 }, { - "epoch": 0.6019744762822057, - "grad_norm": 5.78125, + "epoch": 3.566333808844508, + "grad_norm": 9.25, "learning_rate": 8.889333333333334e-05, - "loss": 0.682, + "loss": 0.7841, "step": 5000 }, { - "epoch": 0.6019744762822057, - "eval/acc": 32.55813980102539, + "epoch": 3.566333808844508, + "eval/acc": 37.20930099487305, "step": 5000 }, { - "epoch": 0.6019744762822057, - "eval_loss": 2.827073097229004, - "eval_runtime": 0.2073, - "eval_samples_per_second": 207.385, - "eval_steps_per_second": 4.823, + "epoch": 3.566333808844508, + "eval_loss": 2.0360865592956543, + "eval_runtime": 0.225, + "eval_samples_per_second": 191.102, + "eval_steps_per_second": 4.444, "step": 5000 }, { - "epoch": 0.60317842523477, - "grad_norm": 8.25, + "epoch": 3.5734664764621966, + "grad_norm": 7.53125, "learning_rate": 8.884888888888889e-05, - "loss": 0.6711, + "loss": 0.8513, "step": 5010 }, { - "epoch": 0.6043823741873344, - "grad_norm": 7.34375, + "epoch": 3.580599144079886, + "grad_norm": 7.3125, "learning_rate": 8.880444444444445e-05, - "loss": 0.6916, + "loss": 0.9502, "step": 5020 }, { - "epoch": 0.6055863231398989, - "grad_norm": 6.6875, + "epoch": 3.5877318116975747, + "grad_norm": 7.375, "learning_rate": 8.876e-05, - "loss": 0.6601, + "loss": 0.9329, "step": 5030 }, { - "epoch": 0.6067902720924633, - "grad_norm": 6.34375, + "epoch": 3.5948644793152638, + "grad_norm": 7.3125, "learning_rate": 8.871555555555556e-05, - "loss": 0.6945, + "loss": 0.8648, "step": 5040 }, { - "epoch": 0.6079942210450276, - "grad_norm": 6.9375, + "epoch": 3.601997146932953, + "grad_norm": 6.5, "learning_rate": 8.867111111111112e-05, - "loss": 0.6492, + "loss": 0.8019, "step": 5050 }, { - "epoch": 0.6091981699975921, - "grad_norm": 7.1875, + "epoch": 3.609129814550642, + "grad_norm": 9.0, "learning_rate": 8.862666666666667e-05, - "loss": 0.5963, + "loss": 0.8829, "step": 5060 }, { - "epoch": 0.6104021189501565, - "grad_norm": 7.1875, + "epoch": 3.616262482168331, + "grad_norm": 6.46875, "learning_rate": 8.858222222222222e-05, - "loss": 0.6715, + "loss": 0.8419, "step": 5070 }, { - "epoch": 0.6116060679027209, - "grad_norm": 9.25, + "epoch": 3.62339514978602, + "grad_norm": 8.9375, "learning_rate": 8.853777777777778e-05, - "loss": 0.7572, + "loss": 0.9345, "step": 5080 }, { - "epoch": 0.6128100168552854, - "grad_norm": 6.3125, + "epoch": 3.630527817403709, + "grad_norm": 7.09375, "learning_rate": 8.849333333333334e-05, - "loss": 0.7521, + "loss": 0.8204, "step": 5090 }, { - "epoch": 0.6140139658078497, - "grad_norm": 6.9375, + "epoch": 3.637660485021398, + "grad_norm": 7.71875, "learning_rate": 8.844888888888889e-05, - "loss": 0.6313, + "loss": 0.9305, "step": 5100 }, { - "epoch": 0.6140139658078497, - "eval/acc": 34.88372039794922, + "epoch": 3.637660485021398, + "eval/acc": 39.53488540649414, "step": 5100 }, { - "epoch": 0.6140139658078497, - "eval_loss": 2.9495913982391357, - "eval_runtime": 0.2063, - "eval_samples_per_second": 208.439, - "eval_steps_per_second": 4.847, + "epoch": 3.637660485021398, + "eval_loss": 2.0034291744232178, + "eval_runtime": 0.2213, + "eval_samples_per_second": 194.329, + "eval_steps_per_second": 4.519, "step": 5100 }, { - "epoch": 0.6152179147604142, - "grad_norm": 9.0, + "epoch": 3.644793152639087, + "grad_norm": 6.09375, "learning_rate": 8.840444444444444e-05, - "loss": 0.7974, + "loss": 0.9168, "step": 5110 }, { - "epoch": 0.6164218637129786, - "grad_norm": 5.46875, + "epoch": 3.651925820256776, + "grad_norm": 8.25, "learning_rate": 8.836000000000001e-05, - "loss": 0.6245, + "loss": 0.8155, "step": 5120 }, { - "epoch": 0.617625812665543, - "grad_norm": 9.4375, + "epoch": 3.659058487874465, + "grad_norm": 7.84375, "learning_rate": 8.831555555555556e-05, - "loss": 0.7513, + "loss": 0.8641, "step": 5130 }, { - "epoch": 0.6188297616181074, - "grad_norm": 8.125, + "epoch": 3.666191155492154, + "grad_norm": 6.5, "learning_rate": 8.827111111111111e-05, - "loss": 0.6427, + "loss": 0.8623, "step": 5140 }, { - "epoch": 0.6200337105706718, - "grad_norm": 5.78125, + "epoch": 3.6733238231098433, + "grad_norm": 21.125, "learning_rate": 8.822666666666667e-05, - "loss": 0.6801, + "loss": 0.8205, "step": 5150 }, { - "epoch": 0.6212376595232362, - "grad_norm": 8.8125, + "epoch": 3.680456490727532, + "grad_norm": 7.28125, "learning_rate": 8.818222222222222e-05, - "loss": 0.5978, + "loss": 0.7993, "step": 5160 }, { - "epoch": 0.6224416084758007, - "grad_norm": 8.0, + "epoch": 3.6875891583452214, + "grad_norm": 36.0, "learning_rate": 8.813777777777778e-05, - "loss": 0.6697, + "loss": 0.9083, "step": 5170 }, { - "epoch": 0.623645557428365, - "grad_norm": 8.1875, + "epoch": 3.69472182596291, + "grad_norm": 8.125, "learning_rate": 8.809333333333333e-05, - "loss": 0.7621, + "loss": 0.9264, "step": 5180 }, { - "epoch": 0.6248495063809294, - "grad_norm": 6.4375, + "epoch": 3.701854493580599, + "grad_norm": 10.75, "learning_rate": 8.80488888888889e-05, - "loss": 0.6934, + "loss": 0.8496, "step": 5190 }, { - "epoch": 0.6260534553334939, - "grad_norm": 7.8125, + "epoch": 3.708987161198288, + "grad_norm": 7.78125, "learning_rate": 8.800444444444444e-05, - "loss": 0.7008, + "loss": 0.8718, "step": 5200 }, { - "epoch": 0.6260534553334939, - "eval/acc": 34.88372039794922, + "epoch": 3.708987161198288, + "eval/acc": 39.53488540649414, "step": 5200 }, { - "epoch": 0.6260534553334939, - "eval_loss": 2.8201522827148438, - "eval_runtime": 0.2114, - "eval_samples_per_second": 203.368, - "eval_steps_per_second": 4.729, + "epoch": 3.708987161198288, + "eval_loss": 2.0305864810943604, + "eval_runtime": 0.222, + "eval_samples_per_second": 193.683, + "eval_steps_per_second": 4.504, "step": 5200 }, { - "epoch": 0.6272574042860583, - "grad_norm": 5.78125, + "epoch": 3.716119828815977, + "grad_norm": 9.3125, "learning_rate": 8.796e-05, - "loss": 0.7211, + "loss": 1.0077, "step": 5210 }, { - "epoch": 0.6284613532386227, - "grad_norm": 6.25, + "epoch": 3.723252496433666, + "grad_norm": 11.4375, "learning_rate": 8.791555555555557e-05, - "loss": 0.654, + "loss": 0.8364, "step": 5220 }, { - "epoch": 0.6296653021911871, - "grad_norm": 9.0625, + "epoch": 3.7303851640513552, + "grad_norm": 15.125, "learning_rate": 8.787111111111112e-05, - "loss": 0.6348, + "loss": 0.8557, "step": 5230 }, { - "epoch": 0.6308692511437515, - "grad_norm": 7.59375, + "epoch": 3.7375178316690443, + "grad_norm": 7.875, "learning_rate": 8.782666666666666e-05, - "loss": 0.6363, + "loss": 0.8674, "step": 5240 }, { - "epoch": 0.632073200096316, - "grad_norm": 6.25, + "epoch": 3.7446504992867333, + "grad_norm": 7.84375, "learning_rate": 8.778222222222223e-05, - "loss": 0.629, + "loss": 0.8788, "step": 5250 }, { - "epoch": 0.6332771490488803, - "grad_norm": 12.375, + "epoch": 3.7517831669044224, + "grad_norm": 7.59375, "learning_rate": 8.773777777777779e-05, - "loss": 0.771, + "loss": 0.8098, "step": 5260 }, { - "epoch": 0.6344810980014447, - "grad_norm": 5.96875, + "epoch": 3.7589158345221114, + "grad_norm": 7.40625, "learning_rate": 8.769333333333334e-05, - "loss": 0.589, + "loss": 0.8895, "step": 5270 }, { - "epoch": 0.6356850469540092, - "grad_norm": 7.1875, + "epoch": 3.7660485021398005, + "grad_norm": 6.78125, "learning_rate": 8.76488888888889e-05, - "loss": 0.5794, + "loss": 0.823, "step": 5280 }, { - "epoch": 0.6368889959065736, - "grad_norm": 7.09375, + "epoch": 3.773181169757489, + "grad_norm": 8.125, "learning_rate": 8.760444444444445e-05, - "loss": 0.6449, + "loss": 0.8418, "step": 5290 }, { - "epoch": 0.6380929448591379, - "grad_norm": 11.1875, + "epoch": 3.7803138373751786, + "grad_norm": 8.4375, "learning_rate": 8.756000000000001e-05, - "loss": 0.6708, + "loss": 0.8202, "step": 5300 }, { - "epoch": 0.6380929448591379, - "eval/acc": 36.627906799316406, + "epoch": 3.7803138373751786, + "eval/acc": 41.86046600341797, "step": 5300 }, { - "epoch": 0.6380929448591379, - "eval_loss": 2.902387857437134, - "eval_runtime": 0.2113, - "eval_samples_per_second": 203.456, - "eval_steps_per_second": 4.732, + "epoch": 3.7803138373751786, + "eval_loss": 2.100001811981201, + "eval_runtime": 0.2237, + "eval_samples_per_second": 192.218, + "eval_steps_per_second": 4.47, "step": 5300 }, { - "epoch": 0.6392968938117024, - "grad_norm": 8.625, + "epoch": 3.787446504992867, + "grad_norm": 7.78125, "learning_rate": 8.751555555555556e-05, - "loss": 0.5895, + "loss": 0.9786, "step": 5310 }, { - "epoch": 0.6405008427642668, - "grad_norm": 8.625, + "epoch": 3.794579172610556, + "grad_norm": 14.125, "learning_rate": 8.747111111111112e-05, - "loss": 0.6012, + "loss": 1.0893, "step": 5320 }, { - "epoch": 0.6417047917168313, - "grad_norm": 5.25, + "epoch": 3.8017118402282453, + "grad_norm": 6.71875, "learning_rate": 8.742666666666667e-05, - "loss": 0.6262, + "loss": 0.8484, "step": 5330 }, { - "epoch": 0.6429087406693956, - "grad_norm": 8.5625, + "epoch": 3.8088445078459343, + "grad_norm": 7.53125, "learning_rate": 8.738222222222222e-05, - "loss": 0.7584, + "loss": 0.922, "step": 5340 }, { - "epoch": 0.64411268962196, - "grad_norm": 7.53125, + "epoch": 3.8159771754636234, + "grad_norm": 6.9375, "learning_rate": 8.733777777777779e-05, - "loss": 0.6793, + "loss": 0.87, "step": 5350 }, { - "epoch": 0.6453166385745245, - "grad_norm": 9.625, + "epoch": 3.8231098430813124, + "grad_norm": 6.75, "learning_rate": 8.729333333333334e-05, - "loss": 0.6166, + "loss": 0.9272, "step": 5360 }, { - "epoch": 0.6465205875270889, - "grad_norm": 7.0625, + "epoch": 3.8302425106990015, + "grad_norm": 6.875, "learning_rate": 8.724888888888889e-05, - "loss": 0.667, + "loss": 0.8358, "step": 5370 }, { - "epoch": 0.6477245364796532, - "grad_norm": 6.90625, + "epoch": 3.8373751783166905, + "grad_norm": 7.53125, "learning_rate": 8.720444444444445e-05, - "loss": 0.6427, + "loss": 0.8764, "step": 5380 }, { - "epoch": 0.6489284854322177, + "epoch": 3.8445078459343796, "grad_norm": 7.96875, "learning_rate": 8.716000000000001e-05, - "loss": 0.7689, + "loss": 0.9348, "step": 5390 }, { - "epoch": 0.6501324343847821, - "grad_norm": 8.9375, + "epoch": 3.8516405135520686, + "grad_norm": 7.5625, "learning_rate": 8.711555555555556e-05, - "loss": 0.6957, + "loss": 0.9033, "step": 5400 }, { - "epoch": 0.6501324343847821, - "eval/acc": 34.88372039794922, + "epoch": 3.8516405135520686, + "eval/acc": 39.53488540649414, "step": 5400 }, { - "epoch": 0.6501324343847821, - "eval_loss": 2.8916988372802734, - "eval_runtime": 0.2068, - "eval_samples_per_second": 207.976, - "eval_steps_per_second": 4.837, + "epoch": 3.8516405135520686, + "eval_loss": 2.0633187294006348, + "eval_runtime": 0.2247, + "eval_samples_per_second": 191.324, + "eval_steps_per_second": 4.449, "step": 5400 }, { - "epoch": 0.6513363833373464, - "grad_norm": 6.34375, + "epoch": 3.8587731811697576, + "grad_norm": 6.90625, "learning_rate": 8.707111111111111e-05, - "loss": 0.6811, + "loss": 0.9344, "step": 5410 }, { - "epoch": 0.6525403322899109, - "grad_norm": 6.71875, + "epoch": 3.8659058487874463, + "grad_norm": 7.5, "learning_rate": 8.702666666666667e-05, - "loss": 0.6849, + "loss": 0.9346, "step": 5420 }, { - "epoch": 0.6537442812424753, - "grad_norm": 6.46875, + "epoch": 3.8730385164051357, + "grad_norm": 7.03125, "learning_rate": 8.698222222222223e-05, - "loss": 0.6134, + "loss": 0.8835, "step": 5430 }, { - "epoch": 0.6549482301950398, - "grad_norm": 10.5, + "epoch": 3.8801711840228243, + "grad_norm": 6.3125, "learning_rate": 8.693777777777778e-05, - "loss": 0.6213, + "loss": 0.8434, "step": 5440 }, { - "epoch": 0.6561521791476042, - "grad_norm": 6.25, + "epoch": 3.8873038516405134, + "grad_norm": 7.03125, "learning_rate": 8.689333333333334e-05, - "loss": 0.6892, + "loss": 0.8555, "step": 5450 }, { - "epoch": 0.6573561281001685, - "grad_norm": 7.0, + "epoch": 3.8944365192582024, + "grad_norm": 8.0, "learning_rate": 8.684888888888889e-05, - "loss": 0.6003, + "loss": 0.9287, "step": 5460 }, { - "epoch": 0.658560077052733, - "grad_norm": 7.46875, + "epoch": 3.9015691868758915, + "grad_norm": 8.1875, "learning_rate": 8.680444444444444e-05, - "loss": 0.726, + "loss": 0.8738, "step": 5470 }, { - "epoch": 0.6597640260052974, - "grad_norm": 6.0, + "epoch": 3.9087018544935805, + "grad_norm": 7.96875, "learning_rate": 8.676e-05, - "loss": 0.7526, + "loss": 0.8189, "step": 5480 }, { - "epoch": 0.6609679749578617, - "grad_norm": 9.875, + "epoch": 3.9158345221112696, + "grad_norm": 10.1875, "learning_rate": 8.671555555555556e-05, - "loss": 0.603, + "loss": 0.8983, "step": 5490 }, { - "epoch": 0.6621719239104262, - "grad_norm": 13.6875, + "epoch": 3.9229671897289586, + "grad_norm": 10.375, "learning_rate": 8.667111111111111e-05, - "loss": 0.6759, + "loss": 0.8083, "step": 5500 }, { - "epoch": 0.6621719239104262, - "eval/acc": 34.88372039794922, + "epoch": 3.9229671897289586, + "eval/acc": 39.53488540649414, "step": 5500 }, { - "epoch": 0.6621719239104262, - "eval_loss": 2.915025234222412, - "eval_runtime": 0.2074, - "eval_samples_per_second": 207.294, - "eval_steps_per_second": 4.821, + "epoch": 3.9229671897289586, + "eval_loss": 2.089243173599243, + "eval_runtime": 0.2203, + "eval_samples_per_second": 195.23, + "eval_steps_per_second": 4.54, "step": 5500 }, { - "epoch": 0.6633758728629906, - "grad_norm": 8.8125, + "epoch": 3.9300998573466477, + "grad_norm": 13.125, "learning_rate": 8.662666666666666e-05, - "loss": 0.6582, + "loss": 0.8747, "step": 5510 }, { - "epoch": 0.664579821815555, - "grad_norm": 7.6875, + "epoch": 3.9372325249643367, + "grad_norm": 8.25, "learning_rate": 8.658222222222224e-05, - "loss": 0.6219, + "loss": 0.8609, "step": 5520 }, { - "epoch": 0.6657837707681195, - "grad_norm": 9.25, + "epoch": 3.944365192582026, + "grad_norm": 6.75, "learning_rate": 8.653777777777779e-05, - "loss": 0.742, + "loss": 0.8563, "step": 5530 }, { - "epoch": 0.6669877197206838, - "grad_norm": 6.59375, + "epoch": 3.951497860199715, + "grad_norm": 7.75, "learning_rate": 8.649333333333333e-05, - "loss": 0.653, + "loss": 0.8912, "step": 5540 }, { - "epoch": 0.6681916686732483, - "grad_norm": 9.25, + "epoch": 3.9586305278174034, + "grad_norm": 6.40625, "learning_rate": 8.64488888888889e-05, - "loss": 0.67, + "loss": 0.7477, "step": 5550 }, { - "epoch": 0.6693956176258127, - "grad_norm": 7.59375, + "epoch": 3.965763195435093, + "grad_norm": 7.0, "learning_rate": 8.640444444444444e-05, - "loss": 0.7448, + "loss": 0.8185, "step": 5560 }, { - "epoch": 0.670599566578377, - "grad_norm": 7.125, + "epoch": 3.9728958630527815, + "grad_norm": 5.6875, "learning_rate": 8.636e-05, - "loss": 0.607, + "loss": 0.9497, "step": 5570 }, { - "epoch": 0.6718035155309415, - "grad_norm": 6.59375, + "epoch": 3.980028530670471, + "grad_norm": 8.0, "learning_rate": 8.631555555555556e-05, - "loss": 0.6398, + "loss": 0.8117, "step": 5580 }, { - "epoch": 0.6730074644835059, - "grad_norm": 6.21875, + "epoch": 3.9871611982881596, + "grad_norm": 6.625, "learning_rate": 8.627111111111112e-05, - "loss": 0.6334, + "loss": 0.8245, "step": 5590 }, { - "epoch": 0.6742114134360703, - "grad_norm": 7.0625, + "epoch": 3.9942938659058487, + "grad_norm": 6.96875, "learning_rate": 8.622666666666667e-05, - "loss": 0.6878, + "loss": 0.902, "step": 5600 }, { - "epoch": 0.6742114134360703, - "eval/acc": 32.55813980102539, + "epoch": 3.9942938659058487, + "eval/acc": 39.53488540649414, "step": 5600 }, { - "epoch": 0.6742114134360703, - "eval_loss": 2.8182010650634766, - "eval_runtime": 0.207, - "eval_samples_per_second": 207.724, - "eval_steps_per_second": 4.831, + "epoch": 3.9942938659058487, + "eval_loss": 2.186225652694702, + "eval_runtime": 0.2194, + "eval_samples_per_second": 196.001, + "eval_steps_per_second": 4.558, "step": 5600 }, { - "epoch": 0.6754153623886348, - "grad_norm": 7.9375, + "epoch": 4.001426533523538, + "grad_norm": 6.78125, "learning_rate": 8.618222222222223e-05, - "loss": 0.6577, + "loss": 0.8757, "step": 5610 }, { - "epoch": 0.6766193113411991, - "grad_norm": 7.34375, + "epoch": 4.008559201141227, + "grad_norm": 11.0625, "learning_rate": 8.613777777777779e-05, - "loss": 0.7787, + "loss": 0.885, "step": 5620 }, { - "epoch": 0.6778232602937635, - "grad_norm": 6.96875, + "epoch": 4.015691868758916, + "grad_norm": 6.4375, "learning_rate": 8.609333333333334e-05, - "loss": 0.7849, + "loss": 0.8611, "step": 5630 }, { - "epoch": 0.679027209246328, - "grad_norm": 16.125, + "epoch": 4.022824536376604, + "grad_norm": 14.8125, "learning_rate": 8.604888888888889e-05, - "loss": 0.8503, + "loss": 0.8262, "step": 5640 }, { - "epoch": 0.6802311581988923, - "grad_norm": 7.625, + "epoch": 4.029957203994294, + "grad_norm": 8.0625, "learning_rate": 8.600444444444445e-05, - "loss": 0.6215, + "loss": 0.7549, "step": 5650 }, { - "epoch": 0.6814351071514568, - "grad_norm": 7.28125, + "epoch": 4.0370898716119825, + "grad_norm": 6.84375, "learning_rate": 8.596000000000001e-05, - "loss": 0.6894, + "loss": 0.8725, "step": 5660 }, { - "epoch": 0.6826390561040212, - "grad_norm": 6.28125, + "epoch": 4.044222539229672, + "grad_norm": 8.0, "learning_rate": 8.591555555555556e-05, - "loss": 0.616, + "loss": 0.8846, "step": 5670 }, { - "epoch": 0.6838430050565856, - "grad_norm": 6.125, + "epoch": 4.051355206847361, + "grad_norm": 7.84375, "learning_rate": 8.587111111111111e-05, - "loss": 0.6417, + "loss": 0.9373, "step": 5680 }, { - "epoch": 0.68504695400915, - "grad_norm": 7.78125, + "epoch": 4.05848787446505, + "grad_norm": 6.84375, "learning_rate": 8.582666666666667e-05, - "loss": 0.7842, + "loss": 0.7823, "step": 5690 }, { - "epoch": 0.6862509029617144, - "grad_norm": 9.4375, + "epoch": 4.065620542082739, + "grad_norm": 11.4375, "learning_rate": 8.578222222222223e-05, - "loss": 0.6562, + "loss": 0.9588, "step": 5700 }, { - "epoch": 0.6862509029617144, - "eval/acc": 32.55813980102539, + "epoch": 4.065620542082739, + "eval/acc": 37.20930099487305, "step": 5700 }, { - "epoch": 0.6862509029617144, - "eval_loss": 2.861806869506836, - "eval_runtime": 0.2083, - "eval_samples_per_second": 206.449, - "eval_steps_per_second": 4.801, + "epoch": 4.065620542082739, + "eval_loss": 2.841008424758911, + "eval_runtime": 1.3984, + "eval_samples_per_second": 30.749, + "eval_steps_per_second": 0.715, "step": 5700 }, { - "epoch": 0.6874548519142788, - "grad_norm": 6.46875, + "epoch": 4.072753209700428, + "grad_norm": 5.5625, "learning_rate": 8.573777777777778e-05, - "loss": 0.6165, + "loss": 0.8014, "step": 5710 }, { - "epoch": 0.6886588008668433, - "grad_norm": 7.0625, + "epoch": 4.079885877318117, + "grad_norm": 6.90625, "learning_rate": 8.569333333333334e-05, - "loss": 0.7014, + "loss": 0.818, "step": 5720 }, { - "epoch": 0.6898627498194077, - "grad_norm": 8.0625, + "epoch": 4.087018544935806, + "grad_norm": 8.4375, "learning_rate": 8.564888888888889e-05, - "loss": 0.7459, + "loss": 0.8142, "step": 5730 }, { - "epoch": 0.691066698771972, - "grad_norm": 5.84375, + "epoch": 4.094151212553495, + "grad_norm": 7.75, "learning_rate": 8.560444444444445e-05, - "loss": 0.6708, + "loss": 0.863, "step": 5740 }, { - "epoch": 0.6922706477245365, - "grad_norm": 7.9375, + "epoch": 4.101283880171184, + "grad_norm": 6.90625, "learning_rate": 8.556e-05, - "loss": 0.6487, + "loss": 0.8501, "step": 5750 }, { - "epoch": 0.6934745966771009, - "grad_norm": 8.125, + "epoch": 4.108416547788873, + "grad_norm": 7.15625, "learning_rate": 8.551555555555556e-05, - "loss": 0.6634, + "loss": 0.8293, "step": 5760 }, { - "epoch": 0.6946785456296654, - "grad_norm": 5.0, + "epoch": 4.1155492154065625, + "grad_norm": 8.125, "learning_rate": 8.547111111111111e-05, - "loss": 0.6575, + "loss": 0.8655, "step": 5770 }, { - "epoch": 0.6958824945822297, - "grad_norm": 6.28125, + "epoch": 4.122681883024251, + "grad_norm": 7.75, "learning_rate": 8.542666666666666e-05, - "loss": 0.6661, + "loss": 0.7958, "step": 5780 }, { - "epoch": 0.6970864435347941, - "grad_norm": 6.5, + "epoch": 4.12981455064194, + "grad_norm": 8.3125, "learning_rate": 8.538222222222224e-05, - "loss": 0.6922, + "loss": 0.9186, "step": 5790 }, { - "epoch": 0.6982903924873586, - "grad_norm": 9.0625, + "epoch": 4.136947218259629, + "grad_norm": 7.0625, "learning_rate": 8.533777777777778e-05, - "loss": 0.687, + "loss": 0.9135, "step": 5800 }, { - "epoch": 0.6982903924873586, - "eval/acc": 37.79069900512695, + "epoch": 4.136947218259629, + "eval/acc": 37.20930099487305, "step": 5800 }, { - "epoch": 0.6982903924873586, - "eval_loss": 2.878754138946533, - "eval_runtime": 0.2107, - "eval_samples_per_second": 204.039, - "eval_steps_per_second": 4.745, + "epoch": 4.136947218259629, + "eval_loss": 2.8186914920806885, + "eval_runtime": 0.2153, + "eval_samples_per_second": 199.722, + "eval_steps_per_second": 4.645, "step": 5800 }, { - "epoch": 0.699494341439923, - "grad_norm": 8.875, + "epoch": 4.144079885877318, + "grad_norm": 8.125, "learning_rate": 8.529333333333333e-05, - "loss": 0.7106, + "loss": 0.8248, "step": 5810 }, { - "epoch": 0.7006982903924873, - "grad_norm": 8.3125, + "epoch": 4.151212553495007, + "grad_norm": 7.65625, "learning_rate": 8.52488888888889e-05, - "loss": 0.5969, + "loss": 0.9186, "step": 5820 }, { - "epoch": 0.7019022393450518, - "grad_norm": 6.40625, + "epoch": 4.158345221112696, + "grad_norm": 7.6875, "learning_rate": 8.520444444444446e-05, - "loss": 0.6795, + "loss": 0.8367, "step": 5830 }, { - "epoch": 0.7031061882976162, - "grad_norm": 8.5625, + "epoch": 4.165477888730385, + "grad_norm": 9.75, "learning_rate": 8.516e-05, - "loss": 0.7621, + "loss": 0.8898, "step": 5840 }, { - "epoch": 0.7043101372501805, - "grad_norm": 9.5625, + "epoch": 4.172610556348074, + "grad_norm": 8.5625, "learning_rate": 8.511555555555555e-05, - "loss": 0.7035, + "loss": 0.9218, "step": 5850 }, { - "epoch": 0.705514086202745, - "grad_norm": 11.3125, + "epoch": 4.1797432239657635, + "grad_norm": 6.0, "learning_rate": 8.507111111111112e-05, - "loss": 0.8043, + "loss": 0.8784, "step": 5860 }, { - "epoch": 0.7067180351553094, - "grad_norm": 7.4375, + "epoch": 4.186875891583452, + "grad_norm": 8.5625, "learning_rate": 8.502666666666666e-05, - "loss": 0.6349, + "loss": 0.8361, "step": 5870 }, { - "epoch": 0.7079219841078739, - "grad_norm": 6.28125, + "epoch": 4.194008559201142, + "grad_norm": 7.40625, "learning_rate": 8.498222222222223e-05, - "loss": 0.6593, + "loss": 0.816, "step": 5880 }, { - "epoch": 0.7091259330604383, - "grad_norm": 6.4375, + "epoch": 4.20114122681883, + "grad_norm": 7.84375, "learning_rate": 8.493777777777779e-05, - "loss": 0.6236, + "loss": 0.897, "step": 5890 }, { - "epoch": 0.7103298820130026, - "grad_norm": 7.84375, + "epoch": 4.20827389443652, + "grad_norm": 10.0625, "learning_rate": 8.489333333333334e-05, - "loss": 0.6134, + "loss": 0.7807, "step": 5900 }, { - "epoch": 0.7103298820130026, - "eval/acc": 34.88372039794922, + "epoch": 4.20827389443652, + "eval/acc": 37.20930099487305, "step": 5900 }, { - "epoch": 0.7103298820130026, - "eval_loss": 2.918956756591797, - "eval_runtime": 0.2129, - "eval_samples_per_second": 201.945, - "eval_steps_per_second": 4.696, + "epoch": 4.20827389443652, + "eval_loss": 2.890333890914917, + "eval_runtime": 0.2187, + "eval_samples_per_second": 196.595, + "eval_steps_per_second": 4.572, "step": 5900 }, { - "epoch": 0.7115338309655671, - "grad_norm": 7.40625, + "epoch": 4.215406562054208, + "grad_norm": 7.6875, "learning_rate": 8.484888888888888e-05, - "loss": 0.5883, + "loss": 0.8786, "step": 5910 }, { - "epoch": 0.7127377799181315, - "grad_norm": 7.0625, + "epoch": 4.222539229671897, + "grad_norm": 7.46875, "learning_rate": 8.480444444444445e-05, - "loss": 0.6805, + "loss": 0.8689, "step": 5920 }, { - "epoch": 0.7139417288706958, - "grad_norm": 5.25, + "epoch": 4.229671897289586, + "grad_norm": 14.125, "learning_rate": 8.476000000000001e-05, - "loss": 0.5638, + "loss": 0.83, "step": 5930 }, { - "epoch": 0.7151456778232603, - "grad_norm": 5.84375, + "epoch": 4.236804564907275, + "grad_norm": 6.09375, "learning_rate": 8.471555555555556e-05, - "loss": 0.6112, + "loss": 0.8921, "step": 5940 }, { - "epoch": 0.7163496267758247, - "grad_norm": 6.5625, + "epoch": 4.2439372325249645, + "grad_norm": 8.875, "learning_rate": 8.467111111111112e-05, - "loss": 0.6147, + "loss": 0.9293, "step": 5950 }, { - "epoch": 0.7175535757283891, - "grad_norm": 6.15625, + "epoch": 4.251069900142653, + "grad_norm": 10.5625, "learning_rate": 8.462666666666667e-05, - "loss": 0.7292, + "loss": 0.7955, "step": 5960 }, { - "epoch": 0.7187575246809536, - "grad_norm": 8.25, + "epoch": 4.258202567760343, + "grad_norm": 15.25, "learning_rate": 8.458222222222223e-05, - "loss": 0.6048, + "loss": 0.9267, "step": 5970 }, { - "epoch": 0.7199614736335179, - "grad_norm": 8.0625, + "epoch": 4.265335235378031, + "grad_norm": 8.0, "learning_rate": 8.453777777777778e-05, - "loss": 0.581, + "loss": 0.7665, "step": 5980 }, { - "epoch": 0.7211654225860824, - "grad_norm": 7.90625, + "epoch": 4.272467902995721, + "grad_norm": 6.4375, "learning_rate": 8.449333333333334e-05, - "loss": 0.6918, + "loss": 0.8212, "step": 5990 }, { - "epoch": 0.7223693715386468, - "grad_norm": 5.65625, + "epoch": 4.279600570613409, + "grad_norm": 8.0625, "learning_rate": 8.444888888888889e-05, - "loss": 0.6774, + "loss": 0.8294, "step": 6000 }, { - "epoch": 0.7223693715386468, - "eval/acc": 36.627906799316406, + "epoch": 4.279600570613409, + "eval/acc": 34.88372039794922, "step": 6000 }, { - "epoch": 0.7223693715386468, - "eval_loss": 2.936192512512207, - "eval_runtime": 0.2113, - "eval_samples_per_second": 203.531, - "eval_steps_per_second": 4.733, + "epoch": 4.279600570613409, + "eval_loss": 2.8812708854675293, + "eval_runtime": 0.2262, + "eval_samples_per_second": 190.082, + "eval_steps_per_second": 4.421, "step": 6000 }, { - "epoch": 0.7235733204912111, - "grad_norm": 7.59375, + "epoch": 4.286733238231099, + "grad_norm": 5.625, "learning_rate": 8.440444444444445e-05, - "loss": 0.5982, + "loss": 0.8813, "step": 6010 }, { - "epoch": 0.7247772694437756, - "grad_norm": 9.0625, + "epoch": 4.293865905848787, + "grad_norm": 8.375, "learning_rate": 8.436000000000001e-05, - "loss": 0.6048, + "loss": 0.8792, "step": 6020 }, { - "epoch": 0.72598121839634, - "grad_norm": 7.46875, + "epoch": 4.300998573466477, + "grad_norm": 9.125, "learning_rate": 8.431555555555556e-05, - "loss": 0.7024, + "loss": 0.9509, "step": 6030 }, { - "epoch": 0.7271851673489044, - "grad_norm": 8.0625, + "epoch": 4.3081312410841655, + "grad_norm": 7.34375, "learning_rate": 8.427111111111111e-05, - "loss": 0.7556, + "loss": 0.9452, "step": 6040 }, { - "epoch": 0.7283891163014689, - "grad_norm": 6.78125, + "epoch": 4.315263908701855, + "grad_norm": 8.25, "learning_rate": 8.422666666666667e-05, - "loss": 0.7187, + "loss": 0.8801, "step": 6050 }, { - "epoch": 0.7295930652540332, - "grad_norm": 6.8125, + "epoch": 4.3223965763195435, + "grad_norm": 6.75, "learning_rate": 8.418222222222223e-05, - "loss": 0.5774, + "loss": 0.805, "step": 6060 }, { - "epoch": 0.7307970142065976, - "grad_norm": 6.9375, + "epoch": 4.329529243937232, + "grad_norm": 8.375, "learning_rate": 8.413777777777778e-05, - "loss": 0.6724, + "loss": 0.8176, "step": 6070 }, { - "epoch": 0.7320009631591621, + "epoch": 4.336661911554922, "grad_norm": 6.1875, "learning_rate": 8.409333333333333e-05, - "loss": 0.6109, + "loss": 0.8662, "step": 6080 }, { - "epoch": 0.7332049121117264, - "grad_norm": 5.84375, + "epoch": 4.34379457917261, + "grad_norm": 6.03125, "learning_rate": 8.404888888888889e-05, - "loss": 0.6251, + "loss": 0.9121, "step": 6090 }, { - "epoch": 0.7344088610642908, - "grad_norm": 6.78125, + "epoch": 4.3509272467903, + "grad_norm": 5.6875, "learning_rate": 8.400444444444445e-05, - "loss": 0.6916, + "loss": 0.8697, "step": 6100 }, { - "epoch": 0.7344088610642908, - "eval/acc": 32.55813980102539, + "epoch": 4.3509272467903, + "eval/acc": 39.53488540649414, "step": 6100 }, { - "epoch": 0.7344088610642908, - "eval_loss": 2.947686195373535, - "eval_runtime": 0.2088, - "eval_samples_per_second": 205.91, - "eval_steps_per_second": 4.789, + "epoch": 4.3509272467903, + "eval_loss": 2.7605249881744385, + "eval_runtime": 0.2226, + "eval_samples_per_second": 193.191, + "eval_steps_per_second": 4.493, "step": 6100 }, { - "epoch": 0.7356128100168553, - "grad_norm": 6.96875, + "epoch": 4.358059914407988, + "grad_norm": 8.125, "learning_rate": 8.396e-05, - "loss": 0.6525, + "loss": 0.783, "step": 6110 }, { - "epoch": 0.7368167589694197, - "grad_norm": 9.625, + "epoch": 4.365192582025678, + "grad_norm": 6.71875, "learning_rate": 8.391555555555556e-05, - "loss": 0.6107, + "loss": 0.7273, "step": 6120 }, { - "epoch": 0.7380207079219842, - "grad_norm": 5.84375, + "epoch": 4.372325249643366, + "grad_norm": 7.625, "learning_rate": 8.387111111111111e-05, - "loss": 0.6339, + "loss": 0.9497, "step": 6130 }, { - "epoch": 0.7392246568745485, - "grad_norm": 8.0, + "epoch": 4.379457917261056, + "grad_norm": 7.625, "learning_rate": 8.382666666666667e-05, - "loss": 0.6243, + "loss": 0.9318, "step": 6140 }, { - "epoch": 0.7404286058271129, - "grad_norm": 7.9375, + "epoch": 4.3865905848787445, + "grad_norm": 7.5625, "learning_rate": 8.378222222222222e-05, - "loss": 0.6644, + "loss": 0.7827, "step": 6150 }, { - "epoch": 0.7416325547796774, + "epoch": 4.393723252496434, "grad_norm": 7.4375, "learning_rate": 8.373777777777779e-05, - "loss": 0.6117, + "loss": 0.8471, "step": 6160 }, { - "epoch": 0.7428365037322417, - "grad_norm": 7.28125, + "epoch": 4.400855920114123, + "grad_norm": 5.59375, "learning_rate": 8.369333333333333e-05, - "loss": 0.6253, + "loss": 0.866, "step": 6170 }, { - "epoch": 0.7440404526848061, - "grad_norm": 6.59375, + "epoch": 4.407988587731811, + "grad_norm": 5.34375, "learning_rate": 8.364888888888888e-05, - "loss": 0.5973, + "loss": 0.8237, "step": 6180 }, { - "epoch": 0.7452444016373706, - "grad_norm": 8.5, + "epoch": 4.415121255349501, + "grad_norm": 9.375, "learning_rate": 8.360444444444446e-05, - "loss": 0.5938, + "loss": 0.896, "step": 6190 }, { - "epoch": 0.746448350589935, - "grad_norm": 6.40625, + "epoch": 4.422253922967189, + "grad_norm": 7.78125, "learning_rate": 8.356e-05, - "loss": 0.7276, + "loss": 0.8402, "step": 6200 }, { - "epoch": 0.746448350589935, - "eval/acc": 34.88372039794922, + "epoch": 4.422253922967189, + "eval/acc": 37.20930099487305, "step": 6200 }, { - "epoch": 0.746448350589935, - "eval_loss": 3.0573887825012207, - "eval_runtime": 0.2067, - "eval_samples_per_second": 208.014, - "eval_steps_per_second": 4.838, + "epoch": 4.422253922967189, + "eval_loss": 2.8444175720214844, + "eval_runtime": 0.2217, + "eval_samples_per_second": 193.997, + "eval_steps_per_second": 4.512, "step": 6200 }, { - "epoch": 0.7476522995424993, - "grad_norm": 6.75, + "epoch": 4.429386590584879, + "grad_norm": 7.625, "learning_rate": 8.351555555555555e-05, - "loss": 0.6518, + "loss": 0.8708, "step": 6210 }, { - "epoch": 0.7488562484950638, - "grad_norm": 6.5, + "epoch": 4.436519258202567, + "grad_norm": 7.28125, "learning_rate": 8.347111111111112e-05, - "loss": 0.5737, + "loss": 0.8505, "step": 6220 }, { - "epoch": 0.7500601974476282, - "grad_norm": 7.96875, + "epoch": 4.443651925820257, + "grad_norm": 7.28125, "learning_rate": 8.342666666666668e-05, - "loss": 0.743, + "loss": 0.878, "step": 6230 }, { - "epoch": 0.7512641464001927, - "grad_norm": 8.375, + "epoch": 4.4507845934379455, + "grad_norm": 8.0, "learning_rate": 8.338222222222223e-05, - "loss": 0.6803, + "loss": 0.7568, "step": 6240 }, { - "epoch": 0.752468095352757, - "grad_norm": 10.9375, + "epoch": 4.457917261055635, + "grad_norm": 7.28125, "learning_rate": 8.333777777777778e-05, - "loss": 0.8047, + "loss": 0.7909, "step": 6250 }, { - "epoch": 0.7536720443053214, - "grad_norm": 6.21875, + "epoch": 4.465049928673324, + "grad_norm": 10.625, "learning_rate": 8.329333333333334e-05, - "loss": 0.5941, + "loss": 0.8732, "step": 6260 }, { - "epoch": 0.7548759932578859, - "grad_norm": 7.0, + "epoch": 4.472182596291013, + "grad_norm": 7.40625, "learning_rate": 8.324888888888889e-05, - "loss": 0.673, + "loss": 0.8827, "step": 6270 }, { - "epoch": 0.7560799422104503, - "grad_norm": 5.6875, + "epoch": 4.479315263908702, + "grad_norm": 11.25, "learning_rate": 8.320444444444445e-05, - "loss": 0.6869, + "loss": 0.7889, "step": 6280 }, { - "epoch": 0.7572838911630146, - "grad_norm": 7.46875, + "epoch": 4.486447931526391, + "grad_norm": 7.59375, "learning_rate": 8.316000000000001e-05, - "loss": 0.7399, + "loss": 0.7808, "step": 6290 }, { - "epoch": 0.7584878401155791, - "grad_norm": 7.21875, + "epoch": 4.49358059914408, + "grad_norm": 5.40625, "learning_rate": 8.311555555555556e-05, - "loss": 0.6582, + "loss": 0.8223, "step": 6300 }, { - "epoch": 0.7584878401155791, - "eval/acc": 34.88372039794922, + "epoch": 4.49358059914408, + "eval/acc": 37.20930099487305, "step": 6300 }, { - "epoch": 0.7584878401155791, - "eval_loss": 2.991325616836548, - "eval_runtime": 0.2058, - "eval_samples_per_second": 208.93, - "eval_steps_per_second": 4.859, + "epoch": 4.49358059914408, + "eval_loss": 2.798743963241577, + "eval_runtime": 0.2178, + "eval_samples_per_second": 197.44, + "eval_steps_per_second": 4.592, "step": 6300 }, { - "epoch": 0.7596917890681435, - "grad_norm": 7.5625, + "epoch": 4.500713266761769, + "grad_norm": 7.9375, "learning_rate": 8.307111111111111e-05, - "loss": 0.6455, + "loss": 0.8588, "step": 6310 }, { - "epoch": 0.7608957380207079, - "grad_norm": 5.0625, + "epoch": 4.507845934379458, + "grad_norm": 8.0625, "learning_rate": 8.302666666666667e-05, - "loss": 0.6269, + "loss": 0.9003, "step": 6320 }, { - "epoch": 0.7620996869732723, - "grad_norm": 7.15625, + "epoch": 4.5149786019971465, + "grad_norm": 7.21875, "learning_rate": 8.298222222222223e-05, - "loss": 0.6453, + "loss": 0.8942, "step": 6330 }, { - "epoch": 0.7633036359258367, - "grad_norm": 6.34375, + "epoch": 4.522111269614836, + "grad_norm": 7.625, "learning_rate": 8.293777777777778e-05, - "loss": 0.6721, + "loss": 0.8622, "step": 6340 }, { - "epoch": 0.7645075848784012, - "grad_norm": 7.59375, + "epoch": 4.529243937232525, + "grad_norm": 5.53125, "learning_rate": 8.289333333333333e-05, - "loss": 0.569, + "loss": 0.8048, "step": 6350 }, { - "epoch": 0.7657115338309656, - "grad_norm": 6.78125, + "epoch": 4.536376604850214, + "grad_norm": 9.125, "learning_rate": 8.28488888888889e-05, - "loss": 0.6221, + "loss": 0.8506, "step": 6360 }, { - "epoch": 0.76691548278353, - "grad_norm": 9.875, + "epoch": 4.543509272467903, + "grad_norm": 6.125, "learning_rate": 8.280444444444445e-05, - "loss": 0.6623, + "loss": 0.7767, "step": 6370 }, { - "epoch": 0.7681194317360944, - "grad_norm": 7.125, + "epoch": 4.550641940085592, + "grad_norm": 6.90625, "learning_rate": 8.276e-05, - "loss": 0.7166, + "loss": 0.9143, "step": 6380 }, { - "epoch": 0.7693233806886588, - "grad_norm": 7.59375, + "epoch": 4.557774607703281, + "grad_norm": 5.84375, "learning_rate": 8.271555555555556e-05, - "loss": 0.6984, + "loss": 0.8641, "step": 6390 }, { - "epoch": 0.7705273296412232, - "grad_norm": 9.4375, + "epoch": 4.56490727532097, + "grad_norm": 6.3125, "learning_rate": 8.267111111111111e-05, - "loss": 0.7095, + "loss": 0.8297, "step": 6400 }, { - "epoch": 0.7705273296412232, - "eval/acc": 34.88372039794922, + "epoch": 4.56490727532097, + "eval/acc": 37.20930099487305, "step": 6400 }, { - "epoch": 0.7705273296412232, - "eval_loss": 3.0461771488189697, - "eval_runtime": 0.208, - "eval_samples_per_second": 206.746, - "eval_steps_per_second": 4.808, + "epoch": 4.56490727532097, + "eval_loss": 2.804457426071167, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.742, + "eval_steps_per_second": 4.529, "step": 6400 }, { - "epoch": 0.7717312785937877, - "grad_norm": 9.375, + "epoch": 4.572039942938659, + "grad_norm": 7.15625, "learning_rate": 8.262666666666667e-05, - "loss": 0.6975, + "loss": 0.7398, "step": 6410 }, { - "epoch": 0.772935227546352, - "grad_norm": 6.75, + "epoch": 4.579172610556348, + "grad_norm": 6.125, "learning_rate": 8.258222222222222e-05, - "loss": 0.5826, + "loss": 0.8443, "step": 6420 }, { - "epoch": 0.7741391764989164, - "grad_norm": 8.25, + "epoch": 4.586305278174037, + "grad_norm": 9.25, "learning_rate": 8.253777777777778e-05, - "loss": 0.6596, + "loss": 0.7983, "step": 6430 }, { - "epoch": 0.7753431254514809, - "grad_norm": 6.375, + "epoch": 4.5934379457917265, + "grad_norm": 7.3125, "learning_rate": 8.249333333333333e-05, - "loss": 0.6624, + "loss": 0.9705, "step": 6440 }, { - "epoch": 0.7765470744040452, - "grad_norm": 7.375, + "epoch": 4.600570613409415, + "grad_norm": 7.34375, "learning_rate": 8.24488888888889e-05, - "loss": 0.6221, + "loss": 1.0079, "step": 6450 }, { - "epoch": 0.7777510233566097, - "grad_norm": 8.125, + "epoch": 4.607703281027105, + "grad_norm": 8.875, "learning_rate": 8.240444444444446e-05, - "loss": 0.6819, + "loss": 0.8982, "step": 6460 }, { - "epoch": 0.7789549723091741, - "grad_norm": 4.375, + "epoch": 4.614835948644793, + "grad_norm": 8.375, "learning_rate": 8.236e-05, - "loss": 0.588, + "loss": 0.8417, "step": 6470 }, { - "epoch": 0.7801589212617385, - "grad_norm": 8.875, + "epoch": 4.621968616262482, + "grad_norm": 7.78125, "learning_rate": 8.231555555555555e-05, - "loss": 0.7451, + "loss": 0.8566, "step": 6480 }, { - "epoch": 0.781362870214303, - "grad_norm": 8.5, + "epoch": 4.629101283880171, + "grad_norm": 6.5625, "learning_rate": 8.227111111111111e-05, - "loss": 0.64, + "loss": 0.8155, "step": 6490 }, { - "epoch": 0.7825668191668673, - "grad_norm": 6.59375, + "epoch": 4.63623395149786, + "grad_norm": 5.875, "learning_rate": 8.222666666666668e-05, - "loss": 0.6879, + "loss": 0.9449, "step": 6500 }, { - "epoch": 0.7825668191668673, - "eval/acc": 32.55813980102539, + "epoch": 4.63623395149786, + "eval/acc": 41.86046600341797, "step": 6500 }, { - "epoch": 0.7825668191668673, - "eval_loss": 2.970376491546631, - "eval_runtime": 0.2075, - "eval_samples_per_second": 207.198, - "eval_steps_per_second": 4.819, + "epoch": 4.63623395149786, + "eval_loss": 2.761596918106079, + "eval_runtime": 0.2144, + "eval_samples_per_second": 200.549, + "eval_steps_per_second": 4.664, "step": 6500 }, { - "epoch": 0.7837707681194317, - "grad_norm": 6.96875, + "epoch": 4.643366619115549, + "grad_norm": 7.5, "learning_rate": 8.218222222222223e-05, - "loss": 0.6584, + "loss": 0.8549, "step": 6510 }, { - "epoch": 0.7849747170719962, - "grad_norm": 7.3125, + "epoch": 4.650499286733238, + "grad_norm": 7.0625, "learning_rate": 8.213777777777777e-05, - "loss": 0.6892, + "loss": 0.8473, "step": 6520 }, { - "epoch": 0.7861786660245605, - "grad_norm": 6.28125, + "epoch": 4.6576319543509275, + "grad_norm": 7.1875, "learning_rate": 8.209333333333334e-05, - "loss": 0.6658, + "loss": 0.8773, "step": 6530 }, { - "epoch": 0.7873826149771249, - "grad_norm": 7.3125, + "epoch": 4.664764621968616, + "grad_norm": 7.25, "learning_rate": 8.20488888888889e-05, - "loss": 0.6379, + "loss": 0.789, "step": 6540 }, { - "epoch": 0.7885865639296894, - "grad_norm": 6.09375, + "epoch": 4.671897289586306, + "grad_norm": 7.34375, "learning_rate": 8.200444444444445e-05, - "loss": 0.5797, + "loss": 0.852, "step": 6550 }, { - "epoch": 0.7897905128822538, - "grad_norm": 7.03125, + "epoch": 4.679029957203994, + "grad_norm": 5.65625, "learning_rate": 8.196000000000001e-05, - "loss": 0.6778, + "loss": 0.8291, "step": 6560 }, { - "epoch": 0.7909944618348183, - "grad_norm": 7.46875, + "epoch": 4.686162624821684, + "grad_norm": 5.5625, "learning_rate": 8.191555555555556e-05, - "loss": 0.669, + "loss": 0.7943, "step": 6570 }, { - "epoch": 0.7921984107873826, - "grad_norm": 7.46875, + "epoch": 4.693295292439372, + "grad_norm": 9.25, "learning_rate": 8.18711111111111e-05, - "loss": 0.7272, + "loss": 0.8418, "step": 6580 }, { - "epoch": 0.793402359739947, - "grad_norm": 6.3125, + "epoch": 4.700427960057061, + "grad_norm": 6.75, "learning_rate": 8.182666666666667e-05, - "loss": 0.5767, + "loss": 0.8661, "step": 6590 }, { - "epoch": 0.7946063086925115, - "grad_norm": 7.28125, + "epoch": 4.70756062767475, + "grad_norm": 7.40625, "learning_rate": 8.178222222222223e-05, - "loss": 0.6776, + "loss": 0.768, "step": 6600 }, { - "epoch": 0.7946063086925115, - "eval/acc": 34.88372039794922, + "epoch": 4.70756062767475, + "eval/acc": 41.86046600341797, "step": 6600 }, { - "epoch": 0.7946063086925115, - "eval_loss": 2.941105842590332, - "eval_runtime": 0.2071, - "eval_samples_per_second": 207.595, - "eval_steps_per_second": 4.828, + "epoch": 4.70756062767475, + "eval_loss": 2.8003947734832764, + "eval_runtime": 0.2209, + "eval_samples_per_second": 194.665, + "eval_steps_per_second": 4.527, "step": 6600 }, { - "epoch": 0.7958102576450758, - "grad_norm": 7.125, + "epoch": 4.71469329529244, + "grad_norm": 7.1875, "learning_rate": 8.173777777777778e-05, - "loss": 0.6368, + "loss": 0.9038, "step": 6610 }, { - "epoch": 0.7970142065976402, - "grad_norm": 6.34375, + "epoch": 4.7218259629101285, + "grad_norm": 6.46875, "learning_rate": 8.169333333333334e-05, - "loss": 0.6504, + "loss": 0.7185, "step": 6620 }, { - "epoch": 0.7982181555502047, - "grad_norm": 5.46875, + "epoch": 4.728958630527817, + "grad_norm": 6.3125, "learning_rate": 8.16488888888889e-05, - "loss": 0.6305, + "loss": 0.9515, "step": 6630 }, { - "epoch": 0.7994221045027691, - "grad_norm": 6.3125, + "epoch": 4.736091298145507, + "grad_norm": 6.46875, "learning_rate": 8.160444444444445e-05, - "loss": 0.6538, + "loss": 0.8127, "step": 6640 }, { - "epoch": 0.8006260534553334, - "grad_norm": 9.0625, + "epoch": 4.743223965763195, + "grad_norm": 6.4375, "learning_rate": 8.156e-05, - "loss": 0.6747, + "loss": 0.8914, "step": 6650 }, { - "epoch": 0.8018300024078979, - "grad_norm": 13.0, + "epoch": 4.750356633380885, + "grad_norm": 6.8125, "learning_rate": 8.151555555555556e-05, - "loss": 0.6412, + "loss": 0.8545, "step": 6660 }, { - "epoch": 0.8030339513604623, - "grad_norm": 7.0, + "epoch": 4.757489300998573, + "grad_norm": 7.21875, "learning_rate": 8.147111111111112e-05, - "loss": 0.6479, + "loss": 0.6783, "step": 6670 }, { - "epoch": 0.8042379003130268, - "grad_norm": 7.375, + "epoch": 4.764621968616263, + "grad_norm": 7.03125, "learning_rate": 8.142666666666667e-05, - "loss": 0.6577, + "loss": 0.9337, "step": 6680 }, { - "epoch": 0.8054418492655911, - "grad_norm": 7.625, + "epoch": 4.771754636233951, + "grad_norm": 10.5625, "learning_rate": 8.138222222222223e-05, - "loss": 0.7217, + "loss": 0.8181, "step": 6690 }, { - "epoch": 0.8066457982181555, - "grad_norm": 5.625, + "epoch": 4.778887303851641, + "grad_norm": 7.375, "learning_rate": 8.133777777777778e-05, - "loss": 0.6363, + "loss": 0.8639, "step": 6700 }, { - "epoch": 0.8066457982181555, - "eval/acc": 34.88372039794922, + "epoch": 4.778887303851641, + "eval/acc": 37.20930099487305, "step": 6700 }, { - "epoch": 0.8066457982181555, - "eval_loss": 2.8945717811584473, - "eval_runtime": 0.2054, - "eval_samples_per_second": 209.381, - "eval_steps_per_second": 4.869, + "epoch": 4.778887303851641, + "eval_loss": 2.8262782096862793, + "eval_runtime": 0.2194, + "eval_samples_per_second": 195.949, + "eval_steps_per_second": 4.557, "step": 6700 }, { - "epoch": 0.80784974717072, - "grad_norm": 8.0625, + "epoch": 4.7860199714693294, + "grad_norm": 10.8125, "learning_rate": 8.129333333333333e-05, - "loss": 0.6784, + "loss": 0.8742, "step": 6710 }, { - "epoch": 0.8090536961232844, - "grad_norm": 9.1875, + "epoch": 4.793152639087019, + "grad_norm": 5.53125, "learning_rate": 8.124888888888889e-05, - "loss": 0.6187, + "loss": 0.7438, "step": 6720 }, { - "epoch": 0.8102576450758487, - "grad_norm": 9.1875, + "epoch": 4.8002853067047075, + "grad_norm": 6.65625, "learning_rate": 8.120444444444445e-05, - "loss": 0.6461, + "loss": 0.7859, "step": 6730 }, { - "epoch": 0.8114615940284132, - "grad_norm": 7.375, + "epoch": 4.807417974322396, + "grad_norm": 6.78125, "learning_rate": 8.116e-05, - "loss": 0.7325, + "loss": 0.8942, "step": 6740 }, { - "epoch": 0.8126655429809776, - "grad_norm": 7.71875, + "epoch": 4.814550641940086, + "grad_norm": 8.4375, "learning_rate": 8.111555555555555e-05, - "loss": 0.6758, + "loss": 0.8483, "step": 6750 }, { - "epoch": 0.813869491933542, - "grad_norm": 10.125, + "epoch": 4.821683309557774, + "grad_norm": 6.40625, "learning_rate": 8.107111111111113e-05, - "loss": 0.6223, + "loss": 0.8284, "step": 6760 }, { - "epoch": 0.8150734408861064, - "grad_norm": 7.90625, + "epoch": 4.828815977175464, + "grad_norm": 6.84375, "learning_rate": 8.102666666666667e-05, - "loss": 0.6115, + "loss": 0.8887, "step": 6770 }, { - "epoch": 0.8162773898386708, - "grad_norm": 5.375, + "epoch": 4.835948644793152, + "grad_norm": 8.875, "learning_rate": 8.098222222222222e-05, - "loss": 0.5747, + "loss": 0.8431, "step": 6780 }, { - "epoch": 0.8174813387912353, - "grad_norm": 7.375, + "epoch": 4.843081312410842, + "grad_norm": 6.90625, "learning_rate": 8.093777777777779e-05, - "loss": 0.618, + "loss": 0.8325, "step": 6790 }, { - "epoch": 0.8186852877437997, - "grad_norm": 7.125, + "epoch": 4.85021398002853, + "grad_norm": 7.0, "learning_rate": 8.089333333333333e-05, - "loss": 0.6603, + "loss": 0.7742, "step": 6800 }, { - "epoch": 0.8186852877437997, - "eval/acc": 34.88372039794922, + "epoch": 4.85021398002853, + "eval/acc": 39.53488540649414, "step": 6800 }, { - "epoch": 0.8186852877437997, - "eval_loss": 2.9451656341552734, - "eval_runtime": 1.2476, - "eval_samples_per_second": 34.466, - "eval_steps_per_second": 0.802, + "epoch": 4.85021398002853, + "eval_loss": 2.7403292655944824, + "eval_runtime": 0.5509, + "eval_samples_per_second": 78.059, + "eval_steps_per_second": 1.815, "step": 6800 }, { - "epoch": 0.819889236696364, - "grad_norm": 6.28125, + "epoch": 4.85734664764622, + "grad_norm": 6.625, "learning_rate": 8.08488888888889e-05, - "loss": 0.5918, + "loss": 0.8418, "step": 6810 }, { - "epoch": 0.8210931856489285, - "grad_norm": 8.6875, + "epoch": 4.8644793152639085, + "grad_norm": 7.65625, "learning_rate": 8.080444444444444e-05, - "loss": 0.5911, + "loss": 0.9022, "step": 6820 }, { - "epoch": 0.8222971346014929, - "grad_norm": 6.75, + "epoch": 4.871611982881598, + "grad_norm": 7.75, "learning_rate": 8.076e-05, - "loss": 0.6648, + "loss": 0.8201, "step": 6830 }, { - "epoch": 0.8235010835540573, - "grad_norm": 6.78125, + "epoch": 4.878744650499287, + "grad_norm": 7.84375, "learning_rate": 8.071555555555555e-05, - "loss": 0.6044, + "loss": 0.8144, "step": 6840 }, { - "epoch": 0.8247050325066217, - "grad_norm": 15.1875, + "epoch": 4.885877318116976, + "grad_norm": 8.3125, "learning_rate": 8.067111111111112e-05, - "loss": 0.6896, + "loss": 0.8821, "step": 6850 }, { - "epoch": 0.8259089814591861, - "grad_norm": 7.6875, + "epoch": 4.893009985734665, + "grad_norm": 9.0, "learning_rate": 8.062666666666668e-05, - "loss": 0.5829, + "loss": 0.8572, "step": 6860 }, { - "epoch": 0.8271129304117505, - "grad_norm": 5.21875, + "epoch": 4.900142653352354, + "grad_norm": 10.0, "learning_rate": 8.058222222222223e-05, - "loss": 0.6934, + "loss": 0.7498, "step": 6870 }, { - "epoch": 0.828316879364315, - "grad_norm": 10.375, + "epoch": 4.907275320970043, + "grad_norm": 6.09375, "learning_rate": 8.053777777777778e-05, - "loss": 0.7309, + "loss": 0.8709, "step": 6880 }, { - "epoch": 0.8295208283168793, - "grad_norm": 8.1875, + "epoch": 4.914407988587731, + "grad_norm": 7.84375, "learning_rate": 8.049333333333334e-05, - "loss": 0.7213, + "loss": 0.8045, "step": 6890 }, { - "epoch": 0.8307247772694438, - "grad_norm": 5.15625, + "epoch": 4.921540656205421, + "grad_norm": 7.0625, "learning_rate": 8.04488888888889e-05, - "loss": 0.6034, + "loss": 0.8919, "step": 6900 }, { - "epoch": 0.8307247772694438, - "eval/acc": 32.55813980102539, + "epoch": 4.921540656205421, + "eval/acc": 34.88372039794922, "step": 6900 }, { - "epoch": 0.8307247772694438, - "eval_loss": 2.8601129055023193, - "eval_runtime": 0.2074, - "eval_samples_per_second": 207.302, - "eval_steps_per_second": 4.821, + "epoch": 4.921540656205421, + "eval_loss": 2.8702921867370605, + "eval_runtime": 0.2215, + "eval_samples_per_second": 194.143, + "eval_steps_per_second": 4.515, "step": 6900 }, { - "epoch": 0.8319287262220082, - "grad_norm": 7.25, + "epoch": 4.9286733238231095, + "grad_norm": 18.125, "learning_rate": 8.040444444444445e-05, - "loss": 0.5585, + "loss": 0.8407, "step": 6910 }, { - "epoch": 0.8331326751745726, - "grad_norm": 5.9375, + "epoch": 4.935805991440799, + "grad_norm": 7.8125, "learning_rate": 8.036e-05, - "loss": 0.7539, + "loss": 0.9023, "step": 6920 }, { - "epoch": 0.834336624127137, - "grad_norm": 8.0, + "epoch": 4.942938659058488, + "grad_norm": 6.53125, "learning_rate": 8.031555555555556e-05, - "loss": 0.6104, + "loss": 0.7747, "step": 6930 }, { - "epoch": 0.8355405730797014, - "grad_norm": 7.4375, + "epoch": 4.950071326676177, + "grad_norm": 7.3125, "learning_rate": 8.027111111111112e-05, - "loss": 0.613, + "loss": 0.7357, "step": 6940 }, { - "epoch": 0.8367445220322658, - "grad_norm": 8.1875, + "epoch": 4.957203994293866, + "grad_norm": 5.71875, "learning_rate": 8.022666666666667e-05, - "loss": 0.6647, + "loss": 0.8914, "step": 6950 }, { - "epoch": 0.8379484709848303, - "grad_norm": 7.4375, + "epoch": 4.964336661911555, + "grad_norm": 7.9375, "learning_rate": 8.018222222222223e-05, - "loss": 0.7037, + "loss": 0.8626, "step": 6960 }, { - "epoch": 0.8391524199373946, - "grad_norm": 7.25, + "epoch": 4.971469329529244, + "grad_norm": 6.9375, "learning_rate": 8.013777777777778e-05, - "loss": 0.5853, + "loss": 0.8388, "step": 6970 }, { - "epoch": 0.840356368889959, - "grad_norm": 8.75, + "epoch": 4.978601997146933, + "grad_norm": 6.5, "learning_rate": 8.009333333333334e-05, - "loss": 0.6264, + "loss": 0.8321, "step": 6980 }, { - "epoch": 0.8415603178425235, - "grad_norm": 8.4375, + "epoch": 4.985734664764622, + "grad_norm": 6.6875, "learning_rate": 8.004888888888889e-05, - "loss": 0.6221, + "loss": 0.8276, "step": 6990 }, { - "epoch": 0.8427642667950879, - "grad_norm": 8.3125, + "epoch": 4.9928673323823105, + "grad_norm": 10.5625, "learning_rate": 8.000444444444445e-05, - "loss": 0.6408, + "loss": 0.8847, "step": 7000 }, { - "epoch": 0.8427642667950879, - "eval/acc": 33.72093200683594, + "epoch": 4.9928673323823105, + "eval/acc": 39.53488540649414, "step": 7000 }, { - "epoch": 0.8427642667950879, - "eval_loss": 2.9269802570343018, - "eval_runtime": 0.2045, - "eval_samples_per_second": 210.301, - "eval_steps_per_second": 4.891, + "epoch": 4.9928673323823105, + "eval_loss": 2.7940218448638916, + "eval_runtime": 0.2239, + "eval_samples_per_second": 192.063, + "eval_steps_per_second": 4.467, "step": 7000 }, { - "epoch": 0.8439682157476524, - "grad_norm": 9.125, + "epoch": 5.0, + "grad_norm": 7.1875, "learning_rate": 7.996e-05, - "loss": 0.6321, + "loss": 0.9472, "step": 7010 }, { - "epoch": 0.8451721647002167, - "grad_norm": 7.125, + "epoch": 5.007132667617689, + "grad_norm": 7.25, "learning_rate": 7.991555555555555e-05, - "loss": 0.5927, + "loss": 0.9009, "step": 7020 }, { - "epoch": 0.8463761136527811, - "grad_norm": 7.65625, + "epoch": 5.014265335235378, + "grad_norm": 7.34375, "learning_rate": 7.987111111111112e-05, - "loss": 0.6574, + "loss": 0.8805, "step": 7030 }, { - "epoch": 0.8475800626053456, - "grad_norm": 7.0, + "epoch": 5.021398002853067, + "grad_norm": 5.78125, "learning_rate": 7.982666666666667e-05, - "loss": 0.7185, + "loss": 0.8475, "step": 7040 }, { - "epoch": 0.84878401155791, - "grad_norm": 7.3125, + "epoch": 5.028530670470756, + "grad_norm": 5.53125, "learning_rate": 7.978222222222222e-05, - "loss": 0.7157, + "loss": 0.7598, "step": 7050 }, { - "epoch": 0.8499879605104743, - "grad_norm": 5.6875, + "epoch": 5.035663338088445, + "grad_norm": 6.25, "learning_rate": 7.973777777777778e-05, - "loss": 0.606, + "loss": 0.8605, "step": 7060 }, { - "epoch": 0.8511919094630388, - "grad_norm": 6.28125, + "epoch": 5.042796005706134, + "grad_norm": 7.46875, "learning_rate": 7.969333333333335e-05, - "loss": 0.6493, + "loss": 0.9293, "step": 7070 }, { - "epoch": 0.8523958584156032, - "grad_norm": 7.8125, + "epoch": 5.049928673323823, + "grad_norm": 5.9375, "learning_rate": 7.96488888888889e-05, - "loss": 0.6123, + "loss": 0.7984, "step": 7080 }, { - "epoch": 0.8535998073681675, + "epoch": 5.057061340941512, "grad_norm": 8.375, "learning_rate": 7.960444444444444e-05, - "loss": 0.6035, + "loss": 0.8222, "step": 7090 }, { - "epoch": 0.854803756320732, - "grad_norm": 7.78125, + "epoch": 5.064194008559201, + "grad_norm": 6.9375, "learning_rate": 7.956e-05, - "loss": 0.5902, + "loss": 0.8535, "step": 7100 }, { - "epoch": 0.854803756320732, - "eval/acc": 37.20930099487305, + "epoch": 5.064194008559201, + "eval/acc": 41.86046600341797, "step": 7100 }, { - "epoch": 0.854803756320732, - "eval_loss": 2.926543712615967, - "eval_runtime": 0.2114, - "eval_samples_per_second": 203.37, - "eval_steps_per_second": 4.73, + "epoch": 5.064194008559201, + "eval_loss": 2.631981134414673, + "eval_runtime": 2.5832, + "eval_samples_per_second": 16.646, + "eval_steps_per_second": 0.387, "step": 7100 }, { - "epoch": 0.8560077052732964, - "grad_norm": 6.0625, + "epoch": 5.0713266761768905, + "grad_norm": 6.5625, "learning_rate": 7.951555555555555e-05, - "loss": 0.6464, + "loss": 0.8668, "step": 7110 }, { - "epoch": 0.8572116542258609, + "epoch": 5.078459343794579, "grad_norm": 9.0, "learning_rate": 7.947111111111111e-05, - "loss": 0.7656, + "loss": 0.8142, "step": 7120 }, { - "epoch": 0.8584156031784252, - "grad_norm": 7.21875, + "epoch": 5.085592011412269, + "grad_norm": 8.3125, "learning_rate": 7.942666666666668e-05, - "loss": 0.5546, + "loss": 0.9271, "step": 7130 }, { - "epoch": 0.8596195521309896, - "grad_norm": 8.5, + "epoch": 5.092724679029957, + "grad_norm": 7.875, "learning_rate": 7.938222222222222e-05, - "loss": 0.6538, + "loss": 0.8213, "step": 7140 }, { - "epoch": 0.8608235010835541, - "grad_norm": 8.0625, + "epoch": 5.099857346647646, + "grad_norm": 6.8125, "learning_rate": 7.933777777777777e-05, - "loss": 0.7057, + "loss": 0.8511, "step": 7150 }, { - "epoch": 0.8620274500361185, - "grad_norm": 7.34375, + "epoch": 5.106990014265335, + "grad_norm": 7.53125, "learning_rate": 7.929333333333334e-05, - "loss": 0.6287, + "loss": 0.8525, "step": 7160 }, { - "epoch": 0.8632313989886828, - "grad_norm": 6.53125, + "epoch": 5.114122681883024, + "grad_norm": 7.21875, "learning_rate": 7.92488888888889e-05, - "loss": 0.6231, + "loss": 0.8554, "step": 7170 }, { - "epoch": 0.8644353479412473, - "grad_norm": 18.5, + "epoch": 5.121255349500713, + "grad_norm": 6.84375, "learning_rate": 7.920444444444445e-05, - "loss": 0.664, + "loss": 0.8128, "step": 7180 }, { - "epoch": 0.8656392968938117, - "grad_norm": 8.875, + "epoch": 5.128388017118402, + "grad_norm": 7.84375, "learning_rate": 7.916e-05, - "loss": 0.6286, + "loss": 0.7726, "step": 7190 }, { - "epoch": 0.8668432458463761, - "grad_norm": 6.0625, + "epoch": 5.1355206847360915, + "grad_norm": 7.78125, "learning_rate": 7.911555555555556e-05, - "loss": 0.6808, + "loss": 0.8902, "step": 7200 }, { - "epoch": 0.8668432458463761, + "epoch": 5.1355206847360915, "eval/acc": 37.20930099487305, "step": 7200 }, { - "epoch": 0.8668432458463761, - "eval_loss": 2.9467363357543945, - "eval_runtime": 0.2052, - "eval_samples_per_second": 209.502, - "eval_steps_per_second": 4.872, + "epoch": 5.1355206847360915, + "eval_loss": 2.5633885860443115, + "eval_runtime": 0.2541, + "eval_samples_per_second": 169.248, + "eval_steps_per_second": 3.936, "step": 7200 }, { - "epoch": 0.8680471947989405, - "grad_norm": 7.9375, + "epoch": 5.14265335235378, + "grad_norm": 6.8125, "learning_rate": 7.907111111111112e-05, - "loss": 0.6626, + "loss": 0.7482, "step": 7210 }, { - "epoch": 0.8692511437515049, - "grad_norm": 7.15625, + "epoch": 5.14978601997147, + "grad_norm": 42.0, "learning_rate": 7.902666666666667e-05, - "loss": 0.7685, + "loss": 0.9007, "step": 7220 }, { - "epoch": 0.8704550927040694, - "grad_norm": 10.3125, + "epoch": 5.156918687589158, + "grad_norm": 6.0625, "learning_rate": 7.898222222222223e-05, - "loss": 0.6848, + "loss": 0.8643, "step": 7230 }, { - "epoch": 0.8716590416566338, - "grad_norm": 7.21875, + "epoch": 5.164051355206848, + "grad_norm": 7.03125, "learning_rate": 7.893777777777778e-05, - "loss": 0.6433, + "loss": 0.8899, "step": 7240 }, { - "epoch": 0.8728629906091981, - "grad_norm": 6.34375, + "epoch": 5.171184022824536, + "grad_norm": 7.53125, "learning_rate": 7.889333333333334e-05, - "loss": 0.6121, + "loss": 0.7462, "step": 7250 }, { - "epoch": 0.8740669395617626, - "grad_norm": 7.40625, + "epoch": 5.178316690442226, + "grad_norm": 7.21875, "learning_rate": 7.884888888888889e-05, - "loss": 0.6391, + "loss": 0.9199, "step": 7260 }, { - "epoch": 0.875270888514327, - "grad_norm": 7.96875, + "epoch": 5.185449358059914, + "grad_norm": 8.1875, "learning_rate": 7.880444444444445e-05, - "loss": 0.638, + "loss": 0.7966, "step": 7270 }, { - "epoch": 0.8764748374668914, - "grad_norm": 6.28125, + "epoch": 5.192582025677604, + "grad_norm": 8.0, "learning_rate": 7.876e-05, - "loss": 0.6214, + "loss": 0.9086, "step": 7280 }, { - "epoch": 0.8776787864194558, - "grad_norm": 9.125, + "epoch": 5.1997146932952925, + "grad_norm": 7.46875, "learning_rate": 7.871555555555556e-05, - "loss": 0.7473, + "loss": 0.9184, "step": 7290 }, { - "epoch": 0.8788827353720202, - "grad_norm": 7.5, + "epoch": 5.206847360912981, + "grad_norm": 7.28125, "learning_rate": 7.867111111111112e-05, - "loss": 0.68, + "loss": 0.742, "step": 7300 }, { - "epoch": 0.8788827353720202, - "eval/acc": 34.88372039794922, + "epoch": 5.206847360912981, + "eval/acc": 39.53488540649414, "step": 7300 }, { - "epoch": 0.8788827353720202, - "eval_loss": 2.999979257583618, - "eval_runtime": 0.2126, - "eval_samples_per_second": 202.25, - "eval_steps_per_second": 4.703, + "epoch": 5.206847360912981, + "eval_loss": 2.5178542137145996, + "eval_runtime": 0.2274, + "eval_samples_per_second": 189.112, + "eval_steps_per_second": 4.398, "step": 7300 }, { - "epoch": 0.8800866843245846, - "grad_norm": 7.03125, + "epoch": 5.2139800285306706, + "grad_norm": 10.4375, "learning_rate": 7.862666666666667e-05, - "loss": 0.4952, + "loss": 0.8737, "step": 7310 }, { - "epoch": 0.8812906332771491, - "grad_norm": 7.65625, + "epoch": 5.221112696148359, + "grad_norm": 6.8125, "learning_rate": 7.858222222222222e-05, - "loss": 0.7879, + "loss": 0.8197, "step": 7320 }, { - "epoch": 0.8824945822297134, - "grad_norm": 7.71875, + "epoch": 5.228245363766049, + "grad_norm": 8.125, "learning_rate": 7.853777777777778e-05, - "loss": 0.6093, + "loss": 0.9561, "step": 7330 }, { - "epoch": 0.8836985311822779, - "grad_norm": 8.125, + "epoch": 5.235378031383737, + "grad_norm": 9.5, "learning_rate": 7.849333333333334e-05, - "loss": 0.6522, + "loss": 0.9066, "step": 7340 }, { - "epoch": 0.8849024801348423, - "grad_norm": 8.9375, + "epoch": 5.242510699001427, + "grad_norm": 6.09375, "learning_rate": 7.844888888888889e-05, - "loss": 0.6861, + "loss": 0.839, "step": 7350 }, { - "epoch": 0.8861064290874067, - "grad_norm": 6.9375, + "epoch": 5.249643366619115, + "grad_norm": 8.0625, "learning_rate": 7.840444444444445e-05, - "loss": 0.6023, + "loss": 0.8996, "step": 7360 }, { - "epoch": 0.8873103780399711, - "grad_norm": 8.1875, + "epoch": 5.256776034236805, + "grad_norm": 6.3125, "learning_rate": 7.836e-05, - "loss": 0.5156, + "loss": 0.8253, "step": 7370 }, { - "epoch": 0.8885143269925355, - "grad_norm": 7.125, + "epoch": 5.263908701854493, + "grad_norm": 6.15625, "learning_rate": 7.831555555555556e-05, - "loss": 0.6841, + "loss": 0.7275, "step": 7380 }, { - "epoch": 0.8897182759450999, - "grad_norm": 8.0625, + "epoch": 5.271041369472183, + "grad_norm": 6.375, "learning_rate": 7.827111111111111e-05, - "loss": 0.5521, + "loss": 0.8548, "step": 7390 }, { - "epoch": 0.8909222248976644, - "grad_norm": 7.03125, + "epoch": 5.2781740370898715, + "grad_norm": 8.0625, "learning_rate": 7.822666666666667e-05, - "loss": 0.7556, + "loss": 0.8754, "step": 7400 }, { - "epoch": 0.8909222248976644, - "eval/acc": 32.55813980102539, + "epoch": 5.2781740370898715, + "eval/acc": 39.53488540649414, "step": 7400 }, { - "epoch": 0.8909222248976644, - "eval_loss": 2.882596015930176, - "eval_runtime": 0.2414, - "eval_samples_per_second": 178.131, - "eval_steps_per_second": 4.143, + "epoch": 5.2781740370898715, + "eval_loss": 2.599212408065796, + "eval_runtime": 0.2355, + "eval_samples_per_second": 182.56, + "eval_steps_per_second": 4.246, "step": 7400 }, { - "epoch": 0.8921261738502287, - "grad_norm": 6.8125, + "epoch": 5.285306704707561, + "grad_norm": 8.875, "learning_rate": 7.818222222222222e-05, - "loss": 0.6252, + "loss": 0.8725, "step": 7410 }, { - "epoch": 0.8933301228027931, - "grad_norm": 6.5, + "epoch": 5.29243937232525, + "grad_norm": 8.0625, "learning_rate": 7.813777777777777e-05, - "loss": 0.6405, + "loss": 0.8689, "step": 7420 }, { - "epoch": 0.8945340717553576, - "grad_norm": 7.25, + "epoch": 5.299572039942939, + "grad_norm": 7.59375, "learning_rate": 7.809333333333335e-05, - "loss": 0.5753, + "loss": 0.7615, "step": 7430 }, { - "epoch": 0.895738020707922, - "grad_norm": 8.4375, + "epoch": 5.306704707560628, + "grad_norm": 6.3125, "learning_rate": 7.80488888888889e-05, - "loss": 0.5782, + "loss": 0.8141, "step": 7440 }, { - "epoch": 0.8969419696604864, - "grad_norm": 7.875, + "epoch": 5.313837375178316, + "grad_norm": 6.84375, "learning_rate": 7.800444444444444e-05, - "loss": 0.6364, + "loss": 0.8328, "step": 7450 }, { - "epoch": 0.8981459186130508, - "grad_norm": 6.15625, + "epoch": 5.320970042796006, + "grad_norm": 7.71875, "learning_rate": 7.796e-05, - "loss": 0.6243, + "loss": 0.8158, "step": 7460 }, { - "epoch": 0.8993498675656152, - "grad_norm": 7.5, + "epoch": 5.328102710413694, + "grad_norm": 7.0625, "learning_rate": 7.791555555555557e-05, - "loss": 0.6401, + "loss": 0.7663, "step": 7470 }, { - "epoch": 0.9005538165181797, - "grad_norm": 6.03125, + "epoch": 5.335235378031384, + "grad_norm": 8.1875, "learning_rate": 7.787111111111112e-05, - "loss": 0.5183, + "loss": 0.7704, "step": 7480 }, { - "epoch": 0.901757765470744, - "grad_norm": 6.5, + "epoch": 5.3423680456490725, + "grad_norm": 8.0, "learning_rate": 7.782666666666666e-05, - "loss": 0.6057, + "loss": 0.8511, "step": 7490 }, { - "epoch": 0.9029617144233084, - "grad_norm": 9.0, + "epoch": 5.349500713266762, + "grad_norm": 5.15625, "learning_rate": 7.778222222222223e-05, - "loss": 0.6341, + "loss": 0.783, "step": 7500 }, { - "epoch": 0.9029617144233084, - "eval/acc": 34.30232620239258, + "epoch": 5.349500713266762, + "eval/acc": 39.53488540649414, "step": 7500 }, { - "epoch": 0.9029617144233084, - "eval_loss": 2.997713804244995, - "eval_runtime": 1.0811, - "eval_samples_per_second": 39.775, - "eval_steps_per_second": 0.925, + "epoch": 5.349500713266762, + "eval_loss": 2.6000046730041504, + "eval_runtime": 0.2277, + "eval_samples_per_second": 188.876, + "eval_steps_per_second": 4.392, "step": 7500 }, { - "epoch": 0.9041656633758729, - "grad_norm": 7.03125, + "epoch": 5.356633380884451, + "grad_norm": 7.6875, "learning_rate": 7.773777777777778e-05, - "loss": 0.6595, + "loss": 0.7674, "step": 7510 }, { - "epoch": 0.9053696123284373, - "grad_norm": 7.84375, + "epoch": 5.36376604850214, + "grad_norm": 6.53125, "learning_rate": 7.769333333333334e-05, - "loss": 0.7769, + "loss": 0.8338, "step": 7520 }, { - "epoch": 0.9065735612810016, - "grad_norm": 6.78125, + "epoch": 5.370898716119829, + "grad_norm": 5.8125, "learning_rate": 7.76488888888889e-05, - "loss": 0.6876, + "loss": 0.8279, "step": 7530 }, { - "epoch": 0.9077775102335661, - "grad_norm": 9.375, + "epoch": 5.378031383737518, + "grad_norm": 7.0625, "learning_rate": 7.760444444444445e-05, - "loss": 0.6271, + "loss": 0.7954, "step": 7540 }, { - "epoch": 0.9089814591861305, - "grad_norm": 6.96875, + "epoch": 5.385164051355207, + "grad_norm": 8.0, "learning_rate": 7.756e-05, - "loss": 0.6117, + "loss": 0.8632, "step": 7550 }, { - "epoch": 0.910185408138695, - "grad_norm": 6.28125, + "epoch": 5.392296718972895, + "grad_norm": 6.84375, "learning_rate": 7.751555555555556e-05, - "loss": 0.6461, + "loss": 0.8191, "step": 7560 }, { - "epoch": 0.9113893570912593, - "grad_norm": 7.96875, + "epoch": 5.399429386590585, + "grad_norm": 7.375, "learning_rate": 7.747111111111112e-05, - "loss": 0.6543, + "loss": 0.708, "step": 7570 }, { - "epoch": 0.9125933060438237, - "grad_norm": 10.0, + "epoch": 5.4065620542082735, + "grad_norm": 7.15625, "learning_rate": 7.742666666666667e-05, - "loss": 0.686, + "loss": 0.6851, "step": 7580 }, { - "epoch": 0.9137972549963882, - "grad_norm": 7.90625, + "epoch": 5.413694721825963, + "grad_norm": 7.25, "learning_rate": 7.738222222222222e-05, - "loss": 0.6634, + "loss": 0.8769, "step": 7590 }, { - "epoch": 0.9150012039489526, - "grad_norm": 11.5625, + "epoch": 5.420827389443652, + "grad_norm": 7.6875, "learning_rate": 7.733777777777779e-05, - "loss": 0.6627, + "loss": 0.8316, "step": 7600 }, { - "epoch": 0.9150012039489526, - "eval/acc": 37.20930099487305, + "epoch": 5.420827389443652, + "eval/acc": 39.53488540649414, "step": 7600 }, { - "epoch": 0.9150012039489526, - "eval_loss": 2.908363103866577, - "eval_runtime": 2.6366, - "eval_samples_per_second": 16.309, - "eval_steps_per_second": 0.379, + "epoch": 5.420827389443652, + "eval_loss": 2.583944797515869, + "eval_runtime": 0.2212, + "eval_samples_per_second": 194.433, + "eval_steps_per_second": 4.522, "step": 7600 }, { - "epoch": 0.9162051529015169, - "grad_norm": 5.65625, + "epoch": 5.427960057061341, + "grad_norm": 7.625, "learning_rate": 7.729333333333334e-05, - "loss": 0.5503, + "loss": 0.8444, "step": 7610 }, { - "epoch": 0.9174091018540814, - "grad_norm": 7.15625, + "epoch": 5.43509272467903, + "grad_norm": 6.6875, "learning_rate": 7.724888888888889e-05, - "loss": 0.5263, + "loss": 0.8101, "step": 7620 }, { - "epoch": 0.9186130508066458, - "grad_norm": 5.96875, + "epoch": 5.442225392296719, + "grad_norm": 6.375, "learning_rate": 7.720444444444445e-05, - "loss": 0.6969, + "loss": 0.8094, "step": 7630 }, { - "epoch": 0.9198169997592102, - "grad_norm": 8.0625, + "epoch": 5.449358059914408, + "grad_norm": 7.09375, "learning_rate": 7.716e-05, - "loss": 0.6371, + "loss": 0.9292, "step": 7640 }, { - "epoch": 0.9210209487117746, - "grad_norm": 7.5625, + "epoch": 5.456490727532097, + "grad_norm": 8.0, "learning_rate": 7.711555555555556e-05, - "loss": 0.6406, + "loss": 0.8544, "step": 7650 }, { - "epoch": 0.922224897664339, - "grad_norm": 10.6875, + "epoch": 5.463623395149786, + "grad_norm": 5.625, "learning_rate": 7.707111111111111e-05, - "loss": 0.7058, + "loss": 0.787, "step": 7660 }, { - "epoch": 0.9234288466169035, - "grad_norm": 12.5625, + "epoch": 5.470756062767475, + "grad_norm": 8.375, "learning_rate": 7.702666666666667e-05, - "loss": 0.7067, + "loss": 0.8763, "step": 7670 }, { - "epoch": 0.9246327955694679, - "grad_norm": 7.21875, + "epoch": 5.477888730385164, + "grad_norm": 12.9375, "learning_rate": 7.698222222222222e-05, - "loss": 0.5543, + "loss": 0.8317, "step": 7680 }, { - "epoch": 0.9258367445220322, - "grad_norm": 10.125, + "epoch": 5.4850213980028535, + "grad_norm": 8.125, "learning_rate": 7.693777777777778e-05, - "loss": 0.6719, + "loss": 0.8156, "step": 7690 }, { - "epoch": 0.9270406934745967, - "grad_norm": 7.03125, + "epoch": 5.492154065620542, + "grad_norm": 6.96875, "learning_rate": 7.689333333333334e-05, - "loss": 0.5764, + "loss": 0.8998, "step": 7700 }, { - "epoch": 0.9270406934745967, - "eval/acc": 35.46511459350586, + "epoch": 5.492154065620542, + "eval/acc": 39.53488540649414, "step": 7700 }, { - "epoch": 0.9270406934745967, - "eval_loss": 2.8986358642578125, - "eval_runtime": 4.4935, - "eval_samples_per_second": 9.569, - "eval_steps_per_second": 0.223, + "epoch": 5.492154065620542, + "eval_loss": 2.6069791316986084, + "eval_runtime": 0.2244, + "eval_samples_per_second": 191.632, + "eval_steps_per_second": 4.457, "step": 7700 }, { - "epoch": 0.9282446424271611, - "grad_norm": 10.25, + "epoch": 5.499286733238231, + "grad_norm": 7.5625, "learning_rate": 7.68488888888889e-05, - "loss": 0.6302, + "loss": 0.7881, "step": 7710 }, { - "epoch": 0.9294485913797255, - "grad_norm": 9.8125, + "epoch": 5.50641940085592, + "grad_norm": 6.65625, "learning_rate": 7.680444444444444e-05, - "loss": 0.6236, + "loss": 0.8379, "step": 7720 }, { - "epoch": 0.93065254033229, - "grad_norm": 6.5, + "epoch": 5.513552068473609, + "grad_norm": 6.34375, "learning_rate": 7.676e-05, - "loss": 0.7159, + "loss": 0.844, "step": 7730 }, { - "epoch": 0.9318564892848543, - "grad_norm": 7.1875, + "epoch": 5.520684736091298, + "grad_norm": 8.3125, "learning_rate": 7.671555555555557e-05, - "loss": 0.6257, + "loss": 0.8762, "step": 7740 }, { - "epoch": 0.9330604382374187, - "grad_norm": 7.3125, + "epoch": 5.527817403708987, + "grad_norm": 7.09375, "learning_rate": 7.667111111111111e-05, - "loss": 0.5247, + "loss": 0.8621, "step": 7750 }, { - "epoch": 0.9342643871899832, - "grad_norm": 5.0, + "epoch": 5.534950071326676, + "grad_norm": 8.5625, "learning_rate": 7.662666666666666e-05, - "loss": 0.5185, + "loss": 1.0092, "step": 7760 }, { - "epoch": 0.9354683361425475, - "grad_norm": 13.375, + "epoch": 5.542082738944365, + "grad_norm": 6.3125, "learning_rate": 7.658222222222222e-05, - "loss": 0.8069, + "loss": 0.8743, "step": 7770 }, { - "epoch": 0.936672285095112, - "grad_norm": 10.3125, + "epoch": 5.5492154065620545, + "grad_norm": 6.0625, "learning_rate": 7.653777777777779e-05, - "loss": 0.6619, + "loss": 0.754, "step": 7780 }, { - "epoch": 0.9378762340476764, - "grad_norm": 7.1875, + "epoch": 5.556348074179743, + "grad_norm": 7.6875, "learning_rate": 7.649333333333334e-05, - "loss": 0.785, + "loss": 0.8504, "step": 7790 }, { - "epoch": 0.9390801830002408, - "grad_norm": 6.0625, + "epoch": 5.563480741797433, + "grad_norm": 8.3125, "learning_rate": 7.64488888888889e-05, - "loss": 0.6064, + "loss": 0.7512, "step": 7800 }, { - "epoch": 0.9390801830002408, - "eval/acc": 40.69767379760742, + "epoch": 5.563480741797433, + "eval/acc": 37.20930099487305, "step": 7800 }, { - "epoch": 0.9390801830002408, - "eval_loss": 2.9625086784362793, - "eval_runtime": 1.0058, - "eval_samples_per_second": 42.753, - "eval_steps_per_second": 0.994, + "epoch": 5.563480741797433, + "eval_loss": 2.610304594039917, + "eval_runtime": 0.2338, + "eval_samples_per_second": 183.899, + "eval_steps_per_second": 4.277, "step": 7800 }, { - "epoch": 0.9402841319528052, - "grad_norm": 7.0, + "epoch": 5.570613409415121, + "grad_norm": 6.71875, "learning_rate": 7.640444444444445e-05, - "loss": 0.5744, + "loss": 0.8204, "step": 7810 }, { - "epoch": 0.9414880809053696, - "grad_norm": 7.78125, + "epoch": 5.57774607703281, + "grad_norm": 6.625, "learning_rate": 7.636e-05, - "loss": 0.6294, + "loss": 0.734, "step": 7820 }, { - "epoch": 0.942692029857934, - "grad_norm": 6.46875, + "epoch": 5.584878744650499, + "grad_norm": 5.65625, "learning_rate": 7.631555555555556e-05, - "loss": 0.7608, + "loss": 0.8047, "step": 7830 }, { - "epoch": 0.9438959788104985, - "grad_norm": 6.71875, + "epoch": 5.592011412268189, + "grad_norm": 6.40625, "learning_rate": 7.627111111111112e-05, - "loss": 0.6084, + "loss": 0.7179, "step": 7840 }, { - "epoch": 0.9450999277630628, - "grad_norm": 7.15625, + "epoch": 5.599144079885877, + "grad_norm": 6.78125, "learning_rate": 7.622666666666667e-05, - "loss": 0.5791, + "loss": 0.849, "step": 7850 }, { - "epoch": 0.9463038767156272, - "grad_norm": 10.1875, + "epoch": 5.606276747503566, + "grad_norm": 8.8125, "learning_rate": 7.618222222222221e-05, - "loss": 0.683, + "loss": 0.8817, "step": 7860 }, { - "epoch": 0.9475078256681917, - "grad_norm": 7.59375, + "epoch": 5.6134094151212555, + "grad_norm": 6.375, "learning_rate": 7.613777777777779e-05, - "loss": 0.6413, + "loss": 0.8812, "step": 7870 }, { - "epoch": 0.9487117746207561, - "grad_norm": 5.71875, + "epoch": 5.620542082738944, + "grad_norm": 13.125, "learning_rate": 7.609333333333334e-05, - "loss": 0.5985, + "loss": 0.8522, "step": 7880 }, { - "epoch": 0.9499157235733204, - "grad_norm": 8.625, + "epoch": 5.627674750356634, + "grad_norm": 7.0625, "learning_rate": 7.604888888888889e-05, - "loss": 0.572, + "loss": 0.731, "step": 7890 }, { - "epoch": 0.9511196725258849, - "grad_norm": 15.75, + "epoch": 5.634807417974322, + "grad_norm": 7.21875, "learning_rate": 7.600444444444445e-05, - "loss": 0.674, + "loss": 0.8841, "step": 7900 }, { - "epoch": 0.9511196725258849, - "eval/acc": 34.88372039794922, + "epoch": 5.634807417974322, + "eval/acc": 39.53488540649414, "step": 7900 }, { - "epoch": 0.9511196725258849, - "eval_loss": 2.9794013500213623, - "eval_runtime": 0.209, - "eval_samples_per_second": 205.705, - "eval_steps_per_second": 4.784, + "epoch": 5.634807417974322, + "eval_loss": 2.6105217933654785, + "eval_runtime": 0.2306, + "eval_samples_per_second": 186.447, + "eval_steps_per_second": 4.336, "step": 7900 }, { - "epoch": 0.9523236214784493, - "grad_norm": 8.25, + "epoch": 5.641940085592012, + "grad_norm": 7.625, "learning_rate": 7.596000000000001e-05, - "loss": 0.7455, + "loss": 0.8654, "step": 7910 }, { - "epoch": 0.9535275704310138, - "grad_norm": 7.40625, + "epoch": 5.6490727532097, + "grad_norm": 26.75, "learning_rate": 7.591555555555556e-05, - "loss": 0.5746, + "loss": 0.8103, "step": 7920 }, { - "epoch": 0.9547315193835781, - "grad_norm": 7.78125, + "epoch": 5.65620542082739, + "grad_norm": 7.375, "learning_rate": 7.587111111111112e-05, - "loss": 0.6232, + "loss": 0.7461, "step": 7930 }, { - "epoch": 0.9559354683361425, - "grad_norm": 10.9375, + "epoch": 5.663338088445078, + "grad_norm": 6.09375, "learning_rate": 7.582666666666667e-05, - "loss": 0.7393, + "loss": 0.9693, "step": 7940 }, { - "epoch": 0.957139417288707, - "grad_norm": 8.6875, + "epoch": 5.670470756062768, + "grad_norm": 7.09375, "learning_rate": 7.578222222222222e-05, - "loss": 0.6138, + "loss": 0.8595, "step": 7950 }, { - "epoch": 0.9583433662412714, - "grad_norm": 7.625, + "epoch": 5.6776034236804565, + "grad_norm": 7.3125, "learning_rate": 7.573777777777778e-05, - "loss": 0.637, + "loss": 0.8541, "step": 7960 }, { - "epoch": 0.9595473151938357, - "grad_norm": 6.90625, + "epoch": 5.684736091298145, + "grad_norm": 7.90625, "learning_rate": 7.569333333333334e-05, - "loss": 0.606, + "loss": 0.8774, "step": 7970 }, { - "epoch": 0.9607512641464002, - "grad_norm": 8.8125, + "epoch": 5.6918687589158345, + "grad_norm": 9.0, "learning_rate": 7.564888888888889e-05, - "loss": 0.7135, + "loss": 0.8823, "step": 7980 }, { - "epoch": 0.9619552130989646, - "grad_norm": 6.84375, + "epoch": 5.699001426533523, + "grad_norm": 6.09375, "learning_rate": 7.560444444444444e-05, - "loss": 0.6138, + "loss": 0.7302, "step": 7990 }, { - "epoch": 0.963159162051529, - "grad_norm": 8.25, + "epoch": 5.706134094151213, + "grad_norm": 7.21875, "learning_rate": 7.556000000000002e-05, - "loss": 0.7128, + "loss": 0.8339, "step": 8000 }, { - "epoch": 0.963159162051529, - "eval/acc": 34.88372039794922, + "epoch": 5.706134094151213, + "eval/acc": 37.20930099487305, "step": 8000 }, { - "epoch": 0.963159162051529, - "eval_loss": 2.9879119396209717, - "eval_runtime": 0.2124, - "eval_samples_per_second": 202.451, - "eval_steps_per_second": 4.708, + "epoch": 5.706134094151213, + "eval_loss": 2.576781988143921, + "eval_runtime": 0.2231, + "eval_samples_per_second": 192.779, + "eval_steps_per_second": 4.483, "step": 8000 }, { - "epoch": 0.9643631110040934, - "grad_norm": 9.875, + "epoch": 5.713266761768901, + "grad_norm": 7.75, "learning_rate": 7.551555555555556e-05, - "loss": 0.5835, + "loss": 0.7642, "step": 8010 }, { - "epoch": 0.9655670599566578, - "grad_norm": 8.8125, + "epoch": 5.720399429386591, + "grad_norm": 7.8125, "learning_rate": 7.547111111111111e-05, - "loss": 0.6138, + "loss": 0.9188, "step": 8020 }, { - "epoch": 0.9667710089092223, - "grad_norm": 8.3125, + "epoch": 5.727532097004279, + "grad_norm": 7.28125, "learning_rate": 7.542666666666667e-05, - "loss": 0.6638, + "loss": 0.8202, "step": 8030 }, { - "epoch": 0.9679749578617867, - "grad_norm": 7.0, + "epoch": 5.734664764621969, + "grad_norm": 9.0, "learning_rate": 7.538222222222222e-05, - "loss": 0.6484, + "loss": 0.8286, "step": 8040 }, { - "epoch": 0.969178906814351, - "grad_norm": 8.25, + "epoch": 5.741797432239657, + "grad_norm": 7.25, "learning_rate": 7.533777777777778e-05, - "loss": 0.6291, + "loss": 0.7856, "step": 8050 }, { - "epoch": 0.9703828557669155, - "grad_norm": 9.75, + "epoch": 5.748930099857347, + "grad_norm": 6.90625, "learning_rate": 7.529333333333333e-05, - "loss": 0.71, + "loss": 0.8832, "step": 8060 }, { - "epoch": 0.9715868047194799, - "grad_norm": 6.375, + "epoch": 5.7560627674750355, + "grad_norm": 6.09375, "learning_rate": 7.52488888888889e-05, - "loss": 0.5791, + "loss": 0.7606, "step": 8070 }, { - "epoch": 0.9727907536720443, - "grad_norm": 7.40625, + "epoch": 5.763195435092725, + "grad_norm": 6.625, "learning_rate": 7.520444444444444e-05, - "loss": 0.6359, + "loss": 0.8706, "step": 8080 }, { - "epoch": 0.9739947026246087, - "grad_norm": 8.125, + "epoch": 5.770328102710414, + "grad_norm": 7.25, "learning_rate": 7.516e-05, - "loss": 0.5274, + "loss": 0.8542, "step": 8090 }, { - "epoch": 0.9751986515771731, - "grad_norm": 8.6875, + "epoch": 5.777460770328103, + "grad_norm": 6.84375, "learning_rate": 7.511555555555557e-05, - "loss": 0.5887, + "loss": 0.7988, "step": 8100 }, { - "epoch": 0.9751986515771731, + "epoch": 5.777460770328103, "eval/acc": 37.20930099487305, "step": 8100 }, { - "epoch": 0.9751986515771731, - "eval_loss": 3.0165836811065674, - "eval_runtime": 0.2158, - "eval_samples_per_second": 199.215, - "eval_steps_per_second": 4.633, + "epoch": 5.777460770328103, + "eval_loss": 2.598762273788452, + "eval_runtime": 0.2251, + "eval_samples_per_second": 191.04, + "eval_steps_per_second": 4.443, "step": 8100 }, { - "epoch": 0.9764026005297375, - "grad_norm": 9.125, + "epoch": 5.784593437945792, + "grad_norm": 8.875, "learning_rate": 7.507111111111112e-05, - "loss": 0.6646, + "loss": 0.8825, "step": 8110 }, { - "epoch": 0.977606549482302, - "grad_norm": 6.4375, + "epoch": 5.79172610556348, + "grad_norm": 7.375, "learning_rate": 7.502666666666666e-05, - "loss": 0.6757, + "loss": 0.8316, "step": 8120 }, { - "epoch": 0.9788104984348663, - "grad_norm": 6.90625, + "epoch": 5.79885877318117, + "grad_norm": 8.125, "learning_rate": 7.498222222222223e-05, - "loss": 0.6722, + "loss": 0.8567, "step": 8130 }, { - "epoch": 0.9800144473874308, - "grad_norm": 5.34375, + "epoch": 5.805991440798858, + "grad_norm": 6.3125, "learning_rate": 7.493777777777779e-05, - "loss": 0.5574, + "loss": 0.8415, "step": 8140 }, { - "epoch": 0.9812183963399952, - "grad_norm": 12.25, + "epoch": 5.813124108416548, + "grad_norm": 8.5, "learning_rate": 7.489333333333334e-05, - "loss": 0.5701, + "loss": 0.8369, "step": 8150 }, { - "epoch": 0.9824223452925596, - "grad_norm": 5.09375, + "epoch": 5.8202567760342365, + "grad_norm": 13.25, "learning_rate": 7.484888888888889e-05, - "loss": 0.7311, + "loss": 0.8692, "step": 8160 }, { - "epoch": 0.983626294245124, - "grad_norm": 9.6875, + "epoch": 5.827389443651926, + "grad_norm": 7.71875, "learning_rate": 7.480444444444445e-05, - "loss": 0.6314, + "loss": 0.8535, "step": 8170 }, { - "epoch": 0.9848302431976884, - "grad_norm": 7.46875, + "epoch": 5.834522111269615, + "grad_norm": 7.6875, "learning_rate": 7.476000000000001e-05, - "loss": 0.6023, + "loss": 0.8701, "step": 8180 }, { - "epoch": 0.9860341921502528, - "grad_norm": 4.53125, + "epoch": 5.841654778887304, + "grad_norm": 5.46875, "learning_rate": 7.471555555555556e-05, - "loss": 0.5998, + "loss": 0.7843, "step": 8190 }, { - "epoch": 0.9872381411028173, - "grad_norm": 7.3125, + "epoch": 5.848787446504993, + "grad_norm": 7.46875, "learning_rate": 7.467111111111112e-05, - "loss": 0.6607, + "loss": 0.7914, "step": 8200 }, { - "epoch": 0.9872381411028173, - "eval/acc": 34.88372039794922, + "epoch": 5.848787446504993, + "eval/acc": 37.20930099487305, "step": 8200 }, { - "epoch": 0.9872381411028173, - "eval_loss": 2.9665606021881104, - "eval_runtime": 0.2143, - "eval_samples_per_second": 200.607, - "eval_steps_per_second": 4.665, + "epoch": 5.848787446504993, + "eval_loss": 2.566337823867798, + "eval_runtime": 0.3566, + "eval_samples_per_second": 120.59, + "eval_steps_per_second": 2.804, "step": 8200 }, { - "epoch": 0.9884420900553816, - "grad_norm": 7.96875, + "epoch": 5.855920114122682, + "grad_norm": 7.03125, "learning_rate": 7.462666666666667e-05, - "loss": 0.7103, + "loss": 0.849, "step": 8210 }, { - "epoch": 0.989646039007946, - "grad_norm": 10.3125, + "epoch": 5.863052781740371, + "grad_norm": 7.5625, "learning_rate": 7.458222222222223e-05, - "loss": 0.5721, + "loss": 0.8066, "step": 8220 }, { - "epoch": 0.9908499879605105, - "grad_norm": 8.5, + "epoch": 5.870185449358059, + "grad_norm": 6.875, "learning_rate": 7.453777777777778e-05, - "loss": 0.7032, + "loss": 0.8556, "step": 8230 }, { - "epoch": 0.9920539369130749, - "grad_norm": 6.21875, + "epoch": 5.877318116975749, + "grad_norm": 8.0, "learning_rate": 7.449333333333334e-05, - "loss": 0.6547, + "loss": 0.9098, "step": 8240 }, { - "epoch": 0.9932578858656393, - "grad_norm": 7.84375, + "epoch": 5.884450784593438, + "grad_norm": 8.375, "learning_rate": 7.444888888888889e-05, - "loss": 0.6587, + "loss": 0.8183, "step": 8250 }, { - "epoch": 0.9944618348182037, - "grad_norm": 6.53125, + "epoch": 5.891583452211127, + "grad_norm": 13.9375, "learning_rate": 7.440444444444444e-05, - "loss": 0.5486, + "loss": 0.8316, "step": 8260 }, { - "epoch": 0.9956657837707681, - "grad_norm": 8.1875, + "epoch": 5.898716119828816, + "grad_norm": 7.25, "learning_rate": 7.436000000000001e-05, - "loss": 0.6284, + "loss": 0.8563, "step": 8270 }, { - "epoch": 0.9968697327233326, - "grad_norm": 7.59375, + "epoch": 5.905848787446505, + "grad_norm": 10.75, "learning_rate": 7.431555555555556e-05, - "loss": 0.7033, + "loss": 0.8473, "step": 8280 }, { - "epoch": 0.9980736816758969, - "grad_norm": 9.0625, + "epoch": 5.912981455064194, + "grad_norm": 14.1875, "learning_rate": 7.427111111111111e-05, - "loss": 0.6621, + "loss": 0.774, "step": 8290 }, { - "epoch": 0.9992776306284613, - "grad_norm": 8.1875, + "epoch": 5.920114122681883, + "grad_norm": 6.8125, "learning_rate": 7.422666666666667e-05, - "loss": 0.6675, + "loss": 0.8783, "step": 8300 }, { - "epoch": 0.9992776306284613, - "eval/acc": 34.30232620239258, + "epoch": 5.920114122681883, + "eval/acc": 34.88372039794922, "step": 8300 }, { - "epoch": 0.9992776306284613, - "eval_loss": 2.9075520038604736, - "eval_runtime": 0.7142, - "eval_samples_per_second": 60.205, - "eval_steps_per_second": 1.4, + "epoch": 5.920114122681883, + "eval_loss": 2.6135735511779785, + "eval_runtime": 0.2367, + "eval_samples_per_second": 181.665, + "eval_steps_per_second": 4.225, "step": 8300 }, { - "epoch": 1.0004815795810258, - "grad_norm": 6.34375, + "epoch": 5.927246790299572, + "grad_norm": 7.5625, "learning_rate": 7.418222222222223e-05, - "loss": 0.6534, + "loss": 0.9057, "step": 8310 }, { - "epoch": 1.0016855285335902, - "grad_norm": 7.5, + "epoch": 5.934379457917261, + "grad_norm": 7.875, "learning_rate": 7.413777777777778e-05, - "loss": 0.5778, + "loss": 0.8854, "step": 8320 }, { - "epoch": 1.0028894774861545, - "grad_norm": 8.25, + "epoch": 5.94151212553495, + "grad_norm": 8.1875, "learning_rate": 7.409333333333333e-05, - "loss": 0.6143, + "loss": 0.8049, "step": 8330 }, { - "epoch": 1.004093426438719, - "grad_norm": 6.40625, + "epoch": 5.948644793152639, + "grad_norm": 6.90625, "learning_rate": 7.404888888888889e-05, - "loss": 0.5399, + "loss": 0.7738, "step": 8340 }, { - "epoch": 1.0052973753912835, - "grad_norm": 8.6875, + "epoch": 5.955777460770328, + "grad_norm": 7.90625, "learning_rate": 7.400444444444444e-05, - "loss": 0.6422, + "loss": 0.8268, "step": 8350 }, { - "epoch": 1.0065013243438479, - "grad_norm": 6.5625, + "epoch": 5.9629101283880175, + "grad_norm": 8.3125, "learning_rate": 7.396e-05, - "loss": 0.5578, + "loss": 0.8336, "step": 8360 }, { - "epoch": 1.0077052732964122, - "grad_norm": 6.15625, + "epoch": 5.970042796005706, + "grad_norm": 7.375, "learning_rate": 7.391555555555557e-05, - "loss": 0.6529, + "loss": 0.8282, "step": 8370 }, { - "epoch": 1.0089092222489766, - "grad_norm": 8.875, + "epoch": 5.977175463623395, + "grad_norm": 6.8125, "learning_rate": 7.387111111111111e-05, - "loss": 0.7195, + "loss": 0.8234, "step": 8380 }, { - "epoch": 1.010113171201541, - "grad_norm": 14.0, + "epoch": 5.984308131241084, + "grad_norm": 7.15625, "learning_rate": 7.382666666666666e-05, - "loss": 0.6301, + "loss": 0.8771, "step": 8390 }, { - "epoch": 1.0113171201541056, - "grad_norm": 7.46875, + "epoch": 5.991440798858774, + "grad_norm": 8.5, "learning_rate": 7.378222222222222e-05, - "loss": 0.6439, + "loss": 0.8572, "step": 8400 }, { - "epoch": 1.0113171201541056, - "eval/acc": 44.1860466003418, + "epoch": 5.991440798858774, + "eval/acc": 34.88372039794922, "step": 8400 }, { - "epoch": 1.0113171201541056, - "eval_loss": 2.8671419620513916, - "eval_runtime": 7.1371, - "eval_samples_per_second": 6.025, - "eval_steps_per_second": 0.14, + "epoch": 5.991440798858774, + "eval_loss": 2.5367989540100098, + "eval_runtime": 0.224, + "eval_samples_per_second": 191.97, + "eval_steps_per_second": 4.464, "step": 8400 }, { - "epoch": 1.01252106910667, - "grad_norm": 11.5625, + "epoch": 5.998573466476462, + "grad_norm": 7.0, "learning_rate": 7.373777777777779e-05, - "loss": 0.618, + "loss": 0.7468, "step": 8410 }, { - "epoch": 1.0137250180592343, - "grad_norm": 6.1875, + "epoch": 6.005706134094151, + "grad_norm": 7.78125, "learning_rate": 7.369333333333333e-05, - "loss": 0.666, + "loss": 0.7882, "step": 8420 }, { - "epoch": 1.0149289670117987, - "grad_norm": 7.34375, + "epoch": 6.01283880171184, + "grad_norm": 9.1875, "learning_rate": 7.364888888888888e-05, - "loss": 0.6237, + "loss": 0.9419, "step": 8430 }, { - "epoch": 1.016132915964363, - "grad_norm": 7.21875, + "epoch": 6.019971469329529, + "grad_norm": 17.625, "learning_rate": 7.360444444444445e-05, - "loss": 0.5974, + "loss": 0.7904, "step": 8440 }, { - "epoch": 1.0173368649169274, - "grad_norm": 8.625, + "epoch": 6.0271041369472185, + "grad_norm": 8.0625, "learning_rate": 7.356000000000001e-05, - "loss": 0.5766, + "loss": 0.8125, "step": 8450 }, { - "epoch": 1.018540813869492, - "grad_norm": 7.71875, + "epoch": 6.034236804564907, + "grad_norm": 7.4375, "learning_rate": 7.351555555555556e-05, - "loss": 0.6754, + "loss": 0.8002, "step": 8460 }, { - "epoch": 1.0197447628220564, - "grad_norm": 6.8125, + "epoch": 6.041369472182597, + "grad_norm": 5.6875, "learning_rate": 7.347111111111112e-05, - "loss": 0.6515, + "loss": 0.7719, "step": 8470 }, { - "epoch": 1.0209487117746208, - "grad_norm": 7.40625, + "epoch": 6.048502139800285, + "grad_norm": 8.9375, "learning_rate": 7.342666666666667e-05, - "loss": 0.6191, + "loss": 0.8122, "step": 8480 }, { - "epoch": 1.0221526607271851, - "grad_norm": 7.34375, + "epoch": 6.055634807417975, + "grad_norm": 9.875, "learning_rate": 7.338222222222223e-05, - "loss": 0.5703, + "loss": 0.8052, "step": 8490 }, { - "epoch": 1.0233566096797495, - "grad_norm": 8.125, + "epoch": 6.062767475035663, + "grad_norm": 9.125, "learning_rate": 7.333777777777778e-05, - "loss": 0.585, + "loss": 0.8171, "step": 8500 }, { - "epoch": 1.0233566096797495, - "eval/acc": 44.1860466003418, + "epoch": 6.062767475035663, + "eval/acc": 46.511627197265625, "step": 8500 }, { - "epoch": 1.0233566096797495, - "eval_loss": 2.8172407150268555, - "eval_runtime": 0.2115, - "eval_samples_per_second": 203.34, - "eval_steps_per_second": 4.729, + "epoch": 6.062767475035663, + "eval_loss": 2.4180805683135986, + "eval_runtime": 1.182, + "eval_samples_per_second": 36.38, + "eval_steps_per_second": 0.846, "step": 8500 }, { - "epoch": 1.0245605586323139, - "grad_norm": 8.375, + "epoch": 6.069900142653353, + "grad_norm": 6.84375, "learning_rate": 7.329333333333334e-05, - "loss": 0.5365, + "loss": 0.9028, "step": 8510 }, { - "epoch": 1.0257645075848785, - "grad_norm": 8.0, + "epoch": 6.077032810271041, + "grad_norm": 23.625, "learning_rate": 7.324888888888889e-05, - "loss": 0.5976, + "loss": 0.8576, "step": 8520 }, { - "epoch": 1.0269684565374428, - "grad_norm": 6.59375, + "epoch": 6.08416547788873, + "grad_norm": 6.96875, "learning_rate": 7.320444444444445e-05, - "loss": 0.6249, + "loss": 0.8407, "step": 8530 }, { - "epoch": 1.0281724054900072, - "grad_norm": 8.625, + "epoch": 6.0912981455064195, + "grad_norm": 8.6875, "learning_rate": 7.316000000000001e-05, - "loss": 0.5953, + "loss": 0.8419, "step": 8540 }, { - "epoch": 1.0293763544425716, - "grad_norm": 4.875, + "epoch": 6.098430813124108, + "grad_norm": 6.90625, "learning_rate": 7.311555555555556e-05, - "loss": 0.5528, + "loss": 0.7802, "step": 8550 }, { - "epoch": 1.030580303395136, - "grad_norm": 5.28125, + "epoch": 6.1055634807417976, + "grad_norm": 6.34375, "learning_rate": 7.307111111111111e-05, - "loss": 0.5181, + "loss": 0.7716, "step": 8560 }, { - "epoch": 1.0317842523477005, - "grad_norm": 9.9375, + "epoch": 6.112696148359486, + "grad_norm": 13.5, "learning_rate": 7.302666666666667e-05, - "loss": 0.5991, + "loss": 0.8538, "step": 8570 }, { - "epoch": 1.032988201300265, - "grad_norm": 5.78125, + "epoch": 6.119828815977176, + "grad_norm": 6.59375, "learning_rate": 7.298222222222223e-05, - "loss": 0.6822, + "loss": 0.6951, "step": 8580 }, { - "epoch": 1.0341921502528293, - "grad_norm": 7.84375, + "epoch": 6.126961483594864, + "grad_norm": 7.0625, "learning_rate": 7.293777777777778e-05, - "loss": 0.671, + "loss": 0.794, "step": 8590 }, { - "epoch": 1.0353960992053937, - "grad_norm": 8.4375, + "epoch": 6.134094151212554, + "grad_norm": 7.15625, "learning_rate": 7.289333333333334e-05, - "loss": 0.6266, + "loss": 0.8058, "step": 8600 }, { - "epoch": 1.0353960992053937, - "eval/acc": 41.27906799316406, + "epoch": 6.134094151212554, + "eval/acc": 46.511627197265625, "step": 8600 }, { - "epoch": 1.0353960992053937, - "eval_loss": 2.89090895652771, - "eval_runtime": 0.2168, - "eval_samples_per_second": 198.358, - "eval_steps_per_second": 4.613, + "epoch": 6.134094151212554, + "eval_loss": 2.5194764137268066, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.736, + "eval_steps_per_second": 4.529, "step": 8600 }, { - "epoch": 1.036600048157958, - "grad_norm": 6.84375, + "epoch": 6.141226818830242, + "grad_norm": 8.0625, "learning_rate": 7.284888888888889e-05, - "loss": 0.5829, + "loss": 0.8754, "step": 8610 }, { - "epoch": 1.0378039971105224, - "grad_norm": 12.6875, + "epoch": 6.148359486447932, + "grad_norm": 4.875, "learning_rate": 7.280444444444445e-05, - "loss": 0.6336, + "loss": 0.7852, "step": 8620 }, { - "epoch": 1.039007946063087, - "grad_norm": 5.78125, + "epoch": 6.1554921540656204, + "grad_norm": 8.0, "learning_rate": 7.276e-05, - "loss": 0.5621, + "loss": 0.8064, "step": 8630 }, { - "epoch": 1.0402118950156514, - "grad_norm": 6.78125, + "epoch": 6.16262482168331, + "grad_norm": 6.3125, "learning_rate": 7.271555555555556e-05, - "loss": 0.5822, + "loss": 0.7643, "step": 8640 }, { - "epoch": 1.0414158439682157, - "grad_norm": 5.40625, + "epoch": 6.1697574893009985, + "grad_norm": 8.875, "learning_rate": 7.267111111111111e-05, - "loss": 0.6402, + "loss": 0.7702, "step": 8650 }, { - "epoch": 1.04261979292078, - "grad_norm": 5.84375, + "epoch": 6.176890156918688, + "grad_norm": 18.5, "learning_rate": 7.262666666666666e-05, - "loss": 0.5793, + "loss": 0.903, "step": 8660 }, { - "epoch": 1.0438237418733445, - "grad_norm": 9.375, + "epoch": 6.184022824536377, + "grad_norm": 9.875, "learning_rate": 7.258222222222224e-05, - "loss": 0.6447, + "loss": 0.788, "step": 8670 }, { - "epoch": 1.045027690825909, - "grad_norm": 8.4375, + "epoch": 6.191155492154065, + "grad_norm": 7.71875, "learning_rate": 7.253777777777778e-05, - "loss": 0.6428, + "loss": 0.7504, "step": 8680 }, { - "epoch": 1.0462316397784734, - "grad_norm": 8.5, + "epoch": 6.198288159771755, + "grad_norm": 7.5, "learning_rate": 7.249333333333333e-05, - "loss": 0.6219, + "loss": 0.8821, "step": 8690 }, { - "epoch": 1.0474355887310378, - "grad_norm": 8.0625, + "epoch": 6.205420827389443, + "grad_norm": 6.71875, "learning_rate": 7.24488888888889e-05, - "loss": 0.5728, + "loss": 0.9166, "step": 8700 }, { - "epoch": 1.0474355887310378, - "eval/acc": 41.86046600341797, + "epoch": 6.205420827389443, + "eval/acc": 48.83720779418945, "step": 8700 }, { - "epoch": 1.0474355887310378, - "eval_loss": 2.881147861480713, - "eval_runtime": 0.2167, - "eval_samples_per_second": 198.476, - "eval_steps_per_second": 4.616, + "epoch": 6.205420827389443, + "eval_loss": 2.488805055618286, + "eval_runtime": 0.2195, + "eval_samples_per_second": 195.91, + "eval_steps_per_second": 4.556, "step": 8700 }, { - "epoch": 1.0486395376836022, - "grad_norm": 7.09375, + "epoch": 6.212553495007133, + "grad_norm": 8.3125, "learning_rate": 7.240444444444446e-05, - "loss": 0.6532, + "loss": 0.7724, "step": 8710 }, { - "epoch": 1.0498434866361666, - "grad_norm": 7.0625, + "epoch": 6.219686162624821, + "grad_norm": 7.84375, "learning_rate": 7.236e-05, - "loss": 0.5758, + "loss": 0.8881, "step": 8720 }, { - "epoch": 1.051047435588731, - "grad_norm": 8.375, + "epoch": 6.226818830242511, + "grad_norm": 7.21875, "learning_rate": 7.231555555555555e-05, - "loss": 0.6071, + "loss": 0.8538, "step": 8730 }, { - "epoch": 1.0522513845412955, - "grad_norm": 7.34375, + "epoch": 6.2339514978601995, + "grad_norm": 7.5, "learning_rate": 7.227111111111112e-05, - "loss": 0.6905, + "loss": 0.8909, "step": 8740 }, { - "epoch": 1.05345533349386, - "grad_norm": 6.59375, + "epoch": 6.241084165477889, + "grad_norm": 7.25, "learning_rate": 7.222666666666666e-05, - "loss": 0.584, + "loss": 0.7965, "step": 8750 }, { - "epoch": 1.0546592824464243, - "grad_norm": 7.4375, + "epoch": 6.248216833095578, + "grad_norm": 7.46875, "learning_rate": 7.218222222222223e-05, - "loss": 0.6222, + "loss": 0.8547, "step": 8760 }, { - "epoch": 1.0558632313989886, - "grad_norm": 7.1875, + "epoch": 6.255349500713267, + "grad_norm": 6.1875, "learning_rate": 7.213777777777779e-05, - "loss": 0.6167, + "loss": 0.7528, "step": 8770 }, { - "epoch": 1.057067180351553, - "grad_norm": 7.875, + "epoch": 6.262482168330956, + "grad_norm": 7.03125, "learning_rate": 7.209333333333334e-05, - "loss": 0.5766, + "loss": 0.8632, "step": 8780 }, { - "epoch": 1.0582711293041176, - "grad_norm": 7.96875, + "epoch": 6.269614835948644, + "grad_norm": 8.375, "learning_rate": 7.204888888888888e-05, - "loss": 0.5747, + "loss": 0.7832, "step": 8790 }, { - "epoch": 1.059475078256682, - "grad_norm": 7.5, + "epoch": 6.276747503566334, + "grad_norm": 8.125, "learning_rate": 7.200444444444445e-05, - "loss": 0.5361, + "loss": 0.7659, "step": 8800 }, { - "epoch": 1.059475078256682, - "eval/acc": 44.1860466003418, + "epoch": 6.276747503566334, + "eval/acc": 48.83720779418945, "step": 8800 }, { - "epoch": 1.059475078256682, - "eval_loss": 2.9378437995910645, - "eval_runtime": 0.2159, - "eval_samples_per_second": 199.123, - "eval_steps_per_second": 4.631, + "epoch": 6.276747503566334, + "eval_loss": 2.4990618228912354, + "eval_runtime": 0.2586, + "eval_samples_per_second": 166.3, + "eval_steps_per_second": 3.867, "step": 8800 }, { - "epoch": 1.0606790272092463, - "grad_norm": 6.46875, + "epoch": 6.283880171184022, + "grad_norm": 7.375, "learning_rate": 7.196000000000001e-05, - "loss": 0.5386, + "loss": 0.7402, "step": 8810 }, { - "epoch": 1.0618829761618107, - "grad_norm": 5.75, + "epoch": 6.291012838801712, + "grad_norm": 7.0, "learning_rate": 7.191555555555556e-05, - "loss": 0.5701, + "loss": 0.8381, "step": 8820 }, { - "epoch": 1.063086925114375, - "grad_norm": 10.5625, + "epoch": 6.2981455064194005, + "grad_norm": 15.75, "learning_rate": 7.18711111111111e-05, - "loss": 0.6061, + "loss": 0.8837, "step": 8830 }, { - "epoch": 1.0642908740669395, - "grad_norm": 6.75, + "epoch": 6.30527817403709, + "grad_norm": 5.46875, "learning_rate": 7.182666666666668e-05, - "loss": 0.6201, + "loss": 0.8638, "step": 8840 }, { - "epoch": 1.065494823019504, - "grad_norm": 9.625, + "epoch": 6.312410841654779, + "grad_norm": 5.46875, "learning_rate": 7.178222222222223e-05, - "loss": 0.6315, + "loss": 0.8348, "step": 8850 }, { - "epoch": 1.0666987719720684, - "grad_norm": 6.15625, + "epoch": 6.319543509272468, + "grad_norm": 7.9375, "learning_rate": 7.173777777777778e-05, - "loss": 0.6142, + "loss": 0.8598, "step": 8860 }, { - "epoch": 1.0679027209246328, - "grad_norm": 8.875, + "epoch": 6.326676176890157, + "grad_norm": 7.15625, "learning_rate": 7.169333333333334e-05, - "loss": 0.6545, + "loss": 0.8124, "step": 8870 }, { - "epoch": 1.0691066698771972, - "grad_norm": 6.5, + "epoch": 6.333808844507846, + "grad_norm": 6.28125, "learning_rate": 7.164888888888889e-05, - "loss": 0.6305, + "loss": 0.8184, "step": 8880 }, { - "epoch": 1.0703106188297615, - "grad_norm": 12.5, + "epoch": 6.340941512125535, + "grad_norm": 7.25, "learning_rate": 7.160444444444445e-05, - "loss": 0.6451, + "loss": 0.8522, "step": 8890 }, { - "epoch": 1.0715145677823261, - "grad_norm": 6.28125, + "epoch": 6.348074179743224, + "grad_norm": 8.4375, "learning_rate": 7.156e-05, - "loss": 0.5406, + "loss": 0.894, "step": 8900 }, { - "epoch": 1.0715145677823261, + "epoch": 6.348074179743224, "eval/acc": 46.511627197265625, "step": 8900 }, { - "epoch": 1.0715145677823261, - "eval_loss": 2.895603656768799, - "eval_runtime": 0.2115, - "eval_samples_per_second": 203.356, - "eval_steps_per_second": 4.729, + "epoch": 6.348074179743224, + "eval_loss": 2.4536728858947754, + "eval_runtime": 0.2168, + "eval_samples_per_second": 198.347, + "eval_steps_per_second": 4.613, "step": 8900 }, { - "epoch": 1.0727185167348905, - "grad_norm": 5.8125, + "epoch": 6.355206847360913, + "grad_norm": 8.3125, "learning_rate": 7.151555555555556e-05, - "loss": 0.6598, + "loss": 0.8331, "step": 8910 }, { - "epoch": 1.0739224656874549, - "grad_norm": 6.9375, + "epoch": 6.362339514978602, + "grad_norm": 13.1875, "learning_rate": 7.147111111111111e-05, - "loss": 0.5678, + "loss": 0.8107, "step": 8920 }, { - "epoch": 1.0751264146400192, - "grad_norm": 7.25, + "epoch": 6.369472182596291, + "grad_norm": 7.0, "learning_rate": 7.142666666666667e-05, - "loss": 0.6032, + "loss": 0.9504, "step": 8930 }, { - "epoch": 1.0763303635925836, - "grad_norm": 7.09375, + "epoch": 6.37660485021398, + "grad_norm": 9.5625, "learning_rate": 7.138222222222223e-05, - "loss": 0.5432, + "loss": 0.766, "step": 8940 }, { - "epoch": 1.077534312545148, - "grad_norm": 8.6875, + "epoch": 6.383737517831669, + "grad_norm": 13.4375, "learning_rate": 7.133777777777778e-05, - "loss": 0.6408, + "loss": 0.7923, "step": 8950 }, { - "epoch": 1.0787382614977126, - "grad_norm": 8.1875, + "epoch": 6.390870185449358, + "grad_norm": 6.6875, "learning_rate": 7.129333333333333e-05, - "loss": 0.5834, + "loss": 0.7777, "step": 8960 }, { - "epoch": 1.079942210450277, - "grad_norm": 8.75, + "epoch": 6.398002853067047, + "grad_norm": 6.09375, "learning_rate": 7.124888888888889e-05, - "loss": 0.5956, + "loss": 0.7729, "step": 8970 }, { - "epoch": 1.0811461594028413, - "grad_norm": 6.90625, + "epoch": 6.405135520684736, + "grad_norm": 6.46875, "learning_rate": 7.120444444444445e-05, - "loss": 0.6124, + "loss": 0.8118, "step": 8980 }, { - "epoch": 1.0823501083554057, - "grad_norm": 9.5625, + "epoch": 6.412268188302425, + "grad_norm": 6.21875, "learning_rate": 7.116e-05, - "loss": 0.6513, + "loss": 0.9006, "step": 8990 }, { - "epoch": 1.08355405730797, - "grad_norm": 8.0, + "epoch": 6.419400855920114, + "grad_norm": 6.5625, "learning_rate": 7.111555555555555e-05, - "loss": 0.6044, + "loss": 0.7092, "step": 9000 }, { - "epoch": 1.08355405730797, + "epoch": 6.419400855920114, "eval/acc": 44.1860466003418, "step": 9000 }, { - "epoch": 1.08355405730797, - "eval_loss": 2.894747257232666, - "eval_runtime": 0.2236, - "eval_samples_per_second": 192.288, - "eval_steps_per_second": 4.472, + "epoch": 6.419400855920114, + "eval_loss": 2.533996343612671, + "eval_runtime": 0.3418, + "eval_samples_per_second": 125.802, + "eval_steps_per_second": 2.926, "step": 9000 }, { - "epoch": 1.0847580062605346, - "grad_norm": 7.1875, + "epoch": 6.426533523537803, + "grad_norm": 7.59375, "learning_rate": 7.107111111111111e-05, - "loss": 0.4939, + "loss": 0.7684, "step": 9010 }, { - "epoch": 1.085961955213099, - "grad_norm": 8.25, + "epoch": 6.433666191155492, + "grad_norm": 6.8125, "learning_rate": 7.102666666666668e-05, - "loss": 0.7751, + "loss": 0.7654, "step": 9020 }, { - "epoch": 1.0871659041656634, - "grad_norm": 6.875, + "epoch": 6.4407988587731815, + "grad_norm": 7.5625, "learning_rate": 7.098222222222222e-05, - "loss": 0.593, + "loss": 0.8404, "step": 9030 }, { - "epoch": 1.0883698531182278, - "grad_norm": 7.5625, + "epoch": 6.44793152639087, + "grad_norm": 8.5, "learning_rate": 7.093777777777779e-05, - "loss": 0.587, + "loss": 0.8519, "step": 9040 }, { - "epoch": 1.0895738020707921, - "grad_norm": 9.5625, + "epoch": 6.45506419400856, + "grad_norm": 6.53125, "learning_rate": 7.089333333333333e-05, - "loss": 0.639, + "loss": 0.8487, "step": 9050 }, { - "epoch": 1.0907777510233565, - "grad_norm": 8.25, + "epoch": 6.462196861626248, + "grad_norm": 7.59375, "learning_rate": 7.084888888888888e-05, - "loss": 0.6537, + "loss": 0.8695, "step": 9060 }, { - "epoch": 1.091981699975921, - "grad_norm": 9.9375, + "epoch": 6.469329529243938, + "grad_norm": 8.4375, "learning_rate": 7.080444444444444e-05, - "loss": 0.6134, + "loss": 0.7864, "step": 9070 }, { - "epoch": 1.0931856489284855, - "grad_norm": 9.375, + "epoch": 6.476462196861626, + "grad_norm": 66.5, "learning_rate": 7.076000000000001e-05, - "loss": 0.5259, + "loss": 0.7726, "step": 9080 }, { - "epoch": 1.0943895978810498, - "grad_norm": 7.90625, + "epoch": 6.483594864479315, + "grad_norm": 6.96875, "learning_rate": 7.071555555555556e-05, - "loss": 0.7362, + "loss": 0.7832, "step": 9090 }, { - "epoch": 1.0955935468336142, - "grad_norm": 7.46875, + "epoch": 6.490727532097004, + "grad_norm": 7.40625, "learning_rate": 7.06711111111111e-05, - "loss": 0.6197, + "loss": 0.8063, "step": 9100 }, { - "epoch": 1.0955935468336142, - "eval/acc": 41.86046600341797, + "epoch": 6.490727532097004, + "eval/acc": 44.1860466003418, "step": 9100 }, { - "epoch": 1.0955935468336142, - "eval_loss": 2.920775890350342, - "eval_runtime": 0.2089, - "eval_samples_per_second": 205.889, - "eval_steps_per_second": 4.788, + "epoch": 6.490727532097004, + "eval_loss": 2.5438809394836426, + "eval_runtime": 0.2154, + "eval_samples_per_second": 199.594, + "eval_steps_per_second": 4.642, "step": 9100 }, { - "epoch": 1.0967974957861786, - "grad_norm": 9.6875, + "epoch": 6.497860199714693, + "grad_norm": 7.21875, "learning_rate": 7.062666666666668e-05, - "loss": 0.5682, + "loss": 0.7605, "step": 9110 }, { - "epoch": 1.0980014447387432, - "grad_norm": 6.5625, + "epoch": 6.5049928673323825, + "grad_norm": 7.90625, "learning_rate": 7.058222222222223e-05, - "loss": 0.572, + "loss": 0.8032, "step": 9120 }, { - "epoch": 1.0992053936913075, - "grad_norm": 7.5, + "epoch": 6.512125534950071, + "grad_norm": 6.9375, "learning_rate": 7.053777777777778e-05, - "loss": 0.5307, + "loss": 0.743, "step": 9130 }, { - "epoch": 1.100409342643872, - "grad_norm": 7.75, + "epoch": 6.519258202567761, + "grad_norm": 5.65625, "learning_rate": 7.049333333333334e-05, - "loss": 0.5432, + "loss": 0.8261, "step": 9140 }, { - "epoch": 1.1016132915964363, - "grad_norm": 6.84375, + "epoch": 6.526390870185449, + "grad_norm": 7.03125, "learning_rate": 7.04488888888889e-05, - "loss": 0.6012, + "loss": 0.8099, "step": 9150 }, { - "epoch": 1.1028172405490007, - "grad_norm": 6.84375, + "epoch": 6.533523537803139, + "grad_norm": 7.15625, "learning_rate": 7.040444444444445e-05, - "loss": 0.5776, + "loss": 0.817, "step": 9160 }, { - "epoch": 1.104021189501565, - "grad_norm": 8.0625, + "epoch": 6.540656205420827, + "grad_norm": 11.625, "learning_rate": 7.036e-05, - "loss": 0.5353, + "loss": 0.782, "step": 9170 }, { - "epoch": 1.1052251384541296, - "grad_norm": 5.65625, + "epoch": 6.547788873038517, + "grad_norm": 7.5625, "learning_rate": 7.031555555555556e-05, - "loss": 0.5664, + "loss": 0.8145, "step": 9180 }, { - "epoch": 1.106429087406694, - "grad_norm": 14.0, + "epoch": 6.554921540656205, + "grad_norm": 7.5625, "learning_rate": 7.027111111111111e-05, - "loss": 0.6547, + "loss": 0.8822, "step": 9190 }, { - "epoch": 1.1076330363592584, - "grad_norm": 7.9375, + "epoch": 6.562054208273894, + "grad_norm": 6.53125, "learning_rate": 7.022666666666667e-05, - "loss": 0.6063, + "loss": 0.8132, "step": 9200 }, { - "epoch": 1.1076330363592584, - "eval/acc": 41.86046600341797, + "epoch": 6.562054208273894, + "eval/acc": 44.1860466003418, "step": 9200 }, { - "epoch": 1.1076330363592584, - "eval_loss": 2.9192073345184326, - "eval_runtime": 0.2086, - "eval_samples_per_second": 206.15, - "eval_steps_per_second": 4.794, + "epoch": 6.562054208273894, + "eval_loss": 2.528564929962158, + "eval_runtime": 0.2169, + "eval_samples_per_second": 198.28, + "eval_steps_per_second": 4.611, "step": 9200 }, { - "epoch": 1.1088369853118227, - "grad_norm": 11.5625, + "epoch": 6.5691868758915835, + "grad_norm": 7.21875, "learning_rate": 7.018222222222223e-05, - "loss": 0.6251, + "loss": 0.7858, "step": 9210 }, { - "epoch": 1.110040934264387, - "grad_norm": 7.34375, + "epoch": 6.576319543509273, + "grad_norm": 6.4375, "learning_rate": 7.013777777777778e-05, - "loss": 0.5408, + "loss": 0.7098, "step": 9220 }, { - "epoch": 1.1112448832169517, - "grad_norm": 6.75, + "epoch": 6.5834522111269616, + "grad_norm": 7.125, "learning_rate": 7.009333333333333e-05, - "loss": 0.6217, + "loss": 0.8362, "step": 9230 }, { - "epoch": 1.112448832169516, - "grad_norm": 8.375, + "epoch": 6.59058487874465, + "grad_norm": 5.78125, "learning_rate": 7.004888888888889e-05, - "loss": 0.6792, + "loss": 0.7737, "step": 9240 }, { - "epoch": 1.1136527811220804, - "grad_norm": 8.375, + "epoch": 6.59771754636234, + "grad_norm": 9.0625, "learning_rate": 7.000444444444445e-05, - "loss": 0.5786, + "loss": 0.857, "step": 9250 }, { - "epoch": 1.1148567300746448, - "grad_norm": 11.0, + "epoch": 6.604850213980028, + "grad_norm": 9.125, "learning_rate": 6.996e-05, - "loss": 0.6588, + "loss": 0.7562, "step": 9260 }, { - "epoch": 1.1160606790272092, - "grad_norm": 6.75, + "epoch": 6.611982881597718, + "grad_norm": 8.3125, "learning_rate": 6.991555555555556e-05, - "loss": 0.6016, + "loss": 0.8619, "step": 9270 }, { - "epoch": 1.1172646279797736, - "grad_norm": 9.1875, + "epoch": 6.619115549215406, + "grad_norm": 6.78125, "learning_rate": 6.987111111111111e-05, - "loss": 0.5728, + "loss": 0.7212, "step": 9280 }, { - "epoch": 1.1184685769323381, - "grad_norm": 8.0625, + "epoch": 6.626248216833096, + "grad_norm": 26.125, "learning_rate": 6.982666666666667e-05, - "loss": 0.669, + "loss": 0.951, "step": 9290 }, { - "epoch": 1.1196725258849025, - "grad_norm": 9.6875, + "epoch": 6.633380884450784, + "grad_norm": 7.03125, "learning_rate": 6.978222222222222e-05, - "loss": 0.625, + "loss": 0.7791, "step": 9300 }, { - "epoch": 1.1196725258849025, + "epoch": 6.633380884450784, "eval/acc": 44.1860466003418, "step": 9300 }, { - "epoch": 1.1196725258849025, - "eval_loss": 2.8807859420776367, - "eval_runtime": 0.2269, - "eval_samples_per_second": 189.503, - "eval_steps_per_second": 4.407, + "epoch": 6.633380884450784, + "eval_loss": 2.587022304534912, + "eval_runtime": 0.2175, + "eval_samples_per_second": 197.663, + "eval_steps_per_second": 4.597, "step": 9300 }, { - "epoch": 1.1208764748374669, - "grad_norm": 7.09375, + "epoch": 6.640513552068474, + "grad_norm": 6.6875, "learning_rate": 6.973777777777778e-05, - "loss": 0.5112, + "loss": 0.8082, "step": 9310 }, { - "epoch": 1.1220804237900313, - "grad_norm": 19.375, + "epoch": 6.6476462196861625, + "grad_norm": 7.625, "learning_rate": 6.969333333333333e-05, - "loss": 0.7337, + "loss": 0.6863, "step": 9320 }, { - "epoch": 1.1232843727425956, - "grad_norm": 8.25, + "epoch": 6.654778887303852, + "grad_norm": 8.625, "learning_rate": 6.96488888888889e-05, - "loss": 0.6687, + "loss": 0.7921, "step": 9330 }, { - "epoch": 1.1244883216951602, - "grad_norm": 8.125, + "epoch": 6.661911554921541, + "grad_norm": 6.5, "learning_rate": 6.960444444444446e-05, - "loss": 0.5604, + "loss": 0.7762, "step": 9340 }, { - "epoch": 1.1256922706477246, - "grad_norm": 9.1875, + "epoch": 6.669044222539229, + "grad_norm": 12.6875, "learning_rate": 6.956e-05, - "loss": 0.6999, + "loss": 0.7977, "step": 9350 }, { - "epoch": 1.126896219600289, - "grad_norm": 8.5, + "epoch": 6.676176890156919, + "grad_norm": 6.84375, "learning_rate": 6.951555555555555e-05, - "loss": 0.5909, + "loss": 0.907, "step": 9360 }, { - "epoch": 1.1281001685528533, - "grad_norm": 7.21875, + "epoch": 6.683309557774607, + "grad_norm": 7.15625, "learning_rate": 6.947111111111112e-05, - "loss": 0.5857, + "loss": 0.792, "step": 9370 }, { - "epoch": 1.1293041175054177, - "grad_norm": 6.84375, + "epoch": 6.690442225392297, + "grad_norm": 8.5, "learning_rate": 6.942666666666668e-05, - "loss": 0.5965, + "loss": 0.7838, "step": 9380 }, { - "epoch": 1.130508066457982, - "grad_norm": 6.59375, + "epoch": 6.697574893009985, + "grad_norm": 8.1875, "learning_rate": 6.938222222222223e-05, - "loss": 0.6098, + "loss": 0.8141, "step": 9390 }, { - "epoch": 1.1317120154105467, - "grad_norm": 9.3125, + "epoch": 6.704707560627675, + "grad_norm": 7.875, "learning_rate": 6.933777777777777e-05, - "loss": 0.5917, + "loss": 0.8348, "step": 9400 }, { - "epoch": 1.1317120154105467, - "eval/acc": 44.1860466003418, + "epoch": 6.704707560627675, + "eval/acc": 39.53488540649414, "step": 9400 }, { - "epoch": 1.1317120154105467, - "eval_loss": 2.906259536743164, - "eval_runtime": 0.2551, - "eval_samples_per_second": 168.571, - "eval_steps_per_second": 3.92, + "epoch": 6.704707560627675, + "eval_loss": 2.6398463249206543, + "eval_runtime": 0.2181, + "eval_samples_per_second": 197.145, + "eval_steps_per_second": 4.585, "step": 9400 }, { - "epoch": 1.132915964363111, - "grad_norm": 8.3125, + "epoch": 6.7118402282453635, + "grad_norm": 6.625, "learning_rate": 6.929333333333334e-05, - "loss": 0.5629, + "loss": 0.889, "step": 9410 }, { - "epoch": 1.1341199133156754, - "grad_norm": 5.21875, + "epoch": 6.718972895863053, + "grad_norm": 7.3125, "learning_rate": 6.92488888888889e-05, - "loss": 0.4913, + "loss": 0.7913, "step": 9420 }, { - "epoch": 1.1353238622682398, - "grad_norm": 7.5625, + "epoch": 6.726105563480742, + "grad_norm": 10.875, "learning_rate": 6.920444444444445e-05, - "loss": 0.5868, + "loss": 0.8099, "step": 9430 }, { - "epoch": 1.1365278112208042, - "grad_norm": 8.8125, + "epoch": 6.733238231098431, + "grad_norm": 23.75, "learning_rate": 6.916000000000001e-05, - "loss": 0.6205, + "loss": 0.7098, "step": 9440 }, { - "epoch": 1.1377317601733687, - "grad_norm": 6.78125, + "epoch": 6.74037089871612, + "grad_norm": 6.625, "learning_rate": 6.911555555555556e-05, - "loss": 0.6569, + "loss": 0.7859, "step": 9450 }, { - "epoch": 1.1389357091259331, - "grad_norm": 7.9375, + "epoch": 6.747503566333809, + "grad_norm": 5.875, "learning_rate": 6.907111111111112e-05, - "loss": 0.5849, + "loss": 0.7947, "step": 9460 }, { - "epoch": 1.1401396580784975, - "grad_norm": 8.6875, + "epoch": 6.754636233951498, + "grad_norm": 7.25, "learning_rate": 6.902666666666667e-05, - "loss": 0.5997, + "loss": 0.927, "step": 9470 }, { - "epoch": 1.1413436070310619, - "grad_norm": 12.75, + "epoch": 6.761768901569187, + "grad_norm": 12.875, "learning_rate": 6.898222222222223e-05, - "loss": 0.6568, + "loss": 0.8474, "step": 9480 }, { - "epoch": 1.1425475559836262, - "grad_norm": 7.6875, + "epoch": 6.768901569186876, + "grad_norm": 6.8125, "learning_rate": 6.893777777777778e-05, - "loss": 0.6542, + "loss": 0.848, "step": 9490 }, { - "epoch": 1.1437515049361906, - "grad_norm": 6.59375, + "epoch": 6.7760342368045645, + "grad_norm": 7.96875, "learning_rate": 6.889333333333333e-05, - "loss": 0.4745, + "loss": 0.8081, "step": 9500 }, { - "epoch": 1.1437515049361906, - "eval/acc": 48.83720779418945, + "epoch": 6.7760342368045645, + "eval/acc": 41.86046600341797, "step": 9500 }, { - "epoch": 1.1437515049361906, - "eval_loss": 2.8177154064178467, - "eval_runtime": 0.2171, - "eval_samples_per_second": 198.092, - "eval_steps_per_second": 4.607, + "epoch": 6.7760342368045645, + "eval_loss": 2.6681759357452393, + "eval_runtime": 0.2247, + "eval_samples_per_second": 191.372, + "eval_steps_per_second": 4.451, "step": 9500 }, { - "epoch": 1.1449554538887552, - "grad_norm": 6.53125, + "epoch": 6.783166904422254, + "grad_norm": 9.0625, "learning_rate": 6.88488888888889e-05, - "loss": 0.664, + "loss": 0.8836, "step": 9510 }, { - "epoch": 1.1461594028413196, - "grad_norm": 7.6875, + "epoch": 6.790299572039943, + "grad_norm": 15.875, "learning_rate": 6.880444444444445e-05, - "loss": 0.5778, + "loss": 0.8696, "step": 9520 }, { - "epoch": 1.147363351793884, - "grad_norm": 6.84375, + "epoch": 6.797432239657632, + "grad_norm": 10.75, "learning_rate": 6.876e-05, - "loss": 0.6596, + "loss": 0.844, "step": 9530 }, { - "epoch": 1.1485673007464483, - "grad_norm": 8.75, + "epoch": 6.804564907275321, + "grad_norm": 23.875, "learning_rate": 6.871555555555556e-05, - "loss": 0.6422, + "loss": 0.823, "step": 9540 }, { - "epoch": 1.1497712496990127, - "grad_norm": 6.5625, + "epoch": 6.81169757489301, + "grad_norm": 7.75, "learning_rate": 6.867111111111112e-05, - "loss": 0.5794, + "loss": 0.8875, "step": 9550 }, { - "epoch": 1.1509751986515773, - "grad_norm": 8.625, + "epoch": 6.818830242510699, + "grad_norm": 6.46875, "learning_rate": 6.862666666666667e-05, - "loss": 0.6171, + "loss": 0.7703, "step": 9560 }, { - "epoch": 1.1521791476041416, - "grad_norm": 6.875, + "epoch": 6.825962910128388, + "grad_norm": 6.375, "learning_rate": 6.858222222222222e-05, - "loss": 0.58, + "loss": 0.8, "step": 9570 }, { - "epoch": 1.153383096556706, - "grad_norm": 14.375, + "epoch": 6.833095577746077, + "grad_norm": 7.96875, "learning_rate": 6.853777777777778e-05, - "loss": 0.6651, + "loss": 0.8139, "step": 9580 }, { - "epoch": 1.1545870455092704, - "grad_norm": 6.65625, + "epoch": 6.840228245363766, + "grad_norm": 11.625, "learning_rate": 6.849333333333333e-05, - "loss": 0.621, + "loss": 0.8042, "step": 9590 }, { - "epoch": 1.1557909944618348, - "grad_norm": 8.25, + "epoch": 6.847360912981455, + "grad_norm": 10.8125, "learning_rate": 6.844888888888889e-05, - "loss": 0.6578, + "loss": 0.8403, "step": 9600 }, { - "epoch": 1.1557909944618348, + "epoch": 6.847360912981455, "eval/acc": 44.1860466003418, "step": 9600 }, { - "epoch": 1.1557909944618348, - "eval_loss": 2.841442108154297, - "eval_runtime": 0.2144, - "eval_samples_per_second": 200.545, - "eval_steps_per_second": 4.664, + "epoch": 6.847360912981455, + "eval_loss": 2.6575427055358887, + "eval_runtime": 0.2186, + "eval_samples_per_second": 196.745, + "eval_steps_per_second": 4.575, "step": 9600 }, { - "epoch": 1.1569949434143991, - "grad_norm": 6.09375, + "epoch": 6.854493580599144, + "grad_norm": 14.6875, "learning_rate": 6.840444444444445e-05, - "loss": 0.5215, + "loss": 0.8426, "step": 9610 }, { - "epoch": 1.1581988923669637, - "grad_norm": 9.1875, + "epoch": 6.861626248216833, + "grad_norm": 7.84375, "learning_rate": 6.836e-05, - "loss": 0.6458, + "loss": 0.8874, "step": 9620 }, { - "epoch": 1.159402841319528, - "grad_norm": 8.8125, + "epoch": 6.868758915834523, + "grad_norm": 8.9375, "learning_rate": 6.831555555555555e-05, - "loss": 0.6037, + "loss": 0.78, "step": 9630 }, { - "epoch": 1.1606067902720925, - "grad_norm": 7.0, + "epoch": 6.875891583452211, + "grad_norm": 6.1875, "learning_rate": 6.827111111111111e-05, - "loss": 0.5408, + "loss": 0.7788, "step": 9640 }, { - "epoch": 1.1618107392246568, - "grad_norm": 5.78125, + "epoch": 6.8830242510699, + "grad_norm": 6.34375, "learning_rate": 6.822666666666668e-05, - "loss": 0.5832, + "loss": 0.7385, "step": 9650 }, { - "epoch": 1.1630146881772212, - "grad_norm": 6.84375, + "epoch": 6.890156918687589, + "grad_norm": 7.59375, "learning_rate": 6.818222222222222e-05, - "loss": 0.5802, + "loss": 0.8938, "step": 9660 }, { - "epoch": 1.1642186371297858, - "grad_norm": 5.75, + "epoch": 6.897289586305278, + "grad_norm": 10.8125, "learning_rate": 6.813777777777777e-05, - "loss": 0.5377, + "loss": 0.8154, "step": 9670 }, { - "epoch": 1.1654225860823502, - "grad_norm": 7.5625, + "epoch": 6.904422253922967, + "grad_norm": 6.90625, "learning_rate": 6.809333333333333e-05, - "loss": 0.5657, + "loss": 0.9273, "step": 9680 }, { - "epoch": 1.1666265350349145, - "grad_norm": 6.15625, + "epoch": 6.911554921540656, + "grad_norm": 8.3125, "learning_rate": 6.80488888888889e-05, - "loss": 0.5107, + "loss": 0.8595, "step": 9690 }, { - "epoch": 1.167830483987479, - "grad_norm": 5.28125, + "epoch": 6.9186875891583455, + "grad_norm": 10.75, "learning_rate": 6.800444444444444e-05, - "loss": 0.5898, + "loss": 0.8569, "step": 9700 }, { - "epoch": 1.167830483987479, - "eval/acc": 46.511627197265625, + "epoch": 6.9186875891583455, + "eval/acc": 39.53488540649414, "step": 9700 }, { - "epoch": 1.167830483987479, - "eval_loss": 2.8665220737457275, - "eval_runtime": 0.211, - "eval_samples_per_second": 203.771, - "eval_steps_per_second": 4.739, + "epoch": 6.9186875891583455, + "eval_loss": 2.6524884700775146, + "eval_runtime": 0.215, + "eval_samples_per_second": 200.025, + "eval_steps_per_second": 4.652, "step": 9700 }, { - "epoch": 1.1690344329400433, - "grad_norm": 7.0, + "epoch": 6.925820256776034, + "grad_norm": 7.90625, "learning_rate": 6.796e-05, - "loss": 0.6016, + "loss": 0.7726, "step": 9710 }, { - "epoch": 1.1702383818926076, - "grad_norm": 7.0, + "epoch": 6.932952924393724, + "grad_norm": 7.71875, "learning_rate": 6.791555555555556e-05, - "loss": 0.6048, + "loss": 0.789, "step": 9720 }, { - "epoch": 1.1714423308451722, - "grad_norm": 7.21875, + "epoch": 6.940085592011412, + "grad_norm": 7.4375, "learning_rate": 6.787111111111112e-05, - "loss": 0.5315, + "loss": 0.7525, "step": 9730 }, { - "epoch": 1.1726462797977366, - "grad_norm": 6.53125, + "epoch": 6.947218259629102, + "grad_norm": 6.96875, "learning_rate": 6.782666666666667e-05, - "loss": 0.5033, + "loss": 0.8183, "step": 9740 }, { - "epoch": 1.173850228750301, - "grad_norm": 6.34375, + "epoch": 6.95435092724679, + "grad_norm": 6.5625, "learning_rate": 6.778222222222223e-05, - "loss": 0.5615, + "loss": 0.8713, "step": 9750 }, { - "epoch": 1.1750541777028654, - "grad_norm": 6.34375, + "epoch": 6.961483594864479, + "grad_norm": 6.59375, "learning_rate": 6.773777777777778e-05, - "loss": 0.5494, + "loss": 0.8089, "step": 9760 }, { - "epoch": 1.1762581266554297, - "grad_norm": 7.3125, + "epoch": 6.968616262482168, + "grad_norm": 7.46875, "learning_rate": 6.769333333333334e-05, - "loss": 0.6047, + "loss": 0.8173, "step": 9770 }, { - "epoch": 1.1774620756079943, - "grad_norm": 6.53125, + "epoch": 6.975748930099857, + "grad_norm": 8.75, "learning_rate": 6.76488888888889e-05, - "loss": 0.6653, + "loss": 0.8359, "step": 9780 }, { - "epoch": 1.1786660245605587, - "grad_norm": 21.75, + "epoch": 6.9828815977175465, + "grad_norm": 6.96875, "learning_rate": 6.760444444444445e-05, - "loss": 0.5944, + "loss": 0.7308, "step": 9790 }, { - "epoch": 1.179869973513123, - "grad_norm": 17.25, + "epoch": 6.990014265335235, + "grad_norm": 8.6875, "learning_rate": 6.756e-05, - "loss": 0.6511, + "loss": 0.7651, "step": 9800 }, { - "epoch": 1.179869973513123, - "eval/acc": 46.511627197265625, + "epoch": 6.990014265335235, + "eval/acc": 44.1860466003418, "step": 9800 }, { - "epoch": 1.179869973513123, - "eval_loss": 2.8695812225341797, - "eval_runtime": 0.2142, - "eval_samples_per_second": 200.781, - "eval_steps_per_second": 4.669, + "epoch": 6.990014265335235, + "eval_loss": 2.581909418106079, + "eval_runtime": 0.217, + "eval_samples_per_second": 198.162, + "eval_steps_per_second": 4.608, "step": 9800 }, { - "epoch": 1.1810739224656874, - "grad_norm": 5.3125, + "epoch": 6.997146932952925, + "grad_norm": 7.6875, "learning_rate": 6.751555555555556e-05, - "loss": 0.6008, + "loss": 0.8653, "step": 9810 }, { - "epoch": 1.1822778714182518, - "grad_norm": 9.4375, + "epoch": 7.004279600570613, + "grad_norm": 8.5, "learning_rate": 6.747111111111112e-05, - "loss": 0.5898, + "loss": 0.8445, "step": 9820 }, { - "epoch": 1.1834818203708162, - "grad_norm": 6.6875, + "epoch": 7.011412268188303, + "grad_norm": 6.375, "learning_rate": 6.742666666666667e-05, - "loss": 0.5976, + "loss": 0.7759, "step": 9830 }, { - "epoch": 1.1846857693233808, - "grad_norm": 7.875, + "epoch": 7.018544935805991, + "grad_norm": 6.375, "learning_rate": 6.738222222222222e-05, - "loss": 0.5604, + "loss": 0.7709, "step": 9840 }, { - "epoch": 1.1858897182759451, - "grad_norm": 6.0625, + "epoch": 7.025677603423681, + "grad_norm": 7.8125, "learning_rate": 6.733777777777778e-05, - "loss": 0.736, + "loss": 0.768, "step": 9850 }, { - "epoch": 1.1870936672285095, - "grad_norm": 8.125, + "epoch": 7.032810271041369, + "grad_norm": 8.4375, "learning_rate": 6.729333333333334e-05, - "loss": 0.5235, + "loss": 0.8725, "step": 9860 }, { - "epoch": 1.1882976161810739, - "grad_norm": 6.46875, + "epoch": 7.039942938659059, + "grad_norm": 7.8125, "learning_rate": 6.724888888888889e-05, - "loss": 0.5716, + "loss": 0.8146, "step": 9870 }, { - "epoch": 1.1895015651336383, - "grad_norm": 6.21875, + "epoch": 7.0470756062767475, + "grad_norm": 70.0, "learning_rate": 6.720444444444445e-05, - "loss": 0.5337, + "loss": 0.8137, "step": 9880 }, { - "epoch": 1.1907055140862028, - "grad_norm": 7.28125, + "epoch": 7.054208273894437, + "grad_norm": 7.03125, "learning_rate": 6.716e-05, - "loss": 0.5203, + "loss": 0.8025, "step": 9890 }, { - "epoch": 1.1919094630387672, - "grad_norm": 8.1875, + "epoch": 7.0613409415121255, + "grad_norm": 7.15625, "learning_rate": 6.711555555555555e-05, - "loss": 0.5532, + "loss": 0.8237, "step": 9900 }, { - "epoch": 1.1919094630387672, - "eval/acc": 46.511627197265625, + "epoch": 7.0613409415121255, + "eval/acc": 62.79069900512695, "step": 9900 }, { - "epoch": 1.1919094630387672, - "eval_loss": 2.864424705505371, - "eval_runtime": 0.2199, - "eval_samples_per_second": 195.51, - "eval_steps_per_second": 4.547, + "epoch": 7.0613409415121255, + "eval_loss": 2.023484706878662, + "eval_runtime": 1.3641, + "eval_samples_per_second": 31.524, + "eval_steps_per_second": 0.733, "step": 9900 }, { - "epoch": 1.1931134119913316, - "grad_norm": 8.5625, + "epoch": 7.068473609129814, + "grad_norm": 10.375, "learning_rate": 6.707111111111111e-05, - "loss": 0.585, + "loss": 0.7141, "step": 9910 }, { - "epoch": 1.194317360943896, - "grad_norm": 8.25, + "epoch": 7.075606276747504, + "grad_norm": 9.25, "learning_rate": 6.702666666666667e-05, - "loss": 0.6533, + "loss": 0.7963, "step": 9920 }, { - "epoch": 1.1955213098964603, - "grad_norm": 8.8125, + "epoch": 7.082738944365192, + "grad_norm": 7.375, "learning_rate": 6.698222222222222e-05, - "loss": 0.5962, + "loss": 0.7935, "step": 9930 }, { - "epoch": 1.1967252588490247, - "grad_norm": 13.0625, + "epoch": 7.089871611982882, + "grad_norm": 6.8125, "learning_rate": 6.693777777777778e-05, - "loss": 0.6169, + "loss": 0.7882, "step": 9940 }, { - "epoch": 1.1979292078015893, - "grad_norm": 7.5625, + "epoch": 7.09700427960057, + "grad_norm": 7.0625, "learning_rate": 6.689333333333335e-05, - "loss": 0.5756, + "loss": 0.7698, "step": 9950 }, { - "epoch": 1.1991331567541537, - "grad_norm": 6.03125, + "epoch": 7.10413694721826, + "grad_norm": 6.9375, "learning_rate": 6.68488888888889e-05, - "loss": 0.5746, + "loss": 0.8595, "step": 9960 }, { - "epoch": 1.200337105706718, - "grad_norm": 4.875, + "epoch": 7.111269614835948, + "grad_norm": 9.5, "learning_rate": 6.680444444444444e-05, - "loss": 0.6586, + "loss": 0.8158, "step": 9970 }, { - "epoch": 1.2015410546592824, - "grad_norm": 7.375, + "epoch": 7.118402282453638, + "grad_norm": 8.375, "learning_rate": 6.676e-05, - "loss": 0.6928, + "loss": 0.7916, "step": 9980 }, { - "epoch": 1.2027450036118468, - "grad_norm": 8.875, + "epoch": 7.1255349500713265, + "grad_norm": 6.3125, "learning_rate": 6.671555555555555e-05, - "loss": 0.6166, + "loss": 0.7455, "step": 9990 }, { - "epoch": 1.2039489525644114, - "grad_norm": 7.96875, + "epoch": 7.132667617689016, + "grad_norm": 7.375, "learning_rate": 6.667111111111112e-05, - "loss": 0.6778, + "loss": 0.7398, "step": 10000 }, { - "epoch": 1.2039489525644114, - "eval/acc": 46.511627197265625, + "epoch": 7.132667617689016, + "eval/acc": 65.11627960205078, "step": 10000 }, { - "epoch": 1.2039489525644114, - "eval_loss": 2.8400421142578125, - "eval_runtime": 0.2085, - "eval_samples_per_second": 206.266, - "eval_steps_per_second": 4.797, + "epoch": 7.132667617689016, + "eval_loss": 2.0408403873443604, + "eval_runtime": 0.2184, + "eval_samples_per_second": 196.923, + "eval_steps_per_second": 4.58, "step": 10000 }, { - "epoch": 1.2051529015169757, - "grad_norm": 6.21875, + "epoch": 7.139800285306705, + "grad_norm": 8.375, "learning_rate": 6.662666666666668e-05, - "loss": 0.5977, + "loss": 0.8887, "step": 10010 }, { - "epoch": 1.20635685046954, - "grad_norm": 7.03125, + "epoch": 7.146932952924394, + "grad_norm": 8.5, "learning_rate": 6.658222222222223e-05, - "loss": 0.5471, + "loss": 0.8945, "step": 10020 }, { - "epoch": 1.2075607994221045, - "grad_norm": 7.3125, + "epoch": 7.154065620542083, + "grad_norm": 22.5, "learning_rate": 6.653777777777777e-05, - "loss": 0.587, + "loss": 0.7934, "step": 10030 }, { - "epoch": 1.2087647483746689, - "grad_norm": 7.28125, + "epoch": 7.161198288159771, + "grad_norm": 7.34375, "learning_rate": 6.649333333333334e-05, - "loss": 0.5015, + "loss": 0.8056, "step": 10040 }, { - "epoch": 1.2099686973272332, - "grad_norm": 8.3125, + "epoch": 7.168330955777461, + "grad_norm": 7.59375, "learning_rate": 6.64488888888889e-05, - "loss": 0.5784, + "loss": 0.7893, "step": 10050 }, { - "epoch": 1.2111726462797978, - "grad_norm": 6.46875, + "epoch": 7.175463623395149, + "grad_norm": 8.5, "learning_rate": 6.640444444444445e-05, - "loss": 0.5528, + "loss": 1.0099, "step": 10060 }, { - "epoch": 1.2123765952323622, - "grad_norm": 4.8125, + "epoch": 7.182596291012839, + "grad_norm": 8.0625, "learning_rate": 6.636e-05, - "loss": 0.6008, + "loss": 0.8701, "step": 10070 }, { - "epoch": 1.2135805441849266, - "grad_norm": 7.46875, + "epoch": 7.1897289586305275, + "grad_norm": 9.25, "learning_rate": 6.631555555555557e-05, - "loss": 0.5804, + "loss": 0.8203, "step": 10080 }, { - "epoch": 1.214784493137491, - "grad_norm": 8.375, + "epoch": 7.196861626248217, + "grad_norm": 7.90625, "learning_rate": 6.627111111111112e-05, - "loss": 0.5645, + "loss": 0.8197, "step": 10090 }, { - "epoch": 1.2159884420900553, - "grad_norm": 12.0, + "epoch": 7.203994293865906, + "grad_norm": 6.03125, "learning_rate": 6.622666666666667e-05, - "loss": 0.5773, + "loss": 0.8087, "step": 10100 }, { - "epoch": 1.2159884420900553, - "eval/acc": 44.1860466003418, + "epoch": 7.203994293865906, + "eval/acc": 60.46511459350586, "step": 10100 }, { - "epoch": 1.2159884420900553, - "eval_loss": 2.8810744285583496, - "eval_runtime": 0.2091, - "eval_samples_per_second": 205.671, - "eval_steps_per_second": 4.783, + "epoch": 7.203994293865906, + "eval_loss": 1.940862774848938, + "eval_runtime": 0.2277, + "eval_samples_per_second": 188.814, + "eval_steps_per_second": 4.391, "step": 10100 }, { - "epoch": 1.21719239104262, - "grad_norm": 9.625, + "epoch": 7.211126961483595, + "grad_norm": 8.1875, "learning_rate": 6.618222222222223e-05, - "loss": 0.6729, + "loss": 0.7932, "step": 10110 }, { - "epoch": 1.2183963399951843, - "grad_norm": 7.15625, + "epoch": 7.218259629101284, + "grad_norm": 7.4375, "learning_rate": 6.613777777777778e-05, - "loss": 0.613, + "loss": 0.7562, "step": 10120 }, { - "epoch": 1.2196002889477486, - "grad_norm": 5.34375, + "epoch": 7.225392296718973, + "grad_norm": 8.4375, "learning_rate": 6.609333333333334e-05, - "loss": 0.5637, + "loss": 0.8474, "step": 10130 }, { - "epoch": 1.220804237900313, - "grad_norm": 8.1875, + "epoch": 7.232524964336662, + "grad_norm": 8.0, "learning_rate": 6.604888888888889e-05, - "loss": 0.6426, + "loss": 0.8197, "step": 10140 }, { - "epoch": 1.2220081868528774, - "grad_norm": 7.34375, + "epoch": 7.239657631954351, + "grad_norm": 8.0625, "learning_rate": 6.600444444444445e-05, - "loss": 0.5698, + "loss": 0.7804, "step": 10150 }, { - "epoch": 1.2232121358054417, - "grad_norm": 9.25, + "epoch": 7.24679029957204, + "grad_norm": 21.25, "learning_rate": 6.596e-05, - "loss": 0.6375, + "loss": 0.8914, "step": 10160 }, { - "epoch": 1.2244160847580063, - "grad_norm": 6.25, + "epoch": 7.2539229671897285, + "grad_norm": 7.125, "learning_rate": 6.591555555555556e-05, - "loss": 0.5693, + "loss": 0.8185, "step": 10170 }, { - "epoch": 1.2256200337105707, - "grad_norm": 6.4375, + "epoch": 7.261055634807418, + "grad_norm": 6.6875, "learning_rate": 6.587111111111112e-05, - "loss": 0.5378, + "loss": 0.7911, "step": 10180 }, { - "epoch": 1.226823982663135, - "grad_norm": 8.375, + "epoch": 7.268188302425107, + "grad_norm": 7.21875, "learning_rate": 6.582666666666667e-05, - "loss": 0.7013, + "loss": 0.8004, "step": 10190 }, { - "epoch": 1.2280279316156995, - "grad_norm": 5.8125, + "epoch": 7.275320970042796, + "grad_norm": 7.0, "learning_rate": 6.578222222222222e-05, - "loss": 0.6519, + "loss": 0.7226, "step": 10200 }, { - "epoch": 1.2280279316156995, - "eval/acc": 46.511627197265625, + "epoch": 7.275320970042796, + "eval/acc": 60.46511459350586, "step": 10200 }, { - "epoch": 1.2280279316156995, - "eval_loss": 2.8267436027526855, - "eval_runtime": 0.2057, - "eval_samples_per_second": 209.0, - "eval_steps_per_second": 4.86, + "epoch": 7.275320970042796, + "eval_loss": 2.004242420196533, + "eval_runtime": 0.2197, + "eval_samples_per_second": 195.738, + "eval_steps_per_second": 4.552, "step": 10200 }, { - "epoch": 1.2292318805682638, - "grad_norm": 7.1875, + "epoch": 7.282453637660485, + "grad_norm": 16.25, "learning_rate": 6.573777777777778e-05, - "loss": 0.5266, + "loss": 0.8735, "step": 10210 }, { - "epoch": 1.2304358295208284, - "grad_norm": 6.875, + "epoch": 7.289586305278174, + "grad_norm": 6.8125, "learning_rate": 6.569333333333334e-05, - "loss": 0.5686, + "loss": 0.8356, "step": 10220 }, { - "epoch": 1.2316397784733928, - "grad_norm": 8.0, + "epoch": 7.296718972895863, + "grad_norm": 5.65625, "learning_rate": 6.564888888888889e-05, - "loss": 0.6414, + "loss": 0.8032, "step": 10230 }, { - "epoch": 1.2328437274259572, - "grad_norm": 6.8125, + "epoch": 7.303851640513552, + "grad_norm": 6.125, "learning_rate": 6.560444444444444e-05, - "loss": 0.6118, + "loss": 0.7803, "step": 10240 }, { - "epoch": 1.2340476763785215, - "grad_norm": 8.625, + "epoch": 7.310984308131241, + "grad_norm": 9.375, "learning_rate": 6.556e-05, - "loss": 0.5839, + "loss": 0.8748, "step": 10250 }, { - "epoch": 1.235251625331086, - "grad_norm": 7.34375, + "epoch": 7.31811697574893, + "grad_norm": 6.625, "learning_rate": 6.551555555555556e-05, - "loss": 0.6561, + "loss": 0.7793, "step": 10260 }, { - "epoch": 1.2364555742836503, - "grad_norm": 11.5625, + "epoch": 7.325249643366619, + "grad_norm": 13.625, "learning_rate": 6.547111111111111e-05, - "loss": 0.6036, + "loss": 0.8052, "step": 10270 }, { - "epoch": 1.2376595232362149, - "grad_norm": 7.875, + "epoch": 7.3323823109843085, + "grad_norm": 8.6875, "learning_rate": 6.542666666666667e-05, - "loss": 0.5566, + "loss": 0.8387, "step": 10280 }, { - "epoch": 1.2388634721887792, - "grad_norm": 7.59375, + "epoch": 7.339514978601997, + "grad_norm": 6.84375, "learning_rate": 6.538222222222222e-05, - "loss": 0.5778, + "loss": 0.8713, "step": 10290 }, { - "epoch": 1.2400674211413436, - "grad_norm": 7.25, + "epoch": 7.346647646219687, + "grad_norm": 9.875, "learning_rate": 6.533777777777777e-05, - "loss": 0.616, + "loss": 0.7266, "step": 10300 }, { - "epoch": 1.2400674211413436, - "eval/acc": 45.930233001708984, + "epoch": 7.346647646219687, + "eval/acc": 62.79069900512695, "step": 10300 }, { - "epoch": 1.2400674211413436, - "eval_loss": 2.851064682006836, - "eval_runtime": 0.2139, - "eval_samples_per_second": 201.029, - "eval_steps_per_second": 4.675, + "epoch": 7.346647646219687, + "eval_loss": 1.9304108619689941, + "eval_runtime": 0.222, + "eval_samples_per_second": 193.684, + "eval_steps_per_second": 4.504, "step": 10300 }, { - "epoch": 1.241271370093908, - "grad_norm": 5.375, + "epoch": 7.353780313837375, + "grad_norm": 9.5625, "learning_rate": 6.529333333333333e-05, - "loss": 0.5576, + "loss": 0.7775, "step": 10310 }, { - "epoch": 1.2424753190464723, - "grad_norm": 9.9375, + "epoch": 7.360912981455064, + "grad_norm": 8.0625, "learning_rate": 6.52488888888889e-05, - "loss": 0.6111, + "loss": 0.7669, "step": 10320 }, { - "epoch": 1.243679267999037, - "grad_norm": 7.34375, + "epoch": 7.368045649072753, + "grad_norm": 7.0625, "learning_rate": 6.520444444444444e-05, - "loss": 0.6349, + "loss": 0.897, "step": 10330 }, { - "epoch": 1.2448832169516013, - "grad_norm": 7.8125, + "epoch": 7.375178316690442, + "grad_norm": 7.4375, "learning_rate": 6.515999999999999e-05, - "loss": 0.5117, + "loss": 0.7859, "step": 10340 }, { - "epoch": 1.2460871659041657, - "grad_norm": 9.9375, + "epoch": 7.382310984308131, + "grad_norm": 10.25, "learning_rate": 6.511555555555557e-05, - "loss": 0.5363, + "loss": 0.9447, "step": 10350 }, { - "epoch": 1.24729111485673, - "grad_norm": 10.0625, + "epoch": 7.38944365192582, + "grad_norm": 7.21875, "learning_rate": 6.507111111111112e-05, - "loss": 0.694, + "loss": 0.78, "step": 10360 }, { - "epoch": 1.2484950638092944, - "grad_norm": 9.625, + "epoch": 7.3965763195435095, + "grad_norm": 8.625, "learning_rate": 6.502666666666667e-05, - "loss": 0.5528, + "loss": 0.9362, "step": 10370 }, { - "epoch": 1.2496990127618588, - "grad_norm": 7.25, + "epoch": 7.403708987161198, + "grad_norm": 8.125, "learning_rate": 6.498222222222223e-05, - "loss": 0.5428, + "loss": 0.7343, "step": 10380 }, { - "epoch": 1.2509029617144234, - "grad_norm": 7.59375, + "epoch": 7.410841654778888, + "grad_norm": 8.125, "learning_rate": 6.493777777777779e-05, - "loss": 0.6291, + "loss": 0.8328, "step": 10390 }, { - "epoch": 1.2521069106669878, - "grad_norm": 7.28125, + "epoch": 7.417974322396576, + "grad_norm": 7.8125, "learning_rate": 6.489333333333334e-05, - "loss": 0.5882, + "loss": 0.8261, "step": 10400 }, { - "epoch": 1.2521069106669878, - "eval/acc": 46.511627197265625, + "epoch": 7.417974322396576, + "eval/acc": 62.79069900512695, "step": 10400 }, { - "epoch": 1.2521069106669878, - "eval_loss": 2.878549098968506, - "eval_runtime": 0.2109, - "eval_samples_per_second": 203.843, - "eval_steps_per_second": 4.741, + "epoch": 7.417974322396576, + "eval_loss": 1.9274901151657104, + "eval_runtime": 0.2217, + "eval_samples_per_second": 193.989, + "eval_steps_per_second": 4.511, "step": 10400 }, { - "epoch": 1.2533108596195521, - "grad_norm": 9.25, + "epoch": 7.425106990014266, + "grad_norm": 48.0, "learning_rate": 6.484888888888889e-05, - "loss": 0.5934, + "loss": 0.8167, "step": 10410 }, { - "epoch": 1.2545148085721165, - "grad_norm": 6.84375, + "epoch": 7.432239657631954, + "grad_norm": 8.4375, "learning_rate": 6.480444444444445e-05, - "loss": 0.714, + "loss": 0.8262, "step": 10420 }, { - "epoch": 1.2557187575246809, - "grad_norm": 8.375, + "epoch": 7.439372325249644, + "grad_norm": 6.90625, "learning_rate": 6.476e-05, - "loss": 0.7209, + "loss": 0.9254, "step": 10430 }, { - "epoch": 1.2569227064772455, - "grad_norm": 7.96875, + "epoch": 7.446504992867332, + "grad_norm": 8.5625, "learning_rate": 6.471555555555556e-05, - "loss": 0.6045, + "loss": 0.7657, "step": 10440 }, { - "epoch": 1.2581266554298098, - "grad_norm": 9.125, + "epoch": 7.453637660485022, + "grad_norm": 6.875, "learning_rate": 6.467111111111112e-05, - "loss": 0.632, + "loss": 0.8123, "step": 10450 }, { - "epoch": 1.2593306043823742, - "grad_norm": 6.9375, + "epoch": 7.4607703281027105, + "grad_norm": 8.5625, "learning_rate": 6.462666666666667e-05, - "loss": 0.5078, + "loss": 0.8951, "step": 10460 }, { - "epoch": 1.2605345533349386, - "grad_norm": 7.375, + "epoch": 7.467902995720399, + "grad_norm": 7.46875, "learning_rate": 6.458222222222222e-05, - "loss": 0.6352, + "loss": 0.8287, "step": 10470 }, { - "epoch": 1.261738502287503, - "grad_norm": 7.375, + "epoch": 7.4750356633380886, + "grad_norm": 6.28125, "learning_rate": 6.453777777777778e-05, - "loss": 0.6588, + "loss": 0.7364, "step": 10480 }, { - "epoch": 1.2629424512400673, - "grad_norm": 8.625, + "epoch": 7.482168330955777, + "grad_norm": 7.625, "learning_rate": 6.449333333333334e-05, - "loss": 0.6612, + "loss": 0.9265, "step": 10490 }, { - "epoch": 1.264146400192632, - "grad_norm": 6.78125, + "epoch": 7.489300998573467, + "grad_norm": 7.15625, "learning_rate": 6.444888888888889e-05, - "loss": 0.5578, + "loss": 0.7547, "step": 10500 }, { - "epoch": 1.264146400192632, - "eval/acc": 46.511627197265625, + "epoch": 7.489300998573467, + "eval/acc": 62.79069900512695, "step": 10500 }, { - "epoch": 1.264146400192632, - "eval_loss": 2.8044533729553223, - "eval_runtime": 0.2198, - "eval_samples_per_second": 195.595, - "eval_steps_per_second": 4.549, + "epoch": 7.489300998573467, + "eval_loss": 1.9239764213562012, + "eval_runtime": 0.2285, + "eval_samples_per_second": 188.187, + "eval_steps_per_second": 4.376, "step": 10500 }, { - "epoch": 1.2653503491451963, - "grad_norm": 7.0625, + "epoch": 7.496433666191155, + "grad_norm": 7.875, "learning_rate": 6.440444444444444e-05, - "loss": 0.5674, + "loss": 0.8612, "step": 10510 }, { - "epoch": 1.2665542980977607, - "grad_norm": 7.5, + "epoch": 7.503566333808845, + "grad_norm": 7.46875, "learning_rate": 6.436e-05, - "loss": 0.5692, + "loss": 0.8751, "step": 10520 }, { - "epoch": 1.267758247050325, - "grad_norm": 6.96875, + "epoch": 7.510699001426533, + "grad_norm": 6.78125, "learning_rate": 6.431555555555556e-05, - "loss": 0.5209, + "loss": 0.7706, "step": 10530 }, { - "epoch": 1.2689621960028894, - "grad_norm": 6.625, + "epoch": 7.517831669044223, + "grad_norm": 6.375, "learning_rate": 6.427111111111111e-05, - "loss": 0.7402, + "loss": 0.7602, "step": 10540 }, { - "epoch": 1.270166144955454, - "grad_norm": 8.5625, + "epoch": 7.5249643366619114, + "grad_norm": 7.1875, "learning_rate": 6.422666666666667e-05, - "loss": 0.6213, + "loss": 0.7953, "step": 10550 }, { - "epoch": 1.2713700939080184, - "grad_norm": 6.625, + "epoch": 7.532097004279601, + "grad_norm": 6.5, "learning_rate": 6.418222222222222e-05, - "loss": 0.587, + "loss": 0.871, "step": 10560 }, { - "epoch": 1.2725740428605827, - "grad_norm": 8.3125, + "epoch": 7.5392296718972895, + "grad_norm": 6.65625, "learning_rate": 6.413777777777778e-05, - "loss": 0.5949, + "loss": 0.7343, "step": 10570 }, { - "epoch": 1.273777991813147, - "grad_norm": 5.9375, + "epoch": 7.546362339514978, + "grad_norm": 6.3125, "learning_rate": 6.409333333333333e-05, - "loss": 0.5501, + "loss": 0.8275, "step": 10580 }, { - "epoch": 1.2749819407657115, - "grad_norm": 4.59375, + "epoch": 7.553495007132668, + "grad_norm": 6.125, "learning_rate": 6.40488888888889e-05, - "loss": 0.5145, + "loss": 0.8243, "step": 10590 }, { - "epoch": 1.2761858897182758, - "grad_norm": 8.6875, + "epoch": 7.560627674750357, + "grad_norm": 7.75, "learning_rate": 6.400444444444444e-05, - "loss": 0.6859, + "loss": 0.8731, "step": 10600 }, { - "epoch": 1.2761858897182758, - "eval/acc": 46.511627197265625, + "epoch": 7.560627674750357, + "eval/acc": 58.13953399658203, "step": 10600 }, { - "epoch": 1.2761858897182758, - "eval_loss": 2.836024045944214, - "eval_runtime": 0.2165, - "eval_samples_per_second": 198.581, - "eval_steps_per_second": 4.618, + "epoch": 7.560627674750357, + "eval_loss": 1.9751547574996948, + "eval_runtime": 0.2209, + "eval_samples_per_second": 194.634, + "eval_steps_per_second": 4.526, "step": 10600 }, { - "epoch": 1.2773898386708404, - "grad_norm": 7.6875, + "epoch": 7.567760342368046, + "grad_norm": 7.34375, "learning_rate": 6.396e-05, - "loss": 0.5479, + "loss": 0.7555, "step": 10610 }, { - "epoch": 1.2785937876234048, - "grad_norm": 6.46875, + "epoch": 7.574893009985734, + "grad_norm": 9.1875, "learning_rate": 6.391555555555557e-05, - "loss": 0.6267, + "loss": 0.7415, "step": 10620 }, { - "epoch": 1.2797977365759692, - "grad_norm": 7.8125, + "epoch": 7.582025677603424, + "grad_norm": 11.875, "learning_rate": 6.387111111111111e-05, - "loss": 0.6473, + "loss": 0.7363, "step": 10630 }, { - "epoch": 1.2810016855285336, - "grad_norm": 8.75, + "epoch": 7.589158345221112, + "grad_norm": 7.90625, "learning_rate": 6.382666666666666e-05, - "loss": 0.7012, + "loss": 0.858, "step": 10640 }, { - "epoch": 1.282205634481098, - "grad_norm": 7.0625, + "epoch": 7.596291012838802, + "grad_norm": 8.25, "learning_rate": 6.378222222222223e-05, - "loss": 0.6147, + "loss": 0.7934, "step": 10650 }, { - "epoch": 1.2834095834336625, - "grad_norm": 8.1875, + "epoch": 7.6034236804564905, + "grad_norm": 6.84375, "learning_rate": 6.373777777777779e-05, - "loss": 0.6508, + "loss": 0.7867, "step": 10660 }, { - "epoch": 1.2846135323862269, - "grad_norm": 7.21875, + "epoch": 7.61055634807418, + "grad_norm": 8.3125, "learning_rate": 6.369333333333334e-05, - "loss": 0.5718, + "loss": 0.8519, "step": 10670 }, { - "epoch": 1.2858174813387913, - "grad_norm": 6.40625, + "epoch": 7.617689015691869, + "grad_norm": 8.25, "learning_rate": 6.36488888888889e-05, - "loss": 0.6092, + "loss": 0.8771, "step": 10680 }, { - "epoch": 1.2870214302913556, - "grad_norm": 8.5625, + "epoch": 7.624821683309558, + "grad_norm": 6.1875, "learning_rate": 6.360444444444445e-05, - "loss": 0.6562, + "loss": 0.8483, "step": 10690 }, { - "epoch": 1.28822537924392, - "grad_norm": 6.71875, + "epoch": 7.631954350927247, + "grad_norm": 34.25, "learning_rate": 6.356000000000001e-05, - "loss": 0.5452, + "loss": 0.8799, "step": 10700 }, { - "epoch": 1.28822537924392, - "eval/acc": 41.27906799316406, + "epoch": 7.631954350927247, + "eval/acc": 62.79069900512695, "step": 10700 }, { - "epoch": 1.28822537924392, - "eval_loss": 2.846574306488037, - "eval_runtime": 0.2094, - "eval_samples_per_second": 205.316, - "eval_steps_per_second": 4.775, + "epoch": 7.631954350927247, + "eval_loss": 1.9270039796829224, + "eval_runtime": 0.2382, + "eval_samples_per_second": 180.545, + "eval_steps_per_second": 4.199, "step": 10700 }, { - "epoch": 1.2894293281964844, - "grad_norm": 7.09375, + "epoch": 7.639087018544936, + "grad_norm": 7.875, "learning_rate": 6.351555555555556e-05, - "loss": 0.5298, + "loss": 0.8321, "step": 10710 }, { - "epoch": 1.290633277149049, - "grad_norm": 7.8125, + "epoch": 7.646219686162625, + "grad_norm": 7.0, "learning_rate": 6.347111111111112e-05, - "loss": 0.6176, + "loss": 0.8192, "step": 10720 }, { - "epoch": 1.2918372261016133, - "grad_norm": 6.0, + "epoch": 7.653352353780313, + "grad_norm": 8.25, "learning_rate": 6.342666666666667e-05, - "loss": 0.527, + "loss": 0.7631, "step": 10730 }, { - "epoch": 1.2930411750541777, - "grad_norm": 20.0, + "epoch": 7.660485021398003, + "grad_norm": 7.1875, "learning_rate": 6.338222222222222e-05, - "loss": 0.6201, + "loss": 0.8088, "step": 10740 }, { - "epoch": 1.294245124006742, - "grad_norm": 8.25, + "epoch": 7.6676176890156915, + "grad_norm": 6.5, "learning_rate": 6.333777777777779e-05, - "loss": 0.6072, + "loss": 0.7612, "step": 10750 }, { - "epoch": 1.2954490729593064, - "grad_norm": 10.3125, + "epoch": 7.674750356633381, + "grad_norm": 7.5, "learning_rate": 6.329333333333334e-05, - "loss": 0.6123, + "loss": 0.8282, "step": 10760 }, { - "epoch": 1.296653021911871, - "grad_norm": 7.0625, + "epoch": 7.68188302425107, + "grad_norm": 6.46875, "learning_rate": 6.324888888888889e-05, - "loss": 0.5529, + "loss": 0.8197, "step": 10770 }, { - "epoch": 1.2978569708644354, - "grad_norm": 7.5625, + "epoch": 7.689015691868759, + "grad_norm": 12.0, "learning_rate": 6.320444444444445e-05, - "loss": 0.5527, + "loss": 0.8304, "step": 10780 }, { - "epoch": 1.2990609198169998, - "grad_norm": 9.375, + "epoch": 7.696148359486448, + "grad_norm": 7.875, "learning_rate": 6.316000000000001e-05, - "loss": 0.562, + "loss": 0.8242, "step": 10790 }, { - "epoch": 1.3002648687695642, - "grad_norm": 6.0, + "epoch": 7.703281027104137, + "grad_norm": 7.34375, "learning_rate": 6.311555555555556e-05, - "loss": 0.5706, + "loss": 0.7904, "step": 10800 }, { - "epoch": 1.3002648687695642, - "eval/acc": 39.53488540649414, + "epoch": 7.703281027104137, + "eval/acc": 60.46511459350586, "step": 10800 }, { - "epoch": 1.3002648687695642, - "eval_loss": 2.8325037956237793, - "eval_runtime": 0.9046, - "eval_samples_per_second": 47.536, - "eval_steps_per_second": 1.105, + "epoch": 7.703281027104137, + "eval_loss": 1.931999683380127, + "eval_runtime": 0.2226, + "eval_samples_per_second": 193.21, + "eval_steps_per_second": 4.493, "step": 10800 }, { - "epoch": 1.3014688177221285, - "grad_norm": 6.6875, + "epoch": 7.710413694721826, + "grad_norm": 8.625, "learning_rate": 6.307111111111111e-05, - "loss": 0.5263, + "loss": 0.861, "step": 10810 }, { - "epoch": 1.302672766674693, - "grad_norm": 7.3125, + "epoch": 7.717546362339515, + "grad_norm": 12.0, "learning_rate": 6.302666666666667e-05, - "loss": 0.5811, + "loss": 0.7917, "step": 10820 }, { - "epoch": 1.3038767156272573, - "grad_norm": 7.65625, + "epoch": 7.724679029957204, + "grad_norm": 6.5, "learning_rate": 6.298222222222222e-05, - "loss": 0.6056, + "loss": 0.709, "step": 10830 }, { - "epoch": 1.3050806645798219, - "grad_norm": 7.1875, + "epoch": 7.731811697574893, + "grad_norm": 6.96875, "learning_rate": 6.293777777777778e-05, - "loss": 0.625, + "loss": 0.8168, "step": 10840 }, { - "epoch": 1.3062846135323862, - "grad_norm": 8.0625, + "epoch": 7.738944365192582, + "grad_norm": 7.625, "learning_rate": 6.289333333333334e-05, - "loss": 0.5916, + "loss": 0.7357, "step": 10850 }, { - "epoch": 1.3074885624849506, - "grad_norm": 7.46875, + "epoch": 7.7460770328102715, + "grad_norm": 17.125, "learning_rate": 6.284888888888889e-05, - "loss": 0.5399, + "loss": 0.7115, "step": 10860 }, { - "epoch": 1.308692511437515, - "grad_norm": 6.21875, + "epoch": 7.75320970042796, + "grad_norm": 6.78125, "learning_rate": 6.280444444444444e-05, - "loss": 0.5895, + "loss": 0.6973, "step": 10870 }, { - "epoch": 1.3098964603900796, - "grad_norm": 15.5, + "epoch": 7.760342368045649, + "grad_norm": 6.75, "learning_rate": 6.276e-05, - "loss": 0.6447, + "loss": 0.7925, "step": 10880 }, { - "epoch": 1.311100409342644, - "grad_norm": 10.625, + "epoch": 7.767475035663338, + "grad_norm": 6.78125, "learning_rate": 6.271555555555556e-05, - "loss": 0.6577, + "loss": 0.7927, "step": 10890 }, { - "epoch": 1.3123043582952083, - "grad_norm": 8.0625, + "epoch": 7.774607703281027, + "grad_norm": 7.375, "learning_rate": 6.267111111111111e-05, - "loss": 0.6119, + "loss": 0.9383, "step": 10900 }, { - "epoch": 1.3123043582952083, - "eval/acc": 44.1860466003418, + "epoch": 7.774607703281027, + "eval/acc": 62.79069900512695, "step": 10900 }, { - "epoch": 1.3123043582952083, - "eval_loss": 2.8269896507263184, - "eval_runtime": 0.2116, - "eval_samples_per_second": 203.171, - "eval_steps_per_second": 4.725, + "epoch": 7.774607703281027, + "eval_loss": 1.947619915008545, + "eval_runtime": 0.2178, + "eval_samples_per_second": 197.442, + "eval_steps_per_second": 4.592, "step": 10900 }, { - "epoch": 1.3135083072477727, - "grad_norm": 6.21875, + "epoch": 7.781740370898716, + "grad_norm": 13.75, "learning_rate": 6.262666666666666e-05, - "loss": 0.5292, + "loss": 0.8463, "step": 10910 }, { - "epoch": 1.314712256200337, - "grad_norm": 7.5, + "epoch": 7.788873038516405, + "grad_norm": 10.875, "learning_rate": 6.258222222222222e-05, - "loss": 0.6174, + "loss": 0.7938, "step": 10920 }, { - "epoch": 1.3159162051529014, - "grad_norm": 7.71875, + "epoch": 7.796005706134094, + "grad_norm": 7.75, "learning_rate": 6.253777777777779e-05, - "loss": 0.6011, + "loss": 0.8174, "step": 10930 }, { - "epoch": 1.3171201541054658, - "grad_norm": 6.375, + "epoch": 7.803138373751783, + "grad_norm": 6.3125, "learning_rate": 6.249333333333333e-05, - "loss": 0.6249, + "loss": 0.7583, "step": 10940 }, { - "epoch": 1.3183241030580304, - "grad_norm": 7.1875, + "epoch": 7.8102710413694725, + "grad_norm": 6.625, "learning_rate": 6.24488888888889e-05, - "loss": 0.5995, + "loss": 0.7677, "step": 10950 }, { - "epoch": 1.3195280520105948, - "grad_norm": 6.46875, + "epoch": 7.817403708987161, + "grad_norm": 7.03125, "learning_rate": 6.240444444444444e-05, - "loss": 0.5469, + "loss": 0.8211, "step": 10960 }, { - "epoch": 1.3207320009631591, - "grad_norm": 8.1875, + "epoch": 7.824536376604851, + "grad_norm": 6.78125, "learning_rate": 6.236e-05, - "loss": 0.6817, + "loss": 0.8165, "step": 10970 }, { - "epoch": 1.3219359499157235, - "grad_norm": 8.875, + "epoch": 7.831669044222539, + "grad_norm": 7.25, "learning_rate": 6.231555555555555e-05, - "loss": 0.6015, + "loss": 0.8452, "step": 10980 }, { - "epoch": 1.323139898868288, - "grad_norm": 9.25, + "epoch": 7.838801711840228, + "grad_norm": 7.78125, "learning_rate": 6.227111111111112e-05, - "loss": 0.5469, + "loss": 0.7316, "step": 10990 }, { - "epoch": 1.3243438478208525, - "grad_norm": 5.4375, + "epoch": 7.845934379457917, + "grad_norm": 7.1875, "learning_rate": 6.222666666666666e-05, - "loss": 0.6355, + "loss": 0.7908, "step": 11000 }, { - "epoch": 1.3243438478208525, - "eval/acc": 44.1860466003418, + "epoch": 7.845934379457917, + "eval/acc": 60.46511459350586, "step": 11000 }, { - "epoch": 1.3243438478208525, - "eval_loss": 2.7861974239349365, - "eval_runtime": 0.2061, - "eval_samples_per_second": 208.615, - "eval_steps_per_second": 4.852, + "epoch": 7.845934379457917, + "eval_loss": 1.9402235746383667, + "eval_runtime": 0.2151, + "eval_samples_per_second": 199.927, + "eval_steps_per_second": 4.649, "step": 11000 }, { - "epoch": 1.3255477967734168, - "grad_norm": 6.5, + "epoch": 7.853067047075607, + "grad_norm": 7.3125, "learning_rate": 6.218222222222223e-05, - "loss": 0.6631, + "loss": 0.8226, "step": 11010 }, { - "epoch": 1.3267517457259812, - "grad_norm": 8.5625, + "epoch": 7.860199714693295, + "grad_norm": 6.1875, "learning_rate": 6.213777777777779e-05, - "loss": 0.7161, + "loss": 0.7946, "step": 11020 }, { - "epoch": 1.3279556946785456, - "grad_norm": 6.78125, + "epoch": 7.867332382310984, + "grad_norm": 6.21875, "learning_rate": 6.209333333333334e-05, - "loss": 0.5578, + "loss": 0.8494, "step": 11030 }, { - "epoch": 1.32915964363111, - "grad_norm": 9.0625, + "epoch": 7.8744650499286735, + "grad_norm": 6.875, "learning_rate": 6.204888888888889e-05, - "loss": 0.5986, + "loss": 0.7066, "step": 11040 }, { - "epoch": 1.3303635925836743, - "grad_norm": 10.25, + "epoch": 7.881597717546362, + "grad_norm": 6.375, "learning_rate": 6.200444444444445e-05, - "loss": 0.5198, + "loss": 0.8499, "step": 11050 }, { - "epoch": 1.331567541536239, - "grad_norm": 3.796875, + "epoch": 7.888730385164052, + "grad_norm": 8.0, "learning_rate": 6.196000000000001e-05, - "loss": 0.5459, + "loss": 0.8761, "step": 11060 }, { - "epoch": 1.3327714904888033, - "grad_norm": 7.125, + "epoch": 7.89586305278174, + "grad_norm": 5.75, "learning_rate": 6.191555555555556e-05, - "loss": 0.5896, + "loss": 0.8536, "step": 11070 }, { - "epoch": 1.3339754394413676, - "grad_norm": 7.34375, + "epoch": 7.90299572039943, + "grad_norm": 7.0, "learning_rate": 6.18711111111111e-05, - "loss": 0.5403, + "loss": 0.9413, "step": 11080 }, { - "epoch": 1.335179388393932, - "grad_norm": 9.125, + "epoch": 7.910128388017118, + "grad_norm": 8.0, "learning_rate": 6.182666666666667e-05, - "loss": 0.6377, + "loss": 0.7626, "step": 11090 }, { - "epoch": 1.3363833373464966, - "grad_norm": 8.8125, + "epoch": 7.917261055634808, + "grad_norm": 6.375, "learning_rate": 6.178222222222223e-05, - "loss": 0.6292, + "loss": 0.8177, "step": 11100 }, { - "epoch": 1.3363833373464966, - "eval/acc": 41.86046600341797, + "epoch": 7.917261055634808, + "eval/acc": 65.11627960205078, "step": 11100 }, { - "epoch": 1.3363833373464966, - "eval_loss": 2.8322744369506836, - "eval_runtime": 0.2115, - "eval_samples_per_second": 203.288, - "eval_steps_per_second": 4.728, + "epoch": 7.917261055634808, + "eval_loss": 1.8976689577102661, + "eval_runtime": 0.2399, + "eval_samples_per_second": 179.237, + "eval_steps_per_second": 4.168, "step": 11100 }, { - "epoch": 1.337587286299061, - "grad_norm": 6.28125, + "epoch": 7.924393723252496, + "grad_norm": 7.4375, "learning_rate": 6.173777777777778e-05, - "loss": 0.6421, + "loss": 0.8178, "step": 11110 }, { - "epoch": 1.3387912352516254, - "grad_norm": 6.21875, + "epoch": 7.931526390870186, + "grad_norm": 35.25, "learning_rate": 6.169333333333334e-05, - "loss": 0.6178, + "loss": 0.7931, "step": 11120 }, { - "epoch": 1.3399951842041897, - "grad_norm": 6.8125, + "epoch": 7.9386590584878745, + "grad_norm": 26.75, "learning_rate": 6.164888888888889e-05, - "loss": 0.704, + "loss": 0.7883, "step": 11130 }, { - "epoch": 1.341199133156754, - "grad_norm": 9.1875, + "epoch": 7.945791726105563, + "grad_norm": 6.375, "learning_rate": 6.160444444444444e-05, - "loss": 0.5763, + "loss": 0.7407, "step": 11140 }, { - "epoch": 1.3424030821093185, - "grad_norm": 9.625, + "epoch": 7.9529243937232525, + "grad_norm": 8.5, "learning_rate": 6.156e-05, - "loss": 0.6345, + "loss": 0.8509, "step": 11150 }, { - "epoch": 1.3436070310618828, - "grad_norm": 7.15625, + "epoch": 7.960057061340941, + "grad_norm": 7.34375, "learning_rate": 6.151555555555556e-05, - "loss": 0.5969, + "loss": 0.7948, "step": 11160 }, { - "epoch": 1.3448109800144474, - "grad_norm": 7.65625, + "epoch": 7.967189728958631, + "grad_norm": 5.90625, "learning_rate": 6.147111111111111e-05, - "loss": 0.6219, + "loss": 0.8066, "step": 11170 }, { - "epoch": 1.3460149289670118, - "grad_norm": 7.46875, + "epoch": 7.974322396576319, + "grad_norm": 6.8125, "learning_rate": 6.142666666666666e-05, - "loss": 0.5902, + "loss": 0.7545, "step": 11180 }, { - "epoch": 1.3472188779195762, - "grad_norm": 8.4375, + "epoch": 7.981455064194009, + "grad_norm": 7.40625, "learning_rate": 6.138222222222223e-05, - "loss": 0.6771, + "loss": 0.8842, "step": 11190 }, { - "epoch": 1.3484228268721405, - "grad_norm": 9.875, + "epoch": 7.988587731811697, + "grad_norm": 8.625, "learning_rate": 6.133777777777778e-05, - "loss": 0.5981, + "loss": 0.8874, "step": 11200 }, { - "epoch": 1.3484228268721405, - "eval/acc": 41.86046600341797, + "epoch": 7.988587731811697, + "eval/acc": 60.46511459350586, "step": 11200 }, { - "epoch": 1.3484228268721405, - "eval_loss": 2.8496346473693848, - "eval_runtime": 0.2147, - "eval_samples_per_second": 200.315, - "eval_steps_per_second": 4.658, + "epoch": 7.988587731811697, + "eval_loss": 1.9585436582565308, + "eval_runtime": 0.2145, + "eval_samples_per_second": 200.499, + "eval_steps_per_second": 4.663, "step": 11200 }, { - "epoch": 1.3496267758247051, - "grad_norm": 8.4375, + "epoch": 7.995720399429387, + "grad_norm": 10.125, "learning_rate": 6.129333333333333e-05, - "loss": 0.6014, + "loss": 0.8723, "step": 11210 }, { - "epoch": 1.3508307247772695, - "grad_norm": 6.625, + "epoch": 8.002853067047075, + "grad_norm": 6.375, "learning_rate": 6.12488888888889e-05, - "loss": 0.5484, + "loss": 0.7986, "step": 11220 }, { - "epoch": 1.3520346737298339, - "grad_norm": 8.8125, + "epoch": 8.009985734664765, + "grad_norm": 7.34375, "learning_rate": 6.120444444444444e-05, - "loss": 0.6505, + "loss": 0.8382, "step": 11230 }, { - "epoch": 1.3532386226823983, - "grad_norm": 9.0, + "epoch": 8.017118402282454, + "grad_norm": 6.21875, "learning_rate": 6.116e-05, - "loss": 0.7428, + "loss": 0.796, "step": 11240 }, { - "epoch": 1.3544425716349626, - "grad_norm": 6.03125, + "epoch": 8.024251069900142, + "grad_norm": 30.5, "learning_rate": 6.111555555555557e-05, - "loss": 0.5092, + "loss": 0.8541, "step": 11250 }, { - "epoch": 1.355646520587527, - "grad_norm": 8.375, + "epoch": 8.031383737517832, + "grad_norm": 7.90625, "learning_rate": 6.107111111111111e-05, - "loss": 0.6589, + "loss": 0.7689, "step": 11260 }, { - "epoch": 1.3568504695400914, - "grad_norm": 5.1875, + "epoch": 8.038516405135521, + "grad_norm": 10.375, "learning_rate": 6.102666666666666e-05, - "loss": 0.7026, + "loss": 0.803, "step": 11270 }, { - "epoch": 1.358054418492656, - "grad_norm": 19.125, + "epoch": 8.045649072753209, + "grad_norm": 8.3125, "learning_rate": 6.098222222222223e-05, - "loss": 0.6705, + "loss": 0.9584, "step": 11280 }, { - "epoch": 1.3592583674452203, - "grad_norm": 5.625, + "epoch": 8.052781740370898, + "grad_norm": 7.8125, "learning_rate": 6.093777777777778e-05, - "loss": 0.6484, + "loss": 0.761, "step": 11290 }, { - "epoch": 1.3604623163977847, - "grad_norm": 9.9375, + "epoch": 8.059914407988588, + "grad_norm": 9.125, "learning_rate": 6.0893333333333335e-05, - "loss": 0.5762, + "loss": 0.7506, "step": 11300 }, { - "epoch": 1.3604623163977847, - "eval/acc": 47.67441940307617, + "epoch": 8.059914407988588, + "eval/acc": 48.83720779418945, "step": 11300 }, { - "epoch": 1.3604623163977847, - "eval_loss": 2.804438591003418, - "eval_runtime": 0.2105, - "eval_samples_per_second": 204.261, - "eval_steps_per_second": 4.75, + "epoch": 8.059914407988588, + "eval_loss": 2.348471164703369, + "eval_runtime": 0.9666, + "eval_samples_per_second": 44.484, + "eval_steps_per_second": 1.035, "step": 11300 }, { - "epoch": 1.361666265350349, - "grad_norm": 11.4375, + "epoch": 8.067047075606277, + "grad_norm": 9.0625, "learning_rate": 6.084888888888889e-05, - "loss": 0.63, + "loss": 0.7246, "step": 11310 }, { - "epoch": 1.3628702143029137, - "grad_norm": 7.625, + "epoch": 8.074179743223965, + "grad_norm": 24.5, "learning_rate": 6.080444444444445e-05, - "loss": 0.5893, + "loss": 0.8399, "step": 11320 }, { - "epoch": 1.364074163255478, - "grad_norm": 7.5625, + "epoch": 8.081312410841655, + "grad_norm": 8.0625, "learning_rate": 6.076000000000001e-05, - "loss": 0.6789, + "loss": 0.8196, "step": 11330 }, { - "epoch": 1.3652781122080424, - "grad_norm": 6.875, + "epoch": 8.088445078459344, + "grad_norm": 7.5625, "learning_rate": 6.0715555555555556e-05, - "loss": 0.5739, + "loss": 0.7496, "step": 11340 }, { - "epoch": 1.3664820611606068, - "grad_norm": 7.9375, + "epoch": 8.095577746077034, + "grad_norm": 10.6875, "learning_rate": 6.067111111111111e-05, - "loss": 0.593, + "loss": 0.791, "step": 11350 }, { - "epoch": 1.3676860101131711, - "grad_norm": 6.03125, + "epoch": 8.102710413694721, + "grad_norm": 7.28125, "learning_rate": 6.062666666666667e-05, - "loss": 0.6003, + "loss": 0.7064, "step": 11360 }, { - "epoch": 1.3688899590657355, - "grad_norm": 7.21875, + "epoch": 8.10984308131241, + "grad_norm": 7.28125, "learning_rate": 6.058222222222223e-05, - "loss": 0.6658, + "loss": 0.8306, "step": 11370 }, { - "epoch": 1.3700939080182999, - "grad_norm": 6.25, + "epoch": 8.1169757489301, + "grad_norm": 7.84375, "learning_rate": 6.0537777777777784e-05, - "loss": 0.5438, + "loss": 0.8394, "step": 11380 }, { - "epoch": 1.3712978569708645, - "grad_norm": 7.21875, + "epoch": 8.12410841654779, + "grad_norm": 6.5625, "learning_rate": 6.049333333333333e-05, - "loss": 0.5269, + "loss": 0.789, "step": 11390 }, { - "epoch": 1.3725018059234289, - "grad_norm": 8.875, + "epoch": 8.131241084165477, + "grad_norm": 7.125, "learning_rate": 6.044888888888889e-05, - "loss": 0.5357, + "loss": 0.7752, "step": 11400 }, { - "epoch": 1.3725018059234289, - "eval/acc": 45.930233001708984, + "epoch": 8.131241084165477, + "eval/acc": 48.83720779418945, "step": 11400 }, { - "epoch": 1.3725018059234289, - "eval_loss": 2.79295015335083, - "eval_runtime": 0.5452, - "eval_samples_per_second": 78.877, - "eval_steps_per_second": 1.834, + "epoch": 8.131241084165477, + "eval_loss": 2.3455872535705566, + "eval_runtime": 0.2144, + "eval_samples_per_second": 200.559, + "eval_steps_per_second": 4.664, "step": 11400 }, { - "epoch": 1.3737057548759932, - "grad_norm": 6.375, + "epoch": 8.138373751783167, + "grad_norm": 6.75, "learning_rate": 6.040444444444445e-05, - "loss": 0.6255, + "loss": 0.7773, "step": 11410 }, { - "epoch": 1.3749097038285576, - "grad_norm": 6.96875, + "epoch": 8.145506419400856, + "grad_norm": 7.8125, "learning_rate": 6.0360000000000005e-05, - "loss": 0.6689, + "loss": 0.7369, "step": 11420 }, { - "epoch": 1.3761136527811222, - "grad_norm": 7.28125, + "epoch": 8.152639087018544, + "grad_norm": 6.4375, "learning_rate": 6.031555555555556e-05, - "loss": 0.5944, + "loss": 0.8158, "step": 11430 }, { - "epoch": 1.3773176017336866, - "grad_norm": 5.90625, + "epoch": 8.159771754636234, + "grad_norm": 7.53125, "learning_rate": 6.027111111111111e-05, - "loss": 0.6054, + "loss": 0.874, "step": 11440 }, { - "epoch": 1.378521550686251, - "grad_norm": 7.09375, + "epoch": 8.166904422253923, + "grad_norm": 8.0625, "learning_rate": 6.0226666666666664e-05, - "loss": 0.5204, + "loss": 0.7564, "step": 11450 }, { - "epoch": 1.3797254996388153, - "grad_norm": 8.125, + "epoch": 8.174037089871613, + "grad_norm": 6.65625, "learning_rate": 6.0182222222222226e-05, - "loss": 0.5088, + "loss": 0.8675, "step": 11460 }, { - "epoch": 1.3809294485913797, - "grad_norm": 7.96875, + "epoch": 8.1811697574893, + "grad_norm": 7.34375, "learning_rate": 6.013777777777778e-05, - "loss": 0.5873, + "loss": 0.8338, "step": 11470 }, { - "epoch": 1.382133397543944, - "grad_norm": 7.8125, + "epoch": 8.18830242510699, + "grad_norm": 8.75, "learning_rate": 6.0093333333333336e-05, - "loss": 0.5889, + "loss": 0.7316, "step": 11480 }, { - "epoch": 1.3833373464965084, - "grad_norm": 7.3125, + "epoch": 8.19543509272468, + "grad_norm": 8.625, "learning_rate": 6.0048888888888885e-05, - "loss": 0.6799, + "loss": 0.8842, "step": 11490 }, { - "epoch": 1.384541295449073, - "grad_norm": 17.0, + "epoch": 8.202567760342369, + "grad_norm": 11.3125, "learning_rate": 6.0004444444444453e-05, - "loss": 0.5965, + "loss": 0.7852, "step": 11500 }, { - "epoch": 1.384541295449073, - "eval/acc": 46.511627197265625, + "epoch": 8.202567760342369, + "eval/acc": 48.83720779418945, "step": 11500 }, { - "epoch": 1.384541295449073, - "eval_loss": 2.8261971473693848, - "eval_runtime": 1.1762, - "eval_samples_per_second": 36.557, - "eval_steps_per_second": 0.85, + "epoch": 8.202567760342369, + "eval_loss": 2.352907657623291, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.909, + "eval_steps_per_second": 4.719, "step": 11500 }, { - "epoch": 1.3857452444016374, - "grad_norm": 7.5, + "epoch": 8.209700427960057, + "grad_norm": 7.9375, "learning_rate": 5.996e-05, - "loss": 0.5685, + "loss": 0.7898, "step": 11510 }, { - "epoch": 1.3869491933542017, - "grad_norm": 7.8125, + "epoch": 8.216833095577746, + "grad_norm": 7.21875, "learning_rate": 5.991555555555556e-05, - "loss": 0.6299, + "loss": 0.7728, "step": 11520 }, { - "epoch": 1.3881531423067661, - "grad_norm": 6.34375, + "epoch": 8.223965763195435, + "grad_norm": 8.1875, "learning_rate": 5.987111111111111e-05, - "loss": 0.6243, + "loss": 0.7456, "step": 11530 }, { - "epoch": 1.3893570912593307, - "grad_norm": 7.84375, + "epoch": 8.231098430813125, + "grad_norm": 7.1875, "learning_rate": 5.982666666666666e-05, - "loss": 0.6387, + "loss": 0.8461, "step": 11540 }, { - "epoch": 1.390561040211895, - "grad_norm": 7.5, + "epoch": 8.238231098430813, + "grad_norm": 7.9375, "learning_rate": 5.978222222222223e-05, - "loss": 0.6561, + "loss": 0.7297, "step": 11550 }, { - "epoch": 1.3917649891644595, - "grad_norm": 9.125, + "epoch": 8.245363766048502, + "grad_norm": 6.75, "learning_rate": 5.973777777777778e-05, - "loss": 0.6064, + "loss": 0.8327, "step": 11560 }, { - "epoch": 1.3929689381170238, - "grad_norm": 8.3125, + "epoch": 8.252496433666192, + "grad_norm": 6.1875, "learning_rate": 5.969333333333333e-05, - "loss": 0.6107, + "loss": 0.8054, "step": 11570 }, { - "epoch": 1.3941728870695882, - "grad_norm": 8.0, + "epoch": 8.25962910128388, + "grad_norm": 8.375, "learning_rate": 5.964888888888889e-05, - "loss": 0.7101, + "loss": 0.7853, "step": 11580 }, { - "epoch": 1.3953768360221526, - "grad_norm": 4.84375, + "epoch": 8.266761768901569, + "grad_norm": 8.9375, "learning_rate": 5.960444444444445e-05, - "loss": 0.5981, + "loss": 0.7891, "step": 11590 }, { - "epoch": 1.396580784974717, - "grad_norm": 5.25, + "epoch": 8.273894436519258, + "grad_norm": 7.8125, "learning_rate": 5.9560000000000006e-05, - "loss": 0.5498, + "loss": 0.7407, "step": 11600 }, { - "epoch": 1.396580784974717, - "eval/acc": 44.1860466003418, + "epoch": 8.273894436519258, + "eval/acc": 55.8139533996582, "step": 11600 }, { - "epoch": 1.396580784974717, - "eval_loss": 2.822185516357422, - "eval_runtime": 2.963, - "eval_samples_per_second": 14.512, - "eval_steps_per_second": 0.337, + "epoch": 8.273894436519258, + "eval_loss": 2.3408679962158203, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.913, + "eval_steps_per_second": 4.719, "step": 11600 }, { - "epoch": 1.3977847339272815, - "grad_norm": 10.875, + "epoch": 8.281027104136948, + "grad_norm": 16.5, "learning_rate": 5.951555555555556e-05, - "loss": 0.65, + "loss": 0.7662, "step": 11610 }, { - "epoch": 1.398988682879846, - "grad_norm": 8.375, + "epoch": 8.288159771754636, + "grad_norm": 9.1875, "learning_rate": 5.947111111111111e-05, - "loss": 0.604, + "loss": 0.8136, "step": 11620 }, { - "epoch": 1.4001926318324103, - "grad_norm": 7.21875, + "epoch": 8.295292439372325, + "grad_norm": 6.5, "learning_rate": 5.942666666666668e-05, - "loss": 0.5246, + "loss": 0.8833, "step": 11630 }, { - "epoch": 1.4013965807849746, - "grad_norm": 7.8125, + "epoch": 8.302425106990015, + "grad_norm": 9.8125, "learning_rate": 5.938222222222223e-05, - "loss": 0.5189, + "loss": 0.7388, "step": 11640 }, { - "epoch": 1.4026005297375392, - "grad_norm": 4.78125, + "epoch": 8.309557774607704, + "grad_norm": 8.375, "learning_rate": 5.933777777777778e-05, - "loss": 0.5959, + "loss": 0.687, "step": 11650 }, { - "epoch": 1.4038044786901036, - "grad_norm": 8.875, + "epoch": 8.316690442225392, + "grad_norm": 6.75, "learning_rate": 5.929333333333334e-05, - "loss": 0.5887, + "loss": 0.7731, "step": 11660 }, { - "epoch": 1.405008427642668, - "grad_norm": 8.625, + "epoch": 8.323823109843081, + "grad_norm": 5.875, "learning_rate": 5.9248888888888886e-05, - "loss": 0.5538, + "loss": 0.8294, "step": 11670 }, { - "epoch": 1.4062123765952323, - "grad_norm": 5.8125, + "epoch": 8.33095577746077, + "grad_norm": 7.25, "learning_rate": 5.9204444444444454e-05, - "loss": 0.5945, + "loss": 0.8312, "step": 11680 }, { - "epoch": 1.4074163255477967, - "grad_norm": 6.9375, + "epoch": 8.338088445078458, + "grad_norm": 6.15625, "learning_rate": 5.916e-05, - "loss": 0.5444, + "loss": 0.8745, "step": 11690 }, { - "epoch": 1.408620274500361, - "grad_norm": 5.375, + "epoch": 8.345221112696148, + "grad_norm": 11.9375, "learning_rate": 5.911555555555556e-05, - "loss": 0.5762, + "loss": 0.8136, "step": 11700 }, { - "epoch": 1.408620274500361, - "eval/acc": 45.930233001708984, + "epoch": 8.345221112696148, + "eval/acc": 53.488372802734375, "step": 11700 }, { - "epoch": 1.408620274500361, - "eval_loss": 2.837869167327881, - "eval_runtime": 4.5845, - "eval_samples_per_second": 9.38, - "eval_steps_per_second": 0.218, + "epoch": 8.345221112696148, + "eval_loss": 2.348762273788452, + "eval_runtime": 1.1232, + "eval_samples_per_second": 38.285, + "eval_steps_per_second": 0.89, "step": 11700 }, { - "epoch": 1.4098242234529255, - "grad_norm": 6.90625, + "epoch": 8.352353780313837, + "grad_norm": 7.6875, "learning_rate": 5.907111111111111e-05, - "loss": 0.5918, + "loss": 0.8979, "step": 11710 }, { - "epoch": 1.41102817240549, - "grad_norm": 12.375, + "epoch": 8.359486447931527, + "grad_norm": 7.75, "learning_rate": 5.9026666666666675e-05, - "loss": 0.5955, + "loss": 0.7527, "step": 11720 }, { - "epoch": 1.4122321213580544, - "grad_norm": 13.4375, + "epoch": 8.366619115549215, + "grad_norm": 7.75, "learning_rate": 5.898222222222223e-05, - "loss": 0.5389, + "loss": 0.7397, "step": 11730 }, { - "epoch": 1.4134360703106188, - "grad_norm": 8.5625, + "epoch": 8.373751783166904, + "grad_norm": 7.125, "learning_rate": 5.893777777777778e-05, - "loss": 0.5927, + "loss": 0.7371, "step": 11740 }, { - "epoch": 1.4146400192631832, - "grad_norm": 7.625, + "epoch": 8.380884450784594, + "grad_norm": 7.09375, "learning_rate": 5.8893333333333334e-05, - "loss": 0.5979, + "loss": 0.7787, "step": 11750 }, { - "epoch": 1.4158439682157478, - "grad_norm": 6.5625, + "epoch": 8.388017118402283, + "grad_norm": 12.75, "learning_rate": 5.884888888888889e-05, - "loss": 0.4657, + "loss": 0.7745, "step": 11760 }, { - "epoch": 1.4170479171683121, - "grad_norm": 7.5, + "epoch": 8.39514978601997, + "grad_norm": 5.96875, "learning_rate": 5.880444444444445e-05, - "loss": 0.6833, + "loss": 0.7675, "step": 11770 }, { - "epoch": 1.4182518661208765, - "grad_norm": 12.5, + "epoch": 8.40228245363766, + "grad_norm": 7.28125, "learning_rate": 5.876000000000001e-05, - "loss": 0.6065, + "loss": 0.7369, "step": 11780 }, { - "epoch": 1.4194558150734409, - "grad_norm": 8.6875, + "epoch": 8.40941512125535, + "grad_norm": 8.5625, "learning_rate": 5.8715555555555555e-05, - "loss": 0.6406, + "loss": 0.7679, "step": 11790 }, { - "epoch": 1.4206597640260052, - "grad_norm": 7.625, + "epoch": 8.41654778887304, + "grad_norm": 6.09375, "learning_rate": 5.867111111111111e-05, - "loss": 0.54, + "loss": 0.7575, "step": 11800 }, { - "epoch": 1.4206597640260052, - "eval/acc": 44.1860466003418, + "epoch": 8.41654778887304, + "eval/acc": 48.83720779418945, "step": 11800 }, { - "epoch": 1.4206597640260052, - "eval_loss": 2.8732845783233643, - "eval_runtime": 0.2132, - "eval_samples_per_second": 201.731, - "eval_steps_per_second": 4.691, + "epoch": 8.41654778887304, + "eval_loss": 2.3886027336120605, + "eval_runtime": 0.2153, + "eval_samples_per_second": 199.752, + "eval_steps_per_second": 4.645, "step": 11800 }, { - "epoch": 1.4218637129785696, - "grad_norm": 13.25, + "epoch": 8.423680456490727, + "grad_norm": 7.75, "learning_rate": 5.862666666666667e-05, - "loss": 0.6655, + "loss": 0.7837, "step": 11810 }, { - "epoch": 1.423067661931134, - "grad_norm": 8.8125, + "epoch": 8.430813124108417, + "grad_norm": 7.03125, "learning_rate": 5.858222222222223e-05, - "loss": 0.5967, + "loss": 0.7153, "step": 11820 }, { - "epoch": 1.4242716108836986, - "grad_norm": 5.84375, + "epoch": 8.437945791726106, + "grad_norm": 9.3125, "learning_rate": 5.853777777777778e-05, - "loss": 0.6698, + "loss": 0.7655, "step": 11830 }, { - "epoch": 1.425475559836263, - "grad_norm": 5.375, + "epoch": 8.445078459343794, + "grad_norm": 7.34375, "learning_rate": 5.849333333333333e-05, - "loss": 0.5963, + "loss": 0.761, "step": 11840 }, { - "epoch": 1.4266795087888273, - "grad_norm": 6.625, + "epoch": 8.452211126961483, + "grad_norm": 8.875, "learning_rate": 5.8448888888888886e-05, - "loss": 0.5941, + "loss": 0.7985, "step": 11850 }, { - "epoch": 1.4278834577413917, - "grad_norm": 6.90625, + "epoch": 8.459343794579173, + "grad_norm": 6.96875, "learning_rate": 5.840444444444445e-05, - "loss": 0.6464, + "loss": 0.7208, "step": 11860 }, { - "epoch": 1.4290874066939563, - "grad_norm": 8.5625, + "epoch": 8.466476462196862, + "grad_norm": 6.25, "learning_rate": 5.8360000000000004e-05, - "loss": 0.5185, + "loss": 0.8474, "step": 11870 }, { - "epoch": 1.4302913556465207, - "grad_norm": 5.46875, + "epoch": 8.47360912981455, + "grad_norm": 5.5625, "learning_rate": 5.831555555555556e-05, - "loss": 0.6194, + "loss": 0.773, "step": 11880 }, { - "epoch": 1.431495304599085, - "grad_norm": 7.03125, + "epoch": 8.48074179743224, + "grad_norm": 19.125, "learning_rate": 5.827111111111111e-05, - "loss": 0.5993, + "loss": 0.7026, "step": 11890 }, { - "epoch": 1.4326992535516494, - "grad_norm": 8.25, + "epoch": 8.487874465049929, + "grad_norm": 8.4375, "learning_rate": 5.8226666666666676e-05, - "loss": 0.5726, + "loss": 0.7825, "step": 11900 }, { - "epoch": 1.4326992535516494, - "eval/acc": 44.1860466003418, + "epoch": 8.487874465049929, + "eval/acc": 48.83720779418945, "step": 11900 }, { - "epoch": 1.4326992535516494, - "eval_loss": 2.9054577350616455, - "eval_runtime": 0.2116, - "eval_samples_per_second": 203.241, - "eval_steps_per_second": 4.727, + "epoch": 8.487874465049929, + "eval_loss": 2.395317316055298, + "eval_runtime": 0.2104, + "eval_samples_per_second": 204.361, + "eval_steps_per_second": 4.753, "step": 11900 }, { - "epoch": 1.4339032025042138, - "grad_norm": 6.4375, + "epoch": 8.495007132667618, + "grad_norm": 8.5625, "learning_rate": 5.8182222222222225e-05, - "loss": 0.5805, + "loss": 0.8574, "step": 11910 }, { - "epoch": 1.4351071514567781, - "grad_norm": 9.5, + "epoch": 8.502139800285306, + "grad_norm": 8.0, "learning_rate": 5.813777777777778e-05, - "loss": 0.604, + "loss": 0.8031, "step": 11920 }, { - "epoch": 1.4363111004093425, - "grad_norm": 5.0, + "epoch": 8.509272467902996, + "grad_norm": 8.125, "learning_rate": 5.8093333333333335e-05, - "loss": 0.4871, + "loss": 0.8578, "step": 11930 }, { - "epoch": 1.437515049361907, - "grad_norm": 7.6875, + "epoch": 8.516405135520685, + "grad_norm": 8.3125, "learning_rate": 5.80488888888889e-05, - "loss": 0.5968, + "loss": 0.854, "step": 11940 }, { - "epoch": 1.4387189983144715, - "grad_norm": 7.0625, + "epoch": 8.523537803138375, + "grad_norm": 23.5, "learning_rate": 5.800444444444445e-05, - "loss": 0.5715, + "loss": 0.8375, "step": 11950 }, { - "epoch": 1.4399229472670358, - "grad_norm": 8.4375, + "epoch": 8.530670470756062, + "grad_norm": 6.625, "learning_rate": 5.796e-05, - "loss": 0.6258, + "loss": 0.7793, "step": 11960 }, { - "epoch": 1.4411268962196002, - "grad_norm": 7.78125, + "epoch": 8.537803138373752, + "grad_norm": 36.25, "learning_rate": 5.7915555555555556e-05, - "loss": 0.6474, + "loss": 0.7395, "step": 11970 }, { - "epoch": 1.4423308451721648, - "grad_norm": 6.625, + "epoch": 8.544935805991441, + "grad_norm": 7.96875, "learning_rate": 5.787111111111111e-05, - "loss": 0.6148, + "loss": 0.8492, "step": 11980 }, { - "epoch": 1.4435347941247292, - "grad_norm": 6.5625, + "epoch": 8.552068473609129, + "grad_norm": 7.3125, "learning_rate": 5.782666666666667e-05, - "loss": 0.5533, + "loss": 0.7591, "step": 11990 }, { - "epoch": 1.4447387430772936, - "grad_norm": 11.0625, + "epoch": 8.559201141226819, + "grad_norm": 13.75, "learning_rate": 5.778222222222223e-05, - "loss": 0.5756, + "loss": 0.7175, "step": 12000 }, { - "epoch": 1.4447387430772936, - "eval/acc": 46.511627197265625, + "epoch": 8.559201141226819, + "eval/acc": 48.83720779418945, "step": 12000 }, { - "epoch": 1.4447387430772936, - "eval_loss": 2.8763856887817383, - "eval_runtime": 0.2112, - "eval_samples_per_second": 203.613, - "eval_steps_per_second": 4.735, + "epoch": 8.559201141226819, + "eval_loss": 2.375894069671631, + "eval_runtime": 0.2121, + "eval_samples_per_second": 202.777, + "eval_steps_per_second": 4.716, "step": 12000 }, { - "epoch": 1.445942692029858, - "grad_norm": 10.625, + "epoch": 8.566333808844508, + "grad_norm": 8.375, "learning_rate": 5.773777777777778e-05, - "loss": 0.6164, + "loss": 0.8009, "step": 12010 }, { - "epoch": 1.4471466409824223, - "grad_norm": 8.3125, + "epoch": 8.573466476462198, + "grad_norm": 10.375, "learning_rate": 5.769333333333333e-05, - "loss": 0.5638, + "loss": 0.7651, "step": 12020 }, { - "epoch": 1.4483505899349867, - "grad_norm": 7.25, + "epoch": 8.580599144079885, + "grad_norm": 10.5, "learning_rate": 5.7648888888888894e-05, - "loss": 0.6082, + "loss": 0.7947, "step": 12030 }, { - "epoch": 1.449554538887551, - "grad_norm": 6.65625, + "epoch": 8.587731811697575, + "grad_norm": 9.375, "learning_rate": 5.760444444444445e-05, - "loss": 0.5113, + "loss": 0.8377, "step": 12040 }, { - "epoch": 1.4507584878401156, - "grad_norm": 8.5625, + "epoch": 8.594864479315264, + "grad_norm": 7.0, "learning_rate": 5.7560000000000005e-05, - "loss": 0.5966, + "loss": 0.7803, "step": 12050 }, { - "epoch": 1.45196243679268, - "grad_norm": 9.0625, + "epoch": 8.601997146932954, + "grad_norm": 7.03125, "learning_rate": 5.751555555555555e-05, - "loss": 0.4791, + "loss": 0.7129, "step": 12060 }, { - "epoch": 1.4531663857452444, - "grad_norm": 7.8125, + "epoch": 8.609129814550641, + "grad_norm": 9.6875, "learning_rate": 5.747111111111111e-05, - "loss": 0.5999, + "loss": 0.9395, "step": 12070 }, { - "epoch": 1.4543703346978087, - "grad_norm": 7.96875, + "epoch": 8.616262482168331, + "grad_norm": 7.1875, "learning_rate": 5.742666666666667e-05, - "loss": 0.5942, + "loss": 0.8461, "step": 12080 }, { - "epoch": 1.4555742836503733, - "grad_norm": 100.0, + "epoch": 8.62339514978602, + "grad_norm": 8.3125, "learning_rate": 5.7382222222222225e-05, - "loss": 0.6591, + "loss": 0.8533, "step": 12090 }, { - "epoch": 1.4567782326029377, - "grad_norm": 10.625, + "epoch": 8.63052781740371, + "grad_norm": 7.75, "learning_rate": 5.733777777777778e-05, - "loss": 0.5924, + "loss": 0.7819, "step": 12100 }, { - "epoch": 1.4567782326029377, - "eval/acc": 45.930233001708984, + "epoch": 8.63052781740371, + "eval/acc": 46.511627197265625, "step": 12100 }, { - "epoch": 1.4567782326029377, - "eval_loss": 2.9057791233062744, - "eval_runtime": 0.2182, - "eval_samples_per_second": 197.109, - "eval_steps_per_second": 4.584, + "epoch": 8.63052781740371, + "eval_loss": 2.367159605026245, + "eval_runtime": 0.35, + "eval_samples_per_second": 122.848, + "eval_steps_per_second": 2.857, "step": 12100 }, { - "epoch": 1.457982181555502, - "grad_norm": 6.21875, + "epoch": 8.637660485021398, + "grad_norm": 8.1875, "learning_rate": 5.729333333333333e-05, - "loss": 0.6354, + "loss": 0.8752, "step": 12110 }, { - "epoch": 1.4591861305080664, - "grad_norm": 7.59375, + "epoch": 8.644793152639087, + "grad_norm": 5.6875, "learning_rate": 5.72488888888889e-05, - "loss": 0.5777, + "loss": 0.8182, "step": 12120 }, { - "epoch": 1.4603900794606308, + "epoch": 8.651925820256777, "grad_norm": 7.09375, "learning_rate": 5.7204444444444446e-05, - "loss": 0.6189, + "loss": 0.8116, "step": 12130 }, { - "epoch": 1.4615940284131952, - "grad_norm": 7.84375, + "epoch": 8.659058487874464, + "grad_norm": 7.65625, "learning_rate": 5.716e-05, - "loss": 0.5161, + "loss": 0.7563, "step": 12140 }, { - "epoch": 1.4627979773657596, - "grad_norm": 9.25, + "epoch": 8.666191155492154, + "grad_norm": 20.75, "learning_rate": 5.711555555555556e-05, - "loss": 0.6892, + "loss": 0.6896, "step": 12150 }, { - "epoch": 1.4640019263183242, - "grad_norm": 7.875, + "epoch": 8.673323823109843, + "grad_norm": 9.25, "learning_rate": 5.7071111111111105e-05, - "loss": 0.4845, + "loss": 0.8233, "step": 12160 }, { - "epoch": 1.4652058752708885, - "grad_norm": 6.625, + "epoch": 8.680456490727533, + "grad_norm": 11.0625, "learning_rate": 5.7026666666666674e-05, - "loss": 0.6342, + "loss": 0.8978, "step": 12170 }, { - "epoch": 1.466409824223453, - "grad_norm": 7.03125, + "epoch": 8.68758915834522, + "grad_norm": 8.1875, "learning_rate": 5.698222222222222e-05, - "loss": 0.5427, + "loss": 0.7671, "step": 12180 }, { - "epoch": 1.4676137731760173, - "grad_norm": 6.84375, + "epoch": 8.69472182596291, + "grad_norm": 13.0625, "learning_rate": 5.693777777777778e-05, - "loss": 0.5672, + "loss": 0.7771, "step": 12190 }, { - "epoch": 1.4688177221285819, - "grad_norm": 9.0, + "epoch": 8.7018544935806, + "grad_norm": 8.25, "learning_rate": 5.689333333333333e-05, - "loss": 0.6318, + "loss": 0.758, "step": 12200 }, { - "epoch": 1.4688177221285819, - "eval/acc": 45.930233001708984, + "epoch": 8.7018544935806, + "eval/acc": 46.511627197265625, "step": 12200 }, { - "epoch": 1.4688177221285819, - "eval_loss": 2.8778676986694336, - "eval_runtime": 0.2132, - "eval_samples_per_second": 201.649, - "eval_steps_per_second": 4.69, + "epoch": 8.7018544935806, + "eval_loss": 2.3872835636138916, + "eval_runtime": 0.2139, + "eval_samples_per_second": 201.006, + "eval_steps_per_second": 4.675, "step": 12200 }, { - "epoch": 1.4700216710811462, - "grad_norm": 7.4375, + "epoch": 8.708987161198289, + "grad_norm": 6.5, "learning_rate": 5.6848888888888895e-05, - "loss": 0.5589, + "loss": 0.8066, "step": 12210 }, { - "epoch": 1.4712256200337106, - "grad_norm": 9.1875, + "epoch": 8.716119828815977, + "grad_norm": 7.21875, "learning_rate": 5.680444444444445e-05, - "loss": 0.6005, + "loss": 0.8287, "step": 12220 }, { - "epoch": 1.472429568986275, - "grad_norm": 7.875, + "epoch": 8.723252496433666, + "grad_norm": 10.625, "learning_rate": 5.6760000000000005e-05, - "loss": 0.5573, + "loss": 0.917, "step": 12230 }, { - "epoch": 1.4736335179388393, - "grad_norm": 7.71875, + "epoch": 8.730385164051356, + "grad_norm": 9.6875, "learning_rate": 5.6715555555555554e-05, - "loss": 0.5431, + "loss": 0.8417, "step": 12240 }, { - "epoch": 1.4748374668914037, - "grad_norm": 9.0625, + "epoch": 8.737517831669045, + "grad_norm": 8.6875, "learning_rate": 5.6671111111111116e-05, - "loss": 0.5939, + "loss": 0.8405, "step": 12250 }, { - "epoch": 1.476041415843968, - "grad_norm": 9.875, + "epoch": 8.744650499286733, + "grad_norm": 6.875, "learning_rate": 5.662666666666667e-05, - "loss": 0.5894, + "loss": 0.7838, "step": 12260 }, { - "epoch": 1.4772453647965327, - "grad_norm": 6.34375, + "epoch": 8.751783166904422, + "grad_norm": 6.25, "learning_rate": 5.6582222222222226e-05, - "loss": 0.5602, + "loss": 0.6897, "step": 12270 }, { - "epoch": 1.478449313749097, - "grad_norm": 7.34375, + "epoch": 8.758915834522112, + "grad_norm": 7.375, "learning_rate": 5.653777777777778e-05, - "loss": 0.6093, + "loss": 0.7716, "step": 12280 }, { - "epoch": 1.4796532627016614, - "grad_norm": 7.65625, + "epoch": 8.7660485021398, + "grad_norm": 7.96875, "learning_rate": 5.649333333333333e-05, - "loss": 0.5755, + "loss": 0.8497, "step": 12290 }, { - "epoch": 1.4808572116542258, - "grad_norm": 7.15625, + "epoch": 8.773181169757489, + "grad_norm": 7.75, "learning_rate": 5.64488888888889e-05, - "loss": 0.5593, + "loss": 0.747, "step": 12300 }, { - "epoch": 1.4808572116542258, - "eval/acc": 46.511627197265625, + "epoch": 8.773181169757489, + "eval/acc": 48.83720779418945, "step": 12300 }, { - "epoch": 1.4808572116542258, - "eval_loss": 2.8776164054870605, - "eval_runtime": 1.0978, - "eval_samples_per_second": 39.169, - "eval_steps_per_second": 0.911, + "epoch": 8.773181169757489, + "eval_loss": 2.3708367347717285, + "eval_runtime": 0.2183, + "eval_samples_per_second": 197.001, + "eval_steps_per_second": 4.581, "step": 12300 }, { - "epoch": 1.4820611606067904, - "grad_norm": 6.96875, + "epoch": 8.780313837375179, + "grad_norm": 7.28125, "learning_rate": 5.640444444444445e-05, - "loss": 0.6139, + "loss": 0.8225, "step": 12310 }, { - "epoch": 1.4832651095593548, - "grad_norm": 6.375, + "epoch": 8.787446504992868, + "grad_norm": 6.8125, "learning_rate": 5.636e-05, - "loss": 0.5185, + "loss": 0.684, "step": 12320 }, { - "epoch": 1.4844690585119191, - "grad_norm": 10.1875, + "epoch": 8.794579172610556, + "grad_norm": 5.84375, "learning_rate": 5.631555555555556e-05, - "loss": 0.5106, + "loss": 0.8008, "step": 12330 }, { - "epoch": 1.4856730074644835, - "grad_norm": 8.875, + "epoch": 8.801711840228245, + "grad_norm": 6.8125, "learning_rate": 5.627111111111112e-05, - "loss": 0.6202, + "loss": 0.7119, "step": 12340 }, { - "epoch": 1.4868769564170479, - "grad_norm": 7.90625, + "epoch": 8.808844507845935, + "grad_norm": 7.625, "learning_rate": 5.6226666666666675e-05, - "loss": 0.5785, + "loss": 0.7878, "step": 12350 }, { - "epoch": 1.4880809053696122, - "grad_norm": 7.625, + "epoch": 8.815977175463622, + "grad_norm": 6.5625, "learning_rate": 5.6182222222222223e-05, - "loss": 0.5529, + "loss": 0.8389, "step": 12360 }, { - "epoch": 1.4892848543221766, - "grad_norm": 6.53125, + "epoch": 8.823109843081312, + "grad_norm": 7.8125, "learning_rate": 5.613777777777778e-05, - "loss": 0.5533, + "loss": 0.8858, "step": 12370 }, { - "epoch": 1.4904888032747412, - "grad_norm": 7.09375, + "epoch": 8.830242510699001, + "grad_norm": 7.0, "learning_rate": 5.6093333333333334e-05, - "loss": 0.6117, + "loss": 0.797, "step": 12380 }, { - "epoch": 1.4916927522273056, - "grad_norm": 6.59375, + "epoch": 8.837375178316691, + "grad_norm": 8.125, "learning_rate": 5.6048888888888896e-05, - "loss": 0.602, + "loss": 0.7154, "step": 12390 }, { - "epoch": 1.49289670117987, - "grad_norm": 10.8125, + "epoch": 8.844507845934379, + "grad_norm": 6.59375, "learning_rate": 5.600444444444445e-05, - "loss": 0.5845, + "loss": 0.8543, "step": 12400 }, { - "epoch": 1.49289670117987, + "epoch": 8.844507845934379, "eval/acc": 46.511627197265625, "step": 12400 }, { - "epoch": 1.49289670117987, - "eval_loss": 2.860626697540283, - "eval_runtime": 0.2153, - "eval_samples_per_second": 199.684, - "eval_steps_per_second": 4.644, + "epoch": 8.844507845934379, + "eval_loss": 2.3827686309814453, + "eval_runtime": 0.2144, + "eval_samples_per_second": 200.553, + "eval_steps_per_second": 4.664, "step": 12400 }, { - "epoch": 1.4941006501324343, - "grad_norm": 7.40625, + "epoch": 8.851640513552068, + "grad_norm": 8.8125, "learning_rate": 5.596e-05, - "loss": 0.5725, + "loss": 0.8071, "step": 12410 }, { - "epoch": 1.495304599084999, - "grad_norm": 7.28125, + "epoch": 8.858773181169758, + "grad_norm": 6.0625, "learning_rate": 5.5915555555555555e-05, - "loss": 0.6233, + "loss": 0.7174, "step": 12420 }, { - "epoch": 1.4965085480375633, - "grad_norm": 7.375, + "epoch": 8.865905848787447, + "grad_norm": 9.8125, "learning_rate": 5.587111111111112e-05, - "loss": 0.6094, + "loss": 0.861, "step": 12430 }, { - "epoch": 1.4977124969901277, - "grad_norm": 8.8125, + "epoch": 8.873038516405135, + "grad_norm": 8.0, "learning_rate": 5.582666666666667e-05, - "loss": 0.6249, + "loss": 0.831, "step": 12440 }, { - "epoch": 1.498916445942692, - "grad_norm": 14.125, + "epoch": 8.880171184022824, + "grad_norm": 5.21875, "learning_rate": 5.578222222222223e-05, - "loss": 0.5612, + "loss": 0.7814, "step": 12450 }, { - "epoch": 1.5001203948952564, - "grad_norm": 9.125, + "epoch": 8.887303851640514, + "grad_norm": 6.78125, "learning_rate": 5.5737777777777776e-05, - "loss": 0.6067, + "loss": 0.6926, "step": 12460 }, { - "epoch": 1.5013243438478208, - "grad_norm": 7.875, + "epoch": 8.894436519258203, + "grad_norm": 8.6875, "learning_rate": 5.569333333333333e-05, - "loss": 0.5496, + "loss": 0.7977, "step": 12470 }, { - "epoch": 1.5025282928003851, - "grad_norm": 9.3125, + "epoch": 8.901569186875891, + "grad_norm": 6.5625, "learning_rate": 5.564888888888889e-05, - "loss": 0.5547, + "loss": 0.7647, "step": 12480 }, { - "epoch": 1.5037322417529497, - "grad_norm": 6.375, + "epoch": 8.90870185449358, + "grad_norm": 10.875, "learning_rate": 5.560444444444445e-05, - "loss": 0.5596, + "loss": 0.8469, "step": 12490 }, { - "epoch": 1.504936190705514, - "grad_norm": 8.125, + "epoch": 8.91583452211127, + "grad_norm": 12.0625, "learning_rate": 5.556e-05, - "loss": 0.5604, + "loss": 0.9152, "step": 12500 }, { - "epoch": 1.504936190705514, - "eval/acc": 41.86046600341797, + "epoch": 8.91583452211127, + "eval/acc": 46.511627197265625, "step": 12500 }, { - "epoch": 1.504936190705514, - "eval_loss": 2.8973793983459473, - "eval_runtime": 0.2142, - "eval_samples_per_second": 200.739, - "eval_steps_per_second": 4.668, + "epoch": 8.91583452211127, + "eval_loss": 2.3970413208007812, + "eval_runtime": 0.2133, + "eval_samples_per_second": 201.589, + "eval_steps_per_second": 4.688, "step": 12500 }, { - "epoch": 1.5061401396580785, - "grad_norm": 6.875, + "epoch": 8.922967189728958, + "grad_norm": 9.875, "learning_rate": 5.551555555555555e-05, - "loss": 0.5898, + "loss": 0.8202, "step": 12510 }, { - "epoch": 1.507344088610643, - "grad_norm": 10.6875, + "epoch": 8.930099857346647, + "grad_norm": 7.625, "learning_rate": 5.547111111111112e-05, - "loss": 0.5065, + "loss": 0.8159, "step": 12520 }, { - "epoch": 1.5085480375632074, - "grad_norm": 8.6875, + "epoch": 8.937232524964337, + "grad_norm": 7.875, "learning_rate": 5.542666666666667e-05, - "loss": 0.6214, + "loss": 0.684, "step": 12530 }, { - "epoch": 1.5097519865157718, - "grad_norm": 7.8125, + "epoch": 8.944365192582026, + "grad_norm": 6.59375, "learning_rate": 5.5382222222222224e-05, - "loss": 0.5012, + "loss": 0.7629, "step": 12540 }, { - "epoch": 1.5109559354683362, - "grad_norm": 7.25, + "epoch": 8.951497860199714, + "grad_norm": 6.90625, "learning_rate": 5.533777777777778e-05, - "loss": 0.5807, + "loss": 0.8227, "step": 12550 }, { - "epoch": 1.5121598844209005, - "grad_norm": 8.625, + "epoch": 8.958630527817403, + "grad_norm": 6.3125, "learning_rate": 5.529333333333334e-05, - "loss": 0.6293, + "loss": 0.8235, "step": 12560 }, { - "epoch": 1.513363833373465, - "grad_norm": 8.125, + "epoch": 8.965763195435093, + "grad_norm": 6.5, "learning_rate": 5.52488888888889e-05, - "loss": 0.5367, + "loss": 0.7865, "step": 12570 }, { - "epoch": 1.5145677823260293, - "grad_norm": 6.53125, + "epoch": 8.972895863052782, + "grad_norm": 5.875, "learning_rate": 5.5204444444444445e-05, - "loss": 0.6308, + "loss": 0.7331, "step": 12580 }, { - "epoch": 1.5157717312785937, - "grad_norm": 6.09375, + "epoch": 8.98002853067047, + "grad_norm": 7.15625, "learning_rate": 5.516e-05, - "loss": 0.571, + "loss": 0.8498, "step": 12590 }, { - "epoch": 1.5169756802311583, - "grad_norm": 9.625, + "epoch": 8.98716119828816, + "grad_norm": 7.75, "learning_rate": 5.5115555555555556e-05, - "loss": 0.6378, + "loss": 0.7825, "step": 12600 }, { - "epoch": 1.5169756802311583, - "eval/acc": 44.1860466003418, + "epoch": 8.98716119828816, + "eval/acc": 51.16279220581055, "step": 12600 }, { - "epoch": 1.5169756802311583, - "eval_loss": 2.856049060821533, - "eval_runtime": 0.2162, - "eval_samples_per_second": 198.899, - "eval_steps_per_second": 4.626, + "epoch": 8.98716119828816, + "eval_loss": 2.3282017707824707, + "eval_runtime": 0.2152, + "eval_samples_per_second": 199.842, + "eval_steps_per_second": 4.647, "step": 12600 }, { - "epoch": 1.5181796291837226, - "grad_norm": 8.0, + "epoch": 8.99429386590585, + "grad_norm": 7.0, "learning_rate": 5.507111111111112e-05, - "loss": 0.5873, + "loss": 0.8485, "step": 12610 }, { - "epoch": 1.519383578136287, - "grad_norm": 6.75, + "epoch": 9.001426533523539, + "grad_norm": 8.1875, "learning_rate": 5.502666666666667e-05, - "loss": 0.6427, + "loss": 0.8691, "step": 12620 }, { - "epoch": 1.5205875270888516, - "grad_norm": 9.1875, + "epoch": 9.008559201141226, + "grad_norm": 8.875, "learning_rate": 5.498222222222222e-05, - "loss": 0.6036, + "loss": 0.8085, "step": 12630 }, { - "epoch": 1.521791476041416, - "grad_norm": 6.3125, + "epoch": 9.015691868758916, + "grad_norm": 10.875, "learning_rate": 5.4937777777777777e-05, - "loss": 0.6524, + "loss": 0.7221, "step": 12640 }, { - "epoch": 1.5229954249939803, - "grad_norm": 4.6875, + "epoch": 9.022824536376605, + "grad_norm": 7.9375, "learning_rate": 5.489333333333334e-05, - "loss": 0.5334, + "loss": 0.8136, "step": 12650 }, { - "epoch": 1.5241993739465447, - "grad_norm": 11.5, + "epoch": 9.029957203994293, + "grad_norm": 6.78125, "learning_rate": 5.4848888888888894e-05, - "loss": 0.5485, + "loss": 0.6211, "step": 12660 }, { - "epoch": 1.525403322899109, - "grad_norm": 7.21875, + "epoch": 9.037089871611983, + "grad_norm": 7.09375, "learning_rate": 5.480444444444445e-05, - "loss": 0.646, + "loss": 0.7893, "step": 12670 }, { - "epoch": 1.5266072718516734, - "grad_norm": 7.5625, + "epoch": 9.044222539229672, + "grad_norm": 7.375, "learning_rate": 5.476e-05, - "loss": 0.5385, + "loss": 0.9348, "step": 12680 }, { - "epoch": 1.5278112208042378, - "grad_norm": 8.375, + "epoch": 9.051355206847362, + "grad_norm": 7.09375, "learning_rate": 5.471555555555555e-05, - "loss": 0.503, + "loss": 0.9088, "step": 12690 }, { - "epoch": 1.5290151697568022, - "grad_norm": 13.0625, + "epoch": 9.05848787446505, + "grad_norm": 7.9375, "learning_rate": 5.4671111111111115e-05, - "loss": 0.5886, + "loss": 0.8116, "step": 12700 }, { - "epoch": 1.5290151697568022, - "eval/acc": 44.76744079589844, + "epoch": 9.05848787446505, + "eval/acc": 32.55813980102539, "step": 12700 }, { - "epoch": 1.5290151697568022, - "eval_loss": 2.8569490909576416, - "eval_runtime": 0.2181, - "eval_samples_per_second": 197.134, - "eval_steps_per_second": 4.585, + "epoch": 9.05848787446505, + "eval_loss": 3.3768653869628906, + "eval_runtime": 1.089, + "eval_samples_per_second": 39.487, + "eval_steps_per_second": 0.918, "step": 12700 }, { - "epoch": 1.5302191187093668, - "grad_norm": 8.6875, + "epoch": 9.065620542082739, + "grad_norm": 7.875, "learning_rate": 5.462666666666667e-05, - "loss": 0.7241, + "loss": 0.7748, "step": 12710 }, { - "epoch": 1.5314230676619311, - "grad_norm": 8.1875, + "epoch": 9.072753209700428, + "grad_norm": 6.96875, "learning_rate": 5.4582222222222225e-05, - "loss": 0.5965, + "loss": 0.872, "step": 12720 }, { - "epoch": 1.5326270166144955, - "grad_norm": 6.96875, + "epoch": 9.079885877318118, + "grad_norm": 7.59375, "learning_rate": 5.4537777777777774e-05, - "loss": 0.5195, + "loss": 0.8539, "step": 12730 }, { - "epoch": 1.5338309655670601, - "grad_norm": 6.65625, + "epoch": 9.087018544935805, + "grad_norm": 6.9375, "learning_rate": 5.449333333333334e-05, - "loss": 0.5949, + "loss": 0.784, "step": 12740 }, { - "epoch": 1.5350349145196245, - "grad_norm": 10.25, + "epoch": 9.094151212553495, + "grad_norm": 6.5625, "learning_rate": 5.444888888888889e-05, - "loss": 0.5962, + "loss": 0.7998, "step": 12750 }, { - "epoch": 1.5362388634721889, - "grad_norm": 10.8125, + "epoch": 9.101283880171184, + "grad_norm": 7.0625, "learning_rate": 5.4404444444444446e-05, - "loss": 0.6544, + "loss": 0.8213, "step": 12760 }, { - "epoch": 1.5374428124247532, - "grad_norm": 9.8125, + "epoch": 9.108416547788874, + "grad_norm": 5.9375, "learning_rate": 5.436e-05, - "loss": 0.5681, + "loss": 0.8233, "step": 12770 }, { - "epoch": 1.5386467613773176, - "grad_norm": 7.875, + "epoch": 9.115549215406562, + "grad_norm": 6.53125, "learning_rate": 5.431555555555555e-05, - "loss": 0.5944, + "loss": 0.7617, "step": 12780 }, { - "epoch": 1.539850710329882, - "grad_norm": 6.75, + "epoch": 9.122681883024251, + "grad_norm": 7.3125, "learning_rate": 5.427111111111112e-05, - "loss": 0.5141, + "loss": 0.8139, "step": 12790 }, { - "epoch": 1.5410546592824463, - "grad_norm": 6.96875, + "epoch": 9.12981455064194, + "grad_norm": 7.625, "learning_rate": 5.422666666666667e-05, - "loss": 0.627, + "loss": 0.7742, "step": 12800 }, { - "epoch": 1.5410546592824463, - "eval/acc": 44.1860466003418, + "epoch": 9.12981455064194, + "eval/acc": 34.88372039794922, "step": 12800 }, { - "epoch": 1.5410546592824463, - "eval_loss": 2.8678698539733887, - "eval_runtime": 0.2124, - "eval_samples_per_second": 202.486, - "eval_steps_per_second": 4.709, + "epoch": 9.12981455064194, + "eval_loss": 3.372913122177124, + "eval_runtime": 0.2597, + "eval_samples_per_second": 165.6, + "eval_steps_per_second": 3.851, "step": 12800 }, { - "epoch": 1.5422586082350107, - "grad_norm": 5.1875, + "epoch": 9.136947218259628, + "grad_norm": 7.90625, "learning_rate": 5.418222222222222e-05, - "loss": 0.601, + "loss": 0.8071, "step": 12810 }, { - "epoch": 1.5434625571875753, - "grad_norm": 10.0625, + "epoch": 9.144079885877318, + "grad_norm": 6.5625, "learning_rate": 5.413777777777778e-05, - "loss": 0.6409, + "loss": 0.7691, "step": 12820 }, { - "epoch": 1.5446665061401397, - "grad_norm": 6.21875, + "epoch": 9.151212553495007, + "grad_norm": 8.375, "learning_rate": 5.409333333333334e-05, - "loss": 0.6065, + "loss": 0.8105, "step": 12830 }, { - "epoch": 1.545870455092704, - "grad_norm": 7.125, + "epoch": 9.158345221112697, + "grad_norm": 7.5, "learning_rate": 5.4048888888888895e-05, - "loss": 0.5369, + "loss": 0.83, "step": 12840 }, { - "epoch": 1.5470744040452686, - "grad_norm": 8.4375, + "epoch": 9.165477888730384, + "grad_norm": 7.21875, "learning_rate": 5.400444444444444e-05, - "loss": 0.6577, + "loss": 0.8158, "step": 12850 }, { - "epoch": 1.548278352997833, - "grad_norm": 7.09375, + "epoch": 9.172610556348074, + "grad_norm": 8.0625, "learning_rate": 5.396e-05, - "loss": 0.6092, + "loss": 0.7359, "step": 12860 }, { - "epoch": 1.5494823019503974, - "grad_norm": 6.5625, + "epoch": 9.179743223965763, + "grad_norm": 7.21875, "learning_rate": 5.391555555555556e-05, - "loss": 0.7309, + "loss": 0.7797, "step": 12870 }, { - "epoch": 1.5506862509029617, - "grad_norm": 6.96875, + "epoch": 9.186875891583453, + "grad_norm": 11.8125, "learning_rate": 5.3871111111111116e-05, - "loss": 0.6047, + "loss": 0.8005, "step": 12880 }, { - "epoch": 1.5518901998555261, - "grad_norm": 8.125, + "epoch": 9.19400855920114, + "grad_norm": 14.0, "learning_rate": 5.382666666666667e-05, - "loss": 0.6257, + "loss": 0.8764, "step": 12890 }, { - "epoch": 1.5530941488080905, - "grad_norm": 7.4375, + "epoch": 9.20114122681883, + "grad_norm": 6.96875, "learning_rate": 5.3782222222222226e-05, - "loss": 0.5871, + "loss": 0.6898, "step": 12900 }, { - "epoch": 1.5530941488080905, - "eval/acc": 44.1860466003418, + "epoch": 9.20114122681883, + "eval/acc": 32.55813980102539, "step": 12900 }, { - "epoch": 1.5530941488080905, - "eval_loss": 2.8619837760925293, - "eval_runtime": 0.2172, - "eval_samples_per_second": 197.932, - "eval_steps_per_second": 4.603, + "epoch": 9.20114122681883, + "eval_loss": 3.383354425430298, + "eval_runtime": 0.2453, + "eval_samples_per_second": 175.301, + "eval_steps_per_second": 4.077, "step": 12900 }, { - "epoch": 1.5542980977606549, - "grad_norm": 9.8125, + "epoch": 9.20827389443652, + "grad_norm": 8.125, "learning_rate": 5.3737777777777775e-05, - "loss": 0.5404, + "loss": 0.8066, "step": 12910 }, { - "epoch": 1.5555020467132192, - "grad_norm": 7.4375, + "epoch": 9.21540656205421, + "grad_norm": 7.96875, "learning_rate": 5.369333333333334e-05, - "loss": 0.5922, + "loss": 0.7809, "step": 12920 }, { - "epoch": 1.5567059956657838, - "grad_norm": 6.53125, + "epoch": 9.222539229671897, + "grad_norm": 7.15625, "learning_rate": 5.364888888888889e-05, - "loss": 0.6612, + "loss": 0.7242, "step": 12930 }, { - "epoch": 1.5579099446183482, - "grad_norm": 8.25, + "epoch": 9.229671897289586, + "grad_norm": 7.65625, "learning_rate": 5.360444444444445e-05, - "loss": 0.6073, + "loss": 0.8201, "step": 12940 }, { - "epoch": 1.5591138935709126, - "grad_norm": 8.875, + "epoch": 9.236804564907276, + "grad_norm": 8.75, "learning_rate": 5.356e-05, - "loss": 0.609, + "loss": 0.8531, "step": 12950 }, { - "epoch": 1.5603178425234772, - "grad_norm": 7.03125, + "epoch": 9.243937232524964, + "grad_norm": 7.3125, "learning_rate": 5.3515555555555564e-05, - "loss": 0.5725, + "loss": 0.8004, "step": 12960 }, { - "epoch": 1.5615217914760415, - "grad_norm": 11.375, + "epoch": 9.251069900142653, + "grad_norm": 9.1875, "learning_rate": 5.347111111111112e-05, - "loss": 0.6808, + "loss": 0.8026, "step": 12970 }, { - "epoch": 1.562725740428606, - "grad_norm": 8.4375, + "epoch": 9.258202567760343, + "grad_norm": 8.75, "learning_rate": 5.342666666666667e-05, - "loss": 0.6652, + "loss": 0.9001, "step": 12980 }, { - "epoch": 1.5639296893811703, - "grad_norm": 12.625, + "epoch": 9.265335235378032, + "grad_norm": 6.75, "learning_rate": 5.338222222222222e-05, - "loss": 0.6361, + "loss": 0.8698, "step": 12990 }, { - "epoch": 1.5651336383337346, - "grad_norm": 5.875, + "epoch": 9.27246790299572, + "grad_norm": 5.75, "learning_rate": 5.333777777777778e-05, - "loss": 0.539, + "loss": 0.7668, "step": 13000 }, { - "epoch": 1.5651336383337346, - "eval/acc": 42.44186019897461, + "epoch": 9.27246790299572, + "eval/acc": 34.88372039794922, "step": 13000 }, { - "epoch": 1.5651336383337346, - "eval_loss": 2.877701759338379, - "eval_runtime": 0.2098, - "eval_samples_per_second": 204.994, - "eval_steps_per_second": 4.767, + "epoch": 9.27246790299572, + "eval_loss": 3.3350794315338135, + "eval_runtime": 0.2244, + "eval_samples_per_second": 191.644, + "eval_steps_per_second": 4.457, "step": 13000 }, { - "epoch": 1.566337587286299, - "grad_norm": 9.1875, + "epoch": 9.27960057061341, + "grad_norm": 7.90625, "learning_rate": 5.329333333333334e-05, - "loss": 0.5718, + "loss": 0.8908, "step": 13010 }, { - "epoch": 1.5675415362388634, - "grad_norm": 6.78125, + "epoch": 9.286733238231099, + "grad_norm": 7.78125, "learning_rate": 5.3248888888888896e-05, - "loss": 0.6074, + "loss": 0.76, "step": 13020 }, { - "epoch": 1.5687454851914278, - "grad_norm": 7.28125, + "epoch": 9.293865905848788, + "grad_norm": 22.125, "learning_rate": 5.3204444444444444e-05, - "loss": 0.5788, + "loss": 0.8205, "step": 13030 }, { - "epoch": 1.5699494341439921, - "grad_norm": 6.78125, + "epoch": 9.300998573466476, + "grad_norm": 9.6875, "learning_rate": 5.316e-05, - "loss": 0.6445, + "loss": 0.7528, "step": 13040 }, { - "epoch": 1.5711533830965567, - "grad_norm": 7.34375, + "epoch": 9.308131241084165, + "grad_norm": 6.8125, "learning_rate": 5.311555555555556e-05, - "loss": 0.6391, + "loss": 0.8987, "step": 13050 }, { - "epoch": 1.572357332049121, - "grad_norm": 8.375, + "epoch": 9.315263908701855, + "grad_norm": 7.71875, "learning_rate": 5.3071111111111116e-05, - "loss": 0.6101, + "loss": 0.8056, "step": 13060 }, { - "epoch": 1.5735612810016857, - "grad_norm": 9.6875, + "epoch": 9.322396576319543, + "grad_norm": 6.78125, "learning_rate": 5.302666666666667e-05, - "loss": 0.6029, + "loss": 0.7962, "step": 13070 }, { - "epoch": 1.57476522995425, - "grad_norm": 7.5625, + "epoch": 9.329529243937232, + "grad_norm": 6.3125, "learning_rate": 5.298222222222222e-05, - "loss": 0.6034, + "loss": 0.846, "step": 13080 }, { - "epoch": 1.5759691789068144, - "grad_norm": 6.90625, + "epoch": 9.336661911554922, + "grad_norm": 8.75, "learning_rate": 5.2937777777777775e-05, - "loss": 0.629, + "loss": 0.8005, "step": 13090 }, { - "epoch": 1.5771731278593788, - "grad_norm": 8.0625, + "epoch": 9.343794579172611, + "grad_norm": 27.25, "learning_rate": 5.289333333333334e-05, - "loss": 0.5272, + "loss": 0.7313, "step": 13100 }, { - "epoch": 1.5771731278593788, - "eval/acc": 42.44186019897461, + "epoch": 9.343794579172611, + "eval/acc": 32.55813980102539, "step": 13100 }, { - "epoch": 1.5771731278593788, - "eval_loss": 2.883392095565796, - "eval_runtime": 0.2099, - "eval_samples_per_second": 204.824, - "eval_steps_per_second": 4.763, + "epoch": 9.343794579172611, + "eval_loss": 3.3405187129974365, + "eval_runtime": 0.2462, + "eval_samples_per_second": 174.636, + "eval_steps_per_second": 4.061, "step": 13100 }, { - "epoch": 1.5783770768119432, - "grad_norm": 7.46875, + "epoch": 9.350927246790299, + "grad_norm": 7.5625, "learning_rate": 5.284888888888889e-05, - "loss": 0.6273, + "loss": 0.8474, "step": 13110 }, { - "epoch": 1.5795810257645075, - "grad_norm": 7.90625, + "epoch": 9.358059914407988, + "grad_norm": 7.25, "learning_rate": 5.280444444444445e-05, - "loss": 0.6752, + "loss": 0.8104, "step": 13120 }, { - "epoch": 1.580784974717072, - "grad_norm": 9.6875, + "epoch": 9.365192582025678, + "grad_norm": 7.71875, "learning_rate": 5.2759999999999996e-05, - "loss": 0.5769, + "loss": 0.8638, "step": 13130 }, { - "epoch": 1.5819889236696363, - "grad_norm": 8.1875, + "epoch": 9.372325249643367, + "grad_norm": 8.25, "learning_rate": 5.2715555555555565e-05, - "loss": 0.5469, + "loss": 0.7968, "step": 13140 }, { - "epoch": 1.5831928726222007, - "grad_norm": 4.96875, + "epoch": 9.379457917261055, + "grad_norm": 9.4375, "learning_rate": 5.2671111111111114e-05, - "loss": 0.5805, + "loss": 0.692, "step": 13150 }, { - "epoch": 1.5843968215747652, - "grad_norm": 6.65625, + "epoch": 9.386590584878745, + "grad_norm": 6.1875, "learning_rate": 5.262666666666667e-05, - "loss": 0.527, + "loss": 0.8222, "step": 13160 }, { - "epoch": 1.5856007705273296, - "grad_norm": 9.375, + "epoch": 9.393723252496434, + "grad_norm": 9.9375, "learning_rate": 5.2582222222222224e-05, - "loss": 0.5363, + "loss": 0.8494, "step": 13170 }, { - "epoch": 1.5868047194798942, - "grad_norm": 6.15625, + "epoch": 9.400855920114124, + "grad_norm": 8.0, "learning_rate": 5.2537777777777786e-05, - "loss": 0.5795, + "loss": 0.8254, "step": 13180 }, { - "epoch": 1.5880086684324586, - "grad_norm": 6.59375, + "epoch": 9.407988587731811, + "grad_norm": 7.375, "learning_rate": 5.249333333333334e-05, - "loss": 0.567, + "loss": 0.8771, "step": 13190 }, { - "epoch": 1.589212617385023, - "grad_norm": 7.46875, + "epoch": 9.4151212553495, + "grad_norm": 7.34375, "learning_rate": 5.244888888888889e-05, - "loss": 0.5838, + "loss": 0.8563, "step": 13200 }, { - "epoch": 1.589212617385023, - "eval/acc": 44.76744079589844, + "epoch": 9.4151212553495, + "eval/acc": 37.20930099487305, "step": 13200 }, { - "epoch": 1.589212617385023, - "eval_loss": 2.885694742202759, - "eval_runtime": 0.2136, - "eval_samples_per_second": 201.314, - "eval_steps_per_second": 4.682, + "epoch": 9.4151212553495, + "eval_loss": 3.293537139892578, + "eval_runtime": 0.219, + "eval_samples_per_second": 196.361, + "eval_steps_per_second": 4.567, "step": 13200 }, { - "epoch": 1.5904165663375873, - "grad_norm": 7.625, + "epoch": 9.42225392296719, + "grad_norm": 7.1875, "learning_rate": 5.2404444444444445e-05, - "loss": 0.5836, + "loss": 0.769, "step": 13210 }, { - "epoch": 1.5916205152901517, - "grad_norm": 7.28125, + "epoch": 9.429386590584878, + "grad_norm": 8.5, "learning_rate": 5.236e-05, - "loss": 0.6374, + "loss": 0.778, "step": 13220 }, { - "epoch": 1.592824464242716, - "grad_norm": 5.59375, + "epoch": 9.436519258202567, + "grad_norm": 7.6875, "learning_rate": 5.231555555555556e-05, - "loss": 0.5608, + "loss": 0.8043, "step": 13230 }, { - "epoch": 1.5940284131952804, - "grad_norm": 6.15625, + "epoch": 9.443651925820257, + "grad_norm": 7.59375, "learning_rate": 5.227111111111112e-05, - "loss": 0.6031, + "loss": 0.7962, "step": 13240 }, { - "epoch": 1.5952323621478448, - "grad_norm": 6.84375, + "epoch": 9.450784593437946, + "grad_norm": 9.6875, "learning_rate": 5.2226666666666666e-05, - "loss": 0.6458, + "loss": 0.8623, "step": 13250 }, { - "epoch": 1.5964363111004092, - "grad_norm": 7.59375, + "epoch": 9.457917261055634, + "grad_norm": 7.125, "learning_rate": 5.218222222222222e-05, - "loss": 0.5275, + "loss": 0.7408, "step": 13260 }, { - "epoch": 1.5976402600529738, - "grad_norm": 9.4375, + "epoch": 9.465049928673324, + "grad_norm": 8.1875, "learning_rate": 5.213777777777778e-05, - "loss": 0.6249, + "loss": 0.7233, "step": 13270 }, { - "epoch": 1.5988442090055381, - "grad_norm": 8.375, + "epoch": 9.472182596291013, + "grad_norm": 9.375, "learning_rate": 5.209333333333334e-05, - "loss": 0.629, + "loss": 0.7349, "step": 13280 }, { - "epoch": 1.6000481579581027, - "grad_norm": 7.21875, + "epoch": 9.479315263908703, + "grad_norm": 6.75, "learning_rate": 5.2048888888888894e-05, - "loss": 0.6004, + "loss": 0.7311, "step": 13290 }, { - "epoch": 1.601252106910667, - "grad_norm": 6.9375, + "epoch": 9.48644793152639, + "grad_norm": 10.25, "learning_rate": 5.200444444444444e-05, - "loss": 0.4867, + "loss": 0.828, "step": 13300 }, { - "epoch": 1.601252106910667, - "eval/acc": 46.511627197265625, + "epoch": 9.48644793152639, + "eval/acc": 34.88372039794922, "step": 13300 }, { - "epoch": 1.601252106910667, - "eval_loss": 2.8910820484161377, - "eval_runtime": 0.212, - "eval_samples_per_second": 202.807, - "eval_steps_per_second": 4.716, + "epoch": 9.48644793152639, + "eval_loss": 3.376410484313965, + "eval_runtime": 0.2205, + "eval_samples_per_second": 194.974, + "eval_steps_per_second": 4.534, "step": 13300 }, { - "epoch": 1.6024560558632315, - "grad_norm": 6.9375, + "epoch": 9.49358059914408, + "grad_norm": 12.1875, "learning_rate": 5.196e-05, - "loss": 0.6888, + "loss": 0.6994, "step": 13310 }, { - "epoch": 1.6036600048157958, - "grad_norm": 7.84375, + "epoch": 9.50071326676177, + "grad_norm": 10.375, "learning_rate": 5.191555555555556e-05, - "loss": 0.5953, + "loss": 0.7658, "step": 13320 }, { - "epoch": 1.6048639537683602, - "grad_norm": 10.0625, + "epoch": 9.507845934379457, + "grad_norm": 7.625, "learning_rate": 5.1871111111111114e-05, - "loss": 0.6347, + "loss": 0.7453, "step": 13330 }, { - "epoch": 1.6060679027209246, - "grad_norm": 10.75, + "epoch": 9.514978601997147, + "grad_norm": 8.0, "learning_rate": 5.182666666666667e-05, - "loss": 0.5822, + "loss": 0.7407, "step": 13340 }, { - "epoch": 1.607271851673489, - "grad_norm": 7.90625, + "epoch": 9.522111269614836, + "grad_norm": 6.96875, "learning_rate": 5.178222222222222e-05, - "loss": 0.5955, + "loss": 0.8234, "step": 13350 }, { - "epoch": 1.6084758006260533, - "grad_norm": 8.3125, + "epoch": 9.529243937232525, + "grad_norm": 6.59375, "learning_rate": 5.173777777777779e-05, - "loss": 0.5097, + "loss": 0.7517, "step": 13360 }, { - "epoch": 1.6096797495786177, - "grad_norm": 7.03125, + "epoch": 9.536376604850213, + "grad_norm": 7.15625, "learning_rate": 5.1693333333333335e-05, - "loss": 0.6034, + "loss": 0.6939, "step": 13370 }, { - "epoch": 1.6108836985311823, - "grad_norm": 7.375, + "epoch": 9.543509272467903, + "grad_norm": 9.6875, "learning_rate": 5.164888888888889e-05, - "loss": 0.4866, + "loss": 0.7602, "step": 13380 }, { - "epoch": 1.6120876474837467, - "grad_norm": 7.59375, + "epoch": 9.550641940085592, + "grad_norm": 7.375, "learning_rate": 5.1604444444444446e-05, - "loss": 0.548, + "loss": 0.8016, "step": 13390 }, { - "epoch": 1.6132915964363113, - "grad_norm": 7.625, + "epoch": 9.557774607703282, + "grad_norm": 6.9375, "learning_rate": 5.1559999999999994e-05, - "loss": 0.5695, + "loss": 0.8258, "step": 13400 }, { - "epoch": 1.6132915964363113, - "eval/acc": 45.930233001708984, + "epoch": 9.557774607703282, + "eval/acc": 34.88372039794922, "step": 13400 }, { - "epoch": 1.6132915964363113, - "eval_loss": 2.88386607170105, - "eval_runtime": 0.215, - "eval_samples_per_second": 199.962, - "eval_steps_per_second": 4.65, + "epoch": 9.557774607703282, + "eval_loss": 3.3766846656799316, + "eval_runtime": 0.2271, + "eval_samples_per_second": 189.368, + "eval_steps_per_second": 4.404, "step": 13400 }, { - "epoch": 1.6144955453888756, - "grad_norm": 8.5, + "epoch": 9.56490727532097, + "grad_norm": 6.875, "learning_rate": 5.151555555555556e-05, - "loss": 0.5547, + "loss": 0.7926, "step": 13410 }, { - "epoch": 1.61569949434144, - "grad_norm": 6.625, + "epoch": 9.572039942938659, + "grad_norm": 6.28125, "learning_rate": 5.147111111111111e-05, - "loss": 0.5789, + "loss": 0.6912, "step": 13420 }, { - "epoch": 1.6169034432940044, - "grad_norm": 13.8125, + "epoch": 9.579172610556348, + "grad_norm": 60.5, "learning_rate": 5.142666666666667e-05, - "loss": 0.6012, + "loss": 0.8117, "step": 13430 }, { - "epoch": 1.6181073922465687, - "grad_norm": 7.59375, + "epoch": 9.586305278174038, + "grad_norm": 10.5, "learning_rate": 5.138222222222222e-05, - "loss": 0.539, + "loss": 0.7794, "step": 13440 }, { - "epoch": 1.6193113411991331, - "grad_norm": 7.0, + "epoch": 9.593437945791726, + "grad_norm": 5.6875, "learning_rate": 5.1337777777777784e-05, - "loss": 0.5513, + "loss": 0.6753, "step": 13450 }, { - "epoch": 1.6205152901516975, - "grad_norm": 6.875, + "epoch": 9.600570613409415, + "grad_norm": 8.4375, "learning_rate": 5.129333333333334e-05, - "loss": 0.5788, + "loss": 0.8676, "step": 13460 }, { - "epoch": 1.6217192391042619, - "grad_norm": 10.0, + "epoch": 9.607703281027105, + "grad_norm": 7.34375, "learning_rate": 5.124888888888889e-05, - "loss": 0.6301, + "loss": 0.7326, "step": 13470 }, { - "epoch": 1.6229231880568262, - "grad_norm": 7.15625, + "epoch": 9.614835948644792, + "grad_norm": 13.9375, "learning_rate": 5.120444444444444e-05, - "loss": 0.5939, + "loss": 0.8177, "step": 13480 }, { - "epoch": 1.6241271370093908, - "grad_norm": 7.6875, + "epoch": 9.621968616262482, + "grad_norm": 8.3125, "learning_rate": 5.1160000000000005e-05, - "loss": 0.575, + "loss": 0.7928, "step": 13490 }, { - "epoch": 1.6253310859619552, - "grad_norm": 9.125, + "epoch": 9.629101283880171, + "grad_norm": 5.3125, "learning_rate": 5.111555555555556e-05, - "loss": 0.7391, + "loss": 0.7693, "step": 13500 }, { - "epoch": 1.6253310859619552, - "eval/acc": 46.511627197265625, + "epoch": 9.629101283880171, + "eval/acc": 37.20930099487305, "step": 13500 }, { - "epoch": 1.6253310859619552, - "eval_loss": 2.8773036003112793, - "eval_runtime": 0.211, - "eval_samples_per_second": 203.837, - "eval_steps_per_second": 4.74, + "epoch": 9.629101283880171, + "eval_loss": 3.340432643890381, + "eval_runtime": 0.2212, + "eval_samples_per_second": 194.437, + "eval_steps_per_second": 4.522, "step": 13500 }, { - "epoch": 1.6265350349145198, - "grad_norm": 6.34375, + "epoch": 9.63623395149786, + "grad_norm": 6.4375, "learning_rate": 5.1071111111111115e-05, - "loss": 0.5839, + "loss": 0.7974, "step": 13510 }, { - "epoch": 1.6277389838670842, - "grad_norm": 8.125, + "epoch": 9.643366619115548, + "grad_norm": 11.375, "learning_rate": 5.1026666666666664e-05, - "loss": 0.6332, + "loss": 0.8533, "step": 13520 }, { - "epoch": 1.6289429328196485, - "grad_norm": 7.53125, + "epoch": 9.650499286733238, + "grad_norm": 8.4375, "learning_rate": 5.098222222222222e-05, - "loss": 0.5676, + "loss": 0.7578, "step": 13530 }, { - "epoch": 1.630146881772213, - "grad_norm": 5.8125, + "epoch": 9.657631954350927, + "grad_norm": 6.96875, "learning_rate": 5.093777777777778e-05, - "loss": 0.5453, + "loss": 0.8348, "step": 13540 }, { - "epoch": 1.6313508307247773, - "grad_norm": 7.15625, + "epoch": 9.664764621968617, + "grad_norm": 6.75, "learning_rate": 5.0893333333333336e-05, - "loss": 0.6429, + "loss": 0.7562, "step": 13550 }, { - "epoch": 1.6325547796773416, - "grad_norm": 9.25, + "epoch": 9.671897289586305, + "grad_norm": 7.4375, "learning_rate": 5.084888888888889e-05, - "loss": 0.6338, + "loss": 0.8667, "step": 13560 }, { - "epoch": 1.633758728629906, - "grad_norm": 9.1875, + "epoch": 9.679029957203994, + "grad_norm": 11.4375, "learning_rate": 5.080444444444445e-05, - "loss": 0.5679, + "loss": 0.7158, "step": 13570 }, { - "epoch": 1.6349626775824704, - "grad_norm": 6.8125, + "epoch": 9.686162624821684, + "grad_norm": 6.15625, "learning_rate": 5.076000000000001e-05, - "loss": 0.5705, + "loss": 0.7153, "step": 13580 }, { - "epoch": 1.6361666265350348, - "grad_norm": 10.375, + "epoch": 9.693295292439373, + "grad_norm": 10.25, "learning_rate": 5.0715555555555564e-05, - "loss": 0.6313, + "loss": 0.7698, "step": 13590 }, { - "epoch": 1.6373705754875993, - "grad_norm": 6.6875, + "epoch": 9.70042796005706, + "grad_norm": 12.0, "learning_rate": 5.067111111111111e-05, - "loss": 0.5634, + "loss": 0.8033, "step": 13600 }, { - "epoch": 1.6373705754875993, - "eval/acc": 46.511627197265625, + "epoch": 9.70042796005706, + "eval/acc": 37.20930099487305, "step": 13600 }, { - "epoch": 1.6373705754875993, - "eval_loss": 2.8538527488708496, - "eval_runtime": 0.2095, - "eval_samples_per_second": 205.241, - "eval_steps_per_second": 4.773, + "epoch": 9.70042796005706, + "eval_loss": 3.325901985168457, + "eval_runtime": 0.2235, + "eval_samples_per_second": 192.369, + "eval_steps_per_second": 4.474, "step": 13600 }, { - "epoch": 1.6385745244401637, - "grad_norm": 7.25, + "epoch": 9.70756062767475, + "grad_norm": 11.5, "learning_rate": 5.062666666666667e-05, - "loss": 0.656, + "loss": 0.7757, "step": 13610 }, { - "epoch": 1.6397784733927283, - "grad_norm": 9.875, + "epoch": 9.71469329529244, + "grad_norm": 7.0625, "learning_rate": 5.058222222222222e-05, - "loss": 0.6261, + "loss": 0.7335, "step": 13620 }, { - "epoch": 1.6409824223452927, - "grad_norm": 8.25, + "epoch": 9.721825962910128, + "grad_norm": 7.09375, "learning_rate": 5.0537777777777785e-05, - "loss": 0.5124, + "loss": 0.7219, "step": 13630 }, { - "epoch": 1.642186371297857, - "grad_norm": 7.5625, + "epoch": 9.728958630527817, + "grad_norm": 28.625, "learning_rate": 5.049333333333334e-05, - "loss": 0.6025, + "loss": 0.7445, "step": 13640 }, { - "epoch": 1.6433903202504214, - "grad_norm": 7.875, + "epoch": 9.736091298145507, + "grad_norm": 6.34375, "learning_rate": 5.044888888888889e-05, - "loss": 0.6363, + "loss": 0.7203, "step": 13650 }, { - "epoch": 1.6445942692029858, - "grad_norm": 8.625, + "epoch": 9.743223965763196, + "grad_norm": 12.25, "learning_rate": 5.0404444444444444e-05, - "loss": 0.6095, + "loss": 0.815, "step": 13660 }, { - "epoch": 1.6457982181555502, - "grad_norm": 7.53125, + "epoch": 9.750356633380884, + "grad_norm": 8.125, "learning_rate": 5.0360000000000006e-05, - "loss": 0.5122, + "loss": 0.6969, "step": 13670 }, { - "epoch": 1.6470021671081145, - "grad_norm": 7.625, + "epoch": 9.757489300998573, + "grad_norm": 8.9375, "learning_rate": 5.031555555555556e-05, - "loss": 0.6545, + "loss": 0.742, "step": 13680 }, { - "epoch": 1.648206116060679, - "grad_norm": 8.4375, + "epoch": 9.764621968616263, + "grad_norm": 17.125, "learning_rate": 5.0271111111111116e-05, - "loss": 0.6044, + "loss": 0.8526, "step": 13690 }, { - "epoch": 1.6494100650132433, - "grad_norm": 7.03125, + "epoch": 9.771754636233952, + "grad_norm": 9.3125, "learning_rate": 5.0226666666666665e-05, - "loss": 0.5725, + "loss": 0.795, "step": 13700 }, { - "epoch": 1.6494100650132433, - "eval/acc": 46.511627197265625, + "epoch": 9.771754636233952, + "eval/acc": 37.20930099487305, "step": 13700 }, { - "epoch": 1.6494100650132433, - "eval_loss": 2.8802239894866943, - "eval_runtime": 0.2091, - "eval_samples_per_second": 205.668, - "eval_steps_per_second": 4.783, + "epoch": 9.771754636233952, + "eval_loss": 3.3737363815307617, + "eval_runtime": 0.2349, + "eval_samples_per_second": 183.026, + "eval_steps_per_second": 4.256, "step": 13700 }, { - "epoch": 1.6506140139658079, - "grad_norm": 7.84375, + "epoch": 9.77888730385164, + "grad_norm": 7.28125, "learning_rate": 5.018222222222222e-05, - "loss": 0.6112, + "loss": 0.7804, "step": 13710 }, { - "epoch": 1.6518179629183722, - "grad_norm": 12.0625, + "epoch": 9.78601997146933, + "grad_norm": 8.25, "learning_rate": 5.013777777777778e-05, - "loss": 0.5524, + "loss": 0.8201, "step": 13720 }, { - "epoch": 1.6530219118709368, - "grad_norm": 7.84375, + "epoch": 9.793152639087019, + "grad_norm": 7.125, "learning_rate": 5.009333333333334e-05, - "loss": 0.6066, + "loss": 0.7495, "step": 13730 }, { - "epoch": 1.6542258608235012, - "grad_norm": 6.15625, + "epoch": 9.800285306704708, + "grad_norm": 7.96875, "learning_rate": 5.004888888888889e-05, - "loss": 0.5683, + "loss": 0.7827, "step": 13740 }, { - "epoch": 1.6554298097760656, - "grad_norm": 7.03125, + "epoch": 9.807417974322396, + "grad_norm": 6.5625, "learning_rate": 5.000444444444444e-05, - "loss": 0.6051, + "loss": 0.8317, "step": 13750 }, { - "epoch": 1.65663375872863, - "grad_norm": 7.75, + "epoch": 9.814550641940086, + "grad_norm": 7.8125, "learning_rate": 4.996e-05, - "loss": 0.5289, + "loss": 0.8547, "step": 13760 }, { - "epoch": 1.6578377076811943, - "grad_norm": 6.5, + "epoch": 9.821683309557775, + "grad_norm": 7.15625, "learning_rate": 4.991555555555556e-05, - "loss": 0.5031, + "loss": 0.8679, "step": 13770 }, { - "epoch": 1.6590416566337587, - "grad_norm": 7.34375, + "epoch": 9.828815977175463, + "grad_norm": 7.8125, "learning_rate": 4.987111111111111e-05, - "loss": 0.6406, + "loss": 0.7479, "step": 13780 }, { - "epoch": 1.660245605586323, - "grad_norm": 8.1875, + "epoch": 9.835948644793152, + "grad_norm": 15.5, "learning_rate": 4.982666666666667e-05, - "loss": 0.5593, + "loss": 0.8501, "step": 13790 }, { - "epoch": 1.6614495545388874, - "grad_norm": 12.25, + "epoch": 9.843081312410842, + "grad_norm": 8.1875, "learning_rate": 4.9782222222222224e-05, - "loss": 0.4899, + "loss": 0.7662, "step": 13800 }, { - "epoch": 1.6614495545388874, - "eval/acc": 46.511627197265625, + "epoch": 9.843081312410842, + "eval/acc": 37.20930099487305, "step": 13800 }, { - "epoch": 1.6614495545388874, - "eval_loss": 2.8814125061035156, - "eval_runtime": 0.2176, - "eval_samples_per_second": 197.608, - "eval_steps_per_second": 4.596, + "epoch": 9.843081312410842, + "eval_loss": 3.3852930068969727, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.716, + "eval_steps_per_second": 4.528, "step": 13800 }, { - "epoch": 1.6626535034914518, - "grad_norm": 5.40625, + "epoch": 9.850213980028531, + "grad_norm": 8.3125, "learning_rate": 4.973777777777778e-05, - "loss": 0.5799, + "loss": 0.8303, "step": 13810 }, { - "epoch": 1.6638574524440164, - "grad_norm": 7.65625, + "epoch": 9.857346647646219, + "grad_norm": 8.8125, "learning_rate": 4.9693333333333334e-05, - "loss": 0.6122, + "loss": 0.7875, "step": 13820 }, { - "epoch": 1.6650614013965808, - "grad_norm": 8.1875, + "epoch": 9.864479315263909, + "grad_norm": 7.625, "learning_rate": 4.964888888888889e-05, - "loss": 0.593, + "loss": 0.7952, "step": 13830 }, { - "epoch": 1.6662653503491454, - "grad_norm": 6.5625, + "epoch": 9.871611982881598, + "grad_norm": 6.96875, "learning_rate": 4.9604444444444445e-05, - "loss": 0.6101, + "loss": 0.8041, "step": 13840 }, { - "epoch": 1.6674692993017097, - "grad_norm": 11.625, + "epoch": 9.878744650499288, + "grad_norm": 6.375, "learning_rate": 4.956e-05, - "loss": 0.6803, + "loss": 0.6869, "step": 13850 }, { - "epoch": 1.668673248254274, - "grad_norm": 7.5625, + "epoch": 9.885877318116975, + "grad_norm": 7.125, "learning_rate": 4.951555555555556e-05, - "loss": 0.6574, + "loss": 0.7707, "step": 13860 }, { - "epoch": 1.6698771972068385, - "grad_norm": 6.09375, + "epoch": 9.893009985734665, + "grad_norm": 8.125, "learning_rate": 4.947111111111111e-05, - "loss": 0.6698, + "loss": 0.7512, "step": 13870 }, { - "epoch": 1.6710811461594028, - "grad_norm": 7.75, + "epoch": 9.900142653352354, + "grad_norm": 8.8125, "learning_rate": 4.942666666666667e-05, - "loss": 0.6114, + "loss": 0.8059, "step": 13880 }, { - "epoch": 1.6722850951119672, - "grad_norm": 7.25, + "epoch": 9.907275320970044, + "grad_norm": 7.90625, "learning_rate": 4.938222222222223e-05, - "loss": 0.6604, + "loss": 0.729, "step": 13890 }, { - "epoch": 1.6734890440645316, - "grad_norm": 8.25, + "epoch": 9.914407988587731, + "grad_norm": 6.625, "learning_rate": 4.933777777777778e-05, - "loss": 0.6482, + "loss": 0.7958, "step": 13900 }, { - "epoch": 1.6734890440645316, - "eval/acc": 46.511627197265625, + "epoch": 9.914407988587731, + "eval/acc": 37.20930099487305, "step": 13900 }, { - "epoch": 1.6734890440645316, - "eval_loss": 2.856815814971924, - "eval_runtime": 0.212, - "eval_samples_per_second": 202.827, - "eval_steps_per_second": 4.717, + "epoch": 9.914407988587731, + "eval_loss": 3.400364875793457, + "eval_runtime": 0.2295, + "eval_samples_per_second": 187.329, + "eval_steps_per_second": 4.356, "step": 13900 }, { - "epoch": 1.674692993017096, - "grad_norm": 7.15625, + "epoch": 9.921540656205421, + "grad_norm": 7.0625, "learning_rate": 4.929333333333334e-05, - "loss": 0.6165, + "loss": 0.7314, "step": 13910 }, { - "epoch": 1.6758969419696603, - "grad_norm": 7.65625, + "epoch": 9.92867332382311, + "grad_norm": 6.78125, "learning_rate": 4.9248888888888886e-05, - "loss": 0.5861, + "loss": 0.7581, "step": 13920 }, { - "epoch": 1.677100890922225, - "grad_norm": 6.8125, + "epoch": 9.935805991440798, + "grad_norm": 8.6875, "learning_rate": 4.920444444444445e-05, - "loss": 0.6019, + "loss": 0.7865, "step": 13930 }, { - "epoch": 1.6783048398747893, - "grad_norm": 7.1875, + "epoch": 9.942938659058488, + "grad_norm": 7.78125, "learning_rate": 4.9160000000000004e-05, - "loss": 0.5639, + "loss": 0.7174, "step": 13940 }, { - "epoch": 1.6795087888273539, - "grad_norm": 8.875, + "epoch": 9.950071326676177, + "grad_norm": 7.6875, "learning_rate": 4.911555555555556e-05, - "loss": 0.6077, + "loss": 0.855, "step": 13950 }, { - "epoch": 1.6807127377799183, - "grad_norm": 7.75, + "epoch": 9.957203994293867, + "grad_norm": 7.46875, "learning_rate": 4.9071111111111114e-05, - "loss": 0.5095, + "loss": 0.7511, "step": 13960 }, { - "epoch": 1.6819166867324826, - "grad_norm": 6.75, + "epoch": 9.964336661911554, + "grad_norm": 6.34375, "learning_rate": 4.902666666666667e-05, - "loss": 0.6097, + "loss": 0.6901, "step": 13970 }, { - "epoch": 1.683120635685047, - "grad_norm": 10.0625, + "epoch": 9.971469329529244, + "grad_norm": 19.125, "learning_rate": 4.8982222222222225e-05, - "loss": 0.5662, + "loss": 0.7621, "step": 13980 }, { - "epoch": 1.6843245846376114, - "grad_norm": 9.25, + "epoch": 9.978601997146933, + "grad_norm": 9.375, "learning_rate": 4.893777777777778e-05, - "loss": 0.6319, + "loss": 0.7466, "step": 13990 }, { - "epoch": 1.6855285335901757, - "grad_norm": 6.28125, + "epoch": 9.985734664764623, + "grad_norm": 7.1875, "learning_rate": 4.8893333333333335e-05, - "loss": 0.5154, + "loss": 0.749, "step": 14000 }, { - "epoch": 1.6855285335901757, - "eval/acc": 46.511627197265625, + "epoch": 9.985734664764623, + "eval/acc": 37.20930099487305, "step": 14000 }, { - "epoch": 1.6855285335901757, - "eval_loss": 2.8956446647644043, - "eval_runtime": 0.2162, - "eval_samples_per_second": 198.844, - "eval_steps_per_second": 4.624, + "epoch": 9.985734664764623, + "eval_loss": 3.3502047061920166, + "eval_runtime": 0.2251, + "eval_samples_per_second": 191.024, + "eval_steps_per_second": 4.442, "step": 14000 }, { - "epoch": 1.68673248254274, - "grad_norm": 8.8125, + "epoch": 9.99286733238231, + "grad_norm": 9.5625, "learning_rate": 4.884888888888889e-05, - "loss": 0.5645, + "loss": 0.6932, "step": 14010 }, { - "epoch": 1.6879364314953045, - "grad_norm": 8.8125, + "epoch": 10.0, + "grad_norm": 5.875, "learning_rate": 4.8804444444444445e-05, - "loss": 0.6886, + "loss": 0.7939, "step": 14020 }, { - "epoch": 1.6891403804478688, - "grad_norm": 9.3125, + "epoch": 10.00713266761769, + "grad_norm": 7.15625, "learning_rate": 4.876e-05, - "loss": 0.5767, + "loss": 0.8124, "step": 14030 }, { - "epoch": 1.6903443294004334, - "grad_norm": 11.625, + "epoch": 10.014265335235377, + "grad_norm": 6.0625, "learning_rate": 4.8715555555555556e-05, - "loss": 0.542, + "loss": 0.6855, "step": 14040 }, { - "epoch": 1.6915482783529978, - "grad_norm": 6.5625, + "epoch": 10.021398002853067, + "grad_norm": 50.75, "learning_rate": 4.867111111111111e-05, - "loss": 0.538, + "loss": 0.8354, "step": 14050 }, { - "epoch": 1.6927522273055624, - "grad_norm": 6.9375, + "epoch": 10.028530670470756, + "grad_norm": 7.46875, "learning_rate": 4.862666666666667e-05, - "loss": 0.5314, + "loss": 0.8605, "step": 14060 }, { - "epoch": 1.6939561762581268, - "grad_norm": 7.9375, + "epoch": 10.035663338088446, + "grad_norm": 9.625, "learning_rate": 4.858222222222222e-05, - "loss": 0.5909, + "loss": 0.8626, "step": 14070 }, { - "epoch": 1.6951601252106911, - "grad_norm": 7.09375, + "epoch": 10.042796005706133, + "grad_norm": 6.125, "learning_rate": 4.8537777777777784e-05, - "loss": 0.5809, + "loss": 0.7302, "step": 14080 }, { - "epoch": 1.6963640741632555, - "grad_norm": 7.9375, + "epoch": 10.049928673323823, + "grad_norm": 8.0625, "learning_rate": 4.849333333333333e-05, - "loss": 0.576, + "loss": 0.9058, "step": 14090 }, { - "epoch": 1.69756802311582, - "grad_norm": 6.1875, + "epoch": 10.057061340941512, + "grad_norm": 7.3125, "learning_rate": 4.8448888888888894e-05, - "loss": 0.5162, + "loss": 0.7981, "step": 14100 }, { - "epoch": 1.69756802311582, - "eval/acc": 45.930233001708984, + "epoch": 10.057061340941512, + "eval/acc": 46.511627197265625, "step": 14100 }, { - "epoch": 1.69756802311582, - "eval_loss": 2.8892974853515625, - "eval_runtime": 0.2094, - "eval_samples_per_second": 205.308, - "eval_steps_per_second": 4.775, + "epoch": 10.057061340941512, + "eval_loss": 2.7371480464935303, + "eval_runtime": 1.1832, + "eval_samples_per_second": 36.344, + "eval_steps_per_second": 0.845, "step": 14100 }, { - "epoch": 1.6987719720683843, - "grad_norm": 6.84375, + "epoch": 10.064194008559202, + "grad_norm": 12.625, "learning_rate": 4.840444444444445e-05, - "loss": 0.6473, + "loss": 0.7461, "step": 14110 }, { - "epoch": 1.6999759210209486, - "grad_norm": 6.3125, + "epoch": 10.07132667617689, + "grad_norm": 7.375, "learning_rate": 4.836e-05, - "loss": 0.5388, + "loss": 0.8649, "step": 14120 }, { - "epoch": 1.701179869973513, - "grad_norm": 9.25, + "epoch": 10.078459343794579, + "grad_norm": 10.6875, "learning_rate": 4.831555555555556e-05, - "loss": 0.6188, + "loss": 0.8143, "step": 14130 }, { - "epoch": 1.7023838189260774, - "grad_norm": 8.875, + "epoch": 10.085592011412269, + "grad_norm": 43.5, "learning_rate": 4.827111111111111e-05, - "loss": 0.5347, + "loss": 0.8249, "step": 14140 }, { - "epoch": 1.703587767878642, - "grad_norm": 7.40625, + "epoch": 10.092724679029958, + "grad_norm": 6.6875, "learning_rate": 4.822666666666667e-05, - "loss": 0.5254, + "loss": 0.6324, "step": 14150 }, { - "epoch": 1.7047917168312063, - "grad_norm": 8.125, + "epoch": 10.099857346647646, + "grad_norm": 9.6875, "learning_rate": 4.8182222222222225e-05, - "loss": 0.5787, + "loss": 0.7795, "step": 14160 }, { - "epoch": 1.705995665783771, - "grad_norm": 9.875, + "epoch": 10.106990014265335, + "grad_norm": 7.8125, "learning_rate": 4.813777777777778e-05, - "loss": 0.5102, + "loss": 0.8453, "step": 14170 }, { - "epoch": 1.7071996147363353, - "grad_norm": 7.34375, + "epoch": 10.114122681883025, + "grad_norm": 6.21875, "learning_rate": 4.8093333333333336e-05, - "loss": 0.5871, + "loss": 0.735, "step": 14180 }, { - "epoch": 1.7084035636888997, - "grad_norm": 5.71875, + "epoch": 10.121255349500712, + "grad_norm": 8.1875, "learning_rate": 4.804888888888889e-05, - "loss": 0.6621, + "loss": 0.6646, "step": 14190 }, { - "epoch": 1.709607512641464, - "grad_norm": 8.625, + "epoch": 10.128388017118402, + "grad_norm": 38.0, "learning_rate": 4.8004444444444446e-05, - "loss": 0.5733, + "loss": 0.7963, "step": 14200 }, { - "epoch": 1.709607512641464, + "epoch": 10.128388017118402, "eval/acc": 44.1860466003418, "step": 14200 }, { - "epoch": 1.709607512641464, - "eval_loss": 2.9033682346343994, - "eval_runtime": 0.2132, - "eval_samples_per_second": 201.727, - "eval_steps_per_second": 4.691, + "epoch": 10.128388017118402, + "eval_loss": 2.7271535396575928, + "eval_runtime": 1.4583, + "eval_samples_per_second": 29.486, + "eval_steps_per_second": 0.686, "step": 14200 }, { - "epoch": 1.7108114615940284, - "grad_norm": 7.375, + "epoch": 10.135520684736091, + "grad_norm": 5.78125, "learning_rate": 4.796e-05, - "loss": 0.5375, + "loss": 0.693, "step": 14210 }, { - "epoch": 1.7120154105465928, - "grad_norm": 6.875, + "epoch": 10.142653352353781, + "grad_norm": 11.875, "learning_rate": 4.791555555555556e-05, - "loss": 0.5126, + "loss": 0.7578, "step": 14220 }, { - "epoch": 1.7132193594991572, - "grad_norm": 6.625, + "epoch": 10.149786019971469, + "grad_norm": 9.25, "learning_rate": 4.787111111111111e-05, - "loss": 0.5148, + "loss": 0.8127, "step": 14230 }, { - "epoch": 1.7144233084517215, - "grad_norm": 8.6875, + "epoch": 10.156918687589158, + "grad_norm": 9.1875, "learning_rate": 4.782666666666667e-05, - "loss": 0.6544, + "loss": 0.6935, "step": 14240 }, { - "epoch": 1.715627257404286, - "grad_norm": 7.28125, + "epoch": 10.164051355206848, + "grad_norm": 8.25, "learning_rate": 4.778222222222222e-05, - "loss": 0.5869, + "loss": 0.7233, "step": 14250 }, { - "epoch": 1.7168312063568505, - "grad_norm": 6.96875, + "epoch": 10.171184022824537, + "grad_norm": 6.71875, "learning_rate": 4.7737777777777785e-05, - "loss": 0.4857, + "loss": 0.8749, "step": 14260 }, { - "epoch": 1.7180351553094149, - "grad_norm": 6.90625, + "epoch": 10.178316690442225, + "grad_norm": 7.84375, "learning_rate": 4.769333333333333e-05, - "loss": 0.5416, + "loss": 0.7786, "step": 14270 }, { - "epoch": 1.7192391042619795, - "grad_norm": 9.5, + "epoch": 10.185449358059914, + "grad_norm": 9.1875, "learning_rate": 4.7648888888888895e-05, - "loss": 0.6318, + "loss": 0.7024, "step": 14280 }, { - "epoch": 1.7204430532145438, - "grad_norm": 8.125, + "epoch": 10.192582025677604, + "grad_norm": 7.78125, "learning_rate": 4.7604444444444443e-05, - "loss": 0.5763, + "loss": 0.8525, "step": 14290 }, { - "epoch": 1.7216470021671082, - "grad_norm": 6.6875, + "epoch": 10.199714693295292, + "grad_norm": 6.90625, "learning_rate": 4.7560000000000005e-05, - "loss": 0.568, + "loss": 0.8181, "step": 14300 }, { - "epoch": 1.7216470021671082, - "eval/acc": 47.093021392822266, + "epoch": 10.199714693295292, + "eval/acc": 46.511627197265625, "step": 14300 }, { - "epoch": 1.7216470021671082, - "eval_loss": 2.8553571701049805, - "eval_runtime": 0.2112, - "eval_samples_per_second": 203.63, - "eval_steps_per_second": 4.736, + "epoch": 10.199714693295292, + "eval_loss": 2.766357898712158, + "eval_runtime": 2.3027, + "eval_samples_per_second": 18.674, + "eval_steps_per_second": 0.434, "step": 14300 }, { - "epoch": 1.7228509511196726, - "grad_norm": 9.4375, + "epoch": 10.206847360912981, + "grad_norm": 5.90625, "learning_rate": 4.751555555555556e-05, - "loss": 0.6341, + "loss": 0.8026, "step": 14310 }, { - "epoch": 1.724054900072237, - "grad_norm": 6.90625, + "epoch": 10.21398002853067, + "grad_norm": 7.3125, "learning_rate": 4.747111111111111e-05, - "loss": 0.5031, + "loss": 0.8758, "step": 14320 }, { - "epoch": 1.7252588490248013, - "grad_norm": 7.59375, + "epoch": 10.22111269614836, + "grad_norm": 9.4375, "learning_rate": 4.742666666666667e-05, - "loss": 0.5438, + "loss": 0.7889, "step": 14330 }, { - "epoch": 1.7264627979773657, - "grad_norm": 6.09375, + "epoch": 10.228245363766048, + "grad_norm": 8.0, "learning_rate": 4.738222222222222e-05, - "loss": 0.6364, + "loss": 0.7343, "step": 14340 }, { - "epoch": 1.72766674692993, - "grad_norm": 7.0625, + "epoch": 10.235378031383737, + "grad_norm": 6.59375, "learning_rate": 4.733777777777778e-05, - "loss": 0.6025, + "loss": 0.788, "step": 14350 }, { - "epoch": 1.7288706958824944, - "grad_norm": 11.3125, + "epoch": 10.242510699001427, + "grad_norm": 9.1875, "learning_rate": 4.729333333333334e-05, - "loss": 0.5846, + "loss": 0.8068, "step": 14360 }, { - "epoch": 1.730074644835059, - "grad_norm": 5.5625, + "epoch": 10.249643366619116, + "grad_norm": 7.53125, "learning_rate": 4.724888888888889e-05, - "loss": 0.5983, + "loss": 0.8188, "step": 14370 }, { - "epoch": 1.7312785937876234, - "grad_norm": 8.6875, + "epoch": 10.256776034236804, + "grad_norm": 7.1875, "learning_rate": 4.720444444444445e-05, - "loss": 0.5374, + "loss": 0.7643, "step": 14380 }, { - "epoch": 1.732482542740188, - "grad_norm": 7.4375, + "epoch": 10.263908701854493, + "grad_norm": 9.125, "learning_rate": 4.716e-05, - "loss": 0.5893, + "loss": 0.7052, "step": 14390 }, { - "epoch": 1.7336864916927524, - "grad_norm": 6.90625, + "epoch": 10.271041369472183, + "grad_norm": 10.5625, "learning_rate": 4.711555555555556e-05, - "loss": 0.5874, + "loss": 0.762, "step": 14400 }, { - "epoch": 1.7336864916927524, - "eval/acc": 44.1860466003418, + "epoch": 10.271041369472183, + "eval/acc": 46.511627197265625, "step": 14400 }, { - "epoch": 1.7336864916927524, - "eval_loss": 2.897413969039917, - "eval_runtime": 0.2186, - "eval_samples_per_second": 196.696, - "eval_steps_per_second": 4.574, + "epoch": 10.271041369472183, + "eval_loss": 2.774780750274658, + "eval_runtime": 1.2232, + "eval_samples_per_second": 35.152, + "eval_steps_per_second": 0.817, "step": 14400 }, { - "epoch": 1.7348904406453167, - "grad_norm": 8.0625, + "epoch": 10.278174037089872, + "grad_norm": 7.5625, "learning_rate": 4.707111111111111e-05, - "loss": 0.6197, + "loss": 0.8322, "step": 14410 }, { - "epoch": 1.736094389597881, - "grad_norm": 7.875, + "epoch": 10.28530670470756, + "grad_norm": 35.25, "learning_rate": 4.702666666666667e-05, - "loss": 0.5427, + "loss": 0.8043, "step": 14420 }, { - "epoch": 1.7372983385504455, - "grad_norm": 10.5625, + "epoch": 10.29243937232525, + "grad_norm": 7.09375, "learning_rate": 4.6982222222222223e-05, - "loss": 0.5801, + "loss": 0.7257, "step": 14430 }, { - "epoch": 1.7385022875030098, - "grad_norm": 11.5625, + "epoch": 10.29957203994294, + "grad_norm": 15.375, "learning_rate": 4.693777777777778e-05, - "loss": 0.5667, + "loss": 0.7922, "step": 14440 }, { - "epoch": 1.7397062364555742, - "grad_norm": 7.71875, + "epoch": 10.306704707560627, + "grad_norm": 7.09375, "learning_rate": 4.6893333333333334e-05, - "loss": 0.6626, + "loss": 0.694, "step": 14450 }, { - "epoch": 1.7409101854081386, - "grad_norm": 5.90625, + "epoch": 10.313837375178316, + "grad_norm": 7.0625, "learning_rate": 4.684888888888889e-05, - "loss": 0.6022, + "loss": 0.7734, "step": 14460 }, { - "epoch": 1.742114134360703, - "grad_norm": 9.9375, + "epoch": 10.320970042796006, + "grad_norm": 6.75, "learning_rate": 4.6804444444444444e-05, - "loss": 0.6492, + "loss": 0.7469, "step": 14470 }, { - "epoch": 1.7433180833132675, - "grad_norm": 6.6875, + "epoch": 10.328102710413695, + "grad_norm": 5.9375, "learning_rate": 4.6760000000000006e-05, - "loss": 0.6251, + "loss": 0.6948, "step": 14480 }, { - "epoch": 1.744522032265832, - "grad_norm": 5.65625, + "epoch": 10.335235378031383, + "grad_norm": 7.15625, "learning_rate": 4.6715555555555555e-05, - "loss": 0.5951, + "loss": 0.7593, "step": 14490 }, { - "epoch": 1.7457259812183965, - "grad_norm": 7.53125, + "epoch": 10.342368045649073, + "grad_norm": 26.875, "learning_rate": 4.667111111111112e-05, - "loss": 0.5074, + "loss": 0.7302, "step": 14500 }, { - "epoch": 1.7457259812183965, - "eval/acc": 45.930233001708984, + "epoch": 10.342368045649073, + "eval/acc": 44.1860466003418, "step": 14500 }, { - "epoch": 1.7457259812183965, - "eval_loss": 2.843892812728882, - "eval_runtime": 0.2078, - "eval_samples_per_second": 206.911, - "eval_steps_per_second": 4.812, + "epoch": 10.342368045649073, + "eval_loss": 2.7937443256378174, + "eval_runtime": 0.2689, + "eval_samples_per_second": 159.899, + "eval_steps_per_second": 3.719, "step": 14500 }, { - "epoch": 1.7469299301709609, - "grad_norm": 8.8125, + "epoch": 10.349500713266762, + "grad_norm": 53.75, "learning_rate": 4.6626666666666665e-05, - "loss": 0.5739, + "loss": 0.8025, "step": 14510 }, { - "epoch": 1.7481338791235252, - "grad_norm": 9.5, + "epoch": 10.356633380884452, + "grad_norm": 10.4375, "learning_rate": 4.658222222222223e-05, - "loss": 0.5531, + "loss": 0.6807, "step": 14520 }, { - "epoch": 1.7493378280760896, - "grad_norm": 8.4375, + "epoch": 10.36376604850214, + "grad_norm": 17.5, "learning_rate": 4.653777777777778e-05, - "loss": 0.5929, + "loss": 0.7773, "step": 14530 }, { - "epoch": 1.750541777028654, - "grad_norm": 6.78125, + "epoch": 10.370898716119829, + "grad_norm": 9.0625, "learning_rate": 4.649333333333333e-05, - "loss": 0.6202, + "loss": 0.7322, "step": 14540 }, { - "epoch": 1.7517457259812184, - "grad_norm": 7.28125, + "epoch": 10.378031383737518, + "grad_norm": 7.5, "learning_rate": 4.644888888888889e-05, - "loss": 0.6164, + "loss": 0.801, "step": 14550 }, { - "epoch": 1.7529496749337827, - "grad_norm": 9.0625, + "epoch": 10.385164051355208, + "grad_norm": 7.03125, "learning_rate": 4.640444444444445e-05, - "loss": 0.6379, + "loss": 0.7887, "step": 14560 }, { - "epoch": 1.754153623886347, - "grad_norm": 8.9375, + "epoch": 10.392296718972895, + "grad_norm": 5.78125, "learning_rate": 4.636e-05, - "loss": 0.5701, + "loss": 0.75, "step": 14570 }, { - "epoch": 1.7553575728389115, - "grad_norm": 8.375, + "epoch": 10.399429386590585, + "grad_norm": 11.8125, "learning_rate": 4.631555555555556e-05, - "loss": 0.6824, + "loss": 0.7594, "step": 14580 }, { - "epoch": 1.756561521791476, - "grad_norm": 7.78125, + "epoch": 10.406562054208274, + "grad_norm": 26.375, "learning_rate": 4.6271111111111114e-05, - "loss": 0.7127, + "loss": 0.7863, "step": 14590 }, { - "epoch": 1.7577654707440404, - "grad_norm": 6.96875, + "epoch": 10.413694721825962, + "grad_norm": 11.875, "learning_rate": 4.622666666666667e-05, - "loss": 0.5165, + "loss": 0.7701, "step": 14600 }, { - "epoch": 1.7577654707440404, - "eval/acc": 46.511627197265625, + "epoch": 10.413694721825962, + "eval/acc": 44.1860466003418, "step": 14600 }, { - "epoch": 1.7577654707440404, - "eval_loss": 2.8514623641967773, - "eval_runtime": 0.2109, - "eval_samples_per_second": 203.924, - "eval_steps_per_second": 4.742, + "epoch": 10.413694721825962, + "eval_loss": 2.7834675312042236, + "eval_runtime": 0.4675, + "eval_samples_per_second": 91.973, + "eval_steps_per_second": 2.139, "step": 14600 }, { - "epoch": 1.758969419696605, - "grad_norm": 8.25, + "epoch": 10.420827389443652, + "grad_norm": 12.5625, "learning_rate": 4.6182222222222224e-05, - "loss": 0.6424, + "loss": 0.7568, "step": 14610 }, { - "epoch": 1.7601733686491694, - "grad_norm": 7.09375, + "epoch": 10.427960057061341, + "grad_norm": 7.40625, "learning_rate": 4.613777777777778e-05, - "loss": 0.5774, + "loss": 0.7547, "step": 14620 }, { - "epoch": 1.7613773176017338, - "grad_norm": 7.3125, + "epoch": 10.43509272467903, + "grad_norm": 8.875, "learning_rate": 4.6093333333333335e-05, - "loss": 0.5932, + "loss": 0.7594, "step": 14630 }, { - "epoch": 1.7625812665542981, - "grad_norm": 7.84375, + "epoch": 10.442225392296718, + "grad_norm": 25.0, "learning_rate": 4.604888888888889e-05, - "loss": 0.5451, + "loss": 0.8313, "step": 14640 }, { - "epoch": 1.7637852155068625, - "grad_norm": 6.875, + "epoch": 10.449358059914408, + "grad_norm": 7.9375, "learning_rate": 4.6004444444444445e-05, - "loss": 0.6025, + "loss": 0.8017, "step": 14650 }, { - "epoch": 1.7649891644594269, - "grad_norm": 6.71875, + "epoch": 10.456490727532097, + "grad_norm": 7.59375, "learning_rate": 4.596e-05, - "loss": 0.5298, + "loss": 0.7648, "step": 14660 }, { - "epoch": 1.7661931134119913, - "grad_norm": 8.75, + "epoch": 10.463623395149787, + "grad_norm": 8.5625, "learning_rate": 4.5915555555555556e-05, - "loss": 0.6542, + "loss": 0.6931, "step": 14670 }, { - "epoch": 1.7673970623645556, - "grad_norm": 8.875, + "epoch": 10.470756062767475, + "grad_norm": 9.8125, "learning_rate": 4.587111111111112e-05, - "loss": 0.5926, + "loss": 0.7128, "step": 14680 }, { - "epoch": 1.76860101131712, - "grad_norm": 8.9375, + "epoch": 10.477888730385164, + "grad_norm": 8.0, "learning_rate": 4.5826666666666666e-05, - "loss": 0.5747, + "loss": 0.8199, "step": 14690 }, { - "epoch": 1.7698049602696846, - "grad_norm": 7.34375, + "epoch": 10.485021398002853, + "grad_norm": 7.53125, "learning_rate": 4.578222222222223e-05, - "loss": 0.5349, + "loss": 0.8027, "step": 14700 }, { - "epoch": 1.7698049602696846, - "eval/acc": 44.1860466003418, + "epoch": 10.485021398002853, + "eval/acc": 48.83720779418945, "step": 14700 }, { - "epoch": 1.7698049602696846, - "eval_loss": 2.8648550510406494, - "eval_runtime": 0.2015, - "eval_samples_per_second": 213.416, - "eval_steps_per_second": 4.963, + "epoch": 10.485021398002853, + "eval_loss": 2.773456573486328, + "eval_runtime": 0.2477, + "eval_samples_per_second": 173.569, + "eval_steps_per_second": 4.036, "step": 14700 }, { - "epoch": 1.771008909222249, - "grad_norm": 7.5625, + "epoch": 10.492154065620543, + "grad_norm": 13.375, "learning_rate": 4.5737777777777777e-05, - "loss": 0.4603, + "loss": 0.8284, "step": 14710 }, { - "epoch": 1.7722128581748133, - "grad_norm": 8.4375, + "epoch": 10.49928673323823, + "grad_norm": 7.875, "learning_rate": 4.569333333333334e-05, - "loss": 0.5674, + "loss": 0.7522, "step": 14720 }, { - "epoch": 1.773416807127378, - "grad_norm": 6.625, + "epoch": 10.50641940085592, + "grad_norm": 6.375, "learning_rate": 4.5648888888888894e-05, - "loss": 0.5988, + "loss": 0.672, "step": 14730 }, { - "epoch": 1.7746207560799423, - "grad_norm": 8.4375, + "epoch": 10.51355206847361, + "grad_norm": 6.40625, "learning_rate": 4.560444444444444e-05, - "loss": 0.6072, + "loss": 0.8234, "step": 14740 }, { - "epoch": 1.7758247050325067, - "grad_norm": 7.84375, + "epoch": 10.520684736091297, + "grad_norm": 9.1875, "learning_rate": 4.5560000000000004e-05, - "loss": 0.5524, + "loss": 0.7505, "step": 14750 }, { - "epoch": 1.777028653985071, - "grad_norm": 6.0, + "epoch": 10.527817403708987, + "grad_norm": 7.25, "learning_rate": 4.551555555555555e-05, - "loss": 0.5633, + "loss": 0.7694, "step": 14760 }, { - "epoch": 1.7782326029376354, - "grad_norm": 12.625, + "epoch": 10.534950071326676, + "grad_norm": 6.3125, "learning_rate": 4.5471111111111115e-05, - "loss": 0.5669, + "loss": 0.7743, "step": 14770 }, { - "epoch": 1.7794365518901998, - "grad_norm": 6.5, + "epoch": 10.542082738944366, + "grad_norm": 10.0, "learning_rate": 4.542666666666667e-05, - "loss": 0.4503, + "loss": 0.8179, "step": 14780 }, { - "epoch": 1.7806405008427642, - "grad_norm": 7.46875, + "epoch": 10.549215406562054, + "grad_norm": 9.875, "learning_rate": 4.5382222222222225e-05, - "loss": 0.6596, + "loss": 0.9151, "step": 14790 }, { - "epoch": 1.7818444497953285, - "grad_norm": 6.125, + "epoch": 10.556348074179743, + "grad_norm": 8.6875, "learning_rate": 4.533777777777778e-05, - "loss": 0.6978, + "loss": 0.8133, "step": 14800 }, { - "epoch": 1.7818444497953285, - "eval/acc": 48.83720779418945, + "epoch": 10.556348074179743, + "eval/acc": 46.511627197265625, "step": 14800 }, { - "epoch": 1.7818444497953285, - "eval_loss": 2.870887279510498, - "eval_runtime": 0.9511, - "eval_samples_per_second": 45.209, - "eval_steps_per_second": 1.051, + "epoch": 10.556348074179743, + "eval_loss": 2.8140347003936768, + "eval_runtime": 0.2939, + "eval_samples_per_second": 146.299, + "eval_steps_per_second": 3.402, "step": 14800 }, { - "epoch": 1.7830483987478931, - "grad_norm": 8.375, + "epoch": 10.563480741797433, + "grad_norm": 7.0, "learning_rate": 4.5293333333333336e-05, - "loss": 0.581, + "loss": 0.7129, "step": 14810 }, { - "epoch": 1.7842523477004575, - "grad_norm": 6.1875, + "epoch": 10.570613409415122, + "grad_norm": 8.5, "learning_rate": 4.524888888888889e-05, - "loss": 0.526, + "loss": 0.7667, "step": 14820 }, { - "epoch": 1.7854562966530219, - "grad_norm": 6.40625, + "epoch": 10.57774607703281, + "grad_norm": 7.4375, "learning_rate": 4.5204444444444446e-05, - "loss": 0.5246, + "loss": 0.7692, "step": 14830 }, { - "epoch": 1.7866602456055865, - "grad_norm": 7.125, + "epoch": 10.5848787446505, + "grad_norm": 8.0625, "learning_rate": 4.516e-05, - "loss": 0.6338, + "loss": 0.7613, "step": 14840 }, { - "epoch": 1.7878641945581508, - "grad_norm": 10.3125, + "epoch": 10.592011412268189, + "grad_norm": 7.96875, "learning_rate": 4.5115555555555557e-05, - "loss": 0.5313, + "loss": 0.6925, "step": 14850 }, { - "epoch": 1.7890681435107152, - "grad_norm": 8.4375, + "epoch": 10.599144079885878, + "grad_norm": 14.375, "learning_rate": 4.507111111111111e-05, - "loss": 0.6848, + "loss": 0.84, "step": 14860 }, { - "epoch": 1.7902720924632796, - "grad_norm": 20.125, + "epoch": 10.606276747503566, + "grad_norm": 11.4375, "learning_rate": 4.502666666666667e-05, - "loss": 0.5839, + "loss": 0.8508, "step": 14870 }, { - "epoch": 1.791476041415844, - "grad_norm": 9.1875, + "epoch": 10.613409415121255, + "grad_norm": 8.375, "learning_rate": 4.498222222222222e-05, - "loss": 0.5869, + "loss": 0.7863, "step": 14880 }, { - "epoch": 1.7926799903684083, - "grad_norm": 9.8125, + "epoch": 10.620542082738945, + "grad_norm": 7.625, "learning_rate": 4.493777777777778e-05, - "loss": 0.5319, + "loss": 0.7177, "step": 14890 }, { - "epoch": 1.7938839393209727, - "grad_norm": 7.03125, + "epoch": 10.627674750356633, + "grad_norm": 10.375, "learning_rate": 4.489333333333334e-05, - "loss": 0.6254, + "loss": 0.7795, "step": 14900 }, { - "epoch": 1.7938839393209727, - "eval/acc": 46.511627197265625, + "epoch": 10.627674750356633, + "eval/acc": 44.1860466003418, "step": 14900 }, { - "epoch": 1.7938839393209727, - "eval_loss": 2.8519837856292725, - "eval_runtime": 0.2174, - "eval_samples_per_second": 197.755, - "eval_steps_per_second": 4.599, + "epoch": 10.627674750356633, + "eval_loss": 2.830230951309204, + "eval_runtime": 0.2428, + "eval_samples_per_second": 177.067, + "eval_steps_per_second": 4.118, "step": 14900 }, { - "epoch": 1.795087888273537, - "grad_norm": 8.9375, + "epoch": 10.634807417974322, + "grad_norm": 10.875, "learning_rate": 4.484888888888889e-05, - "loss": 0.613, + "loss": 0.7878, "step": 14910 }, { - "epoch": 1.7962918372261016, - "grad_norm": 9.1875, + "epoch": 10.641940085592012, + "grad_norm": 9.0, "learning_rate": 4.480444444444445e-05, - "loss": 0.6735, + "loss": 0.8517, "step": 14920 }, { - "epoch": 1.797495786178666, - "grad_norm": 7.4375, + "epoch": 10.649072753209701, + "grad_norm": 6.9375, "learning_rate": 4.4760000000000005e-05, - "loss": 0.5792, + "loss": 0.8469, "step": 14930 }, { - "epoch": 1.7986997351312304, - "grad_norm": 6.375, + "epoch": 10.656205420827389, + "grad_norm": 7.28125, "learning_rate": 4.4715555555555554e-05, - "loss": 0.5137, + "loss": 0.7262, "step": 14940 }, { - "epoch": 1.799903684083795, - "grad_norm": 8.0, + "epoch": 10.663338088445078, + "grad_norm": 6.15625, "learning_rate": 4.4671111111111116e-05, - "loss": 0.5431, + "loss": 0.739, "step": 14950 }, { - "epoch": 1.8011076330363593, - "grad_norm": 6.65625, + "epoch": 10.670470756062768, + "grad_norm": 7.84375, "learning_rate": 4.4626666666666664e-05, - "loss": 0.6689, + "loss": 0.7671, "step": 14960 }, { - "epoch": 1.8023115819889237, - "grad_norm": 7.5625, + "epoch": 10.677603423680456, + "grad_norm": 7.53125, "learning_rate": 4.4582222222222226e-05, - "loss": 0.6171, + "loss": 0.8059, "step": 14970 }, { - "epoch": 1.803515530941488, - "grad_norm": 6.9375, + "epoch": 10.684736091298145, + "grad_norm": 8.0, "learning_rate": 4.453777777777778e-05, - "loss": 0.6821, + "loss": 0.8167, "step": 14980 }, { - "epoch": 1.8047194798940525, - "grad_norm": 7.6875, + "epoch": 10.691868758915835, + "grad_norm": 7.4375, "learning_rate": 4.4493333333333337e-05, - "loss": 0.5024, + "loss": 0.7768, "step": 14990 }, { - "epoch": 1.8059234288466168, - "grad_norm": 7.90625, + "epoch": 10.699001426533524, + "grad_norm": 9.0625, "learning_rate": 4.444888888888889e-05, - "loss": 0.6338, + "loss": 0.7805, "step": 15000 }, { - "epoch": 1.8059234288466168, + "epoch": 10.699001426533524, "eval/acc": 46.511627197265625, "step": 15000 }, { - "epoch": 1.8059234288466168, - "eval_loss": 2.8593192100524902, - "eval_runtime": 0.2111, - "eval_samples_per_second": 203.704, - "eval_steps_per_second": 4.737, + "epoch": 10.699001426533524, + "eval_loss": 2.8128726482391357, + "eval_runtime": 0.239, + "eval_samples_per_second": 179.883, + "eval_steps_per_second": 4.183, "step": 15000 } ], "logging_steps": 10, "max_steps": 25000, "num_input_tokens_seen": 0, - "num_train_epochs": 4, + "num_train_epochs": 18, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/training_args.bin b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a1d680c060a6648e26d420ee8365324dd3d5fee7 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-15000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24358e453225b271a5d5308309aefca4acc94bf8e5e69d279887179f7a31b1b7 +size 6161 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/config.json b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/config.json similarity index 100% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/config.json rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/config.json diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/model.safetensors b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..200d2b25a393e0bc93226a8057ceb6fe571e09e7 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9d7d24b303024c8f8b32f821406abda528e58a29cb179cc9cb74d27b9bb1bc6 +size 298041696 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/optimizer.pt b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c232366270e69573710e4ffc821280a5f6e46d74 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b3a448b206afc6dc52858a4a6b015ce6b78d98fd496ec76352773962792dc26 +size 596170443 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/rng_state_0.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5b5d89dcd4dbf997a8ad32b8ee232374a204097c --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:389688a75b95eb192d3136c7c953f90047df3f91d2d3de8b437f74b29f66376f +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/rng_state_1.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..704e35da564778a42dc647f6511b87e90f939780 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:827aaec8bc75ad4df0d253e9dff56955d359d2b7e3795c4f0510bb4fcad88957 +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/rng_state_2.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..3329287ff68624b47dfc8252a325dfb6d4736e7d --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a036e48d22c9e4f6292211df78d141afd551b5b205fc3786f44c050b40d377d +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/rng_state_3.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f1a0e49062f0c06537e52a034dfc0929ec5f36bf --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31b42cda05c72a4664aba6a97cd76be3c838cc095e1b3ed410228f04fbb8313c +size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/scheduler.pt b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/scheduler.pt similarity index 100% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/scheduler.pt rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/scheduler.pt diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/trainer_state.json b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/trainer_state.json similarity index 57% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/trainer_state.json rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/trainer_state.json index 439a57c337aaacbdb54e6419fa7aaffea6fd3d4e..f6618ed787d5a78e2b93e4448ed26e135e0f8970 100644 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/trainer_state.json +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/trainer_state.json @@ -2,7 +2,7 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 2.4078979051288227, + "epoch": 14.265335235378032, "eval_steps": 100, "global_step": 20000, "is_hyper_param_search": false, @@ -10,16610 +10,16610 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0012039489525644113, - "grad_norm": 29.25, + "epoch": 0.007132667617689016, + "grad_norm": 19.75, "learning_rate": 3.6e-07, - "loss": 5.6475, + "loss": 5.6319, "step": 10 }, { - "epoch": 0.0024078979051288226, - "grad_norm": 13.6875, + "epoch": 0.014265335235378032, + "grad_norm": 19.375, "learning_rate": 7.6e-07, - "loss": 5.6394, + "loss": 5.5914, "step": 20 }, { - "epoch": 0.003611846857693234, - "grad_norm": 36.0, + "epoch": 0.021398002853067047, + "grad_norm": 51.25, "learning_rate": 1.16e-06, - "loss": 5.6168, + "loss": 5.6495, "step": 30 }, { - "epoch": 0.004815795810257645, - "grad_norm": 17.0, + "epoch": 0.028530670470756064, + "grad_norm": 19.0, "learning_rate": 1.56e-06, - "loss": 5.6346, + "loss": 5.6581, "step": 40 }, { - "epoch": 0.006019744762822056, - "grad_norm": 16.5, + "epoch": 0.03566333808844508, + "grad_norm": 23.75, "learning_rate": 1.96e-06, - "loss": 5.6391, + "loss": 5.6366, "step": 50 }, { - "epoch": 0.007223693715386468, - "grad_norm": 16.5, + "epoch": 0.042796005706134094, + "grad_norm": 18.0, "learning_rate": 2.36e-06, - "loss": 5.6272, + "loss": 5.6411, "step": 60 }, { - "epoch": 0.00842764266795088, - "grad_norm": 14.8125, + "epoch": 0.04992867332382311, + "grad_norm": 14.4375, "learning_rate": 2.7600000000000003e-06, - "loss": 5.5979, + "loss": 5.5919, "step": 70 }, { - "epoch": 0.00963159162051529, - "grad_norm": 22.375, + "epoch": 0.05706134094151213, + "grad_norm": 24.125, "learning_rate": 3.1600000000000007e-06, - "loss": 5.6515, + "loss": 5.6083, "step": 80 }, { - "epoch": 0.010835540573079701, - "grad_norm": 17.125, + "epoch": 0.06419400855920114, + "grad_norm": 18.25, "learning_rate": 3.5600000000000002e-06, - "loss": 5.6018, + "loss": 5.6599, "step": 90 }, { - "epoch": 0.012039489525644112, - "grad_norm": 14.9375, + "epoch": 0.07132667617689016, + "grad_norm": 18.25, "learning_rate": 3.96e-06, - "loss": 5.6342, + "loss": 5.6652, "step": 100 }, { - "epoch": 0.012039489525644112, - "eval/acc": 3.4883720874786377, + "epoch": 0.07132667617689016, + "eval/acc": 2.3255813121795654, "step": 100 }, { - "epoch": 0.012039489525644112, - "eval_loss": 5.140806198120117, - "eval_runtime": 2.4165, - "eval_samples_per_second": 17.794, - "eval_steps_per_second": 0.414, + "epoch": 0.07132667617689016, + "eval_loss": 5.090479850769043, + "eval_runtime": 2.3284, + "eval_samples_per_second": 18.467, + "eval_steps_per_second": 0.429, "step": 100 }, { - "epoch": 0.013243438478208525, - "grad_norm": 13.0, + "epoch": 0.07845934379457917, + "grad_norm": 21.0, "learning_rate": 4.360000000000001e-06, - "loss": 5.6124, + "loss": 5.6402, "step": 110 }, { - "epoch": 0.014447387430772935, - "grad_norm": 18.625, + "epoch": 0.08559201141226819, + "grad_norm": 16.875, "learning_rate": 4.76e-06, - "loss": 5.6127, + "loss": 5.6535, "step": 120 }, { - "epoch": 0.015651336383337346, - "grad_norm": 14.375, + "epoch": 0.09272467902995721, + "grad_norm": 21.5, "learning_rate": 5.1600000000000006e-06, - "loss": 5.5663, + "loss": 5.5821, "step": 130 }, { - "epoch": 0.01685528533590176, - "grad_norm": 11.9375, + "epoch": 0.09985734664764621, + "grad_norm": 18.5, "learning_rate": 5.56e-06, - "loss": 5.55, + "loss": 5.6184, "step": 140 }, { - "epoch": 0.018059234288466168, - "grad_norm": 14.5, + "epoch": 0.10699001426533523, + "grad_norm": 14.9375, "learning_rate": 5.9600000000000005e-06, - "loss": 5.5839, + "loss": 5.5743, "step": 150 }, { - "epoch": 0.01926318324103058, - "grad_norm": 15.0625, + "epoch": 0.11412268188302425, + "grad_norm": 16.875, "learning_rate": 6.360000000000001e-06, - "loss": 5.5259, + "loss": 5.5684, "step": 160 }, { - "epoch": 0.020467132193594993, - "grad_norm": 14.8125, + "epoch": 0.12125534950071326, + "grad_norm": 22.125, "learning_rate": 6.76e-06, - "loss": 5.4812, + "loss": 5.535, "step": 170 }, { - "epoch": 0.021671081146159402, - "grad_norm": 15.375, + "epoch": 0.12838801711840228, + "grad_norm": 15.9375, "learning_rate": 7.16e-06, - "loss": 5.4964, + "loss": 5.4357, "step": 180 }, { - "epoch": 0.022875030098723815, - "grad_norm": 14.0625, + "epoch": 0.1355206847360913, + "grad_norm": 16.375, "learning_rate": 7.5600000000000005e-06, - "loss": 5.4023, + "loss": 5.3766, "step": 190 }, { - "epoch": 0.024078979051288224, - "grad_norm": 18.625, + "epoch": 0.14265335235378032, + "grad_norm": 15.3125, "learning_rate": 7.96e-06, - "loss": 5.3778, + "loss": 5.4437, "step": 200 }, { - "epoch": 0.024078979051288224, - "eval/acc": 5.232558250427246, + "epoch": 0.14265335235378032, + "eval/acc": 2.3255813121795654, "step": 200 }, { - "epoch": 0.024078979051288224, - "eval_loss": 4.991551399230957, - "eval_runtime": 0.2363, - "eval_samples_per_second": 181.988, - "eval_steps_per_second": 4.232, + "epoch": 0.14265335235378032, + "eval_loss": 4.956757545471191, + "eval_runtime": 0.2941, + "eval_samples_per_second": 146.185, + "eval_steps_per_second": 3.4, "step": 200 }, { - "epoch": 0.025282928003852637, - "grad_norm": 16.25, + "epoch": 0.14978601997146934, + "grad_norm": 16.75, "learning_rate": 8.36e-06, - "loss": 5.3983, + "loss": 5.4744, "step": 210 }, { - "epoch": 0.02648687695641705, - "grad_norm": 17.25, + "epoch": 0.15691868758915833, + "grad_norm": 43.25, "learning_rate": 8.76e-06, - "loss": 5.2953, + "loss": 5.381, "step": 220 }, { - "epoch": 0.02769082590898146, - "grad_norm": 15.9375, + "epoch": 0.16405135520684735, + "grad_norm": 21.0, "learning_rate": 9.16e-06, - "loss": 5.2266, + "loss": 5.3092, "step": 230 }, { - "epoch": 0.02889477486154587, - "grad_norm": 21.875, + "epoch": 0.17118402282453637, + "grad_norm": 26.75, "learning_rate": 9.560000000000002e-06, - "loss": 5.139, + "loss": 5.2752, "step": 240 }, { - "epoch": 0.03009872381411028, - "grad_norm": 17.875, + "epoch": 0.1783166904422254, + "grad_norm": 26.875, "learning_rate": 9.96e-06, - "loss": 5.0639, + "loss": 5.2194, "step": 250 }, { - "epoch": 0.03130267276667469, - "grad_norm": 18.875, + "epoch": 0.18544935805991442, + "grad_norm": 20.875, "learning_rate": 1.036e-05, - "loss": 5.0118, + "loss": 5.0657, "step": 260 }, { - "epoch": 0.032506621719239105, - "grad_norm": 26.0, + "epoch": 0.19258202567760344, + "grad_norm": 25.125, "learning_rate": 1.076e-05, - "loss": 4.8959, + "loss": 4.967, "step": 270 }, { - "epoch": 0.03371057067180352, - "grad_norm": 18.5, + "epoch": 0.19971469329529243, + "grad_norm": 30.125, "learning_rate": 1.1160000000000002e-05, - "loss": 4.8454, + "loss": 4.9544, "step": 280 }, { - "epoch": 0.03491451962436792, - "grad_norm": 28.0, + "epoch": 0.20684736091298145, + "grad_norm": 24.625, "learning_rate": 1.156e-05, - "loss": 4.6846, + "loss": 4.7585, "step": 290 }, { - "epoch": 0.036118468576932336, - "grad_norm": 25.5, + "epoch": 0.21398002853067047, + "grad_norm": 21.375, "learning_rate": 1.196e-05, - "loss": 4.5211, + "loss": 4.635, "step": 300 }, { - "epoch": 0.036118468576932336, - "eval/acc": 6.395349025726318, + "epoch": 0.21398002853067047, + "eval/acc": 9.302325248718262, "step": 300 }, { - "epoch": 0.036118468576932336, - "eval_loss": 4.604515075683594, - "eval_runtime": 0.2156, - "eval_samples_per_second": 199.428, - "eval_steps_per_second": 4.638, + "epoch": 0.21398002853067047, + "eval_loss": 4.364280700683594, + "eval_runtime": 0.2145, + "eval_samples_per_second": 200.42, + "eval_steps_per_second": 4.661, "step": 300 }, { - "epoch": 0.03732241752949675, - "grad_norm": 28.0, + "epoch": 0.2211126961483595, + "grad_norm": 30.125, "learning_rate": 1.236e-05, - "loss": 4.3466, + "loss": 4.5333, "step": 310 }, { - "epoch": 0.03852636648206116, - "grad_norm": 27.125, + "epoch": 0.2282453637660485, + "grad_norm": 28.125, "learning_rate": 1.276e-05, - "loss": 4.1005, + "loss": 4.2888, "step": 320 }, { - "epoch": 0.039730315434625574, - "grad_norm": 33.0, + "epoch": 0.23537803138373753, + "grad_norm": 30.5, "learning_rate": 1.316e-05, - "loss": 3.7904, + "loss": 4.1744, "step": 330 }, { - "epoch": 0.040934264387189986, - "grad_norm": 32.75, + "epoch": 0.24251069900142652, + "grad_norm": 35.0, "learning_rate": 1.356e-05, - "loss": 3.4061, + "loss": 3.8812, "step": 340 }, { - "epoch": 0.04213821333975439, - "grad_norm": 31.125, + "epoch": 0.24964336661911554, + "grad_norm": 30.75, "learning_rate": 1.396e-05, - "loss": 3.2838, + "loss": 3.6772, "step": 350 }, { - "epoch": 0.043342162292318805, - "grad_norm": 23.75, + "epoch": 0.25677603423680456, + "grad_norm": 25.875, "learning_rate": 1.4360000000000001e-05, - "loss": 2.9101, + "loss": 3.3797, "step": 360 }, { - "epoch": 0.04454611124488322, - "grad_norm": 44.75, + "epoch": 0.26390870185449355, + "grad_norm": 31.375, "learning_rate": 1.4760000000000001e-05, - "loss": 2.6306, + "loss": 3.2338, "step": 370 }, { - "epoch": 0.04575006019744763, - "grad_norm": 33.25, + "epoch": 0.2710413694721826, + "grad_norm": 72.0, "learning_rate": 1.5160000000000002e-05, - "loss": 2.5454, + "loss": 2.976, "step": 380 }, { - "epoch": 0.04695400915001204, - "grad_norm": 31.375, + "epoch": 0.2781740370898716, + "grad_norm": 22.375, "learning_rate": 1.556e-05, - "loss": 2.5867, + "loss": 2.8207, "step": 390 }, { - "epoch": 0.04815795810257645, - "grad_norm": 18.5, + "epoch": 0.28530670470756064, + "grad_norm": 21.25, "learning_rate": 1.596e-05, - "loss": 2.3251, + "loss": 2.8341, "step": 400 }, { - "epoch": 0.04815795810257645, - "eval/acc": 12.209301948547363, + "epoch": 0.28530670470756064, + "eval/acc": 9.302325248718262, "step": 400 }, { - "epoch": 0.04815795810257645, - "eval_loss": 3.941906452178955, - "eval_runtime": 0.2265, - "eval_samples_per_second": 189.814, - "eval_steps_per_second": 4.414, + "epoch": 0.28530670470756064, + "eval_loss": 3.6011083126068115, + "eval_runtime": 0.2115, + "eval_samples_per_second": 203.312, + "eval_steps_per_second": 4.728, "step": 400 }, { - "epoch": 0.04936190705514086, - "grad_norm": 18.0, + "epoch": 0.29243937232524964, + "grad_norm": 21.0, "learning_rate": 1.636e-05, - "loss": 2.394, + "loss": 2.6431, "step": 410 }, { - "epoch": 0.05056585600770527, - "grad_norm": 22.375, + "epoch": 0.2995720399429387, + "grad_norm": 20.875, "learning_rate": 1.6760000000000002e-05, - "loss": 2.2856, + "loss": 2.6506, "step": 420 }, { - "epoch": 0.051769804960269686, - "grad_norm": 17.25, + "epoch": 0.3067047075606277, + "grad_norm": 21.125, "learning_rate": 1.7160000000000002e-05, - "loss": 2.3414, + "loss": 2.491, "step": 430 }, { - "epoch": 0.0529737539128341, - "grad_norm": 15.25, + "epoch": 0.31383737517831667, + "grad_norm": 31.75, "learning_rate": 1.756e-05, - "loss": 2.156, + "loss": 2.423, "step": 440 }, { - "epoch": 0.054177702865398504, - "grad_norm": 15.75, + "epoch": 0.3209700427960057, + "grad_norm": 19.375, "learning_rate": 1.796e-05, - "loss": 2.0164, + "loss": 2.5108, "step": 450 }, { - "epoch": 0.05538165181796292, - "grad_norm": 28.5, + "epoch": 0.3281027104136947, + "grad_norm": 17.375, "learning_rate": 1.8360000000000004e-05, - "loss": 1.9555, + "loss": 2.4584, "step": 460 }, { - "epoch": 0.05658560077052733, - "grad_norm": 19.25, + "epoch": 0.33523537803138376, + "grad_norm": 22.625, "learning_rate": 1.876e-05, - "loss": 2.0277, + "loss": 2.3526, "step": 470 }, { - "epoch": 0.05778954972309174, - "grad_norm": 15.375, + "epoch": 0.34236804564907275, + "grad_norm": 30.25, "learning_rate": 1.916e-05, - "loss": 2.1719, + "loss": 2.3634, "step": 480 }, { - "epoch": 0.058993498675656154, - "grad_norm": 18.875, + "epoch": 0.34950071326676174, + "grad_norm": 15.5625, "learning_rate": 1.956e-05, - "loss": 2.013, + "loss": 2.3339, "step": 490 }, { - "epoch": 0.06019744762822056, - "grad_norm": 18.625, + "epoch": 0.3566333808844508, + "grad_norm": 19.5, "learning_rate": 1.9960000000000002e-05, - "loss": 1.8574, + "loss": 2.268, "step": 500 }, { - "epoch": 0.06019744762822056, - "eval/acc": 20.930233001708984, + "epoch": 0.3566333808844508, + "eval/acc": 16.279069900512695, "step": 500 }, { - "epoch": 0.06019744762822056, - "eval_loss": 3.6547293663024902, - "eval_runtime": 0.2139, - "eval_samples_per_second": 201.002, - "eval_steps_per_second": 4.674, + "epoch": 0.3566333808844508, + "eval_loss": 3.2489778995513916, + "eval_runtime": 0.2558, + "eval_samples_per_second": 168.115, + "eval_steps_per_second": 3.91, "step": 500 }, { - "epoch": 0.06140139658078497, - "grad_norm": 19.875, + "epoch": 0.3637660485021398, + "grad_norm": 29.375, "learning_rate": 2.036e-05, - "loss": 1.9431, + "loss": 2.2728, "step": 510 }, { - "epoch": 0.06260534553334939, - "grad_norm": 14.625, + "epoch": 0.37089871611982883, + "grad_norm": 21.25, "learning_rate": 2.076e-05, - "loss": 1.8311, + "loss": 2.1346, "step": 520 }, { - "epoch": 0.0638092944859138, - "grad_norm": 20.0, + "epoch": 0.3780313837375178, + "grad_norm": 14.8125, "learning_rate": 2.116e-05, - "loss": 2.0005, + "loss": 2.2719, "step": 530 }, { - "epoch": 0.06501324343847821, - "grad_norm": 16.0, + "epoch": 0.38516405135520687, + "grad_norm": 27.75, "learning_rate": 2.1560000000000004e-05, - "loss": 1.7374, + "loss": 2.145, "step": 540 }, { - "epoch": 0.06621719239104262, - "grad_norm": 13.0625, + "epoch": 0.39229671897289586, + "grad_norm": 16.125, "learning_rate": 2.196e-05, - "loss": 1.7838, + "loss": 2.0912, "step": 550 }, { - "epoch": 0.06742114134360704, - "grad_norm": 16.5, + "epoch": 0.39942938659058486, + "grad_norm": 20.25, "learning_rate": 2.236e-05, - "loss": 1.8264, + "loss": 2.0302, "step": 560 }, { - "epoch": 0.06862509029617145, - "grad_norm": 20.5, + "epoch": 0.4065620542082739, + "grad_norm": 17.75, "learning_rate": 2.2760000000000002e-05, - "loss": 1.658, + "loss": 2.1832, "step": 570 }, { - "epoch": 0.06982903924873585, - "grad_norm": 25.75, + "epoch": 0.4136947218259629, + "grad_norm": 14.5, "learning_rate": 2.3160000000000002e-05, - "loss": 1.7826, + "loss": 1.9652, "step": 580 }, { - "epoch": 0.07103298820130026, - "grad_norm": 19.375, + "epoch": 0.42082738944365194, + "grad_norm": 17.0, "learning_rate": 2.356e-05, - "loss": 1.6539, + "loss": 1.8911, "step": 590 }, { - "epoch": 0.07223693715386467, - "grad_norm": 19.25, + "epoch": 0.42796005706134094, + "grad_norm": 20.0, "learning_rate": 2.396e-05, - "loss": 1.6278, + "loss": 2.0266, "step": 600 }, { - "epoch": 0.07223693715386467, - "eval/acc": 20.930233001708984, + "epoch": 0.42796005706134094, + "eval/acc": 30.23255729675293, "step": 600 }, { - "epoch": 0.07223693715386467, - "eval_loss": 3.387899398803711, - "eval_runtime": 0.2536, - "eval_samples_per_second": 169.572, - "eval_steps_per_second": 3.944, + "epoch": 0.42796005706134094, + "eval_loss": 2.946028470993042, + "eval_runtime": 0.2139, + "eval_samples_per_second": 201.065, + "eval_steps_per_second": 4.676, "step": 600 }, { - "epoch": 0.07344088610642908, - "grad_norm": 12.0625, + "epoch": 0.43509272467902993, + "grad_norm": 25.5, "learning_rate": 2.4360000000000004e-05, - "loss": 1.5342, + "loss": 1.9116, "step": 610 }, { - "epoch": 0.0746448350589935, - "grad_norm": 15.625, + "epoch": 0.442225392296719, + "grad_norm": 25.375, "learning_rate": 2.476e-05, - "loss": 1.5919, + "loss": 1.7644, "step": 620 }, { - "epoch": 0.07584878401155791, - "grad_norm": 25.5, + "epoch": 0.44935805991440797, + "grad_norm": 15.5, "learning_rate": 2.516e-05, - "loss": 1.5713, + "loss": 1.9008, "step": 630 }, { - "epoch": 0.07705273296412232, - "grad_norm": 14.8125, + "epoch": 0.456490727532097, + "grad_norm": 16.875, "learning_rate": 2.556e-05, - "loss": 1.4714, + "loss": 1.619, "step": 640 }, { - "epoch": 0.07825668191668674, - "grad_norm": 21.5, + "epoch": 0.463623395149786, + "grad_norm": 37.25, "learning_rate": 2.5960000000000002e-05, - "loss": 1.5835, + "loss": 1.7725, "step": 650 }, { - "epoch": 0.07946063086925115, - "grad_norm": 58.0, + "epoch": 0.47075606276747506, + "grad_norm": 16.5, "learning_rate": 2.6360000000000002e-05, - "loss": 1.5369, + "loss": 1.7405, "step": 660 }, { - "epoch": 0.08066457982181556, - "grad_norm": 45.0, + "epoch": 0.47788873038516405, + "grad_norm": 16.25, "learning_rate": 2.676e-05, - "loss": 1.4629, + "loss": 1.5825, "step": 670 }, { - "epoch": 0.08186852877437997, - "grad_norm": 14.1875, + "epoch": 0.48502139800285304, + "grad_norm": 68.5, "learning_rate": 2.716e-05, - "loss": 1.4288, + "loss": 1.8379, "step": 680 }, { - "epoch": 0.08307247772694437, - "grad_norm": 40.25, + "epoch": 0.4921540656205421, + "grad_norm": 50.0, "learning_rate": 2.7560000000000004e-05, - "loss": 1.4729, + "loss": 1.7989, "step": 690 }, { - "epoch": 0.08427642667950878, - "grad_norm": 13.625, + "epoch": 0.4992867332382311, + "grad_norm": 16.25, "learning_rate": 2.7960000000000003e-05, - "loss": 1.4883, + "loss": 1.7058, "step": 700 }, { - "epoch": 0.08427642667950878, - "eval/acc": 23.255813598632812, + "epoch": 0.4992867332382311, + "eval/acc": 30.23255729675293, "step": 700 }, { - "epoch": 0.08427642667950878, - "eval_loss": 3.206946611404419, - "eval_runtime": 0.4188, - "eval_samples_per_second": 102.684, - "eval_steps_per_second": 2.388, + "epoch": 0.4992867332382311, + "eval_loss": 2.8809142112731934, + "eval_runtime": 0.2095, + "eval_samples_per_second": 205.208, + "eval_steps_per_second": 4.772, "step": 700 }, { - "epoch": 0.0854803756320732, - "grad_norm": 15.75, + "epoch": 0.5064194008559201, + "grad_norm": 14.625, "learning_rate": 2.8360000000000003e-05, - "loss": 1.5656, + "loss": 1.6542, "step": 710 }, { - "epoch": 0.08668432458463761, - "grad_norm": 22.25, + "epoch": 0.5135520684736091, + "grad_norm": 71.0, "learning_rate": 2.8760000000000002e-05, - "loss": 1.6742, + "loss": 1.6763, "step": 720 }, { - "epoch": 0.08788827353720202, - "grad_norm": 12.3125, + "epoch": 0.5206847360912982, + "grad_norm": 17.125, "learning_rate": 2.9160000000000005e-05, - "loss": 1.35, + "loss": 1.6858, "step": 730 }, { - "epoch": 0.08909222248976643, - "grad_norm": 13.8125, + "epoch": 0.5278174037089871, + "grad_norm": 19.75, "learning_rate": 2.9559999999999998e-05, - "loss": 1.4435, + "loss": 1.6718, "step": 740 }, { - "epoch": 0.09029617144233085, - "grad_norm": 13.1875, + "epoch": 0.5349500713266762, + "grad_norm": 13.375, "learning_rate": 2.9959999999999998e-05, - "loss": 1.3843, + "loss": 1.6164, "step": 750 }, { - "epoch": 0.09150012039489526, - "grad_norm": 13.3125, + "epoch": 0.5420827389443652, + "grad_norm": 14.1875, "learning_rate": 3.036e-05, - "loss": 1.3327, + "loss": 1.6049, "step": 760 }, { - "epoch": 0.09270406934745967, - "grad_norm": 18.875, + "epoch": 0.5492154065620543, + "grad_norm": 35.75, "learning_rate": 3.076e-05, - "loss": 1.4628, + "loss": 1.5453, "step": 770 }, { - "epoch": 0.09390801830002408, - "grad_norm": 14.5625, + "epoch": 0.5563480741797432, + "grad_norm": 28.75, "learning_rate": 3.116e-05, - "loss": 1.3306, + "loss": 1.4818, "step": 780 }, { - "epoch": 0.09511196725258848, - "grad_norm": 18.75, + "epoch": 0.5634807417974322, + "grad_norm": 17.375, "learning_rate": 3.156e-05, - "loss": 1.4936, + "loss": 1.5647, "step": 790 }, { - "epoch": 0.0963159162051529, - "grad_norm": 11.5, + "epoch": 0.5706134094151213, + "grad_norm": 13.6875, "learning_rate": 3.196e-05, - "loss": 1.3515, + "loss": 1.5206, "step": 800 }, { - "epoch": 0.0963159162051529, - "eval/acc": 22.674419403076172, + "epoch": 0.5706134094151213, + "eval/acc": 30.23255729675293, "step": 800 }, { - "epoch": 0.0963159162051529, - "eval_loss": 3.1510462760925293, - "eval_runtime": 0.2676, - "eval_samples_per_second": 160.701, - "eval_steps_per_second": 3.737, + "epoch": 0.5706134094151213, + "eval_loss": 2.8829538822174072, + "eval_runtime": 0.2122, + "eval_samples_per_second": 202.686, + "eval_steps_per_second": 4.714, "step": 800 }, { - "epoch": 0.09751986515771731, - "grad_norm": 11.6875, + "epoch": 0.5777460770328102, + "grad_norm": 17.125, "learning_rate": 3.236e-05, - "loss": 1.4593, + "loss": 1.6124, "step": 810 }, { - "epoch": 0.09872381411028172, - "grad_norm": 10.5625, + "epoch": 0.5848787446504993, + "grad_norm": 14.8125, "learning_rate": 3.2760000000000005e-05, - "loss": 1.3453, + "loss": 1.4254, "step": 820 }, { - "epoch": 0.09992776306284613, - "grad_norm": 11.625, + "epoch": 0.5920114122681883, + "grad_norm": 15.0, "learning_rate": 3.316e-05, - "loss": 1.4041, + "loss": 1.7124, "step": 830 }, { - "epoch": 0.10113171201541055, - "grad_norm": 13.0, + "epoch": 0.5991440798858774, + "grad_norm": 14.75, "learning_rate": 3.3560000000000004e-05, - "loss": 1.2766, + "loss": 1.5384, "step": 840 }, { - "epoch": 0.10233566096797496, - "grad_norm": 40.0, + "epoch": 0.6062767475035663, + "grad_norm": 31.5, "learning_rate": 3.396e-05, - "loss": 1.2678, + "loss": 1.4899, "step": 850 }, { - "epoch": 0.10353960992053937, - "grad_norm": 13.75, + "epoch": 0.6134094151212554, + "grad_norm": 13.875, "learning_rate": 3.436e-05, - "loss": 1.2514, + "loss": 1.5377, "step": 860 }, { - "epoch": 0.10474355887310378, - "grad_norm": 11.75, + "epoch": 0.6205420827389444, + "grad_norm": 14.9375, "learning_rate": 3.4760000000000006e-05, - "loss": 1.3518, + "loss": 1.4892, "step": 870 }, { - "epoch": 0.1059475078256682, - "grad_norm": 11.875, + "epoch": 0.6276747503566333, + "grad_norm": 37.25, "learning_rate": 3.516e-05, - "loss": 1.2675, + "loss": 1.4872, "step": 880 }, { - "epoch": 0.10715145677823261, - "grad_norm": 13.0, + "epoch": 0.6348074179743224, + "grad_norm": 18.875, "learning_rate": 3.5560000000000005e-05, - "loss": 1.294, + "loss": 1.536, "step": 890 }, { - "epoch": 0.10835540573079701, - "grad_norm": 13.0, + "epoch": 0.6419400855920114, + "grad_norm": 18.625, "learning_rate": 3.596e-05, - "loss": 1.1209, + "loss": 1.5208, "step": 900 }, { - "epoch": 0.10835540573079701, - "eval/acc": 25.581396102905273, + "epoch": 0.6419400855920114, + "eval/acc": 30.23255729675293, "step": 900 }, { - "epoch": 0.10835540573079701, - "eval_loss": 3.0571491718292236, - "eval_runtime": 0.3097, - "eval_samples_per_second": 138.846, - "eval_steps_per_second": 3.229, + "epoch": 0.6419400855920114, + "eval_loss": 2.8081133365631104, + "eval_runtime": 0.2319, + "eval_samples_per_second": 185.434, + "eval_steps_per_second": 4.312, "step": 900 }, { - "epoch": 0.10955935468336142, - "grad_norm": 12.75, + "epoch": 0.6490727532097005, + "grad_norm": 19.875, "learning_rate": 3.636e-05, - "loss": 1.2681, + "loss": 1.4606, "step": 910 }, { - "epoch": 0.11076330363592583, - "grad_norm": 17.0, + "epoch": 0.6562054208273894, + "grad_norm": 12.625, "learning_rate": 3.676e-05, - "loss": 1.2606, + "loss": 1.4728, "step": 920 }, { - "epoch": 0.11196725258849025, - "grad_norm": 11.375, + "epoch": 0.6633380884450785, + "grad_norm": 15.0, "learning_rate": 3.716e-05, - "loss": 1.2194, + "loss": 1.449, "step": 930 }, { - "epoch": 0.11317120154105466, - "grad_norm": 12.125, + "epoch": 0.6704707560627675, + "grad_norm": 19.0, "learning_rate": 3.756e-05, - "loss": 1.2905, + "loss": 1.5292, "step": 940 }, { - "epoch": 0.11437515049361907, - "grad_norm": 18.125, + "epoch": 0.6776034236804565, + "grad_norm": 111.5, "learning_rate": 3.796e-05, - "loss": 1.2563, + "loss": 1.4891, "step": 950 }, { - "epoch": 0.11557909944618348, - "grad_norm": 17.125, + "epoch": 0.6847360912981455, + "grad_norm": 14.75, "learning_rate": 3.836e-05, - "loss": 1.1894, + "loss": 1.4202, "step": 960 }, { - "epoch": 0.1167830483987479, - "grad_norm": 11.875, + "epoch": 0.6918687589158345, + "grad_norm": 20.25, "learning_rate": 3.876e-05, - "loss": 1.2441, + "loss": 1.5258, "step": 970 }, { - "epoch": 0.11798699735131231, - "grad_norm": 15.8125, + "epoch": 0.6990014265335235, + "grad_norm": 48.0, "learning_rate": 3.9160000000000005e-05, - "loss": 1.2627, + "loss": 1.3912, "step": 980 }, { - "epoch": 0.11919094630387672, - "grad_norm": 17.375, + "epoch": 0.7061340941512125, + "grad_norm": 13.0, "learning_rate": 3.956e-05, - "loss": 1.3929, + "loss": 1.4859, "step": 990 }, { - "epoch": 0.12039489525644112, - "grad_norm": 11.125, + "epoch": 0.7132667617689016, + "grad_norm": 15.5625, "learning_rate": 3.9960000000000004e-05, - "loss": 1.1332, + "loss": 1.4614, "step": 1000 }, { - "epoch": 0.12039489525644112, - "eval/acc": 26.162790298461914, + "epoch": 0.7132667617689016, + "eval/acc": 37.20930099487305, "step": 1000 }, { - "epoch": 0.12039489525644112, - "eval_loss": 2.9910976886749268, - "eval_runtime": 0.2826, - "eval_samples_per_second": 152.17, - "eval_steps_per_second": 3.539, + "epoch": 0.7132667617689016, + "eval_loss": 2.743621587753296, + "eval_runtime": 0.5811, + "eval_samples_per_second": 73.997, + "eval_steps_per_second": 1.721, "step": 1000 }, { - "epoch": 0.12159884420900553, - "grad_norm": 13.75, + "epoch": 0.7203994293865906, + "grad_norm": 16.625, "learning_rate": 4.0360000000000007e-05, - "loss": 1.2314, + "loss": 1.56, "step": 1010 }, { - "epoch": 0.12280279316156995, - "grad_norm": 11.875, + "epoch": 0.7275320970042796, + "grad_norm": 15.5625, "learning_rate": 4.076e-05, - "loss": 1.2654, + "loss": 1.4469, "step": 1020 }, { - "epoch": 0.12400674211413436, - "grad_norm": 12.8125, + "epoch": 0.7346647646219686, + "grad_norm": 15.0, "learning_rate": 4.1160000000000006e-05, - "loss": 1.1432, + "loss": 1.381, "step": 1030 }, { - "epoch": 0.12521069106669877, - "grad_norm": 13.9375, + "epoch": 0.7417974322396577, + "grad_norm": 13.625, "learning_rate": 4.156e-05, - "loss": 1.1669, + "loss": 1.3749, "step": 1040 }, { - "epoch": 0.1264146400192632, - "grad_norm": 19.25, + "epoch": 0.7489300998573466, + "grad_norm": 12.0625, "learning_rate": 4.196e-05, - "loss": 1.1836, + "loss": 1.3919, "step": 1050 }, { - "epoch": 0.1276185889718276, - "grad_norm": 11.375, + "epoch": 0.7560627674750356, + "grad_norm": 16.25, "learning_rate": 4.236e-05, - "loss": 1.2449, + "loss": 1.4208, "step": 1060 }, { - "epoch": 0.128822537924392, - "grad_norm": 10.6875, + "epoch": 0.7631954350927247, + "grad_norm": 27.75, "learning_rate": 4.276e-05, - "loss": 1.1361, + "loss": 1.3714, "step": 1070 }, { - "epoch": 0.13002648687695642, - "grad_norm": 11.5, + "epoch": 0.7703281027104137, + "grad_norm": 13.125, "learning_rate": 4.316e-05, - "loss": 1.1989, + "loss": 1.3344, "step": 1080 }, { - "epoch": 0.13123043582952082, - "grad_norm": 13.0, + "epoch": 0.7774607703281027, + "grad_norm": 11.9375, "learning_rate": 4.356e-05, - "loss": 1.1004, + "loss": 1.3291, "step": 1090 }, { - "epoch": 0.13243438478208525, - "grad_norm": 10.125, + "epoch": 0.7845934379457917, + "grad_norm": 17.125, "learning_rate": 4.396e-05, - "loss": 1.1308, + "loss": 1.3536, "step": 1100 }, { - "epoch": 0.13243438478208525, + "epoch": 0.7845934379457917, "eval/acc": 27.9069766998291, "step": 1100 }, { - "epoch": 0.13243438478208525, - "eval_loss": 3.0177316665649414, - "eval_runtime": 0.2801, - "eval_samples_per_second": 153.54, - "eval_steps_per_second": 3.571, + "epoch": 0.7845934379457917, + "eval_loss": 2.8128697872161865, + "eval_runtime": 0.484, + "eval_samples_per_second": 88.836, + "eval_steps_per_second": 2.066, "step": 1100 }, { - "epoch": 0.13363833373464964, - "grad_norm": 9.5, + "epoch": 0.7917261055634808, + "grad_norm": 13.3125, "learning_rate": 4.436e-05, - "loss": 1.1862, + "loss": 1.4598, "step": 1110 }, { - "epoch": 0.13484228268721407, - "grad_norm": 13.75, + "epoch": 0.7988587731811697, + "grad_norm": 15.25, "learning_rate": 4.4760000000000005e-05, - "loss": 1.1764, + "loss": 1.3795, "step": 1120 }, { - "epoch": 0.13604623163977847, - "grad_norm": 30.625, + "epoch": 0.8059914407988588, + "grad_norm": 12.0625, "learning_rate": 4.516e-05, - "loss": 1.0422, + "loss": 1.2518, "step": 1130 }, { - "epoch": 0.1372501805923429, - "grad_norm": 9.875, + "epoch": 0.8131241084165478, + "grad_norm": 16.625, "learning_rate": 4.5560000000000004e-05, - "loss": 1.1796, + "loss": 1.3104, "step": 1140 }, { - "epoch": 0.1384541295449073, - "grad_norm": 13.1875, + "epoch": 0.8202567760342369, + "grad_norm": 11.875, "learning_rate": 4.596e-05, - "loss": 1.0483, + "loss": 1.2996, "step": 1150 }, { - "epoch": 0.1396580784974717, - "grad_norm": 11.75, + "epoch": 0.8273894436519258, + "grad_norm": 24.125, "learning_rate": 4.636e-05, - "loss": 1.1647, + "loss": 1.2067, "step": 1160 }, { - "epoch": 0.14086202745003612, - "grad_norm": 13.375, + "epoch": 0.8345221112696148, + "grad_norm": 11.0, "learning_rate": 4.6760000000000006e-05, - "loss": 1.2839, + "loss": 1.3035, "step": 1170 }, { - "epoch": 0.14206597640260052, - "grad_norm": 42.0, + "epoch": 0.8416547788873039, + "grad_norm": 13.125, "learning_rate": 4.716e-05, - "loss": 1.1594, + "loss": 1.2859, "step": 1180 }, { - "epoch": 0.14326992535516495, - "grad_norm": 15.625, + "epoch": 0.8487874465049928, + "grad_norm": 11.0, "learning_rate": 4.7560000000000005e-05, - "loss": 1.1073, + "loss": 1.3982, "step": 1190 }, { - "epoch": 0.14447387430772934, - "grad_norm": 11.5, + "epoch": 0.8559201141226819, + "grad_norm": 12.875, "learning_rate": 4.796e-05, - "loss": 1.1593, + "loss": 1.299, "step": 1200 }, { - "epoch": 0.14447387430772934, - "eval/acc": 26.162790298461914, + "epoch": 0.8559201141226819, + "eval/acc": 34.88372039794922, "step": 1200 }, { - "epoch": 0.14447387430772934, - "eval_loss": 3.0329606533050537, - "eval_runtime": 0.2185, - "eval_samples_per_second": 196.829, - "eval_steps_per_second": 4.577, + "epoch": 0.8559201141226819, + "eval_loss": 2.7250428199768066, + "eval_runtime": 0.3522, + "eval_samples_per_second": 122.084, + "eval_steps_per_second": 2.839, "step": 1200 }, { - "epoch": 0.14567782326029377, - "grad_norm": 12.5625, + "epoch": 0.8630527817403709, + "grad_norm": 11.25, "learning_rate": 4.836e-05, - "loss": 1.1088, + "loss": 1.3549, "step": 1210 }, { - "epoch": 0.14688177221285817, - "grad_norm": 10.4375, + "epoch": 0.8701854493580599, + "grad_norm": 15.25, "learning_rate": 4.876e-05, - "loss": 1.1565, + "loss": 1.3649, "step": 1220 }, { - "epoch": 0.1480857211654226, - "grad_norm": 11.3125, + "epoch": 0.8773181169757489, + "grad_norm": 22.0, "learning_rate": 4.9160000000000004e-05, - "loss": 1.0596, + "loss": 1.2441, "step": 1230 }, { - "epoch": 0.149289670117987, - "grad_norm": 11.375, + "epoch": 0.884450784593438, + "grad_norm": 12.375, "learning_rate": 4.956e-05, - "loss": 1.2416, + "loss": 1.2196, "step": 1240 }, { - "epoch": 0.15049361907055142, - "grad_norm": 10.3125, + "epoch": 0.891583452211127, + "grad_norm": 14.25, "learning_rate": 4.996e-05, - "loss": 1.0492, + "loss": 1.3274, "step": 1250 }, { - "epoch": 0.15169756802311582, - "grad_norm": 10.9375, + "epoch": 0.8987161198288159, + "grad_norm": 10.0625, "learning_rate": 5.0360000000000006e-05, - "loss": 1.0263, + "loss": 1.2896, "step": 1260 }, { - "epoch": 0.15290151697568022, - "grad_norm": 11.0625, + "epoch": 0.905848787446505, + "grad_norm": 16.875, "learning_rate": 5.076000000000001e-05, - "loss": 1.1197, + "loss": 1.3019, "step": 1270 }, { - "epoch": 0.15410546592824464, - "grad_norm": 33.25, + "epoch": 0.912981455064194, + "grad_norm": 26.375, "learning_rate": 5.1160000000000005e-05, - "loss": 1.0614, + "loss": 1.3756, "step": 1280 }, { - "epoch": 0.15530941488080904, - "grad_norm": 11.3125, + "epoch": 0.920114122681883, + "grad_norm": 18.25, "learning_rate": 5.1559999999999994e-05, - "loss": 1.0948, + "loss": 1.327, "step": 1290 }, { - "epoch": 0.15651336383337347, - "grad_norm": 24.5, + "epoch": 0.927246790299572, + "grad_norm": 11.3125, "learning_rate": 5.196e-05, - "loss": 1.1113, + "loss": 1.3237, "step": 1300 }, { - "epoch": 0.15651336383337347, - "eval/acc": 25.581396102905273, + "epoch": 0.927246790299572, + "eval/acc": 39.53488540649414, "step": 1300 }, { - "epoch": 0.15651336383337347, - "eval_loss": 2.944797992706299, - "eval_runtime": 0.3019, - "eval_samples_per_second": 142.434, - "eval_steps_per_second": 3.312, + "epoch": 0.927246790299572, + "eval_loss": 2.733259916305542, + "eval_runtime": 0.51, + "eval_samples_per_second": 84.32, + "eval_steps_per_second": 1.961, "step": 1300 }, { - "epoch": 0.15771731278593787, - "grad_norm": 12.4375, + "epoch": 0.9343794579172611, + "grad_norm": 18.125, "learning_rate": 5.236e-05, - "loss": 0.9531, + "loss": 1.256, "step": 1310 }, { - "epoch": 0.1589212617385023, - "grad_norm": 12.3125, + "epoch": 0.9415121255349501, + "grad_norm": 10.25, "learning_rate": 5.2759999999999996e-05, - "loss": 1.0079, + "loss": 1.1386, "step": 1320 }, { - "epoch": 0.1601252106910667, - "grad_norm": 13.1875, + "epoch": 0.948644793152639, + "grad_norm": 11.1875, "learning_rate": 5.316e-05, - "loss": 1.0674, + "loss": 1.3115, "step": 1330 }, { - "epoch": 0.16132915964363112, - "grad_norm": 16.875, + "epoch": 0.9557774607703281, + "grad_norm": 10.875, "learning_rate": 5.356e-05, - "loss": 1.1194, + "loss": 1.2315, "step": 1340 }, { - "epoch": 0.16253310859619552, - "grad_norm": 10.625, + "epoch": 0.9629101283880172, + "grad_norm": 12.0, "learning_rate": 5.396e-05, - "loss": 1.0057, + "loss": 1.3327, "step": 1350 }, { - "epoch": 0.16373705754875995, - "grad_norm": 9.125, + "epoch": 0.9700427960057061, + "grad_norm": 11.75, "learning_rate": 5.436e-05, - "loss": 1.1257, + "loss": 1.4052, "step": 1360 }, { - "epoch": 0.16494100650132434, - "grad_norm": 8.5, + "epoch": 0.9771754636233951, + "grad_norm": 11.4375, "learning_rate": 5.476e-05, - "loss": 0.9545, + "loss": 1.1349, "step": 1370 }, { - "epoch": 0.16614495545388874, - "grad_norm": 10.25, + "epoch": 0.9843081312410842, + "grad_norm": 15.125, "learning_rate": 5.516e-05, - "loss": 1.0648, + "loss": 1.3803, "step": 1380 }, { - "epoch": 0.16734890440645317, - "grad_norm": 14.9375, + "epoch": 0.9914407988587732, + "grad_norm": 16.75, "learning_rate": 5.556e-05, - "loss": 1.0364, + "loss": 1.3536, "step": 1390 }, { - "epoch": 0.16855285335901757, - "grad_norm": 138.0, + "epoch": 0.9985734664764622, + "grad_norm": 10.625, "learning_rate": 5.596e-05, - "loss": 1.0255, + "loss": 1.2981, "step": 1400 }, { - "epoch": 0.16855285335901757, - "eval/acc": 27.9069766998291, + "epoch": 0.9985734664764622, + "eval/acc": 39.53488540649414, "step": 1400 }, { - "epoch": 0.16855285335901757, - "eval_loss": 2.763101100921631, - "eval_runtime": 0.2759, - "eval_samples_per_second": 155.826, - "eval_steps_per_second": 3.624, + "epoch": 0.9985734664764622, + "eval_loss": 2.597245693206787, + "eval_runtime": 0.2116, + "eval_samples_per_second": 203.214, + "eval_steps_per_second": 4.726, "step": 1400 }, { - "epoch": 0.169756802311582, - "grad_norm": 11.8125, + "epoch": 1.005706134094151, + "grad_norm": 15.0, "learning_rate": 5.636e-05, - "loss": 0.9813, + "loss": 1.2173, "step": 1410 }, { - "epoch": 0.1709607512641464, - "grad_norm": 9.1875, + "epoch": 1.0128388017118402, + "grad_norm": 15.4375, "learning_rate": 5.6760000000000005e-05, - "loss": 0.9929, + "loss": 1.1965, "step": 1420 }, { - "epoch": 0.17216470021671082, - "grad_norm": 10.875, + "epoch": 1.0199714693295292, + "grad_norm": 21.625, "learning_rate": 5.716e-05, - "loss": 0.9113, + "loss": 1.2494, "step": 1430 }, { - "epoch": 0.17336864916927522, - "grad_norm": 19.375, + "epoch": 1.0271041369472182, + "grad_norm": 13.0, "learning_rate": 5.7560000000000005e-05, - "loss": 1.0711, + "loss": 1.1948, "step": 1440 }, { - "epoch": 0.17457259812183964, - "grad_norm": 9.8125, + "epoch": 1.0342368045649073, + "grad_norm": 11.0, "learning_rate": 5.796e-05, - "loss": 0.9322, + "loss": 1.2641, "step": 1450 }, { - "epoch": 0.17577654707440404, - "grad_norm": 10.5, + "epoch": 1.0413694721825963, + "grad_norm": 13.1875, "learning_rate": 5.8360000000000004e-05, - "loss": 1.0316, + "loss": 1.2526, "step": 1460 }, { - "epoch": 0.17698049602696847, - "grad_norm": 10.25, + "epoch": 1.0485021398002854, + "grad_norm": 46.0, "learning_rate": 5.876000000000001e-05, - "loss": 1.0165, + "loss": 1.0786, "step": 1470 }, { - "epoch": 0.17818444497953287, - "grad_norm": 10.4375, + "epoch": 1.0556348074179742, + "grad_norm": 11.0, "learning_rate": 5.916e-05, - "loss": 1.0229, + "loss": 1.3154, "step": 1480 }, { - "epoch": 0.17938839393209727, - "grad_norm": 14.4375, + "epoch": 1.0627674750356633, + "grad_norm": 18.75, "learning_rate": 5.9560000000000006e-05, - "loss": 0.9684, + "loss": 1.257, "step": 1490 }, { - "epoch": 0.1805923428846617, - "grad_norm": 8.375, + "epoch": 1.0699001426533523, + "grad_norm": 11.5625, "learning_rate": 5.996e-05, - "loss": 0.9948, + "loss": 1.2636, "step": 1500 }, { - "epoch": 0.1805923428846617, - "eval/acc": 34.88372039794922, + "epoch": 1.0699001426533523, + "eval/acc": 32.55813980102539, "step": 1500 }, { - "epoch": 0.1805923428846617, - "eval_loss": 2.8177433013916016, - "eval_runtime": 0.208, - "eval_samples_per_second": 206.732, - "eval_steps_per_second": 4.808, + "epoch": 1.0699001426533523, + "eval_loss": 2.9036648273468018, + "eval_runtime": 0.4893, + "eval_samples_per_second": 87.889, + "eval_steps_per_second": 2.044, "step": 1500 }, { - "epoch": 0.1817962918372261, - "grad_norm": 19.25, + "epoch": 1.0770328102710414, + "grad_norm": 13.75, "learning_rate": 6.0360000000000005e-05, - "loss": 0.9897, + "loss": 1.2602, "step": 1510 }, { - "epoch": 0.18300024078979052, - "grad_norm": 32.5, + "epoch": 1.0841654778887304, + "grad_norm": 11.625, "learning_rate": 6.076000000000001e-05, - "loss": 0.9217, + "loss": 1.0823, "step": 1520 }, { - "epoch": 0.18420418974235492, - "grad_norm": 9.5, + "epoch": 1.0912981455064195, + "grad_norm": 9.0, "learning_rate": 6.116e-05, - "loss": 1.0494, + "loss": 1.3059, "step": 1530 }, { - "epoch": 0.18540813869491934, - "grad_norm": 9.25, + "epoch": 1.0984308131241085, + "grad_norm": 10.4375, "learning_rate": 6.156e-05, - "loss": 0.9359, + "loss": 1.2006, "step": 1540 }, { - "epoch": 0.18661208764748374, - "grad_norm": 11.375, + "epoch": 1.1055634807417973, + "grad_norm": 15.75, "learning_rate": 6.196000000000001e-05, - "loss": 0.9112, + "loss": 1.3731, "step": 1550 }, { - "epoch": 0.18781603660004817, - "grad_norm": 12.6875, + "epoch": 1.1126961483594864, + "grad_norm": 9.5, "learning_rate": 6.236e-05, - "loss": 1.07, + "loss": 1.1925, "step": 1560 }, { - "epoch": 0.18901998555261257, - "grad_norm": 11.1875, + "epoch": 1.1198288159771754, + "grad_norm": 9.3125, "learning_rate": 6.276e-05, - "loss": 0.9853, + "loss": 1.1554, "step": 1570 }, { - "epoch": 0.19022393450517697, - "grad_norm": 8.375, + "epoch": 1.1269614835948645, + "grad_norm": 12.0625, "learning_rate": 6.316000000000001e-05, - "loss": 0.9579, + "loss": 1.0875, "step": 1580 }, { - "epoch": 0.1914278834577414, - "grad_norm": 20.875, + "epoch": 1.1340941512125535, + "grad_norm": 10.875, "learning_rate": 6.356000000000001e-05, - "loss": 0.9401, + "loss": 1.1895, "step": 1590 }, { - "epoch": 0.1926318324103058, - "grad_norm": 8.9375, + "epoch": 1.1412268188302426, + "grad_norm": 12.0625, "learning_rate": 6.396e-05, - "loss": 1.0279, + "loss": 1.2354, "step": 1600 }, { - "epoch": 0.1926318324103058, - "eval/acc": 30.23255729675293, + "epoch": 1.1412268188302426, + "eval/acc": 34.88372039794922, "step": 1600 }, { - "epoch": 0.1926318324103058, - "eval_loss": 2.8526248931884766, - "eval_runtime": 0.3114, - "eval_samples_per_second": 138.103, - "eval_steps_per_second": 3.212, + "epoch": 1.1412268188302426, + "eval_loss": 2.9267771244049072, + "eval_runtime": 0.2334, + "eval_samples_per_second": 184.258, + "eval_steps_per_second": 4.285, "step": 1600 }, { - "epoch": 0.19383578136287022, - "grad_norm": 7.78125, + "epoch": 1.1483594864479316, + "grad_norm": 12.375, "learning_rate": 6.436e-05, - "loss": 0.8743, + "loss": 1.2167, "step": 1610 }, { - "epoch": 0.19503973031543462, - "grad_norm": 9.8125, + "epoch": 1.1554921540656204, + "grad_norm": 10.375, "learning_rate": 6.476e-05, - "loss": 0.8702, + "loss": 1.1638, "step": 1620 }, { - "epoch": 0.19624367926799904, - "grad_norm": 12.4375, + "epoch": 1.1626248216833095, + "grad_norm": 9.8125, "learning_rate": 6.515999999999999e-05, - "loss": 1.0028, + "loss": 1.1666, "step": 1630 }, { - "epoch": 0.19744762822056344, - "grad_norm": 10.125, + "epoch": 1.1697574893009985, + "grad_norm": 12.6875, "learning_rate": 6.556e-05, - "loss": 0.9377, + "loss": 1.1961, "step": 1640 }, { - "epoch": 0.19865157717312787, - "grad_norm": 8.9375, + "epoch": 1.1768901569186876, + "grad_norm": 9.875, "learning_rate": 6.596e-05, - "loss": 1.031, + "loss": 1.2558, "step": 1650 }, { - "epoch": 0.19985552612569227, - "grad_norm": 8.5625, + "epoch": 1.1840228245363766, + "grad_norm": 10.375, "learning_rate": 6.636e-05, - "loss": 1.0162, + "loss": 1.1728, "step": 1660 }, { - "epoch": 0.2010594750782567, - "grad_norm": 33.75, + "epoch": 1.1911554921540657, + "grad_norm": 10.1875, "learning_rate": 6.676e-05, - "loss": 0.9448, + "loss": 1.2947, "step": 1670 }, { - "epoch": 0.2022634240308211, - "grad_norm": 9.625, + "epoch": 1.1982881597717547, + "grad_norm": 11.3125, "learning_rate": 6.716e-05, - "loss": 1.0077, + "loss": 1.2151, "step": 1680 }, { - "epoch": 0.2034673729833855, - "grad_norm": 8.6875, + "epoch": 1.2054208273894436, + "grad_norm": 10.5, "learning_rate": 6.756e-05, - "loss": 0.9654, + "loss": 1.0612, "step": 1690 }, { - "epoch": 0.20467132193594992, - "grad_norm": 12.625, + "epoch": 1.2125534950071326, + "grad_norm": 11.9375, "learning_rate": 6.796e-05, - "loss": 0.8899, + "loss": 1.1079, "step": 1700 }, { - "epoch": 0.20467132193594992, - "eval/acc": 32.55813980102539, + "epoch": 1.2125534950071326, + "eval/acc": 37.20930099487305, "step": 1700 }, { - "epoch": 0.20467132193594992, - "eval_loss": 2.7813549041748047, - "eval_runtime": 0.2132, - "eval_samples_per_second": 201.701, - "eval_steps_per_second": 4.691, + "epoch": 1.2125534950071326, + "eval_loss": 2.9951517581939697, + "eval_runtime": 0.2214, + "eval_samples_per_second": 194.192, + "eval_steps_per_second": 4.516, "step": 1700 }, { - "epoch": 0.20587527088851432, - "grad_norm": 12.0, + "epoch": 1.2196861626248217, + "grad_norm": 11.25, "learning_rate": 6.836e-05, - "loss": 1.0412, + "loss": 1.1541, "step": 1710 }, { - "epoch": 0.20707921984107874, - "grad_norm": 11.75, + "epoch": 1.2268188302425107, + "grad_norm": 8.125, "learning_rate": 6.876e-05, - "loss": 0.9239, + "loss": 1.0772, "step": 1720 }, { - "epoch": 0.20828316879364314, - "grad_norm": 11.375, + "epoch": 1.2339514978601998, + "grad_norm": 18.125, "learning_rate": 6.916000000000001e-05, - "loss": 0.9243, + "loss": 1.1623, "step": 1730 }, { - "epoch": 0.20948711774620757, - "grad_norm": 12.0, + "epoch": 1.2410841654778888, + "grad_norm": 10.125, "learning_rate": 6.956e-05, - "loss": 1.0204, + "loss": 1.182, "step": 1740 }, { - "epoch": 0.21069106669877197, - "grad_norm": 13.0625, + "epoch": 1.2482168330955776, + "grad_norm": 9.75, "learning_rate": 6.996e-05, - "loss": 0.8811, + "loss": 1.0796, "step": 1750 }, { - "epoch": 0.2118950156513364, - "grad_norm": 17.0, + "epoch": 1.2553495007132667, + "grad_norm": 10.5, "learning_rate": 7.036e-05, - "loss": 0.8755, + "loss": 1.2374, "step": 1760 }, { - "epoch": 0.2130989646039008, - "grad_norm": 11.25, + "epoch": 1.2624821683309557, + "grad_norm": 20.875, "learning_rate": 7.076000000000001e-05, - "loss": 0.858, + "loss": 1.2718, "step": 1770 }, { - "epoch": 0.21430291355646522, - "grad_norm": 9.625, + "epoch": 1.2696148359486448, + "grad_norm": 10.3125, "learning_rate": 7.116e-05, - "loss": 0.9076, + "loss": 1.0922, "step": 1780 }, { - "epoch": 0.21550686250902962, - "grad_norm": 10.4375, + "epoch": 1.2767475035663338, + "grad_norm": 8.6875, "learning_rate": 7.156e-05, - "loss": 0.8817, + "loss": 1.0637, "step": 1790 }, { - "epoch": 0.21671081146159402, - "grad_norm": 12.8125, + "epoch": 1.2838801711840229, + "grad_norm": 9.5, "learning_rate": 7.196000000000001e-05, - "loss": 0.9121, + "loss": 1.1661, "step": 1800 }, { - "epoch": 0.21671081146159402, - "eval/acc": 30.813953399658203, + "epoch": 1.2838801711840229, + "eval/acc": 39.53488540649414, "step": 1800 }, { - "epoch": 0.21671081146159402, - "eval_loss": 2.6508796215057373, - "eval_runtime": 0.2185, - "eval_samples_per_second": 196.798, - "eval_steps_per_second": 4.577, + "epoch": 1.2838801711840229, + "eval_loss": 2.763897180557251, + "eval_runtime": 0.2111, + "eval_samples_per_second": 203.712, + "eval_steps_per_second": 4.737, "step": 1800 }, { - "epoch": 0.21791476041415844, - "grad_norm": 16.5, + "epoch": 1.291012838801712, + "grad_norm": 14.3125, "learning_rate": 7.236e-05, - "loss": 0.9044, + "loss": 1.1139, "step": 1810 }, { - "epoch": 0.21911870936672284, - "grad_norm": 15.1875, + "epoch": 1.298145506419401, + "grad_norm": 41.5, "learning_rate": 7.276e-05, - "loss": 0.9552, + "loss": 1.0817, "step": 1820 }, { - "epoch": 0.22032265831928727, - "grad_norm": 11.375, + "epoch": 1.3052781740370898, + "grad_norm": 15.125, "learning_rate": 7.316000000000001e-05, - "loss": 0.9264, + "loss": 1.2462, "step": 1830 }, { - "epoch": 0.22152660727185167, - "grad_norm": 8.8125, + "epoch": 1.3124108416547788, + "grad_norm": 33.25, "learning_rate": 7.356000000000001e-05, - "loss": 0.8928, + "loss": 1.1143, "step": 1840 }, { - "epoch": 0.2227305562244161, - "grad_norm": 9.625, + "epoch": 1.3195435092724679, + "grad_norm": 13.625, "learning_rate": 7.396e-05, - "loss": 0.9515, + "loss": 1.1783, "step": 1850 }, { - "epoch": 0.2239345051769805, - "grad_norm": 31.0, + "epoch": 1.326676176890157, + "grad_norm": 18.375, "learning_rate": 7.436000000000001e-05, - "loss": 0.8989, + "loss": 1.2101, "step": 1860 }, { - "epoch": 0.22513845412954492, - "grad_norm": 9.5, + "epoch": 1.333808844507846, + "grad_norm": 13.875, "learning_rate": 7.476000000000001e-05, - "loss": 1.0206, + "loss": 1.1348, "step": 1870 }, { - "epoch": 0.22634240308210932, - "grad_norm": 8.625, + "epoch": 1.340941512125535, + "grad_norm": 13.9375, "learning_rate": 7.516e-05, - "loss": 0.8961, + "loss": 1.0747, "step": 1880 }, { - "epoch": 0.22754635203467374, - "grad_norm": 9.0, + "epoch": 1.3480741797432239, + "grad_norm": 29.75, "learning_rate": 7.556000000000002e-05, - "loss": 0.9421, + "loss": 1.1895, "step": 1890 }, { - "epoch": 0.22875030098723814, - "grad_norm": 12.0625, + "epoch": 1.355206847360913, + "grad_norm": 17.25, "learning_rate": 7.596000000000001e-05, - "loss": 0.9049, + "loss": 1.2512, "step": 1900 }, { - "epoch": 0.22875030098723814, - "eval/acc": 36.046512603759766, + "epoch": 1.355206847360913, + "eval/acc": 39.53488540649414, "step": 1900 }, { - "epoch": 0.22875030098723814, - "eval_loss": 2.636018753051758, - "eval_runtime": 0.2084, - "eval_samples_per_second": 206.343, - "eval_steps_per_second": 4.799, + "epoch": 1.355206847360913, + "eval_loss": 2.9442026615142822, + "eval_runtime": 0.2199, + "eval_samples_per_second": 195.51, + "eval_steps_per_second": 4.547, "step": 1900 }, { - "epoch": 0.22995424993980254, - "grad_norm": 8.0625, + "epoch": 1.362339514978602, + "grad_norm": 21.125, "learning_rate": 7.636e-05, - "loss": 0.8983, + "loss": 1.1306, "step": 1910 }, { - "epoch": 0.23115819889236697, - "grad_norm": 11.875, + "epoch": 1.369472182596291, + "grad_norm": 9.0625, "learning_rate": 7.676e-05, - "loss": 0.9293, + "loss": 1.1139, "step": 1920 }, { - "epoch": 0.23236214784493137, - "grad_norm": 11.75, + "epoch": 1.37660485021398, + "grad_norm": 30.25, "learning_rate": 7.716e-05, - "loss": 0.8602, + "loss": 1.1595, "step": 1930 }, { - "epoch": 0.2335660967974958, - "grad_norm": 11.5625, + "epoch": 1.383737517831669, + "grad_norm": 13.6875, "learning_rate": 7.756e-05, - "loss": 0.8078, + "loss": 1.2437, "step": 1940 }, { - "epoch": 0.2347700457500602, - "grad_norm": 9.125, + "epoch": 1.3908701854493581, + "grad_norm": 12.3125, "learning_rate": 7.796e-05, - "loss": 0.8773, + "loss": 1.1005, "step": 1950 }, { - "epoch": 0.23597399470262462, - "grad_norm": 10.6875, + "epoch": 1.3980028530670472, + "grad_norm": 9.8125, "learning_rate": 7.836e-05, - "loss": 0.8464, + "loss": 1.0748, "step": 1960 }, { - "epoch": 0.23717794365518902, - "grad_norm": 18.25, + "epoch": 1.405135520684736, + "grad_norm": 9.125, "learning_rate": 7.876e-05, - "loss": 0.8779, + "loss": 1.1576, "step": 1970 }, { - "epoch": 0.23838189260775344, - "grad_norm": 10.875, + "epoch": 1.412268188302425, + "grad_norm": 11.375, "learning_rate": 7.916e-05, - "loss": 0.9351, + "loss": 1.0982, "step": 1980 }, { - "epoch": 0.23958584156031784, - "grad_norm": 11.0, + "epoch": 1.4194008559201141, + "grad_norm": 10.375, "learning_rate": 7.956e-05, - "loss": 0.8581, + "loss": 1.132, "step": 1990 }, { - "epoch": 0.24078979051288224, - "grad_norm": 8.875, + "epoch": 1.4265335235378032, + "grad_norm": 16.375, "learning_rate": 7.996e-05, - "loss": 0.9799, + "loss": 1.121, "step": 2000 }, { - "epoch": 0.24078979051288224, - "eval/acc": 36.046512603759766, + "epoch": 1.4265335235378032, + "eval/acc": 39.53488540649414, "step": 2000 }, { - "epoch": 0.24078979051288224, - "eval_loss": 2.716654062271118, - "eval_runtime": 0.21, - "eval_samples_per_second": 204.721, - "eval_steps_per_second": 4.761, + "epoch": 1.4265335235378032, + "eval_loss": 2.900298595428467, + "eval_runtime": 0.2112, + "eval_samples_per_second": 203.622, + "eval_steps_per_second": 4.735, "step": 2000 }, { - "epoch": 0.24199373946544667, - "grad_norm": 11.0625, + "epoch": 1.4336661911554922, + "grad_norm": 9.125, "learning_rate": 8.036e-05, - "loss": 0.8678, + "loss": 1.2079, "step": 2010 }, { - "epoch": 0.24319768841801107, + "epoch": 1.440798858773181, "grad_norm": 12.125, "learning_rate": 8.076e-05, - "loss": 0.8832, + "loss": 1.1098, "step": 2020 }, { - "epoch": 0.2444016373705755, - "grad_norm": 8.25, + "epoch": 1.44793152639087, + "grad_norm": 8.8125, "learning_rate": 8.116e-05, - "loss": 0.8689, + "loss": 0.9849, "step": 2030 }, { - "epoch": 0.2456055863231399, - "grad_norm": 6.53125, + "epoch": 1.4550641940085591, + "grad_norm": 9.0, "learning_rate": 8.156e-05, - "loss": 0.8829, + "loss": 1.0905, "step": 2040 }, { - "epoch": 0.24680953527570432, - "grad_norm": 9.5625, + "epoch": 1.4621968616262482, + "grad_norm": 15.4375, "learning_rate": 8.196000000000001e-05, - "loss": 0.9181, + "loss": 1.2211, "step": 2050 }, { - "epoch": 0.24801348422826872, - "grad_norm": 22.875, + "epoch": 1.4693295292439372, + "grad_norm": 9.4375, "learning_rate": 8.236e-05, - "loss": 0.8011, + "loss": 1.0968, "step": 2060 }, { - "epoch": 0.24921743318083314, - "grad_norm": 14.4375, + "epoch": 1.4764621968616263, + "grad_norm": 9.0, "learning_rate": 8.276e-05, - "loss": 0.9163, + "loss": 1.0973, "step": 2070 }, { - "epoch": 0.25042138213339754, - "grad_norm": 10.625, + "epoch": 1.4835948644793153, + "grad_norm": 8.0625, "learning_rate": 8.316000000000001e-05, - "loss": 0.7869, + "loss": 1.1012, "step": 2080 }, { - "epoch": 0.25162533108596197, - "grad_norm": 11.0, + "epoch": 1.4907275320970044, + "grad_norm": 31.0, "learning_rate": 8.356e-05, - "loss": 0.8779, + "loss": 1.0437, "step": 2090 }, { - "epoch": 0.2528292800385264, - "grad_norm": 12.625, + "epoch": 1.4978601997146934, + "grad_norm": 10.8125, "learning_rate": 8.396e-05, - "loss": 0.889, + "loss": 1.0934, "step": 2100 }, { - "epoch": 0.2528292800385264, - "eval/acc": 37.20930099487305, + "epoch": 1.4978601997146934, + "eval/acc": 41.86046600341797, "step": 2100 }, { - "epoch": 0.2528292800385264, - "eval_loss": 2.626293182373047, - "eval_runtime": 0.2735, - "eval_samples_per_second": 157.235, - "eval_steps_per_second": 3.657, + "epoch": 1.4978601997146934, + "eval_loss": 2.9042038917541504, + "eval_runtime": 0.2146, + "eval_samples_per_second": 200.363, + "eval_steps_per_second": 4.66, "step": 2100 }, { - "epoch": 0.25403322899109076, - "grad_norm": 8.3125, + "epoch": 1.5049928673323825, + "grad_norm": 15.3125, "learning_rate": 8.436000000000001e-05, - "loss": 0.8363, + "loss": 1.0862, "step": 2110 }, { - "epoch": 0.2552371779436552, - "grad_norm": 8.625, + "epoch": 1.5121255349500713, + "grad_norm": 10.1875, "learning_rate": 8.476000000000001e-05, - "loss": 0.8762, + "loss": 1.0786, "step": 2120 }, { - "epoch": 0.2564411268962196, - "grad_norm": 7.4375, + "epoch": 1.5192582025677603, + "grad_norm": 8.25, "learning_rate": 8.516e-05, - "loss": 0.7925, + "loss": 1.1496, "step": 2130 }, { - "epoch": 0.257645075848784, - "grad_norm": 9.1875, + "epoch": 1.5263908701854494, + "grad_norm": 12.6875, "learning_rate": 8.556e-05, - "loss": 0.9575, + "loss": 1.1132, "step": 2140 }, { - "epoch": 0.2588490248013484, - "grad_norm": 9.8125, + "epoch": 1.5335235378031382, + "grad_norm": 21.375, "learning_rate": 8.596000000000001e-05, - "loss": 0.7551, + "loss": 1.1043, "step": 2150 }, { - "epoch": 0.26005297375391284, - "grad_norm": 7.15625, + "epoch": 1.5406562054208273, + "grad_norm": 8.5625, "learning_rate": 8.636e-05, - "loss": 0.808, + "loss": 1.2549, "step": 2160 }, { - "epoch": 0.26125692270647727, - "grad_norm": 8.3125, + "epoch": 1.5477888730385163, + "grad_norm": 8.6875, "learning_rate": 8.676e-05, - "loss": 0.9449, + "loss": 1.115, "step": 2170 }, { - "epoch": 0.26246087165904164, - "grad_norm": 11.5, + "epoch": 1.5549215406562054, + "grad_norm": 8.375, "learning_rate": 8.716000000000001e-05, - "loss": 0.8712, + "loss": 1.1963, "step": 2180 }, { - "epoch": 0.26366482061160607, - "grad_norm": 8.0, + "epoch": 1.5620542082738944, + "grad_norm": 8.3125, "learning_rate": 8.756000000000001e-05, - "loss": 0.9389, + "loss": 1.1697, "step": 2190 }, { - "epoch": 0.2648687695641705, - "grad_norm": 13.5, + "epoch": 1.5691868758915835, + "grad_norm": 7.40625, "learning_rate": 8.796e-05, - "loss": 0.7875, + "loss": 0.9716, "step": 2200 }, { - "epoch": 0.2648687695641705, - "eval/acc": 35.46511459350586, + "epoch": 1.5691868758915835, + "eval/acc": 39.53488540649414, "step": 2200 }, { - "epoch": 0.2648687695641705, - "eval_loss": 2.5862526893615723, - "eval_runtime": 0.2151, - "eval_samples_per_second": 199.927, - "eval_steps_per_second": 4.649, + "epoch": 1.5691868758915835, + "eval_loss": 3.021289587020874, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.958, + "eval_steps_per_second": 4.72, "step": 2200 }, { - "epoch": 0.26607271851673486, - "grad_norm": 11.5625, + "epoch": 1.5763195435092725, + "grad_norm": 10.0, "learning_rate": 8.836000000000001e-05, - "loss": 0.9947, + "loss": 1.0254, "step": 2210 }, { - "epoch": 0.2672766674692993, - "grad_norm": 8.25, + "epoch": 1.5834522111269616, + "grad_norm": 12.625, "learning_rate": 8.876e-05, - "loss": 0.717, + "loss": 1.1672, "step": 2220 }, { - "epoch": 0.2684806164218637, - "grad_norm": 26.25, + "epoch": 1.5905848787446506, + "grad_norm": 11.5, "learning_rate": 8.916e-05, - "loss": 0.8688, + "loss": 1.0656, "step": 2230 }, { - "epoch": 0.26968456537442814, - "grad_norm": 11.5, + "epoch": 1.5977175463623396, + "grad_norm": 8.8125, "learning_rate": 8.956e-05, - "loss": 0.9134, + "loss": 1.035, "step": 2240 }, { - "epoch": 0.2708885143269925, - "grad_norm": 6.875, + "epoch": 1.6048502139800287, + "grad_norm": 9.25, "learning_rate": 8.996e-05, - "loss": 0.8592, + "loss": 1.0972, "step": 2250 }, { - "epoch": 0.27209246327955694, - "grad_norm": 7.21875, + "epoch": 1.6119828815977175, + "grad_norm": 7.71875, "learning_rate": 9.036e-05, - "loss": 0.6548, + "loss": 1.0148, "step": 2260 }, { - "epoch": 0.27329641223212137, - "grad_norm": 12.25, + "epoch": 1.6191155492154066, + "grad_norm": 13.5, "learning_rate": 9.076e-05, - "loss": 0.8613, + "loss": 1.1202, "step": 2270 }, { - "epoch": 0.2745003611846858, - "grad_norm": 8.875, + "epoch": 1.6262482168330956, + "grad_norm": 9.125, "learning_rate": 9.116e-05, - "loss": 0.7455, + "loss": 1.1134, "step": 2280 }, { - "epoch": 0.27570431013725016, - "grad_norm": 12.5625, + "epoch": 1.6333808844507844, + "grad_norm": 15.25, "learning_rate": 9.156e-05, - "loss": 0.8458, + "loss": 1.0373, "step": 2290 }, { - "epoch": 0.2769082590898146, - "grad_norm": 8.8125, + "epoch": 1.6405135520684735, + "grad_norm": 9.125, "learning_rate": 9.196000000000001e-05, - "loss": 0.8003, + "loss": 1.0654, "step": 2300 }, { - "epoch": 0.2769082590898146, - "eval/acc": 32.55813980102539, + "epoch": 1.6405135520684735, + "eval/acc": 37.20930099487305, "step": 2300 }, { - "epoch": 0.2769082590898146, - "eval_loss": 2.6594340801239014, - "eval_runtime": 0.2129, - "eval_samples_per_second": 201.965, - "eval_steps_per_second": 4.697, + "epoch": 1.6405135520684735, + "eval_loss": 2.929560422897339, + "eval_runtime": 0.2167, + "eval_samples_per_second": 198.467, + "eval_steps_per_second": 4.616, "step": 2300 }, { - "epoch": 0.278112208042379, - "grad_norm": 10.6875, + "epoch": 1.6476462196861625, + "grad_norm": 8.25, "learning_rate": 9.236e-05, - "loss": 0.812, + "loss": 1.0218, "step": 2310 }, { - "epoch": 0.2793161569949434, - "grad_norm": 12.1875, + "epoch": 1.6547788873038516, + "grad_norm": 9.5625, "learning_rate": 9.276e-05, - "loss": 0.781, + "loss": 1.106, "step": 2320 }, { - "epoch": 0.2805201059475078, - "grad_norm": 8.125, + "epoch": 1.6619115549215406, + "grad_norm": 8.25, "learning_rate": 9.316000000000001e-05, - "loss": 0.9682, + "loss": 1.0558, "step": 2330 }, { - "epoch": 0.28172405490007224, - "grad_norm": 8.8125, + "epoch": 1.6690442225392297, + "grad_norm": 8.5625, "learning_rate": 9.356e-05, - "loss": 0.7531, + "loss": 0.9931, "step": 2340 }, { - "epoch": 0.28292800385263667, - "grad_norm": 7.375, + "epoch": 1.6761768901569187, + "grad_norm": 11.5625, "learning_rate": 9.396e-05, - "loss": 0.7235, + "loss": 1.0683, "step": 2350 }, { - "epoch": 0.28413195280520104, - "grad_norm": 7.8125, + "epoch": 1.6833095577746078, + "grad_norm": 10.0625, "learning_rate": 9.436e-05, - "loss": 0.9204, + "loss": 1.0631, "step": 2360 }, { - "epoch": 0.28533590175776546, - "grad_norm": 6.65625, + "epoch": 1.6904422253922968, + "grad_norm": 9.5625, "learning_rate": 9.476000000000001e-05, - "loss": 0.7636, + "loss": 1.049, "step": 2370 }, { - "epoch": 0.2865398507103299, - "grad_norm": 9.625, + "epoch": 1.6975748930099859, + "grad_norm": 12.8125, "learning_rate": 9.516e-05, - "loss": 0.855, + "loss": 1.0259, "step": 2380 }, { - "epoch": 0.2877437996628943, - "grad_norm": 9.6875, + "epoch": 1.7047075606276747, + "grad_norm": 9.0625, "learning_rate": 9.556e-05, - "loss": 0.8643, + "loss": 1.0085, "step": 2390 }, { - "epoch": 0.2889477486154587, - "grad_norm": 7.1875, + "epoch": 1.7118402282453637, + "grad_norm": 131.0, "learning_rate": 9.596000000000001e-05, - "loss": 0.8258, + "loss": 0.944, "step": 2400 }, { - "epoch": 0.2889477486154587, - "eval/acc": 36.627906799316406, + "epoch": 1.7118402282453637, + "eval/acc": 37.20930099487305, "step": 2400 }, { - "epoch": 0.2889477486154587, - "eval_loss": 2.7174084186553955, - "eval_runtime": 0.2111, - "eval_samples_per_second": 203.672, - "eval_steps_per_second": 4.737, + "epoch": 1.7118402282453637, + "eval_loss": 3.0231707096099854, + "eval_runtime": 0.2075, + "eval_samples_per_second": 207.206, + "eval_steps_per_second": 4.819, "step": 2400 }, { - "epoch": 0.2901516975680231, - "grad_norm": 7.65625, + "epoch": 1.7189728958630528, + "grad_norm": 8.375, "learning_rate": 9.636e-05, - "loss": 0.8752, + "loss": 1.0069, "step": 2410 }, { - "epoch": 0.29135564652058754, - "grad_norm": 8.75, + "epoch": 1.7261055634807418, + "grad_norm": 8.3125, "learning_rate": 9.676e-05, - "loss": 0.8082, + "loss": 1.0648, "step": 2420 }, { - "epoch": 0.2925595954731519, - "grad_norm": 10.4375, + "epoch": 1.7332382310984307, + "grad_norm": 11.0625, "learning_rate": 9.716000000000001e-05, - "loss": 0.7538, + "loss": 1.0594, "step": 2430 }, { - "epoch": 0.29376354442571634, - "grad_norm": 6.4375, + "epoch": 1.7403708987161197, + "grad_norm": 8.75, "learning_rate": 9.756000000000001e-05, - "loss": 0.7766, + "loss": 1.2082, "step": 2440 }, { - "epoch": 0.29496749337828077, - "grad_norm": 7.96875, + "epoch": 1.7475035663338088, + "grad_norm": 9.875, "learning_rate": 9.796e-05, - "loss": 0.844, + "loss": 1.0225, "step": 2450 }, { - "epoch": 0.2961714423308452, - "grad_norm": 7.75, + "epoch": 1.7546362339514978, + "grad_norm": 9.5625, "learning_rate": 9.836000000000001e-05, - "loss": 0.7127, + "loss": 0.9975, "step": 2460 }, { - "epoch": 0.29737539128340956, - "grad_norm": 11.5, + "epoch": 1.7617689015691869, + "grad_norm": 21.0, "learning_rate": 9.876000000000001e-05, - "loss": 0.8363, + "loss": 0.9533, "step": 2470 }, { - "epoch": 0.298579340235974, - "grad_norm": 6.4375, + "epoch": 1.768901569186876, + "grad_norm": 7.65625, "learning_rate": 9.916e-05, - "loss": 0.7429, + "loss": 0.9619, "step": 2480 }, { - "epoch": 0.2997832891885384, - "grad_norm": 11.5, + "epoch": 1.776034236804565, + "grad_norm": 13.625, "learning_rate": 9.956e-05, - "loss": 0.736, + "loss": 0.9425, "step": 2490 }, { - "epoch": 0.30098723814110284, - "grad_norm": 9.25, + "epoch": 1.783166904422254, + "grad_norm": 12.375, "learning_rate": 9.996000000000001e-05, - "loss": 0.8365, + "loss": 0.9893, "step": 2500 }, { - "epoch": 0.30098723814110284, + "epoch": 1.783166904422254, "eval/acc": 39.53488540649414, "step": 2500 }, { - "epoch": 0.30098723814110284, - "eval_loss": 2.713433027267456, - "eval_runtime": 0.2088, - "eval_samples_per_second": 205.919, - "eval_steps_per_second": 4.789, + "epoch": 1.783166904422254, + "eval_loss": 2.8728344440460205, + "eval_runtime": 0.2215, + "eval_samples_per_second": 194.132, + "eval_steps_per_second": 4.515, "step": 2500 }, { - "epoch": 0.3021911870936672, - "grad_norm": 7.03125, + "epoch": 1.790299572039943, + "grad_norm": 10.0, "learning_rate": 9.996000000000001e-05, - "loss": 0.7664, + "loss": 1.0137, "step": 2510 }, { - "epoch": 0.30339513604623164, - "grad_norm": 7.75, + "epoch": 1.797432239657632, + "grad_norm": 10.125, "learning_rate": 9.991555555555556e-05, - "loss": 0.9128, + "loss": 1.059, "step": 2520 }, { - "epoch": 0.30459908499879607, - "grad_norm": 9.0, + "epoch": 1.804564907275321, + "grad_norm": 32.0, "learning_rate": 9.987111111111111e-05, - "loss": 0.8045, + "loss": 1.0498, "step": 2530 }, { - "epoch": 0.30580303395136044, - "grad_norm": 8.9375, + "epoch": 1.81169757489301, + "grad_norm": 10.125, "learning_rate": 9.982666666666667e-05, - "loss": 0.8292, + "loss": 1.1431, "step": 2540 }, { - "epoch": 0.30700698290392486, - "grad_norm": 7.40625, + "epoch": 1.818830242510699, + "grad_norm": 7.90625, "learning_rate": 9.978222222222223e-05, - "loss": 0.7557, + "loss": 1.0715, "step": 2550 }, { - "epoch": 0.3082109318564893, - "grad_norm": 7.625, + "epoch": 1.825962910128388, + "grad_norm": 10.9375, "learning_rate": 9.973777777777778e-05, - "loss": 0.683, + "loss": 1.0446, "step": 2560 }, { - "epoch": 0.3094148808090537, - "grad_norm": 8.1875, + "epoch": 1.833095577746077, + "grad_norm": 13.0, "learning_rate": 9.969333333333334e-05, - "loss": 0.8052, + "loss": 1.0291, "step": 2570 }, { - "epoch": 0.3106188297616181, - "grad_norm": 8.4375, + "epoch": 1.840228245363766, + "grad_norm": 9.75, "learning_rate": 9.964888888888889e-05, - "loss": 0.7819, + "loss": 0.9713, "step": 2580 }, { - "epoch": 0.3118227787141825, - "grad_norm": 10.8125, + "epoch": 1.847360912981455, + "grad_norm": 10.5625, "learning_rate": 9.960444444444444e-05, - "loss": 0.8452, + "loss": 1.2157, "step": 2590 }, { - "epoch": 0.31302672766674694, - "grad_norm": 6.21875, + "epoch": 1.854493580599144, + "grad_norm": 9.3125, "learning_rate": 9.956e-05, - "loss": 0.7478, + "loss": 1.0455, "step": 2600 }, { - "epoch": 0.31302672766674694, - "eval/acc": 34.88372039794922, + "epoch": 1.854493580599144, + "eval/acc": 37.20930099487305, "step": 2600 }, { - "epoch": 0.31302672766674694, - "eval_loss": 2.6625020503997803, - "eval_runtime": 0.2061, - "eval_samples_per_second": 208.644, - "eval_steps_per_second": 4.852, + "epoch": 1.854493580599144, + "eval_loss": 2.951470375061035, + "eval_runtime": 0.2899, + "eval_samples_per_second": 148.338, + "eval_steps_per_second": 3.45, "step": 2600 }, { - "epoch": 0.31423067661931137, - "grad_norm": 7.375, + "epoch": 1.861626248216833, + "grad_norm": 10.5, "learning_rate": 9.951555555555556e-05, - "loss": 0.7623, + "loss": 1.0604, "step": 2610 }, { - "epoch": 0.31543462557187574, - "grad_norm": 9.0, + "epoch": 1.8687589158345221, + "grad_norm": 9.375, "learning_rate": 9.947111111111111e-05, - "loss": 0.8223, + "loss": 0.8715, "step": 2620 }, { - "epoch": 0.31663857452444016, - "grad_norm": 6.75, + "epoch": 1.8758915834522112, + "grad_norm": 10.4375, "learning_rate": 9.942666666666667e-05, - "loss": 0.7797, + "loss": 1.0034, "step": 2630 }, { - "epoch": 0.3178425234770046, - "grad_norm": 9.125, + "epoch": 1.8830242510699002, + "grad_norm": 8.0625, "learning_rate": 9.938222222222224e-05, - "loss": 0.6746, + "loss": 1.0557, "step": 2640 }, { - "epoch": 0.31904647242956896, - "grad_norm": 8.5, + "epoch": 1.8901569186875893, + "grad_norm": 7.21875, "learning_rate": 9.933777777777779e-05, - "loss": 0.8434, + "loss": 0.974, "step": 2650 }, { - "epoch": 0.3202504213821334, - "grad_norm": 10.3125, + "epoch": 1.8972895863052783, + "grad_norm": 10.875, "learning_rate": 9.929333333333333e-05, - "loss": 0.8625, + "loss": 1.1366, "step": 2660 }, { - "epoch": 0.3214543703346978, - "grad_norm": 8.125, + "epoch": 1.9044222539229672, + "grad_norm": 28.75, "learning_rate": 9.92488888888889e-05, - "loss": 0.8003, + "loss": 1.0135, "step": 2670 }, { - "epoch": 0.32265831928726224, - "grad_norm": 8.5625, + "epoch": 1.9115549215406562, + "grad_norm": 10.5625, "learning_rate": 9.920444444444444e-05, - "loss": 0.8145, + "loss": 1.0263, "step": 2680 }, { - "epoch": 0.3238622682398266, - "grad_norm": 8.0, + "epoch": 1.9186875891583453, + "grad_norm": 6.65625, "learning_rate": 9.916e-05, - "loss": 0.6519, + "loss": 0.9952, "step": 2690 }, { - "epoch": 0.32506621719239104, - "grad_norm": 8.5625, + "epoch": 1.925820256776034, + "grad_norm": 8.8125, "learning_rate": 9.911555555555557e-05, - "loss": 0.7627, + "loss": 1.0438, "step": 2700 }, { - "epoch": 0.32506621719239104, - "eval/acc": 38.953487396240234, + "epoch": 1.925820256776034, + "eval/acc": 39.53488540649414, "step": 2700 }, { - "epoch": 0.32506621719239104, - "eval_loss": 2.629239082336426, - "eval_runtime": 0.2162, - "eval_samples_per_second": 198.931, - "eval_steps_per_second": 4.626, + "epoch": 1.925820256776034, + "eval_loss": 2.8668925762176514, + "eval_runtime": 0.2154, + "eval_samples_per_second": 199.649, + "eval_steps_per_second": 4.643, "step": 2700 }, { - "epoch": 0.32627016614495546, - "grad_norm": 7.625, + "epoch": 1.9329529243937231, + "grad_norm": 7.1875, "learning_rate": 9.907111111111112e-05, - "loss": 0.7265, + "loss": 0.9522, "step": 2710 }, { - "epoch": 0.3274741150975199, - "grad_norm": 7.15625, + "epoch": 1.9400855920114122, + "grad_norm": 8.6875, "learning_rate": 9.902666666666666e-05, - "loss": 0.7468, + "loss": 0.9729, "step": 2720 }, { - "epoch": 0.32867806405008426, - "grad_norm": 8.5, + "epoch": 1.9472182596291012, + "grad_norm": 10.0625, "learning_rate": 9.898222222222223e-05, - "loss": 0.7816, + "loss": 1.0528, "step": 2730 }, { - "epoch": 0.3298820130026487, - "grad_norm": 6.8125, + "epoch": 1.9543509272467903, + "grad_norm": 8.8125, "learning_rate": 9.893777777777779e-05, - "loss": 0.7828, + "loss": 1.1212, "step": 2740 }, { - "epoch": 0.3310859619552131, - "grad_norm": 8.5625, + "epoch": 1.9614835948644793, + "grad_norm": 9.1875, "learning_rate": 9.889333333333334e-05, - "loss": 0.8273, + "loss": 0.9866, "step": 2750 }, { - "epoch": 0.3322899109077775, - "grad_norm": 7.28125, + "epoch": 1.9686162624821684, + "grad_norm": 8.25, "learning_rate": 9.884888888888889e-05, - "loss": 0.6265, + "loss": 0.8616, "step": 2760 }, { - "epoch": 0.3334938598603419, - "grad_norm": 7.78125, + "epoch": 1.9757489300998574, + "grad_norm": 8.5625, "learning_rate": 9.880444444444445e-05, - "loss": 0.8716, + "loss": 0.9972, "step": 2770 }, { - "epoch": 0.33469780881290634, - "grad_norm": 6.0, + "epoch": 1.9828815977175465, + "grad_norm": 10.0625, "learning_rate": 9.876000000000001e-05, - "loss": 0.7587, + "loss": 0.9781, "step": 2780 }, { - "epoch": 0.33590175776547077, - "grad_norm": 11.8125, + "epoch": 1.9900142653352355, + "grad_norm": 10.75, "learning_rate": 9.871555555555556e-05, - "loss": 0.836, + "loss": 1.0579, "step": 2790 }, { - "epoch": 0.33710570671803514, - "grad_norm": 8.3125, + "epoch": 1.9971469329529246, + "grad_norm": 8.25, "learning_rate": 9.867111111111112e-05, - "loss": 0.7196, + "loss": 1.0323, "step": 2800 }, { - "epoch": 0.33710570671803514, - "eval/acc": 34.88372039794922, + "epoch": 1.9971469329529246, + "eval/acc": 37.20930099487305, "step": 2800 }, { - "epoch": 0.33710570671803514, - "eval_loss": 2.5979089736938477, - "eval_runtime": 0.212, - "eval_samples_per_second": 202.843, - "eval_steps_per_second": 4.717, + "epoch": 1.9971469329529246, + "eval_loss": 2.9081883430480957, + "eval_runtime": 0.2058, + "eval_samples_per_second": 208.905, + "eval_steps_per_second": 4.858, "step": 2800 }, { - "epoch": 0.33830965567059956, - "grad_norm": 8.125, + "epoch": 2.0042796005706136, + "grad_norm": 10.25, "learning_rate": 9.862666666666667e-05, - "loss": 0.7128, + "loss": 1.0597, "step": 2810 }, { - "epoch": 0.339513604623164, - "grad_norm": 7.0, + "epoch": 2.011412268188302, + "grad_norm": 7.0625, "learning_rate": 9.858222222222223e-05, - "loss": 0.8709, + "loss": 0.9582, "step": 2820 }, { - "epoch": 0.3407175535757284, - "grad_norm": 10.875, + "epoch": 2.0185449358059913, + "grad_norm": 7.0625, "learning_rate": 9.853777777777778e-05, - "loss": 0.6885, + "loss": 1.0058, "step": 2830 }, { - "epoch": 0.3419215025282928, - "grad_norm": 6.625, + "epoch": 2.0256776034236803, + "grad_norm": 7.09375, "learning_rate": 9.849333333333334e-05, - "loss": 0.8262, + "loss": 1.009, "step": 2840 }, { - "epoch": 0.3431254514808572, - "grad_norm": 9.0625, + "epoch": 2.0328102710413694, + "grad_norm": 8.9375, "learning_rate": 9.844888888888889e-05, - "loss": 0.6365, + "loss": 0.93, "step": 2850 }, { - "epoch": 0.34432940043342164, - "grad_norm": 7.96875, + "epoch": 2.0399429386590584, + "grad_norm": 8.1875, "learning_rate": 9.840444444444445e-05, - "loss": 0.8177, + "loss": 1.0953, "step": 2860 }, { - "epoch": 0.345533349385986, - "grad_norm": 6.71875, + "epoch": 2.0470756062767475, + "grad_norm": 7.78125, "learning_rate": 9.836000000000001e-05, - "loss": 0.7043, + "loss": 1.0437, "step": 2870 }, { - "epoch": 0.34673729833855044, - "grad_norm": 10.4375, + "epoch": 2.0542082738944365, + "grad_norm": 8.75, "learning_rate": 9.831555555555556e-05, - "loss": 0.7503, + "loss": 0.9873, "step": 2880 }, { - "epoch": 0.34794124729111486, - "grad_norm": 7.375, + "epoch": 2.0613409415121255, + "grad_norm": 8.375, "learning_rate": 9.827111111111111e-05, - "loss": 0.7532, + "loss": 0.9414, "step": 2890 }, { - "epoch": 0.3491451962436793, - "grad_norm": 7.65625, + "epoch": 2.0684736091298146, + "grad_norm": 9.0, "learning_rate": 9.822666666666667e-05, - "loss": 0.6942, + "loss": 0.9625, "step": 2900 }, { - "epoch": 0.3491451962436793, - "eval/acc": 37.79069900512695, + "epoch": 2.0684736091298146, + "eval/acc": 51.16279220581055, "step": 2900 }, { - "epoch": 0.3491451962436793, - "eval_loss": 2.698911190032959, - "eval_runtime": 1.2554, - "eval_samples_per_second": 34.253, - "eval_steps_per_second": 0.797, + "epoch": 2.0684736091298146, + "eval_loss": 1.884637713432312, + "eval_runtime": 1.4017, + "eval_samples_per_second": 30.678, + "eval_steps_per_second": 0.713, "step": 2900 }, { - "epoch": 0.35034914519624366, - "grad_norm": 7.1875, + "epoch": 2.0756062767475036, + "grad_norm": 9.5625, "learning_rate": 9.818222222222223e-05, - "loss": 0.7651, + "loss": 1.0246, "step": 2910 }, { - "epoch": 0.3515530941488081, - "grad_norm": 6.0, + "epoch": 2.0827389443651927, + "grad_norm": 8.125, "learning_rate": 9.813777777777778e-05, - "loss": 0.7786, + "loss": 0.9646, "step": 2920 }, { - "epoch": 0.3527570431013725, - "grad_norm": 9.375, + "epoch": 2.0898716119828817, + "grad_norm": 8.3125, "learning_rate": 9.809333333333333e-05, - "loss": 0.8285, + "loss": 1.0022, "step": 2930 }, { - "epoch": 0.35396099205393694, - "grad_norm": 6.4375, + "epoch": 2.097004279600571, + "grad_norm": 8.625, "learning_rate": 9.80488888888889e-05, - "loss": 0.7339, + "loss": 0.9834, "step": 2940 }, { - "epoch": 0.3551649410065013, - "grad_norm": 8.8125, + "epoch": 2.10413694721826, + "grad_norm": 45.25, "learning_rate": 9.800444444444446e-05, - "loss": 0.6948, + "loss": 0.9159, "step": 2950 }, { - "epoch": 0.35636888995906574, - "grad_norm": 11.4375, + "epoch": 2.1112696148359484, + "grad_norm": 9.375, "learning_rate": 9.796e-05, - "loss": 0.8455, + "loss": 1.0598, "step": 2960 }, { - "epoch": 0.35757283891163016, - "grad_norm": 8.5625, + "epoch": 2.1184022824536375, + "grad_norm": 6.90625, "learning_rate": 9.791555555555557e-05, - "loss": 0.791, + "loss": 0.8848, "step": 2970 }, { - "epoch": 0.35877678786419454, - "grad_norm": 7.84375, + "epoch": 2.1255349500713265, + "grad_norm": 7.5625, "learning_rate": 9.787111111111111e-05, - "loss": 0.8574, + "loss": 0.942, "step": 2980 }, { - "epoch": 0.35998073681675896, - "grad_norm": 9.4375, + "epoch": 2.1326676176890156, + "grad_norm": 8.6875, "learning_rate": 9.782666666666666e-05, - "loss": 0.7923, + "loss": 0.9583, "step": 2990 }, { - "epoch": 0.3611846857693234, - "grad_norm": 8.0625, + "epoch": 2.1398002853067046, + "grad_norm": 9.0, "learning_rate": 9.778222222222222e-05, - "loss": 0.863, + "loss": 0.9836, "step": 3000 }, { - "epoch": 0.3611846857693234, - "eval/acc": 41.86046600341797, + "epoch": 2.1398002853067046, + "eval/acc": 48.83720779418945, "step": 3000 }, { - "epoch": 0.3611846857693234, - "eval_loss": 2.5240559577941895, - "eval_runtime": 0.2105, - "eval_samples_per_second": 204.269, - "eval_steps_per_second": 4.75, + "epoch": 2.1398002853067046, + "eval_loss": 1.9625970125198364, + "eval_runtime": 0.2298, + "eval_samples_per_second": 187.09, + "eval_steps_per_second": 4.351, "step": 3000 }, { - "epoch": 0.3623886347218878, - "grad_norm": 6.71875, + "epoch": 2.1469329529243937, + "grad_norm": 8.4375, "learning_rate": 9.773777777777779e-05, - "loss": 0.7726, + "loss": 1.028, "step": 3010 }, { - "epoch": 0.3635925836744522, - "grad_norm": 8.125, + "epoch": 2.1540656205420827, + "grad_norm": 10.4375, "learning_rate": 9.769333333333334e-05, - "loss": 0.8234, + "loss": 0.9209, "step": 3020 }, { - "epoch": 0.3647965326270166, - "grad_norm": 7.90625, + "epoch": 2.1611982881597718, + "grad_norm": 9.3125, "learning_rate": 9.764888888888888e-05, - "loss": 0.8125, + "loss": 0.9999, "step": 3030 }, { - "epoch": 0.36600048157958104, - "grad_norm": 5.875, + "epoch": 2.168330955777461, + "grad_norm": 8.375, "learning_rate": 9.760444444444446e-05, - "loss": 0.739, + "loss": 0.9576, "step": 3040 }, { - "epoch": 0.3672044305321454, - "grad_norm": 32.75, + "epoch": 2.17546362339515, + "grad_norm": 7.4375, "learning_rate": 9.756000000000001e-05, - "loss": 0.8773, + "loss": 0.8832, "step": 3050 }, { - "epoch": 0.36840837948470984, - "grad_norm": 8.625, + "epoch": 2.182596291012839, + "grad_norm": 8.125, "learning_rate": 9.751555555555556e-05, - "loss": 0.6411, + "loss": 0.933, "step": 3060 }, { - "epoch": 0.36961232843727426, - "grad_norm": 10.0625, + "epoch": 2.189728958630528, + "grad_norm": 8.9375, "learning_rate": 9.747111111111112e-05, - "loss": 0.7757, + "loss": 0.9962, "step": 3070 }, { - "epoch": 0.3708162773898387, - "grad_norm": 7.78125, + "epoch": 2.196861626248217, + "grad_norm": 7.1875, "learning_rate": 9.742666666666667e-05, - "loss": 0.8144, + "loss": 1.003, "step": 3080 }, { - "epoch": 0.37202022634240306, - "grad_norm": 8.25, + "epoch": 2.2039942938659056, + "grad_norm": 8.1875, "learning_rate": 9.738222222222223e-05, - "loss": 0.7915, + "loss": 0.9441, "step": 3090 }, { - "epoch": 0.3732241752949675, - "grad_norm": 9.5, + "epoch": 2.2111269614835947, + "grad_norm": 7.21875, "learning_rate": 9.733777777777778e-05, - "loss": 0.7808, + "loss": 1.0335, "step": 3100 }, { - "epoch": 0.3732241752949675, - "eval/acc": 39.53488540649414, + "epoch": 2.2111269614835947, + "eval/acc": 51.16279220581055, "step": 3100 }, { - "epoch": 0.3732241752949675, - "eval_loss": 2.6263325214385986, - "eval_runtime": 0.2107, - "eval_samples_per_second": 204.065, - "eval_steps_per_second": 4.746, + "epoch": 2.2111269614835947, + "eval_loss": 1.8829365968704224, + "eval_runtime": 0.248, + "eval_samples_per_second": 173.381, + "eval_steps_per_second": 4.032, "step": 3100 }, { - "epoch": 0.3744281242475319, - "grad_norm": 7.34375, + "epoch": 2.2182596291012837, + "grad_norm": 9.1875, "learning_rate": 9.729333333333334e-05, - "loss": 0.6467, + "loss": 0.9694, "step": 3110 }, { - "epoch": 0.37563207320009634, - "grad_norm": 10.5625, + "epoch": 2.2253922967189728, + "grad_norm": 6.9375, "learning_rate": 9.724888888888889e-05, - "loss": 0.7271, + "loss": 1.0386, "step": 3120 }, { - "epoch": 0.3768360221526607, - "grad_norm": 19.375, + "epoch": 2.232524964336662, + "grad_norm": 8.6875, "learning_rate": 9.720444444444445e-05, - "loss": 0.8248, + "loss": 0.9614, "step": 3130 }, { - "epoch": 0.37803997110522514, - "grad_norm": 11.6875, + "epoch": 2.239657631954351, + "grad_norm": 8.3125, "learning_rate": 9.716000000000001e-05, - "loss": 0.7468, + "loss": 1.0643, "step": 3140 }, { - "epoch": 0.37924392005778956, - "grad_norm": 6.71875, + "epoch": 2.24679029957204, + "grad_norm": 8.125, "learning_rate": 9.711555555555556e-05, - "loss": 0.8189, + "loss": 0.9243, "step": 3150 }, { - "epoch": 0.38044786901035393, - "grad_norm": 7.15625, + "epoch": 2.253922967189729, + "grad_norm": 9.125, "learning_rate": 9.707111111111111e-05, - "loss": 0.7265, + "loss": 0.8419, "step": 3160 }, { - "epoch": 0.38165181796291836, - "grad_norm": 11.9375, + "epoch": 2.261055634807418, + "grad_norm": 9.125, "learning_rate": 9.702666666666667e-05, - "loss": 0.7502, + "loss": 0.9961, "step": 3170 }, { - "epoch": 0.3828557669154828, - "grad_norm": 7.78125, + "epoch": 2.268188302425107, + "grad_norm": 6.3125, "learning_rate": 9.698222222222223e-05, - "loss": 0.8412, + "loss": 0.8931, "step": 3180 }, { - "epoch": 0.3840597158680472, - "grad_norm": 6.75, + "epoch": 2.275320970042796, + "grad_norm": 7.875, "learning_rate": 9.693777777777778e-05, - "loss": 0.8689, + "loss": 1.0057, "step": 3190 }, { - "epoch": 0.3852636648206116, - "grad_norm": 7.6875, + "epoch": 2.282453637660485, + "grad_norm": 6.90625, "learning_rate": 9.689333333333333e-05, - "loss": 0.8053, + "loss": 0.9606, "step": 3200 }, { - "epoch": 0.3852636648206116, - "eval/acc": 39.53488540649414, + "epoch": 2.282453637660485, + "eval/acc": 48.83720779418945, "step": 3200 }, { - "epoch": 0.3852636648206116, - "eval_loss": 2.6145706176757812, - "eval_runtime": 0.2093, - "eval_samples_per_second": 205.398, - "eval_steps_per_second": 4.777, + "epoch": 2.282453637660485, + "eval_loss": 1.823419451713562, + "eval_runtime": 0.2149, + "eval_samples_per_second": 200.139, + "eval_steps_per_second": 4.654, "step": 3200 }, { - "epoch": 0.386467613773176, - "grad_norm": 7.65625, + "epoch": 2.289586305278174, + "grad_norm": 11.8125, "learning_rate": 9.684888888888889e-05, - "loss": 0.7601, + "loss": 0.9218, "step": 3210 }, { - "epoch": 0.38767156272574044, - "grad_norm": 19.25, + "epoch": 2.2967189728958632, + "grad_norm": 8.9375, "learning_rate": 9.680444444444445e-05, - "loss": 0.7944, + "loss": 1.0111, "step": 3220 }, { - "epoch": 0.38887551167830486, - "grad_norm": 9.375, + "epoch": 2.3038516405135523, + "grad_norm": 8.625, "learning_rate": 9.676e-05, - "loss": 0.839, + "loss": 1.0968, "step": 3230 }, { - "epoch": 0.39007946063086923, - "grad_norm": 8.5, + "epoch": 2.310984308131241, + "grad_norm": 7.1875, "learning_rate": 9.671555555555556e-05, - "loss": 0.7794, + "loss": 1.0236, "step": 3240 }, { - "epoch": 0.39128340958343366, - "grad_norm": 7.78125, + "epoch": 2.31811697574893, + "grad_norm": 6.84375, "learning_rate": 9.667111111111111e-05, - "loss": 0.753, + "loss": 0.92, "step": 3250 }, { - "epoch": 0.3924873585359981, - "grad_norm": 7.15625, + "epoch": 2.325249643366619, + "grad_norm": 8.75, "learning_rate": 9.662666666666667e-05, - "loss": 0.7326, + "loss": 0.8205, "step": 3260 }, { - "epoch": 0.39369130748856246, - "grad_norm": 13.4375, + "epoch": 2.332382310984308, + "grad_norm": 30.75, "learning_rate": 9.658222222222222e-05, - "loss": 0.6754, + "loss": 0.9676, "step": 3270 }, { - "epoch": 0.3948952564411269, - "grad_norm": 6.71875, + "epoch": 2.339514978601997, + "grad_norm": 13.0, "learning_rate": 9.653777777777778e-05, - "loss": 0.757, + "loss": 0.9086, "step": 3280 }, { - "epoch": 0.3960992053936913, - "grad_norm": 7.5625, + "epoch": 2.346647646219686, + "grad_norm": 9.375, "learning_rate": 9.649333333333333e-05, - "loss": 0.9203, + "loss": 1.0504, "step": 3290 }, { - "epoch": 0.39730315434625574, - "grad_norm": 8.375, + "epoch": 2.353780313837375, + "grad_norm": 39.0, "learning_rate": 9.64488888888889e-05, - "loss": 0.8552, + "loss": 0.9481, "step": 3300 }, { - "epoch": 0.39730315434625574, - "eval/acc": 44.1860466003418, + "epoch": 2.353780313837375, + "eval/acc": 46.511627197265625, "step": 3300 }, { - "epoch": 0.39730315434625574, - "eval_loss": 2.571866273880005, - "eval_runtime": 0.2083, - "eval_samples_per_second": 206.479, - "eval_steps_per_second": 4.802, + "epoch": 2.353780313837375, + "eval_loss": 1.9555243253707886, + "eval_runtime": 0.6637, + "eval_samples_per_second": 64.792, + "eval_steps_per_second": 1.507, "step": 3300 }, { - "epoch": 0.3985071032988201, - "grad_norm": 7.5625, + "epoch": 2.3609129814550642, + "grad_norm": 11.4375, "learning_rate": 9.640444444444446e-05, - "loss": 0.7811, + "loss": 0.9641, "step": 3310 }, { - "epoch": 0.39971105225138454, - "grad_norm": 11.75, + "epoch": 2.3680456490727533, + "grad_norm": 9.0625, "learning_rate": 9.636e-05, - "loss": 0.6717, + "loss": 0.9624, "step": 3320 }, { - "epoch": 0.40091500120394896, - "grad_norm": 8.1875, + "epoch": 2.3751783166904423, + "grad_norm": 12.625, "learning_rate": 9.631555555555555e-05, - "loss": 0.838, + "loss": 1.0082, "step": 3330 }, { - "epoch": 0.4021189501565134, - "grad_norm": 6.40625, + "epoch": 2.3823109843081314, + "grad_norm": 7.25, "learning_rate": 9.627111111111112e-05, - "loss": 0.8568, + "loss": 1.0249, "step": 3340 }, { - "epoch": 0.40332289910907776, - "grad_norm": 7.3125, + "epoch": 2.3894436519258204, + "grad_norm": 13.375, "learning_rate": 9.622666666666668e-05, - "loss": 0.6742, + "loss": 1.0153, "step": 3350 }, { - "epoch": 0.4045268480616422, - "grad_norm": 7.875, + "epoch": 2.3965763195435095, + "grad_norm": 6.6875, "learning_rate": 9.618222222222223e-05, - "loss": 0.7849, + "loss": 0.9533, "step": 3360 }, { - "epoch": 0.4057307970142066, - "grad_norm": 8.5625, + "epoch": 2.403708987161198, + "grad_norm": 9.25, "learning_rate": 9.613777777777779e-05, - "loss": 0.7537, + "loss": 1.1051, "step": 3370 }, { - "epoch": 0.406934745966771, - "grad_norm": 8.5625, + "epoch": 2.410841654778887, + "grad_norm": 9.5625, "learning_rate": 9.609333333333334e-05, - "loss": 0.6935, + "loss": 1.0551, "step": 3380 }, { - "epoch": 0.4081386949193354, - "grad_norm": 6.3125, + "epoch": 2.417974322396576, + "grad_norm": 7.21875, "learning_rate": 9.604888888888889e-05, - "loss": 0.8065, + "loss": 0.9032, "step": 3390 }, { - "epoch": 0.40934264387189984, - "grad_norm": 26.25, + "epoch": 2.425106990014265, + "grad_norm": 8.5625, "learning_rate": 9.600444444444445e-05, - "loss": 0.6558, + "loss": 1.1008, "step": 3400 }, { - "epoch": 0.40934264387189984, - "eval/acc": 37.20930099487305, + "epoch": 2.425106990014265, + "eval/acc": 51.16279220581055, "step": 3400 }, { - "epoch": 0.40934264387189984, - "eval_loss": 2.7212982177734375, - "eval_runtime": 0.2094, - "eval_samples_per_second": 205.345, - "eval_steps_per_second": 4.775, + "epoch": 2.425106990014265, + "eval_loss": 1.7766540050506592, + "eval_runtime": 0.2273, + "eval_samples_per_second": 189.16, + "eval_steps_per_second": 4.399, "step": 3400 }, { - "epoch": 0.41054659282446426, - "grad_norm": 6.84375, + "epoch": 2.4322396576319543, + "grad_norm": 10.375, "learning_rate": 9.596000000000001e-05, - "loss": 0.7642, + "loss": 0.9562, "step": 3410 }, { - "epoch": 0.41175054177702863, - "grad_norm": 7.0625, + "epoch": 2.4393723252496433, + "grad_norm": 8.9375, "learning_rate": 9.591555555555556e-05, - "loss": 0.7185, + "loss": 1.0756, "step": 3420 }, { - "epoch": 0.41295449072959306, - "grad_norm": 7.15625, + "epoch": 2.4465049928673324, + "grad_norm": 9.125, "learning_rate": 9.58711111111111e-05, - "loss": 0.6634, + "loss": 0.9554, "step": 3430 }, { - "epoch": 0.4141584396821575, - "grad_norm": 4.96875, + "epoch": 2.4536376604850214, + "grad_norm": 8.9375, "learning_rate": 9.582666666666668e-05, - "loss": 0.6383, + "loss": 0.9122, "step": 3440 }, { - "epoch": 0.4153623886347219, - "grad_norm": 7.15625, + "epoch": 2.4607703281027105, + "grad_norm": 8.625, "learning_rate": 9.578222222222223e-05, - "loss": 0.8032, + "loss": 0.9311, "step": 3450 }, { - "epoch": 0.4165663375872863, - "grad_norm": 9.0625, + "epoch": 2.4679029957203995, + "grad_norm": 6.65625, "learning_rate": 9.573777777777778e-05, - "loss": 0.7294, + "loss": 1.0023, "step": 3460 }, { - "epoch": 0.4177702865398507, - "grad_norm": 9.5, + "epoch": 2.4750356633380886, + "grad_norm": 8.125, "learning_rate": 9.569333333333334e-05, - "loss": 0.802, + "loss": 0.9172, "step": 3470 }, { - "epoch": 0.41897423549241514, - "grad_norm": 7.0, + "epoch": 2.4821683309557776, + "grad_norm": 7.375, "learning_rate": 9.56488888888889e-05, - "loss": 0.7307, + "loss": 0.9407, "step": 3480 }, { - "epoch": 0.4201781844449795, - "grad_norm": 6.34375, + "epoch": 2.4893009985734667, + "grad_norm": 10.25, "learning_rate": 9.560444444444445e-05, - "loss": 0.7239, + "loss": 0.9433, "step": 3490 }, { - "epoch": 0.42138213339754393, - "grad_norm": 6.5, + "epoch": 2.4964336661911553, + "grad_norm": 8.625, "learning_rate": 9.556e-05, - "loss": 0.6711, + "loss": 0.9934, "step": 3500 }, { - "epoch": 0.42138213339754393, - "eval/acc": 39.53488540649414, + "epoch": 2.4964336661911553, + "eval/acc": 53.488372802734375, "step": 3500 }, { - "epoch": 0.42138213339754393, - "eval_loss": 2.569326400756836, - "eval_runtime": 0.2066, - "eval_samples_per_second": 208.137, - "eval_steps_per_second": 4.84, + "epoch": 2.4964336661911553, + "eval_loss": 1.9511696100234985, + "eval_runtime": 0.2213, + "eval_samples_per_second": 194.329, + "eval_steps_per_second": 4.519, "step": 3500 }, { - "epoch": 0.42258608235010836, - "grad_norm": 8.125, + "epoch": 2.5035663338088447, + "grad_norm": 7.625, "learning_rate": 9.551555555555556e-05, - "loss": 0.695, + "loss": 0.9157, "step": 3510 }, { - "epoch": 0.4237900313026728, - "grad_norm": 8.3125, + "epoch": 2.5106990014265333, + "grad_norm": 7.4375, "learning_rate": 9.547111111111111e-05, - "loss": 0.8691, + "loss": 0.9202, "step": 3520 }, { - "epoch": 0.42499398025523716, - "grad_norm": 8.6875, + "epoch": 2.5178316690442224, + "grad_norm": 9.25, "learning_rate": 9.542666666666667e-05, - "loss": 0.7582, + "loss": 0.8526, "step": 3530 }, { - "epoch": 0.4261979292078016, - "grad_norm": 7.25, + "epoch": 2.5249643366619114, + "grad_norm": 7.71875, "learning_rate": 9.538222222222223e-05, - "loss": 0.7143, + "loss": 0.9562, "step": 3540 }, { - "epoch": 0.427401878160366, - "grad_norm": 8.6875, + "epoch": 2.5320970042796005, + "grad_norm": 9.75, "learning_rate": 9.533777777777778e-05, - "loss": 0.6754, + "loss": 0.9927, "step": 3550 }, { - "epoch": 0.42860582711293044, - "grad_norm": 7.8125, + "epoch": 2.5392296718972895, + "grad_norm": 8.1875, "learning_rate": 9.529333333333333e-05, - "loss": 0.7153, + "loss": 0.9263, "step": 3560 }, { - "epoch": 0.4298097760654948, - "grad_norm": 7.5625, + "epoch": 2.5463623395149786, + "grad_norm": 6.9375, "learning_rate": 9.52488888888889e-05, - "loss": 0.7293, + "loss": 0.9367, "step": 3570 }, { - "epoch": 0.43101372501805923, - "grad_norm": 7.5625, + "epoch": 2.5534950071326676, + "grad_norm": 9.5625, "learning_rate": 9.520444444444446e-05, - "loss": 0.7066, + "loss": 0.9284, "step": 3580 }, { - "epoch": 0.43221767397062366, - "grad_norm": 8.1875, + "epoch": 2.5606276747503567, + "grad_norm": 8.5625, "learning_rate": 9.516e-05, - "loss": 0.691, + "loss": 0.8394, "step": 3590 }, { - "epoch": 0.43342162292318803, - "grad_norm": 7.125, + "epoch": 2.5677603423680457, + "grad_norm": 10.25, "learning_rate": 9.511555555555555e-05, - "loss": 0.8239, + "loss": 0.9336, "step": 3600 }, { - "epoch": 0.43342162292318803, - "eval/acc": 44.1860466003418, + "epoch": 2.5677603423680457, + "eval/acc": 46.511627197265625, "step": 3600 }, { - "epoch": 0.43342162292318803, - "eval_loss": 2.4877374172210693, - "eval_runtime": 0.3957, - "eval_samples_per_second": 108.658, - "eval_steps_per_second": 2.527, + "epoch": 2.5677603423680457, + "eval_loss": 1.9759221076965332, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.935, + "eval_steps_per_second": 4.719, "step": 3600 }, { - "epoch": 0.43462557187575246, - "grad_norm": 6.375, + "epoch": 2.574893009985735, + "grad_norm": 10.0625, "learning_rate": 9.507111111111111e-05, - "loss": 0.6782, + "loss": 1.0005, "step": 3610 }, { - "epoch": 0.4358295208283169, - "grad_norm": 7.1875, + "epoch": 2.582025677603424, + "grad_norm": 8.375, "learning_rate": 9.502666666666668e-05, - "loss": 0.7602, + "loss": 0.9319, "step": 3620 }, { - "epoch": 0.4370334697808813, - "grad_norm": 8.125, + "epoch": 2.5891583452211124, + "grad_norm": 8.5, "learning_rate": 9.498222222222222e-05, - "loss": 0.7232, + "loss": 0.9125, "step": 3630 }, { - "epoch": 0.4382374187334457, - "grad_norm": 7.84375, + "epoch": 2.596291012838802, + "grad_norm": 7.71875, "learning_rate": 9.493777777777779e-05, - "loss": 0.729, + "loss": 0.9279, "step": 3640 }, { - "epoch": 0.4394413676860101, - "grad_norm": 8.375, + "epoch": 2.6034236804564905, + "grad_norm": 11.875, "learning_rate": 9.489333333333334e-05, - "loss": 0.8222, + "loss": 0.952, "step": 3650 }, { - "epoch": 0.44064531663857454, - "grad_norm": 8.125, + "epoch": 2.6105563480741796, + "grad_norm": 7.5625, "learning_rate": 9.48488888888889e-05, - "loss": 0.6918, + "loss": 1.0043, "step": 3660 }, { - "epoch": 0.44184926559113896, - "grad_norm": 8.1875, + "epoch": 2.6176890156918686, + "grad_norm": 11.5625, "learning_rate": 9.480444444444445e-05, - "loss": 0.6761, + "loss": 0.8932, "step": 3670 }, { - "epoch": 0.44305321454370333, - "grad_norm": 5.65625, + "epoch": 2.6248216833095577, + "grad_norm": 7.84375, "learning_rate": 9.476000000000001e-05, - "loss": 0.7532, + "loss": 0.8775, "step": 3680 }, { - "epoch": 0.44425716349626776, - "grad_norm": 8.8125, + "epoch": 2.6319543509272467, + "grad_norm": 9.0, "learning_rate": 9.471555555555556e-05, - "loss": 0.7072, + "loss": 0.9756, "step": 3690 }, { - "epoch": 0.4454611124488322, - "grad_norm": 6.5625, + "epoch": 2.6390870185449358, + "grad_norm": 7.375, "learning_rate": 9.46711111111111e-05, - "loss": 0.8405, + "loss": 0.9345, "step": 3700 }, { - "epoch": 0.4454611124488322, - "eval/acc": 39.53488540649414, + "epoch": 2.6390870185449358, + "eval/acc": 51.16279220581055, "step": 3700 }, { - "epoch": 0.4454611124488322, - "eval_loss": 2.615053176879883, - "eval_runtime": 4.8304, - "eval_samples_per_second": 8.902, - "eval_steps_per_second": 0.207, + "epoch": 2.6390870185449358, + "eval_loss": 1.9134849309921265, + "eval_runtime": 0.2322, + "eval_samples_per_second": 185.194, + "eval_steps_per_second": 4.307, "step": 3700 }, { - "epoch": 0.44666506140139656, - "grad_norm": 8.6875, + "epoch": 2.646219686162625, + "grad_norm": 8.4375, "learning_rate": 9.462666666666668e-05, - "loss": 0.7249, + "loss": 0.9851, "step": 3710 }, { - "epoch": 0.447869010353961, - "grad_norm": 8.4375, + "epoch": 2.653352353780314, + "grad_norm": 31.75, "learning_rate": 9.458222222222223e-05, - "loss": 0.8561, + "loss": 0.9712, "step": 3720 }, { - "epoch": 0.4490729593065254, - "grad_norm": 7.3125, + "epoch": 2.660485021398003, + "grad_norm": 6.75, "learning_rate": 9.453777777777778e-05, - "loss": 0.7884, + "loss": 0.8641, "step": 3730 }, { - "epoch": 0.45027690825908984, - "grad_norm": 7.34375, + "epoch": 2.667617689015692, + "grad_norm": 6.5625, "learning_rate": 9.449333333333334e-05, - "loss": 0.7169, + "loss": 0.945, "step": 3740 }, { - "epoch": 0.4514808572116542, - "grad_norm": 5.5, + "epoch": 2.674750356633381, + "grad_norm": 6.0625, "learning_rate": 9.44488888888889e-05, - "loss": 0.7542, + "loss": 0.9535, "step": 3750 }, { - "epoch": 0.45268480616421863, - "grad_norm": 6.09375, + "epoch": 2.68188302425107, + "grad_norm": 7.90625, "learning_rate": 9.440444444444445e-05, - "loss": 0.6292, + "loss": 0.8844, "step": 3760 }, { - "epoch": 0.45388875511678306, - "grad_norm": 8.9375, + "epoch": 2.689015691868759, + "grad_norm": 9.8125, "learning_rate": 9.436e-05, - "loss": 0.6682, + "loss": 0.9064, "step": 3770 }, { - "epoch": 0.4550927040693475, - "grad_norm": 5.09375, + "epoch": 2.6961483594864477, + "grad_norm": 8.4375, "learning_rate": 9.431555555555556e-05, - "loss": 0.6499, + "loss": 1.0119, "step": 3780 }, { - "epoch": 0.45629665302191186, - "grad_norm": 8.5, + "epoch": 2.703281027104137, + "grad_norm": 7.15625, "learning_rate": 9.427111111111112e-05, - "loss": 0.7859, + "loss": 0.9655, "step": 3790 }, { - "epoch": 0.4575006019744763, - "grad_norm": 14.5, + "epoch": 2.710413694721826, + "grad_norm": 9.4375, "learning_rate": 9.422666666666667e-05, - "loss": 0.7987, + "loss": 0.9187, "step": 3800 }, { - "epoch": 0.4575006019744763, - "eval/acc": 39.53488540649414, + "epoch": 2.710413694721826, + "eval/acc": 51.16279220581055, "step": 3800 }, { - "epoch": 0.4575006019744763, - "eval_loss": 2.645066022872925, - "eval_runtime": 0.6165, - "eval_samples_per_second": 69.745, - "eval_steps_per_second": 1.622, + "epoch": 2.710413694721826, + "eval_loss": 1.9277268648147583, + "eval_runtime": 0.2166, + "eval_samples_per_second": 198.481, + "eval_steps_per_second": 4.616, "step": 3800 }, { - "epoch": 0.4587045509270407, - "grad_norm": 6.25, + "epoch": 2.717546362339515, + "grad_norm": 9.25, "learning_rate": 9.418222222222223e-05, - "loss": 0.7035, + "loss": 0.8689, "step": 3810 }, { - "epoch": 0.4599084998796051, - "grad_norm": 6.46875, + "epoch": 2.724679029957204, + "grad_norm": 8.0625, "learning_rate": 9.413777777777778e-05, - "loss": 0.6329, + "loss": 0.9138, "step": 3820 }, { - "epoch": 0.4611124488321695, - "grad_norm": 8.875, + "epoch": 2.731811697574893, + "grad_norm": 14.3125, "learning_rate": 9.409333333333333e-05, - "loss": 0.7553, + "loss": 0.9129, "step": 3830 }, { - "epoch": 0.46231639778473393, - "grad_norm": 9.3125, + "epoch": 2.738944365192582, + "grad_norm": 6.78125, "learning_rate": 9.404888888888889e-05, - "loss": 0.6551, + "loss": 0.8666, "step": 3840 }, { - "epoch": 0.46352034673729836, - "grad_norm": 11.0625, + "epoch": 2.746077032810271, + "grad_norm": 7.4375, "learning_rate": 9.400444444444445e-05, - "loss": 0.6634, + "loss": 0.9474, "step": 3850 }, { - "epoch": 0.46472429568986273, - "grad_norm": 6.71875, + "epoch": 2.75320970042796, + "grad_norm": 7.46875, "learning_rate": 9.396e-05, - "loss": 0.6527, + "loss": 0.9312, "step": 3860 }, { - "epoch": 0.46592824464242716, - "grad_norm": 6.75, + "epoch": 2.760342368045649, + "grad_norm": 7.84375, "learning_rate": 9.391555555555555e-05, - "loss": 0.8268, + "loss": 0.943, "step": 3870 }, { - "epoch": 0.4671321935949916, - "grad_norm": 7.78125, + "epoch": 2.767475035663338, + "grad_norm": 8.125, "learning_rate": 9.387111111111113e-05, - "loss": 0.742, + "loss": 0.9471, "step": 3880 }, { - "epoch": 0.468336142547556, - "grad_norm": 6.53125, + "epoch": 2.7746077032810272, + "grad_norm": 10.5625, "learning_rate": 9.382666666666667e-05, - "loss": 0.7446, + "loss": 0.9785, "step": 3890 }, { - "epoch": 0.4695400915001204, - "grad_norm": 7.0625, + "epoch": 2.7817403708987163, + "grad_norm": 10.5, "learning_rate": 9.378222222222222e-05, - "loss": 0.7764, + "loss": 1.0151, "step": 3900 }, { - "epoch": 0.4695400915001204, - "eval/acc": 37.79069900512695, + "epoch": 2.7817403708987163, + "eval/acc": 44.1860466003418, "step": 3900 }, { - "epoch": 0.4695400915001204, - "eval_loss": 2.6463897228240967, - "eval_runtime": 1.4145, - "eval_samples_per_second": 30.4, - "eval_steps_per_second": 0.707, + "epoch": 2.7817403708987163, + "eval_loss": 1.970198154449463, + "eval_runtime": 0.2139, + "eval_samples_per_second": 201.028, + "eval_steps_per_second": 4.675, "step": 3900 }, { - "epoch": 0.4707440404526848, - "grad_norm": 5.625, + "epoch": 2.788873038516405, + "grad_norm": 9.75, "learning_rate": 9.373777777777778e-05, - "loss": 0.7248, + "loss": 0.9148, "step": 3910 }, { - "epoch": 0.47194798940524924, - "grad_norm": 7.09375, + "epoch": 2.7960057061340944, + "grad_norm": 9.1875, "learning_rate": 9.369333333333333e-05, - "loss": 0.6977, + "loss": 1.0314, "step": 3920 }, { - "epoch": 0.4731519383578136, - "grad_norm": 7.53125, + "epoch": 2.803138373751783, + "grad_norm": 8.375, "learning_rate": 9.36488888888889e-05, - "loss": 0.6496, + "loss": 0.9076, "step": 3930 }, { - "epoch": 0.47435588731037803, - "grad_norm": 11.0, + "epoch": 2.810271041369472, + "grad_norm": 6.46875, "learning_rate": 9.360444444444444e-05, - "loss": 0.7309, + "loss": 0.8218, "step": 3940 }, { - "epoch": 0.47555983626294246, - "grad_norm": 10.5625, + "epoch": 2.817403708987161, + "grad_norm": 7.96875, "learning_rate": 9.356e-05, - "loss": 0.7837, + "loss": 0.9415, "step": 3950 }, { - "epoch": 0.4767637852155069, - "grad_norm": 6.9375, + "epoch": 2.82453637660485, + "grad_norm": 7.53125, "learning_rate": 9.351555555555555e-05, - "loss": 0.6769, + "loss": 0.9593, "step": 3960 }, { - "epoch": 0.47796773416807126, - "grad_norm": 6.84375, + "epoch": 2.831669044222539, + "grad_norm": 5.96875, "learning_rate": 9.347111111111112e-05, - "loss": 0.642, + "loss": 0.9134, "step": 3970 }, { - "epoch": 0.4791716831206357, - "grad_norm": 9.125, + "epoch": 2.8388017118402282, + "grad_norm": 8.25, "learning_rate": 9.342666666666668e-05, - "loss": 0.6947, + "loss": 0.9339, "step": 3980 }, { - "epoch": 0.4803756320732001, - "grad_norm": 7.4375, + "epoch": 2.8459343794579173, + "grad_norm": 9.625, "learning_rate": 9.338222222222223e-05, - "loss": 0.5902, + "loss": 1.0018, "step": 3990 }, { - "epoch": 0.4815795810257645, - "grad_norm": 8.1875, + "epoch": 2.8530670470756063, + "grad_norm": 7.8125, "learning_rate": 9.333777777777777e-05, - "loss": 0.6075, + "loss": 0.9302, "step": 4000 }, { - "epoch": 0.4815795810257645, - "eval/acc": 34.88372039794922, + "epoch": 2.8530670470756063, + "eval/acc": 46.511627197265625, "step": 4000 }, { - "epoch": 0.4815795810257645, - "eval_loss": 2.6985960006713867, - "eval_runtime": 0.2767, - "eval_samples_per_second": 155.399, - "eval_steps_per_second": 3.614, + "epoch": 2.8530670470756063, + "eval_loss": 1.830549716949463, + "eval_runtime": 0.2202, + "eval_samples_per_second": 195.244, + "eval_steps_per_second": 4.541, "step": 4000 }, { - "epoch": 0.4827835299783289, - "grad_norm": 6.8125, + "epoch": 2.8601997146932954, + "grad_norm": 6.65625, "learning_rate": 9.329333333333334e-05, - "loss": 0.7166, + "loss": 0.9375, "step": 4010 }, { - "epoch": 0.48398747893089333, - "grad_norm": 6.375, + "epoch": 2.8673323823109844, + "grad_norm": 11.875, "learning_rate": 9.32488888888889e-05, - "loss": 0.6136, + "loss": 0.8406, "step": 4020 }, { - "epoch": 0.48519142788345776, - "grad_norm": 6.09375, + "epoch": 2.8744650499286735, + "grad_norm": 8.1875, "learning_rate": 9.320444444444445e-05, - "loss": 0.7948, + "loss": 0.8863, "step": 4030 }, { - "epoch": 0.48639537683602213, - "grad_norm": 7.5625, + "epoch": 2.881597717546362, + "grad_norm": 6.9375, "learning_rate": 9.316000000000001e-05, - "loss": 0.7253, + "loss": 0.9546, "step": 4040 }, { - "epoch": 0.48759932578858656, - "grad_norm": 7.1875, + "epoch": 2.8887303851640516, + "grad_norm": 8.625, "learning_rate": 9.311555555555556e-05, - "loss": 0.7386, + "loss": 1.0175, "step": 4050 }, { - "epoch": 0.488803274741151, - "grad_norm": 7.71875, + "epoch": 2.89586305278174, + "grad_norm": 45.0, "learning_rate": 9.307111111111112e-05, - "loss": 0.7222, + "loss": 0.9058, "step": 4060 }, { - "epoch": 0.4900072236937154, - "grad_norm": 10.8125, + "epoch": 2.9029957203994297, + "grad_norm": 13.625, "learning_rate": 9.302666666666667e-05, - "loss": 0.6298, + "loss": 0.9137, "step": 4070 }, { - "epoch": 0.4912111726462798, - "grad_norm": 14.25, + "epoch": 2.9101283880171183, + "grad_norm": 6.8125, "learning_rate": 9.298222222222223e-05, - "loss": 0.6551, + "loss": 0.8862, "step": 4080 }, { - "epoch": 0.4924151215988442, - "grad_norm": 7.75, + "epoch": 2.9172610556348073, + "grad_norm": 13.8125, "learning_rate": 9.293777777777778e-05, - "loss": 0.7201, + "loss": 0.9152, "step": 4090 }, { - "epoch": 0.49361907055140863, - "grad_norm": 9.0625, + "epoch": 2.9243937232524964, + "grad_norm": 13.3125, "learning_rate": 9.289333333333334e-05, - "loss": 0.708, + "loss": 0.9623, "step": 4100 }, { - "epoch": 0.49361907055140863, - "eval/acc": 34.88372039794922, + "epoch": 2.9243937232524964, + "eval/acc": 46.511627197265625, "step": 4100 }, { - "epoch": 0.49361907055140863, - "eval_loss": 2.7673676013946533, - "eval_runtime": 0.3468, - "eval_samples_per_second": 124.003, - "eval_steps_per_second": 2.884, + "epoch": 2.9243937232524964, + "eval_loss": 1.9800893068313599, + "eval_runtime": 0.2162, + "eval_samples_per_second": 198.931, + "eval_steps_per_second": 4.626, "step": 4100 }, { - "epoch": 0.494823019503973, - "grad_norm": 7.9375, + "epoch": 2.9315263908701854, + "grad_norm": 7.1875, "learning_rate": 9.28488888888889e-05, - "loss": 0.6997, + "loss": 0.9088, "step": 4110 }, { - "epoch": 0.49602696845653743, - "grad_norm": 6.84375, + "epoch": 2.9386590584878745, + "grad_norm": 8.3125, "learning_rate": 9.280444444444445e-05, - "loss": 0.6195, + "loss": 0.9927, "step": 4120 }, { - "epoch": 0.49723091740910186, - "grad_norm": 7.40625, + "epoch": 2.9457917261055635, + "grad_norm": 75.0, "learning_rate": 9.276e-05, - "loss": 0.765, + "loss": 0.912, "step": 4130 }, { - "epoch": 0.4984348663616663, - "grad_norm": 7.8125, + "epoch": 2.9529243937232525, + "grad_norm": 9.125, "learning_rate": 9.271555555555556e-05, - "loss": 0.7097, + "loss": 0.9878, "step": 4140 }, { - "epoch": 0.49963881531423066, - "grad_norm": 7.75, + "epoch": 2.9600570613409416, + "grad_norm": 7.125, "learning_rate": 9.267111111111112e-05, - "loss": 0.7067, + "loss": 0.8785, "step": 4150 }, { - "epoch": 0.5008427642667951, - "grad_norm": 27.875, + "epoch": 2.9671897289586306, + "grad_norm": 8.25, "learning_rate": 9.262666666666667e-05, - "loss": 0.7989, + "loss": 0.9296, "step": 4160 }, { - "epoch": 0.5020467132193595, - "grad_norm": 8.0, + "epoch": 2.9743223965763197, + "grad_norm": 8.75, "learning_rate": 9.258222222222222e-05, - "loss": 0.6744, + "loss": 0.9284, "step": 4170 }, { - "epoch": 0.5032506621719239, - "grad_norm": 7.96875, + "epoch": 2.9814550641940087, + "grad_norm": 8.8125, "learning_rate": 9.253777777777778e-05, - "loss": 0.738, + "loss": 0.9566, "step": 4180 }, { - "epoch": 0.5044546111244883, - "grad_norm": 7.21875, + "epoch": 2.9885877318116973, + "grad_norm": 6.90625, "learning_rate": 9.249333333333334e-05, - "loss": 0.7021, + "loss": 0.8368, "step": 4190 }, { - "epoch": 0.5056585600770528, - "grad_norm": 9.6875, + "epoch": 2.995720399429387, + "grad_norm": 9.875, "learning_rate": 9.244888888888889e-05, - "loss": 0.7133, + "loss": 1.0306, "step": 4200 }, { - "epoch": 0.5056585600770528, - "eval/acc": 32.55813980102539, + "epoch": 2.995720399429387, + "eval/acc": 51.16279220581055, "step": 4200 }, { - "epoch": 0.5056585600770528, - "eval_loss": 2.7288577556610107, - "eval_runtime": 0.2266, - "eval_samples_per_second": 189.803, - "eval_steps_per_second": 4.414, + "epoch": 2.995720399429387, + "eval_loss": 1.8740426301956177, + "eval_runtime": 0.2157, + "eval_samples_per_second": 199.368, + "eval_steps_per_second": 4.636, "step": 4200 }, { - "epoch": 0.5068625090296172, - "grad_norm": 10.5, + "epoch": 3.0028530670470754, + "grad_norm": 9.5625, "learning_rate": 9.240444444444445e-05, - "loss": 0.6886, + "loss": 0.957, "step": 4210 }, { - "epoch": 0.5080664579821815, - "grad_norm": 9.0625, + "epoch": 3.0099857346647645, + "grad_norm": 13.5625, "learning_rate": 9.236e-05, - "loss": 0.7944, + "loss": 0.884, "step": 4220 }, { - "epoch": 0.509270406934746, - "grad_norm": 7.78125, + "epoch": 3.0171184022824535, + "grad_norm": 8.0625, "learning_rate": 9.231555555555555e-05, - "loss": 0.7869, + "loss": 0.9064, "step": 4230 }, { - "epoch": 0.5104743558873104, - "grad_norm": 6.375, + "epoch": 3.0242510699001426, + "grad_norm": 8.0, "learning_rate": 9.227111111111111e-05, - "loss": 0.6245, + "loss": 0.9164, "step": 4240 }, { - "epoch": 0.5116783048398748, - "grad_norm": 9.9375, + "epoch": 3.0313837375178316, + "grad_norm": 8.4375, "learning_rate": 9.222666666666668e-05, - "loss": 0.7006, + "loss": 0.9787, "step": 4250 }, { - "epoch": 0.5128822537924392, - "grad_norm": 6.1875, + "epoch": 3.0385164051355207, + "grad_norm": 7.6875, "learning_rate": 9.218222222222222e-05, - "loss": 0.7588, + "loss": 0.8852, "step": 4260 }, { - "epoch": 0.5140862027450036, - "grad_norm": 10.6875, + "epoch": 3.0456490727532097, + "grad_norm": 7.15625, "learning_rate": 9.213777777777777e-05, - "loss": 0.737, + "loss": 1.0092, "step": 4270 }, { - "epoch": 0.515290151697568, - "grad_norm": 6.15625, + "epoch": 3.0527817403708988, + "grad_norm": 6.65625, "learning_rate": 9.209333333333335e-05, - "loss": 0.6774, + "loss": 0.9972, "step": 4280 }, { - "epoch": 0.5164941006501325, - "grad_norm": 8.8125, + "epoch": 3.059914407988588, + "grad_norm": 7.25, "learning_rate": 9.20488888888889e-05, - "loss": 0.6972, + "loss": 0.9237, "step": 4290 }, { - "epoch": 0.5176980496026968, - "grad_norm": 6.40625, + "epoch": 3.067047075606277, + "grad_norm": 6.4375, "learning_rate": 9.200444444444445e-05, - "loss": 0.6423, + "loss": 0.9096, "step": 4300 }, { - "epoch": 0.5176980496026968, - "eval/acc": 38.953487396240234, + "epoch": 3.067047075606277, + "eval/acc": 39.53488540649414, "step": 4300 }, { - "epoch": 0.5176980496026968, - "eval_loss": 2.7444300651550293, - "eval_runtime": 0.2708, - "eval_samples_per_second": 158.776, - "eval_steps_per_second": 3.692, + "epoch": 3.067047075606277, + "eval_loss": 2.0920908451080322, + "eval_runtime": 1.7384, + "eval_samples_per_second": 24.736, + "eval_steps_per_second": 0.575, "step": 4300 }, { - "epoch": 0.5189019985552613, - "grad_norm": 6.8125, + "epoch": 3.074179743223966, + "grad_norm": 8.4375, "learning_rate": 9.196000000000001e-05, - "loss": 0.7705, + "loss": 0.9697, "step": 4310 }, { - "epoch": 0.5201059475078257, - "grad_norm": 5.90625, + "epoch": 3.081312410841655, + "grad_norm": 8.4375, "learning_rate": 9.191555555555556e-05, - "loss": 0.7534, + "loss": 0.8379, "step": 4320 }, { - "epoch": 0.52130989646039, - "grad_norm": 9.25, + "epoch": 3.088445078459344, + "grad_norm": 8.125, "learning_rate": 9.187111111111112e-05, - "loss": 0.6586, + "loss": 0.8576, "step": 4330 }, { - "epoch": 0.5225138454129545, - "grad_norm": 7.53125, + "epoch": 3.0955777460770326, + "grad_norm": 10.75, "learning_rate": 9.182666666666667e-05, - "loss": 0.7459, + "loss": 0.9616, "step": 4340 }, { - "epoch": 0.5237177943655189, - "grad_norm": 6.09375, + "epoch": 3.1027104136947217, + "grad_norm": 6.84375, "learning_rate": 9.178222222222223e-05, - "loss": 0.7088, + "loss": 0.7674, "step": 4350 }, { - "epoch": 0.5249217433180833, - "grad_norm": 8.5, + "epoch": 3.1098430813124107, + "grad_norm": 8.375, "learning_rate": 9.173777777777778e-05, - "loss": 0.7313, + "loss": 0.8712, "step": 4360 }, { - "epoch": 0.5261256922706478, - "grad_norm": 8.8125, + "epoch": 3.1169757489300998, + "grad_norm": 8.375, "learning_rate": 9.169333333333334e-05, - "loss": 0.7364, + "loss": 0.8599, "step": 4370 }, { - "epoch": 0.5273296412232121, - "grad_norm": 7.09375, + "epoch": 3.124108416547789, + "grad_norm": 7.1875, "learning_rate": 9.16488888888889e-05, - "loss": 0.6962, + "loss": 0.9736, "step": 4380 }, { - "epoch": 0.5285335901757765, - "grad_norm": 6.28125, + "epoch": 3.131241084165478, + "grad_norm": 7.75, "learning_rate": 9.160444444444445e-05, - "loss": 0.6817, + "loss": 0.8663, "step": 4390 }, { - "epoch": 0.529737539128341, - "grad_norm": 8.25, + "epoch": 3.138373751783167, + "grad_norm": 7.53125, "learning_rate": 9.156e-05, - "loss": 0.6786, + "loss": 0.9221, "step": 4400 }, { - "epoch": 0.529737539128341, - "eval/acc": 34.88372039794922, + "epoch": 3.138373751783167, + "eval/acc": 39.53488540649414, "step": 4400 }, { - "epoch": 0.529737539128341, - "eval_loss": 2.728501081466675, - "eval_runtime": 0.3599, - "eval_samples_per_second": 119.474, - "eval_steps_per_second": 2.778, + "epoch": 3.138373751783167, + "eval_loss": 2.127244710922241, + "eval_runtime": 0.2239, + "eval_samples_per_second": 192.022, + "eval_steps_per_second": 4.466, "step": 4400 }, { - "epoch": 0.5309414880809054, - "grad_norm": 7.59375, + "epoch": 3.145506419400856, + "grad_norm": 8.125, "learning_rate": 9.151555555555556e-05, - "loss": 0.6744, + "loss": 0.9144, "step": 4410 }, { - "epoch": 0.5321454370334697, - "grad_norm": 8.0625, + "epoch": 3.152639087018545, + "grad_norm": 7.46875, "learning_rate": 9.147111111111112e-05, - "loss": 0.8287, + "loss": 0.9445, "step": 4420 }, { - "epoch": 0.5333493859860342, - "grad_norm": 8.1875, + "epoch": 3.159771754636234, + "grad_norm": 6.9375, "learning_rate": 9.142666666666667e-05, - "loss": 0.7069, + "loss": 0.8308, "step": 4430 }, { - "epoch": 0.5345533349385986, - "grad_norm": 8.125, + "epoch": 3.166904422253923, + "grad_norm": 7.53125, "learning_rate": 9.138222222222222e-05, - "loss": 0.662, + "loss": 0.8428, "step": 4440 }, { - "epoch": 0.5357572838911631, - "grad_norm": 7.46875, + "epoch": 3.174037089871612, + "grad_norm": 7.96875, "learning_rate": 9.133777777777778e-05, - "loss": 0.7424, + "loss": 0.9022, "step": 4450 }, { - "epoch": 0.5369612328437274, - "grad_norm": 6.96875, + "epoch": 3.181169757489301, + "grad_norm": 6.875, "learning_rate": 9.129333333333334e-05, - "loss": 0.7308, + "loss": 0.9955, "step": 4460 }, { - "epoch": 0.5381651817962918, - "grad_norm": 8.3125, + "epoch": 3.18830242510699, + "grad_norm": 9.5625, "learning_rate": 9.124888888888889e-05, - "loss": 0.7524, + "loss": 0.9493, "step": 4470 }, { - "epoch": 0.5393691307488563, - "grad_norm": 6.40625, + "epoch": 3.195435092724679, + "grad_norm": 9.0625, "learning_rate": 9.120444444444445e-05, - "loss": 0.7523, + "loss": 0.9608, "step": 4480 }, { - "epoch": 0.5405730797014207, - "grad_norm": 7.65625, + "epoch": 3.202567760342368, + "grad_norm": 8.625, "learning_rate": 9.116e-05, - "loss": 0.647, + "loss": 0.821, "step": 4490 }, { - "epoch": 0.541777028653985, - "grad_norm": 6.875, + "epoch": 3.209700427960057, + "grad_norm": 8.125, "learning_rate": 9.111555555555556e-05, - "loss": 0.6547, + "loss": 0.9175, "step": 4500 }, { - "epoch": 0.541777028653985, - "eval/acc": 37.20930099487305, + "epoch": 3.209700427960057, + "eval/acc": 39.53488540649414, "step": 4500 }, { - "epoch": 0.541777028653985, - "eval_loss": 2.8390543460845947, - "eval_runtime": 0.2096, - "eval_samples_per_second": 205.2, - "eval_steps_per_second": 4.772, + "epoch": 3.209700427960057, + "eval_loss": 2.062082529067993, + "eval_runtime": 0.2236, + "eval_samples_per_second": 192.267, + "eval_steps_per_second": 4.471, "step": 4500 }, { - "epoch": 0.5429809776065495, - "grad_norm": 9.375, + "epoch": 3.216833095577746, + "grad_norm": 8.0625, "learning_rate": 9.107111111111111e-05, - "loss": 0.6773, + "loss": 0.9169, "step": 4510 }, { - "epoch": 0.5441849265591139, - "grad_norm": 10.1875, + "epoch": 3.223965763195435, + "grad_norm": 8.3125, "learning_rate": 9.102666666666667e-05, - "loss": 0.704, + "loss": 0.8001, "step": 4520 }, { - "epoch": 0.5453888755116783, - "grad_norm": 5.0625, + "epoch": 3.231098430813124, + "grad_norm": 7.3125, "learning_rate": 9.098222222222222e-05, - "loss": 0.6303, + "loss": 0.8513, "step": 4530 }, { - "epoch": 0.5465928244642427, - "grad_norm": 8.25, + "epoch": 3.238231098430813, + "grad_norm": 7.625, "learning_rate": 9.093777777777777e-05, - "loss": 0.7469, + "loss": 0.912, "step": 4540 }, { - "epoch": 0.5477967734168071, - "grad_norm": 7.375, + "epoch": 3.245363766048502, + "grad_norm": 6.46875, "learning_rate": 9.089333333333335e-05, - "loss": 0.6995, + "loss": 0.9418, "step": 4550 }, { - "epoch": 0.5490007223693716, - "grad_norm": 7.78125, + "epoch": 3.2524964336661912, + "grad_norm": 11.9375, "learning_rate": 9.08488888888889e-05, - "loss": 0.6965, + "loss": 0.871, "step": 4560 }, { - "epoch": 0.550204671321936, - "grad_norm": 13.625, + "epoch": 3.2596291012838803, + "grad_norm": 7.03125, "learning_rate": 9.080444444444444e-05, - "loss": 0.759, + "loss": 0.8507, "step": 4570 }, { - "epoch": 0.5514086202745003, - "grad_norm": 6.875, + "epoch": 3.2667617689015693, + "grad_norm": 7.21875, "learning_rate": 9.076e-05, - "loss": 0.7284, + "loss": 0.8058, "step": 4580 }, { - "epoch": 0.5526125692270648, - "grad_norm": 5.875, + "epoch": 3.2738944365192584, + "grad_norm": 8.4375, "learning_rate": 9.071555555555557e-05, - "loss": 0.6721, + "loss": 0.7959, "step": 4590 }, { - "epoch": 0.5538165181796292, - "grad_norm": 5.46875, + "epoch": 3.281027104136947, + "grad_norm": 6.375, "learning_rate": 9.067111111111112e-05, - "loss": 0.6522, + "loss": 0.9206, "step": 4600 }, { - "epoch": 0.5538165181796292, - "eval/acc": 39.53488540649414, + "epoch": 3.281027104136947, + "eval/acc": 37.20930099487305, "step": 4600 }, { - "epoch": 0.5538165181796292, - "eval_loss": 2.801618814468384, - "eval_runtime": 0.2155, - "eval_samples_per_second": 199.501, - "eval_steps_per_second": 4.64, + "epoch": 3.281027104136947, + "eval_loss": 2.1272616386413574, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.747, + "eval_steps_per_second": 4.529, "step": 4600 }, { - "epoch": 0.5550204671321936, - "grad_norm": 8.5625, + "epoch": 3.2881597717546365, + "grad_norm": 7.65625, "learning_rate": 9.062666666666666e-05, - "loss": 0.6399, + "loss": 0.8306, "step": 4610 }, { - "epoch": 0.556224416084758, - "grad_norm": 7.40625, + "epoch": 3.295292439372325, + "grad_norm": 6.9375, "learning_rate": 9.058222222222223e-05, - "loss": 0.7303, + "loss": 0.8958, "step": 4620 }, { - "epoch": 0.5574283650373224, - "grad_norm": 6.96875, + "epoch": 3.302425106990014, + "grad_norm": 7.96875, "learning_rate": 9.053777777777777e-05, - "loss": 0.7126, + "loss": 0.8919, "step": 4630 }, { - "epoch": 0.5586323139898868, - "grad_norm": 7.15625, + "epoch": 3.309557774607703, + "grad_norm": 6.9375, "learning_rate": 9.049333333333334e-05, - "loss": 0.702, + "loss": 0.8844, "step": 4640 }, { - "epoch": 0.5598362629424513, - "grad_norm": 6.625, + "epoch": 3.316690442225392, + "grad_norm": 7.21875, "learning_rate": 9.04488888888889e-05, - "loss": 0.6957, + "loss": 0.8335, "step": 4650 }, { - "epoch": 0.5610402118950156, - "grad_norm": 7.90625, + "epoch": 3.3238231098430813, + "grad_norm": 7.6875, "learning_rate": 9.040444444444445e-05, - "loss": 0.703, + "loss": 0.9337, "step": 4660 }, { - "epoch": 0.5622441608475801, - "grad_norm": 7.75, + "epoch": 3.3309557774607703, + "grad_norm": 9.25, "learning_rate": 9.036e-05, - "loss": 0.7195, + "loss": 1.0282, "step": 4670 }, { - "epoch": 0.5634481098001445, - "grad_norm": 6.59375, + "epoch": 3.3380884450784594, + "grad_norm": 7.96875, "learning_rate": 9.031555555555557e-05, - "loss": 0.6445, + "loss": 0.9401, "step": 4680 }, { - "epoch": 0.5646520587527089, - "grad_norm": 25.125, + "epoch": 3.3452211126961484, + "grad_norm": 7.25, "learning_rate": 9.027111111111112e-05, - "loss": 0.699, + "loss": 0.908, "step": 4690 }, { - "epoch": 0.5658560077052733, - "grad_norm": 8.125, + "epoch": 3.3523537803138375, + "grad_norm": 7.8125, "learning_rate": 9.022666666666667e-05, - "loss": 0.716, + "loss": 0.9262, "step": 4700 }, { - "epoch": 0.5658560077052733, - "eval/acc": 34.88372039794922, + "epoch": 3.3523537803138375, + "eval/acc": 37.20930099487305, "step": 4700 }, { - "epoch": 0.5658560077052733, - "eval_loss": 2.777444839477539, - "eval_runtime": 0.218, - "eval_samples_per_second": 197.287, - "eval_steps_per_second": 4.588, + "epoch": 3.3523537803138375, + "eval_loss": 2.07535719871521, + "eval_runtime": 0.2246, + "eval_samples_per_second": 191.478, + "eval_steps_per_second": 4.453, "step": 4700 }, { - "epoch": 0.5670599566578377, - "grad_norm": 7.0, + "epoch": 3.3594864479315265, + "grad_norm": 13.0, "learning_rate": 9.018222222222223e-05, - "loss": 0.693, + "loss": 0.9692, "step": 4710 }, { - "epoch": 0.5682639056104021, - "grad_norm": 8.8125, + "epoch": 3.3666191155492156, + "grad_norm": 5.875, "learning_rate": 9.013777777777779e-05, - "loss": 0.7, + "loss": 0.9071, "step": 4720 }, { - "epoch": 0.5694678545629666, - "grad_norm": 7.0, + "epoch": 3.3737517831669046, + "grad_norm": 7.71875, "learning_rate": 9.009333333333334e-05, - "loss": 0.6616, + "loss": 0.8528, "step": 4730 }, { - "epoch": 0.5706718035155309, - "grad_norm": 7.75, + "epoch": 3.3808844507845937, + "grad_norm": 7.46875, "learning_rate": 9.004888888888889e-05, - "loss": 0.7987, + "loss": 0.9408, "step": 4740 }, { - "epoch": 0.5718757524680953, - "grad_norm": 6.53125, + "epoch": 3.3880171184022823, + "grad_norm": 7.8125, "learning_rate": 9.000444444444445e-05, - "loss": 0.7162, + "loss": 1.0017, "step": 4750 }, { - "epoch": 0.5730797014206598, - "grad_norm": 8.6875, + "epoch": 3.3951497860199713, + "grad_norm": 6.15625, "learning_rate": 8.996e-05, - "loss": 0.673, + "loss": 0.9107, "step": 4760 }, { - "epoch": 0.5742836503732242, - "grad_norm": 6.5625, + "epoch": 3.4022824536376604, + "grad_norm": 6.8125, "learning_rate": 8.991555555555556e-05, - "loss": 0.7389, + "loss": 0.9387, "step": 4770 }, { - "epoch": 0.5754875993257886, - "grad_norm": 7.25, + "epoch": 3.4094151212553494, + "grad_norm": 8.8125, "learning_rate": 8.987111111111112e-05, - "loss": 0.6674, + "loss": 0.9775, "step": 4780 }, { - "epoch": 0.576691548278353, - "grad_norm": 8.8125, + "epoch": 3.4165477888730384, + "grad_norm": 8.375, "learning_rate": 8.982666666666667e-05, - "loss": 0.7464, + "loss": 0.8173, "step": 4790 }, { - "epoch": 0.5778954972309174, - "grad_norm": 7.65625, + "epoch": 3.4236804564907275, + "grad_norm": 11.3125, "learning_rate": 8.978222222222222e-05, - "loss": 0.6979, + "loss": 0.9068, "step": 4800 }, { - "epoch": 0.5778954972309174, - "eval/acc": 37.20930099487305, + "epoch": 3.4236804564907275, + "eval/acc": 39.53488540649414, "step": 4800 }, { - "epoch": 0.5778954972309174, - "eval_loss": 2.7990331649780273, - "eval_runtime": 0.207, - "eval_samples_per_second": 207.72, - "eval_steps_per_second": 4.831, + "epoch": 3.4236804564907275, + "eval_loss": 2.123344898223877, + "eval_runtime": 0.2238, + "eval_samples_per_second": 192.17, + "eval_steps_per_second": 4.469, "step": 4800 }, { - "epoch": 0.5790994461834819, - "grad_norm": 6.90625, + "epoch": 3.4308131241084165, + "grad_norm": 6.65625, "learning_rate": 8.973777777777778e-05, - "loss": 0.7292, + "loss": 0.8262, "step": 4810 }, { - "epoch": 0.5803033951360462, - "grad_norm": 7.34375, + "epoch": 3.4379457917261056, + "grad_norm": 9.125, "learning_rate": 8.969333333333334e-05, - "loss": 0.6484, + "loss": 0.9207, "step": 4820 }, { - "epoch": 0.5815073440886106, - "grad_norm": 7.96875, + "epoch": 3.4450784593437946, + "grad_norm": 6.78125, "learning_rate": 8.964888888888889e-05, - "loss": 0.6246, + "loss": 1.0115, "step": 4830 }, { - "epoch": 0.5827112930411751, - "grad_norm": 5.4375, + "epoch": 3.4522111269614837, + "grad_norm": 7.5625, "learning_rate": 8.960444444444444e-05, - "loss": 0.6978, + "loss": 0.9031, "step": 4840 }, { - "epoch": 0.5839152419937395, - "grad_norm": 7.25, + "epoch": 3.4593437945791727, + "grad_norm": 7.875, "learning_rate": 8.956e-05, - "loss": 0.6848, + "loss": 0.9626, "step": 4850 }, { - "epoch": 0.5851191909463038, - "grad_norm": 8.9375, + "epoch": 3.466476462196862, + "grad_norm": 4.625, "learning_rate": 8.951555555555557e-05, - "loss": 0.7541, + "loss": 0.7793, "step": 4860 }, { - "epoch": 0.5863231398988683, - "grad_norm": 8.6875, + "epoch": 3.473609129814551, + "grad_norm": 7.40625, "learning_rate": 8.947111111111111e-05, - "loss": 0.6872, + "loss": 0.8733, "step": 4870 }, { - "epoch": 0.5875270888514327, - "grad_norm": 6.375, + "epoch": 3.4807417974322394, + "grad_norm": 9.6875, "learning_rate": 8.942666666666668e-05, - "loss": 0.7521, + "loss": 0.8448, "step": 4880 }, { - "epoch": 0.5887310378039972, - "grad_norm": 7.34375, + "epoch": 3.4878744650499285, + "grad_norm": 8.625, "learning_rate": 8.938222222222222e-05, - "loss": 0.6741, + "loss": 0.815, "step": 4890 }, { - "epoch": 0.5899349867565615, - "grad_norm": 9.25, + "epoch": 3.4950071326676175, + "grad_norm": 8.6875, "learning_rate": 8.933777777777779e-05, - "loss": 0.7085, + "loss": 0.7837, "step": 4900 }, { - "epoch": 0.5899349867565615, - "eval/acc": 32.55813980102539, + "epoch": 3.4950071326676175, + "eval/acc": 37.20930099487305, "step": 4900 }, { - "epoch": 0.5899349867565615, - "eval_loss": 2.822793483734131, - "eval_runtime": 0.2077, - "eval_samples_per_second": 206.985, - "eval_steps_per_second": 4.814, + "epoch": 3.4950071326676175, + "eval_loss": 2.080425262451172, + "eval_runtime": 0.2476, + "eval_samples_per_second": 173.654, + "eval_steps_per_second": 4.038, "step": 4900 }, { - "epoch": 0.5911389357091259, - "grad_norm": 6.75, + "epoch": 3.5021398002853066, + "grad_norm": 7.25, "learning_rate": 8.929333333333333e-05, - "loss": 0.6908, + "loss": 0.9082, "step": 4910 }, { - "epoch": 0.5923428846616904, - "grad_norm": 14.3125, + "epoch": 3.5092724679029956, + "grad_norm": 9.0, "learning_rate": 8.92488888888889e-05, - "loss": 0.6954, + "loss": 0.8041, "step": 4920 }, { - "epoch": 0.5935468336142548, - "grad_norm": 5.03125, + "epoch": 3.5164051355206847, + "grad_norm": 7.5625, "learning_rate": 8.920444444444444e-05, - "loss": 0.6255, + "loss": 0.878, "step": 4930 }, { - "epoch": 0.5947507825668191, - "grad_norm": 7.3125, + "epoch": 3.5235378031383737, + "grad_norm": 8.3125, "learning_rate": 8.916e-05, - "loss": 0.6094, + "loss": 0.8609, "step": 4940 }, { - "epoch": 0.5959547315193836, - "grad_norm": 6.875, + "epoch": 3.5306704707560628, + "grad_norm": 6.9375, "learning_rate": 8.911555555555557e-05, - "loss": 0.6488, + "loss": 0.8203, "step": 4950 }, { - "epoch": 0.597158680471948, - "grad_norm": 6.90625, + "epoch": 3.537803138373752, + "grad_norm": 6.4375, "learning_rate": 8.907111111111112e-05, - "loss": 0.6333, + "loss": 0.8976, "step": 4960 }, { - "epoch": 0.5983626294245123, - "grad_norm": 7.0, + "epoch": 3.544935805991441, + "grad_norm": 15.0, "learning_rate": 8.902666666666667e-05, - "loss": 0.6687, + "loss": 0.8585, "step": 4970 }, { - "epoch": 0.5995665783770768, - "grad_norm": 8.9375, + "epoch": 3.55206847360913, + "grad_norm": 6.21875, "learning_rate": 8.898222222222223e-05, - "loss": 0.6762, + "loss": 0.9642, "step": 4980 }, { - "epoch": 0.6007705273296412, - "grad_norm": 7.53125, + "epoch": 3.559201141226819, + "grad_norm": 9.8125, "learning_rate": 8.893777777777779e-05, - "loss": 0.6007, + "loss": 0.9241, "step": 4990 }, { - "epoch": 0.6019744762822057, - "grad_norm": 5.78125, + "epoch": 3.566333808844508, + "grad_norm": 9.25, "learning_rate": 8.889333333333334e-05, - "loss": 0.682, + "loss": 0.7841, "step": 5000 }, { - "epoch": 0.6019744762822057, - "eval/acc": 32.55813980102539, + "epoch": 3.566333808844508, + "eval/acc": 37.20930099487305, "step": 5000 }, { - "epoch": 0.6019744762822057, - "eval_loss": 2.827073097229004, - "eval_runtime": 0.2073, - "eval_samples_per_second": 207.385, - "eval_steps_per_second": 4.823, + "epoch": 3.566333808844508, + "eval_loss": 2.0360865592956543, + "eval_runtime": 0.225, + "eval_samples_per_second": 191.102, + "eval_steps_per_second": 4.444, "step": 5000 }, { - "epoch": 0.60317842523477, - "grad_norm": 8.25, + "epoch": 3.5734664764621966, + "grad_norm": 7.53125, "learning_rate": 8.884888888888889e-05, - "loss": 0.6711, + "loss": 0.8513, "step": 5010 }, { - "epoch": 0.6043823741873344, - "grad_norm": 7.34375, + "epoch": 3.580599144079886, + "grad_norm": 7.3125, "learning_rate": 8.880444444444445e-05, - "loss": 0.6916, + "loss": 0.9502, "step": 5020 }, { - "epoch": 0.6055863231398989, - "grad_norm": 6.6875, + "epoch": 3.5877318116975747, + "grad_norm": 7.375, "learning_rate": 8.876e-05, - "loss": 0.6601, + "loss": 0.9329, "step": 5030 }, { - "epoch": 0.6067902720924633, - "grad_norm": 6.34375, + "epoch": 3.5948644793152638, + "grad_norm": 7.3125, "learning_rate": 8.871555555555556e-05, - "loss": 0.6945, + "loss": 0.8648, "step": 5040 }, { - "epoch": 0.6079942210450276, - "grad_norm": 6.9375, + "epoch": 3.601997146932953, + "grad_norm": 6.5, "learning_rate": 8.867111111111112e-05, - "loss": 0.6492, + "loss": 0.8019, "step": 5050 }, { - "epoch": 0.6091981699975921, - "grad_norm": 7.1875, + "epoch": 3.609129814550642, + "grad_norm": 9.0, "learning_rate": 8.862666666666667e-05, - "loss": 0.5963, + "loss": 0.8829, "step": 5060 }, { - "epoch": 0.6104021189501565, - "grad_norm": 7.1875, + "epoch": 3.616262482168331, + "grad_norm": 6.46875, "learning_rate": 8.858222222222222e-05, - "loss": 0.6715, + "loss": 0.8419, "step": 5070 }, { - "epoch": 0.6116060679027209, - "grad_norm": 9.25, + "epoch": 3.62339514978602, + "grad_norm": 8.9375, "learning_rate": 8.853777777777778e-05, - "loss": 0.7572, + "loss": 0.9345, "step": 5080 }, { - "epoch": 0.6128100168552854, - "grad_norm": 6.3125, + "epoch": 3.630527817403709, + "grad_norm": 7.09375, "learning_rate": 8.849333333333334e-05, - "loss": 0.7521, + "loss": 0.8204, "step": 5090 }, { - "epoch": 0.6140139658078497, - "grad_norm": 6.9375, + "epoch": 3.637660485021398, + "grad_norm": 7.71875, "learning_rate": 8.844888888888889e-05, - "loss": 0.6313, + "loss": 0.9305, "step": 5100 }, { - "epoch": 0.6140139658078497, - "eval/acc": 34.88372039794922, + "epoch": 3.637660485021398, + "eval/acc": 39.53488540649414, "step": 5100 }, { - "epoch": 0.6140139658078497, - "eval_loss": 2.9495913982391357, - "eval_runtime": 0.2063, - "eval_samples_per_second": 208.439, - "eval_steps_per_second": 4.847, + "epoch": 3.637660485021398, + "eval_loss": 2.0034291744232178, + "eval_runtime": 0.2213, + "eval_samples_per_second": 194.329, + "eval_steps_per_second": 4.519, "step": 5100 }, { - "epoch": 0.6152179147604142, - "grad_norm": 9.0, + "epoch": 3.644793152639087, + "grad_norm": 6.09375, "learning_rate": 8.840444444444444e-05, - "loss": 0.7974, + "loss": 0.9168, "step": 5110 }, { - "epoch": 0.6164218637129786, - "grad_norm": 5.46875, + "epoch": 3.651925820256776, + "grad_norm": 8.25, "learning_rate": 8.836000000000001e-05, - "loss": 0.6245, + "loss": 0.8155, "step": 5120 }, { - "epoch": 0.617625812665543, - "grad_norm": 9.4375, + "epoch": 3.659058487874465, + "grad_norm": 7.84375, "learning_rate": 8.831555555555556e-05, - "loss": 0.7513, + "loss": 0.8641, "step": 5130 }, { - "epoch": 0.6188297616181074, - "grad_norm": 8.125, + "epoch": 3.666191155492154, + "grad_norm": 6.5, "learning_rate": 8.827111111111111e-05, - "loss": 0.6427, + "loss": 0.8623, "step": 5140 }, { - "epoch": 0.6200337105706718, - "grad_norm": 5.78125, + "epoch": 3.6733238231098433, + "grad_norm": 21.125, "learning_rate": 8.822666666666667e-05, - "loss": 0.6801, + "loss": 0.8205, "step": 5150 }, { - "epoch": 0.6212376595232362, - "grad_norm": 8.8125, + "epoch": 3.680456490727532, + "grad_norm": 7.28125, "learning_rate": 8.818222222222222e-05, - "loss": 0.5978, + "loss": 0.7993, "step": 5160 }, { - "epoch": 0.6224416084758007, - "grad_norm": 8.0, + "epoch": 3.6875891583452214, + "grad_norm": 36.0, "learning_rate": 8.813777777777778e-05, - "loss": 0.6697, + "loss": 0.9083, "step": 5170 }, { - "epoch": 0.623645557428365, - "grad_norm": 8.1875, + "epoch": 3.69472182596291, + "grad_norm": 8.125, "learning_rate": 8.809333333333333e-05, - "loss": 0.7621, + "loss": 0.9264, "step": 5180 }, { - "epoch": 0.6248495063809294, - "grad_norm": 6.4375, + "epoch": 3.701854493580599, + "grad_norm": 10.75, "learning_rate": 8.80488888888889e-05, - "loss": 0.6934, + "loss": 0.8496, "step": 5190 }, { - "epoch": 0.6260534553334939, - "grad_norm": 7.8125, + "epoch": 3.708987161198288, + "grad_norm": 7.78125, "learning_rate": 8.800444444444444e-05, - "loss": 0.7008, + "loss": 0.8718, "step": 5200 }, { - "epoch": 0.6260534553334939, - "eval/acc": 34.88372039794922, + "epoch": 3.708987161198288, + "eval/acc": 39.53488540649414, "step": 5200 }, { - "epoch": 0.6260534553334939, - "eval_loss": 2.8201522827148438, - "eval_runtime": 0.2114, - "eval_samples_per_second": 203.368, - "eval_steps_per_second": 4.729, + "epoch": 3.708987161198288, + "eval_loss": 2.0305864810943604, + "eval_runtime": 0.222, + "eval_samples_per_second": 193.683, + "eval_steps_per_second": 4.504, "step": 5200 }, { - "epoch": 0.6272574042860583, - "grad_norm": 5.78125, + "epoch": 3.716119828815977, + "grad_norm": 9.3125, "learning_rate": 8.796e-05, - "loss": 0.7211, + "loss": 1.0077, "step": 5210 }, { - "epoch": 0.6284613532386227, - "grad_norm": 6.25, + "epoch": 3.723252496433666, + "grad_norm": 11.4375, "learning_rate": 8.791555555555557e-05, - "loss": 0.654, + "loss": 0.8364, "step": 5220 }, { - "epoch": 0.6296653021911871, - "grad_norm": 9.0625, + "epoch": 3.7303851640513552, + "grad_norm": 15.125, "learning_rate": 8.787111111111112e-05, - "loss": 0.6348, + "loss": 0.8557, "step": 5230 }, { - "epoch": 0.6308692511437515, - "grad_norm": 7.59375, + "epoch": 3.7375178316690443, + "grad_norm": 7.875, "learning_rate": 8.782666666666666e-05, - "loss": 0.6363, + "loss": 0.8674, "step": 5240 }, { - "epoch": 0.632073200096316, - "grad_norm": 6.25, + "epoch": 3.7446504992867333, + "grad_norm": 7.84375, "learning_rate": 8.778222222222223e-05, - "loss": 0.629, + "loss": 0.8788, "step": 5250 }, { - "epoch": 0.6332771490488803, - "grad_norm": 12.375, + "epoch": 3.7517831669044224, + "grad_norm": 7.59375, "learning_rate": 8.773777777777779e-05, - "loss": 0.771, + "loss": 0.8098, "step": 5260 }, { - "epoch": 0.6344810980014447, - "grad_norm": 5.96875, + "epoch": 3.7589158345221114, + "grad_norm": 7.40625, "learning_rate": 8.769333333333334e-05, - "loss": 0.589, + "loss": 0.8895, "step": 5270 }, { - "epoch": 0.6356850469540092, - "grad_norm": 7.1875, + "epoch": 3.7660485021398005, + "grad_norm": 6.78125, "learning_rate": 8.76488888888889e-05, - "loss": 0.5794, + "loss": 0.823, "step": 5280 }, { - "epoch": 0.6368889959065736, - "grad_norm": 7.09375, + "epoch": 3.773181169757489, + "grad_norm": 8.125, "learning_rate": 8.760444444444445e-05, - "loss": 0.6449, + "loss": 0.8418, "step": 5290 }, { - "epoch": 0.6380929448591379, - "grad_norm": 11.1875, + "epoch": 3.7803138373751786, + "grad_norm": 8.4375, "learning_rate": 8.756000000000001e-05, - "loss": 0.6708, + "loss": 0.8202, "step": 5300 }, { - "epoch": 0.6380929448591379, - "eval/acc": 36.627906799316406, + "epoch": 3.7803138373751786, + "eval/acc": 41.86046600341797, "step": 5300 }, { - "epoch": 0.6380929448591379, - "eval_loss": 2.902387857437134, - "eval_runtime": 0.2113, - "eval_samples_per_second": 203.456, - "eval_steps_per_second": 4.732, + "epoch": 3.7803138373751786, + "eval_loss": 2.100001811981201, + "eval_runtime": 0.2237, + "eval_samples_per_second": 192.218, + "eval_steps_per_second": 4.47, "step": 5300 }, { - "epoch": 0.6392968938117024, - "grad_norm": 8.625, + "epoch": 3.787446504992867, + "grad_norm": 7.78125, "learning_rate": 8.751555555555556e-05, - "loss": 0.5895, + "loss": 0.9786, "step": 5310 }, { - "epoch": 0.6405008427642668, - "grad_norm": 8.625, + "epoch": 3.794579172610556, + "grad_norm": 14.125, "learning_rate": 8.747111111111112e-05, - "loss": 0.6012, + "loss": 1.0893, "step": 5320 }, { - "epoch": 0.6417047917168313, - "grad_norm": 5.25, + "epoch": 3.8017118402282453, + "grad_norm": 6.71875, "learning_rate": 8.742666666666667e-05, - "loss": 0.6262, + "loss": 0.8484, "step": 5330 }, { - "epoch": 0.6429087406693956, - "grad_norm": 8.5625, + "epoch": 3.8088445078459343, + "grad_norm": 7.53125, "learning_rate": 8.738222222222222e-05, - "loss": 0.7584, + "loss": 0.922, "step": 5340 }, { - "epoch": 0.64411268962196, - "grad_norm": 7.53125, + "epoch": 3.8159771754636234, + "grad_norm": 6.9375, "learning_rate": 8.733777777777779e-05, - "loss": 0.6793, + "loss": 0.87, "step": 5350 }, { - "epoch": 0.6453166385745245, - "grad_norm": 9.625, + "epoch": 3.8231098430813124, + "grad_norm": 6.75, "learning_rate": 8.729333333333334e-05, - "loss": 0.6166, + "loss": 0.9272, "step": 5360 }, { - "epoch": 0.6465205875270889, - "grad_norm": 7.0625, + "epoch": 3.8302425106990015, + "grad_norm": 6.875, "learning_rate": 8.724888888888889e-05, - "loss": 0.667, + "loss": 0.8358, "step": 5370 }, { - "epoch": 0.6477245364796532, - "grad_norm": 6.90625, + "epoch": 3.8373751783166905, + "grad_norm": 7.53125, "learning_rate": 8.720444444444445e-05, - "loss": 0.6427, + "loss": 0.8764, "step": 5380 }, { - "epoch": 0.6489284854322177, + "epoch": 3.8445078459343796, "grad_norm": 7.96875, "learning_rate": 8.716000000000001e-05, - "loss": 0.7689, + "loss": 0.9348, "step": 5390 }, { - "epoch": 0.6501324343847821, - "grad_norm": 8.9375, + "epoch": 3.8516405135520686, + "grad_norm": 7.5625, "learning_rate": 8.711555555555556e-05, - "loss": 0.6957, + "loss": 0.9033, "step": 5400 }, { - "epoch": 0.6501324343847821, - "eval/acc": 34.88372039794922, + "epoch": 3.8516405135520686, + "eval/acc": 39.53488540649414, "step": 5400 }, { - "epoch": 0.6501324343847821, - "eval_loss": 2.8916988372802734, - "eval_runtime": 0.2068, - "eval_samples_per_second": 207.976, - "eval_steps_per_second": 4.837, + "epoch": 3.8516405135520686, + "eval_loss": 2.0633187294006348, + "eval_runtime": 0.2247, + "eval_samples_per_second": 191.324, + "eval_steps_per_second": 4.449, "step": 5400 }, { - "epoch": 0.6513363833373464, - "grad_norm": 6.34375, + "epoch": 3.8587731811697576, + "grad_norm": 6.90625, "learning_rate": 8.707111111111111e-05, - "loss": 0.6811, + "loss": 0.9344, "step": 5410 }, { - "epoch": 0.6525403322899109, - "grad_norm": 6.71875, + "epoch": 3.8659058487874463, + "grad_norm": 7.5, "learning_rate": 8.702666666666667e-05, - "loss": 0.6849, + "loss": 0.9346, "step": 5420 }, { - "epoch": 0.6537442812424753, - "grad_norm": 6.46875, + "epoch": 3.8730385164051357, + "grad_norm": 7.03125, "learning_rate": 8.698222222222223e-05, - "loss": 0.6134, + "loss": 0.8835, "step": 5430 }, { - "epoch": 0.6549482301950398, - "grad_norm": 10.5, + "epoch": 3.8801711840228243, + "grad_norm": 6.3125, "learning_rate": 8.693777777777778e-05, - "loss": 0.6213, + "loss": 0.8434, "step": 5440 }, { - "epoch": 0.6561521791476042, - "grad_norm": 6.25, + "epoch": 3.8873038516405134, + "grad_norm": 7.03125, "learning_rate": 8.689333333333334e-05, - "loss": 0.6892, + "loss": 0.8555, "step": 5450 }, { - "epoch": 0.6573561281001685, - "grad_norm": 7.0, + "epoch": 3.8944365192582024, + "grad_norm": 8.0, "learning_rate": 8.684888888888889e-05, - "loss": 0.6003, + "loss": 0.9287, "step": 5460 }, { - "epoch": 0.658560077052733, - "grad_norm": 7.46875, + "epoch": 3.9015691868758915, + "grad_norm": 8.1875, "learning_rate": 8.680444444444444e-05, - "loss": 0.726, + "loss": 0.8738, "step": 5470 }, { - "epoch": 0.6597640260052974, - "grad_norm": 6.0, + "epoch": 3.9087018544935805, + "grad_norm": 7.96875, "learning_rate": 8.676e-05, - "loss": 0.7526, + "loss": 0.8189, "step": 5480 }, { - "epoch": 0.6609679749578617, - "grad_norm": 9.875, + "epoch": 3.9158345221112696, + "grad_norm": 10.1875, "learning_rate": 8.671555555555556e-05, - "loss": 0.603, + "loss": 0.8983, "step": 5490 }, { - "epoch": 0.6621719239104262, - "grad_norm": 13.6875, + "epoch": 3.9229671897289586, + "grad_norm": 10.375, "learning_rate": 8.667111111111111e-05, - "loss": 0.6759, + "loss": 0.8083, "step": 5500 }, { - "epoch": 0.6621719239104262, - "eval/acc": 34.88372039794922, + "epoch": 3.9229671897289586, + "eval/acc": 39.53488540649414, "step": 5500 }, { - "epoch": 0.6621719239104262, - "eval_loss": 2.915025234222412, - "eval_runtime": 0.2074, - "eval_samples_per_second": 207.294, - "eval_steps_per_second": 4.821, + "epoch": 3.9229671897289586, + "eval_loss": 2.089243173599243, + "eval_runtime": 0.2203, + "eval_samples_per_second": 195.23, + "eval_steps_per_second": 4.54, "step": 5500 }, { - "epoch": 0.6633758728629906, - "grad_norm": 8.8125, + "epoch": 3.9300998573466477, + "grad_norm": 13.125, "learning_rate": 8.662666666666666e-05, - "loss": 0.6582, + "loss": 0.8747, "step": 5510 }, { - "epoch": 0.664579821815555, - "grad_norm": 7.6875, + "epoch": 3.9372325249643367, + "grad_norm": 8.25, "learning_rate": 8.658222222222224e-05, - "loss": 0.6219, + "loss": 0.8609, "step": 5520 }, { - "epoch": 0.6657837707681195, - "grad_norm": 9.25, + "epoch": 3.944365192582026, + "grad_norm": 6.75, "learning_rate": 8.653777777777779e-05, - "loss": 0.742, + "loss": 0.8563, "step": 5530 }, { - "epoch": 0.6669877197206838, - "grad_norm": 6.59375, + "epoch": 3.951497860199715, + "grad_norm": 7.75, "learning_rate": 8.649333333333333e-05, - "loss": 0.653, + "loss": 0.8912, "step": 5540 }, { - "epoch": 0.6681916686732483, - "grad_norm": 9.25, + "epoch": 3.9586305278174034, + "grad_norm": 6.40625, "learning_rate": 8.64488888888889e-05, - "loss": 0.67, + "loss": 0.7477, "step": 5550 }, { - "epoch": 0.6693956176258127, - "grad_norm": 7.59375, + "epoch": 3.965763195435093, + "grad_norm": 7.0, "learning_rate": 8.640444444444444e-05, - "loss": 0.7448, + "loss": 0.8185, "step": 5560 }, { - "epoch": 0.670599566578377, - "grad_norm": 7.125, + "epoch": 3.9728958630527815, + "grad_norm": 5.6875, "learning_rate": 8.636e-05, - "loss": 0.607, + "loss": 0.9497, "step": 5570 }, { - "epoch": 0.6718035155309415, - "grad_norm": 6.59375, + "epoch": 3.980028530670471, + "grad_norm": 8.0, "learning_rate": 8.631555555555556e-05, - "loss": 0.6398, + "loss": 0.8117, "step": 5580 }, { - "epoch": 0.6730074644835059, - "grad_norm": 6.21875, + "epoch": 3.9871611982881596, + "grad_norm": 6.625, "learning_rate": 8.627111111111112e-05, - "loss": 0.6334, + "loss": 0.8245, "step": 5590 }, { - "epoch": 0.6742114134360703, - "grad_norm": 7.0625, + "epoch": 3.9942938659058487, + "grad_norm": 6.96875, "learning_rate": 8.622666666666667e-05, - "loss": 0.6878, + "loss": 0.902, "step": 5600 }, { - "epoch": 0.6742114134360703, - "eval/acc": 32.55813980102539, + "epoch": 3.9942938659058487, + "eval/acc": 39.53488540649414, "step": 5600 }, { - "epoch": 0.6742114134360703, - "eval_loss": 2.8182010650634766, - "eval_runtime": 0.207, - "eval_samples_per_second": 207.724, - "eval_steps_per_second": 4.831, + "epoch": 3.9942938659058487, + "eval_loss": 2.186225652694702, + "eval_runtime": 0.2194, + "eval_samples_per_second": 196.001, + "eval_steps_per_second": 4.558, "step": 5600 }, { - "epoch": 0.6754153623886348, - "grad_norm": 7.9375, + "epoch": 4.001426533523538, + "grad_norm": 6.78125, "learning_rate": 8.618222222222223e-05, - "loss": 0.6577, + "loss": 0.8757, "step": 5610 }, { - "epoch": 0.6766193113411991, - "grad_norm": 7.34375, + "epoch": 4.008559201141227, + "grad_norm": 11.0625, "learning_rate": 8.613777777777779e-05, - "loss": 0.7787, + "loss": 0.885, "step": 5620 }, { - "epoch": 0.6778232602937635, - "grad_norm": 6.96875, + "epoch": 4.015691868758916, + "grad_norm": 6.4375, "learning_rate": 8.609333333333334e-05, - "loss": 0.7849, + "loss": 0.8611, "step": 5630 }, { - "epoch": 0.679027209246328, - "grad_norm": 16.125, + "epoch": 4.022824536376604, + "grad_norm": 14.8125, "learning_rate": 8.604888888888889e-05, - "loss": 0.8503, + "loss": 0.8262, "step": 5640 }, { - "epoch": 0.6802311581988923, - "grad_norm": 7.625, + "epoch": 4.029957203994294, + "grad_norm": 8.0625, "learning_rate": 8.600444444444445e-05, - "loss": 0.6215, + "loss": 0.7549, "step": 5650 }, { - "epoch": 0.6814351071514568, - "grad_norm": 7.28125, + "epoch": 4.0370898716119825, + "grad_norm": 6.84375, "learning_rate": 8.596000000000001e-05, - "loss": 0.6894, + "loss": 0.8725, "step": 5660 }, { - "epoch": 0.6826390561040212, - "grad_norm": 6.28125, + "epoch": 4.044222539229672, + "grad_norm": 8.0, "learning_rate": 8.591555555555556e-05, - "loss": 0.616, + "loss": 0.8846, "step": 5670 }, { - "epoch": 0.6838430050565856, - "grad_norm": 6.125, + "epoch": 4.051355206847361, + "grad_norm": 7.84375, "learning_rate": 8.587111111111111e-05, - "loss": 0.6417, + "loss": 0.9373, "step": 5680 }, { - "epoch": 0.68504695400915, - "grad_norm": 7.78125, + "epoch": 4.05848787446505, + "grad_norm": 6.84375, "learning_rate": 8.582666666666667e-05, - "loss": 0.7842, + "loss": 0.7823, "step": 5690 }, { - "epoch": 0.6862509029617144, - "grad_norm": 9.4375, + "epoch": 4.065620542082739, + "grad_norm": 11.4375, "learning_rate": 8.578222222222223e-05, - "loss": 0.6562, + "loss": 0.9588, "step": 5700 }, { - "epoch": 0.6862509029617144, - "eval/acc": 32.55813980102539, + "epoch": 4.065620542082739, + "eval/acc": 37.20930099487305, "step": 5700 }, { - "epoch": 0.6862509029617144, - "eval_loss": 2.861806869506836, - "eval_runtime": 0.2083, - "eval_samples_per_second": 206.449, - "eval_steps_per_second": 4.801, + "epoch": 4.065620542082739, + "eval_loss": 2.841008424758911, + "eval_runtime": 1.3984, + "eval_samples_per_second": 30.749, + "eval_steps_per_second": 0.715, "step": 5700 }, { - "epoch": 0.6874548519142788, - "grad_norm": 6.46875, + "epoch": 4.072753209700428, + "grad_norm": 5.5625, "learning_rate": 8.573777777777778e-05, - "loss": 0.6165, + "loss": 0.8014, "step": 5710 }, { - "epoch": 0.6886588008668433, - "grad_norm": 7.0625, + "epoch": 4.079885877318117, + "grad_norm": 6.90625, "learning_rate": 8.569333333333334e-05, - "loss": 0.7014, + "loss": 0.818, "step": 5720 }, { - "epoch": 0.6898627498194077, - "grad_norm": 8.0625, + "epoch": 4.087018544935806, + "grad_norm": 8.4375, "learning_rate": 8.564888888888889e-05, - "loss": 0.7459, + "loss": 0.8142, "step": 5730 }, { - "epoch": 0.691066698771972, - "grad_norm": 5.84375, + "epoch": 4.094151212553495, + "grad_norm": 7.75, "learning_rate": 8.560444444444445e-05, - "loss": 0.6708, + "loss": 0.863, "step": 5740 }, { - "epoch": 0.6922706477245365, - "grad_norm": 7.9375, + "epoch": 4.101283880171184, + "grad_norm": 6.90625, "learning_rate": 8.556e-05, - "loss": 0.6487, + "loss": 0.8501, "step": 5750 }, { - "epoch": 0.6934745966771009, - "grad_norm": 8.125, + "epoch": 4.108416547788873, + "grad_norm": 7.15625, "learning_rate": 8.551555555555556e-05, - "loss": 0.6634, + "loss": 0.8293, "step": 5760 }, { - "epoch": 0.6946785456296654, - "grad_norm": 5.0, + "epoch": 4.1155492154065625, + "grad_norm": 8.125, "learning_rate": 8.547111111111111e-05, - "loss": 0.6575, + "loss": 0.8655, "step": 5770 }, { - "epoch": 0.6958824945822297, - "grad_norm": 6.28125, + "epoch": 4.122681883024251, + "grad_norm": 7.75, "learning_rate": 8.542666666666666e-05, - "loss": 0.6661, + "loss": 0.7958, "step": 5780 }, { - "epoch": 0.6970864435347941, - "grad_norm": 6.5, + "epoch": 4.12981455064194, + "grad_norm": 8.3125, "learning_rate": 8.538222222222224e-05, - "loss": 0.6922, + "loss": 0.9186, "step": 5790 }, { - "epoch": 0.6982903924873586, - "grad_norm": 9.0625, + "epoch": 4.136947218259629, + "grad_norm": 7.0625, "learning_rate": 8.533777777777778e-05, - "loss": 0.687, + "loss": 0.9135, "step": 5800 }, { - "epoch": 0.6982903924873586, - "eval/acc": 37.79069900512695, + "epoch": 4.136947218259629, + "eval/acc": 37.20930099487305, "step": 5800 }, { - "epoch": 0.6982903924873586, - "eval_loss": 2.878754138946533, - "eval_runtime": 0.2107, - "eval_samples_per_second": 204.039, - "eval_steps_per_second": 4.745, + "epoch": 4.136947218259629, + "eval_loss": 2.8186914920806885, + "eval_runtime": 0.2153, + "eval_samples_per_second": 199.722, + "eval_steps_per_second": 4.645, "step": 5800 }, { - "epoch": 0.699494341439923, - "grad_norm": 8.875, + "epoch": 4.144079885877318, + "grad_norm": 8.125, "learning_rate": 8.529333333333333e-05, - "loss": 0.7106, + "loss": 0.8248, "step": 5810 }, { - "epoch": 0.7006982903924873, - "grad_norm": 8.3125, + "epoch": 4.151212553495007, + "grad_norm": 7.65625, "learning_rate": 8.52488888888889e-05, - "loss": 0.5969, + "loss": 0.9186, "step": 5820 }, { - "epoch": 0.7019022393450518, - "grad_norm": 6.40625, + "epoch": 4.158345221112696, + "grad_norm": 7.6875, "learning_rate": 8.520444444444446e-05, - "loss": 0.6795, + "loss": 0.8367, "step": 5830 }, { - "epoch": 0.7031061882976162, - "grad_norm": 8.5625, + "epoch": 4.165477888730385, + "grad_norm": 9.75, "learning_rate": 8.516e-05, - "loss": 0.7621, + "loss": 0.8898, "step": 5840 }, { - "epoch": 0.7043101372501805, - "grad_norm": 9.5625, + "epoch": 4.172610556348074, + "grad_norm": 8.5625, "learning_rate": 8.511555555555555e-05, - "loss": 0.7035, + "loss": 0.9218, "step": 5850 }, { - "epoch": 0.705514086202745, - "grad_norm": 11.3125, + "epoch": 4.1797432239657635, + "grad_norm": 6.0, "learning_rate": 8.507111111111112e-05, - "loss": 0.8043, + "loss": 0.8784, "step": 5860 }, { - "epoch": 0.7067180351553094, - "grad_norm": 7.4375, + "epoch": 4.186875891583452, + "grad_norm": 8.5625, "learning_rate": 8.502666666666666e-05, - "loss": 0.6349, + "loss": 0.8361, "step": 5870 }, { - "epoch": 0.7079219841078739, - "grad_norm": 6.28125, + "epoch": 4.194008559201142, + "grad_norm": 7.40625, "learning_rate": 8.498222222222223e-05, - "loss": 0.6593, + "loss": 0.816, "step": 5880 }, { - "epoch": 0.7091259330604383, - "grad_norm": 6.4375, + "epoch": 4.20114122681883, + "grad_norm": 7.84375, "learning_rate": 8.493777777777779e-05, - "loss": 0.6236, + "loss": 0.897, "step": 5890 }, { - "epoch": 0.7103298820130026, - "grad_norm": 7.84375, + "epoch": 4.20827389443652, + "grad_norm": 10.0625, "learning_rate": 8.489333333333334e-05, - "loss": 0.6134, + "loss": 0.7807, "step": 5900 }, { - "epoch": 0.7103298820130026, - "eval/acc": 34.88372039794922, + "epoch": 4.20827389443652, + "eval/acc": 37.20930099487305, "step": 5900 }, { - "epoch": 0.7103298820130026, - "eval_loss": 2.918956756591797, - "eval_runtime": 0.2129, - "eval_samples_per_second": 201.945, - "eval_steps_per_second": 4.696, + "epoch": 4.20827389443652, + "eval_loss": 2.890333890914917, + "eval_runtime": 0.2187, + "eval_samples_per_second": 196.595, + "eval_steps_per_second": 4.572, "step": 5900 }, { - "epoch": 0.7115338309655671, - "grad_norm": 7.40625, + "epoch": 4.215406562054208, + "grad_norm": 7.6875, "learning_rate": 8.484888888888888e-05, - "loss": 0.5883, + "loss": 0.8786, "step": 5910 }, { - "epoch": 0.7127377799181315, - "grad_norm": 7.0625, + "epoch": 4.222539229671897, + "grad_norm": 7.46875, "learning_rate": 8.480444444444445e-05, - "loss": 0.6805, + "loss": 0.8689, "step": 5920 }, { - "epoch": 0.7139417288706958, - "grad_norm": 5.25, + "epoch": 4.229671897289586, + "grad_norm": 14.125, "learning_rate": 8.476000000000001e-05, - "loss": 0.5638, + "loss": 0.83, "step": 5930 }, { - "epoch": 0.7151456778232603, - "grad_norm": 5.84375, + "epoch": 4.236804564907275, + "grad_norm": 6.09375, "learning_rate": 8.471555555555556e-05, - "loss": 0.6112, + "loss": 0.8921, "step": 5940 }, { - "epoch": 0.7163496267758247, - "grad_norm": 6.5625, + "epoch": 4.2439372325249645, + "grad_norm": 8.875, "learning_rate": 8.467111111111112e-05, - "loss": 0.6147, + "loss": 0.9293, "step": 5950 }, { - "epoch": 0.7175535757283891, - "grad_norm": 6.15625, + "epoch": 4.251069900142653, + "grad_norm": 10.5625, "learning_rate": 8.462666666666667e-05, - "loss": 0.7292, + "loss": 0.7955, "step": 5960 }, { - "epoch": 0.7187575246809536, - "grad_norm": 8.25, + "epoch": 4.258202567760343, + "grad_norm": 15.25, "learning_rate": 8.458222222222223e-05, - "loss": 0.6048, + "loss": 0.9267, "step": 5970 }, { - "epoch": 0.7199614736335179, - "grad_norm": 8.0625, + "epoch": 4.265335235378031, + "grad_norm": 8.0, "learning_rate": 8.453777777777778e-05, - "loss": 0.581, + "loss": 0.7665, "step": 5980 }, { - "epoch": 0.7211654225860824, - "grad_norm": 7.90625, + "epoch": 4.272467902995721, + "grad_norm": 6.4375, "learning_rate": 8.449333333333334e-05, - "loss": 0.6918, + "loss": 0.8212, "step": 5990 }, { - "epoch": 0.7223693715386468, - "grad_norm": 5.65625, + "epoch": 4.279600570613409, + "grad_norm": 8.0625, "learning_rate": 8.444888888888889e-05, - "loss": 0.6774, + "loss": 0.8294, "step": 6000 }, { - "epoch": 0.7223693715386468, - "eval/acc": 36.627906799316406, + "epoch": 4.279600570613409, + "eval/acc": 34.88372039794922, "step": 6000 }, { - "epoch": 0.7223693715386468, - "eval_loss": 2.936192512512207, - "eval_runtime": 0.2113, - "eval_samples_per_second": 203.531, - "eval_steps_per_second": 4.733, + "epoch": 4.279600570613409, + "eval_loss": 2.8812708854675293, + "eval_runtime": 0.2262, + "eval_samples_per_second": 190.082, + "eval_steps_per_second": 4.421, "step": 6000 }, { - "epoch": 0.7235733204912111, - "grad_norm": 7.59375, + "epoch": 4.286733238231099, + "grad_norm": 5.625, "learning_rate": 8.440444444444445e-05, - "loss": 0.5982, + "loss": 0.8813, "step": 6010 }, { - "epoch": 0.7247772694437756, - "grad_norm": 9.0625, + "epoch": 4.293865905848787, + "grad_norm": 8.375, "learning_rate": 8.436000000000001e-05, - "loss": 0.6048, + "loss": 0.8792, "step": 6020 }, { - "epoch": 0.72598121839634, - "grad_norm": 7.46875, + "epoch": 4.300998573466477, + "grad_norm": 9.125, "learning_rate": 8.431555555555556e-05, - "loss": 0.7024, + "loss": 0.9509, "step": 6030 }, { - "epoch": 0.7271851673489044, - "grad_norm": 8.0625, + "epoch": 4.3081312410841655, + "grad_norm": 7.34375, "learning_rate": 8.427111111111111e-05, - "loss": 0.7556, + "loss": 0.9452, "step": 6040 }, { - "epoch": 0.7283891163014689, - "grad_norm": 6.78125, + "epoch": 4.315263908701855, + "grad_norm": 8.25, "learning_rate": 8.422666666666667e-05, - "loss": 0.7187, + "loss": 0.8801, "step": 6050 }, { - "epoch": 0.7295930652540332, - "grad_norm": 6.8125, + "epoch": 4.3223965763195435, + "grad_norm": 6.75, "learning_rate": 8.418222222222223e-05, - "loss": 0.5774, + "loss": 0.805, "step": 6060 }, { - "epoch": 0.7307970142065976, - "grad_norm": 6.9375, + "epoch": 4.329529243937232, + "grad_norm": 8.375, "learning_rate": 8.413777777777778e-05, - "loss": 0.6724, + "loss": 0.8176, "step": 6070 }, { - "epoch": 0.7320009631591621, + "epoch": 4.336661911554922, "grad_norm": 6.1875, "learning_rate": 8.409333333333333e-05, - "loss": 0.6109, + "loss": 0.8662, "step": 6080 }, { - "epoch": 0.7332049121117264, - "grad_norm": 5.84375, + "epoch": 4.34379457917261, + "grad_norm": 6.03125, "learning_rate": 8.404888888888889e-05, - "loss": 0.6251, + "loss": 0.9121, "step": 6090 }, { - "epoch": 0.7344088610642908, - "grad_norm": 6.78125, + "epoch": 4.3509272467903, + "grad_norm": 5.6875, "learning_rate": 8.400444444444445e-05, - "loss": 0.6916, + "loss": 0.8697, "step": 6100 }, { - "epoch": 0.7344088610642908, - "eval/acc": 32.55813980102539, + "epoch": 4.3509272467903, + "eval/acc": 39.53488540649414, "step": 6100 }, { - "epoch": 0.7344088610642908, - "eval_loss": 2.947686195373535, - "eval_runtime": 0.2088, - "eval_samples_per_second": 205.91, - "eval_steps_per_second": 4.789, + "epoch": 4.3509272467903, + "eval_loss": 2.7605249881744385, + "eval_runtime": 0.2226, + "eval_samples_per_second": 193.191, + "eval_steps_per_second": 4.493, "step": 6100 }, { - "epoch": 0.7356128100168553, - "grad_norm": 6.96875, + "epoch": 4.358059914407988, + "grad_norm": 8.125, "learning_rate": 8.396e-05, - "loss": 0.6525, + "loss": 0.783, "step": 6110 }, { - "epoch": 0.7368167589694197, - "grad_norm": 9.625, + "epoch": 4.365192582025678, + "grad_norm": 6.71875, "learning_rate": 8.391555555555556e-05, - "loss": 0.6107, + "loss": 0.7273, "step": 6120 }, { - "epoch": 0.7380207079219842, - "grad_norm": 5.84375, + "epoch": 4.372325249643366, + "grad_norm": 7.625, "learning_rate": 8.387111111111111e-05, - "loss": 0.6339, + "loss": 0.9497, "step": 6130 }, { - "epoch": 0.7392246568745485, - "grad_norm": 8.0, + "epoch": 4.379457917261056, + "grad_norm": 7.625, "learning_rate": 8.382666666666667e-05, - "loss": 0.6243, + "loss": 0.9318, "step": 6140 }, { - "epoch": 0.7404286058271129, - "grad_norm": 7.9375, + "epoch": 4.3865905848787445, + "grad_norm": 7.5625, "learning_rate": 8.378222222222222e-05, - "loss": 0.6644, + "loss": 0.7827, "step": 6150 }, { - "epoch": 0.7416325547796774, + "epoch": 4.393723252496434, "grad_norm": 7.4375, "learning_rate": 8.373777777777779e-05, - "loss": 0.6117, + "loss": 0.8471, "step": 6160 }, { - "epoch": 0.7428365037322417, - "grad_norm": 7.28125, + "epoch": 4.400855920114123, + "grad_norm": 5.59375, "learning_rate": 8.369333333333333e-05, - "loss": 0.6253, + "loss": 0.866, "step": 6170 }, { - "epoch": 0.7440404526848061, - "grad_norm": 6.59375, + "epoch": 4.407988587731811, + "grad_norm": 5.34375, "learning_rate": 8.364888888888888e-05, - "loss": 0.5973, + "loss": 0.8237, "step": 6180 }, { - "epoch": 0.7452444016373706, - "grad_norm": 8.5, + "epoch": 4.415121255349501, + "grad_norm": 9.375, "learning_rate": 8.360444444444446e-05, - "loss": 0.5938, + "loss": 0.896, "step": 6190 }, { - "epoch": 0.746448350589935, - "grad_norm": 6.40625, + "epoch": 4.422253922967189, + "grad_norm": 7.78125, "learning_rate": 8.356e-05, - "loss": 0.7276, + "loss": 0.8402, "step": 6200 }, { - "epoch": 0.746448350589935, - "eval/acc": 34.88372039794922, + "epoch": 4.422253922967189, + "eval/acc": 37.20930099487305, "step": 6200 }, { - "epoch": 0.746448350589935, - "eval_loss": 3.0573887825012207, - "eval_runtime": 0.2067, - "eval_samples_per_second": 208.014, - "eval_steps_per_second": 4.838, + "epoch": 4.422253922967189, + "eval_loss": 2.8444175720214844, + "eval_runtime": 0.2217, + "eval_samples_per_second": 193.997, + "eval_steps_per_second": 4.512, "step": 6200 }, { - "epoch": 0.7476522995424993, - "grad_norm": 6.75, + "epoch": 4.429386590584879, + "grad_norm": 7.625, "learning_rate": 8.351555555555555e-05, - "loss": 0.6518, + "loss": 0.8708, "step": 6210 }, { - "epoch": 0.7488562484950638, - "grad_norm": 6.5, + "epoch": 4.436519258202567, + "grad_norm": 7.28125, "learning_rate": 8.347111111111112e-05, - "loss": 0.5737, + "loss": 0.8505, "step": 6220 }, { - "epoch": 0.7500601974476282, - "grad_norm": 7.96875, + "epoch": 4.443651925820257, + "grad_norm": 7.28125, "learning_rate": 8.342666666666668e-05, - "loss": 0.743, + "loss": 0.878, "step": 6230 }, { - "epoch": 0.7512641464001927, - "grad_norm": 8.375, + "epoch": 4.4507845934379455, + "grad_norm": 8.0, "learning_rate": 8.338222222222223e-05, - "loss": 0.6803, + "loss": 0.7568, "step": 6240 }, { - "epoch": 0.752468095352757, - "grad_norm": 10.9375, + "epoch": 4.457917261055635, + "grad_norm": 7.28125, "learning_rate": 8.333777777777778e-05, - "loss": 0.8047, + "loss": 0.7909, "step": 6250 }, { - "epoch": 0.7536720443053214, - "grad_norm": 6.21875, + "epoch": 4.465049928673324, + "grad_norm": 10.625, "learning_rate": 8.329333333333334e-05, - "loss": 0.5941, + "loss": 0.8732, "step": 6260 }, { - "epoch": 0.7548759932578859, - "grad_norm": 7.0, + "epoch": 4.472182596291013, + "grad_norm": 7.40625, "learning_rate": 8.324888888888889e-05, - "loss": 0.673, + "loss": 0.8827, "step": 6270 }, { - "epoch": 0.7560799422104503, - "grad_norm": 5.6875, + "epoch": 4.479315263908702, + "grad_norm": 11.25, "learning_rate": 8.320444444444445e-05, - "loss": 0.6869, + "loss": 0.7889, "step": 6280 }, { - "epoch": 0.7572838911630146, - "grad_norm": 7.46875, + "epoch": 4.486447931526391, + "grad_norm": 7.59375, "learning_rate": 8.316000000000001e-05, - "loss": 0.7399, + "loss": 0.7808, "step": 6290 }, { - "epoch": 0.7584878401155791, - "grad_norm": 7.21875, + "epoch": 4.49358059914408, + "grad_norm": 5.40625, "learning_rate": 8.311555555555556e-05, - "loss": 0.6582, + "loss": 0.8223, "step": 6300 }, { - "epoch": 0.7584878401155791, - "eval/acc": 34.88372039794922, + "epoch": 4.49358059914408, + "eval/acc": 37.20930099487305, "step": 6300 }, { - "epoch": 0.7584878401155791, - "eval_loss": 2.991325616836548, - "eval_runtime": 0.2058, - "eval_samples_per_second": 208.93, - "eval_steps_per_second": 4.859, + "epoch": 4.49358059914408, + "eval_loss": 2.798743963241577, + "eval_runtime": 0.2178, + "eval_samples_per_second": 197.44, + "eval_steps_per_second": 4.592, "step": 6300 }, { - "epoch": 0.7596917890681435, - "grad_norm": 7.5625, + "epoch": 4.500713266761769, + "grad_norm": 7.9375, "learning_rate": 8.307111111111111e-05, - "loss": 0.6455, + "loss": 0.8588, "step": 6310 }, { - "epoch": 0.7608957380207079, - "grad_norm": 5.0625, + "epoch": 4.507845934379458, + "grad_norm": 8.0625, "learning_rate": 8.302666666666667e-05, - "loss": 0.6269, + "loss": 0.9003, "step": 6320 }, { - "epoch": 0.7620996869732723, - "grad_norm": 7.15625, + "epoch": 4.5149786019971465, + "grad_norm": 7.21875, "learning_rate": 8.298222222222223e-05, - "loss": 0.6453, + "loss": 0.8942, "step": 6330 }, { - "epoch": 0.7633036359258367, - "grad_norm": 6.34375, + "epoch": 4.522111269614836, + "grad_norm": 7.625, "learning_rate": 8.293777777777778e-05, - "loss": 0.6721, + "loss": 0.8622, "step": 6340 }, { - "epoch": 0.7645075848784012, - "grad_norm": 7.59375, + "epoch": 4.529243937232525, + "grad_norm": 5.53125, "learning_rate": 8.289333333333333e-05, - "loss": 0.569, + "loss": 0.8048, "step": 6350 }, { - "epoch": 0.7657115338309656, - "grad_norm": 6.78125, + "epoch": 4.536376604850214, + "grad_norm": 9.125, "learning_rate": 8.28488888888889e-05, - "loss": 0.6221, + "loss": 0.8506, "step": 6360 }, { - "epoch": 0.76691548278353, - "grad_norm": 9.875, + "epoch": 4.543509272467903, + "grad_norm": 6.125, "learning_rate": 8.280444444444445e-05, - "loss": 0.6623, + "loss": 0.7767, "step": 6370 }, { - "epoch": 0.7681194317360944, - "grad_norm": 7.125, + "epoch": 4.550641940085592, + "grad_norm": 6.90625, "learning_rate": 8.276e-05, - "loss": 0.7166, + "loss": 0.9143, "step": 6380 }, { - "epoch": 0.7693233806886588, - "grad_norm": 7.59375, + "epoch": 4.557774607703281, + "grad_norm": 5.84375, "learning_rate": 8.271555555555556e-05, - "loss": 0.6984, + "loss": 0.8641, "step": 6390 }, { - "epoch": 0.7705273296412232, - "grad_norm": 9.4375, + "epoch": 4.56490727532097, + "grad_norm": 6.3125, "learning_rate": 8.267111111111111e-05, - "loss": 0.7095, + "loss": 0.8297, "step": 6400 }, { - "epoch": 0.7705273296412232, - "eval/acc": 34.88372039794922, + "epoch": 4.56490727532097, + "eval/acc": 37.20930099487305, "step": 6400 }, { - "epoch": 0.7705273296412232, - "eval_loss": 3.0461771488189697, - "eval_runtime": 0.208, - "eval_samples_per_second": 206.746, - "eval_steps_per_second": 4.808, + "epoch": 4.56490727532097, + "eval_loss": 2.804457426071167, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.742, + "eval_steps_per_second": 4.529, "step": 6400 }, { - "epoch": 0.7717312785937877, - "grad_norm": 9.375, + "epoch": 4.572039942938659, + "grad_norm": 7.15625, "learning_rate": 8.262666666666667e-05, - "loss": 0.6975, + "loss": 0.7398, "step": 6410 }, { - "epoch": 0.772935227546352, - "grad_norm": 6.75, + "epoch": 4.579172610556348, + "grad_norm": 6.125, "learning_rate": 8.258222222222222e-05, - "loss": 0.5826, + "loss": 0.8443, "step": 6420 }, { - "epoch": 0.7741391764989164, - "grad_norm": 8.25, + "epoch": 4.586305278174037, + "grad_norm": 9.25, "learning_rate": 8.253777777777778e-05, - "loss": 0.6596, + "loss": 0.7983, "step": 6430 }, { - "epoch": 0.7753431254514809, - "grad_norm": 6.375, + "epoch": 4.5934379457917265, + "grad_norm": 7.3125, "learning_rate": 8.249333333333333e-05, - "loss": 0.6624, + "loss": 0.9705, "step": 6440 }, { - "epoch": 0.7765470744040452, - "grad_norm": 7.375, + "epoch": 4.600570613409415, + "grad_norm": 7.34375, "learning_rate": 8.24488888888889e-05, - "loss": 0.6221, + "loss": 1.0079, "step": 6450 }, { - "epoch": 0.7777510233566097, - "grad_norm": 8.125, + "epoch": 4.607703281027105, + "grad_norm": 8.875, "learning_rate": 8.240444444444446e-05, - "loss": 0.6819, + "loss": 0.8982, "step": 6460 }, { - "epoch": 0.7789549723091741, - "grad_norm": 4.375, + "epoch": 4.614835948644793, + "grad_norm": 8.375, "learning_rate": 8.236e-05, - "loss": 0.588, + "loss": 0.8417, "step": 6470 }, { - "epoch": 0.7801589212617385, - "grad_norm": 8.875, + "epoch": 4.621968616262482, + "grad_norm": 7.78125, "learning_rate": 8.231555555555555e-05, - "loss": 0.7451, + "loss": 0.8566, "step": 6480 }, { - "epoch": 0.781362870214303, - "grad_norm": 8.5, + "epoch": 4.629101283880171, + "grad_norm": 6.5625, "learning_rate": 8.227111111111111e-05, - "loss": 0.64, + "loss": 0.8155, "step": 6490 }, { - "epoch": 0.7825668191668673, - "grad_norm": 6.59375, + "epoch": 4.63623395149786, + "grad_norm": 5.875, "learning_rate": 8.222666666666668e-05, - "loss": 0.6879, + "loss": 0.9449, "step": 6500 }, { - "epoch": 0.7825668191668673, - "eval/acc": 32.55813980102539, + "epoch": 4.63623395149786, + "eval/acc": 41.86046600341797, "step": 6500 }, { - "epoch": 0.7825668191668673, - "eval_loss": 2.970376491546631, - "eval_runtime": 0.2075, - "eval_samples_per_second": 207.198, - "eval_steps_per_second": 4.819, + "epoch": 4.63623395149786, + "eval_loss": 2.761596918106079, + "eval_runtime": 0.2144, + "eval_samples_per_second": 200.549, + "eval_steps_per_second": 4.664, "step": 6500 }, { - "epoch": 0.7837707681194317, - "grad_norm": 6.96875, + "epoch": 4.643366619115549, + "grad_norm": 7.5, "learning_rate": 8.218222222222223e-05, - "loss": 0.6584, + "loss": 0.8549, "step": 6510 }, { - "epoch": 0.7849747170719962, - "grad_norm": 7.3125, + "epoch": 4.650499286733238, + "grad_norm": 7.0625, "learning_rate": 8.213777777777777e-05, - "loss": 0.6892, + "loss": 0.8473, "step": 6520 }, { - "epoch": 0.7861786660245605, - "grad_norm": 6.28125, + "epoch": 4.6576319543509275, + "grad_norm": 7.1875, "learning_rate": 8.209333333333334e-05, - "loss": 0.6658, + "loss": 0.8773, "step": 6530 }, { - "epoch": 0.7873826149771249, - "grad_norm": 7.3125, + "epoch": 4.664764621968616, + "grad_norm": 7.25, "learning_rate": 8.20488888888889e-05, - "loss": 0.6379, + "loss": 0.789, "step": 6540 }, { - "epoch": 0.7885865639296894, - "grad_norm": 6.09375, + "epoch": 4.671897289586306, + "grad_norm": 7.34375, "learning_rate": 8.200444444444445e-05, - "loss": 0.5797, + "loss": 0.852, "step": 6550 }, { - "epoch": 0.7897905128822538, - "grad_norm": 7.03125, + "epoch": 4.679029957203994, + "grad_norm": 5.65625, "learning_rate": 8.196000000000001e-05, - "loss": 0.6778, + "loss": 0.8291, "step": 6560 }, { - "epoch": 0.7909944618348183, - "grad_norm": 7.46875, + "epoch": 4.686162624821684, + "grad_norm": 5.5625, "learning_rate": 8.191555555555556e-05, - "loss": 0.669, + "loss": 0.7943, "step": 6570 }, { - "epoch": 0.7921984107873826, - "grad_norm": 7.46875, + "epoch": 4.693295292439372, + "grad_norm": 9.25, "learning_rate": 8.18711111111111e-05, - "loss": 0.7272, + "loss": 0.8418, "step": 6580 }, { - "epoch": 0.793402359739947, - "grad_norm": 6.3125, + "epoch": 4.700427960057061, + "grad_norm": 6.75, "learning_rate": 8.182666666666667e-05, - "loss": 0.5767, + "loss": 0.8661, "step": 6590 }, { - "epoch": 0.7946063086925115, - "grad_norm": 7.28125, + "epoch": 4.70756062767475, + "grad_norm": 7.40625, "learning_rate": 8.178222222222223e-05, - "loss": 0.6776, + "loss": 0.768, "step": 6600 }, { - "epoch": 0.7946063086925115, - "eval/acc": 34.88372039794922, + "epoch": 4.70756062767475, + "eval/acc": 41.86046600341797, "step": 6600 }, { - "epoch": 0.7946063086925115, - "eval_loss": 2.941105842590332, - "eval_runtime": 0.2071, - "eval_samples_per_second": 207.595, - "eval_steps_per_second": 4.828, + "epoch": 4.70756062767475, + "eval_loss": 2.8003947734832764, + "eval_runtime": 0.2209, + "eval_samples_per_second": 194.665, + "eval_steps_per_second": 4.527, "step": 6600 }, { - "epoch": 0.7958102576450758, - "grad_norm": 7.125, + "epoch": 4.71469329529244, + "grad_norm": 7.1875, "learning_rate": 8.173777777777778e-05, - "loss": 0.6368, + "loss": 0.9038, "step": 6610 }, { - "epoch": 0.7970142065976402, - "grad_norm": 6.34375, + "epoch": 4.7218259629101285, + "grad_norm": 6.46875, "learning_rate": 8.169333333333334e-05, - "loss": 0.6504, + "loss": 0.7185, "step": 6620 }, { - "epoch": 0.7982181555502047, - "grad_norm": 5.46875, + "epoch": 4.728958630527817, + "grad_norm": 6.3125, "learning_rate": 8.16488888888889e-05, - "loss": 0.6305, + "loss": 0.9515, "step": 6630 }, { - "epoch": 0.7994221045027691, - "grad_norm": 6.3125, + "epoch": 4.736091298145507, + "grad_norm": 6.46875, "learning_rate": 8.160444444444445e-05, - "loss": 0.6538, + "loss": 0.8127, "step": 6640 }, { - "epoch": 0.8006260534553334, - "grad_norm": 9.0625, + "epoch": 4.743223965763195, + "grad_norm": 6.4375, "learning_rate": 8.156e-05, - "loss": 0.6747, + "loss": 0.8914, "step": 6650 }, { - "epoch": 0.8018300024078979, - "grad_norm": 13.0, + "epoch": 4.750356633380885, + "grad_norm": 6.8125, "learning_rate": 8.151555555555556e-05, - "loss": 0.6412, + "loss": 0.8545, "step": 6660 }, { - "epoch": 0.8030339513604623, - "grad_norm": 7.0, + "epoch": 4.757489300998573, + "grad_norm": 7.21875, "learning_rate": 8.147111111111112e-05, - "loss": 0.6479, + "loss": 0.6783, "step": 6670 }, { - "epoch": 0.8042379003130268, - "grad_norm": 7.375, + "epoch": 4.764621968616263, + "grad_norm": 7.03125, "learning_rate": 8.142666666666667e-05, - "loss": 0.6577, + "loss": 0.9337, "step": 6680 }, { - "epoch": 0.8054418492655911, - "grad_norm": 7.625, + "epoch": 4.771754636233951, + "grad_norm": 10.5625, "learning_rate": 8.138222222222223e-05, - "loss": 0.7217, + "loss": 0.8181, "step": 6690 }, { - "epoch": 0.8066457982181555, - "grad_norm": 5.625, + "epoch": 4.778887303851641, + "grad_norm": 7.375, "learning_rate": 8.133777777777778e-05, - "loss": 0.6363, + "loss": 0.8639, "step": 6700 }, { - "epoch": 0.8066457982181555, - "eval/acc": 34.88372039794922, + "epoch": 4.778887303851641, + "eval/acc": 37.20930099487305, "step": 6700 }, { - "epoch": 0.8066457982181555, - "eval_loss": 2.8945717811584473, - "eval_runtime": 0.2054, - "eval_samples_per_second": 209.381, - "eval_steps_per_second": 4.869, + "epoch": 4.778887303851641, + "eval_loss": 2.8262782096862793, + "eval_runtime": 0.2194, + "eval_samples_per_second": 195.949, + "eval_steps_per_second": 4.557, "step": 6700 }, { - "epoch": 0.80784974717072, - "grad_norm": 8.0625, + "epoch": 4.7860199714693294, + "grad_norm": 10.8125, "learning_rate": 8.129333333333333e-05, - "loss": 0.6784, + "loss": 0.8742, "step": 6710 }, { - "epoch": 0.8090536961232844, - "grad_norm": 9.1875, + "epoch": 4.793152639087019, + "grad_norm": 5.53125, "learning_rate": 8.124888888888889e-05, - "loss": 0.6187, + "loss": 0.7438, "step": 6720 }, { - "epoch": 0.8102576450758487, - "grad_norm": 9.1875, + "epoch": 4.8002853067047075, + "grad_norm": 6.65625, "learning_rate": 8.120444444444445e-05, - "loss": 0.6461, + "loss": 0.7859, "step": 6730 }, { - "epoch": 0.8114615940284132, - "grad_norm": 7.375, + "epoch": 4.807417974322396, + "grad_norm": 6.78125, "learning_rate": 8.116e-05, - "loss": 0.7325, + "loss": 0.8942, "step": 6740 }, { - "epoch": 0.8126655429809776, - "grad_norm": 7.71875, + "epoch": 4.814550641940086, + "grad_norm": 8.4375, "learning_rate": 8.111555555555555e-05, - "loss": 0.6758, + "loss": 0.8483, "step": 6750 }, { - "epoch": 0.813869491933542, - "grad_norm": 10.125, + "epoch": 4.821683309557774, + "grad_norm": 6.40625, "learning_rate": 8.107111111111113e-05, - "loss": 0.6223, + "loss": 0.8284, "step": 6760 }, { - "epoch": 0.8150734408861064, - "grad_norm": 7.90625, + "epoch": 4.828815977175464, + "grad_norm": 6.84375, "learning_rate": 8.102666666666667e-05, - "loss": 0.6115, + "loss": 0.8887, "step": 6770 }, { - "epoch": 0.8162773898386708, - "grad_norm": 5.375, + "epoch": 4.835948644793152, + "grad_norm": 8.875, "learning_rate": 8.098222222222222e-05, - "loss": 0.5747, + "loss": 0.8431, "step": 6780 }, { - "epoch": 0.8174813387912353, - "grad_norm": 7.375, + "epoch": 4.843081312410842, + "grad_norm": 6.90625, "learning_rate": 8.093777777777779e-05, - "loss": 0.618, + "loss": 0.8325, "step": 6790 }, { - "epoch": 0.8186852877437997, - "grad_norm": 7.125, + "epoch": 4.85021398002853, + "grad_norm": 7.0, "learning_rate": 8.089333333333333e-05, - "loss": 0.6603, + "loss": 0.7742, "step": 6800 }, { - "epoch": 0.8186852877437997, - "eval/acc": 34.88372039794922, + "epoch": 4.85021398002853, + "eval/acc": 39.53488540649414, "step": 6800 }, { - "epoch": 0.8186852877437997, - "eval_loss": 2.9451656341552734, - "eval_runtime": 1.2476, - "eval_samples_per_second": 34.466, - "eval_steps_per_second": 0.802, + "epoch": 4.85021398002853, + "eval_loss": 2.7403292655944824, + "eval_runtime": 0.5509, + "eval_samples_per_second": 78.059, + "eval_steps_per_second": 1.815, "step": 6800 }, { - "epoch": 0.819889236696364, - "grad_norm": 6.28125, + "epoch": 4.85734664764622, + "grad_norm": 6.625, "learning_rate": 8.08488888888889e-05, - "loss": 0.5918, + "loss": 0.8418, "step": 6810 }, { - "epoch": 0.8210931856489285, - "grad_norm": 8.6875, + "epoch": 4.8644793152639085, + "grad_norm": 7.65625, "learning_rate": 8.080444444444444e-05, - "loss": 0.5911, + "loss": 0.9022, "step": 6820 }, { - "epoch": 0.8222971346014929, - "grad_norm": 6.75, + "epoch": 4.871611982881598, + "grad_norm": 7.75, "learning_rate": 8.076e-05, - "loss": 0.6648, + "loss": 0.8201, "step": 6830 }, { - "epoch": 0.8235010835540573, - "grad_norm": 6.78125, + "epoch": 4.878744650499287, + "grad_norm": 7.84375, "learning_rate": 8.071555555555555e-05, - "loss": 0.6044, + "loss": 0.8144, "step": 6840 }, { - "epoch": 0.8247050325066217, - "grad_norm": 15.1875, + "epoch": 4.885877318116976, + "grad_norm": 8.3125, "learning_rate": 8.067111111111112e-05, - "loss": 0.6896, + "loss": 0.8821, "step": 6850 }, { - "epoch": 0.8259089814591861, - "grad_norm": 7.6875, + "epoch": 4.893009985734665, + "grad_norm": 9.0, "learning_rate": 8.062666666666668e-05, - "loss": 0.5829, + "loss": 0.8572, "step": 6860 }, { - "epoch": 0.8271129304117505, - "grad_norm": 5.21875, + "epoch": 4.900142653352354, + "grad_norm": 10.0, "learning_rate": 8.058222222222223e-05, - "loss": 0.6934, + "loss": 0.7498, "step": 6870 }, { - "epoch": 0.828316879364315, - "grad_norm": 10.375, + "epoch": 4.907275320970043, + "grad_norm": 6.09375, "learning_rate": 8.053777777777778e-05, - "loss": 0.7309, + "loss": 0.8709, "step": 6880 }, { - "epoch": 0.8295208283168793, - "grad_norm": 8.1875, + "epoch": 4.914407988587731, + "grad_norm": 7.84375, "learning_rate": 8.049333333333334e-05, - "loss": 0.7213, + "loss": 0.8045, "step": 6890 }, { - "epoch": 0.8307247772694438, - "grad_norm": 5.15625, + "epoch": 4.921540656205421, + "grad_norm": 7.0625, "learning_rate": 8.04488888888889e-05, - "loss": 0.6034, + "loss": 0.8919, "step": 6900 }, { - "epoch": 0.8307247772694438, - "eval/acc": 32.55813980102539, + "epoch": 4.921540656205421, + "eval/acc": 34.88372039794922, "step": 6900 }, { - "epoch": 0.8307247772694438, - "eval_loss": 2.8601129055023193, - "eval_runtime": 0.2074, - "eval_samples_per_second": 207.302, - "eval_steps_per_second": 4.821, + "epoch": 4.921540656205421, + "eval_loss": 2.8702921867370605, + "eval_runtime": 0.2215, + "eval_samples_per_second": 194.143, + "eval_steps_per_second": 4.515, "step": 6900 }, { - "epoch": 0.8319287262220082, - "grad_norm": 7.25, + "epoch": 4.9286733238231095, + "grad_norm": 18.125, "learning_rate": 8.040444444444445e-05, - "loss": 0.5585, + "loss": 0.8407, "step": 6910 }, { - "epoch": 0.8331326751745726, - "grad_norm": 5.9375, + "epoch": 4.935805991440799, + "grad_norm": 7.8125, "learning_rate": 8.036e-05, - "loss": 0.7539, + "loss": 0.9023, "step": 6920 }, { - "epoch": 0.834336624127137, - "grad_norm": 8.0, + "epoch": 4.942938659058488, + "grad_norm": 6.53125, "learning_rate": 8.031555555555556e-05, - "loss": 0.6104, + "loss": 0.7747, "step": 6930 }, { - "epoch": 0.8355405730797014, - "grad_norm": 7.4375, + "epoch": 4.950071326676177, + "grad_norm": 7.3125, "learning_rate": 8.027111111111112e-05, - "loss": 0.613, + "loss": 0.7357, "step": 6940 }, { - "epoch": 0.8367445220322658, - "grad_norm": 8.1875, + "epoch": 4.957203994293866, + "grad_norm": 5.71875, "learning_rate": 8.022666666666667e-05, - "loss": 0.6647, + "loss": 0.8914, "step": 6950 }, { - "epoch": 0.8379484709848303, - "grad_norm": 7.4375, + "epoch": 4.964336661911555, + "grad_norm": 7.9375, "learning_rate": 8.018222222222223e-05, - "loss": 0.7037, + "loss": 0.8626, "step": 6960 }, { - "epoch": 0.8391524199373946, - "grad_norm": 7.25, + "epoch": 4.971469329529244, + "grad_norm": 6.9375, "learning_rate": 8.013777777777778e-05, - "loss": 0.5853, + "loss": 0.8388, "step": 6970 }, { - "epoch": 0.840356368889959, - "grad_norm": 8.75, + "epoch": 4.978601997146933, + "grad_norm": 6.5, "learning_rate": 8.009333333333334e-05, - "loss": 0.6264, + "loss": 0.8321, "step": 6980 }, { - "epoch": 0.8415603178425235, - "grad_norm": 8.4375, + "epoch": 4.985734664764622, + "grad_norm": 6.6875, "learning_rate": 8.004888888888889e-05, - "loss": 0.6221, + "loss": 0.8276, "step": 6990 }, { - "epoch": 0.8427642667950879, - "grad_norm": 8.3125, + "epoch": 4.9928673323823105, + "grad_norm": 10.5625, "learning_rate": 8.000444444444445e-05, - "loss": 0.6408, + "loss": 0.8847, "step": 7000 }, { - "epoch": 0.8427642667950879, - "eval/acc": 33.72093200683594, + "epoch": 4.9928673323823105, + "eval/acc": 39.53488540649414, "step": 7000 }, { - "epoch": 0.8427642667950879, - "eval_loss": 2.9269802570343018, - "eval_runtime": 0.2045, - "eval_samples_per_second": 210.301, - "eval_steps_per_second": 4.891, + "epoch": 4.9928673323823105, + "eval_loss": 2.7940218448638916, + "eval_runtime": 0.2239, + "eval_samples_per_second": 192.063, + "eval_steps_per_second": 4.467, "step": 7000 }, { - "epoch": 0.8439682157476524, - "grad_norm": 9.125, + "epoch": 5.0, + "grad_norm": 7.1875, "learning_rate": 7.996e-05, - "loss": 0.6321, + "loss": 0.9472, "step": 7010 }, { - "epoch": 0.8451721647002167, - "grad_norm": 7.125, + "epoch": 5.007132667617689, + "grad_norm": 7.25, "learning_rate": 7.991555555555555e-05, - "loss": 0.5927, + "loss": 0.9009, "step": 7020 }, { - "epoch": 0.8463761136527811, - "grad_norm": 7.65625, + "epoch": 5.014265335235378, + "grad_norm": 7.34375, "learning_rate": 7.987111111111112e-05, - "loss": 0.6574, + "loss": 0.8805, "step": 7030 }, { - "epoch": 0.8475800626053456, - "grad_norm": 7.0, + "epoch": 5.021398002853067, + "grad_norm": 5.78125, "learning_rate": 7.982666666666667e-05, - "loss": 0.7185, + "loss": 0.8475, "step": 7040 }, { - "epoch": 0.84878401155791, - "grad_norm": 7.3125, + "epoch": 5.028530670470756, + "grad_norm": 5.53125, "learning_rate": 7.978222222222222e-05, - "loss": 0.7157, + "loss": 0.7598, "step": 7050 }, { - "epoch": 0.8499879605104743, - "grad_norm": 5.6875, + "epoch": 5.035663338088445, + "grad_norm": 6.25, "learning_rate": 7.973777777777778e-05, - "loss": 0.606, + "loss": 0.8605, "step": 7060 }, { - "epoch": 0.8511919094630388, - "grad_norm": 6.28125, + "epoch": 5.042796005706134, + "grad_norm": 7.46875, "learning_rate": 7.969333333333335e-05, - "loss": 0.6493, + "loss": 0.9293, "step": 7070 }, { - "epoch": 0.8523958584156032, - "grad_norm": 7.8125, + "epoch": 5.049928673323823, + "grad_norm": 5.9375, "learning_rate": 7.96488888888889e-05, - "loss": 0.6123, + "loss": 0.7984, "step": 7080 }, { - "epoch": 0.8535998073681675, + "epoch": 5.057061340941512, "grad_norm": 8.375, "learning_rate": 7.960444444444444e-05, - "loss": 0.6035, + "loss": 0.8222, "step": 7090 }, { - "epoch": 0.854803756320732, - "grad_norm": 7.78125, + "epoch": 5.064194008559201, + "grad_norm": 6.9375, "learning_rate": 7.956e-05, - "loss": 0.5902, + "loss": 0.8535, "step": 7100 }, { - "epoch": 0.854803756320732, - "eval/acc": 37.20930099487305, + "epoch": 5.064194008559201, + "eval/acc": 41.86046600341797, "step": 7100 }, { - "epoch": 0.854803756320732, - "eval_loss": 2.926543712615967, - "eval_runtime": 0.2114, - "eval_samples_per_second": 203.37, - "eval_steps_per_second": 4.73, + "epoch": 5.064194008559201, + "eval_loss": 2.631981134414673, + "eval_runtime": 2.5832, + "eval_samples_per_second": 16.646, + "eval_steps_per_second": 0.387, "step": 7100 }, { - "epoch": 0.8560077052732964, - "grad_norm": 6.0625, + "epoch": 5.0713266761768905, + "grad_norm": 6.5625, "learning_rate": 7.951555555555555e-05, - "loss": 0.6464, + "loss": 0.8668, "step": 7110 }, { - "epoch": 0.8572116542258609, + "epoch": 5.078459343794579, "grad_norm": 9.0, "learning_rate": 7.947111111111111e-05, - "loss": 0.7656, + "loss": 0.8142, "step": 7120 }, { - "epoch": 0.8584156031784252, - "grad_norm": 7.21875, + "epoch": 5.085592011412269, + "grad_norm": 8.3125, "learning_rate": 7.942666666666668e-05, - "loss": 0.5546, + "loss": 0.9271, "step": 7130 }, { - "epoch": 0.8596195521309896, - "grad_norm": 8.5, + "epoch": 5.092724679029957, + "grad_norm": 7.875, "learning_rate": 7.938222222222222e-05, - "loss": 0.6538, + "loss": 0.8213, "step": 7140 }, { - "epoch": 0.8608235010835541, - "grad_norm": 8.0625, + "epoch": 5.099857346647646, + "grad_norm": 6.8125, "learning_rate": 7.933777777777777e-05, - "loss": 0.7057, + "loss": 0.8511, "step": 7150 }, { - "epoch": 0.8620274500361185, - "grad_norm": 7.34375, + "epoch": 5.106990014265335, + "grad_norm": 7.53125, "learning_rate": 7.929333333333334e-05, - "loss": 0.6287, + "loss": 0.8525, "step": 7160 }, { - "epoch": 0.8632313989886828, - "grad_norm": 6.53125, + "epoch": 5.114122681883024, + "grad_norm": 7.21875, "learning_rate": 7.92488888888889e-05, - "loss": 0.6231, + "loss": 0.8554, "step": 7170 }, { - "epoch": 0.8644353479412473, - "grad_norm": 18.5, + "epoch": 5.121255349500713, + "grad_norm": 6.84375, "learning_rate": 7.920444444444445e-05, - "loss": 0.664, + "loss": 0.8128, "step": 7180 }, { - "epoch": 0.8656392968938117, - "grad_norm": 8.875, + "epoch": 5.128388017118402, + "grad_norm": 7.84375, "learning_rate": 7.916e-05, - "loss": 0.6286, + "loss": 0.7726, "step": 7190 }, { - "epoch": 0.8668432458463761, - "grad_norm": 6.0625, + "epoch": 5.1355206847360915, + "grad_norm": 7.78125, "learning_rate": 7.911555555555556e-05, - "loss": 0.6808, + "loss": 0.8902, "step": 7200 }, { - "epoch": 0.8668432458463761, + "epoch": 5.1355206847360915, "eval/acc": 37.20930099487305, "step": 7200 }, { - "epoch": 0.8668432458463761, - "eval_loss": 2.9467363357543945, - "eval_runtime": 0.2052, - "eval_samples_per_second": 209.502, - "eval_steps_per_second": 4.872, + "epoch": 5.1355206847360915, + "eval_loss": 2.5633885860443115, + "eval_runtime": 0.2541, + "eval_samples_per_second": 169.248, + "eval_steps_per_second": 3.936, "step": 7200 }, { - "epoch": 0.8680471947989405, - "grad_norm": 7.9375, + "epoch": 5.14265335235378, + "grad_norm": 6.8125, "learning_rate": 7.907111111111112e-05, - "loss": 0.6626, + "loss": 0.7482, "step": 7210 }, { - "epoch": 0.8692511437515049, - "grad_norm": 7.15625, + "epoch": 5.14978601997147, + "grad_norm": 42.0, "learning_rate": 7.902666666666667e-05, - "loss": 0.7685, + "loss": 0.9007, "step": 7220 }, { - "epoch": 0.8704550927040694, - "grad_norm": 10.3125, + "epoch": 5.156918687589158, + "grad_norm": 6.0625, "learning_rate": 7.898222222222223e-05, - "loss": 0.6848, + "loss": 0.8643, "step": 7230 }, { - "epoch": 0.8716590416566338, - "grad_norm": 7.21875, + "epoch": 5.164051355206848, + "grad_norm": 7.03125, "learning_rate": 7.893777777777778e-05, - "loss": 0.6433, + "loss": 0.8899, "step": 7240 }, { - "epoch": 0.8728629906091981, - "grad_norm": 6.34375, + "epoch": 5.171184022824536, + "grad_norm": 7.53125, "learning_rate": 7.889333333333334e-05, - "loss": 0.6121, + "loss": 0.7462, "step": 7250 }, { - "epoch": 0.8740669395617626, - "grad_norm": 7.40625, + "epoch": 5.178316690442226, + "grad_norm": 7.21875, "learning_rate": 7.884888888888889e-05, - "loss": 0.6391, + "loss": 0.9199, "step": 7260 }, { - "epoch": 0.875270888514327, - "grad_norm": 7.96875, + "epoch": 5.185449358059914, + "grad_norm": 8.1875, "learning_rate": 7.880444444444445e-05, - "loss": 0.638, + "loss": 0.7966, "step": 7270 }, { - "epoch": 0.8764748374668914, - "grad_norm": 6.28125, + "epoch": 5.192582025677604, + "grad_norm": 8.0, "learning_rate": 7.876e-05, - "loss": 0.6214, + "loss": 0.9086, "step": 7280 }, { - "epoch": 0.8776787864194558, - "grad_norm": 9.125, + "epoch": 5.1997146932952925, + "grad_norm": 7.46875, "learning_rate": 7.871555555555556e-05, - "loss": 0.7473, + "loss": 0.9184, "step": 7290 }, { - "epoch": 0.8788827353720202, - "grad_norm": 7.5, + "epoch": 5.206847360912981, + "grad_norm": 7.28125, "learning_rate": 7.867111111111112e-05, - "loss": 0.68, + "loss": 0.742, "step": 7300 }, { - "epoch": 0.8788827353720202, - "eval/acc": 34.88372039794922, + "epoch": 5.206847360912981, + "eval/acc": 39.53488540649414, "step": 7300 }, { - "epoch": 0.8788827353720202, - "eval_loss": 2.999979257583618, - "eval_runtime": 0.2126, - "eval_samples_per_second": 202.25, - "eval_steps_per_second": 4.703, + "epoch": 5.206847360912981, + "eval_loss": 2.5178542137145996, + "eval_runtime": 0.2274, + "eval_samples_per_second": 189.112, + "eval_steps_per_second": 4.398, "step": 7300 }, { - "epoch": 0.8800866843245846, - "grad_norm": 7.03125, + "epoch": 5.2139800285306706, + "grad_norm": 10.4375, "learning_rate": 7.862666666666667e-05, - "loss": 0.4952, + "loss": 0.8737, "step": 7310 }, { - "epoch": 0.8812906332771491, - "grad_norm": 7.65625, + "epoch": 5.221112696148359, + "grad_norm": 6.8125, "learning_rate": 7.858222222222222e-05, - "loss": 0.7879, + "loss": 0.8197, "step": 7320 }, { - "epoch": 0.8824945822297134, - "grad_norm": 7.71875, + "epoch": 5.228245363766049, + "grad_norm": 8.125, "learning_rate": 7.853777777777778e-05, - "loss": 0.6093, + "loss": 0.9561, "step": 7330 }, { - "epoch": 0.8836985311822779, - "grad_norm": 8.125, + "epoch": 5.235378031383737, + "grad_norm": 9.5, "learning_rate": 7.849333333333334e-05, - "loss": 0.6522, + "loss": 0.9066, "step": 7340 }, { - "epoch": 0.8849024801348423, - "grad_norm": 8.9375, + "epoch": 5.242510699001427, + "grad_norm": 6.09375, "learning_rate": 7.844888888888889e-05, - "loss": 0.6861, + "loss": 0.839, "step": 7350 }, { - "epoch": 0.8861064290874067, - "grad_norm": 6.9375, + "epoch": 5.249643366619115, + "grad_norm": 8.0625, "learning_rate": 7.840444444444445e-05, - "loss": 0.6023, + "loss": 0.8996, "step": 7360 }, { - "epoch": 0.8873103780399711, - "grad_norm": 8.1875, + "epoch": 5.256776034236805, + "grad_norm": 6.3125, "learning_rate": 7.836e-05, - "loss": 0.5156, + "loss": 0.8253, "step": 7370 }, { - "epoch": 0.8885143269925355, - "grad_norm": 7.125, + "epoch": 5.263908701854493, + "grad_norm": 6.15625, "learning_rate": 7.831555555555556e-05, - "loss": 0.6841, + "loss": 0.7275, "step": 7380 }, { - "epoch": 0.8897182759450999, - "grad_norm": 8.0625, + "epoch": 5.271041369472183, + "grad_norm": 6.375, "learning_rate": 7.827111111111111e-05, - "loss": 0.5521, + "loss": 0.8548, "step": 7390 }, { - "epoch": 0.8909222248976644, - "grad_norm": 7.03125, + "epoch": 5.2781740370898715, + "grad_norm": 8.0625, "learning_rate": 7.822666666666667e-05, - "loss": 0.7556, + "loss": 0.8754, "step": 7400 }, { - "epoch": 0.8909222248976644, - "eval/acc": 32.55813980102539, + "epoch": 5.2781740370898715, + "eval/acc": 39.53488540649414, "step": 7400 }, { - "epoch": 0.8909222248976644, - "eval_loss": 2.882596015930176, - "eval_runtime": 0.2414, - "eval_samples_per_second": 178.131, - "eval_steps_per_second": 4.143, + "epoch": 5.2781740370898715, + "eval_loss": 2.599212408065796, + "eval_runtime": 0.2355, + "eval_samples_per_second": 182.56, + "eval_steps_per_second": 4.246, "step": 7400 }, { - "epoch": 0.8921261738502287, - "grad_norm": 6.8125, + "epoch": 5.285306704707561, + "grad_norm": 8.875, "learning_rate": 7.818222222222222e-05, - "loss": 0.6252, + "loss": 0.8725, "step": 7410 }, { - "epoch": 0.8933301228027931, - "grad_norm": 6.5, + "epoch": 5.29243937232525, + "grad_norm": 8.0625, "learning_rate": 7.813777777777777e-05, - "loss": 0.6405, + "loss": 0.8689, "step": 7420 }, { - "epoch": 0.8945340717553576, - "grad_norm": 7.25, + "epoch": 5.299572039942939, + "grad_norm": 7.59375, "learning_rate": 7.809333333333335e-05, - "loss": 0.5753, + "loss": 0.7615, "step": 7430 }, { - "epoch": 0.895738020707922, - "grad_norm": 8.4375, + "epoch": 5.306704707560628, + "grad_norm": 6.3125, "learning_rate": 7.80488888888889e-05, - "loss": 0.5782, + "loss": 0.8141, "step": 7440 }, { - "epoch": 0.8969419696604864, - "grad_norm": 7.875, + "epoch": 5.313837375178316, + "grad_norm": 6.84375, "learning_rate": 7.800444444444444e-05, - "loss": 0.6364, + "loss": 0.8328, "step": 7450 }, { - "epoch": 0.8981459186130508, - "grad_norm": 6.15625, + "epoch": 5.320970042796006, + "grad_norm": 7.71875, "learning_rate": 7.796e-05, - "loss": 0.6243, + "loss": 0.8158, "step": 7460 }, { - "epoch": 0.8993498675656152, - "grad_norm": 7.5, + "epoch": 5.328102710413694, + "grad_norm": 7.0625, "learning_rate": 7.791555555555557e-05, - "loss": 0.6401, + "loss": 0.7663, "step": 7470 }, { - "epoch": 0.9005538165181797, - "grad_norm": 6.03125, + "epoch": 5.335235378031384, + "grad_norm": 8.1875, "learning_rate": 7.787111111111112e-05, - "loss": 0.5183, + "loss": 0.7704, "step": 7480 }, { - "epoch": 0.901757765470744, - "grad_norm": 6.5, + "epoch": 5.3423680456490725, + "grad_norm": 8.0, "learning_rate": 7.782666666666666e-05, - "loss": 0.6057, + "loss": 0.8511, "step": 7490 }, { - "epoch": 0.9029617144233084, - "grad_norm": 9.0, + "epoch": 5.349500713266762, + "grad_norm": 5.15625, "learning_rate": 7.778222222222223e-05, - "loss": 0.6341, + "loss": 0.783, "step": 7500 }, { - "epoch": 0.9029617144233084, - "eval/acc": 34.30232620239258, + "epoch": 5.349500713266762, + "eval/acc": 39.53488540649414, "step": 7500 }, { - "epoch": 0.9029617144233084, - "eval_loss": 2.997713804244995, - "eval_runtime": 1.0811, - "eval_samples_per_second": 39.775, - "eval_steps_per_second": 0.925, + "epoch": 5.349500713266762, + "eval_loss": 2.6000046730041504, + "eval_runtime": 0.2277, + "eval_samples_per_second": 188.876, + "eval_steps_per_second": 4.392, "step": 7500 }, { - "epoch": 0.9041656633758729, - "grad_norm": 7.03125, + "epoch": 5.356633380884451, + "grad_norm": 7.6875, "learning_rate": 7.773777777777778e-05, - "loss": 0.6595, + "loss": 0.7674, "step": 7510 }, { - "epoch": 0.9053696123284373, - "grad_norm": 7.84375, + "epoch": 5.36376604850214, + "grad_norm": 6.53125, "learning_rate": 7.769333333333334e-05, - "loss": 0.7769, + "loss": 0.8338, "step": 7520 }, { - "epoch": 0.9065735612810016, - "grad_norm": 6.78125, + "epoch": 5.370898716119829, + "grad_norm": 5.8125, "learning_rate": 7.76488888888889e-05, - "loss": 0.6876, + "loss": 0.8279, "step": 7530 }, { - "epoch": 0.9077775102335661, - "grad_norm": 9.375, + "epoch": 5.378031383737518, + "grad_norm": 7.0625, "learning_rate": 7.760444444444445e-05, - "loss": 0.6271, + "loss": 0.7954, "step": 7540 }, { - "epoch": 0.9089814591861305, - "grad_norm": 6.96875, + "epoch": 5.385164051355207, + "grad_norm": 8.0, "learning_rate": 7.756e-05, - "loss": 0.6117, + "loss": 0.8632, "step": 7550 }, { - "epoch": 0.910185408138695, - "grad_norm": 6.28125, + "epoch": 5.392296718972895, + "grad_norm": 6.84375, "learning_rate": 7.751555555555556e-05, - "loss": 0.6461, + "loss": 0.8191, "step": 7560 }, { - "epoch": 0.9113893570912593, - "grad_norm": 7.96875, + "epoch": 5.399429386590585, + "grad_norm": 7.375, "learning_rate": 7.747111111111112e-05, - "loss": 0.6543, + "loss": 0.708, "step": 7570 }, { - "epoch": 0.9125933060438237, - "grad_norm": 10.0, + "epoch": 5.4065620542082735, + "grad_norm": 7.15625, "learning_rate": 7.742666666666667e-05, - "loss": 0.686, + "loss": 0.6851, "step": 7580 }, { - "epoch": 0.9137972549963882, - "grad_norm": 7.90625, + "epoch": 5.413694721825963, + "grad_norm": 7.25, "learning_rate": 7.738222222222222e-05, - "loss": 0.6634, + "loss": 0.8769, "step": 7590 }, { - "epoch": 0.9150012039489526, - "grad_norm": 11.5625, + "epoch": 5.420827389443652, + "grad_norm": 7.6875, "learning_rate": 7.733777777777779e-05, - "loss": 0.6627, + "loss": 0.8316, "step": 7600 }, { - "epoch": 0.9150012039489526, - "eval/acc": 37.20930099487305, + "epoch": 5.420827389443652, + "eval/acc": 39.53488540649414, "step": 7600 }, { - "epoch": 0.9150012039489526, - "eval_loss": 2.908363103866577, - "eval_runtime": 2.6366, - "eval_samples_per_second": 16.309, - "eval_steps_per_second": 0.379, + "epoch": 5.420827389443652, + "eval_loss": 2.583944797515869, + "eval_runtime": 0.2212, + "eval_samples_per_second": 194.433, + "eval_steps_per_second": 4.522, "step": 7600 }, { - "epoch": 0.9162051529015169, - "grad_norm": 5.65625, + "epoch": 5.427960057061341, + "grad_norm": 7.625, "learning_rate": 7.729333333333334e-05, - "loss": 0.5503, + "loss": 0.8444, "step": 7610 }, { - "epoch": 0.9174091018540814, - "grad_norm": 7.15625, + "epoch": 5.43509272467903, + "grad_norm": 6.6875, "learning_rate": 7.724888888888889e-05, - "loss": 0.5263, + "loss": 0.8101, "step": 7620 }, { - "epoch": 0.9186130508066458, - "grad_norm": 5.96875, + "epoch": 5.442225392296719, + "grad_norm": 6.375, "learning_rate": 7.720444444444445e-05, - "loss": 0.6969, + "loss": 0.8094, "step": 7630 }, { - "epoch": 0.9198169997592102, - "grad_norm": 8.0625, + "epoch": 5.449358059914408, + "grad_norm": 7.09375, "learning_rate": 7.716e-05, - "loss": 0.6371, + "loss": 0.9292, "step": 7640 }, { - "epoch": 0.9210209487117746, - "grad_norm": 7.5625, + "epoch": 5.456490727532097, + "grad_norm": 8.0, "learning_rate": 7.711555555555556e-05, - "loss": 0.6406, + "loss": 0.8544, "step": 7650 }, { - "epoch": 0.922224897664339, - "grad_norm": 10.6875, + "epoch": 5.463623395149786, + "grad_norm": 5.625, "learning_rate": 7.707111111111111e-05, - "loss": 0.7058, + "loss": 0.787, "step": 7660 }, { - "epoch": 0.9234288466169035, - "grad_norm": 12.5625, + "epoch": 5.470756062767475, + "grad_norm": 8.375, "learning_rate": 7.702666666666667e-05, - "loss": 0.7067, + "loss": 0.8763, "step": 7670 }, { - "epoch": 0.9246327955694679, - "grad_norm": 7.21875, + "epoch": 5.477888730385164, + "grad_norm": 12.9375, "learning_rate": 7.698222222222222e-05, - "loss": 0.5543, + "loss": 0.8317, "step": 7680 }, { - "epoch": 0.9258367445220322, - "grad_norm": 10.125, + "epoch": 5.4850213980028535, + "grad_norm": 8.125, "learning_rate": 7.693777777777778e-05, - "loss": 0.6719, + "loss": 0.8156, "step": 7690 }, { - "epoch": 0.9270406934745967, - "grad_norm": 7.03125, + "epoch": 5.492154065620542, + "grad_norm": 6.96875, "learning_rate": 7.689333333333334e-05, - "loss": 0.5764, + "loss": 0.8998, "step": 7700 }, { - "epoch": 0.9270406934745967, - "eval/acc": 35.46511459350586, + "epoch": 5.492154065620542, + "eval/acc": 39.53488540649414, "step": 7700 }, { - "epoch": 0.9270406934745967, - "eval_loss": 2.8986358642578125, - "eval_runtime": 4.4935, - "eval_samples_per_second": 9.569, - "eval_steps_per_second": 0.223, + "epoch": 5.492154065620542, + "eval_loss": 2.6069791316986084, + "eval_runtime": 0.2244, + "eval_samples_per_second": 191.632, + "eval_steps_per_second": 4.457, "step": 7700 }, { - "epoch": 0.9282446424271611, - "grad_norm": 10.25, + "epoch": 5.499286733238231, + "grad_norm": 7.5625, "learning_rate": 7.68488888888889e-05, - "loss": 0.6302, + "loss": 0.7881, "step": 7710 }, { - "epoch": 0.9294485913797255, - "grad_norm": 9.8125, + "epoch": 5.50641940085592, + "grad_norm": 6.65625, "learning_rate": 7.680444444444444e-05, - "loss": 0.6236, + "loss": 0.8379, "step": 7720 }, { - "epoch": 0.93065254033229, - "grad_norm": 6.5, + "epoch": 5.513552068473609, + "grad_norm": 6.34375, "learning_rate": 7.676e-05, - "loss": 0.7159, + "loss": 0.844, "step": 7730 }, { - "epoch": 0.9318564892848543, - "grad_norm": 7.1875, + "epoch": 5.520684736091298, + "grad_norm": 8.3125, "learning_rate": 7.671555555555557e-05, - "loss": 0.6257, + "loss": 0.8762, "step": 7740 }, { - "epoch": 0.9330604382374187, - "grad_norm": 7.3125, + "epoch": 5.527817403708987, + "grad_norm": 7.09375, "learning_rate": 7.667111111111111e-05, - "loss": 0.5247, + "loss": 0.8621, "step": 7750 }, { - "epoch": 0.9342643871899832, - "grad_norm": 5.0, + "epoch": 5.534950071326676, + "grad_norm": 8.5625, "learning_rate": 7.662666666666666e-05, - "loss": 0.5185, + "loss": 1.0092, "step": 7760 }, { - "epoch": 0.9354683361425475, - "grad_norm": 13.375, + "epoch": 5.542082738944365, + "grad_norm": 6.3125, "learning_rate": 7.658222222222222e-05, - "loss": 0.8069, + "loss": 0.8743, "step": 7770 }, { - "epoch": 0.936672285095112, - "grad_norm": 10.3125, + "epoch": 5.5492154065620545, + "grad_norm": 6.0625, "learning_rate": 7.653777777777779e-05, - "loss": 0.6619, + "loss": 0.754, "step": 7780 }, { - "epoch": 0.9378762340476764, - "grad_norm": 7.1875, + "epoch": 5.556348074179743, + "grad_norm": 7.6875, "learning_rate": 7.649333333333334e-05, - "loss": 0.785, + "loss": 0.8504, "step": 7790 }, { - "epoch": 0.9390801830002408, - "grad_norm": 6.0625, + "epoch": 5.563480741797433, + "grad_norm": 8.3125, "learning_rate": 7.64488888888889e-05, - "loss": 0.6064, + "loss": 0.7512, "step": 7800 }, { - "epoch": 0.9390801830002408, - "eval/acc": 40.69767379760742, + "epoch": 5.563480741797433, + "eval/acc": 37.20930099487305, "step": 7800 }, { - "epoch": 0.9390801830002408, - "eval_loss": 2.9625086784362793, - "eval_runtime": 1.0058, - "eval_samples_per_second": 42.753, - "eval_steps_per_second": 0.994, + "epoch": 5.563480741797433, + "eval_loss": 2.610304594039917, + "eval_runtime": 0.2338, + "eval_samples_per_second": 183.899, + "eval_steps_per_second": 4.277, "step": 7800 }, { - "epoch": 0.9402841319528052, - "grad_norm": 7.0, + "epoch": 5.570613409415121, + "grad_norm": 6.71875, "learning_rate": 7.640444444444445e-05, - "loss": 0.5744, + "loss": 0.8204, "step": 7810 }, { - "epoch": 0.9414880809053696, - "grad_norm": 7.78125, + "epoch": 5.57774607703281, + "grad_norm": 6.625, "learning_rate": 7.636e-05, - "loss": 0.6294, + "loss": 0.734, "step": 7820 }, { - "epoch": 0.942692029857934, - "grad_norm": 6.46875, + "epoch": 5.584878744650499, + "grad_norm": 5.65625, "learning_rate": 7.631555555555556e-05, - "loss": 0.7608, + "loss": 0.8047, "step": 7830 }, { - "epoch": 0.9438959788104985, - "grad_norm": 6.71875, + "epoch": 5.592011412268189, + "grad_norm": 6.40625, "learning_rate": 7.627111111111112e-05, - "loss": 0.6084, + "loss": 0.7179, "step": 7840 }, { - "epoch": 0.9450999277630628, - "grad_norm": 7.15625, + "epoch": 5.599144079885877, + "grad_norm": 6.78125, "learning_rate": 7.622666666666667e-05, - "loss": 0.5791, + "loss": 0.849, "step": 7850 }, { - "epoch": 0.9463038767156272, - "grad_norm": 10.1875, + "epoch": 5.606276747503566, + "grad_norm": 8.8125, "learning_rate": 7.618222222222221e-05, - "loss": 0.683, + "loss": 0.8817, "step": 7860 }, { - "epoch": 0.9475078256681917, - "grad_norm": 7.59375, + "epoch": 5.6134094151212555, + "grad_norm": 6.375, "learning_rate": 7.613777777777779e-05, - "loss": 0.6413, + "loss": 0.8812, "step": 7870 }, { - "epoch": 0.9487117746207561, - "grad_norm": 5.71875, + "epoch": 5.620542082738944, + "grad_norm": 13.125, "learning_rate": 7.609333333333334e-05, - "loss": 0.5985, + "loss": 0.8522, "step": 7880 }, { - "epoch": 0.9499157235733204, - "grad_norm": 8.625, + "epoch": 5.627674750356634, + "grad_norm": 7.0625, "learning_rate": 7.604888888888889e-05, - "loss": 0.572, + "loss": 0.731, "step": 7890 }, { - "epoch": 0.9511196725258849, - "grad_norm": 15.75, + "epoch": 5.634807417974322, + "grad_norm": 7.21875, "learning_rate": 7.600444444444445e-05, - "loss": 0.674, + "loss": 0.8841, "step": 7900 }, { - "epoch": 0.9511196725258849, - "eval/acc": 34.88372039794922, + "epoch": 5.634807417974322, + "eval/acc": 39.53488540649414, "step": 7900 }, { - "epoch": 0.9511196725258849, - "eval_loss": 2.9794013500213623, - "eval_runtime": 0.209, - "eval_samples_per_second": 205.705, - "eval_steps_per_second": 4.784, + "epoch": 5.634807417974322, + "eval_loss": 2.6105217933654785, + "eval_runtime": 0.2306, + "eval_samples_per_second": 186.447, + "eval_steps_per_second": 4.336, "step": 7900 }, { - "epoch": 0.9523236214784493, - "grad_norm": 8.25, + "epoch": 5.641940085592012, + "grad_norm": 7.625, "learning_rate": 7.596000000000001e-05, - "loss": 0.7455, + "loss": 0.8654, "step": 7910 }, { - "epoch": 0.9535275704310138, - "grad_norm": 7.40625, + "epoch": 5.6490727532097, + "grad_norm": 26.75, "learning_rate": 7.591555555555556e-05, - "loss": 0.5746, + "loss": 0.8103, "step": 7920 }, { - "epoch": 0.9547315193835781, - "grad_norm": 7.78125, + "epoch": 5.65620542082739, + "grad_norm": 7.375, "learning_rate": 7.587111111111112e-05, - "loss": 0.6232, + "loss": 0.7461, "step": 7930 }, { - "epoch": 0.9559354683361425, - "grad_norm": 10.9375, + "epoch": 5.663338088445078, + "grad_norm": 6.09375, "learning_rate": 7.582666666666667e-05, - "loss": 0.7393, + "loss": 0.9693, "step": 7940 }, { - "epoch": 0.957139417288707, - "grad_norm": 8.6875, + "epoch": 5.670470756062768, + "grad_norm": 7.09375, "learning_rate": 7.578222222222222e-05, - "loss": 0.6138, + "loss": 0.8595, "step": 7950 }, { - "epoch": 0.9583433662412714, - "grad_norm": 7.625, + "epoch": 5.6776034236804565, + "grad_norm": 7.3125, "learning_rate": 7.573777777777778e-05, - "loss": 0.637, + "loss": 0.8541, "step": 7960 }, { - "epoch": 0.9595473151938357, - "grad_norm": 6.90625, + "epoch": 5.684736091298145, + "grad_norm": 7.90625, "learning_rate": 7.569333333333334e-05, - "loss": 0.606, + "loss": 0.8774, "step": 7970 }, { - "epoch": 0.9607512641464002, - "grad_norm": 8.8125, + "epoch": 5.6918687589158345, + "grad_norm": 9.0, "learning_rate": 7.564888888888889e-05, - "loss": 0.7135, + "loss": 0.8823, "step": 7980 }, { - "epoch": 0.9619552130989646, - "grad_norm": 6.84375, + "epoch": 5.699001426533523, + "grad_norm": 6.09375, "learning_rate": 7.560444444444444e-05, - "loss": 0.6138, + "loss": 0.7302, "step": 7990 }, { - "epoch": 0.963159162051529, - "grad_norm": 8.25, + "epoch": 5.706134094151213, + "grad_norm": 7.21875, "learning_rate": 7.556000000000002e-05, - "loss": 0.7128, + "loss": 0.8339, "step": 8000 }, { - "epoch": 0.963159162051529, - "eval/acc": 34.88372039794922, + "epoch": 5.706134094151213, + "eval/acc": 37.20930099487305, "step": 8000 }, { - "epoch": 0.963159162051529, - "eval_loss": 2.9879119396209717, - "eval_runtime": 0.2124, - "eval_samples_per_second": 202.451, - "eval_steps_per_second": 4.708, + "epoch": 5.706134094151213, + "eval_loss": 2.576781988143921, + "eval_runtime": 0.2231, + "eval_samples_per_second": 192.779, + "eval_steps_per_second": 4.483, "step": 8000 }, { - "epoch": 0.9643631110040934, - "grad_norm": 9.875, + "epoch": 5.713266761768901, + "grad_norm": 7.75, "learning_rate": 7.551555555555556e-05, - "loss": 0.5835, + "loss": 0.7642, "step": 8010 }, { - "epoch": 0.9655670599566578, - "grad_norm": 8.8125, + "epoch": 5.720399429386591, + "grad_norm": 7.8125, "learning_rate": 7.547111111111111e-05, - "loss": 0.6138, + "loss": 0.9188, "step": 8020 }, { - "epoch": 0.9667710089092223, - "grad_norm": 8.3125, + "epoch": 5.727532097004279, + "grad_norm": 7.28125, "learning_rate": 7.542666666666667e-05, - "loss": 0.6638, + "loss": 0.8202, "step": 8030 }, { - "epoch": 0.9679749578617867, - "grad_norm": 7.0, + "epoch": 5.734664764621969, + "grad_norm": 9.0, "learning_rate": 7.538222222222222e-05, - "loss": 0.6484, + "loss": 0.8286, "step": 8040 }, { - "epoch": 0.969178906814351, - "grad_norm": 8.25, + "epoch": 5.741797432239657, + "grad_norm": 7.25, "learning_rate": 7.533777777777778e-05, - "loss": 0.6291, + "loss": 0.7856, "step": 8050 }, { - "epoch": 0.9703828557669155, - "grad_norm": 9.75, + "epoch": 5.748930099857347, + "grad_norm": 6.90625, "learning_rate": 7.529333333333333e-05, - "loss": 0.71, + "loss": 0.8832, "step": 8060 }, { - "epoch": 0.9715868047194799, - "grad_norm": 6.375, + "epoch": 5.7560627674750355, + "grad_norm": 6.09375, "learning_rate": 7.52488888888889e-05, - "loss": 0.5791, + "loss": 0.7606, "step": 8070 }, { - "epoch": 0.9727907536720443, - "grad_norm": 7.40625, + "epoch": 5.763195435092725, + "grad_norm": 6.625, "learning_rate": 7.520444444444444e-05, - "loss": 0.6359, + "loss": 0.8706, "step": 8080 }, { - "epoch": 0.9739947026246087, - "grad_norm": 8.125, + "epoch": 5.770328102710414, + "grad_norm": 7.25, "learning_rate": 7.516e-05, - "loss": 0.5274, + "loss": 0.8542, "step": 8090 }, { - "epoch": 0.9751986515771731, - "grad_norm": 8.6875, + "epoch": 5.777460770328103, + "grad_norm": 6.84375, "learning_rate": 7.511555555555557e-05, - "loss": 0.5887, + "loss": 0.7988, "step": 8100 }, { - "epoch": 0.9751986515771731, + "epoch": 5.777460770328103, "eval/acc": 37.20930099487305, "step": 8100 }, { - "epoch": 0.9751986515771731, - "eval_loss": 3.0165836811065674, - "eval_runtime": 0.2158, - "eval_samples_per_second": 199.215, - "eval_steps_per_second": 4.633, + "epoch": 5.777460770328103, + "eval_loss": 2.598762273788452, + "eval_runtime": 0.2251, + "eval_samples_per_second": 191.04, + "eval_steps_per_second": 4.443, "step": 8100 }, { - "epoch": 0.9764026005297375, - "grad_norm": 9.125, + "epoch": 5.784593437945792, + "grad_norm": 8.875, "learning_rate": 7.507111111111112e-05, - "loss": 0.6646, + "loss": 0.8825, "step": 8110 }, { - "epoch": 0.977606549482302, - "grad_norm": 6.4375, + "epoch": 5.79172610556348, + "grad_norm": 7.375, "learning_rate": 7.502666666666666e-05, - "loss": 0.6757, + "loss": 0.8316, "step": 8120 }, { - "epoch": 0.9788104984348663, - "grad_norm": 6.90625, + "epoch": 5.79885877318117, + "grad_norm": 8.125, "learning_rate": 7.498222222222223e-05, - "loss": 0.6722, + "loss": 0.8567, "step": 8130 }, { - "epoch": 0.9800144473874308, - "grad_norm": 5.34375, + "epoch": 5.805991440798858, + "grad_norm": 6.3125, "learning_rate": 7.493777777777779e-05, - "loss": 0.5574, + "loss": 0.8415, "step": 8140 }, { - "epoch": 0.9812183963399952, - "grad_norm": 12.25, + "epoch": 5.813124108416548, + "grad_norm": 8.5, "learning_rate": 7.489333333333334e-05, - "loss": 0.5701, + "loss": 0.8369, "step": 8150 }, { - "epoch": 0.9824223452925596, - "grad_norm": 5.09375, + "epoch": 5.8202567760342365, + "grad_norm": 13.25, "learning_rate": 7.484888888888889e-05, - "loss": 0.7311, + "loss": 0.8692, "step": 8160 }, { - "epoch": 0.983626294245124, - "grad_norm": 9.6875, + "epoch": 5.827389443651926, + "grad_norm": 7.71875, "learning_rate": 7.480444444444445e-05, - "loss": 0.6314, + "loss": 0.8535, "step": 8170 }, { - "epoch": 0.9848302431976884, - "grad_norm": 7.46875, + "epoch": 5.834522111269615, + "grad_norm": 7.6875, "learning_rate": 7.476000000000001e-05, - "loss": 0.6023, + "loss": 0.8701, "step": 8180 }, { - "epoch": 0.9860341921502528, - "grad_norm": 4.53125, + "epoch": 5.841654778887304, + "grad_norm": 5.46875, "learning_rate": 7.471555555555556e-05, - "loss": 0.5998, + "loss": 0.7843, "step": 8190 }, { - "epoch": 0.9872381411028173, - "grad_norm": 7.3125, + "epoch": 5.848787446504993, + "grad_norm": 7.46875, "learning_rate": 7.467111111111112e-05, - "loss": 0.6607, + "loss": 0.7914, "step": 8200 }, { - "epoch": 0.9872381411028173, - "eval/acc": 34.88372039794922, + "epoch": 5.848787446504993, + "eval/acc": 37.20930099487305, "step": 8200 }, { - "epoch": 0.9872381411028173, - "eval_loss": 2.9665606021881104, - "eval_runtime": 0.2143, - "eval_samples_per_second": 200.607, - "eval_steps_per_second": 4.665, + "epoch": 5.848787446504993, + "eval_loss": 2.566337823867798, + "eval_runtime": 0.3566, + "eval_samples_per_second": 120.59, + "eval_steps_per_second": 2.804, "step": 8200 }, { - "epoch": 0.9884420900553816, - "grad_norm": 7.96875, + "epoch": 5.855920114122682, + "grad_norm": 7.03125, "learning_rate": 7.462666666666667e-05, - "loss": 0.7103, + "loss": 0.849, "step": 8210 }, { - "epoch": 0.989646039007946, - "grad_norm": 10.3125, + "epoch": 5.863052781740371, + "grad_norm": 7.5625, "learning_rate": 7.458222222222223e-05, - "loss": 0.5721, + "loss": 0.8066, "step": 8220 }, { - "epoch": 0.9908499879605105, - "grad_norm": 8.5, + "epoch": 5.870185449358059, + "grad_norm": 6.875, "learning_rate": 7.453777777777778e-05, - "loss": 0.7032, + "loss": 0.8556, "step": 8230 }, { - "epoch": 0.9920539369130749, - "grad_norm": 6.21875, + "epoch": 5.877318116975749, + "grad_norm": 8.0, "learning_rate": 7.449333333333334e-05, - "loss": 0.6547, + "loss": 0.9098, "step": 8240 }, { - "epoch": 0.9932578858656393, - "grad_norm": 7.84375, + "epoch": 5.884450784593438, + "grad_norm": 8.375, "learning_rate": 7.444888888888889e-05, - "loss": 0.6587, + "loss": 0.8183, "step": 8250 }, { - "epoch": 0.9944618348182037, - "grad_norm": 6.53125, + "epoch": 5.891583452211127, + "grad_norm": 13.9375, "learning_rate": 7.440444444444444e-05, - "loss": 0.5486, + "loss": 0.8316, "step": 8260 }, { - "epoch": 0.9956657837707681, - "grad_norm": 8.1875, + "epoch": 5.898716119828816, + "grad_norm": 7.25, "learning_rate": 7.436000000000001e-05, - "loss": 0.6284, + "loss": 0.8563, "step": 8270 }, { - "epoch": 0.9968697327233326, - "grad_norm": 7.59375, + "epoch": 5.905848787446505, + "grad_norm": 10.75, "learning_rate": 7.431555555555556e-05, - "loss": 0.7033, + "loss": 0.8473, "step": 8280 }, { - "epoch": 0.9980736816758969, - "grad_norm": 9.0625, + "epoch": 5.912981455064194, + "grad_norm": 14.1875, "learning_rate": 7.427111111111111e-05, - "loss": 0.6621, + "loss": 0.774, "step": 8290 }, { - "epoch": 0.9992776306284613, - "grad_norm": 8.1875, + "epoch": 5.920114122681883, + "grad_norm": 6.8125, "learning_rate": 7.422666666666667e-05, - "loss": 0.6675, + "loss": 0.8783, "step": 8300 }, { - "epoch": 0.9992776306284613, - "eval/acc": 34.30232620239258, + "epoch": 5.920114122681883, + "eval/acc": 34.88372039794922, "step": 8300 }, { - "epoch": 0.9992776306284613, - "eval_loss": 2.9075520038604736, - "eval_runtime": 0.7142, - "eval_samples_per_second": 60.205, - "eval_steps_per_second": 1.4, + "epoch": 5.920114122681883, + "eval_loss": 2.6135735511779785, + "eval_runtime": 0.2367, + "eval_samples_per_second": 181.665, + "eval_steps_per_second": 4.225, "step": 8300 }, { - "epoch": 1.0004815795810258, - "grad_norm": 6.34375, + "epoch": 5.927246790299572, + "grad_norm": 7.5625, "learning_rate": 7.418222222222223e-05, - "loss": 0.6534, + "loss": 0.9057, "step": 8310 }, { - "epoch": 1.0016855285335902, - "grad_norm": 7.5, + "epoch": 5.934379457917261, + "grad_norm": 7.875, "learning_rate": 7.413777777777778e-05, - "loss": 0.5778, + "loss": 0.8854, "step": 8320 }, { - "epoch": 1.0028894774861545, - "grad_norm": 8.25, + "epoch": 5.94151212553495, + "grad_norm": 8.1875, "learning_rate": 7.409333333333333e-05, - "loss": 0.6143, + "loss": 0.8049, "step": 8330 }, { - "epoch": 1.004093426438719, - "grad_norm": 6.40625, + "epoch": 5.948644793152639, + "grad_norm": 6.90625, "learning_rate": 7.404888888888889e-05, - "loss": 0.5399, + "loss": 0.7738, "step": 8340 }, { - "epoch": 1.0052973753912835, - "grad_norm": 8.6875, + "epoch": 5.955777460770328, + "grad_norm": 7.90625, "learning_rate": 7.400444444444444e-05, - "loss": 0.6422, + "loss": 0.8268, "step": 8350 }, { - "epoch": 1.0065013243438479, - "grad_norm": 6.5625, + "epoch": 5.9629101283880175, + "grad_norm": 8.3125, "learning_rate": 7.396e-05, - "loss": 0.5578, + "loss": 0.8336, "step": 8360 }, { - "epoch": 1.0077052732964122, - "grad_norm": 6.15625, + "epoch": 5.970042796005706, + "grad_norm": 7.375, "learning_rate": 7.391555555555557e-05, - "loss": 0.6529, + "loss": 0.8282, "step": 8370 }, { - "epoch": 1.0089092222489766, - "grad_norm": 8.875, + "epoch": 5.977175463623395, + "grad_norm": 6.8125, "learning_rate": 7.387111111111111e-05, - "loss": 0.7195, + "loss": 0.8234, "step": 8380 }, { - "epoch": 1.010113171201541, - "grad_norm": 14.0, + "epoch": 5.984308131241084, + "grad_norm": 7.15625, "learning_rate": 7.382666666666666e-05, - "loss": 0.6301, + "loss": 0.8771, "step": 8390 }, { - "epoch": 1.0113171201541056, - "grad_norm": 7.46875, + "epoch": 5.991440798858774, + "grad_norm": 8.5, "learning_rate": 7.378222222222222e-05, - "loss": 0.6439, + "loss": 0.8572, "step": 8400 }, { - "epoch": 1.0113171201541056, - "eval/acc": 44.1860466003418, + "epoch": 5.991440798858774, + "eval/acc": 34.88372039794922, "step": 8400 }, { - "epoch": 1.0113171201541056, - "eval_loss": 2.8671419620513916, - "eval_runtime": 7.1371, - "eval_samples_per_second": 6.025, - "eval_steps_per_second": 0.14, + "epoch": 5.991440798858774, + "eval_loss": 2.5367989540100098, + "eval_runtime": 0.224, + "eval_samples_per_second": 191.97, + "eval_steps_per_second": 4.464, "step": 8400 }, { - "epoch": 1.01252106910667, - "grad_norm": 11.5625, + "epoch": 5.998573466476462, + "grad_norm": 7.0, "learning_rate": 7.373777777777779e-05, - "loss": 0.618, + "loss": 0.7468, "step": 8410 }, { - "epoch": 1.0137250180592343, - "grad_norm": 6.1875, + "epoch": 6.005706134094151, + "grad_norm": 7.78125, "learning_rate": 7.369333333333333e-05, - "loss": 0.666, + "loss": 0.7882, "step": 8420 }, { - "epoch": 1.0149289670117987, - "grad_norm": 7.34375, + "epoch": 6.01283880171184, + "grad_norm": 9.1875, "learning_rate": 7.364888888888888e-05, - "loss": 0.6237, + "loss": 0.9419, "step": 8430 }, { - "epoch": 1.016132915964363, - "grad_norm": 7.21875, + "epoch": 6.019971469329529, + "grad_norm": 17.625, "learning_rate": 7.360444444444445e-05, - "loss": 0.5974, + "loss": 0.7904, "step": 8440 }, { - "epoch": 1.0173368649169274, - "grad_norm": 8.625, + "epoch": 6.0271041369472185, + "grad_norm": 8.0625, "learning_rate": 7.356000000000001e-05, - "loss": 0.5766, + "loss": 0.8125, "step": 8450 }, { - "epoch": 1.018540813869492, - "grad_norm": 7.71875, + "epoch": 6.034236804564907, + "grad_norm": 7.4375, "learning_rate": 7.351555555555556e-05, - "loss": 0.6754, + "loss": 0.8002, "step": 8460 }, { - "epoch": 1.0197447628220564, - "grad_norm": 6.8125, + "epoch": 6.041369472182597, + "grad_norm": 5.6875, "learning_rate": 7.347111111111112e-05, - "loss": 0.6515, + "loss": 0.7719, "step": 8470 }, { - "epoch": 1.0209487117746208, - "grad_norm": 7.40625, + "epoch": 6.048502139800285, + "grad_norm": 8.9375, "learning_rate": 7.342666666666667e-05, - "loss": 0.6191, + "loss": 0.8122, "step": 8480 }, { - "epoch": 1.0221526607271851, - "grad_norm": 7.34375, + "epoch": 6.055634807417975, + "grad_norm": 9.875, "learning_rate": 7.338222222222223e-05, - "loss": 0.5703, + "loss": 0.8052, "step": 8490 }, { - "epoch": 1.0233566096797495, - "grad_norm": 8.125, + "epoch": 6.062767475035663, + "grad_norm": 9.125, "learning_rate": 7.333777777777778e-05, - "loss": 0.585, + "loss": 0.8171, "step": 8500 }, { - "epoch": 1.0233566096797495, - "eval/acc": 44.1860466003418, + "epoch": 6.062767475035663, + "eval/acc": 46.511627197265625, "step": 8500 }, { - "epoch": 1.0233566096797495, - "eval_loss": 2.8172407150268555, - "eval_runtime": 0.2115, - "eval_samples_per_second": 203.34, - "eval_steps_per_second": 4.729, + "epoch": 6.062767475035663, + "eval_loss": 2.4180805683135986, + "eval_runtime": 1.182, + "eval_samples_per_second": 36.38, + "eval_steps_per_second": 0.846, "step": 8500 }, { - "epoch": 1.0245605586323139, - "grad_norm": 8.375, + "epoch": 6.069900142653353, + "grad_norm": 6.84375, "learning_rate": 7.329333333333334e-05, - "loss": 0.5365, + "loss": 0.9028, "step": 8510 }, { - "epoch": 1.0257645075848785, - "grad_norm": 8.0, + "epoch": 6.077032810271041, + "grad_norm": 23.625, "learning_rate": 7.324888888888889e-05, - "loss": 0.5976, + "loss": 0.8576, "step": 8520 }, { - "epoch": 1.0269684565374428, - "grad_norm": 6.59375, + "epoch": 6.08416547788873, + "grad_norm": 6.96875, "learning_rate": 7.320444444444445e-05, - "loss": 0.6249, + "loss": 0.8407, "step": 8530 }, { - "epoch": 1.0281724054900072, - "grad_norm": 8.625, + "epoch": 6.0912981455064195, + "grad_norm": 8.6875, "learning_rate": 7.316000000000001e-05, - "loss": 0.5953, + "loss": 0.8419, "step": 8540 }, { - "epoch": 1.0293763544425716, - "grad_norm": 4.875, + "epoch": 6.098430813124108, + "grad_norm": 6.90625, "learning_rate": 7.311555555555556e-05, - "loss": 0.5528, + "loss": 0.7802, "step": 8550 }, { - "epoch": 1.030580303395136, - "grad_norm": 5.28125, + "epoch": 6.1055634807417976, + "grad_norm": 6.34375, "learning_rate": 7.307111111111111e-05, - "loss": 0.5181, + "loss": 0.7716, "step": 8560 }, { - "epoch": 1.0317842523477005, - "grad_norm": 9.9375, + "epoch": 6.112696148359486, + "grad_norm": 13.5, "learning_rate": 7.302666666666667e-05, - "loss": 0.5991, + "loss": 0.8538, "step": 8570 }, { - "epoch": 1.032988201300265, - "grad_norm": 5.78125, + "epoch": 6.119828815977176, + "grad_norm": 6.59375, "learning_rate": 7.298222222222223e-05, - "loss": 0.6822, + "loss": 0.6951, "step": 8580 }, { - "epoch": 1.0341921502528293, - "grad_norm": 7.84375, + "epoch": 6.126961483594864, + "grad_norm": 7.0625, "learning_rate": 7.293777777777778e-05, - "loss": 0.671, + "loss": 0.794, "step": 8590 }, { - "epoch": 1.0353960992053937, - "grad_norm": 8.4375, + "epoch": 6.134094151212554, + "grad_norm": 7.15625, "learning_rate": 7.289333333333334e-05, - "loss": 0.6266, + "loss": 0.8058, "step": 8600 }, { - "epoch": 1.0353960992053937, - "eval/acc": 41.27906799316406, + "epoch": 6.134094151212554, + "eval/acc": 46.511627197265625, "step": 8600 }, { - "epoch": 1.0353960992053937, - "eval_loss": 2.89090895652771, - "eval_runtime": 0.2168, - "eval_samples_per_second": 198.358, - "eval_steps_per_second": 4.613, + "epoch": 6.134094151212554, + "eval_loss": 2.5194764137268066, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.736, + "eval_steps_per_second": 4.529, "step": 8600 }, { - "epoch": 1.036600048157958, - "grad_norm": 6.84375, + "epoch": 6.141226818830242, + "grad_norm": 8.0625, "learning_rate": 7.284888888888889e-05, - "loss": 0.5829, + "loss": 0.8754, "step": 8610 }, { - "epoch": 1.0378039971105224, - "grad_norm": 12.6875, + "epoch": 6.148359486447932, + "grad_norm": 4.875, "learning_rate": 7.280444444444445e-05, - "loss": 0.6336, + "loss": 0.7852, "step": 8620 }, { - "epoch": 1.039007946063087, - "grad_norm": 5.78125, + "epoch": 6.1554921540656204, + "grad_norm": 8.0, "learning_rate": 7.276e-05, - "loss": 0.5621, + "loss": 0.8064, "step": 8630 }, { - "epoch": 1.0402118950156514, - "grad_norm": 6.78125, + "epoch": 6.16262482168331, + "grad_norm": 6.3125, "learning_rate": 7.271555555555556e-05, - "loss": 0.5822, + "loss": 0.7643, "step": 8640 }, { - "epoch": 1.0414158439682157, - "grad_norm": 5.40625, + "epoch": 6.1697574893009985, + "grad_norm": 8.875, "learning_rate": 7.267111111111111e-05, - "loss": 0.6402, + "loss": 0.7702, "step": 8650 }, { - "epoch": 1.04261979292078, - "grad_norm": 5.84375, + "epoch": 6.176890156918688, + "grad_norm": 18.5, "learning_rate": 7.262666666666666e-05, - "loss": 0.5793, + "loss": 0.903, "step": 8660 }, { - "epoch": 1.0438237418733445, - "grad_norm": 9.375, + "epoch": 6.184022824536377, + "grad_norm": 9.875, "learning_rate": 7.258222222222224e-05, - "loss": 0.6447, + "loss": 0.788, "step": 8670 }, { - "epoch": 1.045027690825909, - "grad_norm": 8.4375, + "epoch": 6.191155492154065, + "grad_norm": 7.71875, "learning_rate": 7.253777777777778e-05, - "loss": 0.6428, + "loss": 0.7504, "step": 8680 }, { - "epoch": 1.0462316397784734, - "grad_norm": 8.5, + "epoch": 6.198288159771755, + "grad_norm": 7.5, "learning_rate": 7.249333333333333e-05, - "loss": 0.6219, + "loss": 0.8821, "step": 8690 }, { - "epoch": 1.0474355887310378, - "grad_norm": 8.0625, + "epoch": 6.205420827389443, + "grad_norm": 6.71875, "learning_rate": 7.24488888888889e-05, - "loss": 0.5728, + "loss": 0.9166, "step": 8700 }, { - "epoch": 1.0474355887310378, - "eval/acc": 41.86046600341797, + "epoch": 6.205420827389443, + "eval/acc": 48.83720779418945, "step": 8700 }, { - "epoch": 1.0474355887310378, - "eval_loss": 2.881147861480713, - "eval_runtime": 0.2167, - "eval_samples_per_second": 198.476, - "eval_steps_per_second": 4.616, + "epoch": 6.205420827389443, + "eval_loss": 2.488805055618286, + "eval_runtime": 0.2195, + "eval_samples_per_second": 195.91, + "eval_steps_per_second": 4.556, "step": 8700 }, { - "epoch": 1.0486395376836022, - "grad_norm": 7.09375, + "epoch": 6.212553495007133, + "grad_norm": 8.3125, "learning_rate": 7.240444444444446e-05, - "loss": 0.6532, + "loss": 0.7724, "step": 8710 }, { - "epoch": 1.0498434866361666, - "grad_norm": 7.0625, + "epoch": 6.219686162624821, + "grad_norm": 7.84375, "learning_rate": 7.236e-05, - "loss": 0.5758, + "loss": 0.8881, "step": 8720 }, { - "epoch": 1.051047435588731, - "grad_norm": 8.375, + "epoch": 6.226818830242511, + "grad_norm": 7.21875, "learning_rate": 7.231555555555555e-05, - "loss": 0.6071, + "loss": 0.8538, "step": 8730 }, { - "epoch": 1.0522513845412955, - "grad_norm": 7.34375, + "epoch": 6.2339514978601995, + "grad_norm": 7.5, "learning_rate": 7.227111111111112e-05, - "loss": 0.6905, + "loss": 0.8909, "step": 8740 }, { - "epoch": 1.05345533349386, - "grad_norm": 6.59375, + "epoch": 6.241084165477889, + "grad_norm": 7.25, "learning_rate": 7.222666666666666e-05, - "loss": 0.584, + "loss": 0.7965, "step": 8750 }, { - "epoch": 1.0546592824464243, - "grad_norm": 7.4375, + "epoch": 6.248216833095578, + "grad_norm": 7.46875, "learning_rate": 7.218222222222223e-05, - "loss": 0.6222, + "loss": 0.8547, "step": 8760 }, { - "epoch": 1.0558632313989886, - "grad_norm": 7.1875, + "epoch": 6.255349500713267, + "grad_norm": 6.1875, "learning_rate": 7.213777777777779e-05, - "loss": 0.6167, + "loss": 0.7528, "step": 8770 }, { - "epoch": 1.057067180351553, - "grad_norm": 7.875, + "epoch": 6.262482168330956, + "grad_norm": 7.03125, "learning_rate": 7.209333333333334e-05, - "loss": 0.5766, + "loss": 0.8632, "step": 8780 }, { - "epoch": 1.0582711293041176, - "grad_norm": 7.96875, + "epoch": 6.269614835948644, + "grad_norm": 8.375, "learning_rate": 7.204888888888888e-05, - "loss": 0.5747, + "loss": 0.7832, "step": 8790 }, { - "epoch": 1.059475078256682, - "grad_norm": 7.5, + "epoch": 6.276747503566334, + "grad_norm": 8.125, "learning_rate": 7.200444444444445e-05, - "loss": 0.5361, + "loss": 0.7659, "step": 8800 }, { - "epoch": 1.059475078256682, - "eval/acc": 44.1860466003418, + "epoch": 6.276747503566334, + "eval/acc": 48.83720779418945, "step": 8800 }, { - "epoch": 1.059475078256682, - "eval_loss": 2.9378437995910645, - "eval_runtime": 0.2159, - "eval_samples_per_second": 199.123, - "eval_steps_per_second": 4.631, + "epoch": 6.276747503566334, + "eval_loss": 2.4990618228912354, + "eval_runtime": 0.2586, + "eval_samples_per_second": 166.3, + "eval_steps_per_second": 3.867, "step": 8800 }, { - "epoch": 1.0606790272092463, - "grad_norm": 6.46875, + "epoch": 6.283880171184022, + "grad_norm": 7.375, "learning_rate": 7.196000000000001e-05, - "loss": 0.5386, + "loss": 0.7402, "step": 8810 }, { - "epoch": 1.0618829761618107, - "grad_norm": 5.75, + "epoch": 6.291012838801712, + "grad_norm": 7.0, "learning_rate": 7.191555555555556e-05, - "loss": 0.5701, + "loss": 0.8381, "step": 8820 }, { - "epoch": 1.063086925114375, - "grad_norm": 10.5625, + "epoch": 6.2981455064194005, + "grad_norm": 15.75, "learning_rate": 7.18711111111111e-05, - "loss": 0.6061, + "loss": 0.8837, "step": 8830 }, { - "epoch": 1.0642908740669395, - "grad_norm": 6.75, + "epoch": 6.30527817403709, + "grad_norm": 5.46875, "learning_rate": 7.182666666666668e-05, - "loss": 0.6201, + "loss": 0.8638, "step": 8840 }, { - "epoch": 1.065494823019504, - "grad_norm": 9.625, + "epoch": 6.312410841654779, + "grad_norm": 5.46875, "learning_rate": 7.178222222222223e-05, - "loss": 0.6315, + "loss": 0.8348, "step": 8850 }, { - "epoch": 1.0666987719720684, - "grad_norm": 6.15625, + "epoch": 6.319543509272468, + "grad_norm": 7.9375, "learning_rate": 7.173777777777778e-05, - "loss": 0.6142, + "loss": 0.8598, "step": 8860 }, { - "epoch": 1.0679027209246328, - "grad_norm": 8.875, + "epoch": 6.326676176890157, + "grad_norm": 7.15625, "learning_rate": 7.169333333333334e-05, - "loss": 0.6545, + "loss": 0.8124, "step": 8870 }, { - "epoch": 1.0691066698771972, - "grad_norm": 6.5, + "epoch": 6.333808844507846, + "grad_norm": 6.28125, "learning_rate": 7.164888888888889e-05, - "loss": 0.6305, + "loss": 0.8184, "step": 8880 }, { - "epoch": 1.0703106188297615, - "grad_norm": 12.5, + "epoch": 6.340941512125535, + "grad_norm": 7.25, "learning_rate": 7.160444444444445e-05, - "loss": 0.6451, + "loss": 0.8522, "step": 8890 }, { - "epoch": 1.0715145677823261, - "grad_norm": 6.28125, + "epoch": 6.348074179743224, + "grad_norm": 8.4375, "learning_rate": 7.156e-05, - "loss": 0.5406, + "loss": 0.894, "step": 8900 }, { - "epoch": 1.0715145677823261, + "epoch": 6.348074179743224, "eval/acc": 46.511627197265625, "step": 8900 }, { - "epoch": 1.0715145677823261, - "eval_loss": 2.895603656768799, - "eval_runtime": 0.2115, - "eval_samples_per_second": 203.356, - "eval_steps_per_second": 4.729, + "epoch": 6.348074179743224, + "eval_loss": 2.4536728858947754, + "eval_runtime": 0.2168, + "eval_samples_per_second": 198.347, + "eval_steps_per_second": 4.613, "step": 8900 }, { - "epoch": 1.0727185167348905, - "grad_norm": 5.8125, + "epoch": 6.355206847360913, + "grad_norm": 8.3125, "learning_rate": 7.151555555555556e-05, - "loss": 0.6598, + "loss": 0.8331, "step": 8910 }, { - "epoch": 1.0739224656874549, - "grad_norm": 6.9375, + "epoch": 6.362339514978602, + "grad_norm": 13.1875, "learning_rate": 7.147111111111111e-05, - "loss": 0.5678, + "loss": 0.8107, "step": 8920 }, { - "epoch": 1.0751264146400192, - "grad_norm": 7.25, + "epoch": 6.369472182596291, + "grad_norm": 7.0, "learning_rate": 7.142666666666667e-05, - "loss": 0.6032, + "loss": 0.9504, "step": 8930 }, { - "epoch": 1.0763303635925836, - "grad_norm": 7.09375, + "epoch": 6.37660485021398, + "grad_norm": 9.5625, "learning_rate": 7.138222222222223e-05, - "loss": 0.5432, + "loss": 0.766, "step": 8940 }, { - "epoch": 1.077534312545148, - "grad_norm": 8.6875, + "epoch": 6.383737517831669, + "grad_norm": 13.4375, "learning_rate": 7.133777777777778e-05, - "loss": 0.6408, + "loss": 0.7923, "step": 8950 }, { - "epoch": 1.0787382614977126, - "grad_norm": 8.1875, + "epoch": 6.390870185449358, + "grad_norm": 6.6875, "learning_rate": 7.129333333333333e-05, - "loss": 0.5834, + "loss": 0.7777, "step": 8960 }, { - "epoch": 1.079942210450277, - "grad_norm": 8.75, + "epoch": 6.398002853067047, + "grad_norm": 6.09375, "learning_rate": 7.124888888888889e-05, - "loss": 0.5956, + "loss": 0.7729, "step": 8970 }, { - "epoch": 1.0811461594028413, - "grad_norm": 6.90625, + "epoch": 6.405135520684736, + "grad_norm": 6.46875, "learning_rate": 7.120444444444445e-05, - "loss": 0.6124, + "loss": 0.8118, "step": 8980 }, { - "epoch": 1.0823501083554057, - "grad_norm": 9.5625, + "epoch": 6.412268188302425, + "grad_norm": 6.21875, "learning_rate": 7.116e-05, - "loss": 0.6513, + "loss": 0.9006, "step": 8990 }, { - "epoch": 1.08355405730797, - "grad_norm": 8.0, + "epoch": 6.419400855920114, + "grad_norm": 6.5625, "learning_rate": 7.111555555555555e-05, - "loss": 0.6044, + "loss": 0.7092, "step": 9000 }, { - "epoch": 1.08355405730797, + "epoch": 6.419400855920114, "eval/acc": 44.1860466003418, "step": 9000 }, { - "epoch": 1.08355405730797, - "eval_loss": 2.894747257232666, - "eval_runtime": 0.2236, - "eval_samples_per_second": 192.288, - "eval_steps_per_second": 4.472, + "epoch": 6.419400855920114, + "eval_loss": 2.533996343612671, + "eval_runtime": 0.3418, + "eval_samples_per_second": 125.802, + "eval_steps_per_second": 2.926, "step": 9000 }, { - "epoch": 1.0847580062605346, - "grad_norm": 7.1875, + "epoch": 6.426533523537803, + "grad_norm": 7.59375, "learning_rate": 7.107111111111111e-05, - "loss": 0.4939, + "loss": 0.7684, "step": 9010 }, { - "epoch": 1.085961955213099, - "grad_norm": 8.25, + "epoch": 6.433666191155492, + "grad_norm": 6.8125, "learning_rate": 7.102666666666668e-05, - "loss": 0.7751, + "loss": 0.7654, "step": 9020 }, { - "epoch": 1.0871659041656634, - "grad_norm": 6.875, + "epoch": 6.4407988587731815, + "grad_norm": 7.5625, "learning_rate": 7.098222222222222e-05, - "loss": 0.593, + "loss": 0.8404, "step": 9030 }, { - "epoch": 1.0883698531182278, - "grad_norm": 7.5625, + "epoch": 6.44793152639087, + "grad_norm": 8.5, "learning_rate": 7.093777777777779e-05, - "loss": 0.587, + "loss": 0.8519, "step": 9040 }, { - "epoch": 1.0895738020707921, - "grad_norm": 9.5625, + "epoch": 6.45506419400856, + "grad_norm": 6.53125, "learning_rate": 7.089333333333333e-05, - "loss": 0.639, + "loss": 0.8487, "step": 9050 }, { - "epoch": 1.0907777510233565, - "grad_norm": 8.25, + "epoch": 6.462196861626248, + "grad_norm": 7.59375, "learning_rate": 7.084888888888888e-05, - "loss": 0.6537, + "loss": 0.8695, "step": 9060 }, { - "epoch": 1.091981699975921, - "grad_norm": 9.9375, + "epoch": 6.469329529243938, + "grad_norm": 8.4375, "learning_rate": 7.080444444444444e-05, - "loss": 0.6134, + "loss": 0.7864, "step": 9070 }, { - "epoch": 1.0931856489284855, - "grad_norm": 9.375, + "epoch": 6.476462196861626, + "grad_norm": 66.5, "learning_rate": 7.076000000000001e-05, - "loss": 0.5259, + "loss": 0.7726, "step": 9080 }, { - "epoch": 1.0943895978810498, - "grad_norm": 7.90625, + "epoch": 6.483594864479315, + "grad_norm": 6.96875, "learning_rate": 7.071555555555556e-05, - "loss": 0.7362, + "loss": 0.7832, "step": 9090 }, { - "epoch": 1.0955935468336142, - "grad_norm": 7.46875, + "epoch": 6.490727532097004, + "grad_norm": 7.40625, "learning_rate": 7.06711111111111e-05, - "loss": 0.6197, + "loss": 0.8063, "step": 9100 }, { - "epoch": 1.0955935468336142, - "eval/acc": 41.86046600341797, + "epoch": 6.490727532097004, + "eval/acc": 44.1860466003418, "step": 9100 }, { - "epoch": 1.0955935468336142, - "eval_loss": 2.920775890350342, - "eval_runtime": 0.2089, - "eval_samples_per_second": 205.889, - "eval_steps_per_second": 4.788, + "epoch": 6.490727532097004, + "eval_loss": 2.5438809394836426, + "eval_runtime": 0.2154, + "eval_samples_per_second": 199.594, + "eval_steps_per_second": 4.642, "step": 9100 }, { - "epoch": 1.0967974957861786, - "grad_norm": 9.6875, + "epoch": 6.497860199714693, + "grad_norm": 7.21875, "learning_rate": 7.062666666666668e-05, - "loss": 0.5682, + "loss": 0.7605, "step": 9110 }, { - "epoch": 1.0980014447387432, - "grad_norm": 6.5625, + "epoch": 6.5049928673323825, + "grad_norm": 7.90625, "learning_rate": 7.058222222222223e-05, - "loss": 0.572, + "loss": 0.8032, "step": 9120 }, { - "epoch": 1.0992053936913075, - "grad_norm": 7.5, + "epoch": 6.512125534950071, + "grad_norm": 6.9375, "learning_rate": 7.053777777777778e-05, - "loss": 0.5307, + "loss": 0.743, "step": 9130 }, { - "epoch": 1.100409342643872, - "grad_norm": 7.75, + "epoch": 6.519258202567761, + "grad_norm": 5.65625, "learning_rate": 7.049333333333334e-05, - "loss": 0.5432, + "loss": 0.8261, "step": 9140 }, { - "epoch": 1.1016132915964363, - "grad_norm": 6.84375, + "epoch": 6.526390870185449, + "grad_norm": 7.03125, "learning_rate": 7.04488888888889e-05, - "loss": 0.6012, + "loss": 0.8099, "step": 9150 }, { - "epoch": 1.1028172405490007, - "grad_norm": 6.84375, + "epoch": 6.533523537803139, + "grad_norm": 7.15625, "learning_rate": 7.040444444444445e-05, - "loss": 0.5776, + "loss": 0.817, "step": 9160 }, { - "epoch": 1.104021189501565, - "grad_norm": 8.0625, + "epoch": 6.540656205420827, + "grad_norm": 11.625, "learning_rate": 7.036e-05, - "loss": 0.5353, + "loss": 0.782, "step": 9170 }, { - "epoch": 1.1052251384541296, - "grad_norm": 5.65625, + "epoch": 6.547788873038517, + "grad_norm": 7.5625, "learning_rate": 7.031555555555556e-05, - "loss": 0.5664, + "loss": 0.8145, "step": 9180 }, { - "epoch": 1.106429087406694, - "grad_norm": 14.0, + "epoch": 6.554921540656205, + "grad_norm": 7.5625, "learning_rate": 7.027111111111111e-05, - "loss": 0.6547, + "loss": 0.8822, "step": 9190 }, { - "epoch": 1.1076330363592584, - "grad_norm": 7.9375, + "epoch": 6.562054208273894, + "grad_norm": 6.53125, "learning_rate": 7.022666666666667e-05, - "loss": 0.6063, + "loss": 0.8132, "step": 9200 }, { - "epoch": 1.1076330363592584, - "eval/acc": 41.86046600341797, + "epoch": 6.562054208273894, + "eval/acc": 44.1860466003418, "step": 9200 }, { - "epoch": 1.1076330363592584, - "eval_loss": 2.9192073345184326, - "eval_runtime": 0.2086, - "eval_samples_per_second": 206.15, - "eval_steps_per_second": 4.794, + "epoch": 6.562054208273894, + "eval_loss": 2.528564929962158, + "eval_runtime": 0.2169, + "eval_samples_per_second": 198.28, + "eval_steps_per_second": 4.611, "step": 9200 }, { - "epoch": 1.1088369853118227, - "grad_norm": 11.5625, + "epoch": 6.5691868758915835, + "grad_norm": 7.21875, "learning_rate": 7.018222222222223e-05, - "loss": 0.6251, + "loss": 0.7858, "step": 9210 }, { - "epoch": 1.110040934264387, - "grad_norm": 7.34375, + "epoch": 6.576319543509273, + "grad_norm": 6.4375, "learning_rate": 7.013777777777778e-05, - "loss": 0.5408, + "loss": 0.7098, "step": 9220 }, { - "epoch": 1.1112448832169517, - "grad_norm": 6.75, + "epoch": 6.5834522111269616, + "grad_norm": 7.125, "learning_rate": 7.009333333333333e-05, - "loss": 0.6217, + "loss": 0.8362, "step": 9230 }, { - "epoch": 1.112448832169516, - "grad_norm": 8.375, + "epoch": 6.59058487874465, + "grad_norm": 5.78125, "learning_rate": 7.004888888888889e-05, - "loss": 0.6792, + "loss": 0.7737, "step": 9240 }, { - "epoch": 1.1136527811220804, - "grad_norm": 8.375, + "epoch": 6.59771754636234, + "grad_norm": 9.0625, "learning_rate": 7.000444444444445e-05, - "loss": 0.5786, + "loss": 0.857, "step": 9250 }, { - "epoch": 1.1148567300746448, - "grad_norm": 11.0, + "epoch": 6.604850213980028, + "grad_norm": 9.125, "learning_rate": 6.996e-05, - "loss": 0.6588, + "loss": 0.7562, "step": 9260 }, { - "epoch": 1.1160606790272092, - "grad_norm": 6.75, + "epoch": 6.611982881597718, + "grad_norm": 8.3125, "learning_rate": 6.991555555555556e-05, - "loss": 0.6016, + "loss": 0.8619, "step": 9270 }, { - "epoch": 1.1172646279797736, - "grad_norm": 9.1875, + "epoch": 6.619115549215406, + "grad_norm": 6.78125, "learning_rate": 6.987111111111111e-05, - "loss": 0.5728, + "loss": 0.7212, "step": 9280 }, { - "epoch": 1.1184685769323381, - "grad_norm": 8.0625, + "epoch": 6.626248216833096, + "grad_norm": 26.125, "learning_rate": 6.982666666666667e-05, - "loss": 0.669, + "loss": 0.951, "step": 9290 }, { - "epoch": 1.1196725258849025, - "grad_norm": 9.6875, + "epoch": 6.633380884450784, + "grad_norm": 7.03125, "learning_rate": 6.978222222222222e-05, - "loss": 0.625, + "loss": 0.7791, "step": 9300 }, { - "epoch": 1.1196725258849025, + "epoch": 6.633380884450784, "eval/acc": 44.1860466003418, "step": 9300 }, { - "epoch": 1.1196725258849025, - "eval_loss": 2.8807859420776367, - "eval_runtime": 0.2269, - "eval_samples_per_second": 189.503, - "eval_steps_per_second": 4.407, + "epoch": 6.633380884450784, + "eval_loss": 2.587022304534912, + "eval_runtime": 0.2175, + "eval_samples_per_second": 197.663, + "eval_steps_per_second": 4.597, "step": 9300 }, { - "epoch": 1.1208764748374669, - "grad_norm": 7.09375, + "epoch": 6.640513552068474, + "grad_norm": 6.6875, "learning_rate": 6.973777777777778e-05, - "loss": 0.5112, + "loss": 0.8082, "step": 9310 }, { - "epoch": 1.1220804237900313, - "grad_norm": 19.375, + "epoch": 6.6476462196861625, + "grad_norm": 7.625, "learning_rate": 6.969333333333333e-05, - "loss": 0.7337, + "loss": 0.6863, "step": 9320 }, { - "epoch": 1.1232843727425956, - "grad_norm": 8.25, + "epoch": 6.654778887303852, + "grad_norm": 8.625, "learning_rate": 6.96488888888889e-05, - "loss": 0.6687, + "loss": 0.7921, "step": 9330 }, { - "epoch": 1.1244883216951602, - "grad_norm": 8.125, + "epoch": 6.661911554921541, + "grad_norm": 6.5, "learning_rate": 6.960444444444446e-05, - "loss": 0.5604, + "loss": 0.7762, "step": 9340 }, { - "epoch": 1.1256922706477246, - "grad_norm": 9.1875, + "epoch": 6.669044222539229, + "grad_norm": 12.6875, "learning_rate": 6.956e-05, - "loss": 0.6999, + "loss": 0.7977, "step": 9350 }, { - "epoch": 1.126896219600289, - "grad_norm": 8.5, + "epoch": 6.676176890156919, + "grad_norm": 6.84375, "learning_rate": 6.951555555555555e-05, - "loss": 0.5909, + "loss": 0.907, "step": 9360 }, { - "epoch": 1.1281001685528533, - "grad_norm": 7.21875, + "epoch": 6.683309557774607, + "grad_norm": 7.15625, "learning_rate": 6.947111111111112e-05, - "loss": 0.5857, + "loss": 0.792, "step": 9370 }, { - "epoch": 1.1293041175054177, - "grad_norm": 6.84375, + "epoch": 6.690442225392297, + "grad_norm": 8.5, "learning_rate": 6.942666666666668e-05, - "loss": 0.5965, + "loss": 0.7838, "step": 9380 }, { - "epoch": 1.130508066457982, - "grad_norm": 6.59375, + "epoch": 6.697574893009985, + "grad_norm": 8.1875, "learning_rate": 6.938222222222223e-05, - "loss": 0.6098, + "loss": 0.8141, "step": 9390 }, { - "epoch": 1.1317120154105467, - "grad_norm": 9.3125, + "epoch": 6.704707560627675, + "grad_norm": 7.875, "learning_rate": 6.933777777777777e-05, - "loss": 0.5917, + "loss": 0.8348, "step": 9400 }, { - "epoch": 1.1317120154105467, - "eval/acc": 44.1860466003418, + "epoch": 6.704707560627675, + "eval/acc": 39.53488540649414, "step": 9400 }, { - "epoch": 1.1317120154105467, - "eval_loss": 2.906259536743164, - "eval_runtime": 0.2551, - "eval_samples_per_second": 168.571, - "eval_steps_per_second": 3.92, + "epoch": 6.704707560627675, + "eval_loss": 2.6398463249206543, + "eval_runtime": 0.2181, + "eval_samples_per_second": 197.145, + "eval_steps_per_second": 4.585, "step": 9400 }, { - "epoch": 1.132915964363111, - "grad_norm": 8.3125, + "epoch": 6.7118402282453635, + "grad_norm": 6.625, "learning_rate": 6.929333333333334e-05, - "loss": 0.5629, + "loss": 0.889, "step": 9410 }, { - "epoch": 1.1341199133156754, - "grad_norm": 5.21875, + "epoch": 6.718972895863053, + "grad_norm": 7.3125, "learning_rate": 6.92488888888889e-05, - "loss": 0.4913, + "loss": 0.7913, "step": 9420 }, { - "epoch": 1.1353238622682398, - "grad_norm": 7.5625, + "epoch": 6.726105563480742, + "grad_norm": 10.875, "learning_rate": 6.920444444444445e-05, - "loss": 0.5868, + "loss": 0.8099, "step": 9430 }, { - "epoch": 1.1365278112208042, - "grad_norm": 8.8125, + "epoch": 6.733238231098431, + "grad_norm": 23.75, "learning_rate": 6.916000000000001e-05, - "loss": 0.6205, + "loss": 0.7098, "step": 9440 }, { - "epoch": 1.1377317601733687, - "grad_norm": 6.78125, + "epoch": 6.74037089871612, + "grad_norm": 6.625, "learning_rate": 6.911555555555556e-05, - "loss": 0.6569, + "loss": 0.7859, "step": 9450 }, { - "epoch": 1.1389357091259331, - "grad_norm": 7.9375, + "epoch": 6.747503566333809, + "grad_norm": 5.875, "learning_rate": 6.907111111111112e-05, - "loss": 0.5849, + "loss": 0.7947, "step": 9460 }, { - "epoch": 1.1401396580784975, - "grad_norm": 8.6875, + "epoch": 6.754636233951498, + "grad_norm": 7.25, "learning_rate": 6.902666666666667e-05, - "loss": 0.5997, + "loss": 0.927, "step": 9470 }, { - "epoch": 1.1413436070310619, - "grad_norm": 12.75, + "epoch": 6.761768901569187, + "grad_norm": 12.875, "learning_rate": 6.898222222222223e-05, - "loss": 0.6568, + "loss": 0.8474, "step": 9480 }, { - "epoch": 1.1425475559836262, - "grad_norm": 7.6875, + "epoch": 6.768901569186876, + "grad_norm": 6.8125, "learning_rate": 6.893777777777778e-05, - "loss": 0.6542, + "loss": 0.848, "step": 9490 }, { - "epoch": 1.1437515049361906, - "grad_norm": 6.59375, + "epoch": 6.7760342368045645, + "grad_norm": 7.96875, "learning_rate": 6.889333333333333e-05, - "loss": 0.4745, + "loss": 0.8081, "step": 9500 }, { - "epoch": 1.1437515049361906, - "eval/acc": 48.83720779418945, + "epoch": 6.7760342368045645, + "eval/acc": 41.86046600341797, "step": 9500 }, { - "epoch": 1.1437515049361906, - "eval_loss": 2.8177154064178467, - "eval_runtime": 0.2171, - "eval_samples_per_second": 198.092, - "eval_steps_per_second": 4.607, + "epoch": 6.7760342368045645, + "eval_loss": 2.6681759357452393, + "eval_runtime": 0.2247, + "eval_samples_per_second": 191.372, + "eval_steps_per_second": 4.451, "step": 9500 }, { - "epoch": 1.1449554538887552, - "grad_norm": 6.53125, + "epoch": 6.783166904422254, + "grad_norm": 9.0625, "learning_rate": 6.88488888888889e-05, - "loss": 0.664, + "loss": 0.8836, "step": 9510 }, { - "epoch": 1.1461594028413196, - "grad_norm": 7.6875, + "epoch": 6.790299572039943, + "grad_norm": 15.875, "learning_rate": 6.880444444444445e-05, - "loss": 0.5778, + "loss": 0.8696, "step": 9520 }, { - "epoch": 1.147363351793884, - "grad_norm": 6.84375, + "epoch": 6.797432239657632, + "grad_norm": 10.75, "learning_rate": 6.876e-05, - "loss": 0.6596, + "loss": 0.844, "step": 9530 }, { - "epoch": 1.1485673007464483, - "grad_norm": 8.75, + "epoch": 6.804564907275321, + "grad_norm": 23.875, "learning_rate": 6.871555555555556e-05, - "loss": 0.6422, + "loss": 0.823, "step": 9540 }, { - "epoch": 1.1497712496990127, - "grad_norm": 6.5625, + "epoch": 6.81169757489301, + "grad_norm": 7.75, "learning_rate": 6.867111111111112e-05, - "loss": 0.5794, + "loss": 0.8875, "step": 9550 }, { - "epoch": 1.1509751986515773, - "grad_norm": 8.625, + "epoch": 6.818830242510699, + "grad_norm": 6.46875, "learning_rate": 6.862666666666667e-05, - "loss": 0.6171, + "loss": 0.7703, "step": 9560 }, { - "epoch": 1.1521791476041416, - "grad_norm": 6.875, + "epoch": 6.825962910128388, + "grad_norm": 6.375, "learning_rate": 6.858222222222222e-05, - "loss": 0.58, + "loss": 0.8, "step": 9570 }, { - "epoch": 1.153383096556706, - "grad_norm": 14.375, + "epoch": 6.833095577746077, + "grad_norm": 7.96875, "learning_rate": 6.853777777777778e-05, - "loss": 0.6651, + "loss": 0.8139, "step": 9580 }, { - "epoch": 1.1545870455092704, - "grad_norm": 6.65625, + "epoch": 6.840228245363766, + "grad_norm": 11.625, "learning_rate": 6.849333333333333e-05, - "loss": 0.621, + "loss": 0.8042, "step": 9590 }, { - "epoch": 1.1557909944618348, - "grad_norm": 8.25, + "epoch": 6.847360912981455, + "grad_norm": 10.8125, "learning_rate": 6.844888888888889e-05, - "loss": 0.6578, + "loss": 0.8403, "step": 9600 }, { - "epoch": 1.1557909944618348, + "epoch": 6.847360912981455, "eval/acc": 44.1860466003418, "step": 9600 }, { - "epoch": 1.1557909944618348, - "eval_loss": 2.841442108154297, - "eval_runtime": 0.2144, - "eval_samples_per_second": 200.545, - "eval_steps_per_second": 4.664, + "epoch": 6.847360912981455, + "eval_loss": 2.6575427055358887, + "eval_runtime": 0.2186, + "eval_samples_per_second": 196.745, + "eval_steps_per_second": 4.575, "step": 9600 }, { - "epoch": 1.1569949434143991, - "grad_norm": 6.09375, + "epoch": 6.854493580599144, + "grad_norm": 14.6875, "learning_rate": 6.840444444444445e-05, - "loss": 0.5215, + "loss": 0.8426, "step": 9610 }, { - "epoch": 1.1581988923669637, - "grad_norm": 9.1875, + "epoch": 6.861626248216833, + "grad_norm": 7.84375, "learning_rate": 6.836e-05, - "loss": 0.6458, + "loss": 0.8874, "step": 9620 }, { - "epoch": 1.159402841319528, - "grad_norm": 8.8125, + "epoch": 6.868758915834523, + "grad_norm": 8.9375, "learning_rate": 6.831555555555555e-05, - "loss": 0.6037, + "loss": 0.78, "step": 9630 }, { - "epoch": 1.1606067902720925, - "grad_norm": 7.0, + "epoch": 6.875891583452211, + "grad_norm": 6.1875, "learning_rate": 6.827111111111111e-05, - "loss": 0.5408, + "loss": 0.7788, "step": 9640 }, { - "epoch": 1.1618107392246568, - "grad_norm": 5.78125, + "epoch": 6.8830242510699, + "grad_norm": 6.34375, "learning_rate": 6.822666666666668e-05, - "loss": 0.5832, + "loss": 0.7385, "step": 9650 }, { - "epoch": 1.1630146881772212, - "grad_norm": 6.84375, + "epoch": 6.890156918687589, + "grad_norm": 7.59375, "learning_rate": 6.818222222222222e-05, - "loss": 0.5802, + "loss": 0.8938, "step": 9660 }, { - "epoch": 1.1642186371297858, - "grad_norm": 5.75, + "epoch": 6.897289586305278, + "grad_norm": 10.8125, "learning_rate": 6.813777777777777e-05, - "loss": 0.5377, + "loss": 0.8154, "step": 9670 }, { - "epoch": 1.1654225860823502, - "grad_norm": 7.5625, + "epoch": 6.904422253922967, + "grad_norm": 6.90625, "learning_rate": 6.809333333333333e-05, - "loss": 0.5657, + "loss": 0.9273, "step": 9680 }, { - "epoch": 1.1666265350349145, - "grad_norm": 6.15625, + "epoch": 6.911554921540656, + "grad_norm": 8.3125, "learning_rate": 6.80488888888889e-05, - "loss": 0.5107, + "loss": 0.8595, "step": 9690 }, { - "epoch": 1.167830483987479, - "grad_norm": 5.28125, + "epoch": 6.9186875891583455, + "grad_norm": 10.75, "learning_rate": 6.800444444444444e-05, - "loss": 0.5898, + "loss": 0.8569, "step": 9700 }, { - "epoch": 1.167830483987479, - "eval/acc": 46.511627197265625, + "epoch": 6.9186875891583455, + "eval/acc": 39.53488540649414, "step": 9700 }, { - "epoch": 1.167830483987479, - "eval_loss": 2.8665220737457275, - "eval_runtime": 0.211, - "eval_samples_per_second": 203.771, - "eval_steps_per_second": 4.739, + "epoch": 6.9186875891583455, + "eval_loss": 2.6524884700775146, + "eval_runtime": 0.215, + "eval_samples_per_second": 200.025, + "eval_steps_per_second": 4.652, "step": 9700 }, { - "epoch": 1.1690344329400433, - "grad_norm": 7.0, + "epoch": 6.925820256776034, + "grad_norm": 7.90625, "learning_rate": 6.796e-05, - "loss": 0.6016, + "loss": 0.7726, "step": 9710 }, { - "epoch": 1.1702383818926076, - "grad_norm": 7.0, + "epoch": 6.932952924393724, + "grad_norm": 7.71875, "learning_rate": 6.791555555555556e-05, - "loss": 0.6048, + "loss": 0.789, "step": 9720 }, { - "epoch": 1.1714423308451722, - "grad_norm": 7.21875, + "epoch": 6.940085592011412, + "grad_norm": 7.4375, "learning_rate": 6.787111111111112e-05, - "loss": 0.5315, + "loss": 0.7525, "step": 9730 }, { - "epoch": 1.1726462797977366, - "grad_norm": 6.53125, + "epoch": 6.947218259629102, + "grad_norm": 6.96875, "learning_rate": 6.782666666666667e-05, - "loss": 0.5033, + "loss": 0.8183, "step": 9740 }, { - "epoch": 1.173850228750301, - "grad_norm": 6.34375, + "epoch": 6.95435092724679, + "grad_norm": 6.5625, "learning_rate": 6.778222222222223e-05, - "loss": 0.5615, + "loss": 0.8713, "step": 9750 }, { - "epoch": 1.1750541777028654, - "grad_norm": 6.34375, + "epoch": 6.961483594864479, + "grad_norm": 6.59375, "learning_rate": 6.773777777777778e-05, - "loss": 0.5494, + "loss": 0.8089, "step": 9760 }, { - "epoch": 1.1762581266554297, - "grad_norm": 7.3125, + "epoch": 6.968616262482168, + "grad_norm": 7.46875, "learning_rate": 6.769333333333334e-05, - "loss": 0.6047, + "loss": 0.8173, "step": 9770 }, { - "epoch": 1.1774620756079943, - "grad_norm": 6.53125, + "epoch": 6.975748930099857, + "grad_norm": 8.75, "learning_rate": 6.76488888888889e-05, - "loss": 0.6653, + "loss": 0.8359, "step": 9780 }, { - "epoch": 1.1786660245605587, - "grad_norm": 21.75, + "epoch": 6.9828815977175465, + "grad_norm": 6.96875, "learning_rate": 6.760444444444445e-05, - "loss": 0.5944, + "loss": 0.7308, "step": 9790 }, { - "epoch": 1.179869973513123, - "grad_norm": 17.25, + "epoch": 6.990014265335235, + "grad_norm": 8.6875, "learning_rate": 6.756e-05, - "loss": 0.6511, + "loss": 0.7651, "step": 9800 }, { - "epoch": 1.179869973513123, - "eval/acc": 46.511627197265625, + "epoch": 6.990014265335235, + "eval/acc": 44.1860466003418, "step": 9800 }, { - "epoch": 1.179869973513123, - "eval_loss": 2.8695812225341797, - "eval_runtime": 0.2142, - "eval_samples_per_second": 200.781, - "eval_steps_per_second": 4.669, + "epoch": 6.990014265335235, + "eval_loss": 2.581909418106079, + "eval_runtime": 0.217, + "eval_samples_per_second": 198.162, + "eval_steps_per_second": 4.608, "step": 9800 }, { - "epoch": 1.1810739224656874, - "grad_norm": 5.3125, + "epoch": 6.997146932952925, + "grad_norm": 7.6875, "learning_rate": 6.751555555555556e-05, - "loss": 0.6008, + "loss": 0.8653, "step": 9810 }, { - "epoch": 1.1822778714182518, - "grad_norm": 9.4375, + "epoch": 7.004279600570613, + "grad_norm": 8.5, "learning_rate": 6.747111111111112e-05, - "loss": 0.5898, + "loss": 0.8445, "step": 9820 }, { - "epoch": 1.1834818203708162, - "grad_norm": 6.6875, + "epoch": 7.011412268188303, + "grad_norm": 6.375, "learning_rate": 6.742666666666667e-05, - "loss": 0.5976, + "loss": 0.7759, "step": 9830 }, { - "epoch": 1.1846857693233808, - "grad_norm": 7.875, + "epoch": 7.018544935805991, + "grad_norm": 6.375, "learning_rate": 6.738222222222222e-05, - "loss": 0.5604, + "loss": 0.7709, "step": 9840 }, { - "epoch": 1.1858897182759451, - "grad_norm": 6.0625, + "epoch": 7.025677603423681, + "grad_norm": 7.8125, "learning_rate": 6.733777777777778e-05, - "loss": 0.736, + "loss": 0.768, "step": 9850 }, { - "epoch": 1.1870936672285095, - "grad_norm": 8.125, + "epoch": 7.032810271041369, + "grad_norm": 8.4375, "learning_rate": 6.729333333333334e-05, - "loss": 0.5235, + "loss": 0.8725, "step": 9860 }, { - "epoch": 1.1882976161810739, - "grad_norm": 6.46875, + "epoch": 7.039942938659059, + "grad_norm": 7.8125, "learning_rate": 6.724888888888889e-05, - "loss": 0.5716, + "loss": 0.8146, "step": 9870 }, { - "epoch": 1.1895015651336383, - "grad_norm": 6.21875, + "epoch": 7.0470756062767475, + "grad_norm": 70.0, "learning_rate": 6.720444444444445e-05, - "loss": 0.5337, + "loss": 0.8137, "step": 9880 }, { - "epoch": 1.1907055140862028, - "grad_norm": 7.28125, + "epoch": 7.054208273894437, + "grad_norm": 7.03125, "learning_rate": 6.716e-05, - "loss": 0.5203, + "loss": 0.8025, "step": 9890 }, { - "epoch": 1.1919094630387672, - "grad_norm": 8.1875, + "epoch": 7.0613409415121255, + "grad_norm": 7.15625, "learning_rate": 6.711555555555555e-05, - "loss": 0.5532, + "loss": 0.8237, "step": 9900 }, { - "epoch": 1.1919094630387672, - "eval/acc": 46.511627197265625, + "epoch": 7.0613409415121255, + "eval/acc": 62.79069900512695, "step": 9900 }, { - "epoch": 1.1919094630387672, - "eval_loss": 2.864424705505371, - "eval_runtime": 0.2199, - "eval_samples_per_second": 195.51, - "eval_steps_per_second": 4.547, + "epoch": 7.0613409415121255, + "eval_loss": 2.023484706878662, + "eval_runtime": 1.3641, + "eval_samples_per_second": 31.524, + "eval_steps_per_second": 0.733, "step": 9900 }, { - "epoch": 1.1931134119913316, - "grad_norm": 8.5625, + "epoch": 7.068473609129814, + "grad_norm": 10.375, "learning_rate": 6.707111111111111e-05, - "loss": 0.585, + "loss": 0.7141, "step": 9910 }, { - "epoch": 1.194317360943896, - "grad_norm": 8.25, + "epoch": 7.075606276747504, + "grad_norm": 9.25, "learning_rate": 6.702666666666667e-05, - "loss": 0.6533, + "loss": 0.7963, "step": 9920 }, { - "epoch": 1.1955213098964603, - "grad_norm": 8.8125, + "epoch": 7.082738944365192, + "grad_norm": 7.375, "learning_rate": 6.698222222222222e-05, - "loss": 0.5962, + "loss": 0.7935, "step": 9930 }, { - "epoch": 1.1967252588490247, - "grad_norm": 13.0625, + "epoch": 7.089871611982882, + "grad_norm": 6.8125, "learning_rate": 6.693777777777778e-05, - "loss": 0.6169, + "loss": 0.7882, "step": 9940 }, { - "epoch": 1.1979292078015893, - "grad_norm": 7.5625, + "epoch": 7.09700427960057, + "grad_norm": 7.0625, "learning_rate": 6.689333333333335e-05, - "loss": 0.5756, + "loss": 0.7698, "step": 9950 }, { - "epoch": 1.1991331567541537, - "grad_norm": 6.03125, + "epoch": 7.10413694721826, + "grad_norm": 6.9375, "learning_rate": 6.68488888888889e-05, - "loss": 0.5746, + "loss": 0.8595, "step": 9960 }, { - "epoch": 1.200337105706718, - "grad_norm": 4.875, + "epoch": 7.111269614835948, + "grad_norm": 9.5, "learning_rate": 6.680444444444444e-05, - "loss": 0.6586, + "loss": 0.8158, "step": 9970 }, { - "epoch": 1.2015410546592824, - "grad_norm": 7.375, + "epoch": 7.118402282453638, + "grad_norm": 8.375, "learning_rate": 6.676e-05, - "loss": 0.6928, + "loss": 0.7916, "step": 9980 }, { - "epoch": 1.2027450036118468, - "grad_norm": 8.875, + "epoch": 7.1255349500713265, + "grad_norm": 6.3125, "learning_rate": 6.671555555555555e-05, - "loss": 0.6166, + "loss": 0.7455, "step": 9990 }, { - "epoch": 1.2039489525644114, - "grad_norm": 7.96875, + "epoch": 7.132667617689016, + "grad_norm": 7.375, "learning_rate": 6.667111111111112e-05, - "loss": 0.6778, + "loss": 0.7398, "step": 10000 }, { - "epoch": 1.2039489525644114, - "eval/acc": 46.511627197265625, + "epoch": 7.132667617689016, + "eval/acc": 65.11627960205078, "step": 10000 }, { - "epoch": 1.2039489525644114, - "eval_loss": 2.8400421142578125, - "eval_runtime": 0.2085, - "eval_samples_per_second": 206.266, - "eval_steps_per_second": 4.797, + "epoch": 7.132667617689016, + "eval_loss": 2.0408403873443604, + "eval_runtime": 0.2184, + "eval_samples_per_second": 196.923, + "eval_steps_per_second": 4.58, "step": 10000 }, { - "epoch": 1.2051529015169757, - "grad_norm": 6.21875, + "epoch": 7.139800285306705, + "grad_norm": 8.375, "learning_rate": 6.662666666666668e-05, - "loss": 0.5977, + "loss": 0.8887, "step": 10010 }, { - "epoch": 1.20635685046954, - "grad_norm": 7.03125, + "epoch": 7.146932952924394, + "grad_norm": 8.5, "learning_rate": 6.658222222222223e-05, - "loss": 0.5471, + "loss": 0.8945, "step": 10020 }, { - "epoch": 1.2075607994221045, - "grad_norm": 7.3125, + "epoch": 7.154065620542083, + "grad_norm": 22.5, "learning_rate": 6.653777777777777e-05, - "loss": 0.587, + "loss": 0.7934, "step": 10030 }, { - "epoch": 1.2087647483746689, - "grad_norm": 7.28125, + "epoch": 7.161198288159771, + "grad_norm": 7.34375, "learning_rate": 6.649333333333334e-05, - "loss": 0.5015, + "loss": 0.8056, "step": 10040 }, { - "epoch": 1.2099686973272332, - "grad_norm": 8.3125, + "epoch": 7.168330955777461, + "grad_norm": 7.59375, "learning_rate": 6.64488888888889e-05, - "loss": 0.5784, + "loss": 0.7893, "step": 10050 }, { - "epoch": 1.2111726462797978, - "grad_norm": 6.46875, + "epoch": 7.175463623395149, + "grad_norm": 8.5, "learning_rate": 6.640444444444445e-05, - "loss": 0.5528, + "loss": 1.0099, "step": 10060 }, { - "epoch": 1.2123765952323622, - "grad_norm": 4.8125, + "epoch": 7.182596291012839, + "grad_norm": 8.0625, "learning_rate": 6.636e-05, - "loss": 0.6008, + "loss": 0.8701, "step": 10070 }, { - "epoch": 1.2135805441849266, - "grad_norm": 7.46875, + "epoch": 7.1897289586305275, + "grad_norm": 9.25, "learning_rate": 6.631555555555557e-05, - "loss": 0.5804, + "loss": 0.8203, "step": 10080 }, { - "epoch": 1.214784493137491, - "grad_norm": 8.375, + "epoch": 7.196861626248217, + "grad_norm": 7.90625, "learning_rate": 6.627111111111112e-05, - "loss": 0.5645, + "loss": 0.8197, "step": 10090 }, { - "epoch": 1.2159884420900553, - "grad_norm": 12.0, + "epoch": 7.203994293865906, + "grad_norm": 6.03125, "learning_rate": 6.622666666666667e-05, - "loss": 0.5773, + "loss": 0.8087, "step": 10100 }, { - "epoch": 1.2159884420900553, - "eval/acc": 44.1860466003418, + "epoch": 7.203994293865906, + "eval/acc": 60.46511459350586, "step": 10100 }, { - "epoch": 1.2159884420900553, - "eval_loss": 2.8810744285583496, - "eval_runtime": 0.2091, - "eval_samples_per_second": 205.671, - "eval_steps_per_second": 4.783, + "epoch": 7.203994293865906, + "eval_loss": 1.940862774848938, + "eval_runtime": 0.2277, + "eval_samples_per_second": 188.814, + "eval_steps_per_second": 4.391, "step": 10100 }, { - "epoch": 1.21719239104262, - "grad_norm": 9.625, + "epoch": 7.211126961483595, + "grad_norm": 8.1875, "learning_rate": 6.618222222222223e-05, - "loss": 0.6729, + "loss": 0.7932, "step": 10110 }, { - "epoch": 1.2183963399951843, - "grad_norm": 7.15625, + "epoch": 7.218259629101284, + "grad_norm": 7.4375, "learning_rate": 6.613777777777778e-05, - "loss": 0.613, + "loss": 0.7562, "step": 10120 }, { - "epoch": 1.2196002889477486, - "grad_norm": 5.34375, + "epoch": 7.225392296718973, + "grad_norm": 8.4375, "learning_rate": 6.609333333333334e-05, - "loss": 0.5637, + "loss": 0.8474, "step": 10130 }, { - "epoch": 1.220804237900313, - "grad_norm": 8.1875, + "epoch": 7.232524964336662, + "grad_norm": 8.0, "learning_rate": 6.604888888888889e-05, - "loss": 0.6426, + "loss": 0.8197, "step": 10140 }, { - "epoch": 1.2220081868528774, - "grad_norm": 7.34375, + "epoch": 7.239657631954351, + "grad_norm": 8.0625, "learning_rate": 6.600444444444445e-05, - "loss": 0.5698, + "loss": 0.7804, "step": 10150 }, { - "epoch": 1.2232121358054417, - "grad_norm": 9.25, + "epoch": 7.24679029957204, + "grad_norm": 21.25, "learning_rate": 6.596e-05, - "loss": 0.6375, + "loss": 0.8914, "step": 10160 }, { - "epoch": 1.2244160847580063, - "grad_norm": 6.25, + "epoch": 7.2539229671897285, + "grad_norm": 7.125, "learning_rate": 6.591555555555556e-05, - "loss": 0.5693, + "loss": 0.8185, "step": 10170 }, { - "epoch": 1.2256200337105707, - "grad_norm": 6.4375, + "epoch": 7.261055634807418, + "grad_norm": 6.6875, "learning_rate": 6.587111111111112e-05, - "loss": 0.5378, + "loss": 0.7911, "step": 10180 }, { - "epoch": 1.226823982663135, - "grad_norm": 8.375, + "epoch": 7.268188302425107, + "grad_norm": 7.21875, "learning_rate": 6.582666666666667e-05, - "loss": 0.7013, + "loss": 0.8004, "step": 10190 }, { - "epoch": 1.2280279316156995, - "grad_norm": 5.8125, + "epoch": 7.275320970042796, + "grad_norm": 7.0, "learning_rate": 6.578222222222222e-05, - "loss": 0.6519, + "loss": 0.7226, "step": 10200 }, { - "epoch": 1.2280279316156995, - "eval/acc": 46.511627197265625, + "epoch": 7.275320970042796, + "eval/acc": 60.46511459350586, "step": 10200 }, { - "epoch": 1.2280279316156995, - "eval_loss": 2.8267436027526855, - "eval_runtime": 0.2057, - "eval_samples_per_second": 209.0, - "eval_steps_per_second": 4.86, + "epoch": 7.275320970042796, + "eval_loss": 2.004242420196533, + "eval_runtime": 0.2197, + "eval_samples_per_second": 195.738, + "eval_steps_per_second": 4.552, "step": 10200 }, { - "epoch": 1.2292318805682638, - "grad_norm": 7.1875, + "epoch": 7.282453637660485, + "grad_norm": 16.25, "learning_rate": 6.573777777777778e-05, - "loss": 0.5266, + "loss": 0.8735, "step": 10210 }, { - "epoch": 1.2304358295208284, - "grad_norm": 6.875, + "epoch": 7.289586305278174, + "grad_norm": 6.8125, "learning_rate": 6.569333333333334e-05, - "loss": 0.5686, + "loss": 0.8356, "step": 10220 }, { - "epoch": 1.2316397784733928, - "grad_norm": 8.0, + "epoch": 7.296718972895863, + "grad_norm": 5.65625, "learning_rate": 6.564888888888889e-05, - "loss": 0.6414, + "loss": 0.8032, "step": 10230 }, { - "epoch": 1.2328437274259572, - "grad_norm": 6.8125, + "epoch": 7.303851640513552, + "grad_norm": 6.125, "learning_rate": 6.560444444444444e-05, - "loss": 0.6118, + "loss": 0.7803, "step": 10240 }, { - "epoch": 1.2340476763785215, - "grad_norm": 8.625, + "epoch": 7.310984308131241, + "grad_norm": 9.375, "learning_rate": 6.556e-05, - "loss": 0.5839, + "loss": 0.8748, "step": 10250 }, { - "epoch": 1.235251625331086, - "grad_norm": 7.34375, + "epoch": 7.31811697574893, + "grad_norm": 6.625, "learning_rate": 6.551555555555556e-05, - "loss": 0.6561, + "loss": 0.7793, "step": 10260 }, { - "epoch": 1.2364555742836503, - "grad_norm": 11.5625, + "epoch": 7.325249643366619, + "grad_norm": 13.625, "learning_rate": 6.547111111111111e-05, - "loss": 0.6036, + "loss": 0.8052, "step": 10270 }, { - "epoch": 1.2376595232362149, - "grad_norm": 7.875, + "epoch": 7.3323823109843085, + "grad_norm": 8.6875, "learning_rate": 6.542666666666667e-05, - "loss": 0.5566, + "loss": 0.8387, "step": 10280 }, { - "epoch": 1.2388634721887792, - "grad_norm": 7.59375, + "epoch": 7.339514978601997, + "grad_norm": 6.84375, "learning_rate": 6.538222222222222e-05, - "loss": 0.5778, + "loss": 0.8713, "step": 10290 }, { - "epoch": 1.2400674211413436, - "grad_norm": 7.25, + "epoch": 7.346647646219687, + "grad_norm": 9.875, "learning_rate": 6.533777777777777e-05, - "loss": 0.616, + "loss": 0.7266, "step": 10300 }, { - "epoch": 1.2400674211413436, - "eval/acc": 45.930233001708984, + "epoch": 7.346647646219687, + "eval/acc": 62.79069900512695, "step": 10300 }, { - "epoch": 1.2400674211413436, - "eval_loss": 2.851064682006836, - "eval_runtime": 0.2139, - "eval_samples_per_second": 201.029, - "eval_steps_per_second": 4.675, + "epoch": 7.346647646219687, + "eval_loss": 1.9304108619689941, + "eval_runtime": 0.222, + "eval_samples_per_second": 193.684, + "eval_steps_per_second": 4.504, "step": 10300 }, { - "epoch": 1.241271370093908, - "grad_norm": 5.375, + "epoch": 7.353780313837375, + "grad_norm": 9.5625, "learning_rate": 6.529333333333333e-05, - "loss": 0.5576, + "loss": 0.7775, "step": 10310 }, { - "epoch": 1.2424753190464723, - "grad_norm": 9.9375, + "epoch": 7.360912981455064, + "grad_norm": 8.0625, "learning_rate": 6.52488888888889e-05, - "loss": 0.6111, + "loss": 0.7669, "step": 10320 }, { - "epoch": 1.243679267999037, - "grad_norm": 7.34375, + "epoch": 7.368045649072753, + "grad_norm": 7.0625, "learning_rate": 6.520444444444444e-05, - "loss": 0.6349, + "loss": 0.897, "step": 10330 }, { - "epoch": 1.2448832169516013, - "grad_norm": 7.8125, + "epoch": 7.375178316690442, + "grad_norm": 7.4375, "learning_rate": 6.515999999999999e-05, - "loss": 0.5117, + "loss": 0.7859, "step": 10340 }, { - "epoch": 1.2460871659041657, - "grad_norm": 9.9375, + "epoch": 7.382310984308131, + "grad_norm": 10.25, "learning_rate": 6.511555555555557e-05, - "loss": 0.5363, + "loss": 0.9447, "step": 10350 }, { - "epoch": 1.24729111485673, - "grad_norm": 10.0625, + "epoch": 7.38944365192582, + "grad_norm": 7.21875, "learning_rate": 6.507111111111112e-05, - "loss": 0.694, + "loss": 0.78, "step": 10360 }, { - "epoch": 1.2484950638092944, - "grad_norm": 9.625, + "epoch": 7.3965763195435095, + "grad_norm": 8.625, "learning_rate": 6.502666666666667e-05, - "loss": 0.5528, + "loss": 0.9362, "step": 10370 }, { - "epoch": 1.2496990127618588, - "grad_norm": 7.25, + "epoch": 7.403708987161198, + "grad_norm": 8.125, "learning_rate": 6.498222222222223e-05, - "loss": 0.5428, + "loss": 0.7343, "step": 10380 }, { - "epoch": 1.2509029617144234, - "grad_norm": 7.59375, + "epoch": 7.410841654778888, + "grad_norm": 8.125, "learning_rate": 6.493777777777779e-05, - "loss": 0.6291, + "loss": 0.8328, "step": 10390 }, { - "epoch": 1.2521069106669878, - "grad_norm": 7.28125, + "epoch": 7.417974322396576, + "grad_norm": 7.8125, "learning_rate": 6.489333333333334e-05, - "loss": 0.5882, + "loss": 0.8261, "step": 10400 }, { - "epoch": 1.2521069106669878, - "eval/acc": 46.511627197265625, + "epoch": 7.417974322396576, + "eval/acc": 62.79069900512695, "step": 10400 }, { - "epoch": 1.2521069106669878, - "eval_loss": 2.878549098968506, - "eval_runtime": 0.2109, - "eval_samples_per_second": 203.843, - "eval_steps_per_second": 4.741, + "epoch": 7.417974322396576, + "eval_loss": 1.9274901151657104, + "eval_runtime": 0.2217, + "eval_samples_per_second": 193.989, + "eval_steps_per_second": 4.511, "step": 10400 }, { - "epoch": 1.2533108596195521, - "grad_norm": 9.25, + "epoch": 7.425106990014266, + "grad_norm": 48.0, "learning_rate": 6.484888888888889e-05, - "loss": 0.5934, + "loss": 0.8167, "step": 10410 }, { - "epoch": 1.2545148085721165, - "grad_norm": 6.84375, + "epoch": 7.432239657631954, + "grad_norm": 8.4375, "learning_rate": 6.480444444444445e-05, - "loss": 0.714, + "loss": 0.8262, "step": 10420 }, { - "epoch": 1.2557187575246809, - "grad_norm": 8.375, + "epoch": 7.439372325249644, + "grad_norm": 6.90625, "learning_rate": 6.476e-05, - "loss": 0.7209, + "loss": 0.9254, "step": 10430 }, { - "epoch": 1.2569227064772455, - "grad_norm": 7.96875, + "epoch": 7.446504992867332, + "grad_norm": 8.5625, "learning_rate": 6.471555555555556e-05, - "loss": 0.6045, + "loss": 0.7657, "step": 10440 }, { - "epoch": 1.2581266554298098, - "grad_norm": 9.125, + "epoch": 7.453637660485022, + "grad_norm": 6.875, "learning_rate": 6.467111111111112e-05, - "loss": 0.632, + "loss": 0.8123, "step": 10450 }, { - "epoch": 1.2593306043823742, - "grad_norm": 6.9375, + "epoch": 7.4607703281027105, + "grad_norm": 8.5625, "learning_rate": 6.462666666666667e-05, - "loss": 0.5078, + "loss": 0.8951, "step": 10460 }, { - "epoch": 1.2605345533349386, - "grad_norm": 7.375, + "epoch": 7.467902995720399, + "grad_norm": 7.46875, "learning_rate": 6.458222222222222e-05, - "loss": 0.6352, + "loss": 0.8287, "step": 10470 }, { - "epoch": 1.261738502287503, - "grad_norm": 7.375, + "epoch": 7.4750356633380886, + "grad_norm": 6.28125, "learning_rate": 6.453777777777778e-05, - "loss": 0.6588, + "loss": 0.7364, "step": 10480 }, { - "epoch": 1.2629424512400673, - "grad_norm": 8.625, + "epoch": 7.482168330955777, + "grad_norm": 7.625, "learning_rate": 6.449333333333334e-05, - "loss": 0.6612, + "loss": 0.9265, "step": 10490 }, { - "epoch": 1.264146400192632, - "grad_norm": 6.78125, + "epoch": 7.489300998573467, + "grad_norm": 7.15625, "learning_rate": 6.444888888888889e-05, - "loss": 0.5578, + "loss": 0.7547, "step": 10500 }, { - "epoch": 1.264146400192632, - "eval/acc": 46.511627197265625, + "epoch": 7.489300998573467, + "eval/acc": 62.79069900512695, "step": 10500 }, { - "epoch": 1.264146400192632, - "eval_loss": 2.8044533729553223, - "eval_runtime": 0.2198, - "eval_samples_per_second": 195.595, - "eval_steps_per_second": 4.549, + "epoch": 7.489300998573467, + "eval_loss": 1.9239764213562012, + "eval_runtime": 0.2285, + "eval_samples_per_second": 188.187, + "eval_steps_per_second": 4.376, "step": 10500 }, { - "epoch": 1.2653503491451963, - "grad_norm": 7.0625, + "epoch": 7.496433666191155, + "grad_norm": 7.875, "learning_rate": 6.440444444444444e-05, - "loss": 0.5674, + "loss": 0.8612, "step": 10510 }, { - "epoch": 1.2665542980977607, - "grad_norm": 7.5, + "epoch": 7.503566333808845, + "grad_norm": 7.46875, "learning_rate": 6.436e-05, - "loss": 0.5692, + "loss": 0.8751, "step": 10520 }, { - "epoch": 1.267758247050325, - "grad_norm": 6.96875, + "epoch": 7.510699001426533, + "grad_norm": 6.78125, "learning_rate": 6.431555555555556e-05, - "loss": 0.5209, + "loss": 0.7706, "step": 10530 }, { - "epoch": 1.2689621960028894, - "grad_norm": 6.625, + "epoch": 7.517831669044223, + "grad_norm": 6.375, "learning_rate": 6.427111111111111e-05, - "loss": 0.7402, + "loss": 0.7602, "step": 10540 }, { - "epoch": 1.270166144955454, - "grad_norm": 8.5625, + "epoch": 7.5249643366619114, + "grad_norm": 7.1875, "learning_rate": 6.422666666666667e-05, - "loss": 0.6213, + "loss": 0.7953, "step": 10550 }, { - "epoch": 1.2713700939080184, - "grad_norm": 6.625, + "epoch": 7.532097004279601, + "grad_norm": 6.5, "learning_rate": 6.418222222222222e-05, - "loss": 0.587, + "loss": 0.871, "step": 10560 }, { - "epoch": 1.2725740428605827, - "grad_norm": 8.3125, + "epoch": 7.5392296718972895, + "grad_norm": 6.65625, "learning_rate": 6.413777777777778e-05, - "loss": 0.5949, + "loss": 0.7343, "step": 10570 }, { - "epoch": 1.273777991813147, - "grad_norm": 5.9375, + "epoch": 7.546362339514978, + "grad_norm": 6.3125, "learning_rate": 6.409333333333333e-05, - "loss": 0.5501, + "loss": 0.8275, "step": 10580 }, { - "epoch": 1.2749819407657115, - "grad_norm": 4.59375, + "epoch": 7.553495007132668, + "grad_norm": 6.125, "learning_rate": 6.40488888888889e-05, - "loss": 0.5145, + "loss": 0.8243, "step": 10590 }, { - "epoch": 1.2761858897182758, - "grad_norm": 8.6875, + "epoch": 7.560627674750357, + "grad_norm": 7.75, "learning_rate": 6.400444444444444e-05, - "loss": 0.6859, + "loss": 0.8731, "step": 10600 }, { - "epoch": 1.2761858897182758, - "eval/acc": 46.511627197265625, + "epoch": 7.560627674750357, + "eval/acc": 58.13953399658203, "step": 10600 }, { - "epoch": 1.2761858897182758, - "eval_loss": 2.836024045944214, - "eval_runtime": 0.2165, - "eval_samples_per_second": 198.581, - "eval_steps_per_second": 4.618, + "epoch": 7.560627674750357, + "eval_loss": 1.9751547574996948, + "eval_runtime": 0.2209, + "eval_samples_per_second": 194.634, + "eval_steps_per_second": 4.526, "step": 10600 }, { - "epoch": 1.2773898386708404, - "grad_norm": 7.6875, + "epoch": 7.567760342368046, + "grad_norm": 7.34375, "learning_rate": 6.396e-05, - "loss": 0.5479, + "loss": 0.7555, "step": 10610 }, { - "epoch": 1.2785937876234048, - "grad_norm": 6.46875, + "epoch": 7.574893009985734, + "grad_norm": 9.1875, "learning_rate": 6.391555555555557e-05, - "loss": 0.6267, + "loss": 0.7415, "step": 10620 }, { - "epoch": 1.2797977365759692, - "grad_norm": 7.8125, + "epoch": 7.582025677603424, + "grad_norm": 11.875, "learning_rate": 6.387111111111111e-05, - "loss": 0.6473, + "loss": 0.7363, "step": 10630 }, { - "epoch": 1.2810016855285336, - "grad_norm": 8.75, + "epoch": 7.589158345221112, + "grad_norm": 7.90625, "learning_rate": 6.382666666666666e-05, - "loss": 0.7012, + "loss": 0.858, "step": 10640 }, { - "epoch": 1.282205634481098, - "grad_norm": 7.0625, + "epoch": 7.596291012838802, + "grad_norm": 8.25, "learning_rate": 6.378222222222223e-05, - "loss": 0.6147, + "loss": 0.7934, "step": 10650 }, { - "epoch": 1.2834095834336625, - "grad_norm": 8.1875, + "epoch": 7.6034236804564905, + "grad_norm": 6.84375, "learning_rate": 6.373777777777779e-05, - "loss": 0.6508, + "loss": 0.7867, "step": 10660 }, { - "epoch": 1.2846135323862269, - "grad_norm": 7.21875, + "epoch": 7.61055634807418, + "grad_norm": 8.3125, "learning_rate": 6.369333333333334e-05, - "loss": 0.5718, + "loss": 0.8519, "step": 10670 }, { - "epoch": 1.2858174813387913, - "grad_norm": 6.40625, + "epoch": 7.617689015691869, + "grad_norm": 8.25, "learning_rate": 6.36488888888889e-05, - "loss": 0.6092, + "loss": 0.8771, "step": 10680 }, { - "epoch": 1.2870214302913556, - "grad_norm": 8.5625, + "epoch": 7.624821683309558, + "grad_norm": 6.1875, "learning_rate": 6.360444444444445e-05, - "loss": 0.6562, + "loss": 0.8483, "step": 10690 }, { - "epoch": 1.28822537924392, - "grad_norm": 6.71875, + "epoch": 7.631954350927247, + "grad_norm": 34.25, "learning_rate": 6.356000000000001e-05, - "loss": 0.5452, + "loss": 0.8799, "step": 10700 }, { - "epoch": 1.28822537924392, - "eval/acc": 41.27906799316406, + "epoch": 7.631954350927247, + "eval/acc": 62.79069900512695, "step": 10700 }, { - "epoch": 1.28822537924392, - "eval_loss": 2.846574306488037, - "eval_runtime": 0.2094, - "eval_samples_per_second": 205.316, - "eval_steps_per_second": 4.775, + "epoch": 7.631954350927247, + "eval_loss": 1.9270039796829224, + "eval_runtime": 0.2382, + "eval_samples_per_second": 180.545, + "eval_steps_per_second": 4.199, "step": 10700 }, { - "epoch": 1.2894293281964844, - "grad_norm": 7.09375, + "epoch": 7.639087018544936, + "grad_norm": 7.875, "learning_rate": 6.351555555555556e-05, - "loss": 0.5298, + "loss": 0.8321, "step": 10710 }, { - "epoch": 1.290633277149049, - "grad_norm": 7.8125, + "epoch": 7.646219686162625, + "grad_norm": 7.0, "learning_rate": 6.347111111111112e-05, - "loss": 0.6176, + "loss": 0.8192, "step": 10720 }, { - "epoch": 1.2918372261016133, - "grad_norm": 6.0, + "epoch": 7.653352353780313, + "grad_norm": 8.25, "learning_rate": 6.342666666666667e-05, - "loss": 0.527, + "loss": 0.7631, "step": 10730 }, { - "epoch": 1.2930411750541777, - "grad_norm": 20.0, + "epoch": 7.660485021398003, + "grad_norm": 7.1875, "learning_rate": 6.338222222222222e-05, - "loss": 0.6201, + "loss": 0.8088, "step": 10740 }, { - "epoch": 1.294245124006742, - "grad_norm": 8.25, + "epoch": 7.6676176890156915, + "grad_norm": 6.5, "learning_rate": 6.333777777777779e-05, - "loss": 0.6072, + "loss": 0.7612, "step": 10750 }, { - "epoch": 1.2954490729593064, - "grad_norm": 10.3125, + "epoch": 7.674750356633381, + "grad_norm": 7.5, "learning_rate": 6.329333333333334e-05, - "loss": 0.6123, + "loss": 0.8282, "step": 10760 }, { - "epoch": 1.296653021911871, - "grad_norm": 7.0625, + "epoch": 7.68188302425107, + "grad_norm": 6.46875, "learning_rate": 6.324888888888889e-05, - "loss": 0.5529, + "loss": 0.8197, "step": 10770 }, { - "epoch": 1.2978569708644354, - "grad_norm": 7.5625, + "epoch": 7.689015691868759, + "grad_norm": 12.0, "learning_rate": 6.320444444444445e-05, - "loss": 0.5527, + "loss": 0.8304, "step": 10780 }, { - "epoch": 1.2990609198169998, - "grad_norm": 9.375, + "epoch": 7.696148359486448, + "grad_norm": 7.875, "learning_rate": 6.316000000000001e-05, - "loss": 0.562, + "loss": 0.8242, "step": 10790 }, { - "epoch": 1.3002648687695642, - "grad_norm": 6.0, + "epoch": 7.703281027104137, + "grad_norm": 7.34375, "learning_rate": 6.311555555555556e-05, - "loss": 0.5706, + "loss": 0.7904, "step": 10800 }, { - "epoch": 1.3002648687695642, - "eval/acc": 39.53488540649414, + "epoch": 7.703281027104137, + "eval/acc": 60.46511459350586, "step": 10800 }, { - "epoch": 1.3002648687695642, - "eval_loss": 2.8325037956237793, - "eval_runtime": 0.9046, - "eval_samples_per_second": 47.536, - "eval_steps_per_second": 1.105, + "epoch": 7.703281027104137, + "eval_loss": 1.931999683380127, + "eval_runtime": 0.2226, + "eval_samples_per_second": 193.21, + "eval_steps_per_second": 4.493, "step": 10800 }, { - "epoch": 1.3014688177221285, - "grad_norm": 6.6875, + "epoch": 7.710413694721826, + "grad_norm": 8.625, "learning_rate": 6.307111111111111e-05, - "loss": 0.5263, + "loss": 0.861, "step": 10810 }, { - "epoch": 1.302672766674693, - "grad_norm": 7.3125, + "epoch": 7.717546362339515, + "grad_norm": 12.0, "learning_rate": 6.302666666666667e-05, - "loss": 0.5811, + "loss": 0.7917, "step": 10820 }, { - "epoch": 1.3038767156272573, - "grad_norm": 7.65625, + "epoch": 7.724679029957204, + "grad_norm": 6.5, "learning_rate": 6.298222222222222e-05, - "loss": 0.6056, + "loss": 0.709, "step": 10830 }, { - "epoch": 1.3050806645798219, - "grad_norm": 7.1875, + "epoch": 7.731811697574893, + "grad_norm": 6.96875, "learning_rate": 6.293777777777778e-05, - "loss": 0.625, + "loss": 0.8168, "step": 10840 }, { - "epoch": 1.3062846135323862, - "grad_norm": 8.0625, + "epoch": 7.738944365192582, + "grad_norm": 7.625, "learning_rate": 6.289333333333334e-05, - "loss": 0.5916, + "loss": 0.7357, "step": 10850 }, { - "epoch": 1.3074885624849506, - "grad_norm": 7.46875, + "epoch": 7.7460770328102715, + "grad_norm": 17.125, "learning_rate": 6.284888888888889e-05, - "loss": 0.5399, + "loss": 0.7115, "step": 10860 }, { - "epoch": 1.308692511437515, - "grad_norm": 6.21875, + "epoch": 7.75320970042796, + "grad_norm": 6.78125, "learning_rate": 6.280444444444444e-05, - "loss": 0.5895, + "loss": 0.6973, "step": 10870 }, { - "epoch": 1.3098964603900796, - "grad_norm": 15.5, + "epoch": 7.760342368045649, + "grad_norm": 6.75, "learning_rate": 6.276e-05, - "loss": 0.6447, + "loss": 0.7925, "step": 10880 }, { - "epoch": 1.311100409342644, - "grad_norm": 10.625, + "epoch": 7.767475035663338, + "grad_norm": 6.78125, "learning_rate": 6.271555555555556e-05, - "loss": 0.6577, + "loss": 0.7927, "step": 10890 }, { - "epoch": 1.3123043582952083, - "grad_norm": 8.0625, + "epoch": 7.774607703281027, + "grad_norm": 7.375, "learning_rate": 6.267111111111111e-05, - "loss": 0.6119, + "loss": 0.9383, "step": 10900 }, { - "epoch": 1.3123043582952083, - "eval/acc": 44.1860466003418, + "epoch": 7.774607703281027, + "eval/acc": 62.79069900512695, "step": 10900 }, { - "epoch": 1.3123043582952083, - "eval_loss": 2.8269896507263184, - "eval_runtime": 0.2116, - "eval_samples_per_second": 203.171, - "eval_steps_per_second": 4.725, + "epoch": 7.774607703281027, + "eval_loss": 1.947619915008545, + "eval_runtime": 0.2178, + "eval_samples_per_second": 197.442, + "eval_steps_per_second": 4.592, "step": 10900 }, { - "epoch": 1.3135083072477727, - "grad_norm": 6.21875, + "epoch": 7.781740370898716, + "grad_norm": 13.75, "learning_rate": 6.262666666666666e-05, - "loss": 0.5292, + "loss": 0.8463, "step": 10910 }, { - "epoch": 1.314712256200337, - "grad_norm": 7.5, + "epoch": 7.788873038516405, + "grad_norm": 10.875, "learning_rate": 6.258222222222222e-05, - "loss": 0.6174, + "loss": 0.7938, "step": 10920 }, { - "epoch": 1.3159162051529014, - "grad_norm": 7.71875, + "epoch": 7.796005706134094, + "grad_norm": 7.75, "learning_rate": 6.253777777777779e-05, - "loss": 0.6011, + "loss": 0.8174, "step": 10930 }, { - "epoch": 1.3171201541054658, - "grad_norm": 6.375, + "epoch": 7.803138373751783, + "grad_norm": 6.3125, "learning_rate": 6.249333333333333e-05, - "loss": 0.6249, + "loss": 0.7583, "step": 10940 }, { - "epoch": 1.3183241030580304, - "grad_norm": 7.1875, + "epoch": 7.8102710413694725, + "grad_norm": 6.625, "learning_rate": 6.24488888888889e-05, - "loss": 0.5995, + "loss": 0.7677, "step": 10950 }, { - "epoch": 1.3195280520105948, - "grad_norm": 6.46875, + "epoch": 7.817403708987161, + "grad_norm": 7.03125, "learning_rate": 6.240444444444444e-05, - "loss": 0.5469, + "loss": 0.8211, "step": 10960 }, { - "epoch": 1.3207320009631591, - "grad_norm": 8.1875, + "epoch": 7.824536376604851, + "grad_norm": 6.78125, "learning_rate": 6.236e-05, - "loss": 0.6817, + "loss": 0.8165, "step": 10970 }, { - "epoch": 1.3219359499157235, - "grad_norm": 8.875, + "epoch": 7.831669044222539, + "grad_norm": 7.25, "learning_rate": 6.231555555555555e-05, - "loss": 0.6015, + "loss": 0.8452, "step": 10980 }, { - "epoch": 1.323139898868288, - "grad_norm": 9.25, + "epoch": 7.838801711840228, + "grad_norm": 7.78125, "learning_rate": 6.227111111111112e-05, - "loss": 0.5469, + "loss": 0.7316, "step": 10990 }, { - "epoch": 1.3243438478208525, - "grad_norm": 5.4375, + "epoch": 7.845934379457917, + "grad_norm": 7.1875, "learning_rate": 6.222666666666666e-05, - "loss": 0.6355, + "loss": 0.7908, "step": 11000 }, { - "epoch": 1.3243438478208525, - "eval/acc": 44.1860466003418, + "epoch": 7.845934379457917, + "eval/acc": 60.46511459350586, "step": 11000 }, { - "epoch": 1.3243438478208525, - "eval_loss": 2.7861974239349365, - "eval_runtime": 0.2061, - "eval_samples_per_second": 208.615, - "eval_steps_per_second": 4.852, + "epoch": 7.845934379457917, + "eval_loss": 1.9402235746383667, + "eval_runtime": 0.2151, + "eval_samples_per_second": 199.927, + "eval_steps_per_second": 4.649, "step": 11000 }, { - "epoch": 1.3255477967734168, - "grad_norm": 6.5, + "epoch": 7.853067047075607, + "grad_norm": 7.3125, "learning_rate": 6.218222222222223e-05, - "loss": 0.6631, + "loss": 0.8226, "step": 11010 }, { - "epoch": 1.3267517457259812, - "grad_norm": 8.5625, + "epoch": 7.860199714693295, + "grad_norm": 6.1875, "learning_rate": 6.213777777777779e-05, - "loss": 0.7161, + "loss": 0.7946, "step": 11020 }, { - "epoch": 1.3279556946785456, - "grad_norm": 6.78125, + "epoch": 7.867332382310984, + "grad_norm": 6.21875, "learning_rate": 6.209333333333334e-05, - "loss": 0.5578, + "loss": 0.8494, "step": 11030 }, { - "epoch": 1.32915964363111, - "grad_norm": 9.0625, + "epoch": 7.8744650499286735, + "grad_norm": 6.875, "learning_rate": 6.204888888888889e-05, - "loss": 0.5986, + "loss": 0.7066, "step": 11040 }, { - "epoch": 1.3303635925836743, - "grad_norm": 10.25, + "epoch": 7.881597717546362, + "grad_norm": 6.375, "learning_rate": 6.200444444444445e-05, - "loss": 0.5198, + "loss": 0.8499, "step": 11050 }, { - "epoch": 1.331567541536239, - "grad_norm": 3.796875, + "epoch": 7.888730385164052, + "grad_norm": 8.0, "learning_rate": 6.196000000000001e-05, - "loss": 0.5459, + "loss": 0.8761, "step": 11060 }, { - "epoch": 1.3327714904888033, - "grad_norm": 7.125, + "epoch": 7.89586305278174, + "grad_norm": 5.75, "learning_rate": 6.191555555555556e-05, - "loss": 0.5896, + "loss": 0.8536, "step": 11070 }, { - "epoch": 1.3339754394413676, - "grad_norm": 7.34375, + "epoch": 7.90299572039943, + "grad_norm": 7.0, "learning_rate": 6.18711111111111e-05, - "loss": 0.5403, + "loss": 0.9413, "step": 11080 }, { - "epoch": 1.335179388393932, - "grad_norm": 9.125, + "epoch": 7.910128388017118, + "grad_norm": 8.0, "learning_rate": 6.182666666666667e-05, - "loss": 0.6377, + "loss": 0.7626, "step": 11090 }, { - "epoch": 1.3363833373464966, - "grad_norm": 8.8125, + "epoch": 7.917261055634808, + "grad_norm": 6.375, "learning_rate": 6.178222222222223e-05, - "loss": 0.6292, + "loss": 0.8177, "step": 11100 }, { - "epoch": 1.3363833373464966, - "eval/acc": 41.86046600341797, + "epoch": 7.917261055634808, + "eval/acc": 65.11627960205078, "step": 11100 }, { - "epoch": 1.3363833373464966, - "eval_loss": 2.8322744369506836, - "eval_runtime": 0.2115, - "eval_samples_per_second": 203.288, - "eval_steps_per_second": 4.728, + "epoch": 7.917261055634808, + "eval_loss": 1.8976689577102661, + "eval_runtime": 0.2399, + "eval_samples_per_second": 179.237, + "eval_steps_per_second": 4.168, "step": 11100 }, { - "epoch": 1.337587286299061, - "grad_norm": 6.28125, + "epoch": 7.924393723252496, + "grad_norm": 7.4375, "learning_rate": 6.173777777777778e-05, - "loss": 0.6421, + "loss": 0.8178, "step": 11110 }, { - "epoch": 1.3387912352516254, - "grad_norm": 6.21875, + "epoch": 7.931526390870186, + "grad_norm": 35.25, "learning_rate": 6.169333333333334e-05, - "loss": 0.6178, + "loss": 0.7931, "step": 11120 }, { - "epoch": 1.3399951842041897, - "grad_norm": 6.8125, + "epoch": 7.9386590584878745, + "grad_norm": 26.75, "learning_rate": 6.164888888888889e-05, - "loss": 0.704, + "loss": 0.7883, "step": 11130 }, { - "epoch": 1.341199133156754, - "grad_norm": 9.1875, + "epoch": 7.945791726105563, + "grad_norm": 6.375, "learning_rate": 6.160444444444444e-05, - "loss": 0.5763, + "loss": 0.7407, "step": 11140 }, { - "epoch": 1.3424030821093185, - "grad_norm": 9.625, + "epoch": 7.9529243937232525, + "grad_norm": 8.5, "learning_rate": 6.156e-05, - "loss": 0.6345, + "loss": 0.8509, "step": 11150 }, { - "epoch": 1.3436070310618828, - "grad_norm": 7.15625, + "epoch": 7.960057061340941, + "grad_norm": 7.34375, "learning_rate": 6.151555555555556e-05, - "loss": 0.5969, + "loss": 0.7948, "step": 11160 }, { - "epoch": 1.3448109800144474, - "grad_norm": 7.65625, + "epoch": 7.967189728958631, + "grad_norm": 5.90625, "learning_rate": 6.147111111111111e-05, - "loss": 0.6219, + "loss": 0.8066, "step": 11170 }, { - "epoch": 1.3460149289670118, - "grad_norm": 7.46875, + "epoch": 7.974322396576319, + "grad_norm": 6.8125, "learning_rate": 6.142666666666666e-05, - "loss": 0.5902, + "loss": 0.7545, "step": 11180 }, { - "epoch": 1.3472188779195762, - "grad_norm": 8.4375, + "epoch": 7.981455064194009, + "grad_norm": 7.40625, "learning_rate": 6.138222222222223e-05, - "loss": 0.6771, + "loss": 0.8842, "step": 11190 }, { - "epoch": 1.3484228268721405, - "grad_norm": 9.875, + "epoch": 7.988587731811697, + "grad_norm": 8.625, "learning_rate": 6.133777777777778e-05, - "loss": 0.5981, + "loss": 0.8874, "step": 11200 }, { - "epoch": 1.3484228268721405, - "eval/acc": 41.86046600341797, + "epoch": 7.988587731811697, + "eval/acc": 60.46511459350586, "step": 11200 }, { - "epoch": 1.3484228268721405, - "eval_loss": 2.8496346473693848, - "eval_runtime": 0.2147, - "eval_samples_per_second": 200.315, - "eval_steps_per_second": 4.658, + "epoch": 7.988587731811697, + "eval_loss": 1.9585436582565308, + "eval_runtime": 0.2145, + "eval_samples_per_second": 200.499, + "eval_steps_per_second": 4.663, "step": 11200 }, { - "epoch": 1.3496267758247051, - "grad_norm": 8.4375, + "epoch": 7.995720399429387, + "grad_norm": 10.125, "learning_rate": 6.129333333333333e-05, - "loss": 0.6014, + "loss": 0.8723, "step": 11210 }, { - "epoch": 1.3508307247772695, - "grad_norm": 6.625, + "epoch": 8.002853067047075, + "grad_norm": 6.375, "learning_rate": 6.12488888888889e-05, - "loss": 0.5484, + "loss": 0.7986, "step": 11220 }, { - "epoch": 1.3520346737298339, - "grad_norm": 8.8125, + "epoch": 8.009985734664765, + "grad_norm": 7.34375, "learning_rate": 6.120444444444444e-05, - "loss": 0.6505, + "loss": 0.8382, "step": 11230 }, { - "epoch": 1.3532386226823983, - "grad_norm": 9.0, + "epoch": 8.017118402282454, + "grad_norm": 6.21875, "learning_rate": 6.116e-05, - "loss": 0.7428, + "loss": 0.796, "step": 11240 }, { - "epoch": 1.3544425716349626, - "grad_norm": 6.03125, + "epoch": 8.024251069900142, + "grad_norm": 30.5, "learning_rate": 6.111555555555557e-05, - "loss": 0.5092, + "loss": 0.8541, "step": 11250 }, { - "epoch": 1.355646520587527, - "grad_norm": 8.375, + "epoch": 8.031383737517832, + "grad_norm": 7.90625, "learning_rate": 6.107111111111111e-05, - "loss": 0.6589, + "loss": 0.7689, "step": 11260 }, { - "epoch": 1.3568504695400914, - "grad_norm": 5.1875, + "epoch": 8.038516405135521, + "grad_norm": 10.375, "learning_rate": 6.102666666666666e-05, - "loss": 0.7026, + "loss": 0.803, "step": 11270 }, { - "epoch": 1.358054418492656, - "grad_norm": 19.125, + "epoch": 8.045649072753209, + "grad_norm": 8.3125, "learning_rate": 6.098222222222223e-05, - "loss": 0.6705, + "loss": 0.9584, "step": 11280 }, { - "epoch": 1.3592583674452203, - "grad_norm": 5.625, + "epoch": 8.052781740370898, + "grad_norm": 7.8125, "learning_rate": 6.093777777777778e-05, - "loss": 0.6484, + "loss": 0.761, "step": 11290 }, { - "epoch": 1.3604623163977847, - "grad_norm": 9.9375, + "epoch": 8.059914407988588, + "grad_norm": 9.125, "learning_rate": 6.0893333333333335e-05, - "loss": 0.5762, + "loss": 0.7506, "step": 11300 }, { - "epoch": 1.3604623163977847, - "eval/acc": 47.67441940307617, + "epoch": 8.059914407988588, + "eval/acc": 48.83720779418945, "step": 11300 }, { - "epoch": 1.3604623163977847, - "eval_loss": 2.804438591003418, - "eval_runtime": 0.2105, - "eval_samples_per_second": 204.261, - "eval_steps_per_second": 4.75, + "epoch": 8.059914407988588, + "eval_loss": 2.348471164703369, + "eval_runtime": 0.9666, + "eval_samples_per_second": 44.484, + "eval_steps_per_second": 1.035, "step": 11300 }, { - "epoch": 1.361666265350349, - "grad_norm": 11.4375, + "epoch": 8.067047075606277, + "grad_norm": 9.0625, "learning_rate": 6.084888888888889e-05, - "loss": 0.63, + "loss": 0.7246, "step": 11310 }, { - "epoch": 1.3628702143029137, - "grad_norm": 7.625, + "epoch": 8.074179743223965, + "grad_norm": 24.5, "learning_rate": 6.080444444444445e-05, - "loss": 0.5893, + "loss": 0.8399, "step": 11320 }, { - "epoch": 1.364074163255478, - "grad_norm": 7.5625, + "epoch": 8.081312410841655, + "grad_norm": 8.0625, "learning_rate": 6.076000000000001e-05, - "loss": 0.6789, + "loss": 0.8196, "step": 11330 }, { - "epoch": 1.3652781122080424, - "grad_norm": 6.875, + "epoch": 8.088445078459344, + "grad_norm": 7.5625, "learning_rate": 6.0715555555555556e-05, - "loss": 0.5739, + "loss": 0.7496, "step": 11340 }, { - "epoch": 1.3664820611606068, - "grad_norm": 7.9375, + "epoch": 8.095577746077034, + "grad_norm": 10.6875, "learning_rate": 6.067111111111111e-05, - "loss": 0.593, + "loss": 0.791, "step": 11350 }, { - "epoch": 1.3676860101131711, - "grad_norm": 6.03125, + "epoch": 8.102710413694721, + "grad_norm": 7.28125, "learning_rate": 6.062666666666667e-05, - "loss": 0.6003, + "loss": 0.7064, "step": 11360 }, { - "epoch": 1.3688899590657355, - "grad_norm": 7.21875, + "epoch": 8.10984308131241, + "grad_norm": 7.28125, "learning_rate": 6.058222222222223e-05, - "loss": 0.6658, + "loss": 0.8306, "step": 11370 }, { - "epoch": 1.3700939080182999, - "grad_norm": 6.25, + "epoch": 8.1169757489301, + "grad_norm": 7.84375, "learning_rate": 6.0537777777777784e-05, - "loss": 0.5438, + "loss": 0.8394, "step": 11380 }, { - "epoch": 1.3712978569708645, - "grad_norm": 7.21875, + "epoch": 8.12410841654779, + "grad_norm": 6.5625, "learning_rate": 6.049333333333333e-05, - "loss": 0.5269, + "loss": 0.789, "step": 11390 }, { - "epoch": 1.3725018059234289, - "grad_norm": 8.875, + "epoch": 8.131241084165477, + "grad_norm": 7.125, "learning_rate": 6.044888888888889e-05, - "loss": 0.5357, + "loss": 0.7752, "step": 11400 }, { - "epoch": 1.3725018059234289, - "eval/acc": 45.930233001708984, + "epoch": 8.131241084165477, + "eval/acc": 48.83720779418945, "step": 11400 }, { - "epoch": 1.3725018059234289, - "eval_loss": 2.79295015335083, - "eval_runtime": 0.5452, - "eval_samples_per_second": 78.877, - "eval_steps_per_second": 1.834, + "epoch": 8.131241084165477, + "eval_loss": 2.3455872535705566, + "eval_runtime": 0.2144, + "eval_samples_per_second": 200.559, + "eval_steps_per_second": 4.664, "step": 11400 }, { - "epoch": 1.3737057548759932, - "grad_norm": 6.375, + "epoch": 8.138373751783167, + "grad_norm": 6.75, "learning_rate": 6.040444444444445e-05, - "loss": 0.6255, + "loss": 0.7773, "step": 11410 }, { - "epoch": 1.3749097038285576, - "grad_norm": 6.96875, + "epoch": 8.145506419400856, + "grad_norm": 7.8125, "learning_rate": 6.0360000000000005e-05, - "loss": 0.6689, + "loss": 0.7369, "step": 11420 }, { - "epoch": 1.3761136527811222, - "grad_norm": 7.28125, + "epoch": 8.152639087018544, + "grad_norm": 6.4375, "learning_rate": 6.031555555555556e-05, - "loss": 0.5944, + "loss": 0.8158, "step": 11430 }, { - "epoch": 1.3773176017336866, - "grad_norm": 5.90625, + "epoch": 8.159771754636234, + "grad_norm": 7.53125, "learning_rate": 6.027111111111111e-05, - "loss": 0.6054, + "loss": 0.874, "step": 11440 }, { - "epoch": 1.378521550686251, - "grad_norm": 7.09375, + "epoch": 8.166904422253923, + "grad_norm": 8.0625, "learning_rate": 6.0226666666666664e-05, - "loss": 0.5204, + "loss": 0.7564, "step": 11450 }, { - "epoch": 1.3797254996388153, - "grad_norm": 8.125, + "epoch": 8.174037089871613, + "grad_norm": 6.65625, "learning_rate": 6.0182222222222226e-05, - "loss": 0.5088, + "loss": 0.8675, "step": 11460 }, { - "epoch": 1.3809294485913797, - "grad_norm": 7.96875, + "epoch": 8.1811697574893, + "grad_norm": 7.34375, "learning_rate": 6.013777777777778e-05, - "loss": 0.5873, + "loss": 0.8338, "step": 11470 }, { - "epoch": 1.382133397543944, - "grad_norm": 7.8125, + "epoch": 8.18830242510699, + "grad_norm": 8.75, "learning_rate": 6.0093333333333336e-05, - "loss": 0.5889, + "loss": 0.7316, "step": 11480 }, { - "epoch": 1.3833373464965084, - "grad_norm": 7.3125, + "epoch": 8.19543509272468, + "grad_norm": 8.625, "learning_rate": 6.0048888888888885e-05, - "loss": 0.6799, + "loss": 0.8842, "step": 11490 }, { - "epoch": 1.384541295449073, - "grad_norm": 17.0, + "epoch": 8.202567760342369, + "grad_norm": 11.3125, "learning_rate": 6.0004444444444453e-05, - "loss": 0.5965, + "loss": 0.7852, "step": 11500 }, { - "epoch": 1.384541295449073, - "eval/acc": 46.511627197265625, + "epoch": 8.202567760342369, + "eval/acc": 48.83720779418945, "step": 11500 }, { - "epoch": 1.384541295449073, - "eval_loss": 2.8261971473693848, - "eval_runtime": 1.1762, - "eval_samples_per_second": 36.557, - "eval_steps_per_second": 0.85, + "epoch": 8.202567760342369, + "eval_loss": 2.352907657623291, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.909, + "eval_steps_per_second": 4.719, "step": 11500 }, { - "epoch": 1.3857452444016374, - "grad_norm": 7.5, + "epoch": 8.209700427960057, + "grad_norm": 7.9375, "learning_rate": 5.996e-05, - "loss": 0.5685, + "loss": 0.7898, "step": 11510 }, { - "epoch": 1.3869491933542017, - "grad_norm": 7.8125, + "epoch": 8.216833095577746, + "grad_norm": 7.21875, "learning_rate": 5.991555555555556e-05, - "loss": 0.6299, + "loss": 0.7728, "step": 11520 }, { - "epoch": 1.3881531423067661, - "grad_norm": 6.34375, + "epoch": 8.223965763195435, + "grad_norm": 8.1875, "learning_rate": 5.987111111111111e-05, - "loss": 0.6243, + "loss": 0.7456, "step": 11530 }, { - "epoch": 1.3893570912593307, - "grad_norm": 7.84375, + "epoch": 8.231098430813125, + "grad_norm": 7.1875, "learning_rate": 5.982666666666666e-05, - "loss": 0.6387, + "loss": 0.8461, "step": 11540 }, { - "epoch": 1.390561040211895, - "grad_norm": 7.5, + "epoch": 8.238231098430813, + "grad_norm": 7.9375, "learning_rate": 5.978222222222223e-05, - "loss": 0.6561, + "loss": 0.7297, "step": 11550 }, { - "epoch": 1.3917649891644595, - "grad_norm": 9.125, + "epoch": 8.245363766048502, + "grad_norm": 6.75, "learning_rate": 5.973777777777778e-05, - "loss": 0.6064, + "loss": 0.8327, "step": 11560 }, { - "epoch": 1.3929689381170238, - "grad_norm": 8.3125, + "epoch": 8.252496433666192, + "grad_norm": 6.1875, "learning_rate": 5.969333333333333e-05, - "loss": 0.6107, + "loss": 0.8054, "step": 11570 }, { - "epoch": 1.3941728870695882, - "grad_norm": 8.0, + "epoch": 8.25962910128388, + "grad_norm": 8.375, "learning_rate": 5.964888888888889e-05, - "loss": 0.7101, + "loss": 0.7853, "step": 11580 }, { - "epoch": 1.3953768360221526, - "grad_norm": 4.84375, + "epoch": 8.266761768901569, + "grad_norm": 8.9375, "learning_rate": 5.960444444444445e-05, - "loss": 0.5981, + "loss": 0.7891, "step": 11590 }, { - "epoch": 1.396580784974717, - "grad_norm": 5.25, + "epoch": 8.273894436519258, + "grad_norm": 7.8125, "learning_rate": 5.9560000000000006e-05, - "loss": 0.5498, + "loss": 0.7407, "step": 11600 }, { - "epoch": 1.396580784974717, - "eval/acc": 44.1860466003418, + "epoch": 8.273894436519258, + "eval/acc": 55.8139533996582, "step": 11600 }, { - "epoch": 1.396580784974717, - "eval_loss": 2.822185516357422, - "eval_runtime": 2.963, - "eval_samples_per_second": 14.512, - "eval_steps_per_second": 0.337, + "epoch": 8.273894436519258, + "eval_loss": 2.3408679962158203, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.913, + "eval_steps_per_second": 4.719, "step": 11600 }, { - "epoch": 1.3977847339272815, - "grad_norm": 10.875, + "epoch": 8.281027104136948, + "grad_norm": 16.5, "learning_rate": 5.951555555555556e-05, - "loss": 0.65, + "loss": 0.7662, "step": 11610 }, { - "epoch": 1.398988682879846, - "grad_norm": 8.375, + "epoch": 8.288159771754636, + "grad_norm": 9.1875, "learning_rate": 5.947111111111111e-05, - "loss": 0.604, + "loss": 0.8136, "step": 11620 }, { - "epoch": 1.4001926318324103, - "grad_norm": 7.21875, + "epoch": 8.295292439372325, + "grad_norm": 6.5, "learning_rate": 5.942666666666668e-05, - "loss": 0.5246, + "loss": 0.8833, "step": 11630 }, { - "epoch": 1.4013965807849746, - "grad_norm": 7.8125, + "epoch": 8.302425106990015, + "grad_norm": 9.8125, "learning_rate": 5.938222222222223e-05, - "loss": 0.5189, + "loss": 0.7388, "step": 11640 }, { - "epoch": 1.4026005297375392, - "grad_norm": 4.78125, + "epoch": 8.309557774607704, + "grad_norm": 8.375, "learning_rate": 5.933777777777778e-05, - "loss": 0.5959, + "loss": 0.687, "step": 11650 }, { - "epoch": 1.4038044786901036, - "grad_norm": 8.875, + "epoch": 8.316690442225392, + "grad_norm": 6.75, "learning_rate": 5.929333333333334e-05, - "loss": 0.5887, + "loss": 0.7731, "step": 11660 }, { - "epoch": 1.405008427642668, - "grad_norm": 8.625, + "epoch": 8.323823109843081, + "grad_norm": 5.875, "learning_rate": 5.9248888888888886e-05, - "loss": 0.5538, + "loss": 0.8294, "step": 11670 }, { - "epoch": 1.4062123765952323, - "grad_norm": 5.8125, + "epoch": 8.33095577746077, + "grad_norm": 7.25, "learning_rate": 5.9204444444444454e-05, - "loss": 0.5945, + "loss": 0.8312, "step": 11680 }, { - "epoch": 1.4074163255477967, - "grad_norm": 6.9375, + "epoch": 8.338088445078458, + "grad_norm": 6.15625, "learning_rate": 5.916e-05, - "loss": 0.5444, + "loss": 0.8745, "step": 11690 }, { - "epoch": 1.408620274500361, - "grad_norm": 5.375, + "epoch": 8.345221112696148, + "grad_norm": 11.9375, "learning_rate": 5.911555555555556e-05, - "loss": 0.5762, + "loss": 0.8136, "step": 11700 }, { - "epoch": 1.408620274500361, - "eval/acc": 45.930233001708984, + "epoch": 8.345221112696148, + "eval/acc": 53.488372802734375, "step": 11700 }, { - "epoch": 1.408620274500361, - "eval_loss": 2.837869167327881, - "eval_runtime": 4.5845, - "eval_samples_per_second": 9.38, - "eval_steps_per_second": 0.218, + "epoch": 8.345221112696148, + "eval_loss": 2.348762273788452, + "eval_runtime": 1.1232, + "eval_samples_per_second": 38.285, + "eval_steps_per_second": 0.89, "step": 11700 }, { - "epoch": 1.4098242234529255, - "grad_norm": 6.90625, + "epoch": 8.352353780313837, + "grad_norm": 7.6875, "learning_rate": 5.907111111111111e-05, - "loss": 0.5918, + "loss": 0.8979, "step": 11710 }, { - "epoch": 1.41102817240549, - "grad_norm": 12.375, + "epoch": 8.359486447931527, + "grad_norm": 7.75, "learning_rate": 5.9026666666666675e-05, - "loss": 0.5955, + "loss": 0.7527, "step": 11720 }, { - "epoch": 1.4122321213580544, - "grad_norm": 13.4375, + "epoch": 8.366619115549215, + "grad_norm": 7.75, "learning_rate": 5.898222222222223e-05, - "loss": 0.5389, + "loss": 0.7397, "step": 11730 }, { - "epoch": 1.4134360703106188, - "grad_norm": 8.5625, + "epoch": 8.373751783166904, + "grad_norm": 7.125, "learning_rate": 5.893777777777778e-05, - "loss": 0.5927, + "loss": 0.7371, "step": 11740 }, { - "epoch": 1.4146400192631832, - "grad_norm": 7.625, + "epoch": 8.380884450784594, + "grad_norm": 7.09375, "learning_rate": 5.8893333333333334e-05, - "loss": 0.5979, + "loss": 0.7787, "step": 11750 }, { - "epoch": 1.4158439682157478, - "grad_norm": 6.5625, + "epoch": 8.388017118402283, + "grad_norm": 12.75, "learning_rate": 5.884888888888889e-05, - "loss": 0.4657, + "loss": 0.7745, "step": 11760 }, { - "epoch": 1.4170479171683121, - "grad_norm": 7.5, + "epoch": 8.39514978601997, + "grad_norm": 5.96875, "learning_rate": 5.880444444444445e-05, - "loss": 0.6833, + "loss": 0.7675, "step": 11770 }, { - "epoch": 1.4182518661208765, - "grad_norm": 12.5, + "epoch": 8.40228245363766, + "grad_norm": 7.28125, "learning_rate": 5.876000000000001e-05, - "loss": 0.6065, + "loss": 0.7369, "step": 11780 }, { - "epoch": 1.4194558150734409, - "grad_norm": 8.6875, + "epoch": 8.40941512125535, + "grad_norm": 8.5625, "learning_rate": 5.8715555555555555e-05, - "loss": 0.6406, + "loss": 0.7679, "step": 11790 }, { - "epoch": 1.4206597640260052, - "grad_norm": 7.625, + "epoch": 8.41654778887304, + "grad_norm": 6.09375, "learning_rate": 5.867111111111111e-05, - "loss": 0.54, + "loss": 0.7575, "step": 11800 }, { - "epoch": 1.4206597640260052, - "eval/acc": 44.1860466003418, + "epoch": 8.41654778887304, + "eval/acc": 48.83720779418945, "step": 11800 }, { - "epoch": 1.4206597640260052, - "eval_loss": 2.8732845783233643, - "eval_runtime": 0.2132, - "eval_samples_per_second": 201.731, - "eval_steps_per_second": 4.691, + "epoch": 8.41654778887304, + "eval_loss": 2.3886027336120605, + "eval_runtime": 0.2153, + "eval_samples_per_second": 199.752, + "eval_steps_per_second": 4.645, "step": 11800 }, { - "epoch": 1.4218637129785696, - "grad_norm": 13.25, + "epoch": 8.423680456490727, + "grad_norm": 7.75, "learning_rate": 5.862666666666667e-05, - "loss": 0.6655, + "loss": 0.7837, "step": 11810 }, { - "epoch": 1.423067661931134, - "grad_norm": 8.8125, + "epoch": 8.430813124108417, + "grad_norm": 7.03125, "learning_rate": 5.858222222222223e-05, - "loss": 0.5967, + "loss": 0.7153, "step": 11820 }, { - "epoch": 1.4242716108836986, - "grad_norm": 5.84375, + "epoch": 8.437945791726106, + "grad_norm": 9.3125, "learning_rate": 5.853777777777778e-05, - "loss": 0.6698, + "loss": 0.7655, "step": 11830 }, { - "epoch": 1.425475559836263, - "grad_norm": 5.375, + "epoch": 8.445078459343794, + "grad_norm": 7.34375, "learning_rate": 5.849333333333333e-05, - "loss": 0.5963, + "loss": 0.761, "step": 11840 }, { - "epoch": 1.4266795087888273, - "grad_norm": 6.625, + "epoch": 8.452211126961483, + "grad_norm": 8.875, "learning_rate": 5.8448888888888886e-05, - "loss": 0.5941, + "loss": 0.7985, "step": 11850 }, { - "epoch": 1.4278834577413917, - "grad_norm": 6.90625, + "epoch": 8.459343794579173, + "grad_norm": 6.96875, "learning_rate": 5.840444444444445e-05, - "loss": 0.6464, + "loss": 0.7208, "step": 11860 }, { - "epoch": 1.4290874066939563, - "grad_norm": 8.5625, + "epoch": 8.466476462196862, + "grad_norm": 6.25, "learning_rate": 5.8360000000000004e-05, - "loss": 0.5185, + "loss": 0.8474, "step": 11870 }, { - "epoch": 1.4302913556465207, - "grad_norm": 5.46875, + "epoch": 8.47360912981455, + "grad_norm": 5.5625, "learning_rate": 5.831555555555556e-05, - "loss": 0.6194, + "loss": 0.773, "step": 11880 }, { - "epoch": 1.431495304599085, - "grad_norm": 7.03125, + "epoch": 8.48074179743224, + "grad_norm": 19.125, "learning_rate": 5.827111111111111e-05, - "loss": 0.5993, + "loss": 0.7026, "step": 11890 }, { - "epoch": 1.4326992535516494, - "grad_norm": 8.25, + "epoch": 8.487874465049929, + "grad_norm": 8.4375, "learning_rate": 5.8226666666666676e-05, - "loss": 0.5726, + "loss": 0.7825, "step": 11900 }, { - "epoch": 1.4326992535516494, - "eval/acc": 44.1860466003418, + "epoch": 8.487874465049929, + "eval/acc": 48.83720779418945, "step": 11900 }, { - "epoch": 1.4326992535516494, - "eval_loss": 2.9054577350616455, - "eval_runtime": 0.2116, - "eval_samples_per_second": 203.241, - "eval_steps_per_second": 4.727, + "epoch": 8.487874465049929, + "eval_loss": 2.395317316055298, + "eval_runtime": 0.2104, + "eval_samples_per_second": 204.361, + "eval_steps_per_second": 4.753, "step": 11900 }, { - "epoch": 1.4339032025042138, - "grad_norm": 6.4375, + "epoch": 8.495007132667618, + "grad_norm": 8.5625, "learning_rate": 5.8182222222222225e-05, - "loss": 0.5805, + "loss": 0.8574, "step": 11910 }, { - "epoch": 1.4351071514567781, - "grad_norm": 9.5, + "epoch": 8.502139800285306, + "grad_norm": 8.0, "learning_rate": 5.813777777777778e-05, - "loss": 0.604, + "loss": 0.8031, "step": 11920 }, { - "epoch": 1.4363111004093425, - "grad_norm": 5.0, + "epoch": 8.509272467902996, + "grad_norm": 8.125, "learning_rate": 5.8093333333333335e-05, - "loss": 0.4871, + "loss": 0.8578, "step": 11930 }, { - "epoch": 1.437515049361907, - "grad_norm": 7.6875, + "epoch": 8.516405135520685, + "grad_norm": 8.3125, "learning_rate": 5.80488888888889e-05, - "loss": 0.5968, + "loss": 0.854, "step": 11940 }, { - "epoch": 1.4387189983144715, - "grad_norm": 7.0625, + "epoch": 8.523537803138375, + "grad_norm": 23.5, "learning_rate": 5.800444444444445e-05, - "loss": 0.5715, + "loss": 0.8375, "step": 11950 }, { - "epoch": 1.4399229472670358, - "grad_norm": 8.4375, + "epoch": 8.530670470756062, + "grad_norm": 6.625, "learning_rate": 5.796e-05, - "loss": 0.6258, + "loss": 0.7793, "step": 11960 }, { - "epoch": 1.4411268962196002, - "grad_norm": 7.78125, + "epoch": 8.537803138373752, + "grad_norm": 36.25, "learning_rate": 5.7915555555555556e-05, - "loss": 0.6474, + "loss": 0.7395, "step": 11970 }, { - "epoch": 1.4423308451721648, - "grad_norm": 6.625, + "epoch": 8.544935805991441, + "grad_norm": 7.96875, "learning_rate": 5.787111111111111e-05, - "loss": 0.6148, + "loss": 0.8492, "step": 11980 }, { - "epoch": 1.4435347941247292, - "grad_norm": 6.5625, + "epoch": 8.552068473609129, + "grad_norm": 7.3125, "learning_rate": 5.782666666666667e-05, - "loss": 0.5533, + "loss": 0.7591, "step": 11990 }, { - "epoch": 1.4447387430772936, - "grad_norm": 11.0625, + "epoch": 8.559201141226819, + "grad_norm": 13.75, "learning_rate": 5.778222222222223e-05, - "loss": 0.5756, + "loss": 0.7175, "step": 12000 }, { - "epoch": 1.4447387430772936, - "eval/acc": 46.511627197265625, + "epoch": 8.559201141226819, + "eval/acc": 48.83720779418945, "step": 12000 }, { - "epoch": 1.4447387430772936, - "eval_loss": 2.8763856887817383, - "eval_runtime": 0.2112, - "eval_samples_per_second": 203.613, - "eval_steps_per_second": 4.735, + "epoch": 8.559201141226819, + "eval_loss": 2.375894069671631, + "eval_runtime": 0.2121, + "eval_samples_per_second": 202.777, + "eval_steps_per_second": 4.716, "step": 12000 }, { - "epoch": 1.445942692029858, - "grad_norm": 10.625, + "epoch": 8.566333808844508, + "grad_norm": 8.375, "learning_rate": 5.773777777777778e-05, - "loss": 0.6164, + "loss": 0.8009, "step": 12010 }, { - "epoch": 1.4471466409824223, - "grad_norm": 8.3125, + "epoch": 8.573466476462198, + "grad_norm": 10.375, "learning_rate": 5.769333333333333e-05, - "loss": 0.5638, + "loss": 0.7651, "step": 12020 }, { - "epoch": 1.4483505899349867, - "grad_norm": 7.25, + "epoch": 8.580599144079885, + "grad_norm": 10.5, "learning_rate": 5.7648888888888894e-05, - "loss": 0.6082, + "loss": 0.7947, "step": 12030 }, { - "epoch": 1.449554538887551, - "grad_norm": 6.65625, + "epoch": 8.587731811697575, + "grad_norm": 9.375, "learning_rate": 5.760444444444445e-05, - "loss": 0.5113, + "loss": 0.8377, "step": 12040 }, { - "epoch": 1.4507584878401156, - "grad_norm": 8.5625, + "epoch": 8.594864479315264, + "grad_norm": 7.0, "learning_rate": 5.7560000000000005e-05, - "loss": 0.5966, + "loss": 0.7803, "step": 12050 }, { - "epoch": 1.45196243679268, - "grad_norm": 9.0625, + "epoch": 8.601997146932954, + "grad_norm": 7.03125, "learning_rate": 5.751555555555555e-05, - "loss": 0.4791, + "loss": 0.7129, "step": 12060 }, { - "epoch": 1.4531663857452444, - "grad_norm": 7.8125, + "epoch": 8.609129814550641, + "grad_norm": 9.6875, "learning_rate": 5.747111111111111e-05, - "loss": 0.5999, + "loss": 0.9395, "step": 12070 }, { - "epoch": 1.4543703346978087, - "grad_norm": 7.96875, + "epoch": 8.616262482168331, + "grad_norm": 7.1875, "learning_rate": 5.742666666666667e-05, - "loss": 0.5942, + "loss": 0.8461, "step": 12080 }, { - "epoch": 1.4555742836503733, - "grad_norm": 100.0, + "epoch": 8.62339514978602, + "grad_norm": 8.3125, "learning_rate": 5.7382222222222225e-05, - "loss": 0.6591, + "loss": 0.8533, "step": 12090 }, { - "epoch": 1.4567782326029377, - "grad_norm": 10.625, + "epoch": 8.63052781740371, + "grad_norm": 7.75, "learning_rate": 5.733777777777778e-05, - "loss": 0.5924, + "loss": 0.7819, "step": 12100 }, { - "epoch": 1.4567782326029377, - "eval/acc": 45.930233001708984, + "epoch": 8.63052781740371, + "eval/acc": 46.511627197265625, "step": 12100 }, { - "epoch": 1.4567782326029377, - "eval_loss": 2.9057791233062744, - "eval_runtime": 0.2182, - "eval_samples_per_second": 197.109, - "eval_steps_per_second": 4.584, + "epoch": 8.63052781740371, + "eval_loss": 2.367159605026245, + "eval_runtime": 0.35, + "eval_samples_per_second": 122.848, + "eval_steps_per_second": 2.857, "step": 12100 }, { - "epoch": 1.457982181555502, - "grad_norm": 6.21875, + "epoch": 8.637660485021398, + "grad_norm": 8.1875, "learning_rate": 5.729333333333333e-05, - "loss": 0.6354, + "loss": 0.8752, "step": 12110 }, { - "epoch": 1.4591861305080664, - "grad_norm": 7.59375, + "epoch": 8.644793152639087, + "grad_norm": 5.6875, "learning_rate": 5.72488888888889e-05, - "loss": 0.5777, + "loss": 0.8182, "step": 12120 }, { - "epoch": 1.4603900794606308, + "epoch": 8.651925820256777, "grad_norm": 7.09375, "learning_rate": 5.7204444444444446e-05, - "loss": 0.6189, + "loss": 0.8116, "step": 12130 }, { - "epoch": 1.4615940284131952, - "grad_norm": 7.84375, + "epoch": 8.659058487874464, + "grad_norm": 7.65625, "learning_rate": 5.716e-05, - "loss": 0.5161, + "loss": 0.7563, "step": 12140 }, { - "epoch": 1.4627979773657596, - "grad_norm": 9.25, + "epoch": 8.666191155492154, + "grad_norm": 20.75, "learning_rate": 5.711555555555556e-05, - "loss": 0.6892, + "loss": 0.6896, "step": 12150 }, { - "epoch": 1.4640019263183242, - "grad_norm": 7.875, + "epoch": 8.673323823109843, + "grad_norm": 9.25, "learning_rate": 5.7071111111111105e-05, - "loss": 0.4845, + "loss": 0.8233, "step": 12160 }, { - "epoch": 1.4652058752708885, - "grad_norm": 6.625, + "epoch": 8.680456490727533, + "grad_norm": 11.0625, "learning_rate": 5.7026666666666674e-05, - "loss": 0.6342, + "loss": 0.8978, "step": 12170 }, { - "epoch": 1.466409824223453, - "grad_norm": 7.03125, + "epoch": 8.68758915834522, + "grad_norm": 8.1875, "learning_rate": 5.698222222222222e-05, - "loss": 0.5427, + "loss": 0.7671, "step": 12180 }, { - "epoch": 1.4676137731760173, - "grad_norm": 6.84375, + "epoch": 8.69472182596291, + "grad_norm": 13.0625, "learning_rate": 5.693777777777778e-05, - "loss": 0.5672, + "loss": 0.7771, "step": 12190 }, { - "epoch": 1.4688177221285819, - "grad_norm": 9.0, + "epoch": 8.7018544935806, + "grad_norm": 8.25, "learning_rate": 5.689333333333333e-05, - "loss": 0.6318, + "loss": 0.758, "step": 12200 }, { - "epoch": 1.4688177221285819, - "eval/acc": 45.930233001708984, + "epoch": 8.7018544935806, + "eval/acc": 46.511627197265625, "step": 12200 }, { - "epoch": 1.4688177221285819, - "eval_loss": 2.8778676986694336, - "eval_runtime": 0.2132, - "eval_samples_per_second": 201.649, - "eval_steps_per_second": 4.69, + "epoch": 8.7018544935806, + "eval_loss": 2.3872835636138916, + "eval_runtime": 0.2139, + "eval_samples_per_second": 201.006, + "eval_steps_per_second": 4.675, "step": 12200 }, { - "epoch": 1.4700216710811462, - "grad_norm": 7.4375, + "epoch": 8.708987161198289, + "grad_norm": 6.5, "learning_rate": 5.6848888888888895e-05, - "loss": 0.5589, + "loss": 0.8066, "step": 12210 }, { - "epoch": 1.4712256200337106, - "grad_norm": 9.1875, + "epoch": 8.716119828815977, + "grad_norm": 7.21875, "learning_rate": 5.680444444444445e-05, - "loss": 0.6005, + "loss": 0.8287, "step": 12220 }, { - "epoch": 1.472429568986275, - "grad_norm": 7.875, + "epoch": 8.723252496433666, + "grad_norm": 10.625, "learning_rate": 5.6760000000000005e-05, - "loss": 0.5573, + "loss": 0.917, "step": 12230 }, { - "epoch": 1.4736335179388393, - "grad_norm": 7.71875, + "epoch": 8.730385164051356, + "grad_norm": 9.6875, "learning_rate": 5.6715555555555554e-05, - "loss": 0.5431, + "loss": 0.8417, "step": 12240 }, { - "epoch": 1.4748374668914037, - "grad_norm": 9.0625, + "epoch": 8.737517831669045, + "grad_norm": 8.6875, "learning_rate": 5.6671111111111116e-05, - "loss": 0.5939, + "loss": 0.8405, "step": 12250 }, { - "epoch": 1.476041415843968, - "grad_norm": 9.875, + "epoch": 8.744650499286733, + "grad_norm": 6.875, "learning_rate": 5.662666666666667e-05, - "loss": 0.5894, + "loss": 0.7838, "step": 12260 }, { - "epoch": 1.4772453647965327, - "grad_norm": 6.34375, + "epoch": 8.751783166904422, + "grad_norm": 6.25, "learning_rate": 5.6582222222222226e-05, - "loss": 0.5602, + "loss": 0.6897, "step": 12270 }, { - "epoch": 1.478449313749097, - "grad_norm": 7.34375, + "epoch": 8.758915834522112, + "grad_norm": 7.375, "learning_rate": 5.653777777777778e-05, - "loss": 0.6093, + "loss": 0.7716, "step": 12280 }, { - "epoch": 1.4796532627016614, - "grad_norm": 7.65625, + "epoch": 8.7660485021398, + "grad_norm": 7.96875, "learning_rate": 5.649333333333333e-05, - "loss": 0.5755, + "loss": 0.8497, "step": 12290 }, { - "epoch": 1.4808572116542258, - "grad_norm": 7.15625, + "epoch": 8.773181169757489, + "grad_norm": 7.75, "learning_rate": 5.64488888888889e-05, - "loss": 0.5593, + "loss": 0.747, "step": 12300 }, { - "epoch": 1.4808572116542258, - "eval/acc": 46.511627197265625, + "epoch": 8.773181169757489, + "eval/acc": 48.83720779418945, "step": 12300 }, { - "epoch": 1.4808572116542258, - "eval_loss": 2.8776164054870605, - "eval_runtime": 1.0978, - "eval_samples_per_second": 39.169, - "eval_steps_per_second": 0.911, + "epoch": 8.773181169757489, + "eval_loss": 2.3708367347717285, + "eval_runtime": 0.2183, + "eval_samples_per_second": 197.001, + "eval_steps_per_second": 4.581, "step": 12300 }, { - "epoch": 1.4820611606067904, - "grad_norm": 6.96875, + "epoch": 8.780313837375179, + "grad_norm": 7.28125, "learning_rate": 5.640444444444445e-05, - "loss": 0.6139, + "loss": 0.8225, "step": 12310 }, { - "epoch": 1.4832651095593548, - "grad_norm": 6.375, + "epoch": 8.787446504992868, + "grad_norm": 6.8125, "learning_rate": 5.636e-05, - "loss": 0.5185, + "loss": 0.684, "step": 12320 }, { - "epoch": 1.4844690585119191, - "grad_norm": 10.1875, + "epoch": 8.794579172610556, + "grad_norm": 5.84375, "learning_rate": 5.631555555555556e-05, - "loss": 0.5106, + "loss": 0.8008, "step": 12330 }, { - "epoch": 1.4856730074644835, - "grad_norm": 8.875, + "epoch": 8.801711840228245, + "grad_norm": 6.8125, "learning_rate": 5.627111111111112e-05, - "loss": 0.6202, + "loss": 0.7119, "step": 12340 }, { - "epoch": 1.4868769564170479, - "grad_norm": 7.90625, + "epoch": 8.808844507845935, + "grad_norm": 7.625, "learning_rate": 5.6226666666666675e-05, - "loss": 0.5785, + "loss": 0.7878, "step": 12350 }, { - "epoch": 1.4880809053696122, - "grad_norm": 7.625, + "epoch": 8.815977175463622, + "grad_norm": 6.5625, "learning_rate": 5.6182222222222223e-05, - "loss": 0.5529, + "loss": 0.8389, "step": 12360 }, { - "epoch": 1.4892848543221766, - "grad_norm": 6.53125, + "epoch": 8.823109843081312, + "grad_norm": 7.8125, "learning_rate": 5.613777777777778e-05, - "loss": 0.5533, + "loss": 0.8858, "step": 12370 }, { - "epoch": 1.4904888032747412, - "grad_norm": 7.09375, + "epoch": 8.830242510699001, + "grad_norm": 7.0, "learning_rate": 5.6093333333333334e-05, - "loss": 0.6117, + "loss": 0.797, "step": 12380 }, { - "epoch": 1.4916927522273056, - "grad_norm": 6.59375, + "epoch": 8.837375178316691, + "grad_norm": 8.125, "learning_rate": 5.6048888888888896e-05, - "loss": 0.602, + "loss": 0.7154, "step": 12390 }, { - "epoch": 1.49289670117987, - "grad_norm": 10.8125, + "epoch": 8.844507845934379, + "grad_norm": 6.59375, "learning_rate": 5.600444444444445e-05, - "loss": 0.5845, + "loss": 0.8543, "step": 12400 }, { - "epoch": 1.49289670117987, + "epoch": 8.844507845934379, "eval/acc": 46.511627197265625, "step": 12400 }, { - "epoch": 1.49289670117987, - "eval_loss": 2.860626697540283, - "eval_runtime": 0.2153, - "eval_samples_per_second": 199.684, - "eval_steps_per_second": 4.644, + "epoch": 8.844507845934379, + "eval_loss": 2.3827686309814453, + "eval_runtime": 0.2144, + "eval_samples_per_second": 200.553, + "eval_steps_per_second": 4.664, "step": 12400 }, { - "epoch": 1.4941006501324343, - "grad_norm": 7.40625, + "epoch": 8.851640513552068, + "grad_norm": 8.8125, "learning_rate": 5.596e-05, - "loss": 0.5725, + "loss": 0.8071, "step": 12410 }, { - "epoch": 1.495304599084999, - "grad_norm": 7.28125, + "epoch": 8.858773181169758, + "grad_norm": 6.0625, "learning_rate": 5.5915555555555555e-05, - "loss": 0.6233, + "loss": 0.7174, "step": 12420 }, { - "epoch": 1.4965085480375633, - "grad_norm": 7.375, + "epoch": 8.865905848787447, + "grad_norm": 9.8125, "learning_rate": 5.587111111111112e-05, - "loss": 0.6094, + "loss": 0.861, "step": 12430 }, { - "epoch": 1.4977124969901277, - "grad_norm": 8.8125, + "epoch": 8.873038516405135, + "grad_norm": 8.0, "learning_rate": 5.582666666666667e-05, - "loss": 0.6249, + "loss": 0.831, "step": 12440 }, { - "epoch": 1.498916445942692, - "grad_norm": 14.125, + "epoch": 8.880171184022824, + "grad_norm": 5.21875, "learning_rate": 5.578222222222223e-05, - "loss": 0.5612, + "loss": 0.7814, "step": 12450 }, { - "epoch": 1.5001203948952564, - "grad_norm": 9.125, + "epoch": 8.887303851640514, + "grad_norm": 6.78125, "learning_rate": 5.5737777777777776e-05, - "loss": 0.6067, + "loss": 0.6926, "step": 12460 }, { - "epoch": 1.5013243438478208, - "grad_norm": 7.875, + "epoch": 8.894436519258203, + "grad_norm": 8.6875, "learning_rate": 5.569333333333333e-05, - "loss": 0.5496, + "loss": 0.7977, "step": 12470 }, { - "epoch": 1.5025282928003851, - "grad_norm": 9.3125, + "epoch": 8.901569186875891, + "grad_norm": 6.5625, "learning_rate": 5.564888888888889e-05, - "loss": 0.5547, + "loss": 0.7647, "step": 12480 }, { - "epoch": 1.5037322417529497, - "grad_norm": 6.375, + "epoch": 8.90870185449358, + "grad_norm": 10.875, "learning_rate": 5.560444444444445e-05, - "loss": 0.5596, + "loss": 0.8469, "step": 12490 }, { - "epoch": 1.504936190705514, - "grad_norm": 8.125, + "epoch": 8.91583452211127, + "grad_norm": 12.0625, "learning_rate": 5.556e-05, - "loss": 0.5604, + "loss": 0.9152, "step": 12500 }, { - "epoch": 1.504936190705514, - "eval/acc": 41.86046600341797, + "epoch": 8.91583452211127, + "eval/acc": 46.511627197265625, "step": 12500 }, { - "epoch": 1.504936190705514, - "eval_loss": 2.8973793983459473, - "eval_runtime": 0.2142, - "eval_samples_per_second": 200.739, - "eval_steps_per_second": 4.668, + "epoch": 8.91583452211127, + "eval_loss": 2.3970413208007812, + "eval_runtime": 0.2133, + "eval_samples_per_second": 201.589, + "eval_steps_per_second": 4.688, "step": 12500 }, { - "epoch": 1.5061401396580785, - "grad_norm": 6.875, + "epoch": 8.922967189728958, + "grad_norm": 9.875, "learning_rate": 5.551555555555555e-05, - "loss": 0.5898, + "loss": 0.8202, "step": 12510 }, { - "epoch": 1.507344088610643, - "grad_norm": 10.6875, + "epoch": 8.930099857346647, + "grad_norm": 7.625, "learning_rate": 5.547111111111112e-05, - "loss": 0.5065, + "loss": 0.8159, "step": 12520 }, { - "epoch": 1.5085480375632074, - "grad_norm": 8.6875, + "epoch": 8.937232524964337, + "grad_norm": 7.875, "learning_rate": 5.542666666666667e-05, - "loss": 0.6214, + "loss": 0.684, "step": 12530 }, { - "epoch": 1.5097519865157718, - "grad_norm": 7.8125, + "epoch": 8.944365192582026, + "grad_norm": 6.59375, "learning_rate": 5.5382222222222224e-05, - "loss": 0.5012, + "loss": 0.7629, "step": 12540 }, { - "epoch": 1.5109559354683362, - "grad_norm": 7.25, + "epoch": 8.951497860199714, + "grad_norm": 6.90625, "learning_rate": 5.533777777777778e-05, - "loss": 0.5807, + "loss": 0.8227, "step": 12550 }, { - "epoch": 1.5121598844209005, - "grad_norm": 8.625, + "epoch": 8.958630527817403, + "grad_norm": 6.3125, "learning_rate": 5.529333333333334e-05, - "loss": 0.6293, + "loss": 0.8235, "step": 12560 }, { - "epoch": 1.513363833373465, - "grad_norm": 8.125, + "epoch": 8.965763195435093, + "grad_norm": 6.5, "learning_rate": 5.52488888888889e-05, - "loss": 0.5367, + "loss": 0.7865, "step": 12570 }, { - "epoch": 1.5145677823260293, - "grad_norm": 6.53125, + "epoch": 8.972895863052782, + "grad_norm": 5.875, "learning_rate": 5.5204444444444445e-05, - "loss": 0.6308, + "loss": 0.7331, "step": 12580 }, { - "epoch": 1.5157717312785937, - "grad_norm": 6.09375, + "epoch": 8.98002853067047, + "grad_norm": 7.15625, "learning_rate": 5.516e-05, - "loss": 0.571, + "loss": 0.8498, "step": 12590 }, { - "epoch": 1.5169756802311583, - "grad_norm": 9.625, + "epoch": 8.98716119828816, + "grad_norm": 7.75, "learning_rate": 5.5115555555555556e-05, - "loss": 0.6378, + "loss": 0.7825, "step": 12600 }, { - "epoch": 1.5169756802311583, - "eval/acc": 44.1860466003418, + "epoch": 8.98716119828816, + "eval/acc": 51.16279220581055, "step": 12600 }, { - "epoch": 1.5169756802311583, - "eval_loss": 2.856049060821533, - "eval_runtime": 0.2162, - "eval_samples_per_second": 198.899, - "eval_steps_per_second": 4.626, + "epoch": 8.98716119828816, + "eval_loss": 2.3282017707824707, + "eval_runtime": 0.2152, + "eval_samples_per_second": 199.842, + "eval_steps_per_second": 4.647, "step": 12600 }, { - "epoch": 1.5181796291837226, - "grad_norm": 8.0, + "epoch": 8.99429386590585, + "grad_norm": 7.0, "learning_rate": 5.507111111111112e-05, - "loss": 0.5873, + "loss": 0.8485, "step": 12610 }, { - "epoch": 1.519383578136287, - "grad_norm": 6.75, + "epoch": 9.001426533523539, + "grad_norm": 8.1875, "learning_rate": 5.502666666666667e-05, - "loss": 0.6427, + "loss": 0.8691, "step": 12620 }, { - "epoch": 1.5205875270888516, - "grad_norm": 9.1875, + "epoch": 9.008559201141226, + "grad_norm": 8.875, "learning_rate": 5.498222222222222e-05, - "loss": 0.6036, + "loss": 0.8085, "step": 12630 }, { - "epoch": 1.521791476041416, - "grad_norm": 6.3125, + "epoch": 9.015691868758916, + "grad_norm": 10.875, "learning_rate": 5.4937777777777777e-05, - "loss": 0.6524, + "loss": 0.7221, "step": 12640 }, { - "epoch": 1.5229954249939803, - "grad_norm": 4.6875, + "epoch": 9.022824536376605, + "grad_norm": 7.9375, "learning_rate": 5.489333333333334e-05, - "loss": 0.5334, + "loss": 0.8136, "step": 12650 }, { - "epoch": 1.5241993739465447, - "grad_norm": 11.5, + "epoch": 9.029957203994293, + "grad_norm": 6.78125, "learning_rate": 5.4848888888888894e-05, - "loss": 0.5485, + "loss": 0.6211, "step": 12660 }, { - "epoch": 1.525403322899109, - "grad_norm": 7.21875, + "epoch": 9.037089871611983, + "grad_norm": 7.09375, "learning_rate": 5.480444444444445e-05, - "loss": 0.646, + "loss": 0.7893, "step": 12670 }, { - "epoch": 1.5266072718516734, - "grad_norm": 7.5625, + "epoch": 9.044222539229672, + "grad_norm": 7.375, "learning_rate": 5.476e-05, - "loss": 0.5385, + "loss": 0.9348, "step": 12680 }, { - "epoch": 1.5278112208042378, - "grad_norm": 8.375, + "epoch": 9.051355206847362, + "grad_norm": 7.09375, "learning_rate": 5.471555555555555e-05, - "loss": 0.503, + "loss": 0.9088, "step": 12690 }, { - "epoch": 1.5290151697568022, - "grad_norm": 13.0625, + "epoch": 9.05848787446505, + "grad_norm": 7.9375, "learning_rate": 5.4671111111111115e-05, - "loss": 0.5886, + "loss": 0.8116, "step": 12700 }, { - "epoch": 1.5290151697568022, - "eval/acc": 44.76744079589844, + "epoch": 9.05848787446505, + "eval/acc": 32.55813980102539, "step": 12700 }, { - "epoch": 1.5290151697568022, - "eval_loss": 2.8569490909576416, - "eval_runtime": 0.2181, - "eval_samples_per_second": 197.134, - "eval_steps_per_second": 4.585, + "epoch": 9.05848787446505, + "eval_loss": 3.3768653869628906, + "eval_runtime": 1.089, + "eval_samples_per_second": 39.487, + "eval_steps_per_second": 0.918, "step": 12700 }, { - "epoch": 1.5302191187093668, - "grad_norm": 8.6875, + "epoch": 9.065620542082739, + "grad_norm": 7.875, "learning_rate": 5.462666666666667e-05, - "loss": 0.7241, + "loss": 0.7748, "step": 12710 }, { - "epoch": 1.5314230676619311, - "grad_norm": 8.1875, + "epoch": 9.072753209700428, + "grad_norm": 6.96875, "learning_rate": 5.4582222222222225e-05, - "loss": 0.5965, + "loss": 0.872, "step": 12720 }, { - "epoch": 1.5326270166144955, - "grad_norm": 6.96875, + "epoch": 9.079885877318118, + "grad_norm": 7.59375, "learning_rate": 5.4537777777777774e-05, - "loss": 0.5195, + "loss": 0.8539, "step": 12730 }, { - "epoch": 1.5338309655670601, - "grad_norm": 6.65625, + "epoch": 9.087018544935805, + "grad_norm": 6.9375, "learning_rate": 5.449333333333334e-05, - "loss": 0.5949, + "loss": 0.784, "step": 12740 }, { - "epoch": 1.5350349145196245, - "grad_norm": 10.25, + "epoch": 9.094151212553495, + "grad_norm": 6.5625, "learning_rate": 5.444888888888889e-05, - "loss": 0.5962, + "loss": 0.7998, "step": 12750 }, { - "epoch": 1.5362388634721889, - "grad_norm": 10.8125, + "epoch": 9.101283880171184, + "grad_norm": 7.0625, "learning_rate": 5.4404444444444446e-05, - "loss": 0.6544, + "loss": 0.8213, "step": 12760 }, { - "epoch": 1.5374428124247532, - "grad_norm": 9.8125, + "epoch": 9.108416547788874, + "grad_norm": 5.9375, "learning_rate": 5.436e-05, - "loss": 0.5681, + "loss": 0.8233, "step": 12770 }, { - "epoch": 1.5386467613773176, - "grad_norm": 7.875, + "epoch": 9.115549215406562, + "grad_norm": 6.53125, "learning_rate": 5.431555555555555e-05, - "loss": 0.5944, + "loss": 0.7617, "step": 12780 }, { - "epoch": 1.539850710329882, - "grad_norm": 6.75, + "epoch": 9.122681883024251, + "grad_norm": 7.3125, "learning_rate": 5.427111111111112e-05, - "loss": 0.5141, + "loss": 0.8139, "step": 12790 }, { - "epoch": 1.5410546592824463, - "grad_norm": 6.96875, + "epoch": 9.12981455064194, + "grad_norm": 7.625, "learning_rate": 5.422666666666667e-05, - "loss": 0.627, + "loss": 0.7742, "step": 12800 }, { - "epoch": 1.5410546592824463, - "eval/acc": 44.1860466003418, + "epoch": 9.12981455064194, + "eval/acc": 34.88372039794922, "step": 12800 }, { - "epoch": 1.5410546592824463, - "eval_loss": 2.8678698539733887, - "eval_runtime": 0.2124, - "eval_samples_per_second": 202.486, - "eval_steps_per_second": 4.709, + "epoch": 9.12981455064194, + "eval_loss": 3.372913122177124, + "eval_runtime": 0.2597, + "eval_samples_per_second": 165.6, + "eval_steps_per_second": 3.851, "step": 12800 }, { - "epoch": 1.5422586082350107, - "grad_norm": 5.1875, + "epoch": 9.136947218259628, + "grad_norm": 7.90625, "learning_rate": 5.418222222222222e-05, - "loss": 0.601, + "loss": 0.8071, "step": 12810 }, { - "epoch": 1.5434625571875753, - "grad_norm": 10.0625, + "epoch": 9.144079885877318, + "grad_norm": 6.5625, "learning_rate": 5.413777777777778e-05, - "loss": 0.6409, + "loss": 0.7691, "step": 12820 }, { - "epoch": 1.5446665061401397, - "grad_norm": 6.21875, + "epoch": 9.151212553495007, + "grad_norm": 8.375, "learning_rate": 5.409333333333334e-05, - "loss": 0.6065, + "loss": 0.8105, "step": 12830 }, { - "epoch": 1.545870455092704, - "grad_norm": 7.125, + "epoch": 9.158345221112697, + "grad_norm": 7.5, "learning_rate": 5.4048888888888895e-05, - "loss": 0.5369, + "loss": 0.83, "step": 12840 }, { - "epoch": 1.5470744040452686, - "grad_norm": 8.4375, + "epoch": 9.165477888730384, + "grad_norm": 7.21875, "learning_rate": 5.400444444444444e-05, - "loss": 0.6577, + "loss": 0.8158, "step": 12850 }, { - "epoch": 1.548278352997833, - "grad_norm": 7.09375, + "epoch": 9.172610556348074, + "grad_norm": 8.0625, "learning_rate": 5.396e-05, - "loss": 0.6092, + "loss": 0.7359, "step": 12860 }, { - "epoch": 1.5494823019503974, - "grad_norm": 6.5625, + "epoch": 9.179743223965763, + "grad_norm": 7.21875, "learning_rate": 5.391555555555556e-05, - "loss": 0.7309, + "loss": 0.7797, "step": 12870 }, { - "epoch": 1.5506862509029617, - "grad_norm": 6.96875, + "epoch": 9.186875891583453, + "grad_norm": 11.8125, "learning_rate": 5.3871111111111116e-05, - "loss": 0.6047, + "loss": 0.8005, "step": 12880 }, { - "epoch": 1.5518901998555261, - "grad_norm": 8.125, + "epoch": 9.19400855920114, + "grad_norm": 14.0, "learning_rate": 5.382666666666667e-05, - "loss": 0.6257, + "loss": 0.8764, "step": 12890 }, { - "epoch": 1.5530941488080905, - "grad_norm": 7.4375, + "epoch": 9.20114122681883, + "grad_norm": 6.96875, "learning_rate": 5.3782222222222226e-05, - "loss": 0.5871, + "loss": 0.6898, "step": 12900 }, { - "epoch": 1.5530941488080905, - "eval/acc": 44.1860466003418, + "epoch": 9.20114122681883, + "eval/acc": 32.55813980102539, "step": 12900 }, { - "epoch": 1.5530941488080905, - "eval_loss": 2.8619837760925293, - "eval_runtime": 0.2172, - "eval_samples_per_second": 197.932, - "eval_steps_per_second": 4.603, + "epoch": 9.20114122681883, + "eval_loss": 3.383354425430298, + "eval_runtime": 0.2453, + "eval_samples_per_second": 175.301, + "eval_steps_per_second": 4.077, "step": 12900 }, { - "epoch": 1.5542980977606549, - "grad_norm": 9.8125, + "epoch": 9.20827389443652, + "grad_norm": 8.125, "learning_rate": 5.3737777777777775e-05, - "loss": 0.5404, + "loss": 0.8066, "step": 12910 }, { - "epoch": 1.5555020467132192, - "grad_norm": 7.4375, + "epoch": 9.21540656205421, + "grad_norm": 7.96875, "learning_rate": 5.369333333333334e-05, - "loss": 0.5922, + "loss": 0.7809, "step": 12920 }, { - "epoch": 1.5567059956657838, - "grad_norm": 6.53125, + "epoch": 9.222539229671897, + "grad_norm": 7.15625, "learning_rate": 5.364888888888889e-05, - "loss": 0.6612, + "loss": 0.7242, "step": 12930 }, { - "epoch": 1.5579099446183482, - "grad_norm": 8.25, + "epoch": 9.229671897289586, + "grad_norm": 7.65625, "learning_rate": 5.360444444444445e-05, - "loss": 0.6073, + "loss": 0.8201, "step": 12940 }, { - "epoch": 1.5591138935709126, - "grad_norm": 8.875, + "epoch": 9.236804564907276, + "grad_norm": 8.75, "learning_rate": 5.356e-05, - "loss": 0.609, + "loss": 0.8531, "step": 12950 }, { - "epoch": 1.5603178425234772, - "grad_norm": 7.03125, + "epoch": 9.243937232524964, + "grad_norm": 7.3125, "learning_rate": 5.3515555555555564e-05, - "loss": 0.5725, + "loss": 0.8004, "step": 12960 }, { - "epoch": 1.5615217914760415, - "grad_norm": 11.375, + "epoch": 9.251069900142653, + "grad_norm": 9.1875, "learning_rate": 5.347111111111112e-05, - "loss": 0.6808, + "loss": 0.8026, "step": 12970 }, { - "epoch": 1.562725740428606, - "grad_norm": 8.4375, + "epoch": 9.258202567760343, + "grad_norm": 8.75, "learning_rate": 5.342666666666667e-05, - "loss": 0.6652, + "loss": 0.9001, "step": 12980 }, { - "epoch": 1.5639296893811703, - "grad_norm": 12.625, + "epoch": 9.265335235378032, + "grad_norm": 6.75, "learning_rate": 5.338222222222222e-05, - "loss": 0.6361, + "loss": 0.8698, "step": 12990 }, { - "epoch": 1.5651336383337346, - "grad_norm": 5.875, + "epoch": 9.27246790299572, + "grad_norm": 5.75, "learning_rate": 5.333777777777778e-05, - "loss": 0.539, + "loss": 0.7668, "step": 13000 }, { - "epoch": 1.5651336383337346, - "eval/acc": 42.44186019897461, + "epoch": 9.27246790299572, + "eval/acc": 34.88372039794922, "step": 13000 }, { - "epoch": 1.5651336383337346, - "eval_loss": 2.877701759338379, - "eval_runtime": 0.2098, - "eval_samples_per_second": 204.994, - "eval_steps_per_second": 4.767, + "epoch": 9.27246790299572, + "eval_loss": 3.3350794315338135, + "eval_runtime": 0.2244, + "eval_samples_per_second": 191.644, + "eval_steps_per_second": 4.457, "step": 13000 }, { - "epoch": 1.566337587286299, - "grad_norm": 9.1875, + "epoch": 9.27960057061341, + "grad_norm": 7.90625, "learning_rate": 5.329333333333334e-05, - "loss": 0.5718, + "loss": 0.8908, "step": 13010 }, { - "epoch": 1.5675415362388634, - "grad_norm": 6.78125, + "epoch": 9.286733238231099, + "grad_norm": 7.78125, "learning_rate": 5.3248888888888896e-05, - "loss": 0.6074, + "loss": 0.76, "step": 13020 }, { - "epoch": 1.5687454851914278, - "grad_norm": 7.28125, + "epoch": 9.293865905848788, + "grad_norm": 22.125, "learning_rate": 5.3204444444444444e-05, - "loss": 0.5788, + "loss": 0.8205, "step": 13030 }, { - "epoch": 1.5699494341439921, - "grad_norm": 6.78125, + "epoch": 9.300998573466476, + "grad_norm": 9.6875, "learning_rate": 5.316e-05, - "loss": 0.6445, + "loss": 0.7528, "step": 13040 }, { - "epoch": 1.5711533830965567, - "grad_norm": 7.34375, + "epoch": 9.308131241084165, + "grad_norm": 6.8125, "learning_rate": 5.311555555555556e-05, - "loss": 0.6391, + "loss": 0.8987, "step": 13050 }, { - "epoch": 1.572357332049121, - "grad_norm": 8.375, + "epoch": 9.315263908701855, + "grad_norm": 7.71875, "learning_rate": 5.3071111111111116e-05, - "loss": 0.6101, + "loss": 0.8056, "step": 13060 }, { - "epoch": 1.5735612810016857, - "grad_norm": 9.6875, + "epoch": 9.322396576319543, + "grad_norm": 6.78125, "learning_rate": 5.302666666666667e-05, - "loss": 0.6029, + "loss": 0.7962, "step": 13070 }, { - "epoch": 1.57476522995425, - "grad_norm": 7.5625, + "epoch": 9.329529243937232, + "grad_norm": 6.3125, "learning_rate": 5.298222222222222e-05, - "loss": 0.6034, + "loss": 0.846, "step": 13080 }, { - "epoch": 1.5759691789068144, - "grad_norm": 6.90625, + "epoch": 9.336661911554922, + "grad_norm": 8.75, "learning_rate": 5.2937777777777775e-05, - "loss": 0.629, + "loss": 0.8005, "step": 13090 }, { - "epoch": 1.5771731278593788, - "grad_norm": 8.0625, + "epoch": 9.343794579172611, + "grad_norm": 27.25, "learning_rate": 5.289333333333334e-05, - "loss": 0.5272, + "loss": 0.7313, "step": 13100 }, { - "epoch": 1.5771731278593788, - "eval/acc": 42.44186019897461, + "epoch": 9.343794579172611, + "eval/acc": 32.55813980102539, "step": 13100 }, { - "epoch": 1.5771731278593788, - "eval_loss": 2.883392095565796, - "eval_runtime": 0.2099, - "eval_samples_per_second": 204.824, - "eval_steps_per_second": 4.763, + "epoch": 9.343794579172611, + "eval_loss": 3.3405187129974365, + "eval_runtime": 0.2462, + "eval_samples_per_second": 174.636, + "eval_steps_per_second": 4.061, "step": 13100 }, { - "epoch": 1.5783770768119432, - "grad_norm": 7.46875, + "epoch": 9.350927246790299, + "grad_norm": 7.5625, "learning_rate": 5.284888888888889e-05, - "loss": 0.6273, + "loss": 0.8474, "step": 13110 }, { - "epoch": 1.5795810257645075, - "grad_norm": 7.90625, + "epoch": 9.358059914407988, + "grad_norm": 7.25, "learning_rate": 5.280444444444445e-05, - "loss": 0.6752, + "loss": 0.8104, "step": 13120 }, { - "epoch": 1.580784974717072, - "grad_norm": 9.6875, + "epoch": 9.365192582025678, + "grad_norm": 7.71875, "learning_rate": 5.2759999999999996e-05, - "loss": 0.5769, + "loss": 0.8638, "step": 13130 }, { - "epoch": 1.5819889236696363, - "grad_norm": 8.1875, + "epoch": 9.372325249643367, + "grad_norm": 8.25, "learning_rate": 5.2715555555555565e-05, - "loss": 0.5469, + "loss": 0.7968, "step": 13140 }, { - "epoch": 1.5831928726222007, - "grad_norm": 4.96875, + "epoch": 9.379457917261055, + "grad_norm": 9.4375, "learning_rate": 5.2671111111111114e-05, - "loss": 0.5805, + "loss": 0.692, "step": 13150 }, { - "epoch": 1.5843968215747652, - "grad_norm": 6.65625, + "epoch": 9.386590584878745, + "grad_norm": 6.1875, "learning_rate": 5.262666666666667e-05, - "loss": 0.527, + "loss": 0.8222, "step": 13160 }, { - "epoch": 1.5856007705273296, - "grad_norm": 9.375, + "epoch": 9.393723252496434, + "grad_norm": 9.9375, "learning_rate": 5.2582222222222224e-05, - "loss": 0.5363, + "loss": 0.8494, "step": 13170 }, { - "epoch": 1.5868047194798942, - "grad_norm": 6.15625, + "epoch": 9.400855920114124, + "grad_norm": 8.0, "learning_rate": 5.2537777777777786e-05, - "loss": 0.5795, + "loss": 0.8254, "step": 13180 }, { - "epoch": 1.5880086684324586, - "grad_norm": 6.59375, + "epoch": 9.407988587731811, + "grad_norm": 7.375, "learning_rate": 5.249333333333334e-05, - "loss": 0.567, + "loss": 0.8771, "step": 13190 }, { - "epoch": 1.589212617385023, - "grad_norm": 7.46875, + "epoch": 9.4151212553495, + "grad_norm": 7.34375, "learning_rate": 5.244888888888889e-05, - "loss": 0.5838, + "loss": 0.8563, "step": 13200 }, { - "epoch": 1.589212617385023, - "eval/acc": 44.76744079589844, + "epoch": 9.4151212553495, + "eval/acc": 37.20930099487305, "step": 13200 }, { - "epoch": 1.589212617385023, - "eval_loss": 2.885694742202759, - "eval_runtime": 0.2136, - "eval_samples_per_second": 201.314, - "eval_steps_per_second": 4.682, + "epoch": 9.4151212553495, + "eval_loss": 3.293537139892578, + "eval_runtime": 0.219, + "eval_samples_per_second": 196.361, + "eval_steps_per_second": 4.567, "step": 13200 }, { - "epoch": 1.5904165663375873, - "grad_norm": 7.625, + "epoch": 9.42225392296719, + "grad_norm": 7.1875, "learning_rate": 5.2404444444444445e-05, - "loss": 0.5836, + "loss": 0.769, "step": 13210 }, { - "epoch": 1.5916205152901517, - "grad_norm": 7.28125, + "epoch": 9.429386590584878, + "grad_norm": 8.5, "learning_rate": 5.236e-05, - "loss": 0.6374, + "loss": 0.778, "step": 13220 }, { - "epoch": 1.592824464242716, - "grad_norm": 5.59375, + "epoch": 9.436519258202567, + "grad_norm": 7.6875, "learning_rate": 5.231555555555556e-05, - "loss": 0.5608, + "loss": 0.8043, "step": 13230 }, { - "epoch": 1.5940284131952804, - "grad_norm": 6.15625, + "epoch": 9.443651925820257, + "grad_norm": 7.59375, "learning_rate": 5.227111111111112e-05, - "loss": 0.6031, + "loss": 0.7962, "step": 13240 }, { - "epoch": 1.5952323621478448, - "grad_norm": 6.84375, + "epoch": 9.450784593437946, + "grad_norm": 9.6875, "learning_rate": 5.2226666666666666e-05, - "loss": 0.6458, + "loss": 0.8623, "step": 13250 }, { - "epoch": 1.5964363111004092, - "grad_norm": 7.59375, + "epoch": 9.457917261055634, + "grad_norm": 7.125, "learning_rate": 5.218222222222222e-05, - "loss": 0.5275, + "loss": 0.7408, "step": 13260 }, { - "epoch": 1.5976402600529738, - "grad_norm": 9.4375, + "epoch": 9.465049928673324, + "grad_norm": 8.1875, "learning_rate": 5.213777777777778e-05, - "loss": 0.6249, + "loss": 0.7233, "step": 13270 }, { - "epoch": 1.5988442090055381, - "grad_norm": 8.375, + "epoch": 9.472182596291013, + "grad_norm": 9.375, "learning_rate": 5.209333333333334e-05, - "loss": 0.629, + "loss": 0.7349, "step": 13280 }, { - "epoch": 1.6000481579581027, - "grad_norm": 7.21875, + "epoch": 9.479315263908703, + "grad_norm": 6.75, "learning_rate": 5.2048888888888894e-05, - "loss": 0.6004, + "loss": 0.7311, "step": 13290 }, { - "epoch": 1.601252106910667, - "grad_norm": 6.9375, + "epoch": 9.48644793152639, + "grad_norm": 10.25, "learning_rate": 5.200444444444444e-05, - "loss": 0.4867, + "loss": 0.828, "step": 13300 }, { - "epoch": 1.601252106910667, - "eval/acc": 46.511627197265625, + "epoch": 9.48644793152639, + "eval/acc": 34.88372039794922, "step": 13300 }, { - "epoch": 1.601252106910667, - "eval_loss": 2.8910820484161377, - "eval_runtime": 0.212, - "eval_samples_per_second": 202.807, - "eval_steps_per_second": 4.716, + "epoch": 9.48644793152639, + "eval_loss": 3.376410484313965, + "eval_runtime": 0.2205, + "eval_samples_per_second": 194.974, + "eval_steps_per_second": 4.534, "step": 13300 }, { - "epoch": 1.6024560558632315, - "grad_norm": 6.9375, + "epoch": 9.49358059914408, + "grad_norm": 12.1875, "learning_rate": 5.196e-05, - "loss": 0.6888, + "loss": 0.6994, "step": 13310 }, { - "epoch": 1.6036600048157958, - "grad_norm": 7.84375, + "epoch": 9.50071326676177, + "grad_norm": 10.375, "learning_rate": 5.191555555555556e-05, - "loss": 0.5953, + "loss": 0.7658, "step": 13320 }, { - "epoch": 1.6048639537683602, - "grad_norm": 10.0625, + "epoch": 9.507845934379457, + "grad_norm": 7.625, "learning_rate": 5.1871111111111114e-05, - "loss": 0.6347, + "loss": 0.7453, "step": 13330 }, { - "epoch": 1.6060679027209246, - "grad_norm": 10.75, + "epoch": 9.514978601997147, + "grad_norm": 8.0, "learning_rate": 5.182666666666667e-05, - "loss": 0.5822, + "loss": 0.7407, "step": 13340 }, { - "epoch": 1.607271851673489, - "grad_norm": 7.90625, + "epoch": 9.522111269614836, + "grad_norm": 6.96875, "learning_rate": 5.178222222222222e-05, - "loss": 0.5955, + "loss": 0.8234, "step": 13350 }, { - "epoch": 1.6084758006260533, - "grad_norm": 8.3125, + "epoch": 9.529243937232525, + "grad_norm": 6.59375, "learning_rate": 5.173777777777779e-05, - "loss": 0.5097, + "loss": 0.7517, "step": 13360 }, { - "epoch": 1.6096797495786177, - "grad_norm": 7.03125, + "epoch": 9.536376604850213, + "grad_norm": 7.15625, "learning_rate": 5.1693333333333335e-05, - "loss": 0.6034, + "loss": 0.6939, "step": 13370 }, { - "epoch": 1.6108836985311823, - "grad_norm": 7.375, + "epoch": 9.543509272467903, + "grad_norm": 9.6875, "learning_rate": 5.164888888888889e-05, - "loss": 0.4866, + "loss": 0.7602, "step": 13380 }, { - "epoch": 1.6120876474837467, - "grad_norm": 7.59375, + "epoch": 9.550641940085592, + "grad_norm": 7.375, "learning_rate": 5.1604444444444446e-05, - "loss": 0.548, + "loss": 0.8016, "step": 13390 }, { - "epoch": 1.6132915964363113, - "grad_norm": 7.625, + "epoch": 9.557774607703282, + "grad_norm": 6.9375, "learning_rate": 5.1559999999999994e-05, - "loss": 0.5695, + "loss": 0.8258, "step": 13400 }, { - "epoch": 1.6132915964363113, - "eval/acc": 45.930233001708984, + "epoch": 9.557774607703282, + "eval/acc": 34.88372039794922, "step": 13400 }, { - "epoch": 1.6132915964363113, - "eval_loss": 2.88386607170105, - "eval_runtime": 0.215, - "eval_samples_per_second": 199.962, - "eval_steps_per_second": 4.65, + "epoch": 9.557774607703282, + "eval_loss": 3.3766846656799316, + "eval_runtime": 0.2271, + "eval_samples_per_second": 189.368, + "eval_steps_per_second": 4.404, "step": 13400 }, { - "epoch": 1.6144955453888756, - "grad_norm": 8.5, + "epoch": 9.56490727532097, + "grad_norm": 6.875, "learning_rate": 5.151555555555556e-05, - "loss": 0.5547, + "loss": 0.7926, "step": 13410 }, { - "epoch": 1.61569949434144, - "grad_norm": 6.625, + "epoch": 9.572039942938659, + "grad_norm": 6.28125, "learning_rate": 5.147111111111111e-05, - "loss": 0.5789, + "loss": 0.6912, "step": 13420 }, { - "epoch": 1.6169034432940044, - "grad_norm": 13.8125, + "epoch": 9.579172610556348, + "grad_norm": 60.5, "learning_rate": 5.142666666666667e-05, - "loss": 0.6012, + "loss": 0.8117, "step": 13430 }, { - "epoch": 1.6181073922465687, - "grad_norm": 7.59375, + "epoch": 9.586305278174038, + "grad_norm": 10.5, "learning_rate": 5.138222222222222e-05, - "loss": 0.539, + "loss": 0.7794, "step": 13440 }, { - "epoch": 1.6193113411991331, - "grad_norm": 7.0, + "epoch": 9.593437945791726, + "grad_norm": 5.6875, "learning_rate": 5.1337777777777784e-05, - "loss": 0.5513, + "loss": 0.6753, "step": 13450 }, { - "epoch": 1.6205152901516975, - "grad_norm": 6.875, + "epoch": 9.600570613409415, + "grad_norm": 8.4375, "learning_rate": 5.129333333333334e-05, - "loss": 0.5788, + "loss": 0.8676, "step": 13460 }, { - "epoch": 1.6217192391042619, - "grad_norm": 10.0, + "epoch": 9.607703281027105, + "grad_norm": 7.34375, "learning_rate": 5.124888888888889e-05, - "loss": 0.6301, + "loss": 0.7326, "step": 13470 }, { - "epoch": 1.6229231880568262, - "grad_norm": 7.15625, + "epoch": 9.614835948644792, + "grad_norm": 13.9375, "learning_rate": 5.120444444444444e-05, - "loss": 0.5939, + "loss": 0.8177, "step": 13480 }, { - "epoch": 1.6241271370093908, - "grad_norm": 7.6875, + "epoch": 9.621968616262482, + "grad_norm": 8.3125, "learning_rate": 5.1160000000000005e-05, - "loss": 0.575, + "loss": 0.7928, "step": 13490 }, { - "epoch": 1.6253310859619552, - "grad_norm": 9.125, + "epoch": 9.629101283880171, + "grad_norm": 5.3125, "learning_rate": 5.111555555555556e-05, - "loss": 0.7391, + "loss": 0.7693, "step": 13500 }, { - "epoch": 1.6253310859619552, - "eval/acc": 46.511627197265625, + "epoch": 9.629101283880171, + "eval/acc": 37.20930099487305, "step": 13500 }, { - "epoch": 1.6253310859619552, - "eval_loss": 2.8773036003112793, - "eval_runtime": 0.211, - "eval_samples_per_second": 203.837, - "eval_steps_per_second": 4.74, + "epoch": 9.629101283880171, + "eval_loss": 3.340432643890381, + "eval_runtime": 0.2212, + "eval_samples_per_second": 194.437, + "eval_steps_per_second": 4.522, "step": 13500 }, { - "epoch": 1.6265350349145198, - "grad_norm": 6.34375, + "epoch": 9.63623395149786, + "grad_norm": 6.4375, "learning_rate": 5.1071111111111115e-05, - "loss": 0.5839, + "loss": 0.7974, "step": 13510 }, { - "epoch": 1.6277389838670842, - "grad_norm": 8.125, + "epoch": 9.643366619115548, + "grad_norm": 11.375, "learning_rate": 5.1026666666666664e-05, - "loss": 0.6332, + "loss": 0.8533, "step": 13520 }, { - "epoch": 1.6289429328196485, - "grad_norm": 7.53125, + "epoch": 9.650499286733238, + "grad_norm": 8.4375, "learning_rate": 5.098222222222222e-05, - "loss": 0.5676, + "loss": 0.7578, "step": 13530 }, { - "epoch": 1.630146881772213, - "grad_norm": 5.8125, + "epoch": 9.657631954350927, + "grad_norm": 6.96875, "learning_rate": 5.093777777777778e-05, - "loss": 0.5453, + "loss": 0.8348, "step": 13540 }, { - "epoch": 1.6313508307247773, - "grad_norm": 7.15625, + "epoch": 9.664764621968617, + "grad_norm": 6.75, "learning_rate": 5.0893333333333336e-05, - "loss": 0.6429, + "loss": 0.7562, "step": 13550 }, { - "epoch": 1.6325547796773416, - "grad_norm": 9.25, + "epoch": 9.671897289586305, + "grad_norm": 7.4375, "learning_rate": 5.084888888888889e-05, - "loss": 0.6338, + "loss": 0.8667, "step": 13560 }, { - "epoch": 1.633758728629906, - "grad_norm": 9.1875, + "epoch": 9.679029957203994, + "grad_norm": 11.4375, "learning_rate": 5.080444444444445e-05, - "loss": 0.5679, + "loss": 0.7158, "step": 13570 }, { - "epoch": 1.6349626775824704, - "grad_norm": 6.8125, + "epoch": 9.686162624821684, + "grad_norm": 6.15625, "learning_rate": 5.076000000000001e-05, - "loss": 0.5705, + "loss": 0.7153, "step": 13580 }, { - "epoch": 1.6361666265350348, - "grad_norm": 10.375, + "epoch": 9.693295292439373, + "grad_norm": 10.25, "learning_rate": 5.0715555555555564e-05, - "loss": 0.6313, + "loss": 0.7698, "step": 13590 }, { - "epoch": 1.6373705754875993, - "grad_norm": 6.6875, + "epoch": 9.70042796005706, + "grad_norm": 12.0, "learning_rate": 5.067111111111111e-05, - "loss": 0.5634, + "loss": 0.8033, "step": 13600 }, { - "epoch": 1.6373705754875993, - "eval/acc": 46.511627197265625, + "epoch": 9.70042796005706, + "eval/acc": 37.20930099487305, "step": 13600 }, { - "epoch": 1.6373705754875993, - "eval_loss": 2.8538527488708496, - "eval_runtime": 0.2095, - "eval_samples_per_second": 205.241, - "eval_steps_per_second": 4.773, + "epoch": 9.70042796005706, + "eval_loss": 3.325901985168457, + "eval_runtime": 0.2235, + "eval_samples_per_second": 192.369, + "eval_steps_per_second": 4.474, "step": 13600 }, { - "epoch": 1.6385745244401637, - "grad_norm": 7.25, + "epoch": 9.70756062767475, + "grad_norm": 11.5, "learning_rate": 5.062666666666667e-05, - "loss": 0.656, + "loss": 0.7757, "step": 13610 }, { - "epoch": 1.6397784733927283, - "grad_norm": 9.875, + "epoch": 9.71469329529244, + "grad_norm": 7.0625, "learning_rate": 5.058222222222222e-05, - "loss": 0.6261, + "loss": 0.7335, "step": 13620 }, { - "epoch": 1.6409824223452927, - "grad_norm": 8.25, + "epoch": 9.721825962910128, + "grad_norm": 7.09375, "learning_rate": 5.0537777777777785e-05, - "loss": 0.5124, + "loss": 0.7219, "step": 13630 }, { - "epoch": 1.642186371297857, - "grad_norm": 7.5625, + "epoch": 9.728958630527817, + "grad_norm": 28.625, "learning_rate": 5.049333333333334e-05, - "loss": 0.6025, + "loss": 0.7445, "step": 13640 }, { - "epoch": 1.6433903202504214, - "grad_norm": 7.875, + "epoch": 9.736091298145507, + "grad_norm": 6.34375, "learning_rate": 5.044888888888889e-05, - "loss": 0.6363, + "loss": 0.7203, "step": 13650 }, { - "epoch": 1.6445942692029858, - "grad_norm": 8.625, + "epoch": 9.743223965763196, + "grad_norm": 12.25, "learning_rate": 5.0404444444444444e-05, - "loss": 0.6095, + "loss": 0.815, "step": 13660 }, { - "epoch": 1.6457982181555502, - "grad_norm": 7.53125, + "epoch": 9.750356633380884, + "grad_norm": 8.125, "learning_rate": 5.0360000000000006e-05, - "loss": 0.5122, + "loss": 0.6969, "step": 13670 }, { - "epoch": 1.6470021671081145, - "grad_norm": 7.625, + "epoch": 9.757489300998573, + "grad_norm": 8.9375, "learning_rate": 5.031555555555556e-05, - "loss": 0.6545, + "loss": 0.742, "step": 13680 }, { - "epoch": 1.648206116060679, - "grad_norm": 8.4375, + "epoch": 9.764621968616263, + "grad_norm": 17.125, "learning_rate": 5.0271111111111116e-05, - "loss": 0.6044, + "loss": 0.8526, "step": 13690 }, { - "epoch": 1.6494100650132433, - "grad_norm": 7.03125, + "epoch": 9.771754636233952, + "grad_norm": 9.3125, "learning_rate": 5.0226666666666665e-05, - "loss": 0.5725, + "loss": 0.795, "step": 13700 }, { - "epoch": 1.6494100650132433, - "eval/acc": 46.511627197265625, + "epoch": 9.771754636233952, + "eval/acc": 37.20930099487305, "step": 13700 }, { - "epoch": 1.6494100650132433, - "eval_loss": 2.8802239894866943, - "eval_runtime": 0.2091, - "eval_samples_per_second": 205.668, - "eval_steps_per_second": 4.783, + "epoch": 9.771754636233952, + "eval_loss": 3.3737363815307617, + "eval_runtime": 0.2349, + "eval_samples_per_second": 183.026, + "eval_steps_per_second": 4.256, "step": 13700 }, { - "epoch": 1.6506140139658079, - "grad_norm": 7.84375, + "epoch": 9.77888730385164, + "grad_norm": 7.28125, "learning_rate": 5.018222222222222e-05, - "loss": 0.6112, + "loss": 0.7804, "step": 13710 }, { - "epoch": 1.6518179629183722, - "grad_norm": 12.0625, + "epoch": 9.78601997146933, + "grad_norm": 8.25, "learning_rate": 5.013777777777778e-05, - "loss": 0.5524, + "loss": 0.8201, "step": 13720 }, { - "epoch": 1.6530219118709368, - "grad_norm": 7.84375, + "epoch": 9.793152639087019, + "grad_norm": 7.125, "learning_rate": 5.009333333333334e-05, - "loss": 0.6066, + "loss": 0.7495, "step": 13730 }, { - "epoch": 1.6542258608235012, - "grad_norm": 6.15625, + "epoch": 9.800285306704708, + "grad_norm": 7.96875, "learning_rate": 5.004888888888889e-05, - "loss": 0.5683, + "loss": 0.7827, "step": 13740 }, { - "epoch": 1.6554298097760656, - "grad_norm": 7.03125, + "epoch": 9.807417974322396, + "grad_norm": 6.5625, "learning_rate": 5.000444444444444e-05, - "loss": 0.6051, + "loss": 0.8317, "step": 13750 }, { - "epoch": 1.65663375872863, - "grad_norm": 7.75, + "epoch": 9.814550641940086, + "grad_norm": 7.8125, "learning_rate": 4.996e-05, - "loss": 0.5289, + "loss": 0.8547, "step": 13760 }, { - "epoch": 1.6578377076811943, - "grad_norm": 6.5, + "epoch": 9.821683309557775, + "grad_norm": 7.15625, "learning_rate": 4.991555555555556e-05, - "loss": 0.5031, + "loss": 0.8679, "step": 13770 }, { - "epoch": 1.6590416566337587, - "grad_norm": 7.34375, + "epoch": 9.828815977175463, + "grad_norm": 7.8125, "learning_rate": 4.987111111111111e-05, - "loss": 0.6406, + "loss": 0.7479, "step": 13780 }, { - "epoch": 1.660245605586323, - "grad_norm": 8.1875, + "epoch": 9.835948644793152, + "grad_norm": 15.5, "learning_rate": 4.982666666666667e-05, - "loss": 0.5593, + "loss": 0.8501, "step": 13790 }, { - "epoch": 1.6614495545388874, - "grad_norm": 12.25, + "epoch": 9.843081312410842, + "grad_norm": 8.1875, "learning_rate": 4.9782222222222224e-05, - "loss": 0.4899, + "loss": 0.7662, "step": 13800 }, { - "epoch": 1.6614495545388874, - "eval/acc": 46.511627197265625, + "epoch": 9.843081312410842, + "eval/acc": 37.20930099487305, "step": 13800 }, { - "epoch": 1.6614495545388874, - "eval_loss": 2.8814125061035156, - "eval_runtime": 0.2176, - "eval_samples_per_second": 197.608, - "eval_steps_per_second": 4.596, + "epoch": 9.843081312410842, + "eval_loss": 3.3852930068969727, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.716, + "eval_steps_per_second": 4.528, "step": 13800 }, { - "epoch": 1.6626535034914518, - "grad_norm": 5.40625, + "epoch": 9.850213980028531, + "grad_norm": 8.3125, "learning_rate": 4.973777777777778e-05, - "loss": 0.5799, + "loss": 0.8303, "step": 13810 }, { - "epoch": 1.6638574524440164, - "grad_norm": 7.65625, + "epoch": 9.857346647646219, + "grad_norm": 8.8125, "learning_rate": 4.9693333333333334e-05, - "loss": 0.6122, + "loss": 0.7875, "step": 13820 }, { - "epoch": 1.6650614013965808, - "grad_norm": 8.1875, + "epoch": 9.864479315263909, + "grad_norm": 7.625, "learning_rate": 4.964888888888889e-05, - "loss": 0.593, + "loss": 0.7952, "step": 13830 }, { - "epoch": 1.6662653503491454, - "grad_norm": 6.5625, + "epoch": 9.871611982881598, + "grad_norm": 6.96875, "learning_rate": 4.9604444444444445e-05, - "loss": 0.6101, + "loss": 0.8041, "step": 13840 }, { - "epoch": 1.6674692993017097, - "grad_norm": 11.625, + "epoch": 9.878744650499288, + "grad_norm": 6.375, "learning_rate": 4.956e-05, - "loss": 0.6803, + "loss": 0.6869, "step": 13850 }, { - "epoch": 1.668673248254274, - "grad_norm": 7.5625, + "epoch": 9.885877318116975, + "grad_norm": 7.125, "learning_rate": 4.951555555555556e-05, - "loss": 0.6574, + "loss": 0.7707, "step": 13860 }, { - "epoch": 1.6698771972068385, - "grad_norm": 6.09375, + "epoch": 9.893009985734665, + "grad_norm": 8.125, "learning_rate": 4.947111111111111e-05, - "loss": 0.6698, + "loss": 0.7512, "step": 13870 }, { - "epoch": 1.6710811461594028, - "grad_norm": 7.75, + "epoch": 9.900142653352354, + "grad_norm": 8.8125, "learning_rate": 4.942666666666667e-05, - "loss": 0.6114, + "loss": 0.8059, "step": 13880 }, { - "epoch": 1.6722850951119672, - "grad_norm": 7.25, + "epoch": 9.907275320970044, + "grad_norm": 7.90625, "learning_rate": 4.938222222222223e-05, - "loss": 0.6604, + "loss": 0.729, "step": 13890 }, { - "epoch": 1.6734890440645316, - "grad_norm": 8.25, + "epoch": 9.914407988587731, + "grad_norm": 6.625, "learning_rate": 4.933777777777778e-05, - "loss": 0.6482, + "loss": 0.7958, "step": 13900 }, { - "epoch": 1.6734890440645316, - "eval/acc": 46.511627197265625, + "epoch": 9.914407988587731, + "eval/acc": 37.20930099487305, "step": 13900 }, { - "epoch": 1.6734890440645316, - "eval_loss": 2.856815814971924, - "eval_runtime": 0.212, - "eval_samples_per_second": 202.827, - "eval_steps_per_second": 4.717, + "epoch": 9.914407988587731, + "eval_loss": 3.400364875793457, + "eval_runtime": 0.2295, + "eval_samples_per_second": 187.329, + "eval_steps_per_second": 4.356, "step": 13900 }, { - "epoch": 1.674692993017096, - "grad_norm": 7.15625, + "epoch": 9.921540656205421, + "grad_norm": 7.0625, "learning_rate": 4.929333333333334e-05, - "loss": 0.6165, + "loss": 0.7314, "step": 13910 }, { - "epoch": 1.6758969419696603, - "grad_norm": 7.65625, + "epoch": 9.92867332382311, + "grad_norm": 6.78125, "learning_rate": 4.9248888888888886e-05, - "loss": 0.5861, + "loss": 0.7581, "step": 13920 }, { - "epoch": 1.677100890922225, - "grad_norm": 6.8125, + "epoch": 9.935805991440798, + "grad_norm": 8.6875, "learning_rate": 4.920444444444445e-05, - "loss": 0.6019, + "loss": 0.7865, "step": 13930 }, { - "epoch": 1.6783048398747893, - "grad_norm": 7.1875, + "epoch": 9.942938659058488, + "grad_norm": 7.78125, "learning_rate": 4.9160000000000004e-05, - "loss": 0.5639, + "loss": 0.7174, "step": 13940 }, { - "epoch": 1.6795087888273539, - "grad_norm": 8.875, + "epoch": 9.950071326676177, + "grad_norm": 7.6875, "learning_rate": 4.911555555555556e-05, - "loss": 0.6077, + "loss": 0.855, "step": 13950 }, { - "epoch": 1.6807127377799183, - "grad_norm": 7.75, + "epoch": 9.957203994293867, + "grad_norm": 7.46875, "learning_rate": 4.9071111111111114e-05, - "loss": 0.5095, + "loss": 0.7511, "step": 13960 }, { - "epoch": 1.6819166867324826, - "grad_norm": 6.75, + "epoch": 9.964336661911554, + "grad_norm": 6.34375, "learning_rate": 4.902666666666667e-05, - "loss": 0.6097, + "loss": 0.6901, "step": 13970 }, { - "epoch": 1.683120635685047, - "grad_norm": 10.0625, + "epoch": 9.971469329529244, + "grad_norm": 19.125, "learning_rate": 4.8982222222222225e-05, - "loss": 0.5662, + "loss": 0.7621, "step": 13980 }, { - "epoch": 1.6843245846376114, - "grad_norm": 9.25, + "epoch": 9.978601997146933, + "grad_norm": 9.375, "learning_rate": 4.893777777777778e-05, - "loss": 0.6319, + "loss": 0.7466, "step": 13990 }, { - "epoch": 1.6855285335901757, - "grad_norm": 6.28125, + "epoch": 9.985734664764623, + "grad_norm": 7.1875, "learning_rate": 4.8893333333333335e-05, - "loss": 0.5154, + "loss": 0.749, "step": 14000 }, { - "epoch": 1.6855285335901757, - "eval/acc": 46.511627197265625, + "epoch": 9.985734664764623, + "eval/acc": 37.20930099487305, "step": 14000 }, { - "epoch": 1.6855285335901757, - "eval_loss": 2.8956446647644043, - "eval_runtime": 0.2162, - "eval_samples_per_second": 198.844, - "eval_steps_per_second": 4.624, + "epoch": 9.985734664764623, + "eval_loss": 3.3502047061920166, + "eval_runtime": 0.2251, + "eval_samples_per_second": 191.024, + "eval_steps_per_second": 4.442, "step": 14000 }, { - "epoch": 1.68673248254274, - "grad_norm": 8.8125, + "epoch": 9.99286733238231, + "grad_norm": 9.5625, "learning_rate": 4.884888888888889e-05, - "loss": 0.5645, + "loss": 0.6932, "step": 14010 }, { - "epoch": 1.6879364314953045, - "grad_norm": 8.8125, + "epoch": 10.0, + "grad_norm": 5.875, "learning_rate": 4.8804444444444445e-05, - "loss": 0.6886, + "loss": 0.7939, "step": 14020 }, { - "epoch": 1.6891403804478688, - "grad_norm": 9.3125, + "epoch": 10.00713266761769, + "grad_norm": 7.15625, "learning_rate": 4.876e-05, - "loss": 0.5767, + "loss": 0.8124, "step": 14030 }, { - "epoch": 1.6903443294004334, - "grad_norm": 11.625, + "epoch": 10.014265335235377, + "grad_norm": 6.0625, "learning_rate": 4.8715555555555556e-05, - "loss": 0.542, + "loss": 0.6855, "step": 14040 }, { - "epoch": 1.6915482783529978, - "grad_norm": 6.5625, + "epoch": 10.021398002853067, + "grad_norm": 50.75, "learning_rate": 4.867111111111111e-05, - "loss": 0.538, + "loss": 0.8354, "step": 14050 }, { - "epoch": 1.6927522273055624, - "grad_norm": 6.9375, + "epoch": 10.028530670470756, + "grad_norm": 7.46875, "learning_rate": 4.862666666666667e-05, - "loss": 0.5314, + "loss": 0.8605, "step": 14060 }, { - "epoch": 1.6939561762581268, - "grad_norm": 7.9375, + "epoch": 10.035663338088446, + "grad_norm": 9.625, "learning_rate": 4.858222222222222e-05, - "loss": 0.5909, + "loss": 0.8626, "step": 14070 }, { - "epoch": 1.6951601252106911, - "grad_norm": 7.09375, + "epoch": 10.042796005706133, + "grad_norm": 6.125, "learning_rate": 4.8537777777777784e-05, - "loss": 0.5809, + "loss": 0.7302, "step": 14080 }, { - "epoch": 1.6963640741632555, - "grad_norm": 7.9375, + "epoch": 10.049928673323823, + "grad_norm": 8.0625, "learning_rate": 4.849333333333333e-05, - "loss": 0.576, + "loss": 0.9058, "step": 14090 }, { - "epoch": 1.69756802311582, - "grad_norm": 6.1875, + "epoch": 10.057061340941512, + "grad_norm": 7.3125, "learning_rate": 4.8448888888888894e-05, - "loss": 0.5162, + "loss": 0.7981, "step": 14100 }, { - "epoch": 1.69756802311582, - "eval/acc": 45.930233001708984, + "epoch": 10.057061340941512, + "eval/acc": 46.511627197265625, "step": 14100 }, { - "epoch": 1.69756802311582, - "eval_loss": 2.8892974853515625, - "eval_runtime": 0.2094, - "eval_samples_per_second": 205.308, - "eval_steps_per_second": 4.775, + "epoch": 10.057061340941512, + "eval_loss": 2.7371480464935303, + "eval_runtime": 1.1832, + "eval_samples_per_second": 36.344, + "eval_steps_per_second": 0.845, "step": 14100 }, { - "epoch": 1.6987719720683843, - "grad_norm": 6.84375, + "epoch": 10.064194008559202, + "grad_norm": 12.625, "learning_rate": 4.840444444444445e-05, - "loss": 0.6473, + "loss": 0.7461, "step": 14110 }, { - "epoch": 1.6999759210209486, - "grad_norm": 6.3125, + "epoch": 10.07132667617689, + "grad_norm": 7.375, "learning_rate": 4.836e-05, - "loss": 0.5388, + "loss": 0.8649, "step": 14120 }, { - "epoch": 1.701179869973513, - "grad_norm": 9.25, + "epoch": 10.078459343794579, + "grad_norm": 10.6875, "learning_rate": 4.831555555555556e-05, - "loss": 0.6188, + "loss": 0.8143, "step": 14130 }, { - "epoch": 1.7023838189260774, - "grad_norm": 8.875, + "epoch": 10.085592011412269, + "grad_norm": 43.5, "learning_rate": 4.827111111111111e-05, - "loss": 0.5347, + "loss": 0.8249, "step": 14140 }, { - "epoch": 1.703587767878642, - "grad_norm": 7.40625, + "epoch": 10.092724679029958, + "grad_norm": 6.6875, "learning_rate": 4.822666666666667e-05, - "loss": 0.5254, + "loss": 0.6324, "step": 14150 }, { - "epoch": 1.7047917168312063, - "grad_norm": 8.125, + "epoch": 10.099857346647646, + "grad_norm": 9.6875, "learning_rate": 4.8182222222222225e-05, - "loss": 0.5787, + "loss": 0.7795, "step": 14160 }, { - "epoch": 1.705995665783771, - "grad_norm": 9.875, + "epoch": 10.106990014265335, + "grad_norm": 7.8125, "learning_rate": 4.813777777777778e-05, - "loss": 0.5102, + "loss": 0.8453, "step": 14170 }, { - "epoch": 1.7071996147363353, - "grad_norm": 7.34375, + "epoch": 10.114122681883025, + "grad_norm": 6.21875, "learning_rate": 4.8093333333333336e-05, - "loss": 0.5871, + "loss": 0.735, "step": 14180 }, { - "epoch": 1.7084035636888997, - "grad_norm": 5.71875, + "epoch": 10.121255349500712, + "grad_norm": 8.1875, "learning_rate": 4.804888888888889e-05, - "loss": 0.6621, + "loss": 0.6646, "step": 14190 }, { - "epoch": 1.709607512641464, - "grad_norm": 8.625, + "epoch": 10.128388017118402, + "grad_norm": 38.0, "learning_rate": 4.8004444444444446e-05, - "loss": 0.5733, + "loss": 0.7963, "step": 14200 }, { - "epoch": 1.709607512641464, + "epoch": 10.128388017118402, "eval/acc": 44.1860466003418, "step": 14200 }, { - "epoch": 1.709607512641464, - "eval_loss": 2.9033682346343994, - "eval_runtime": 0.2132, - "eval_samples_per_second": 201.727, - "eval_steps_per_second": 4.691, + "epoch": 10.128388017118402, + "eval_loss": 2.7271535396575928, + "eval_runtime": 1.4583, + "eval_samples_per_second": 29.486, + "eval_steps_per_second": 0.686, "step": 14200 }, { - "epoch": 1.7108114615940284, - "grad_norm": 7.375, + "epoch": 10.135520684736091, + "grad_norm": 5.78125, "learning_rate": 4.796e-05, - "loss": 0.5375, + "loss": 0.693, "step": 14210 }, { - "epoch": 1.7120154105465928, - "grad_norm": 6.875, + "epoch": 10.142653352353781, + "grad_norm": 11.875, "learning_rate": 4.791555555555556e-05, - "loss": 0.5126, + "loss": 0.7578, "step": 14220 }, { - "epoch": 1.7132193594991572, - "grad_norm": 6.625, + "epoch": 10.149786019971469, + "grad_norm": 9.25, "learning_rate": 4.787111111111111e-05, - "loss": 0.5148, + "loss": 0.8127, "step": 14230 }, { - "epoch": 1.7144233084517215, - "grad_norm": 8.6875, + "epoch": 10.156918687589158, + "grad_norm": 9.1875, "learning_rate": 4.782666666666667e-05, - "loss": 0.6544, + "loss": 0.6935, "step": 14240 }, { - "epoch": 1.715627257404286, - "grad_norm": 7.28125, + "epoch": 10.164051355206848, + "grad_norm": 8.25, "learning_rate": 4.778222222222222e-05, - "loss": 0.5869, + "loss": 0.7233, "step": 14250 }, { - "epoch": 1.7168312063568505, - "grad_norm": 6.96875, + "epoch": 10.171184022824537, + "grad_norm": 6.71875, "learning_rate": 4.7737777777777785e-05, - "loss": 0.4857, + "loss": 0.8749, "step": 14260 }, { - "epoch": 1.7180351553094149, - "grad_norm": 6.90625, + "epoch": 10.178316690442225, + "grad_norm": 7.84375, "learning_rate": 4.769333333333333e-05, - "loss": 0.5416, + "loss": 0.7786, "step": 14270 }, { - "epoch": 1.7192391042619795, - "grad_norm": 9.5, + "epoch": 10.185449358059914, + "grad_norm": 9.1875, "learning_rate": 4.7648888888888895e-05, - "loss": 0.6318, + "loss": 0.7024, "step": 14280 }, { - "epoch": 1.7204430532145438, - "grad_norm": 8.125, + "epoch": 10.192582025677604, + "grad_norm": 7.78125, "learning_rate": 4.7604444444444443e-05, - "loss": 0.5763, + "loss": 0.8525, "step": 14290 }, { - "epoch": 1.7216470021671082, - "grad_norm": 6.6875, + "epoch": 10.199714693295292, + "grad_norm": 6.90625, "learning_rate": 4.7560000000000005e-05, - "loss": 0.568, + "loss": 0.8181, "step": 14300 }, { - "epoch": 1.7216470021671082, - "eval/acc": 47.093021392822266, + "epoch": 10.199714693295292, + "eval/acc": 46.511627197265625, "step": 14300 }, { - "epoch": 1.7216470021671082, - "eval_loss": 2.8553571701049805, - "eval_runtime": 0.2112, - "eval_samples_per_second": 203.63, - "eval_steps_per_second": 4.736, + "epoch": 10.199714693295292, + "eval_loss": 2.766357898712158, + "eval_runtime": 2.3027, + "eval_samples_per_second": 18.674, + "eval_steps_per_second": 0.434, "step": 14300 }, { - "epoch": 1.7228509511196726, - "grad_norm": 9.4375, + "epoch": 10.206847360912981, + "grad_norm": 5.90625, "learning_rate": 4.751555555555556e-05, - "loss": 0.6341, + "loss": 0.8026, "step": 14310 }, { - "epoch": 1.724054900072237, - "grad_norm": 6.90625, + "epoch": 10.21398002853067, + "grad_norm": 7.3125, "learning_rate": 4.747111111111111e-05, - "loss": 0.5031, + "loss": 0.8758, "step": 14320 }, { - "epoch": 1.7252588490248013, - "grad_norm": 7.59375, + "epoch": 10.22111269614836, + "grad_norm": 9.4375, "learning_rate": 4.742666666666667e-05, - "loss": 0.5438, + "loss": 0.7889, "step": 14330 }, { - "epoch": 1.7264627979773657, - "grad_norm": 6.09375, + "epoch": 10.228245363766048, + "grad_norm": 8.0, "learning_rate": 4.738222222222222e-05, - "loss": 0.6364, + "loss": 0.7343, "step": 14340 }, { - "epoch": 1.72766674692993, - "grad_norm": 7.0625, + "epoch": 10.235378031383737, + "grad_norm": 6.59375, "learning_rate": 4.733777777777778e-05, - "loss": 0.6025, + "loss": 0.788, "step": 14350 }, { - "epoch": 1.7288706958824944, - "grad_norm": 11.3125, + "epoch": 10.242510699001427, + "grad_norm": 9.1875, "learning_rate": 4.729333333333334e-05, - "loss": 0.5846, + "loss": 0.8068, "step": 14360 }, { - "epoch": 1.730074644835059, - "grad_norm": 5.5625, + "epoch": 10.249643366619116, + "grad_norm": 7.53125, "learning_rate": 4.724888888888889e-05, - "loss": 0.5983, + "loss": 0.8188, "step": 14370 }, { - "epoch": 1.7312785937876234, - "grad_norm": 8.6875, + "epoch": 10.256776034236804, + "grad_norm": 7.1875, "learning_rate": 4.720444444444445e-05, - "loss": 0.5374, + "loss": 0.7643, "step": 14380 }, { - "epoch": 1.732482542740188, - "grad_norm": 7.4375, + "epoch": 10.263908701854493, + "grad_norm": 9.125, "learning_rate": 4.716e-05, - "loss": 0.5893, + "loss": 0.7052, "step": 14390 }, { - "epoch": 1.7336864916927524, - "grad_norm": 6.90625, + "epoch": 10.271041369472183, + "grad_norm": 10.5625, "learning_rate": 4.711555555555556e-05, - "loss": 0.5874, + "loss": 0.762, "step": 14400 }, { - "epoch": 1.7336864916927524, - "eval/acc": 44.1860466003418, + "epoch": 10.271041369472183, + "eval/acc": 46.511627197265625, "step": 14400 }, { - "epoch": 1.7336864916927524, - "eval_loss": 2.897413969039917, - "eval_runtime": 0.2186, - "eval_samples_per_second": 196.696, - "eval_steps_per_second": 4.574, + "epoch": 10.271041369472183, + "eval_loss": 2.774780750274658, + "eval_runtime": 1.2232, + "eval_samples_per_second": 35.152, + "eval_steps_per_second": 0.817, "step": 14400 }, { - "epoch": 1.7348904406453167, - "grad_norm": 8.0625, + "epoch": 10.278174037089872, + "grad_norm": 7.5625, "learning_rate": 4.707111111111111e-05, - "loss": 0.6197, + "loss": 0.8322, "step": 14410 }, { - "epoch": 1.736094389597881, - "grad_norm": 7.875, + "epoch": 10.28530670470756, + "grad_norm": 35.25, "learning_rate": 4.702666666666667e-05, - "loss": 0.5427, + "loss": 0.8043, "step": 14420 }, { - "epoch": 1.7372983385504455, - "grad_norm": 10.5625, + "epoch": 10.29243937232525, + "grad_norm": 7.09375, "learning_rate": 4.6982222222222223e-05, - "loss": 0.5801, + "loss": 0.7257, "step": 14430 }, { - "epoch": 1.7385022875030098, - "grad_norm": 11.5625, + "epoch": 10.29957203994294, + "grad_norm": 15.375, "learning_rate": 4.693777777777778e-05, - "loss": 0.5667, + "loss": 0.7922, "step": 14440 }, { - "epoch": 1.7397062364555742, - "grad_norm": 7.71875, + "epoch": 10.306704707560627, + "grad_norm": 7.09375, "learning_rate": 4.6893333333333334e-05, - "loss": 0.6626, + "loss": 0.694, "step": 14450 }, { - "epoch": 1.7409101854081386, - "grad_norm": 5.90625, + "epoch": 10.313837375178316, + "grad_norm": 7.0625, "learning_rate": 4.684888888888889e-05, - "loss": 0.6022, + "loss": 0.7734, "step": 14460 }, { - "epoch": 1.742114134360703, - "grad_norm": 9.9375, + "epoch": 10.320970042796006, + "grad_norm": 6.75, "learning_rate": 4.6804444444444444e-05, - "loss": 0.6492, + "loss": 0.7469, "step": 14470 }, { - "epoch": 1.7433180833132675, - "grad_norm": 6.6875, + "epoch": 10.328102710413695, + "grad_norm": 5.9375, "learning_rate": 4.6760000000000006e-05, - "loss": 0.6251, + "loss": 0.6948, "step": 14480 }, { - "epoch": 1.744522032265832, - "grad_norm": 5.65625, + "epoch": 10.335235378031383, + "grad_norm": 7.15625, "learning_rate": 4.6715555555555555e-05, - "loss": 0.5951, + "loss": 0.7593, "step": 14490 }, { - "epoch": 1.7457259812183965, - "grad_norm": 7.53125, + "epoch": 10.342368045649073, + "grad_norm": 26.875, "learning_rate": 4.667111111111112e-05, - "loss": 0.5074, + "loss": 0.7302, "step": 14500 }, { - "epoch": 1.7457259812183965, - "eval/acc": 45.930233001708984, + "epoch": 10.342368045649073, + "eval/acc": 44.1860466003418, "step": 14500 }, { - "epoch": 1.7457259812183965, - "eval_loss": 2.843892812728882, - "eval_runtime": 0.2078, - "eval_samples_per_second": 206.911, - "eval_steps_per_second": 4.812, + "epoch": 10.342368045649073, + "eval_loss": 2.7937443256378174, + "eval_runtime": 0.2689, + "eval_samples_per_second": 159.899, + "eval_steps_per_second": 3.719, "step": 14500 }, { - "epoch": 1.7469299301709609, - "grad_norm": 8.8125, + "epoch": 10.349500713266762, + "grad_norm": 53.75, "learning_rate": 4.6626666666666665e-05, - "loss": 0.5739, + "loss": 0.8025, "step": 14510 }, { - "epoch": 1.7481338791235252, - "grad_norm": 9.5, + "epoch": 10.356633380884452, + "grad_norm": 10.4375, "learning_rate": 4.658222222222223e-05, - "loss": 0.5531, + "loss": 0.6807, "step": 14520 }, { - "epoch": 1.7493378280760896, - "grad_norm": 8.4375, + "epoch": 10.36376604850214, + "grad_norm": 17.5, "learning_rate": 4.653777777777778e-05, - "loss": 0.5929, + "loss": 0.7773, "step": 14530 }, { - "epoch": 1.750541777028654, - "grad_norm": 6.78125, + "epoch": 10.370898716119829, + "grad_norm": 9.0625, "learning_rate": 4.649333333333333e-05, - "loss": 0.6202, + "loss": 0.7322, "step": 14540 }, { - "epoch": 1.7517457259812184, - "grad_norm": 7.28125, + "epoch": 10.378031383737518, + "grad_norm": 7.5, "learning_rate": 4.644888888888889e-05, - "loss": 0.6164, + "loss": 0.801, "step": 14550 }, { - "epoch": 1.7529496749337827, - "grad_norm": 9.0625, + "epoch": 10.385164051355208, + "grad_norm": 7.03125, "learning_rate": 4.640444444444445e-05, - "loss": 0.6379, + "loss": 0.7887, "step": 14560 }, { - "epoch": 1.754153623886347, - "grad_norm": 8.9375, + "epoch": 10.392296718972895, + "grad_norm": 5.78125, "learning_rate": 4.636e-05, - "loss": 0.5701, + "loss": 0.75, "step": 14570 }, { - "epoch": 1.7553575728389115, - "grad_norm": 8.375, + "epoch": 10.399429386590585, + "grad_norm": 11.8125, "learning_rate": 4.631555555555556e-05, - "loss": 0.6824, + "loss": 0.7594, "step": 14580 }, { - "epoch": 1.756561521791476, - "grad_norm": 7.78125, + "epoch": 10.406562054208274, + "grad_norm": 26.375, "learning_rate": 4.6271111111111114e-05, - "loss": 0.7127, + "loss": 0.7863, "step": 14590 }, { - "epoch": 1.7577654707440404, - "grad_norm": 6.96875, + "epoch": 10.413694721825962, + "grad_norm": 11.875, "learning_rate": 4.622666666666667e-05, - "loss": 0.5165, + "loss": 0.7701, "step": 14600 }, { - "epoch": 1.7577654707440404, - "eval/acc": 46.511627197265625, + "epoch": 10.413694721825962, + "eval/acc": 44.1860466003418, "step": 14600 }, { - "epoch": 1.7577654707440404, - "eval_loss": 2.8514623641967773, - "eval_runtime": 0.2109, - "eval_samples_per_second": 203.924, - "eval_steps_per_second": 4.742, + "epoch": 10.413694721825962, + "eval_loss": 2.7834675312042236, + "eval_runtime": 0.4675, + "eval_samples_per_second": 91.973, + "eval_steps_per_second": 2.139, "step": 14600 }, { - "epoch": 1.758969419696605, - "grad_norm": 8.25, + "epoch": 10.420827389443652, + "grad_norm": 12.5625, "learning_rate": 4.6182222222222224e-05, - "loss": 0.6424, + "loss": 0.7568, "step": 14610 }, { - "epoch": 1.7601733686491694, - "grad_norm": 7.09375, + "epoch": 10.427960057061341, + "grad_norm": 7.40625, "learning_rate": 4.613777777777778e-05, - "loss": 0.5774, + "loss": 0.7547, "step": 14620 }, { - "epoch": 1.7613773176017338, - "grad_norm": 7.3125, + "epoch": 10.43509272467903, + "grad_norm": 8.875, "learning_rate": 4.6093333333333335e-05, - "loss": 0.5932, + "loss": 0.7594, "step": 14630 }, { - "epoch": 1.7625812665542981, - "grad_norm": 7.84375, + "epoch": 10.442225392296718, + "grad_norm": 25.0, "learning_rate": 4.604888888888889e-05, - "loss": 0.5451, + "loss": 0.8313, "step": 14640 }, { - "epoch": 1.7637852155068625, - "grad_norm": 6.875, + "epoch": 10.449358059914408, + "grad_norm": 7.9375, "learning_rate": 4.6004444444444445e-05, - "loss": 0.6025, + "loss": 0.8017, "step": 14650 }, { - "epoch": 1.7649891644594269, - "grad_norm": 6.71875, + "epoch": 10.456490727532097, + "grad_norm": 7.59375, "learning_rate": 4.596e-05, - "loss": 0.5298, + "loss": 0.7648, "step": 14660 }, { - "epoch": 1.7661931134119913, - "grad_norm": 8.75, + "epoch": 10.463623395149787, + "grad_norm": 8.5625, "learning_rate": 4.5915555555555556e-05, - "loss": 0.6542, + "loss": 0.6931, "step": 14670 }, { - "epoch": 1.7673970623645556, - "grad_norm": 8.875, + "epoch": 10.470756062767475, + "grad_norm": 9.8125, "learning_rate": 4.587111111111112e-05, - "loss": 0.5926, + "loss": 0.7128, "step": 14680 }, { - "epoch": 1.76860101131712, - "grad_norm": 8.9375, + "epoch": 10.477888730385164, + "grad_norm": 8.0, "learning_rate": 4.5826666666666666e-05, - "loss": 0.5747, + "loss": 0.8199, "step": 14690 }, { - "epoch": 1.7698049602696846, - "grad_norm": 7.34375, + "epoch": 10.485021398002853, + "grad_norm": 7.53125, "learning_rate": 4.578222222222223e-05, - "loss": 0.5349, + "loss": 0.8027, "step": 14700 }, { - "epoch": 1.7698049602696846, - "eval/acc": 44.1860466003418, + "epoch": 10.485021398002853, + "eval/acc": 48.83720779418945, "step": 14700 }, { - "epoch": 1.7698049602696846, - "eval_loss": 2.8648550510406494, - "eval_runtime": 0.2015, - "eval_samples_per_second": 213.416, - "eval_steps_per_second": 4.963, + "epoch": 10.485021398002853, + "eval_loss": 2.773456573486328, + "eval_runtime": 0.2477, + "eval_samples_per_second": 173.569, + "eval_steps_per_second": 4.036, "step": 14700 }, { - "epoch": 1.771008909222249, - "grad_norm": 7.5625, + "epoch": 10.492154065620543, + "grad_norm": 13.375, "learning_rate": 4.5737777777777777e-05, - "loss": 0.4603, + "loss": 0.8284, "step": 14710 }, { - "epoch": 1.7722128581748133, - "grad_norm": 8.4375, + "epoch": 10.49928673323823, + "grad_norm": 7.875, "learning_rate": 4.569333333333334e-05, - "loss": 0.5674, + "loss": 0.7522, "step": 14720 }, { - "epoch": 1.773416807127378, - "grad_norm": 6.625, + "epoch": 10.50641940085592, + "grad_norm": 6.375, "learning_rate": 4.5648888888888894e-05, - "loss": 0.5988, + "loss": 0.672, "step": 14730 }, { - "epoch": 1.7746207560799423, - "grad_norm": 8.4375, + "epoch": 10.51355206847361, + "grad_norm": 6.40625, "learning_rate": 4.560444444444444e-05, - "loss": 0.6072, + "loss": 0.8234, "step": 14740 }, { - "epoch": 1.7758247050325067, - "grad_norm": 7.84375, + "epoch": 10.520684736091297, + "grad_norm": 9.1875, "learning_rate": 4.5560000000000004e-05, - "loss": 0.5524, + "loss": 0.7505, "step": 14750 }, { - "epoch": 1.777028653985071, - "grad_norm": 6.0, + "epoch": 10.527817403708987, + "grad_norm": 7.25, "learning_rate": 4.551555555555555e-05, - "loss": 0.5633, + "loss": 0.7694, "step": 14760 }, { - "epoch": 1.7782326029376354, - "grad_norm": 12.625, + "epoch": 10.534950071326676, + "grad_norm": 6.3125, "learning_rate": 4.5471111111111115e-05, - "loss": 0.5669, + "loss": 0.7743, "step": 14770 }, { - "epoch": 1.7794365518901998, - "grad_norm": 6.5, + "epoch": 10.542082738944366, + "grad_norm": 10.0, "learning_rate": 4.542666666666667e-05, - "loss": 0.4503, + "loss": 0.8179, "step": 14780 }, { - "epoch": 1.7806405008427642, - "grad_norm": 7.46875, + "epoch": 10.549215406562054, + "grad_norm": 9.875, "learning_rate": 4.5382222222222225e-05, - "loss": 0.6596, + "loss": 0.9151, "step": 14790 }, { - "epoch": 1.7818444497953285, - "grad_norm": 6.125, + "epoch": 10.556348074179743, + "grad_norm": 8.6875, "learning_rate": 4.533777777777778e-05, - "loss": 0.6978, + "loss": 0.8133, "step": 14800 }, { - "epoch": 1.7818444497953285, - "eval/acc": 48.83720779418945, + "epoch": 10.556348074179743, + "eval/acc": 46.511627197265625, "step": 14800 }, { - "epoch": 1.7818444497953285, - "eval_loss": 2.870887279510498, - "eval_runtime": 0.9511, - "eval_samples_per_second": 45.209, - "eval_steps_per_second": 1.051, + "epoch": 10.556348074179743, + "eval_loss": 2.8140347003936768, + "eval_runtime": 0.2939, + "eval_samples_per_second": 146.299, + "eval_steps_per_second": 3.402, "step": 14800 }, { - "epoch": 1.7830483987478931, - "grad_norm": 8.375, + "epoch": 10.563480741797433, + "grad_norm": 7.0, "learning_rate": 4.5293333333333336e-05, - "loss": 0.581, + "loss": 0.7129, "step": 14810 }, { - "epoch": 1.7842523477004575, - "grad_norm": 6.1875, + "epoch": 10.570613409415122, + "grad_norm": 8.5, "learning_rate": 4.524888888888889e-05, - "loss": 0.526, + "loss": 0.7667, "step": 14820 }, { - "epoch": 1.7854562966530219, - "grad_norm": 6.40625, + "epoch": 10.57774607703281, + "grad_norm": 7.4375, "learning_rate": 4.5204444444444446e-05, - "loss": 0.5246, + "loss": 0.7692, "step": 14830 }, { - "epoch": 1.7866602456055865, - "grad_norm": 7.125, + "epoch": 10.5848787446505, + "grad_norm": 8.0625, "learning_rate": 4.516e-05, - "loss": 0.6338, + "loss": 0.7613, "step": 14840 }, { - "epoch": 1.7878641945581508, - "grad_norm": 10.3125, + "epoch": 10.592011412268189, + "grad_norm": 7.96875, "learning_rate": 4.5115555555555557e-05, - "loss": 0.5313, + "loss": 0.6925, "step": 14850 }, { - "epoch": 1.7890681435107152, - "grad_norm": 8.4375, + "epoch": 10.599144079885878, + "grad_norm": 14.375, "learning_rate": 4.507111111111111e-05, - "loss": 0.6848, + "loss": 0.84, "step": 14860 }, { - "epoch": 1.7902720924632796, - "grad_norm": 20.125, + "epoch": 10.606276747503566, + "grad_norm": 11.4375, "learning_rate": 4.502666666666667e-05, - "loss": 0.5839, + "loss": 0.8508, "step": 14870 }, { - "epoch": 1.791476041415844, - "grad_norm": 9.1875, + "epoch": 10.613409415121255, + "grad_norm": 8.375, "learning_rate": 4.498222222222222e-05, - "loss": 0.5869, + "loss": 0.7863, "step": 14880 }, { - "epoch": 1.7926799903684083, - "grad_norm": 9.8125, + "epoch": 10.620542082738945, + "grad_norm": 7.625, "learning_rate": 4.493777777777778e-05, - "loss": 0.5319, + "loss": 0.7177, "step": 14890 }, { - "epoch": 1.7938839393209727, - "grad_norm": 7.03125, + "epoch": 10.627674750356633, + "grad_norm": 10.375, "learning_rate": 4.489333333333334e-05, - "loss": 0.6254, + "loss": 0.7795, "step": 14900 }, { - "epoch": 1.7938839393209727, - "eval/acc": 46.511627197265625, + "epoch": 10.627674750356633, + "eval/acc": 44.1860466003418, "step": 14900 }, { - "epoch": 1.7938839393209727, - "eval_loss": 2.8519837856292725, - "eval_runtime": 0.2174, - "eval_samples_per_second": 197.755, - "eval_steps_per_second": 4.599, + "epoch": 10.627674750356633, + "eval_loss": 2.830230951309204, + "eval_runtime": 0.2428, + "eval_samples_per_second": 177.067, + "eval_steps_per_second": 4.118, "step": 14900 }, { - "epoch": 1.795087888273537, - "grad_norm": 8.9375, + "epoch": 10.634807417974322, + "grad_norm": 10.875, "learning_rate": 4.484888888888889e-05, - "loss": 0.613, + "loss": 0.7878, "step": 14910 }, { - "epoch": 1.7962918372261016, - "grad_norm": 9.1875, + "epoch": 10.641940085592012, + "grad_norm": 9.0, "learning_rate": 4.480444444444445e-05, - "loss": 0.6735, + "loss": 0.8517, "step": 14920 }, { - "epoch": 1.797495786178666, - "grad_norm": 7.4375, + "epoch": 10.649072753209701, + "grad_norm": 6.9375, "learning_rate": 4.4760000000000005e-05, - "loss": 0.5792, + "loss": 0.8469, "step": 14930 }, { - "epoch": 1.7986997351312304, - "grad_norm": 6.375, + "epoch": 10.656205420827389, + "grad_norm": 7.28125, "learning_rate": 4.4715555555555554e-05, - "loss": 0.5137, + "loss": 0.7262, "step": 14940 }, { - "epoch": 1.799903684083795, - "grad_norm": 8.0, + "epoch": 10.663338088445078, + "grad_norm": 6.15625, "learning_rate": 4.4671111111111116e-05, - "loss": 0.5431, + "loss": 0.739, "step": 14950 }, { - "epoch": 1.8011076330363593, - "grad_norm": 6.65625, + "epoch": 10.670470756062768, + "grad_norm": 7.84375, "learning_rate": 4.4626666666666664e-05, - "loss": 0.6689, + "loss": 0.7671, "step": 14960 }, { - "epoch": 1.8023115819889237, - "grad_norm": 7.5625, + "epoch": 10.677603423680456, + "grad_norm": 7.53125, "learning_rate": 4.4582222222222226e-05, - "loss": 0.6171, + "loss": 0.8059, "step": 14970 }, { - "epoch": 1.803515530941488, - "grad_norm": 6.9375, + "epoch": 10.684736091298145, + "grad_norm": 8.0, "learning_rate": 4.453777777777778e-05, - "loss": 0.6821, + "loss": 0.8167, "step": 14980 }, { - "epoch": 1.8047194798940525, - "grad_norm": 7.6875, + "epoch": 10.691868758915835, + "grad_norm": 7.4375, "learning_rate": 4.4493333333333337e-05, - "loss": 0.5024, + "loss": 0.7768, "step": 14990 }, { - "epoch": 1.8059234288466168, - "grad_norm": 7.90625, + "epoch": 10.699001426533524, + "grad_norm": 9.0625, "learning_rate": 4.444888888888889e-05, - "loss": 0.6338, + "loss": 0.7805, "step": 15000 }, { - "epoch": 1.8059234288466168, + "epoch": 10.699001426533524, "eval/acc": 46.511627197265625, "step": 15000 }, { - "epoch": 1.8059234288466168, - "eval_loss": 2.8593192100524902, - "eval_runtime": 0.2111, - "eval_samples_per_second": 203.704, - "eval_steps_per_second": 4.737, + "epoch": 10.699001426533524, + "eval_loss": 2.8128726482391357, + "eval_runtime": 0.239, + "eval_samples_per_second": 179.883, + "eval_steps_per_second": 4.183, "step": 15000 }, { - "epoch": 1.8071273777991812, - "grad_norm": 8.3125, + "epoch": 10.706134094151212, + "grad_norm": 6.59375, "learning_rate": 4.440444444444445e-05, - "loss": 0.5786, + "loss": 0.7292, "step": 15010 }, { - "epoch": 1.8083313267517456, - "grad_norm": 7.78125, + "epoch": 10.713266761768901, + "grad_norm": 7.625, "learning_rate": 4.436e-05, - "loss": 0.6018, + "loss": 0.7543, "step": 15020 }, { - "epoch": 1.8095352757043102, - "grad_norm": 9.625, + "epoch": 10.72039942938659, + "grad_norm": 7.6875, "learning_rate": 4.431555555555556e-05, - "loss": 0.5862, + "loss": 0.812, "step": 15030 }, { - "epoch": 1.8107392246568745, - "grad_norm": 7.59375, + "epoch": 10.72753209700428, + "grad_norm": 8.375, "learning_rate": 4.427111111111111e-05, - "loss": 0.5643, + "loss": 0.8101, "step": 15040 }, { - "epoch": 1.811943173609439, - "grad_norm": 9.6875, + "epoch": 10.734664764621968, + "grad_norm": 9.3125, "learning_rate": 4.422666666666667e-05, - "loss": 0.5234, + "loss": 0.7984, "step": 15050 }, { - "epoch": 1.8131471225620035, - "grad_norm": 5.15625, + "epoch": 10.741797432239657, + "grad_norm": 8.0, "learning_rate": 4.418222222222222e-05, - "loss": 0.5302, + "loss": 0.7325, "step": 15060 }, { - "epoch": 1.8143510715145679, - "grad_norm": 8.5625, + "epoch": 10.748930099857347, + "grad_norm": 7.125, "learning_rate": 4.413777777777778e-05, - "loss": 0.5529, + "loss": 0.823, "step": 15070 }, { - "epoch": 1.8155550204671322, - "grad_norm": 8.8125, + "epoch": 10.756062767475036, + "grad_norm": 7.40625, "learning_rate": 4.4093333333333334e-05, - "loss": 0.5429, + "loss": 0.8095, "step": 15080 }, { - "epoch": 1.8167589694196966, - "grad_norm": 8.1875, + "epoch": 10.763195435092724, + "grad_norm": 7.875, "learning_rate": 4.404888888888889e-05, - "loss": 0.6234, + "loss": 0.7326, "step": 15090 }, { - "epoch": 1.817962918372261, - "grad_norm": 10.6875, + "epoch": 10.770328102710414, + "grad_norm": 31.0, "learning_rate": 4.400444444444445e-05, - "loss": 0.5107, + "loss": 0.8735, "step": 15100 }, { - "epoch": 1.817962918372261, + "epoch": 10.770328102710414, "eval/acc": 46.511627197265625, "step": 15100 }, { - "epoch": 1.817962918372261, - "eval_loss": 2.8768866062164307, - "eval_runtime": 0.2124, - "eval_samples_per_second": 202.465, - "eval_steps_per_second": 4.708, + "epoch": 10.770328102710414, + "eval_loss": 2.815082311630249, + "eval_runtime": 0.2824, + "eval_samples_per_second": 152.248, + "eval_steps_per_second": 3.541, "step": 15100 }, { - "epoch": 1.8191668673248254, - "grad_norm": 7.34375, + "epoch": 10.777460770328103, + "grad_norm": 9.625, "learning_rate": 4.396e-05, - "loss": 0.5985, + "loss": 0.8593, "step": 15110 }, { - "epoch": 1.8203708162773897, - "grad_norm": 8.4375, + "epoch": 10.78459343794579, + "grad_norm": 8.0, "learning_rate": 4.391555555555556e-05, - "loss": 0.5498, + "loss": 0.8309, "step": 15120 }, { - "epoch": 1.821574765229954, - "grad_norm": 10.5, + "epoch": 10.79172610556348, + "grad_norm": 7.53125, "learning_rate": 4.387111111111111e-05, - "loss": 0.7284, + "loss": 0.6465, "step": 15130 }, { - "epoch": 1.8227787141825187, - "grad_norm": 9.3125, + "epoch": 10.79885877318117, + "grad_norm": 9.25, "learning_rate": 4.382666666666667e-05, - "loss": 0.6616, + "loss": 0.8751, "step": 15140 }, { - "epoch": 1.823982663135083, - "grad_norm": 6.90625, + "epoch": 10.80599144079886, + "grad_norm": 7.6875, "learning_rate": 4.378222222222223e-05, - "loss": 0.541, + "loss": 0.7533, "step": 15150 }, { - "epoch": 1.8251866120876474, - "grad_norm": 8.25, + "epoch": 10.813124108416547, + "grad_norm": 8.5625, "learning_rate": 4.3737777777777775e-05, - "loss": 0.6453, + "loss": 0.7803, "step": 15160 }, { - "epoch": 1.826390561040212, - "grad_norm": 10.875, + "epoch": 10.820256776034237, + "grad_norm": 6.09375, "learning_rate": 4.369333333333334e-05, - "loss": 0.5483, + "loss": 0.6925, "step": 15170 }, { - "epoch": 1.8275945099927764, - "grad_norm": 6.9375, + "epoch": 10.827389443651926, + "grad_norm": 8.1875, "learning_rate": 4.3648888888888886e-05, - "loss": 0.4767, + "loss": 0.8491, "step": 15180 }, { - "epoch": 1.8287984589453408, - "grad_norm": 8.5, + "epoch": 10.834522111269616, + "grad_norm": 13.125, "learning_rate": 4.360444444444445e-05, - "loss": 0.5885, + "loss": 0.7565, "step": 15190 }, { - "epoch": 1.8300024078979051, - "grad_norm": 8.5625, + "epoch": 10.841654778887303, + "grad_norm": 9.25, "learning_rate": 4.356e-05, - "loss": 0.5084, + "loss": 0.9506, "step": 15200 }, { - "epoch": 1.8300024078979051, - "eval/acc": 46.511627197265625, + "epoch": 10.841654778887303, + "eval/acc": 41.86046600341797, "step": 15200 }, { - "epoch": 1.8300024078979051, - "eval_loss": 2.888690233230591, - "eval_runtime": 0.2472, - "eval_samples_per_second": 173.974, - "eval_steps_per_second": 4.046, + "epoch": 10.841654778887303, + "eval_loss": 2.834817409515381, + "eval_runtime": 0.2334, + "eval_samples_per_second": 184.267, + "eval_steps_per_second": 4.285, "step": 15200 }, { - "epoch": 1.8312063568504695, - "grad_norm": 7.5, + "epoch": 10.848787446504993, + "grad_norm": 6.71875, "learning_rate": 4.351555555555556e-05, - "loss": 0.6484, + "loss": 0.8631, "step": 15210 }, { - "epoch": 1.8324103058030339, - "grad_norm": 7.75, + "epoch": 10.855920114122682, + "grad_norm": 7.25, "learning_rate": 4.3471111111111114e-05, - "loss": 0.5007, + "loss": 0.8258, "step": 15220 }, { - "epoch": 1.8336142547555982, - "grad_norm": 13.5625, + "epoch": 10.863052781740372, + "grad_norm": 5.59375, "learning_rate": 4.342666666666667e-05, - "loss": 0.5155, + "loss": 0.7895, "step": 15230 }, { - "epoch": 1.8348182037081626, - "grad_norm": 12.75, + "epoch": 10.87018544935806, + "grad_norm": 6.65625, "learning_rate": 4.3382222222222224e-05, - "loss": 0.6569, + "loss": 0.8473, "step": 15240 }, { - "epoch": 1.8360221526607272, - "grad_norm": 8.5625, + "epoch": 10.877318116975749, + "grad_norm": 6.59375, "learning_rate": 4.333777777777778e-05, - "loss": 0.591, + "loss": 0.8323, "step": 15250 }, { - "epoch": 1.8372261016132916, - "grad_norm": 8.9375, + "epoch": 10.884450784593438, + "grad_norm": 9.3125, "learning_rate": 4.3293333333333334e-05, - "loss": 0.6035, + "loss": 0.7446, "step": 15260 }, { - "epoch": 1.838430050565856, - "grad_norm": 8.875, + "epoch": 10.891583452211126, + "grad_norm": 15.125, "learning_rate": 4.324888888888889e-05, - "loss": 0.6398, + "loss": 0.8885, "step": 15270 }, { - "epoch": 1.8396339995184205, - "grad_norm": 6.40625, + "epoch": 10.898716119828816, + "grad_norm": 21.25, "learning_rate": 4.3204444444444445e-05, - "loss": 0.6009, + "loss": 0.7624, "step": 15280 }, { - "epoch": 1.840837948470985, - "grad_norm": 6.875, + "epoch": 10.905848787446505, + "grad_norm": 7.125, "learning_rate": 4.316e-05, - "loss": 0.4908, + "loss": 0.6841, "step": 15290 }, { - "epoch": 1.8420418974235493, - "grad_norm": 7.0, + "epoch": 10.912981455064195, + "grad_norm": 8.5625, "learning_rate": 4.311555555555556e-05, - "loss": 0.613, + "loss": 0.8645, "step": 15300 }, { - "epoch": 1.8420418974235493, + "epoch": 10.912981455064195, "eval/acc": 46.511627197265625, "step": 15300 }, { - "epoch": 1.8420418974235493, - "eval_loss": 2.8745079040527344, - "eval_runtime": 0.2086, - "eval_samples_per_second": 206.14, - "eval_steps_per_second": 4.794, + "epoch": 10.912981455064195, + "eval_loss": 2.790827512741089, + "eval_runtime": 0.2527, + "eval_samples_per_second": 170.168, + "eval_steps_per_second": 3.957, "step": 15300 }, { - "epoch": 1.8432458463761137, - "grad_norm": 7.21875, + "epoch": 10.920114122681882, + "grad_norm": 9.0625, "learning_rate": 4.307111111111111e-05, - "loss": 0.5939, + "loss": 0.779, "step": 15310 }, { - "epoch": 1.844449795328678, - "grad_norm": 6.8125, + "epoch": 10.927246790299572, + "grad_norm": 5.96875, "learning_rate": 4.302666666666667e-05, - "loss": 0.5942, + "loss": 0.7987, "step": 15320 }, { - "epoch": 1.8456537442812424, - "grad_norm": 7.46875, + "epoch": 10.934379457917261, + "grad_norm": 7.25, "learning_rate": 4.298222222222222e-05, - "loss": 0.5722, + "loss": 0.8278, "step": 15330 }, { - "epoch": 1.8468576932338068, - "grad_norm": 9.6875, + "epoch": 10.94151212553495, + "grad_norm": 7.625, "learning_rate": 4.293777777777778e-05, - "loss": 0.6481, + "loss": 0.8022, "step": 15340 }, { - "epoch": 1.8480616421863711, - "grad_norm": 6.96875, + "epoch": 10.948644793152638, + "grad_norm": 6.6875, "learning_rate": 4.289333333333334e-05, - "loss": 0.5667, + "loss": 0.7333, "step": 15350 }, { - "epoch": 1.8492655911389357, - "grad_norm": 6.6875, + "epoch": 10.955777460770328, + "grad_norm": 7.4375, "learning_rate": 4.284888888888889e-05, - "loss": 0.553, + "loss": 0.7398, "step": 15360 }, { - "epoch": 1.8504695400915, - "grad_norm": 11.4375, + "epoch": 10.962910128388017, + "grad_norm": 10.1875, "learning_rate": 4.280444444444445e-05, - "loss": 0.6384, + "loss": 0.7397, "step": 15370 }, { - "epoch": 1.8516734890440645, - "grad_norm": 9.3125, + "epoch": 10.970042796005707, + "grad_norm": 7.375, "learning_rate": 4.276e-05, - "loss": 0.591, + "loss": 0.7993, "step": 15380 }, { - "epoch": 1.852877437996629, - "grad_norm": 8.4375, + "epoch": 10.977175463623395, + "grad_norm": 7.59375, "learning_rate": 4.271555555555556e-05, - "loss": 0.5953, + "loss": 0.7811, "step": 15390 }, { - "epoch": 1.8540813869491934, - "grad_norm": 6.4375, + "epoch": 10.984308131241084, + "grad_norm": 7.46875, "learning_rate": 4.2671111111111114e-05, - "loss": 0.6149, + "loss": 0.7611, "step": 15400 }, { - "epoch": 1.8540813869491934, + "epoch": 10.984308131241084, "eval/acc": 46.511627197265625, "step": 15400 }, { - "epoch": 1.8540813869491934, - "eval_loss": 2.8363993167877197, - "eval_runtime": 0.3986, - "eval_samples_per_second": 107.885, - "eval_steps_per_second": 2.509, + "epoch": 10.984308131241084, + "eval_loss": 2.8109776973724365, + "eval_runtime": 0.2365, + "eval_samples_per_second": 181.849, + "eval_steps_per_second": 4.229, "step": 15400 }, { - "epoch": 1.8552853359017578, + "epoch": 10.991440798858774, "grad_norm": 7.46875, "learning_rate": 4.262666666666667e-05, - "loss": 0.5669, + "loss": 0.8165, "step": 15410 }, { - "epoch": 1.8564892848543222, - "grad_norm": 8.0625, + "epoch": 10.998573466476461, + "grad_norm": 8.625, "learning_rate": 4.2582222222222225e-05, - "loss": 0.618, + "loss": 0.8449, "step": 15420 }, { - "epoch": 1.8576932338068866, - "grad_norm": 7.21875, + "epoch": 11.00570613409415, + "grad_norm": 7.28125, "learning_rate": 4.253777777777778e-05, - "loss": 0.5657, + "loss": 0.7556, "step": 15430 }, { - "epoch": 1.858897182759451, - "grad_norm": 6.90625, + "epoch": 11.01283880171184, + "grad_norm": 30.375, "learning_rate": 4.2493333333333335e-05, - "loss": 0.6795, + "loss": 0.8541, "step": 15440 }, { - "epoch": 1.8601011317120153, - "grad_norm": 7.71875, + "epoch": 11.01997146932953, + "grad_norm": 8.375, "learning_rate": 4.244888888888889e-05, - "loss": 0.5312, + "loss": 0.8616, "step": 15450 }, { - "epoch": 1.8613050806645797, - "grad_norm": 8.4375, + "epoch": 11.027104136947218, + "grad_norm": 8.5, "learning_rate": 4.2404444444444446e-05, - "loss": 0.6856, + "loss": 0.7837, "step": 15460 }, { - "epoch": 1.8625090296171443, - "grad_norm": 9.6875, + "epoch": 11.034236804564907, + "grad_norm": 9.125, "learning_rate": 4.236e-05, - "loss": 0.6227, + "loss": 0.6939, "step": 15470 }, { - "epoch": 1.8637129785697086, - "grad_norm": 18.75, + "epoch": 11.041369472182597, + "grad_norm": 8.1875, "learning_rate": 4.2315555555555556e-05, - "loss": 0.5966, + "loss": 0.6788, "step": 15480 }, { - "epoch": 1.864916927522273, - "grad_norm": 8.9375, + "epoch": 11.048502139800286, + "grad_norm": 8.1875, "learning_rate": 4.227111111111111e-05, - "loss": 0.6483, + "loss": 0.7654, "step": 15490 }, { - "epoch": 1.8661208764748376, - "grad_norm": 7.625, + "epoch": 11.055634807417974, + "grad_norm": 8.375, "learning_rate": 4.222666666666667e-05, - "loss": 0.6927, + "loss": 0.7765, "step": 15500 }, { - "epoch": 1.8661208764748376, - "eval/acc": 44.1860466003418, + "epoch": 11.055634807417974, + "eval/acc": 41.86046600341797, "step": 15500 }, { - "epoch": 1.8661208764748376, - "eval_loss": 2.852205991744995, - "eval_runtime": 4.5374, - "eval_samples_per_second": 9.477, - "eval_steps_per_second": 0.22, + "epoch": 11.055634807417974, + "eval_loss": 2.01023268699646, + "eval_runtime": 5.0492, + "eval_samples_per_second": 8.516, + "eval_steps_per_second": 0.198, "step": 15500 }, { - "epoch": 1.867324825427402, - "grad_norm": 6.09375, + "epoch": 11.062767475035663, + "grad_norm": 9.125, "learning_rate": 4.218222222222222e-05, - "loss": 0.5317, + "loss": 0.8758, "step": 15510 }, { - "epoch": 1.8685287743799663, - "grad_norm": 6.09375, + "epoch": 11.069900142653353, + "grad_norm": 8.375, "learning_rate": 4.2137777777777784e-05, - "loss": 0.5278, + "loss": 0.8337, "step": 15520 }, { - "epoch": 1.8697327233325307, - "grad_norm": 15.375, + "epoch": 11.077032810271042, + "grad_norm": 7.78125, "learning_rate": 4.209333333333333e-05, - "loss": 0.6451, + "loss": 0.8168, "step": 15530 }, { - "epoch": 1.870936672285095, - "grad_norm": 9.5, + "epoch": 11.08416547788873, + "grad_norm": 7.03125, "learning_rate": 4.2048888888888894e-05, - "loss": 0.5186, + "loss": 0.8345, "step": 15540 }, { - "epoch": 1.8721406212376595, - "grad_norm": 7.375, + "epoch": 11.09129814550642, + "grad_norm": 8.8125, "learning_rate": 4.200444444444445e-05, - "loss": 0.5882, + "loss": 0.7392, "step": 15550 }, { - "epoch": 1.8733445701902238, - "grad_norm": 5.1875, + "epoch": 11.098430813124109, + "grad_norm": 8.9375, "learning_rate": 4.196e-05, - "loss": 0.5213, + "loss": 0.7623, "step": 15560 }, { - "epoch": 1.8745485191427882, - "grad_norm": 9.3125, + "epoch": 11.105563480741797, + "grad_norm": 9.875, "learning_rate": 4.191555555555556e-05, - "loss": 0.58, + "loss": 0.6797, "step": 15570 }, { - "epoch": 1.8757524680953528, - "grad_norm": 6.75, + "epoch": 11.112696148359486, + "grad_norm": 7.96875, "learning_rate": 4.187111111111111e-05, - "loss": 0.5244, + "loss": 0.7957, "step": 15580 }, { - "epoch": 1.8769564170479172, - "grad_norm": 11.625, + "epoch": 11.119828815977176, + "grad_norm": 7.0625, "learning_rate": 4.182666666666667e-05, - "loss": 0.5446, + "loss": 0.7701, "step": 15590 }, { - "epoch": 1.8781603660004815, - "grad_norm": 7.96875, + "epoch": 11.126961483594865, + "grad_norm": 6.90625, "learning_rate": 4.1782222222222226e-05, - "loss": 0.5606, + "loss": 0.8514, "step": 15600 }, { - "epoch": 1.8781603660004815, - "eval/acc": 46.511627197265625, + "epoch": 11.126961483594865, + "eval/acc": 39.53488540649414, "step": 15600 }, { - "epoch": 1.8781603660004815, - "eval_loss": 2.845609664916992, - "eval_runtime": 4.5649, - "eval_samples_per_second": 9.42, - "eval_steps_per_second": 0.219, + "epoch": 11.126961483594865, + "eval_loss": 2.0217747688293457, + "eval_runtime": 0.2343, + "eval_samples_per_second": 183.512, + "eval_steps_per_second": 4.268, "step": 15600 }, { - "epoch": 1.8793643149530461, - "grad_norm": 8.875, + "epoch": 11.134094151212553, + "grad_norm": 7.3125, "learning_rate": 4.173777777777778e-05, - "loss": 0.6415, + "loss": 0.7757, "step": 15610 }, { - "epoch": 1.8805682639056105, - "grad_norm": 7.75, + "epoch": 11.141226818830242, + "grad_norm": 6.6875, "learning_rate": 4.1693333333333336e-05, - "loss": 0.5137, + "loss": 0.6947, "step": 15620 }, { - "epoch": 1.8817722128581749, - "grad_norm": 7.5, + "epoch": 11.148359486447932, + "grad_norm": 7.3125, "learning_rate": 4.164888888888889e-05, - "loss": 0.5945, + "loss": 0.8118, "step": 15630 }, { - "epoch": 1.8829761618107392, - "grad_norm": 9.25, + "epoch": 11.155492154065621, + "grad_norm": 8.0625, "learning_rate": 4.160444444444445e-05, - "loss": 0.5678, + "loss": 0.848, "step": 15640 }, { - "epoch": 1.8841801107633036, - "grad_norm": 8.125, + "epoch": 11.162624821683309, + "grad_norm": 6.6875, "learning_rate": 4.156e-05, - "loss": 0.5011, + "loss": 0.7256, "step": 15650 }, { - "epoch": 1.885384059715868, - "grad_norm": 6.125, + "epoch": 11.169757489300999, + "grad_norm": 6.3125, "learning_rate": 4.151555555555556e-05, - "loss": 0.5921, + "loss": 0.7898, "step": 15660 }, { - "epoch": 1.8865880086684323, - "grad_norm": 7.15625, + "epoch": 11.176890156918688, + "grad_norm": 5.53125, "learning_rate": 4.147111111111111e-05, - "loss": 0.5706, + "loss": 0.7057, "step": 15670 }, { - "epoch": 1.8877919576209967, - "grad_norm": 7.15625, + "epoch": 11.184022824536376, + "grad_norm": 8.0, "learning_rate": 4.142666666666667e-05, - "loss": 0.6199, + "loss": 0.7065, "step": 15680 }, { - "epoch": 1.8889959065735613, - "grad_norm": 8.375, + "epoch": 11.191155492154065, + "grad_norm": 7.0, "learning_rate": 4.138222222222222e-05, - "loss": 0.6039, + "loss": 0.812, "step": 15690 }, { - "epoch": 1.8901998555261257, - "grad_norm": 9.0, + "epoch": 11.198288159771755, + "grad_norm": 35.0, "learning_rate": 4.133777777777778e-05, - "loss": 0.5823, + "loss": 0.7953, "step": 15700 }, { - "epoch": 1.8901998555261257, - "eval/acc": 47.093021392822266, + "epoch": 11.198288159771755, + "eval/acc": 39.53488540649414, "step": 15700 }, { - "epoch": 1.8901998555261257, - "eval_loss": 2.8587894439697266, - "eval_runtime": 0.2205, - "eval_samples_per_second": 195.01, - "eval_steps_per_second": 4.535, + "epoch": 11.198288159771755, + "eval_loss": 2.0371451377868652, + "eval_runtime": 0.2311, + "eval_samples_per_second": 186.077, + "eval_steps_per_second": 4.327, "step": 15700 }, { - "epoch": 1.89140380447869, - "grad_norm": 6.59375, + "epoch": 11.205420827389444, + "grad_norm": 6.8125, "learning_rate": 4.129333333333333e-05, - "loss": 0.6721, + "loss": 0.8272, "step": 15710 }, { - "epoch": 1.8926077534312546, - "grad_norm": 6.1875, + "epoch": 11.212553495007132, + "grad_norm": 9.625, "learning_rate": 4.1248888888888895e-05, - "loss": 0.5404, + "loss": 0.782, "step": 15720 }, { - "epoch": 1.893811702383819, - "grad_norm": 8.0, + "epoch": 11.219686162624821, + "grad_norm": 7.59375, "learning_rate": 4.1204444444444444e-05, - "loss": 0.6024, + "loss": 0.8176, "step": 15730 }, { - "epoch": 1.8950156513363834, - "grad_norm": 7.15625, + "epoch": 11.226818830242511, + "grad_norm": 7.40625, "learning_rate": 4.1160000000000006e-05, - "loss": 0.6063, + "loss": 0.7592, "step": 15740 }, { - "epoch": 1.8962196002889478, - "grad_norm": 6.125, + "epoch": 11.2339514978602, + "grad_norm": 9.1875, "learning_rate": 4.1115555555555554e-05, - "loss": 0.6106, + "loss": 0.7587, "step": 15750 }, { - "epoch": 1.8974235492415121, - "grad_norm": 7.09375, + "epoch": 11.241084165477888, + "grad_norm": 17.375, "learning_rate": 4.1071111111111116e-05, - "loss": 0.5872, + "loss": 0.7165, "step": 15760 }, { - "epoch": 1.8986274981940765, - "grad_norm": 4.59375, + "epoch": 11.248216833095578, + "grad_norm": 8.5625, "learning_rate": 4.102666666666667e-05, - "loss": 0.5415, + "loss": 0.7391, "step": 15770 }, { - "epoch": 1.8998314471466409, - "grad_norm": 7.3125, + "epoch": 11.255349500713267, + "grad_norm": 9.75, "learning_rate": 4.098222222222222e-05, - "loss": 0.6141, + "loss": 0.745, "step": 15780 }, { - "epoch": 1.9010353960992052, - "grad_norm": 9.5, + "epoch": 11.262482168330957, + "grad_norm": 8.75, "learning_rate": 4.093777777777778e-05, - "loss": 0.6448, + "loss": 0.7635, "step": 15790 }, { - "epoch": 1.9022393450517698, - "grad_norm": 7.1875, + "epoch": 11.269614835948644, + "grad_norm": 6.5, "learning_rate": 4.089333333333333e-05, - "loss": 0.5461, + "loss": 0.8273, "step": 15800 }, { - "epoch": 1.9022393450517698, - "eval/acc": 48.83720779418945, + "epoch": 11.269614835948644, + "eval/acc": 41.86046600341797, "step": 15800 }, { - "epoch": 1.9022393450517698, - "eval_loss": 2.854886293411255, - "eval_runtime": 0.2206, - "eval_samples_per_second": 194.945, - "eval_steps_per_second": 4.534, + "epoch": 11.269614835948644, + "eval_loss": 2.0061967372894287, + "eval_runtime": 0.2356, + "eval_samples_per_second": 182.538, + "eval_steps_per_second": 4.245, "step": 15800 }, { - "epoch": 1.9034432940043342, - "grad_norm": 20.625, + "epoch": 11.276747503566334, + "grad_norm": 7.0, "learning_rate": 4.084888888888889e-05, - "loss": 0.631, + "loss": 0.7411, "step": 15810 }, { - "epoch": 1.9046472429568986, - "grad_norm": 6.875, + "epoch": 11.283880171184023, + "grad_norm": 11.375, "learning_rate": 4.080444444444445e-05, - "loss": 0.5455, + "loss": 0.7564, "step": 15820 }, { - "epoch": 1.9058511919094632, - "grad_norm": 6.4375, + "epoch": 11.291012838801711, + "grad_norm": 8.5, "learning_rate": 4.076e-05, - "loss": 0.5827, + "loss": 0.8688, "step": 15830 }, { - "epoch": 1.9070551408620275, - "grad_norm": 6.875, + "epoch": 11.2981455064194, + "grad_norm": 7.03125, "learning_rate": 4.071555555555556e-05, - "loss": 0.5239, + "loss": 0.7351, "step": 15840 }, { - "epoch": 1.908259089814592, - "grad_norm": 12.8125, + "epoch": 11.30527817403709, + "grad_norm": 9.0, "learning_rate": 4.067111111111111e-05, - "loss": 0.5824, + "loss": 0.7432, "step": 15850 }, { - "epoch": 1.9094630387671563, - "grad_norm": 7.65625, + "epoch": 11.31241084165478, + "grad_norm": 9.875, "learning_rate": 4.062666666666667e-05, - "loss": 0.53, + "loss": 0.7984, "step": 15860 }, { - "epoch": 1.9106669877197207, - "grad_norm": 7.5, + "epoch": 11.319543509272467, + "grad_norm": 7.1875, "learning_rate": 4.0582222222222224e-05, - "loss": 0.6126, + "loss": 0.8125, "step": 15870 }, { - "epoch": 1.911870936672285, - "grad_norm": 7.84375, + "epoch": 11.326676176890157, + "grad_norm": 7.5, "learning_rate": 4.053777777777778e-05, - "loss": 0.5268, + "loss": 0.7995, "step": 15880 }, { - "epoch": 1.9130748856248494, - "grad_norm": 8.75, + "epoch": 11.333808844507846, + "grad_norm": 9.0, "learning_rate": 4.0493333333333334e-05, - "loss": 0.6154, + "loss": 0.7915, "step": 15890 }, { - "epoch": 1.9142788345774138, - "grad_norm": 10.0625, + "epoch": 11.340941512125536, + "grad_norm": 9.5625, "learning_rate": 4.044888888888889e-05, - "loss": 0.6189, + "loss": 0.8598, "step": 15900 }, { - "epoch": 1.9142788345774138, - "eval/acc": 44.1860466003418, + "epoch": 11.340941512125536, + "eval/acc": 37.20930099487305, "step": 15900 }, { - "epoch": 1.9142788345774138, - "eval_loss": 2.848970890045166, - "eval_runtime": 0.216, - "eval_samples_per_second": 199.097, - "eval_steps_per_second": 4.63, + "epoch": 11.340941512125536, + "eval_loss": 2.0159339904785156, + "eval_runtime": 0.7376, + "eval_samples_per_second": 58.299, + "eval_steps_per_second": 1.356, "step": 15900 }, { - "epoch": 1.9154827835299784, - "grad_norm": 7.5625, + "epoch": 11.348074179743223, + "grad_norm": 8.25, "learning_rate": 4.0404444444444445e-05, - "loss": 0.6334, + "loss": 0.8052, "step": 15910 }, { - "epoch": 1.9166867324825427, - "grad_norm": 6.1875, + "epoch": 11.355206847360913, + "grad_norm": 10.75, "learning_rate": 4.0360000000000007e-05, - "loss": 0.5638, + "loss": 0.8008, "step": 15920 }, { - "epoch": 1.917890681435107, - "grad_norm": 15.0, + "epoch": 11.362339514978602, + "grad_norm": 6.8125, "learning_rate": 4.0315555555555555e-05, - "loss": 0.5426, + "loss": 0.7922, "step": 15930 }, { - "epoch": 1.9190946303876717, - "grad_norm": 7.125, + "epoch": 11.36947218259629, + "grad_norm": 11.8125, "learning_rate": 4.027111111111112e-05, - "loss": 0.524, + "loss": 0.7762, "step": 15940 }, { - "epoch": 1.920298579340236, - "grad_norm": 11.6875, + "epoch": 11.37660485021398, + "grad_norm": 8.4375, "learning_rate": 4.0226666666666666e-05, - "loss": 0.5846, + "loss": 0.7765, "step": 15950 }, { - "epoch": 1.9215025282928004, - "grad_norm": 8.3125, + "epoch": 11.383737517831669, + "grad_norm": 8.25, "learning_rate": 4.018222222222223e-05, - "loss": 0.5928, + "loss": 0.7922, "step": 15960 }, { - "epoch": 1.9227064772453648, - "grad_norm": 7.40625, + "epoch": 11.390870185449359, + "grad_norm": 9.5, "learning_rate": 4.013777777777778e-05, - "loss": 0.5476, + "loss": 0.7839, "step": 15970 }, { - "epoch": 1.9239104261979292, - "grad_norm": 5.5625, + "epoch": 11.398002853067046, + "grad_norm": 8.125, "learning_rate": 4.009333333333333e-05, - "loss": 0.5344, + "loss": 0.6813, "step": 15980 }, { - "epoch": 1.9251143751504936, - "grad_norm": 6.5, + "epoch": 11.405135520684736, + "grad_norm": 6.28125, "learning_rate": 4.004888888888889e-05, - "loss": 0.4896, + "loss": 0.8077, "step": 15990 }, { - "epoch": 1.926318324103058, - "grad_norm": 6.96875, + "epoch": 11.412268188302425, + "grad_norm": 8.3125, "learning_rate": 4.000444444444444e-05, - "loss": 0.5144, + "loss": 0.7049, "step": 16000 }, { - "epoch": 1.926318324103058, - "eval/acc": 44.76744079589844, + "epoch": 11.412268188302425, + "eval/acc": 41.86046600341797, "step": 16000 }, { - "epoch": 1.926318324103058, - "eval_loss": 2.851822853088379, - "eval_runtime": 0.2165, - "eval_samples_per_second": 198.64, - "eval_steps_per_second": 4.62, + "epoch": 11.412268188302425, + "eval_loss": 2.0035645961761475, + "eval_runtime": 0.3154, + "eval_samples_per_second": 136.335, + "eval_steps_per_second": 3.171, "step": 16000 }, { - "epoch": 1.9275222730556223, - "grad_norm": 12.0625, + "epoch": 11.419400855920115, + "grad_norm": 6.75, "learning_rate": 3.9960000000000004e-05, - "loss": 0.5676, + "loss": 0.7572, "step": 16010 }, { - "epoch": 1.9287262220081869, - "grad_norm": 8.75, + "epoch": 11.426533523537802, + "grad_norm": 7.9375, "learning_rate": 3.991555555555556e-05, - "loss": 0.4875, + "loss": 0.7396, "step": 16020 }, { - "epoch": 1.9299301709607513, - "grad_norm": 7.5, + "epoch": 11.433666191155492, + "grad_norm": 8.5, "learning_rate": 3.9871111111111114e-05, - "loss": 0.6563, + "loss": 0.7213, "step": 16030 }, { - "epoch": 1.9311341199133156, - "grad_norm": 6.25, + "epoch": 11.440798858773181, + "grad_norm": 8.75, "learning_rate": 3.982666666666667e-05, - "loss": 0.5485, + "loss": 0.7403, "step": 16040 }, { - "epoch": 1.9323380688658802, - "grad_norm": 7.53125, + "epoch": 11.447931526390871, + "grad_norm": 6.625, "learning_rate": 3.9782222222222225e-05, - "loss": 0.5059, + "loss": 0.7124, "step": 16050 }, { - "epoch": 1.9335420178184446, - "grad_norm": 6.9375, + "epoch": 11.455064194008559, + "grad_norm": 8.75, "learning_rate": 3.973777777777778e-05, - "loss": 0.5767, + "loss": 0.7377, "step": 16060 }, { - "epoch": 1.934745966771009, - "grad_norm": 5.78125, + "epoch": 11.462196861626248, + "grad_norm": 9.1875, "learning_rate": 3.9693333333333335e-05, - "loss": 0.6159, + "loss": 0.8129, "step": 16070 }, { - "epoch": 1.9359499157235733, - "grad_norm": 9.6875, + "epoch": 11.469329529243938, + "grad_norm": 8.5, "learning_rate": 3.964888888888889e-05, - "loss": 0.5748, + "loss": 0.8099, "step": 16080 }, { - "epoch": 1.9371538646761377, - "grad_norm": 7.8125, + "epoch": 11.476462196861625, + "grad_norm": 8.75, "learning_rate": 3.9604444444444445e-05, - "loss": 0.5972, + "loss": 0.8657, "step": 16090 }, { - "epoch": 1.938357813628702, - "grad_norm": 7.6875, + "epoch": 11.483594864479315, + "grad_norm": 6.9375, "learning_rate": 3.956e-05, - "loss": 0.4898, + "loss": 0.8028, "step": 16100 }, { - "epoch": 1.938357813628702, - "eval/acc": 41.86046600341797, + "epoch": 11.483594864479315, + "eval/acc": 39.53488540649414, "step": 16100 }, { - "epoch": 1.938357813628702, - "eval_loss": 2.8877687454223633, - "eval_runtime": 0.5978, - "eval_samples_per_second": 71.93, - "eval_steps_per_second": 1.673, + "epoch": 11.483594864479315, + "eval_loss": 2.01182222366333, + "eval_runtime": 0.2322, + "eval_samples_per_second": 185.159, + "eval_steps_per_second": 4.306, "step": 16100 }, { - "epoch": 1.9395617625812664, - "grad_norm": 7.96875, + "epoch": 11.490727532097004, + "grad_norm": 22.0, "learning_rate": 3.9515555555555556e-05, - "loss": 0.5813, + "loss": 0.9008, "step": 16110 }, { - "epoch": 1.9407657115338308, - "grad_norm": 7.0625, + "epoch": 11.497860199714694, + "grad_norm": 7.96875, "learning_rate": 3.947111111111111e-05, - "loss": 0.5686, + "loss": 0.749, "step": 16120 }, { - "epoch": 1.9419696604863954, - "grad_norm": 7.15625, + "epoch": 11.504992867332382, + "grad_norm": 9.625, "learning_rate": 3.9426666666666666e-05, - "loss": 0.5711, + "loss": 0.7008, "step": 16130 }, { - "epoch": 1.9431736094389598, - "grad_norm": 9.25, + "epoch": 11.512125534950071, + "grad_norm": 7.90625, "learning_rate": 3.938222222222223e-05, - "loss": 0.5875, + "loss": 0.6725, "step": 16140 }, { - "epoch": 1.9443775583915242, - "grad_norm": 6.65625, + "epoch": 11.51925820256776, + "grad_norm": 6.9375, "learning_rate": 3.933777777777778e-05, - "loss": 0.5032, + "loss": 0.7104, "step": 16150 }, { - "epoch": 1.9455815073440887, - "grad_norm": 8.4375, + "epoch": 11.52639087018545, + "grad_norm": 9.3125, "learning_rate": 3.929333333333334e-05, - "loss": 0.5379, + "loss": 0.7202, "step": 16160 }, { - "epoch": 1.9467854562966531, - "grad_norm": 11.25, + "epoch": 11.533523537803138, + "grad_norm": 8.5625, "learning_rate": 3.924888888888889e-05, - "loss": 0.5714, + "loss": 0.8841, "step": 16170 }, { - "epoch": 1.9479894052492175, - "grad_norm": 7.59375, + "epoch": 11.540656205420827, + "grad_norm": 8.625, "learning_rate": 3.920444444444444e-05, - "loss": 0.7453, + "loss": 0.8151, "step": 16180 }, { - "epoch": 1.9491933542017819, - "grad_norm": 8.9375, + "epoch": 11.547788873038517, + "grad_norm": 8.6875, "learning_rate": 3.9160000000000005e-05, - "loss": 0.6535, + "loss": 0.6946, "step": 16190 }, { - "epoch": 1.9503973031543462, - "grad_norm": 6.71875, + "epoch": 11.554921540656206, + "grad_norm": 5.46875, "learning_rate": 3.911555555555555e-05, - "loss": 0.535, + "loss": 0.8014, "step": 16200 }, { - "epoch": 1.9503973031543462, - "eval/acc": 42.44186019897461, + "epoch": 11.554921540656206, + "eval/acc": 37.20930099487305, "step": 16200 }, { - "epoch": 1.9503973031543462, - "eval_loss": 2.87386417388916, - "eval_runtime": 0.2107, - "eval_samples_per_second": 204.07, - "eval_steps_per_second": 4.746, + "epoch": 11.554921540656206, + "eval_loss": 2.008047580718994, + "eval_runtime": 0.2377, + "eval_samples_per_second": 180.881, + "eval_steps_per_second": 4.207, "step": 16200 }, { - "epoch": 1.9516012521069106, - "grad_norm": 10.0625, + "epoch": 11.562054208273894, + "grad_norm": 6.5, "learning_rate": 3.9071111111111115e-05, - "loss": 0.5891, + "loss": 0.8634, "step": 16210 }, { - "epoch": 1.952805201059475, - "grad_norm": 37.0, + "epoch": 11.569186875891583, + "grad_norm": 10.0625, "learning_rate": 3.902666666666667e-05, - "loss": 0.584, + "loss": 0.8836, "step": 16220 }, { - "epoch": 1.9540091500120393, - "grad_norm": 8.3125, + "epoch": 11.576319543509273, + "grad_norm": 6.34375, "learning_rate": 3.8982222222222225e-05, - "loss": 0.5307, + "loss": 0.6787, "step": 16230 }, { - "epoch": 1.955213098964604, - "grad_norm": 16.125, + "epoch": 11.58345221112696, + "grad_norm": 7.8125, "learning_rate": 3.893777777777778e-05, - "loss": 0.5957, + "loss": 0.7925, "step": 16240 }, { - "epoch": 1.9564170479171683, - "grad_norm": 7.65625, + "epoch": 11.59058487874465, + "grad_norm": 10.6875, "learning_rate": 3.8893333333333336e-05, - "loss": 0.592, + "loss": 0.7393, "step": 16250 }, { - "epoch": 1.9576209968697327, - "grad_norm": 8.8125, + "epoch": 11.59771754636234, + "grad_norm": 6.65625, "learning_rate": 3.884888888888889e-05, - "loss": 0.468, + "loss": 0.7407, "step": 16260 }, { - "epoch": 1.9588249458222973, - "grad_norm": 7.25, + "epoch": 11.60485021398003, + "grad_norm": 6.5625, "learning_rate": 3.8804444444444446e-05, - "loss": 0.5946, + "loss": 0.8039, "step": 16270 }, { - "epoch": 1.9600288947748616, - "grad_norm": 7.84375, + "epoch": 11.611982881597717, + "grad_norm": 7.3125, "learning_rate": 3.876e-05, - "loss": 0.5809, + "loss": 0.8564, "step": 16280 }, { - "epoch": 1.961232843727426, - "grad_norm": 7.15625, + "epoch": 11.619115549215406, + "grad_norm": 6.6875, "learning_rate": 3.871555555555556e-05, - "loss": 0.5817, + "loss": 0.7674, "step": 16290 }, { - "epoch": 1.9624367926799904, - "grad_norm": 8.125, + "epoch": 11.626248216833096, + "grad_norm": 7.8125, "learning_rate": 3.867111111111111e-05, - "loss": 0.5724, + "loss": 0.8431, "step": 16300 }, { - "epoch": 1.9624367926799904, - "eval/acc": 44.1860466003418, + "epoch": 11.626248216833096, + "eval/acc": 41.86046600341797, "step": 16300 }, { - "epoch": 1.9624367926799904, - "eval_loss": 2.8631114959716797, - "eval_runtime": 0.2127, - "eval_samples_per_second": 202.139, - "eval_steps_per_second": 4.701, + "epoch": 11.626248216833096, + "eval_loss": 1.9884032011032104, + "eval_runtime": 0.2237, + "eval_samples_per_second": 192.225, + "eval_steps_per_second": 4.47, "step": 16300 }, { - "epoch": 1.9636407416325548, - "grad_norm": 8.5, + "epoch": 11.633380884450785, + "grad_norm": 6.84375, "learning_rate": 3.862666666666667e-05, - "loss": 0.5632, + "loss": 0.7772, "step": 16310 }, { - "epoch": 1.9648446905851191, - "grad_norm": 5.46875, + "epoch": 11.640513552068473, + "grad_norm": 6.40625, "learning_rate": 3.858222222222222e-05, - "loss": 0.5247, + "loss": 0.8256, "step": 16320 }, { - "epoch": 1.9660486395376835, - "grad_norm": 7.34375, + "epoch": 11.647646219686163, + "grad_norm": 7.96875, "learning_rate": 3.853777777777778e-05, - "loss": 0.5958, + "loss": 0.7004, "step": 16330 }, { - "epoch": 1.9672525884902479, - "grad_norm": 5.625, + "epoch": 11.654778887303852, + "grad_norm": 8.125, "learning_rate": 3.849333333333334e-05, - "loss": 0.6062, + "loss": 0.8883, "step": 16340 }, { - "epoch": 1.9684565374428125, - "grad_norm": 15.625, + "epoch": 11.661911554921542, + "grad_norm": 17.5, "learning_rate": 3.844888888888889e-05, - "loss": 0.4598, + "loss": 0.7894, "step": 16350 }, { - "epoch": 1.9696604863953768, - "grad_norm": 9.625, + "epoch": 11.66904422253923, + "grad_norm": 8.0, "learning_rate": 3.840444444444445e-05, - "loss": 0.5428, + "loss": 0.8491, "step": 16360 }, { - "epoch": 1.9708644353479412, - "grad_norm": 7.5625, + "epoch": 11.676176890156919, + "grad_norm": 6.78125, "learning_rate": 3.836e-05, - "loss": 0.5846, + "loss": 0.8265, "step": 16370 }, { - "epoch": 1.9720683843005058, - "grad_norm": 7.21875, + "epoch": 11.683309557774608, + "grad_norm": 8.25, "learning_rate": 3.831555555555556e-05, - "loss": 0.5767, + "loss": 0.7288, "step": 16380 }, { - "epoch": 1.9732723332530702, - "grad_norm": 7.875, + "epoch": 11.690442225392296, + "grad_norm": 7.96875, "learning_rate": 3.8271111111111116e-05, - "loss": 0.5815, + "loss": 0.7468, "step": 16390 }, { - "epoch": 1.9744762822056345, + "epoch": 11.697574893009985, "grad_norm": 7.28125, "learning_rate": 3.8226666666666664e-05, - "loss": 0.6038, + "loss": 0.7557, "step": 16400 }, { - "epoch": 1.9744762822056345, - "eval/acc": 44.1860466003418, + "epoch": 11.697574893009985, + "eval/acc": 39.53488540649414, "step": 16400 }, { - "epoch": 1.9744762822056345, - "eval_loss": 2.8775668144226074, - "eval_runtime": 0.2112, - "eval_samples_per_second": 203.634, - "eval_steps_per_second": 4.736, + "epoch": 11.697574893009985, + "eval_loss": 1.9937853813171387, + "eval_runtime": 0.2332, + "eval_samples_per_second": 184.37, + "eval_steps_per_second": 4.288, "step": 16400 }, { - "epoch": 1.975680231158199, - "grad_norm": 6.53125, + "epoch": 11.704707560627675, + "grad_norm": 9.5625, "learning_rate": 3.8182222222222226e-05, - "loss": 0.5933, + "loss": 0.8767, "step": 16410 }, { - "epoch": 1.9768841801107633, - "grad_norm": 7.78125, + "epoch": 11.711840228245364, + "grad_norm": 7.96875, "learning_rate": 3.8137777777777775e-05, - "loss": 0.5237, + "loss": 0.6997, "step": 16420 }, { - "epoch": 1.9780881290633276, - "grad_norm": 7.34375, + "epoch": 11.718972895863052, + "grad_norm": 6.75, "learning_rate": 3.809333333333334e-05, - "loss": 0.5757, + "loss": 0.8482, "step": 16430 }, { - "epoch": 1.979292078015892, - "grad_norm": 11.3125, + "epoch": 11.726105563480742, + "grad_norm": 9.25, "learning_rate": 3.804888888888889e-05, - "loss": 0.6774, + "loss": 0.8189, "step": 16440 }, { - "epoch": 1.9804960269684564, - "grad_norm": 5.75, + "epoch": 11.733238231098431, + "grad_norm": 7.21875, "learning_rate": 3.800444444444445e-05, - "loss": 0.5364, + "loss": 0.7069, "step": 16450 }, { - "epoch": 1.981699975921021, - "grad_norm": 6.28125, + "epoch": 11.74037089871612, + "grad_norm": 8.125, "learning_rate": 3.796e-05, - "loss": 0.6244, + "loss": 0.8172, "step": 16460 }, { - "epoch": 1.9829039248735854, - "grad_norm": 7.75, + "epoch": 11.747503566333808, + "grad_norm": 8.0625, "learning_rate": 3.791555555555556e-05, - "loss": 0.497, + "loss": 0.7332, "step": 16470 }, { - "epoch": 1.9841078738261497, - "grad_norm": 10.1875, + "epoch": 11.754636233951498, + "grad_norm": 7.75, "learning_rate": 3.787111111111111e-05, - "loss": 0.5933, + "loss": 0.7746, "step": 16480 }, { - "epoch": 1.9853118227787143, - "grad_norm": 6.40625, + "epoch": 11.761768901569187, + "grad_norm": 8.8125, "learning_rate": 3.782666666666667e-05, - "loss": 0.5994, + "loss": 0.7679, "step": 16490 }, { - "epoch": 1.9865157717312787, - "grad_norm": 7.21875, + "epoch": 11.768901569186877, + "grad_norm": 7.25, "learning_rate": 3.778222222222222e-05, - "loss": 0.7191, + "loss": 0.9753, "step": 16500 }, { - "epoch": 1.9865157717312787, - "eval/acc": 44.1860466003418, + "epoch": 11.768901569186877, + "eval/acc": 39.53488540649414, "step": 16500 }, { - "epoch": 1.9865157717312787, - "eval_loss": 2.8654353618621826, - "eval_runtime": 0.2125, - "eval_samples_per_second": 202.312, - "eval_steps_per_second": 4.705, + "epoch": 11.768901569186877, + "eval_loss": 1.9911694526672363, + "eval_runtime": 0.2278, + "eval_samples_per_second": 188.761, + "eval_steps_per_second": 4.39, "step": 16500 }, { - "epoch": 1.987719720683843, - "grad_norm": 9.25, + "epoch": 11.776034236804565, + "grad_norm": 7.375, "learning_rate": 3.773777777777778e-05, - "loss": 0.6151, + "loss": 0.7303, "step": 16510 }, { - "epoch": 1.9889236696364074, - "grad_norm": 7.34375, + "epoch": 11.783166904422254, + "grad_norm": 6.65625, "learning_rate": 3.7693333333333334e-05, - "loss": 0.5837, + "loss": 0.7036, "step": 16520 }, { - "epoch": 1.9901276185889718, - "grad_norm": 7.03125, + "epoch": 11.790299572039943, + "grad_norm": 7.8125, "learning_rate": 3.764888888888889e-05, - "loss": 0.5674, + "loss": 0.6873, "step": 16530 }, { - "epoch": 1.9913315675415362, - "grad_norm": 25.0, + "epoch": 11.797432239657631, + "grad_norm": 13.3125, "learning_rate": 3.760444444444445e-05, - "loss": 0.5812, + "loss": 0.8784, "step": 16540 }, { - "epoch": 1.9925355164941005, - "grad_norm": 19.125, + "epoch": 11.80456490727532, + "grad_norm": 10.5625, "learning_rate": 3.756e-05, - "loss": 0.6122, + "loss": 0.8149, "step": 16550 }, { - "epoch": 1.993739465446665, - "grad_norm": 10.0625, + "epoch": 11.81169757489301, + "grad_norm": 9.75, "learning_rate": 3.751555555555556e-05, - "loss": 0.623, + "loss": 0.7988, "step": 16560 }, { - "epoch": 1.9949434143992295, - "grad_norm": 8.1875, + "epoch": 11.8188302425107, + "grad_norm": 8.5625, "learning_rate": 3.747111111111111e-05, - "loss": 0.6221, + "loss": 0.8117, "step": 16570 }, { - "epoch": 1.9961473633517939, - "grad_norm": 7.59375, + "epoch": 11.825962910128387, + "grad_norm": 9.8125, "learning_rate": 3.742666666666667e-05, - "loss": 0.6054, + "loss": 0.7908, "step": 16580 }, { - "epoch": 1.9973513123043583, - "grad_norm": 10.25, + "epoch": 11.833095577746077, + "grad_norm": 8.0, "learning_rate": 3.738222222222223e-05, - "loss": 0.539, + "loss": 0.8379, "step": 16590 }, { - "epoch": 1.9985552612569228, - "grad_norm": 8.0, + "epoch": 11.840228245363766, + "grad_norm": 7.21875, "learning_rate": 3.7337777777777776e-05, - "loss": 0.5606, + "loss": 0.7278, "step": 16600 }, { - "epoch": 1.9985552612569228, - "eval/acc": 44.1860466003418, + "epoch": 11.840228245363766, + "eval/acc": 37.20930099487305, "step": 16600 }, { - "epoch": 1.9985552612569228, - "eval_loss": 2.8806099891662598, - "eval_runtime": 0.2152, - "eval_samples_per_second": 199.797, - "eval_steps_per_second": 4.646, + "epoch": 11.840228245363766, + "eval_loss": 1.9659804105758667, + "eval_runtime": 0.2325, + "eval_samples_per_second": 184.932, + "eval_steps_per_second": 4.301, "step": 16600 }, { - "epoch": 1.9997592102094872, - "grad_norm": 9.4375, + "epoch": 11.847360912981456, + "grad_norm": 8.0, "learning_rate": 3.729333333333334e-05, - "loss": 0.5944, + "loss": 0.773, "step": 16610 }, { - "epoch": 2.0009631591620516, - "grad_norm": 6.6875, + "epoch": 11.854493580599144, + "grad_norm": 5.78125, "learning_rate": 3.7248888888888886e-05, - "loss": 0.5375, + "loss": 0.7377, "step": 16620 }, { - "epoch": 2.002167108114616, - "grad_norm": 6.15625, + "epoch": 11.861626248216833, + "grad_norm": 8.875, "learning_rate": 3.720444444444445e-05, - "loss": 0.5144, + "loss": 0.6644, "step": 16630 }, { - "epoch": 2.0033710570671803, - "grad_norm": 7.375, + "epoch": 11.868758915834523, + "grad_norm": 7.125, "learning_rate": 3.716e-05, - "loss": 0.5942, + "loss": 0.8759, "step": 16640 }, { - "epoch": 2.0045750060197447, - "grad_norm": 7.125, + "epoch": 11.87589158345221, + "grad_norm": 7.0625, "learning_rate": 3.711555555555556e-05, - "loss": 0.5861, + "loss": 0.8503, "step": 16650 }, { - "epoch": 2.005778954972309, - "grad_norm": 9.1875, + "epoch": 11.8830242510699, + "grad_norm": 5.75, "learning_rate": 3.7071111111111114e-05, - "loss": 0.6177, + "loss": 0.7204, "step": 16660 }, { - "epoch": 2.0069829039248734, - "grad_norm": 6.15625, + "epoch": 11.89015691868759, + "grad_norm": 7.4375, "learning_rate": 3.702666666666667e-05, - "loss": 0.6381, + "loss": 0.8646, "step": 16670 }, { - "epoch": 2.008186852877438, - "grad_norm": 5.8125, + "epoch": 11.897289586305279, + "grad_norm": 7.875, "learning_rate": 3.6982222222222224e-05, - "loss": 0.5626, + "loss": 0.7951, "step": 16680 }, { - "epoch": 2.0093908018300026, - "grad_norm": 8.0625, + "epoch": 11.904422253922966, + "grad_norm": 7.75, "learning_rate": 3.693777777777778e-05, - "loss": 0.5352, + "loss": 0.7474, "step": 16690 }, { - "epoch": 2.010594750782567, - "grad_norm": 9.1875, + "epoch": 11.911554921540656, + "grad_norm": 8.0625, "learning_rate": 3.6893333333333335e-05, - "loss": 0.6053, + "loss": 0.8184, "step": 16700 }, { - "epoch": 2.010594750782567, - "eval/acc": 39.53488540649414, + "epoch": 11.911554921540656, + "eval/acc": 41.86046600341797, "step": 16700 }, { - "epoch": 2.010594750782567, - "eval_loss": 2.075368881225586, - "eval_runtime": 6.8777, - "eval_samples_per_second": 6.252, - "eval_steps_per_second": 0.145, + "epoch": 11.911554921540656, + "eval_loss": 1.9796830415725708, + "eval_runtime": 0.2288, + "eval_samples_per_second": 187.97, + "eval_steps_per_second": 4.371, "step": 16700 }, { - "epoch": 2.0117986997351314, - "grad_norm": 7.59375, + "epoch": 11.918687589158345, + "grad_norm": 8.5625, "learning_rate": 3.684888888888889e-05, - "loss": 0.5228, + "loss": 0.7233, "step": 16710 }, { - "epoch": 2.0130026486876957, - "grad_norm": 5.3125, + "epoch": 11.925820256776035, + "grad_norm": 6.84375, "learning_rate": 3.6804444444444445e-05, - "loss": 0.4805, + "loss": 0.8783, "step": 16720 }, { - "epoch": 2.01420659764026, - "grad_norm": 6.34375, + "epoch": 11.932952924393723, + "grad_norm": 8.0625, "learning_rate": 3.676e-05, - "loss": 0.5505, + "loss": 0.7848, "step": 16730 }, { - "epoch": 2.0154105465928245, - "grad_norm": 8.25, + "epoch": 11.940085592011412, + "grad_norm": 7.0, "learning_rate": 3.6715555555555556e-05, - "loss": 0.5271, + "loss": 0.7663, "step": 16740 }, { - "epoch": 2.016614495545389, - "grad_norm": 6.90625, + "epoch": 11.947218259629102, + "grad_norm": 8.25, "learning_rate": 3.667111111111111e-05, - "loss": 0.5843, + "loss": 0.7711, "step": 16750 }, { - "epoch": 2.0178184444979532, - "grad_norm": 6.25, + "epoch": 11.95435092724679, + "grad_norm": 8.5625, "learning_rate": 3.662666666666667e-05, - "loss": 0.5916, + "loss": 0.7848, "step": 16760 }, { - "epoch": 2.0190223934505176, - "grad_norm": 7.59375, + "epoch": 11.961483594864479, + "grad_norm": 13.375, "learning_rate": 3.658222222222222e-05, - "loss": 0.5564, + "loss": 0.8355, "step": 16770 }, { - "epoch": 2.020226342403082, - "grad_norm": 7.8125, + "epoch": 11.968616262482168, + "grad_norm": 8.1875, "learning_rate": 3.653777777777778e-05, - "loss": 0.5907, + "loss": 0.8452, "step": 16780 }, { - "epoch": 2.0214302913556463, - "grad_norm": 11.875, + "epoch": 11.975748930099858, + "grad_norm": 7.53125, "learning_rate": 3.649333333333333e-05, - "loss": 0.5908, + "loss": 0.8508, "step": 16790 }, { - "epoch": 2.022634240308211, - "grad_norm": 8.0, + "epoch": 11.982881597717546, + "grad_norm": 19.5, "learning_rate": 3.644888888888889e-05, - "loss": 0.5578, + "loss": 0.8187, "step": 16800 }, { - "epoch": 2.022634240308211, - "eval/acc": 41.86046600341797, + "epoch": 11.982881597717546, + "eval/acc": 39.53488540649414, "step": 16800 }, { - "epoch": 2.022634240308211, - "eval_loss": 2.0900626182556152, - "eval_runtime": 0.2159, - "eval_samples_per_second": 199.144, - "eval_steps_per_second": 4.631, + "epoch": 11.982881597717546, + "eval_loss": 1.9583516120910645, + "eval_runtime": 0.2249, + "eval_samples_per_second": 191.207, + "eval_steps_per_second": 4.447, "step": 16800 }, { - "epoch": 2.0238381892607755, - "grad_norm": 9.0, + "epoch": 11.990014265335235, + "grad_norm": 10.125, "learning_rate": 3.640444444444445e-05, - "loss": 0.6217, + "loss": 0.8547, "step": 16810 }, { - "epoch": 2.02504213821334, - "grad_norm": 9.1875, + "epoch": 11.997146932952925, + "grad_norm": 15.125, "learning_rate": 3.636e-05, - "loss": 0.6252, + "loss": 0.7173, "step": 16820 }, { - "epoch": 2.0262460871659043, - "grad_norm": 11.5, + "epoch": 12.004279600570614, + "grad_norm": 6.53125, "learning_rate": 3.631555555555556e-05, - "loss": 0.5154, + "loss": 0.7688, "step": 16830 }, { - "epoch": 2.0274500361184686, - "grad_norm": 8.3125, + "epoch": 12.011412268188302, + "grad_norm": 78.5, "learning_rate": 3.627111111111111e-05, - "loss": 0.505, + "loss": 0.8184, "step": 16840 }, { - "epoch": 2.028653985071033, - "grad_norm": 6.28125, + "epoch": 12.018544935805991, + "grad_norm": 8.5625, "learning_rate": 3.622666666666667e-05, - "loss": 0.4879, + "loss": 0.8062, "step": 16850 }, { - "epoch": 2.0298579340235974, - "grad_norm": 7.65625, + "epoch": 12.02567760342368, + "grad_norm": 9.25, "learning_rate": 3.6182222222222225e-05, - "loss": 0.5616, + "loss": 0.839, "step": 16860 }, { - "epoch": 2.0310618829761617, - "grad_norm": 6.9375, + "epoch": 12.03281027104137, + "grad_norm": 9.375, "learning_rate": 3.613777777777778e-05, - "loss": 0.5375, + "loss": 0.84, "step": 16870 }, { - "epoch": 2.032265831928726, - "grad_norm": 6.90625, + "epoch": 12.039942938659058, + "grad_norm": 7.46875, "learning_rate": 3.6093333333333336e-05, - "loss": 0.6451, + "loss": 0.7653, "step": 16880 }, { - "epoch": 2.0334697808812905, - "grad_norm": 5.0, + "epoch": 12.047075606276747, + "grad_norm": 14.875, "learning_rate": 3.604888888888889e-05, - "loss": 0.5616, + "loss": 0.7917, "step": 16890 }, { - "epoch": 2.034673729833855, - "grad_norm": 8.0, + "epoch": 12.054208273894437, + "grad_norm": 11.0, "learning_rate": 3.6004444444444446e-05, - "loss": 0.6754, + "loss": 0.7125, "step": 16900 }, { - "epoch": 2.034673729833855, - "eval/acc": 39.53488540649414, + "epoch": 12.054208273894437, + "eval/acc": 37.20930099487305, "step": 16900 }, { - "epoch": 2.034673729833855, - "eval_loss": 2.1007204055786133, - "eval_runtime": 0.2204, - "eval_samples_per_second": 195.141, - "eval_steps_per_second": 4.538, + "epoch": 12.054208273894437, + "eval_loss": 3.0164332389831543, + "eval_runtime": 5.2863, + "eval_samples_per_second": 8.134, + "eval_steps_per_second": 0.189, "step": 16900 }, { - "epoch": 2.0358776787864192, - "grad_norm": 7.5625, + "epoch": 12.061340941512125, + "grad_norm": 7.34375, "learning_rate": 3.596e-05, - "loss": 0.5048, + "loss": 0.8146, "step": 16910 }, { - "epoch": 2.037081627738984, - "grad_norm": 9.875, + "epoch": 12.068473609129814, + "grad_norm": 6.96875, "learning_rate": 3.5915555555555557e-05, - "loss": 0.5794, + "loss": 0.7891, "step": 16920 }, { - "epoch": 2.0382855766915484, - "grad_norm": 6.21875, + "epoch": 12.075606276747504, + "grad_norm": 6.78125, "learning_rate": 3.587111111111111e-05, - "loss": 0.5909, + "loss": 0.7922, "step": 16930 }, { - "epoch": 2.039489525644113, - "grad_norm": 7.09375, + "epoch": 12.082738944365193, + "grad_norm": 11.8125, "learning_rate": 3.582666666666667e-05, - "loss": 0.6029, + "loss": 0.7896, "step": 16940 }, { - "epoch": 2.040693474596677, - "grad_norm": 7.375, + "epoch": 12.08987161198288, + "grad_norm": 6.25, "learning_rate": 3.578222222222222e-05, - "loss": 0.5345, + "loss": 0.7579, "step": 16950 }, { - "epoch": 2.0418974235492415, - "grad_norm": 7.53125, + "epoch": 12.09700427960057, + "grad_norm": 7.9375, "learning_rate": 3.5737777777777784e-05, - "loss": 0.5384, + "loss": 0.8181, "step": 16960 }, { - "epoch": 2.043101372501806, - "grad_norm": 6.125, + "epoch": 12.10413694721826, + "grad_norm": 12.75, "learning_rate": 3.569333333333333e-05, - "loss": 0.5809, + "loss": 0.7577, "step": 16970 }, { - "epoch": 2.0443053214543703, - "grad_norm": 8.0625, + "epoch": 12.11126961483595, + "grad_norm": 50.75, "learning_rate": 3.5648888888888895e-05, - "loss": 0.5626, + "loss": 0.7314, "step": 16980 }, { - "epoch": 2.0455092704069346, - "grad_norm": 7.71875, + "epoch": 12.118402282453637, + "grad_norm": 8.625, "learning_rate": 3.560444444444444e-05, - "loss": 0.5675, + "loss": 0.7212, "step": 16990 }, { - "epoch": 2.046713219359499, - "grad_norm": 6.96875, + "epoch": 12.125534950071327, + "grad_norm": 7.90625, "learning_rate": 3.5560000000000005e-05, - "loss": 0.4864, + "loss": 0.8013, "step": 17000 }, { - "epoch": 2.046713219359499, - "eval/acc": 39.53488540649414, + "epoch": 12.125534950071327, + "eval/acc": 37.20930099487305, "step": 17000 }, { - "epoch": 2.046713219359499, - "eval_loss": 2.098133087158203, - "eval_runtime": 0.2107, - "eval_samples_per_second": 204.073, - "eval_steps_per_second": 4.746, + "epoch": 12.125534950071327, + "eval_loss": 3.037986993789673, + "eval_runtime": 0.2607, + "eval_samples_per_second": 164.968, + "eval_steps_per_second": 3.836, "step": 17000 }, { - "epoch": 2.0479171683120634, - "grad_norm": 7.875, + "epoch": 12.132667617689016, + "grad_norm": 8.0, "learning_rate": 3.551555555555556e-05, - "loss": 0.5531, + "loss": 0.7848, "step": 17010 }, { - "epoch": 2.0491211172646278, - "grad_norm": 7.09375, + "epoch": 12.139800285306706, + "grad_norm": 8.9375, "learning_rate": 3.547111111111111e-05, - "loss": 0.6313, + "loss": 0.7902, "step": 17020 }, { - "epoch": 2.0503250662171926, - "grad_norm": 6.3125, + "epoch": 12.146932952924393, + "grad_norm": 8.625, "learning_rate": 3.542666666666667e-05, - "loss": 0.5699, + "loss": 0.8201, "step": 17030 }, { - "epoch": 2.051529015169757, - "grad_norm": 7.09375, + "epoch": 12.154065620542083, + "grad_norm": 8.0625, "learning_rate": 3.538222222222222e-05, - "loss": 0.5396, + "loss": 0.7456, "step": 17040 }, { - "epoch": 2.0527329641223213, - "grad_norm": 12.625, + "epoch": 12.161198288159772, + "grad_norm": 6.375, "learning_rate": 3.533777777777778e-05, - "loss": 0.5393, + "loss": 0.7404, "step": 17050 }, { - "epoch": 2.0539369130748857, - "grad_norm": 7.5, + "epoch": 12.16833095577746, + "grad_norm": 10.5, "learning_rate": 3.5293333333333336e-05, - "loss": 0.5704, + "loss": 0.6491, "step": 17060 }, { - "epoch": 2.05514086202745, - "grad_norm": 7.59375, + "epoch": 12.17546362339515, + "grad_norm": 6.03125, "learning_rate": 3.524888888888889e-05, - "loss": 0.5339, + "loss": 0.8083, "step": 17070 }, { - "epoch": 2.0563448109800144, - "grad_norm": 8.5, + "epoch": 12.182596291012839, + "grad_norm": 7.15625, "learning_rate": 3.520444444444445e-05, - "loss": 0.5485, + "loss": 0.7394, "step": 17080 }, { - "epoch": 2.057548759932579, - "grad_norm": 8.1875, + "epoch": 12.189728958630528, + "grad_norm": 8.0, "learning_rate": 3.516e-05, - "loss": 0.5944, + "loss": 0.7967, "step": 17090 }, { - "epoch": 2.058752708885143, - "grad_norm": 8.9375, + "epoch": 12.196861626248216, + "grad_norm": 7.40625, "learning_rate": 3.511555555555556e-05, - "loss": 0.5787, + "loss": 0.7497, "step": 17100 }, { - "epoch": 2.058752708885143, + "epoch": 12.196861626248216, "eval/acc": 37.20930099487305, "step": 17100 }, { - "epoch": 2.058752708885143, - "eval_loss": 2.1219515800476074, - "eval_runtime": 0.2166, - "eval_samples_per_second": 198.5, - "eval_steps_per_second": 4.616, + "epoch": 12.196861626248216, + "eval_loss": 3.029151678085327, + "eval_runtime": 0.2337, + "eval_samples_per_second": 183.992, + "eval_steps_per_second": 4.279, "step": 17100 }, { - "epoch": 2.0599566578377075, - "grad_norm": 9.625, + "epoch": 12.203994293865906, + "grad_norm": 7.75, "learning_rate": 3.507111111111111e-05, - "loss": 0.6232, + "loss": 0.697, "step": 17110 }, { - "epoch": 2.061160606790272, - "grad_norm": 10.375, + "epoch": 12.211126961483595, + "grad_norm": 12.0625, "learning_rate": 3.502666666666667e-05, - "loss": 0.7104, + "loss": 0.7795, "step": 17120 }, { - "epoch": 2.0623645557428363, - "grad_norm": 11.1875, + "epoch": 12.218259629101285, + "grad_norm": 7.1875, "learning_rate": 3.498222222222222e-05, - "loss": 0.5668, + "loss": 0.7974, "step": 17130 }, { - "epoch": 2.063568504695401, - "grad_norm": 8.25, + "epoch": 12.225392296718972, + "grad_norm": 8.875, "learning_rate": 3.493777777777778e-05, - "loss": 0.5739, + "loss": 0.8272, "step": 17140 }, { - "epoch": 2.0647724536479655, - "grad_norm": 7.34375, + "epoch": 12.232524964336662, + "grad_norm": 15.4375, "learning_rate": 3.4893333333333334e-05, - "loss": 0.5701, + "loss": 0.8373, "step": 17150 }, { - "epoch": 2.06597640260053, - "grad_norm": 6.03125, + "epoch": 12.239657631954351, + "grad_norm": 6.9375, "learning_rate": 3.484888888888889e-05, - "loss": 0.5174, + "loss": 0.7922, "step": 17160 }, { - "epoch": 2.067180351553094, - "grad_norm": 6.3125, + "epoch": 12.24679029957204, + "grad_norm": 7.53125, "learning_rate": 3.4804444444444444e-05, - "loss": 0.5081, + "loss": 0.8099, "step": 17170 }, { - "epoch": 2.0683843005056586, - "grad_norm": 9.5, + "epoch": 12.253922967189729, + "grad_norm": 6.28125, "learning_rate": 3.4760000000000006e-05, - "loss": 0.5513, + "loss": 0.7522, "step": 17180 }, { - "epoch": 2.069588249458223, - "grad_norm": 12.875, + "epoch": 12.261055634807418, + "grad_norm": 8.8125, "learning_rate": 3.4715555555555554e-05, - "loss": 0.6219, + "loss": 0.7338, "step": 17190 }, { - "epoch": 2.0707921984107873, - "grad_norm": 16.125, + "epoch": 12.268188302425107, + "grad_norm": 6.34375, "learning_rate": 3.4671111111111116e-05, - "loss": 0.5014, + "loss": 0.7782, "step": 17200 }, { - "epoch": 2.0707921984107873, - "eval/acc": 39.53488540649414, + "epoch": 12.268188302425107, + "eval/acc": 37.20930099487305, "step": 17200 }, { - "epoch": 2.0707921984107873, - "eval_loss": 2.13092303276062, - "eval_runtime": 0.2186, - "eval_samples_per_second": 196.676, - "eval_steps_per_second": 4.574, + "epoch": 12.268188302425107, + "eval_loss": 3.063300848007202, + "eval_runtime": 0.2346, + "eval_samples_per_second": 183.302, + "eval_steps_per_second": 4.263, "step": 17200 }, { - "epoch": 2.0719961473633517, - "grad_norm": 8.3125, + "epoch": 12.275320970042795, + "grad_norm": 7.1875, "learning_rate": 3.462666666666667e-05, - "loss": 0.5354, + "loss": 0.7274, "step": 17210 }, { - "epoch": 2.073200096315916, - "grad_norm": 9.5, + "epoch": 12.282453637660485, + "grad_norm": 6.28125, "learning_rate": 3.458222222222222e-05, - "loss": 0.5422, + "loss": 0.7543, "step": 17220 }, { - "epoch": 2.0744040452684804, - "grad_norm": 8.9375, + "epoch": 12.289586305278174, + "grad_norm": 12.75, "learning_rate": 3.453777777777778e-05, - "loss": 0.5771, + "loss": 0.8159, "step": 17230 }, { - "epoch": 2.075607994221045, - "grad_norm": 24.375, + "epoch": 12.296718972895864, + "grad_norm": 7.03125, "learning_rate": 3.449333333333333e-05, - "loss": 0.5147, + "loss": 0.7927, "step": 17240 }, { - "epoch": 2.0768119431736096, - "grad_norm": 6.28125, + "epoch": 12.303851640513551, + "grad_norm": 7.09375, "learning_rate": 3.444888888888889e-05, - "loss": 0.6286, + "loss": 0.7862, "step": 17250 }, { - "epoch": 2.078015892126174, - "grad_norm": 6.65625, + "epoch": 12.310984308131241, + "grad_norm": 11.375, "learning_rate": 3.440444444444445e-05, - "loss": 0.5529, + "loss": 0.8505, "step": 17260 }, { - "epoch": 2.0792198410787384, - "grad_norm": 8.0, + "epoch": 12.31811697574893, + "grad_norm": 6.40625, "learning_rate": 3.436e-05, - "loss": 0.6149, + "loss": 0.7432, "step": 17270 }, { - "epoch": 2.0804237900313027, - "grad_norm": 8.4375, + "epoch": 12.32524964336662, + "grad_norm": 7.5, "learning_rate": 3.431555555555556e-05, - "loss": 0.5663, + "loss": 0.8234, "step": 17280 }, { - "epoch": 2.081627738983867, - "grad_norm": 4.5625, + "epoch": 12.332382310984308, + "grad_norm": 8.0, "learning_rate": 3.4271111111111114e-05, - "loss": 0.5788, + "loss": 0.8929, "step": 17290 }, { - "epoch": 2.0828316879364315, + "epoch": 12.339514978601997, "grad_norm": 8.8125, "learning_rate": 3.422666666666667e-05, - "loss": 0.6019, + "loss": 0.7827, "step": 17300 }, { - "epoch": 2.0828316879364315, - "eval/acc": 41.86046600341797, + "epoch": 12.339514978601997, + "eval/acc": 37.20930099487305, "step": 17300 }, { - "epoch": 2.0828316879364315, - "eval_loss": 2.138639211654663, - "eval_runtime": 0.2194, - "eval_samples_per_second": 195.991, - "eval_steps_per_second": 4.558, + "epoch": 12.339514978601997, + "eval_loss": 3.0306241512298584, + "eval_runtime": 0.2382, + "eval_samples_per_second": 180.496, + "eval_steps_per_second": 4.198, "step": 17300 }, { - "epoch": 2.084035636888996, - "grad_norm": 4.8125, + "epoch": 12.346647646219687, + "grad_norm": 7.125, "learning_rate": 3.4182222222222224e-05, - "loss": 0.5519, + "loss": 0.8023, "step": 17310 }, { - "epoch": 2.08523958584156, - "grad_norm": 8.0625, + "epoch": 12.353780313837376, + "grad_norm": 6.375, "learning_rate": 3.413777777777778e-05, - "loss": 0.5612, + "loss": 0.7275, "step": 17320 }, { - "epoch": 2.0864435347941246, - "grad_norm": 8.375, + "epoch": 12.360912981455064, + "grad_norm": 9.75, "learning_rate": 3.4093333333333334e-05, - "loss": 0.5622, + "loss": 0.8026, "step": 17330 }, { - "epoch": 2.087647483746689, - "grad_norm": 8.6875, + "epoch": 12.368045649072753, + "grad_norm": 6.75, "learning_rate": 3.404888888888889e-05, - "loss": 0.6023, + "loss": 0.7214, "step": 17340 }, { - "epoch": 2.0888514326992533, + "epoch": 12.375178316690443, "grad_norm": 9.0, "learning_rate": 3.4004444444444445e-05, - "loss": 0.5416, + "loss": 0.808, "step": 17350 }, { - "epoch": 2.090055381651818, - "grad_norm": 6.875, + "epoch": 12.38231098430813, + "grad_norm": 6.53125, "learning_rate": 3.396e-05, - "loss": 0.5268, + "loss": 0.7779, "step": 17360 }, { - "epoch": 2.0912593306043825, - "grad_norm": 8.1875, + "epoch": 12.38944365192582, + "grad_norm": 9.625, "learning_rate": 3.3915555555555555e-05, - "loss": 0.6577, + "loss": 0.6953, "step": 17370 }, { - "epoch": 2.092463279556947, - "grad_norm": 8.25, + "epoch": 12.39657631954351, + "grad_norm": 7.1875, "learning_rate": 3.387111111111112e-05, - "loss": 0.6687, + "loss": 0.7577, "step": 17380 }, { - "epoch": 2.0936672285095113, - "grad_norm": 6.8125, + "epoch": 12.403708987161199, + "grad_norm": 10.375, "learning_rate": 3.3826666666666666e-05, - "loss": 0.5323, + "loss": 0.7793, "step": 17390 }, { - "epoch": 2.0948711774620756, - "grad_norm": 7.53125, + "epoch": 12.410841654778887, + "grad_norm": 8.0, "learning_rate": 3.378222222222223e-05, - "loss": 0.5394, + "loss": 0.8273, "step": 17400 }, { - "epoch": 2.0948711774620756, + "epoch": 12.410841654778887, "eval/acc": 37.20930099487305, "step": 17400 }, { - "epoch": 2.0948711774620756, - "eval_loss": 2.1255204677581787, - "eval_runtime": 0.22, - "eval_samples_per_second": 195.45, - "eval_steps_per_second": 4.545, + "epoch": 12.410841654778887, + "eval_loss": 3.02496075630188, + "eval_runtime": 0.2305, + "eval_samples_per_second": 186.586, + "eval_steps_per_second": 4.339, "step": 17400 }, { - "epoch": 2.09607512641464, - "grad_norm": 8.6875, + "epoch": 12.417974322396576, + "grad_norm": 6.0625, "learning_rate": 3.3737777777777776e-05, - "loss": 0.575, + "loss": 0.7008, "step": 17410 }, { - "epoch": 2.0972790753672044, - "grad_norm": 7.90625, + "epoch": 12.425106990014266, + "grad_norm": 8.0, "learning_rate": 3.369333333333333e-05, - "loss": 0.5201, + "loss": 0.7961, "step": 17420 }, { - "epoch": 2.0984830243197687, - "grad_norm": 7.78125, + "epoch": 12.432239657631955, + "grad_norm": 8.9375, "learning_rate": 3.3648888888888893e-05, - "loss": 0.4728, + "loss": 0.7806, "step": 17430 }, { - "epoch": 2.099686973272333, - "grad_norm": 6.34375, + "epoch": 12.439372325249643, + "grad_norm": 8.75, "learning_rate": 3.360444444444444e-05, - "loss": 0.6064, + "loss": 0.6974, "step": 17440 }, { - "epoch": 2.1008909222248975, - "grad_norm": 6.71875, + "epoch": 12.446504992867332, + "grad_norm": 5.59375, "learning_rate": 3.3560000000000004e-05, - "loss": 0.5533, + "loss": 0.6685, "step": 17450 }, { - "epoch": 2.102094871177462, - "grad_norm": 8.3125, + "epoch": 12.453637660485022, + "grad_norm": 5.8125, "learning_rate": 3.351555555555555e-05, - "loss": 0.6109, + "loss": 0.7812, "step": 17460 }, { - "epoch": 2.1032988201300267, - "grad_norm": 8.75, + "epoch": 12.46077032810271, + "grad_norm": 8.8125, "learning_rate": 3.3471111111111114e-05, - "loss": 0.6461, + "loss": 0.8677, "step": 17470 }, { - "epoch": 2.104502769082591, - "grad_norm": 6.53125, + "epoch": 12.467902995720399, + "grad_norm": 8.3125, "learning_rate": 3.342666666666667e-05, - "loss": 0.6019, + "loss": 0.7837, "step": 17480 }, { - "epoch": 2.1057067180351554, - "grad_norm": 9.1875, + "epoch": 12.475035663338089, + "grad_norm": 11.0625, "learning_rate": 3.3382222222222225e-05, - "loss": 0.5926, + "loss": 0.7833, "step": 17490 }, { - "epoch": 2.10691066698772, - "grad_norm": 9.625, + "epoch": 12.482168330955778, + "grad_norm": 7.75, "learning_rate": 3.333777777777778e-05, - "loss": 0.5351, + "loss": 0.8063, "step": 17500 }, { - "epoch": 2.10691066698772, + "epoch": 12.482168330955778, "eval/acc": 37.20930099487305, "step": 17500 }, { - "epoch": 2.10691066698772, - "eval_loss": 2.1321325302124023, - "eval_runtime": 0.2113, - "eval_samples_per_second": 203.543, - "eval_steps_per_second": 4.734, + "epoch": 12.482168330955778, + "eval_loss": 3.0436007976531982, + "eval_runtime": 0.2352, + "eval_samples_per_second": 182.855, + "eval_steps_per_second": 4.252, "step": 17500 }, { - "epoch": 2.108114615940284, - "grad_norm": 7.4375, + "epoch": 12.489300998573466, + "grad_norm": 5.46875, "learning_rate": 3.3293333333333335e-05, - "loss": 0.5186, + "loss": 0.8146, "step": 17510 }, { - "epoch": 2.1093185648928485, - "grad_norm": 5.65625, + "epoch": 12.496433666191155, + "grad_norm": 7.6875, "learning_rate": 3.324888888888889e-05, - "loss": 0.532, + "loss": 0.7964, "step": 17520 }, { - "epoch": 2.110522513845413, - "grad_norm": 7.3125, + "epoch": 12.503566333808845, + "grad_norm": 6.03125, "learning_rate": 3.3204444444444446e-05, - "loss": 0.578, + "loss": 0.7377, "step": 17530 }, { - "epoch": 2.1117264627979773, - "grad_norm": 8.75, + "epoch": 12.510699001426534, + "grad_norm": 7.90625, "learning_rate": 3.316e-05, - "loss": 0.5241, + "loss": 0.7658, "step": 17540 }, { - "epoch": 2.1129304117505416, - "grad_norm": 5.65625, + "epoch": 12.517831669044222, + "grad_norm": 8.0625, "learning_rate": 3.3115555555555556e-05, - "loss": 0.5194, + "loss": 0.7035, "step": 17550 }, { - "epoch": 2.114134360703106, - "grad_norm": 8.375, + "epoch": 12.524964336661911, + "grad_norm": 7.75, "learning_rate": 3.307111111111111e-05, - "loss": 0.5815, + "loss": 0.794, "step": 17560 }, { - "epoch": 2.1153383096556704, - "grad_norm": 5.75, + "epoch": 12.532097004279601, + "grad_norm": 9.125, "learning_rate": 3.302666666666667e-05, - "loss": 0.5333, + "loss": 0.8046, "step": 17570 }, { - "epoch": 2.116542258608235, - "grad_norm": 8.125, + "epoch": 12.539229671897289, + "grad_norm": 8.8125, "learning_rate": 3.298222222222223e-05, - "loss": 0.5816, + "loss": 0.7632, "step": 17580 }, { - "epoch": 2.1177462075607996, - "grad_norm": 7.15625, + "epoch": 12.546362339514978, + "grad_norm": 9.375, "learning_rate": 3.293777777777778e-05, - "loss": 0.6058, + "loss": 0.8554, "step": 17590 }, { - "epoch": 2.118950156513364, - "grad_norm": 7.09375, + "epoch": 12.553495007132668, + "grad_norm": 7.0625, "learning_rate": 3.289333333333334e-05, - "loss": 0.6255, + "loss": 0.647, "step": 17600 }, { - "epoch": 2.118950156513364, - "eval/acc": 39.53488540649414, + "epoch": 12.553495007132668, + "eval/acc": 37.20930099487305, "step": 17600 }, { - "epoch": 2.118950156513364, - "eval_loss": 2.1110424995422363, - "eval_runtime": 0.2184, - "eval_samples_per_second": 196.909, - "eval_steps_per_second": 4.579, + "epoch": 12.553495007132668, + "eval_loss": 3.04180908203125, + "eval_runtime": 0.2372, + "eval_samples_per_second": 181.25, + "eval_steps_per_second": 4.215, "step": 17600 }, { - "epoch": 2.1201541054659283, - "grad_norm": 9.6875, + "epoch": 12.560627674750357, + "grad_norm": 7.4375, "learning_rate": 3.284888888888889e-05, - "loss": 0.619, + "loss": 0.7701, "step": 17610 }, { - "epoch": 2.1213580544184927, - "grad_norm": 7.09375, + "epoch": 12.567760342368045, + "grad_norm": 81.0, "learning_rate": 3.280444444444445e-05, - "loss": 0.4951, + "loss": 0.7707, "step": 17620 }, { - "epoch": 2.122562003371057, - "grad_norm": 6.625, + "epoch": 12.574893009985734, + "grad_norm": 6.75, "learning_rate": 3.2760000000000005e-05, - "loss": 0.5048, + "loss": 0.7871, "step": 17630 }, { - "epoch": 2.1237659523236214, - "grad_norm": 7.21875, + "epoch": 12.582025677603424, + "grad_norm": 6.28125, "learning_rate": 3.271555555555555e-05, - "loss": 0.6324, + "loss": 0.8382, "step": 17640 }, { - "epoch": 2.124969901276186, - "grad_norm": 7.125, + "epoch": 12.589158345221113, + "grad_norm": 8.875, "learning_rate": 3.2671111111111115e-05, - "loss": 0.5301, + "loss": 0.76, "step": 17650 }, { - "epoch": 2.12617385022875, - "grad_norm": 7.90625, + "epoch": 12.596291012838801, + "grad_norm": 7.75, "learning_rate": 3.2626666666666664e-05, - "loss": 0.5618, + "loss": 0.794, "step": 17660 }, { - "epoch": 2.1273777991813145, - "grad_norm": 7.84375, + "epoch": 12.60342368045649, + "grad_norm": 7.15625, "learning_rate": 3.2582222222222226e-05, - "loss": 0.57, + "loss": 0.7451, "step": 17670 }, { - "epoch": 2.128581748133879, - "grad_norm": 10.6875, + "epoch": 12.61055634807418, + "grad_norm": 6.75, "learning_rate": 3.253777777777778e-05, - "loss": 0.5702, + "loss": 0.7608, "step": 17680 }, { - "epoch": 2.1297856970864437, - "grad_norm": 6.625, + "epoch": 12.61768901569187, + "grad_norm": 8.8125, "learning_rate": 3.2493333333333336e-05, - "loss": 0.4928, + "loss": 0.8062, "step": 17690 }, { - "epoch": 2.130989646039008, - "grad_norm": 8.5, + "epoch": 12.624821683309557, + "grad_norm": 9.5625, "learning_rate": 3.244888888888889e-05, - "loss": 0.5074, + "loss": 0.6761, "step": 17700 }, { - "epoch": 2.130989646039008, - "eval/acc": 39.53488540649414, + "epoch": 12.624821683309557, + "eval/acc": 37.20930099487305, "step": 17700 }, { - "epoch": 2.130989646039008, - "eval_loss": 2.114666700363159, - "eval_runtime": 0.2147, - "eval_samples_per_second": 200.316, - "eval_steps_per_second": 4.659, + "epoch": 12.624821683309557, + "eval_loss": 3.029242515563965, + "eval_runtime": 0.2291, + "eval_samples_per_second": 187.698, + "eval_steps_per_second": 4.365, "step": 17700 }, { - "epoch": 2.1321935949915725, - "grad_norm": 9.6875, + "epoch": 12.631954350927247, + "grad_norm": 24.125, "learning_rate": 3.240444444444445e-05, - "loss": 0.6014, + "loss": 0.6484, "step": 17710 }, { - "epoch": 2.133397543944137, - "grad_norm": 7.1875, + "epoch": 12.639087018544936, + "grad_norm": 5.125, "learning_rate": 3.236e-05, - "loss": 0.5963, + "loss": 0.716, "step": 17720 }, { - "epoch": 2.134601492896701, - "grad_norm": 7.65625, + "epoch": 12.646219686162624, + "grad_norm": 10.0625, "learning_rate": 3.231555555555556e-05, - "loss": 0.5963, + "loss": 0.7955, "step": 17730 }, { - "epoch": 2.1358054418492656, - "grad_norm": 13.75, + "epoch": 12.653352353780313, + "grad_norm": 10.6875, "learning_rate": 3.227111111111111e-05, - "loss": 0.6004, + "loss": 0.8146, "step": 17740 }, { - "epoch": 2.13700939080183, - "grad_norm": 6.9375, + "epoch": 12.660485021398003, + "grad_norm": 8.0, "learning_rate": 3.222666666666667e-05, - "loss": 0.5469, + "loss": 0.7711, "step": 17750 }, { - "epoch": 2.1382133397543943, - "grad_norm": 15.125, + "epoch": 12.667617689015692, + "grad_norm": 8.0, "learning_rate": 3.218222222222222e-05, - "loss": 0.53, + "loss": 0.8463, "step": 17760 }, { - "epoch": 2.1394172887069587, - "grad_norm": 6.5, + "epoch": 12.67475035663338, + "grad_norm": 9.0, "learning_rate": 3.213777777777778e-05, - "loss": 0.6509, + "loss": 0.8483, "step": 17770 }, { - "epoch": 2.140621237659523, - "grad_norm": 8.5, + "epoch": 12.68188302425107, + "grad_norm": 7.78125, "learning_rate": 3.209333333333333e-05, - "loss": 0.5594, + "loss": 0.8471, "step": 17780 }, { - "epoch": 2.1418251866120874, - "grad_norm": 15.375, + "epoch": 12.689015691868759, + "grad_norm": 7.9375, "learning_rate": 3.204888888888889e-05, - "loss": 0.611, + "loss": 0.7851, "step": 17790 }, { - "epoch": 2.1430291355646522, - "grad_norm": 7.96875, + "epoch": 12.696148359486449, + "grad_norm": 7.59375, "learning_rate": 3.200444444444445e-05, - "loss": 0.5779, + "loss": 0.7442, "step": 17800 }, { - "epoch": 2.1430291355646522, - "eval/acc": 39.53488540649414, + "epoch": 12.696148359486449, + "eval/acc": 37.20930099487305, "step": 17800 }, { - "epoch": 2.1430291355646522, - "eval_loss": 2.0950489044189453, - "eval_runtime": 0.2125, - "eval_samples_per_second": 202.365, - "eval_steps_per_second": 4.706, + "epoch": 12.696148359486449, + "eval_loss": 3.0417492389678955, + "eval_runtime": 0.2273, + "eval_samples_per_second": 189.151, + "eval_steps_per_second": 4.399, "step": 17800 }, { - "epoch": 2.1442330845172166, - "grad_norm": 6.90625, + "epoch": 12.703281027104136, + "grad_norm": 7.5, "learning_rate": 3.196e-05, - "loss": 0.5222, + "loss": 0.8628, "step": 17810 }, { - "epoch": 2.145437033469781, - "grad_norm": 12.0625, + "epoch": 12.710413694721826, + "grad_norm": 9.5, "learning_rate": 3.191555555555556e-05, - "loss": 0.6021, + "loss": 0.7934, "step": 17820 }, { - "epoch": 2.1466409824223454, - "grad_norm": 6.8125, + "epoch": 12.717546362339515, + "grad_norm": 7.15625, "learning_rate": 3.187111111111111e-05, - "loss": 0.5762, + "loss": 0.7312, "step": 17830 }, { - "epoch": 2.1478449313749097, - "grad_norm": 6.84375, + "epoch": 12.724679029957205, + "grad_norm": 8.375, "learning_rate": 3.1826666666666665e-05, - "loss": 0.5999, + "loss": 0.7718, "step": 17840 }, { - "epoch": 2.149048880327474, - "grad_norm": 9.75, + "epoch": 12.731811697574893, + "grad_norm": 13.75, "learning_rate": 3.178222222222223e-05, - "loss": 0.6187, + "loss": 0.7138, "step": 17850 }, { - "epoch": 2.1502528292800385, - "grad_norm": 7.0, + "epoch": 12.738944365192582, + "grad_norm": 7.5, "learning_rate": 3.1737777777777775e-05, - "loss": 0.5659, + "loss": 0.8093, "step": 17860 }, { - "epoch": 2.151456778232603, - "grad_norm": 9.8125, + "epoch": 12.746077032810271, + "grad_norm": 30.5, "learning_rate": 3.169333333333334e-05, - "loss": 0.4921, + "loss": 0.8289, "step": 17870 }, { - "epoch": 2.152660727185167, - "grad_norm": 7.96875, + "epoch": 12.75320970042796, + "grad_norm": 5.625, "learning_rate": 3.164888888888889e-05, - "loss": 0.5766, + "loss": 0.7022, "step": 17880 }, { - "epoch": 2.1538646761377316, - "grad_norm": 7.03125, + "epoch": 12.760342368045649, + "grad_norm": 6.40625, "learning_rate": 3.160444444444445e-05, - "loss": 0.6072, + "loss": 0.6812, "step": 17890 }, { - "epoch": 2.155068625090296, - "grad_norm": 9.625, + "epoch": 12.767475035663338, + "grad_norm": 9.0, "learning_rate": 3.156e-05, - "loss": 0.5249, + "loss": 0.717, "step": 17900 }, { - "epoch": 2.155068625090296, - "eval/acc": 38.372093200683594, + "epoch": 12.767475035663338, + "eval/acc": 37.20930099487305, "step": 17900 }, { - "epoch": 2.155068625090296, - "eval_loss": 2.1039204597473145, - "eval_runtime": 0.2141, - "eval_samples_per_second": 200.859, - "eval_steps_per_second": 4.671, + "epoch": 12.767475035663338, + "eval_loss": 3.0246851444244385, + "eval_runtime": 0.2524, + "eval_samples_per_second": 170.366, + "eval_steps_per_second": 3.962, "step": 17900 }, { - "epoch": 2.1562725740428608, - "grad_norm": 7.4375, + "epoch": 12.774607703281028, + "grad_norm": 8.25, "learning_rate": 3.151555555555556e-05, - "loss": 0.5919, + "loss": 0.7882, "step": 17910 }, { - "epoch": 2.157476522995425, - "grad_norm": 5.21875, + "epoch": 12.781740370898715, + "grad_norm": 7.125, "learning_rate": 3.147111111111111e-05, - "loss": 0.5935, + "loss": 0.7156, "step": 17920 }, { - "epoch": 2.1586804719479895, - "grad_norm": 5.75, + "epoch": 12.788873038516405, + "grad_norm": 7.46875, "learning_rate": 3.142666666666667e-05, - "loss": 0.5401, + "loss": 0.7049, "step": 17930 }, { - "epoch": 2.159884420900554, - "grad_norm": 10.9375, + "epoch": 12.796005706134094, + "grad_norm": 8.6875, "learning_rate": 3.1382222222222224e-05, - "loss": 0.5899, + "loss": 0.9339, "step": 17940 }, { - "epoch": 2.1610883698531183, - "grad_norm": 7.6875, + "epoch": 12.803138373751784, + "grad_norm": 7.3125, "learning_rate": 3.133777777777778e-05, - "loss": 0.6101, + "loss": 0.6531, "step": 17950 }, { - "epoch": 2.1622923188056826, - "grad_norm": 9.0, + "epoch": 12.810271041369472, + "grad_norm": 6.84375, "learning_rate": 3.1293333333333334e-05, - "loss": 0.6088, + "loss": 0.7576, "step": 17960 }, { - "epoch": 2.163496267758247, - "grad_norm": 9.875, + "epoch": 12.817403708987161, + "grad_norm": 7.25, "learning_rate": 3.124888888888889e-05, - "loss": 0.5768, + "loss": 0.7153, "step": 17970 }, { - "epoch": 2.1647002167108114, - "grad_norm": 6.5625, + "epoch": 12.82453637660485, + "grad_norm": 7.1875, "learning_rate": 3.1204444444444445e-05, - "loss": 0.6349, + "loss": 0.7138, "step": 17980 }, { - "epoch": 2.1659041656633757, - "grad_norm": 9.3125, + "epoch": 12.83166904422254, + "grad_norm": 8.25, "learning_rate": 3.116e-05, - "loss": 0.4973, + "loss": 0.738, "step": 17990 }, { - "epoch": 2.16710811461594, - "grad_norm": 6.28125, + "epoch": 12.838801711840228, + "grad_norm": 9.5, "learning_rate": 3.111555555555556e-05, - "loss": 0.6212, + "loss": 0.7525, "step": 18000 }, { - "epoch": 2.16710811461594, - "eval/acc": 39.53488540649414, + "epoch": 12.838801711840228, + "eval/acc": 37.20930099487305, "step": 18000 }, { - "epoch": 2.16710811461594, - "eval_loss": 2.1069729328155518, - "eval_runtime": 0.2112, - "eval_samples_per_second": 203.644, - "eval_steps_per_second": 4.736, + "epoch": 12.838801711840228, + "eval_loss": 3.0545501708984375, + "eval_runtime": 0.2299, + "eval_samples_per_second": 187.055, + "eval_steps_per_second": 4.35, "step": 18000 }, { - "epoch": 2.1683120635685045, - "grad_norm": 10.3125, + "epoch": 12.845934379457917, + "grad_norm": 7.4375, "learning_rate": 3.107111111111111e-05, - "loss": 0.5577, + "loss": 0.8798, "step": 18010 }, { - "epoch": 2.1695160125210693, - "grad_norm": 7.75, + "epoch": 12.853067047075607, + "grad_norm": 8.625, "learning_rate": 3.102666666666667e-05, - "loss": 0.5199, + "loss": 0.9301, "step": 18020 }, { - "epoch": 2.1707199614736337, - "grad_norm": 9.4375, + "epoch": 12.860199714693294, + "grad_norm": 8.0625, "learning_rate": 3.098222222222222e-05, - "loss": 0.6411, + "loss": 0.808, "step": 18030 }, { - "epoch": 2.171923910426198, - "grad_norm": 9.0625, + "epoch": 12.867332382310984, + "grad_norm": 6.75, "learning_rate": 3.0937777777777776e-05, - "loss": 0.574, + "loss": 0.848, "step": 18040 }, { - "epoch": 2.1731278593787624, - "grad_norm": 9.125, + "epoch": 12.874465049928673, + "grad_norm": 6.1875, "learning_rate": 3.089333333333334e-05, - "loss": 0.5277, + "loss": 0.667, "step": 18050 }, { - "epoch": 2.1743318083313268, - "grad_norm": 23.875, + "epoch": 12.881597717546363, + "grad_norm": 17.125, "learning_rate": 3.0848888888888886e-05, - "loss": 0.5406, + "loss": 0.778, "step": 18060 }, { - "epoch": 2.175535757283891, - "grad_norm": 9.0, + "epoch": 12.88873038516405, + "grad_norm": 7.28125, "learning_rate": 3.080444444444445e-05, - "loss": 0.5573, + "loss": 0.7401, "step": 18070 }, { - "epoch": 2.1767397062364555, - "grad_norm": 8.1875, + "epoch": 12.89586305278174, + "grad_norm": 10.25, "learning_rate": 3.076e-05, - "loss": 0.5394, + "loss": 0.8231, "step": 18080 }, { - "epoch": 2.17794365518902, - "grad_norm": 9.3125, + "epoch": 12.90299572039943, + "grad_norm": 6.9375, "learning_rate": 3.071555555555556e-05, - "loss": 0.6329, + "loss": 0.8827, "step": 18090 }, { - "epoch": 2.1791476041415843, - "grad_norm": 17.875, + "epoch": 12.91012838801712, + "grad_norm": 7.4375, "learning_rate": 3.0671111111111114e-05, - "loss": 0.527, + "loss": 0.7568, "step": 18100 }, { - "epoch": 2.1791476041415843, - "eval/acc": 41.86046600341797, + "epoch": 12.91012838801712, + "eval/acc": 37.20930099487305, "step": 18100 }, { - "epoch": 2.1791476041415843, - "eval_loss": 2.1123554706573486, - "eval_runtime": 0.2126, - "eval_samples_per_second": 202.299, - "eval_steps_per_second": 4.705, + "epoch": 12.91012838801712, + "eval_loss": 3.0568392276763916, + "eval_runtime": 0.2321, + "eval_samples_per_second": 185.239, + "eval_steps_per_second": 4.308, "step": 18100 }, { - "epoch": 2.1803515530941486, - "grad_norm": 7.34375, + "epoch": 12.917261055634807, + "grad_norm": 8.5, "learning_rate": 3.062666666666667e-05, - "loss": 0.5937, + "loss": 0.7822, "step": 18110 }, { - "epoch": 2.181555502046713, - "grad_norm": 3.9375, + "epoch": 12.924393723252496, + "grad_norm": 7.375, "learning_rate": 3.0582222222222225e-05, - "loss": 0.6157, + "loss": 0.8247, "step": 18120 }, { - "epoch": 2.182759450999278, - "grad_norm": 6.4375, + "epoch": 12.931526390870186, + "grad_norm": 8.8125, "learning_rate": 3.053777777777778e-05, - "loss": 0.5808, + "loss": 0.8443, "step": 18130 }, { - "epoch": 2.183963399951842, - "grad_norm": 7.90625, + "epoch": 12.938659058487875, + "grad_norm": 6.28125, "learning_rate": 3.0493333333333335e-05, - "loss": 0.5375, + "loss": 0.803, "step": 18140 }, { - "epoch": 2.1851673489044066, - "grad_norm": 6.09375, + "epoch": 12.945791726105563, + "grad_norm": 38.75, "learning_rate": 3.0448888888888887e-05, - "loss": 0.5572, + "loss": 0.7168, "step": 18150 }, { - "epoch": 2.186371297856971, - "grad_norm": 9.875, + "epoch": 12.952924393723253, + "grad_norm": 8.0, "learning_rate": 3.0404444444444445e-05, - "loss": 0.5788, + "loss": 0.8827, "step": 18160 }, { - "epoch": 2.1875752468095353, - "grad_norm": 7.53125, + "epoch": 12.960057061340942, + "grad_norm": 17.75, "learning_rate": 3.036e-05, - "loss": 0.7093, + "loss": 0.822, "step": 18170 }, { - "epoch": 2.1887791957620997, - "grad_norm": 8.375, + "epoch": 12.96718972895863, + "grad_norm": 7.1875, "learning_rate": 3.031555555555556e-05, - "loss": 0.513, + "loss": 0.7958, "step": 18180 }, { - "epoch": 2.189983144714664, - "grad_norm": 12.125, + "epoch": 12.97432239657632, + "grad_norm": 6.125, "learning_rate": 3.027111111111111e-05, - "loss": 0.5108, + "loss": 0.676, "step": 18190 }, { - "epoch": 2.1911870936672284, - "grad_norm": 8.5, + "epoch": 12.981455064194009, + "grad_norm": 7.8125, "learning_rate": 3.022666666666667e-05, - "loss": 0.5063, + "loss": 0.8696, "step": 18200 }, { - "epoch": 2.1911870936672284, - "eval/acc": 39.53488540649414, + "epoch": 12.981455064194009, + "eval/acc": 37.20930099487305, "step": 18200 }, { - "epoch": 2.1911870936672284, - "eval_loss": 2.0905356407165527, - "eval_runtime": 0.2118, - "eval_samples_per_second": 202.993, - "eval_steps_per_second": 4.721, + "epoch": 12.981455064194009, + "eval_loss": 3.040698528289795, + "eval_runtime": 0.2626, + "eval_samples_per_second": 163.753, + "eval_steps_per_second": 3.808, "step": 18200 }, { - "epoch": 2.192391042619793, - "grad_norm": 8.0, + "epoch": 12.988587731811698, + "grad_norm": 11.1875, "learning_rate": 3.018222222222222e-05, - "loss": 0.6047, + "loss": 0.7538, "step": 18210 }, { - "epoch": 2.193594991572357, - "grad_norm": 7.53125, + "epoch": 12.995720399429386, + "grad_norm": 12.0, "learning_rate": 3.013777777777778e-05, - "loss": 0.5624, + "loss": 0.7873, "step": 18220 }, { - "epoch": 2.1947989405249215, - "grad_norm": 5.15625, + "epoch": 13.002853067047075, + "grad_norm": 8.4375, "learning_rate": 3.0093333333333335e-05, - "loss": 0.4844, + "loss": 0.7741, "step": 18230 }, { - "epoch": 2.1960028894774863, - "grad_norm": 7.53125, + "epoch": 13.009985734664765, + "grad_norm": 7.5, "learning_rate": 3.0048888888888894e-05, - "loss": 0.5534, + "loss": 0.7644, "step": 18240 }, { - "epoch": 2.1972068384300507, - "grad_norm": 6.75, + "epoch": 13.017118402282454, + "grad_norm": 6.84375, "learning_rate": 3.0004444444444446e-05, - "loss": 0.5302, + "loss": 0.769, "step": 18250 }, { - "epoch": 2.198410787382615, - "grad_norm": 8.5625, + "epoch": 13.024251069900142, + "grad_norm": 6.375, "learning_rate": 2.9959999999999998e-05, - "loss": 0.5995, + "loss": 0.7497, "step": 18260 }, { - "epoch": 2.1996147363351795, - "grad_norm": 7.28125, + "epoch": 13.031383737517832, + "grad_norm": 7.84375, "learning_rate": 2.9915555555555556e-05, - "loss": 0.5813, + "loss": 0.7643, "step": 18270 }, { - "epoch": 2.200818685287744, - "grad_norm": 6.28125, + "epoch": 13.038516405135521, + "grad_norm": 7.40625, "learning_rate": 2.987111111111111e-05, - "loss": 0.5663, + "loss": 0.7955, "step": 18280 }, { - "epoch": 2.202022634240308, - "grad_norm": 8.8125, + "epoch": 13.045649072753209, + "grad_norm": 14.125, "learning_rate": 2.982666666666667e-05, - "loss": 0.6051, + "loss": 0.7396, "step": 18290 }, { - "epoch": 2.2032265831928726, - "grad_norm": 7.15625, + "epoch": 13.052781740370898, + "grad_norm": 6.6875, "learning_rate": 2.9782222222222222e-05, - "loss": 0.5273, + "loss": 0.7616, "step": 18300 }, { - "epoch": 2.2032265831928726, - "eval/acc": 39.53488540649414, + "epoch": 13.052781740370898, + "eval/acc": 44.1860466003418, "step": 18300 }, { - "epoch": 2.2032265831928726, - "eval_loss": 2.106367588043213, - "eval_runtime": 0.2176, - "eval_samples_per_second": 197.621, - "eval_steps_per_second": 4.596, + "epoch": 13.052781740370898, + "eval_loss": 2.3592934608459473, + "eval_runtime": 4.7062, + "eval_samples_per_second": 9.137, + "eval_steps_per_second": 0.212, "step": 18300 }, { - "epoch": 2.204430532145437, - "grad_norm": 8.625, + "epoch": 13.059914407988588, + "grad_norm": 6.875, "learning_rate": 2.973777777777778e-05, - "loss": 0.5102, + "loss": 0.7691, "step": 18310 }, { - "epoch": 2.2056344810980013, - "grad_norm": 9.0, + "epoch": 13.067047075606277, + "grad_norm": 12.4375, "learning_rate": 2.9693333333333333e-05, - "loss": 0.6453, + "loss": 0.8092, "step": 18320 }, { - "epoch": 2.2068384300505657, - "grad_norm": 6.15625, + "epoch": 13.074179743223965, + "grad_norm": 7.65625, "learning_rate": 2.964888888888889e-05, - "loss": 0.5051, + "loss": 0.7653, "step": 18330 }, { - "epoch": 2.20804237900313, - "grad_norm": 7.875, + "epoch": 13.081312410841655, + "grad_norm": 9.9375, "learning_rate": 2.9604444444444446e-05, - "loss": 0.5706, + "loss": 0.7936, "step": 18340 }, { - "epoch": 2.209246327955695, - "grad_norm": 7.84375, + "epoch": 13.088445078459344, + "grad_norm": 11.0625, "learning_rate": 2.9559999999999998e-05, - "loss": 0.5046, + "loss": 0.7666, "step": 18350 }, { - "epoch": 2.2104502769082592, - "grad_norm": 7.34375, + "epoch": 13.095577746077034, + "grad_norm": 12.75, "learning_rate": 2.9515555555555557e-05, - "loss": 0.5694, + "loss": 0.874, "step": 18360 }, { - "epoch": 2.2116542258608236, - "grad_norm": 10.625, + "epoch": 13.102710413694721, + "grad_norm": 6.6875, "learning_rate": 2.9471111111111112e-05, - "loss": 0.5679, + "loss": 0.6979, "step": 18370 }, { - "epoch": 2.212858174813388, - "grad_norm": 6.90625, + "epoch": 13.10984308131241, + "grad_norm": 7.6875, "learning_rate": 2.942666666666667e-05, - "loss": 0.6488, + "loss": 0.8215, "step": 18380 }, { - "epoch": 2.2140621237659524, - "grad_norm": 11.625, + "epoch": 13.1169757489301, + "grad_norm": 6.75, "learning_rate": 2.9382222222222222e-05, - "loss": 0.6556, + "loss": 0.6941, "step": 18390 }, { - "epoch": 2.2152660727185167, - "grad_norm": 6.96875, + "epoch": 13.12410841654779, + "grad_norm": 8.5625, "learning_rate": 2.933777777777778e-05, - "loss": 0.4951, + "loss": 0.7365, "step": 18400 }, { - "epoch": 2.2152660727185167, - "eval/acc": 39.53488540649414, + "epoch": 13.12410841654779, + "eval/acc": 44.1860466003418, "step": 18400 }, { - "epoch": 2.2152660727185167, - "eval_loss": 2.115079402923584, - "eval_runtime": 0.2123, - "eval_samples_per_second": 202.524, - "eval_steps_per_second": 4.71, + "epoch": 13.12410841654779, + "eval_loss": 2.368854284286499, + "eval_runtime": 7.2366, + "eval_samples_per_second": 5.942, + "eval_steps_per_second": 0.138, "step": 18400 }, { - "epoch": 2.216470021671081, - "grad_norm": 6.9375, + "epoch": 13.131241084165477, + "grad_norm": 9.0625, "learning_rate": 2.9293333333333333e-05, - "loss": 0.5073, + "loss": 0.7201, "step": 18410 }, { - "epoch": 2.2176739706236455, - "grad_norm": 9.8125, + "epoch": 13.138373751783167, + "grad_norm": 8.4375, "learning_rate": 2.924888888888889e-05, - "loss": 0.6202, + "loss": 0.9061, "step": 18420 }, { - "epoch": 2.21887791957621, - "grad_norm": 7.40625, + "epoch": 13.145506419400856, + "grad_norm": 5.875, "learning_rate": 2.9204444444444447e-05, - "loss": 0.6417, + "loss": 0.7767, "step": 18430 }, { - "epoch": 2.220081868528774, - "grad_norm": 7.0, + "epoch": 13.152639087018544, + "grad_norm": 6.53125, "learning_rate": 2.9160000000000005e-05, - "loss": 0.5751, + "loss": 0.8086, "step": 18440 }, { - "epoch": 2.2212858174813386, - "grad_norm": 16.375, + "epoch": 13.159771754636234, + "grad_norm": 8.1875, "learning_rate": 2.9115555555555557e-05, - "loss": 0.6271, + "loss": 0.7938, "step": 18450 }, { - "epoch": 2.2224897664339034, - "grad_norm": 8.875, + "epoch": 13.166904422253923, + "grad_norm": 7.84375, "learning_rate": 2.907111111111111e-05, - "loss": 0.5437, + "loss": 0.8435, "step": 18460 }, { - "epoch": 2.2236937153864678, - "grad_norm": 7.75, + "epoch": 13.174037089871613, + "grad_norm": 8.3125, "learning_rate": 2.9026666666666668e-05, - "loss": 0.5829, + "loss": 0.7333, "step": 18470 }, { - "epoch": 2.224897664339032, + "epoch": 13.1811697574893, "grad_norm": 8.1875, "learning_rate": 2.8982222222222223e-05, - "loss": 0.5081, + "loss": 0.7546, "step": 18480 }, { - "epoch": 2.2261016132915965, - "grad_norm": 9.125, + "epoch": 13.18830242510699, + "grad_norm": 7.09375, "learning_rate": 2.893777777777778e-05, - "loss": 0.5637, + "loss": 0.7321, "step": 18490 }, { - "epoch": 2.227305562244161, - "grad_norm": 8.4375, + "epoch": 13.19543509272468, + "grad_norm": 9.375, "learning_rate": 2.8893333333333333e-05, - "loss": 0.5615, + "loss": 0.8419, "step": 18500 }, { - "epoch": 2.227305562244161, - "eval/acc": 41.27906799316406, + "epoch": 13.19543509272468, + "eval/acc": 44.1860466003418, "step": 18500 }, { - "epoch": 2.227305562244161, - "eval_loss": 2.0838871002197266, - "eval_runtime": 0.2124, - "eval_samples_per_second": 202.487, - "eval_steps_per_second": 4.709, + "epoch": 13.19543509272468, + "eval_loss": 2.343879461288452, + "eval_runtime": 0.2875, + "eval_samples_per_second": 149.591, + "eval_steps_per_second": 3.479, "step": 18500 }, { - "epoch": 2.2285095111967252, - "grad_norm": 7.875, + "epoch": 13.202567760342369, + "grad_norm": 9.8125, "learning_rate": 2.8848888888888892e-05, - "loss": 0.5499, + "loss": 0.7606, "step": 18510 }, { - "epoch": 2.2297134601492896, - "grad_norm": 9.6875, + "epoch": 13.209700427960057, + "grad_norm": 6.125, "learning_rate": 2.8804444444444444e-05, - "loss": 0.5668, + "loss": 0.7617, "step": 18520 }, { - "epoch": 2.230917409101854, - "grad_norm": 6.21875, + "epoch": 13.216833095577746, + "grad_norm": 6.4375, "learning_rate": 2.8760000000000002e-05, - "loss": 0.4969, + "loss": 0.6793, "step": 18530 }, { - "epoch": 2.2321213580544184, - "grad_norm": 8.125, + "epoch": 13.223965763195435, + "grad_norm": 8.5625, "learning_rate": 2.8715555555555558e-05, - "loss": 0.6084, + "loss": 0.7353, "step": 18540 }, { - "epoch": 2.2333253070069827, - "grad_norm": 6.21875, + "epoch": 13.231098430813125, + "grad_norm": 6.53125, "learning_rate": 2.8671111111111116e-05, - "loss": 0.5255, + "loss": 0.7745, "step": 18550 }, { - "epoch": 2.234529255959547, - "grad_norm": 5.71875, + "epoch": 13.238231098430813, + "grad_norm": 10.3125, "learning_rate": 2.8626666666666668e-05, - "loss": 0.5353, + "loss": 0.7891, "step": 18560 }, { - "epoch": 2.235733204912112, - "grad_norm": 26.875, + "epoch": 13.245363766048502, + "grad_norm": 6.03125, "learning_rate": 2.858222222222222e-05, - "loss": 0.6619, + "loss": 0.7913, "step": 18570 }, { - "epoch": 2.2369371538646763, - "grad_norm": 8.75, + "epoch": 13.252496433666192, + "grad_norm": 6.5, "learning_rate": 2.853777777777778e-05, - "loss": 0.5125, + "loss": 0.7704, "step": 18580 }, { - "epoch": 2.2381411028172407, - "grad_norm": 6.4375, + "epoch": 13.25962910128388, + "grad_norm": 11.5, "learning_rate": 2.8493333333333334e-05, - "loss": 0.576, + "loss": 0.7956, "step": 18590 }, { - "epoch": 2.239345051769805, - "grad_norm": 6.875, + "epoch": 13.266761768901569, + "grad_norm": 7.34375, "learning_rate": 2.8448888888888892e-05, - "loss": 0.5807, + "loss": 0.7904, "step": 18600 }, { - "epoch": 2.239345051769805, + "epoch": 13.266761768901569, "eval/acc": 39.53488540649414, "step": 18600 }, { - "epoch": 2.239345051769805, - "eval_loss": 2.097263813018799, - "eval_runtime": 0.2153, - "eval_samples_per_second": 199.684, - "eval_steps_per_second": 4.644, + "epoch": 13.266761768901569, + "eval_loss": 2.3613929748535156, + "eval_runtime": 0.2243, + "eval_samples_per_second": 191.727, + "eval_steps_per_second": 4.459, "step": 18600 }, { - "epoch": 2.2405490007223694, - "grad_norm": 8.5625, + "epoch": 13.273894436519258, + "grad_norm": 7.46875, "learning_rate": 2.8404444444444444e-05, - "loss": 0.5527, + "loss": 0.7528, "step": 18610 }, { - "epoch": 2.2417529496749338, - "grad_norm": 5.53125, + "epoch": 13.281027104136948, + "grad_norm": 8.0625, "learning_rate": 2.8360000000000003e-05, - "loss": 0.4939, + "loss": 0.7475, "step": 18620 }, { - "epoch": 2.242956898627498, - "grad_norm": 8.3125, + "epoch": 13.288159771754636, + "grad_norm": 9.9375, "learning_rate": 2.8315555555555555e-05, - "loss": 0.5648, + "loss": 0.7382, "step": 18630 }, { - "epoch": 2.2441608475800625, - "grad_norm": 5.71875, + "epoch": 13.295292439372325, + "grad_norm": 7.28125, "learning_rate": 2.8271111111111113e-05, - "loss": 0.5207, + "loss": 0.8196, "step": 18640 }, { - "epoch": 2.245364796532627, - "grad_norm": 12.9375, + "epoch": 13.302425106990015, + "grad_norm": 8.5, "learning_rate": 2.822666666666667e-05, - "loss": 0.5526, + "loss": 0.9212, "step": 18650 }, { - "epoch": 2.2465687454851913, - "grad_norm": 5.75, + "epoch": 13.309557774607704, + "grad_norm": 7.71875, "learning_rate": 2.818222222222222e-05, - "loss": 0.5586, + "loss": 0.7357, "step": 18660 }, { - "epoch": 2.2477726944377556, - "grad_norm": 13.5625, + "epoch": 13.316690442225392, + "grad_norm": 6.5, "learning_rate": 2.813777777777778e-05, - "loss": 0.5714, + "loss": 0.7228, "step": 18670 }, { - "epoch": 2.2489766433903204, - "grad_norm": 7.65625, + "epoch": 13.323823109843081, + "grad_norm": 6.5625, "learning_rate": 2.8093333333333334e-05, - "loss": 0.5244, + "loss": 0.8229, "step": 18680 }, { - "epoch": 2.250180592342885, - "grad_norm": 10.125, + "epoch": 13.33095577746077, + "grad_norm": 18.25, "learning_rate": 2.8048888888888893e-05, - "loss": 0.6128, + "loss": 0.8386, "step": 18690 }, { - "epoch": 2.251384541295449, - "grad_norm": 7.65625, + "epoch": 13.338088445078458, + "grad_norm": 11.0625, "learning_rate": 2.8004444444444445e-05, - "loss": 0.5419, + "loss": 0.7779, "step": 18700 }, { - "epoch": 2.251384541295449, - "eval/acc": 40.11627960205078, + "epoch": 13.338088445078458, + "eval/acc": 39.53488540649414, "step": 18700 }, { - "epoch": 2.251384541295449, - "eval_loss": 2.088261127471924, - "eval_runtime": 0.2127, - "eval_samples_per_second": 202.124, - "eval_steps_per_second": 4.701, + "epoch": 13.338088445078458, + "eval_loss": 2.363105535507202, + "eval_runtime": 0.2271, + "eval_samples_per_second": 189.351, + "eval_steps_per_second": 4.404, "step": 18700 }, { - "epoch": 2.2525884902480136, - "grad_norm": 8.25, + "epoch": 13.345221112696148, + "grad_norm": 6.75, "learning_rate": 2.7960000000000003e-05, - "loss": 0.5735, + "loss": 0.7851, "step": 18710 }, { - "epoch": 2.253792439200578, - "grad_norm": 20.25, + "epoch": 13.352353780313837, + "grad_norm": 7.28125, "learning_rate": 2.7915555555555555e-05, - "loss": 0.5479, + "loss": 0.7634, "step": 18720 }, { - "epoch": 2.2549963881531423, - "grad_norm": 9.0, + "epoch": 13.359486447931527, + "grad_norm": 7.0, "learning_rate": 2.7871111111111114e-05, - "loss": 0.549, + "loss": 0.8063, "step": 18730 }, { - "epoch": 2.2562003371057067, - "grad_norm": 10.25, + "epoch": 13.366619115549215, + "grad_norm": 7.15625, "learning_rate": 2.782666666666667e-05, - "loss": 0.637, + "loss": 0.7746, "step": 18740 }, { - "epoch": 2.257404286058271, - "grad_norm": 7.5625, + "epoch": 13.373751783166904, + "grad_norm": 7.75, "learning_rate": 2.7782222222222228e-05, - "loss": 0.5333, + "loss": 0.7927, "step": 18750 }, { - "epoch": 2.2586082350108354, - "grad_norm": 8.5625, + "epoch": 13.380884450784594, + "grad_norm": 10.4375, "learning_rate": 2.773777777777778e-05, - "loss": 0.5804, + "loss": 0.7496, "step": 18760 }, { - "epoch": 2.2598121839634, - "grad_norm": 8.0, + "epoch": 13.388017118402283, + "grad_norm": 13.0625, "learning_rate": 2.769333333333333e-05, - "loss": 0.4839, + "loss": 0.7797, "step": 18770 }, { - "epoch": 2.261016132915964, - "grad_norm": 6.53125, + "epoch": 13.39514978601997, + "grad_norm": 7.15625, "learning_rate": 2.764888888888889e-05, - "loss": 0.5463, + "loss": 0.7104, "step": 18780 }, { - "epoch": 2.262220081868529, - "grad_norm": 7.03125, + "epoch": 13.40228245363766, + "grad_norm": 123.0, "learning_rate": 2.7604444444444445e-05, - "loss": 0.6316, + "loss": 0.7854, "step": 18790 }, { - "epoch": 2.2634240308210933, - "grad_norm": 10.0625, + "epoch": 13.40941512125535, + "grad_norm": 8.9375, "learning_rate": 2.7560000000000004e-05, - "loss": 0.5341, + "loss": 0.7275, "step": 18800 }, { - "epoch": 2.2634240308210933, - "eval/acc": 40.11627960205078, + "epoch": 13.40941512125535, + "eval/acc": 46.511627197265625, "step": 18800 }, { - "epoch": 2.2634240308210933, - "eval_loss": 2.0978806018829346, - "eval_runtime": 0.2109, - "eval_samples_per_second": 203.871, - "eval_steps_per_second": 4.741, + "epoch": 13.40941512125535, + "eval_loss": 2.3632330894470215, + "eval_runtime": 0.223, + "eval_samples_per_second": 192.833, + "eval_steps_per_second": 4.484, "step": 18800 }, { - "epoch": 2.2646279797736577, - "grad_norm": 6.1875, + "epoch": 13.41654778887304, + "grad_norm": 8.0625, "learning_rate": 2.7515555555555556e-05, - "loss": 0.6241, + "loss": 0.7429, "step": 18810 }, { - "epoch": 2.265831928726222, - "grad_norm": 9.375, + "epoch": 13.423680456490727, + "grad_norm": 11.0, "learning_rate": 2.7471111111111114e-05, - "loss": 0.5504, + "loss": 0.7845, "step": 18820 }, { - "epoch": 2.2670358776787864, - "grad_norm": 6.96875, + "epoch": 13.430813124108417, + "grad_norm": 6.6875, "learning_rate": 2.7426666666666666e-05, - "loss": 0.5541, + "loss": 0.6946, "step": 18830 }, { - "epoch": 2.268239826631351, - "grad_norm": 8.875, + "epoch": 13.437945791726106, + "grad_norm": 6.875, "learning_rate": 2.7382222222222225e-05, - "loss": 0.5631, + "loss": 0.8386, "step": 18840 }, { - "epoch": 2.269443775583915, - "grad_norm": 7.25, + "epoch": 13.445078459343794, + "grad_norm": 10.375, "learning_rate": 2.733777777777778e-05, - "loss": 0.5903, + "loss": 0.7235, "step": 18850 }, { - "epoch": 2.2706477245364796, - "grad_norm": 7.625, + "epoch": 13.452211126961483, + "grad_norm": 7.90625, "learning_rate": 2.7293333333333332e-05, - "loss": 0.6442, + "loss": 0.7586, "step": 18860 }, { - "epoch": 2.271851673489044, - "grad_norm": 6.65625, + "epoch": 13.459343794579173, + "grad_norm": 7.34375, "learning_rate": 2.724888888888889e-05, - "loss": 0.5116, + "loss": 0.7546, "step": 18870 }, { - "epoch": 2.2730556224416083, - "grad_norm": 9.0625, + "epoch": 13.466476462196862, + "grad_norm": 5.9375, "learning_rate": 2.7204444444444442e-05, - "loss": 0.5303, + "loss": 0.7515, "step": 18880 }, { - "epoch": 2.2742595713941727, - "grad_norm": 8.4375, + "epoch": 13.47360912981455, + "grad_norm": 8.1875, "learning_rate": 2.716e-05, - "loss": 0.5667, + "loss": 0.7242, "step": 18890 }, { - "epoch": 2.2754635203467375, - "grad_norm": 8.75, + "epoch": 13.48074179743224, + "grad_norm": 6.53125, "learning_rate": 2.7115555555555556e-05, - "loss": 0.5621, + "loss": 0.7571, "step": 18900 }, { - "epoch": 2.2754635203467375, - "eval/acc": 38.953487396240234, + "epoch": 13.48074179743224, + "eval/acc": 44.1860466003418, "step": 18900 }, { - "epoch": 2.2754635203467375, - "eval_loss": 2.1006851196289062, - "eval_runtime": 0.2174, - "eval_samples_per_second": 197.827, - "eval_steps_per_second": 4.601, + "epoch": 13.48074179743224, + "eval_loss": 2.3733906745910645, + "eval_runtime": 0.2277, + "eval_samples_per_second": 188.866, + "eval_steps_per_second": 4.392, "step": 18900 }, { - "epoch": 2.276667469299302, - "grad_norm": 6.0625, + "epoch": 13.487874465049929, + "grad_norm": 8.125, "learning_rate": 2.7071111111111115e-05, - "loss": 0.5304, + "loss": 0.813, "step": 18910 }, { - "epoch": 2.2778714182518662, - "grad_norm": 8.25, + "epoch": 13.495007132667618, + "grad_norm": 9.4375, "learning_rate": 2.7026666666666667e-05, - "loss": 0.5421, + "loss": 0.7889, "step": 18920 }, { - "epoch": 2.2790753672044306, - "grad_norm": 5.34375, + "epoch": 13.502139800285306, + "grad_norm": 6.875, "learning_rate": 2.6982222222222225e-05, - "loss": 0.5085, + "loss": 0.6809, "step": 18930 }, { - "epoch": 2.280279316156995, - "grad_norm": 9.8125, + "epoch": 13.509272467902996, + "grad_norm": 17.875, "learning_rate": 2.6937777777777777e-05, - "loss": 0.4657, + "loss": 0.7368, "step": 18940 }, { - "epoch": 2.2814832651095593, - "grad_norm": 7.78125, + "epoch": 13.516405135520685, + "grad_norm": 7.3125, "learning_rate": 2.6893333333333336e-05, - "loss": 0.5507, + "loss": 0.7115, "step": 18950 }, { - "epoch": 2.2826872140621237, - "grad_norm": 6.40625, + "epoch": 13.523537803138375, + "grad_norm": 7.84375, "learning_rate": 2.684888888888889e-05, - "loss": 0.6328, + "loss": 0.6828, "step": 18960 }, { - "epoch": 2.283891163014688, - "grad_norm": 9.6875, + "epoch": 13.530670470756062, + "grad_norm": 8.4375, "learning_rate": 2.6804444444444443e-05, - "loss": 0.4663, + "loss": 0.7184, "step": 18970 }, { - "epoch": 2.2850951119672525, - "grad_norm": 6.84375, + "epoch": 13.537803138373752, + "grad_norm": 65.0, "learning_rate": 2.676e-05, - "loss": 0.5825, + "loss": 0.7808, "step": 18980 }, { - "epoch": 2.286299060919817, - "grad_norm": 7.25, + "epoch": 13.544935805991441, + "grad_norm": 8.75, "learning_rate": 2.6715555555555553e-05, - "loss": 0.5971, + "loss": 0.9181, "step": 18990 }, { - "epoch": 2.287503009872381, - "grad_norm": 8.875, + "epoch": 13.552068473609129, + "grad_norm": 9.3125, "learning_rate": 2.6671111111111112e-05, - "loss": 0.5891, + "loss": 0.7868, "step": 19000 }, { - "epoch": 2.287503009872381, - "eval/acc": 41.86046600341797, + "epoch": 13.552068473609129, + "eval/acc": 39.53488540649414, "step": 19000 }, { - "epoch": 2.287503009872381, - "eval_loss": 2.106297492980957, - "eval_runtime": 0.2175, - "eval_samples_per_second": 197.744, - "eval_steps_per_second": 4.599, + "epoch": 13.552068473609129, + "eval_loss": 2.3711330890655518, + "eval_runtime": 0.2397, + "eval_samples_per_second": 179.384, + "eval_steps_per_second": 4.172, "step": 19000 }, { - "epoch": 2.288706958824946, - "grad_norm": 6.25, + "epoch": 13.559201141226819, + "grad_norm": 23.625, "learning_rate": 2.6626666666666667e-05, - "loss": 0.5987, + "loss": 0.8008, "step": 19010 }, { - "epoch": 2.2899109077775104, - "grad_norm": 7.25, + "epoch": 13.566333808844508, + "grad_norm": 7.75, "learning_rate": 2.6582222222222226e-05, - "loss": 0.5449, + "loss": 0.7326, "step": 19020 }, { - "epoch": 2.2911148567300748, - "grad_norm": 9.125, + "epoch": 13.573466476462198, + "grad_norm": 9.375, "learning_rate": 2.6537777777777777e-05, - "loss": 0.52, + "loss": 0.7914, "step": 19030 }, { - "epoch": 2.292318805682639, - "grad_norm": 6.9375, + "epoch": 13.580599144079885, + "grad_norm": 5.875, "learning_rate": 2.6493333333333336e-05, - "loss": 0.5207, + "loss": 0.7849, "step": 19040 }, { - "epoch": 2.2935227546352035, - "grad_norm": 9.625, + "epoch": 13.587731811697575, + "grad_norm": 7.96875, "learning_rate": 2.644888888888889e-05, - "loss": 0.5534, + "loss": 0.7314, "step": 19050 }, { - "epoch": 2.294726703587768, - "grad_norm": 8.3125, + "epoch": 13.594864479315264, + "grad_norm": 7.90625, "learning_rate": 2.640444444444445e-05, - "loss": 0.6109, + "loss": 0.8637, "step": 19060 }, { - "epoch": 2.2959306525403322, - "grad_norm": 9.625, + "epoch": 13.601997146932954, + "grad_norm": 8.5625, "learning_rate": 2.6360000000000002e-05, - "loss": 0.6587, + "loss": 0.8337, "step": 19070 }, { - "epoch": 2.2971346014928966, - "grad_norm": 8.5625, + "epoch": 13.609129814550641, + "grad_norm": 8.75, "learning_rate": 2.6315555555555554e-05, - "loss": 0.6906, + "loss": 0.7362, "step": 19080 }, { - "epoch": 2.298338550445461, - "grad_norm": 6.375, + "epoch": 13.616262482168331, + "grad_norm": 7.3125, "learning_rate": 2.6271111111111112e-05, - "loss": 0.6105, + "loss": 0.7999, "step": 19090 }, { - "epoch": 2.2995424993980254, - "grad_norm": 7.40625, + "epoch": 13.62339514978602, + "grad_norm": 7.21875, "learning_rate": 2.6226666666666667e-05, - "loss": 0.5454, + "loss": 0.8105, "step": 19100 }, { - "epoch": 2.2995424993980254, - "eval/acc": 40.11627960205078, + "epoch": 13.62339514978602, + "eval/acc": 41.86046600341797, "step": 19100 }, { - "epoch": 2.2995424993980254, - "eval_loss": 2.120891571044922, - "eval_runtime": 0.2196, - "eval_samples_per_second": 195.827, - "eval_steps_per_second": 4.554, + "epoch": 13.62339514978602, + "eval_loss": 2.3755757808685303, + "eval_runtime": 0.2319, + "eval_samples_per_second": 185.454, + "eval_steps_per_second": 4.313, "step": 19100 }, { - "epoch": 2.3007464483505897, - "grad_norm": 8.75, + "epoch": 13.63052781740371, + "grad_norm": 30.5, "learning_rate": 2.6182222222222226e-05, - "loss": 0.6018, + "loss": 0.7981, "step": 19110 }, { - "epoch": 2.3019503973031545, - "grad_norm": 8.5625, + "epoch": 13.637660485021398, + "grad_norm": 7.0, "learning_rate": 2.6137777777777778e-05, - "loss": 0.6087, + "loss": 0.7317, "step": 19120 }, { - "epoch": 2.303154346255719, - "grad_norm": 8.375, + "epoch": 13.644793152639087, + "grad_norm": 7.40625, "learning_rate": 2.6093333333333336e-05, - "loss": 0.5712, + "loss": 0.808, "step": 19130 }, { - "epoch": 2.3043582952082833, - "grad_norm": 10.1875, + "epoch": 13.651925820256777, + "grad_norm": 8.5, "learning_rate": 2.604888888888889e-05, - "loss": 0.6849, + "loss": 0.8222, "step": 19140 }, { - "epoch": 2.3055622441608477, - "grad_norm": 5.625, + "epoch": 13.659058487874464, + "grad_norm": 8.625, "learning_rate": 2.6004444444444447e-05, - "loss": 0.5341, + "loss": 0.7333, "step": 19150 }, { - "epoch": 2.306766193113412, - "grad_norm": 5.65625, + "epoch": 13.666191155492154, + "grad_norm": 8.1875, "learning_rate": 2.5960000000000002e-05, - "loss": 0.5575, + "loss": 0.9, "step": 19160 }, { - "epoch": 2.3079701420659764, - "grad_norm": 7.4375, + "epoch": 13.673323823109843, + "grad_norm": 10.25, "learning_rate": 2.5915555555555554e-05, - "loss": 0.5442, + "loss": 0.7297, "step": 19170 }, { - "epoch": 2.3091740910185408, - "grad_norm": 13.4375, + "epoch": 13.680456490727533, + "grad_norm": 8.875, "learning_rate": 2.5871111111111113e-05, - "loss": 0.6246, + "loss": 0.8401, "step": 19180 }, { - "epoch": 2.310378039971105, - "grad_norm": 7.5, + "epoch": 13.68758915834522, + "grad_norm": 8.375, "learning_rate": 2.5826666666666664e-05, - "loss": 0.5112, + "loss": 0.803, "step": 19190 }, { - "epoch": 2.3115819889236695, - "grad_norm": 7.625, + "epoch": 13.69472182596291, + "grad_norm": 7.09375, "learning_rate": 2.5782222222222223e-05, - "loss": 0.5412, + "loss": 0.703, "step": 19200 }, { - "epoch": 2.3115819889236695, - "eval/acc": 39.53488540649414, + "epoch": 13.69472182596291, + "eval/acc": 41.86046600341797, "step": 19200 }, { - "epoch": 2.3115819889236695, - "eval_loss": 2.1130447387695312, - "eval_runtime": 0.3633, - "eval_samples_per_second": 118.375, - "eval_steps_per_second": 2.753, + "epoch": 13.69472182596291, + "eval_loss": 2.3849244117736816, + "eval_runtime": 0.2258, + "eval_samples_per_second": 190.453, + "eval_steps_per_second": 4.429, "step": 19200 }, { - "epoch": 2.312785937876234, - "grad_norm": 7.125, + "epoch": 13.7018544935806, + "grad_norm": 7.71875, "learning_rate": 2.573777777777778e-05, - "loss": 0.5188, + "loss": 0.7636, "step": 19210 }, { - "epoch": 2.3139898868287982, - "grad_norm": 7.84375, + "epoch": 13.708987161198289, + "grad_norm": 9.8125, "learning_rate": 2.5693333333333337e-05, - "loss": 0.5767, + "loss": 0.8003, "step": 19220 }, { - "epoch": 2.315193835781363, - "grad_norm": 7.71875, + "epoch": 13.716119828815977, + "grad_norm": 5.90625, "learning_rate": 2.564888888888889e-05, - "loss": 0.6596, + "loss": 0.7509, "step": 19230 }, { - "epoch": 2.3163977847339274, - "grad_norm": 9.6875, + "epoch": 13.723252496433666, + "grad_norm": 8.8125, "learning_rate": 2.5604444444444447e-05, - "loss": 0.6366, + "loss": 0.8054, "step": 19240 }, { - "epoch": 2.317601733686492, - "grad_norm": 6.21875, + "epoch": 13.730385164051356, + "grad_norm": 8.0, "learning_rate": 2.556e-05, - "loss": 0.5676, + "loss": 0.8303, "step": 19250 }, { - "epoch": 2.318805682639056, - "grad_norm": 6.125, + "epoch": 13.737517831669045, + "grad_norm": 15.6875, "learning_rate": 2.5515555555555558e-05, - "loss": 0.5419, + "loss": 0.8224, "step": 19260 }, { - "epoch": 2.3200096315916205, - "grad_norm": 8.125, + "epoch": 13.744650499286733, + "grad_norm": 17.25, "learning_rate": 2.5471111111111113e-05, - "loss": 0.5438, + "loss": 0.7978, "step": 19270 }, { - "epoch": 2.321213580544185, - "grad_norm": 8.5, + "epoch": 13.751783166904422, + "grad_norm": 9.625, "learning_rate": 2.5426666666666665e-05, - "loss": 0.5033, + "loss": 0.7964, "step": 19280 }, { - "epoch": 2.3224175294967493, - "grad_norm": 6.0, + "epoch": 13.758915834522112, + "grad_norm": 6.875, "learning_rate": 2.5382222222222224e-05, - "loss": 0.5529, + "loss": 0.8014, "step": 19290 }, { - "epoch": 2.3236214784493137, - "grad_norm": 7.34375, + "epoch": 13.7660485021398, + "grad_norm": 7.09375, "learning_rate": 2.5337777777777775e-05, - "loss": 0.5337, + "loss": 0.7804, "step": 19300 }, { - "epoch": 2.3236214784493137, - "eval/acc": 39.53488540649414, + "epoch": 13.7660485021398, + "eval/acc": 44.1860466003418, "step": 19300 }, { - "epoch": 2.3236214784493137, - "eval_loss": 2.0968053340911865, - "eval_runtime": 0.2285, - "eval_samples_per_second": 188.168, - "eval_steps_per_second": 4.376, + "epoch": 13.7660485021398, + "eval_loss": 2.3624305725097656, + "eval_runtime": 0.262, + "eval_samples_per_second": 164.141, + "eval_steps_per_second": 3.817, "step": 19300 }, { - "epoch": 2.324825427401878, - "grad_norm": 9.0625, + "epoch": 13.773181169757489, + "grad_norm": 6.46875, "learning_rate": 2.5293333333333334e-05, - "loss": 0.5345, + "loss": 0.7614, "step": 19310 }, { - "epoch": 2.3260293763544424, - "grad_norm": 7.125, + "epoch": 13.780313837375179, + "grad_norm": 7.8125, "learning_rate": 2.524888888888889e-05, - "loss": 0.5493, + "loss": 0.7323, "step": 19320 }, { - "epoch": 2.3272333253070068, - "grad_norm": 8.5625, + "epoch": 13.787446504992868, + "grad_norm": 6.375, "learning_rate": 2.5204444444444448e-05, - "loss": 0.5026, + "loss": 0.6908, "step": 19330 }, { - "epoch": 2.3284372742595716, - "grad_norm": 8.3125, + "epoch": 13.794579172610556, + "grad_norm": 10.4375, "learning_rate": 2.516e-05, - "loss": 0.6307, + "loss": 0.7742, "step": 19340 }, { - "epoch": 2.329641223212136, - "grad_norm": 7.84375, + "epoch": 13.801711840228245, + "grad_norm": 7.4375, "learning_rate": 2.5115555555555558e-05, - "loss": 0.5954, + "loss": 0.7686, "step": 19350 }, { - "epoch": 2.3308451721647003, - "grad_norm": 8.8125, + "epoch": 13.808844507845935, + "grad_norm": 8.875, "learning_rate": 2.5071111111111114e-05, - "loss": 0.5993, + "loss": 0.8334, "step": 19360 }, { - "epoch": 2.3320491211172647, - "grad_norm": 6.03125, + "epoch": 13.815977175463622, + "grad_norm": 7.5, "learning_rate": 2.5026666666666672e-05, - "loss": 0.5018, + "loss": 0.7758, "step": 19370 }, { - "epoch": 2.333253070069829, - "grad_norm": 8.0, + "epoch": 13.823109843081312, + "grad_norm": 7.53125, "learning_rate": 2.4982222222222224e-05, - "loss": 0.5332, + "loss": 0.7814, "step": 19380 }, { - "epoch": 2.3344570190223934, - "grad_norm": 7.84375, + "epoch": 13.830242510699001, + "grad_norm": 14.5625, "learning_rate": 2.493777777777778e-05, - "loss": 0.6697, + "loss": 0.8628, "step": 19390 }, { - "epoch": 2.335660967974958, - "grad_norm": 7.40625, + "epoch": 13.837375178316691, + "grad_norm": 6.5625, "learning_rate": 2.4893333333333334e-05, - "loss": 0.5305, + "loss": 0.7898, "step": 19400 }, { - "epoch": 2.335660967974958, - "eval/acc": 40.11627960205078, + "epoch": 13.837375178316691, + "eval/acc": 41.86046600341797, "step": 19400 }, { - "epoch": 2.335660967974958, - "eval_loss": 2.104180335998535, - "eval_runtime": 0.3958, - "eval_samples_per_second": 108.651, - "eval_steps_per_second": 2.527, + "epoch": 13.837375178316691, + "eval_loss": 2.359168767929077, + "eval_runtime": 0.2248, + "eval_samples_per_second": 191.308, + "eval_steps_per_second": 4.449, "step": 19400 }, { - "epoch": 2.336864916927522, - "grad_norm": 8.375, + "epoch": 13.844507845934379, + "grad_norm": 15.375, "learning_rate": 2.484888888888889e-05, - "loss": 0.5422, + "loss": 0.786, "step": 19410 }, { - "epoch": 2.3380688658800866, - "grad_norm": 16.875, + "epoch": 13.851640513552068, + "grad_norm": 5.59375, "learning_rate": 2.4804444444444448e-05, - "loss": 0.6836, + "loss": 0.8072, "step": 19420 }, { - "epoch": 2.339272814832651, - "grad_norm": 7.09375, + "epoch": 13.858773181169758, + "grad_norm": 8.1875, "learning_rate": 2.476e-05, - "loss": 0.6242, + "loss": 0.7777, "step": 19430 }, { - "epoch": 2.3404767637852153, - "grad_norm": 7.28125, + "epoch": 13.865905848787447, + "grad_norm": 8.8125, "learning_rate": 2.4715555555555555e-05, - "loss": 0.616, + "loss": 0.8148, "step": 19440 }, { - "epoch": 2.34168071273778, - "grad_norm": 7.9375, + "epoch": 13.873038516405135, + "grad_norm": 10.9375, "learning_rate": 2.467111111111111e-05, - "loss": 0.4962, + "loss": 0.7659, "step": 19450 }, { - "epoch": 2.3428846616903445, - "grad_norm": 6.4375, + "epoch": 13.880171184022824, + "grad_norm": 5.59375, "learning_rate": 2.4626666666666666e-05, - "loss": 0.4496, + "loss": 0.7819, "step": 19460 }, { - "epoch": 2.344088610642909, - "grad_norm": 8.6875, + "epoch": 13.887303851640514, + "grad_norm": 8.125, "learning_rate": 2.4582222222222224e-05, - "loss": 0.5492, + "loss": 0.8459, "step": 19470 }, { - "epoch": 2.3452925595954732, - "grad_norm": 40.5, + "epoch": 13.894436519258203, + "grad_norm": 7.1875, "learning_rate": 2.453777777777778e-05, - "loss": 0.6082, + "loss": 0.7448, "step": 19480 }, { - "epoch": 2.3464965085480376, - "grad_norm": 7.53125, + "epoch": 13.901569186875891, + "grad_norm": 7.6875, "learning_rate": 2.4493333333333335e-05, - "loss": 0.4591, + "loss": 0.8096, "step": 19490 }, { - "epoch": 2.347700457500602, - "grad_norm": 9.4375, + "epoch": 13.90870185449358, + "grad_norm": 12.0, "learning_rate": 2.444888888888889e-05, - "loss": 0.5899, + "loss": 0.7402, "step": 19500 }, { - "epoch": 2.347700457500602, + "epoch": 13.90870185449358, "eval/acc": 39.53488540649414, "step": 19500 }, { - "epoch": 2.347700457500602, - "eval_loss": 2.1160874366760254, - "eval_runtime": 4.1188, - "eval_samples_per_second": 10.44, - "eval_steps_per_second": 0.243, + "epoch": 13.90870185449358, + "eval_loss": 2.3664777278900146, + "eval_runtime": 0.2287, + "eval_samples_per_second": 188.004, + "eval_steps_per_second": 4.372, "step": 19500 }, { - "epoch": 2.3489044064531663, - "grad_norm": 7.625, + "epoch": 13.91583452211127, + "grad_norm": 8.5, "learning_rate": 2.4404444444444445e-05, - "loss": 0.5605, + "loss": 0.796, "step": 19510 }, { - "epoch": 2.3501083554057307, - "grad_norm": 7.59375, + "epoch": 13.922967189728958, + "grad_norm": 10.3125, "learning_rate": 2.4360000000000004e-05, - "loss": 0.5862, + "loss": 0.7331, "step": 19520 }, { - "epoch": 2.351312304358295, - "grad_norm": 8.0625, + "epoch": 13.930099857346647, + "grad_norm": 9.0, "learning_rate": 2.431555555555556e-05, - "loss": 0.583, + "loss": 0.6982, "step": 19530 }, { - "epoch": 2.3525162533108595, - "grad_norm": 7.09375, + "epoch": 13.937232524964337, + "grad_norm": 8.0625, "learning_rate": 2.427111111111111e-05, - "loss": 0.5206, + "loss": 0.7831, "step": 19540 }, { - "epoch": 2.353720202263424, - "grad_norm": 9.1875, + "epoch": 13.944365192582026, + "grad_norm": 8.1875, "learning_rate": 2.4226666666666666e-05, - "loss": 0.6646, + "loss": 0.7795, "step": 19550 }, { - "epoch": 2.3549241512159886, - "grad_norm": 9.5625, + "epoch": 13.951497860199714, + "grad_norm": 6.25, "learning_rate": 2.418222222222222e-05, - "loss": 0.5658, + "loss": 0.7416, "step": 19560 }, { - "epoch": 2.356128100168553, - "grad_norm": 6.5625, + "epoch": 13.958630527817403, + "grad_norm": 7.03125, "learning_rate": 2.413777777777778e-05, - "loss": 0.5273, + "loss": 0.7553, "step": 19570 }, { - "epoch": 2.3573320491211174, - "grad_norm": 6.15625, + "epoch": 13.965763195435093, + "grad_norm": 7.4375, "learning_rate": 2.4093333333333335e-05, - "loss": 0.647, + "loss": 0.7573, "step": 19580 }, { - "epoch": 2.3585359980736817, - "grad_norm": 7.9375, + "epoch": 13.972895863052782, + "grad_norm": 7.75, "learning_rate": 2.404888888888889e-05, - "loss": 0.5751, + "loss": 0.7041, "step": 19590 }, { - "epoch": 2.359739947026246, - "grad_norm": 10.125, + "epoch": 13.98002853067047, + "grad_norm": 8.75, "learning_rate": 2.4004444444444446e-05, - "loss": 0.5891, + "loss": 0.8235, "step": 19600 }, { - "epoch": 2.359739947026246, - "eval/acc": 40.11627960205078, + "epoch": 13.98002853067047, + "eval/acc": 44.1860466003418, "step": 19600 }, { - "epoch": 2.359739947026246, - "eval_loss": 2.0887560844421387, - "eval_runtime": 3.2754, - "eval_samples_per_second": 13.128, - "eval_steps_per_second": 0.305, + "epoch": 13.98002853067047, + "eval_loss": 2.3475139141082764, + "eval_runtime": 0.2238, + "eval_samples_per_second": 192.095, + "eval_steps_per_second": 4.467, "step": 19600 }, { - "epoch": 2.3609438959788105, - "grad_norm": 8.0625, + "epoch": 13.98716119828816, + "grad_norm": 8.9375, "learning_rate": 2.396e-05, - "loss": 0.5342, + "loss": 0.8462, "step": 19610 }, { - "epoch": 2.362147844931375, + "epoch": 13.99429386590585, "grad_norm": 7.9375, "learning_rate": 2.3915555555555556e-05, - "loss": 0.535, + "loss": 0.7937, "step": 19620 }, { - "epoch": 2.3633517938839392, - "grad_norm": 6.96875, + "epoch": 14.001426533523539, + "grad_norm": 13.1875, "learning_rate": 2.3871111111111115e-05, - "loss": 0.5114, + "loss": 0.7584, "step": 19630 }, { - "epoch": 2.3645557428365036, - "grad_norm": 8.4375, + "epoch": 14.008559201141226, + "grad_norm": 6.25, "learning_rate": 2.3826666666666667e-05, - "loss": 0.5442, + "loss": 0.7309, "step": 19640 }, { - "epoch": 2.365759691789068, - "grad_norm": 8.0, + "epoch": 14.015691868758916, + "grad_norm": 6.9375, "learning_rate": 2.3782222222222222e-05, - "loss": 0.4812, + "loss": 0.7802, "step": 19650 }, { - "epoch": 2.3669636407416323, - "grad_norm": 7.25, + "epoch": 14.022824536376605, + "grad_norm": 7.4375, "learning_rate": 2.3737777777777777e-05, - "loss": 0.6373, + "loss": 0.6789, "step": 19660 }, { - "epoch": 2.368167589694197, - "grad_norm": 7.0, + "epoch": 14.029957203994293, + "grad_norm": 8.25, "learning_rate": 2.3693333333333332e-05, - "loss": 0.5311, + "loss": 0.6285, "step": 19670 }, { - "epoch": 2.3693715386467615, - "grad_norm": 8.25, + "epoch": 14.037089871611983, + "grad_norm": 15.9375, "learning_rate": 2.364888888888889e-05, - "loss": 0.6208, + "loss": 0.8654, "step": 19680 }, { - "epoch": 2.370575487599326, - "grad_norm": 7.15625, + "epoch": 14.044222539229672, + "grad_norm": 8.625, "learning_rate": 2.3604444444444446e-05, - "loss": 0.5762, + "loss": 0.7778, "step": 19690 }, { - "epoch": 2.3717794365518903, - "grad_norm": 11.4375, + "epoch": 14.051355206847362, + "grad_norm": 8.0625, "learning_rate": 2.356e-05, - "loss": 0.5301, + "loss": 0.8099, "step": 19700 }, { - "epoch": 2.3717794365518903, - "eval/acc": 41.86046600341797, + "epoch": 14.051355206847362, + "eval/acc": 51.16279220581055, "step": 19700 }, { - "epoch": 2.3717794365518903, - "eval_loss": 2.0873513221740723, - "eval_runtime": 0.2195, - "eval_samples_per_second": 195.888, - "eval_steps_per_second": 4.556, + "epoch": 14.051355206847362, + "eval_loss": 2.568962574005127, + "eval_runtime": 7.2029, + "eval_samples_per_second": 5.97, + "eval_steps_per_second": 0.139, "step": 19700 }, { - "epoch": 2.3729833855044546, - "grad_norm": 7.65625, + "epoch": 14.05848787446505, + "grad_norm": 10.5, "learning_rate": 2.3515555555555557e-05, - "loss": 0.5212, + "loss": 0.7666, "step": 19710 }, { - "epoch": 2.374187334457019, - "grad_norm": 8.1875, + "epoch": 14.065620542082739, + "grad_norm": 10.9375, "learning_rate": 2.3471111111111112e-05, - "loss": 0.5576, + "loss": 0.7339, "step": 19720 }, { - "epoch": 2.3753912834095834, - "grad_norm": 8.625, + "epoch": 14.072753209700428, + "grad_norm": 7.21875, "learning_rate": 2.342666666666667e-05, - "loss": 0.5025, + "loss": 0.7559, "step": 19730 }, { - "epoch": 2.3765952323621478, - "grad_norm": 7.34375, + "epoch": 14.079885877318118, + "grad_norm": 15.0625, "learning_rate": 2.3382222222222222e-05, - "loss": 0.4975, + "loss": 0.7539, "step": 19740 }, { - "epoch": 2.377799181314712, - "grad_norm": 7.59375, + "epoch": 14.087018544935805, + "grad_norm": 6.90625, "learning_rate": 2.3337777777777778e-05, - "loss": 0.5568, + "loss": 0.744, "step": 19750 }, { - "epoch": 2.3790031302672765, - "grad_norm": 5.03125, + "epoch": 14.094151212553495, + "grad_norm": 7.28125, "learning_rate": 2.3293333333333333e-05, - "loss": 0.508, + "loss": 0.8038, "step": 19760 }, { - "epoch": 2.380207079219841, - "grad_norm": 6.53125, + "epoch": 14.101283880171184, + "grad_norm": 8.8125, "learning_rate": 2.3248888888888888e-05, - "loss": 0.5257, + "loss": 0.819, "step": 19770 }, { - "epoch": 2.3814110281724057, - "grad_norm": 8.125, + "epoch": 14.108416547788874, + "grad_norm": 7.1875, "learning_rate": 2.3204444444444447e-05, - "loss": 0.5114, + "loss": 0.7756, "step": 19780 }, { - "epoch": 2.38261497712497, - "grad_norm": 8.3125, + "epoch": 14.115549215406562, + "grad_norm": 7.4375, "learning_rate": 2.3160000000000002e-05, - "loss": 0.572, + "loss": 0.8243, "step": 19790 }, { - "epoch": 2.3838189260775344, - "grad_norm": 5.9375, + "epoch": 14.122681883024251, + "grad_norm": 6.21875, "learning_rate": 2.3115555555555557e-05, - "loss": 0.548, + "loss": 0.7094, "step": 19800 }, { - "epoch": 2.3838189260775344, - "eval/acc": 41.86046600341797, + "epoch": 14.122681883024251, + "eval/acc": 51.16279220581055, "step": 19800 }, { - "epoch": 2.3838189260775344, - "eval_loss": 2.076712131500244, - "eval_runtime": 0.2153, - "eval_samples_per_second": 199.681, - "eval_steps_per_second": 4.644, + "epoch": 14.122681883024251, + "eval_loss": 2.565817356109619, + "eval_runtime": 0.2181, + "eval_samples_per_second": 197.132, + "eval_steps_per_second": 4.584, "step": 19800 }, { - "epoch": 2.385022875030099, - "grad_norm": 9.5625, + "epoch": 14.12981455064194, + "grad_norm": 6.9375, "learning_rate": 2.3071111111111112e-05, - "loss": 0.5948, + "loss": 0.7453, "step": 19810 }, { - "epoch": 2.386226823982663, - "grad_norm": 12.6875, + "epoch": 14.136947218259628, + "grad_norm": 7.21875, "learning_rate": 2.3026666666666668e-05, - "loss": 0.5362, + "loss": 0.8192, "step": 19820 }, { - "epoch": 2.3874307729352275, - "grad_norm": 8.625, + "epoch": 14.144079885877318, + "grad_norm": 7.9375, "learning_rate": 2.2982222222222223e-05, - "loss": 0.5482, + "loss": 0.7633, "step": 19830 }, { - "epoch": 2.388634721887792, - "grad_norm": 5.40625, + "epoch": 14.151212553495007, + "grad_norm": 6.875, "learning_rate": 2.293777777777778e-05, - "loss": 0.5819, + "loss": 0.7686, "step": 19840 }, { - "epoch": 2.3898386708403563, - "grad_norm": 6.96875, + "epoch": 14.158345221112697, + "grad_norm": 9.5, "learning_rate": 2.2893333333333333e-05, - "loss": 0.5025, + "loss": 0.7597, "step": 19850 }, { - "epoch": 2.3910426197929207, - "grad_norm": 5.75, + "epoch": 14.165477888730384, + "grad_norm": 7.03125, "learning_rate": 2.284888888888889e-05, - "loss": 0.5603, + "loss": 0.7675, "step": 19860 }, { - "epoch": 2.392246568745485, - "grad_norm": 9.875, + "epoch": 14.172610556348074, + "grad_norm": 9.625, "learning_rate": 2.2804444444444444e-05, - "loss": 0.602, + "loss": 0.7373, "step": 19870 }, { - "epoch": 2.3934505176980494, - "grad_norm": 7.5625, + "epoch": 14.179743223965763, + "grad_norm": 7.0, "learning_rate": 2.2760000000000002e-05, - "loss": 0.5969, + "loss": 0.801, "step": 19880 }, { - "epoch": 2.394654466650614, - "grad_norm": 8.0, + "epoch": 14.186875891583453, + "grad_norm": 11.875, "learning_rate": 2.2715555555555558e-05, - "loss": 0.5586, + "loss": 0.7575, "step": 19890 }, { - "epoch": 2.3958584156031786, - "grad_norm": 7.25, + "epoch": 14.19400855920114, + "grad_norm": 8.375, "learning_rate": 2.2671111111111113e-05, - "loss": 0.6009, + "loss": 0.8073, "step": 19900 }, { - "epoch": 2.3958584156031786, - "eval/acc": 41.27906799316406, + "epoch": 14.19400855920114, + "eval/acc": 53.488372802734375, "step": 19900 }, { - "epoch": 2.3958584156031786, - "eval_loss": 2.0865280628204346, - "eval_runtime": 0.2179, - "eval_samples_per_second": 197.344, - "eval_steps_per_second": 4.589, + "epoch": 14.19400855920114, + "eval_loss": 2.5913121700286865, + "eval_runtime": 0.2824, + "eval_samples_per_second": 152.262, + "eval_steps_per_second": 3.541, "step": 19900 }, { - "epoch": 2.397062364555743, - "grad_norm": 8.5625, + "epoch": 14.20114122681883, + "grad_norm": 7.71875, "learning_rate": 2.2626666666666668e-05, - "loss": 0.574, + "loss": 0.7899, "step": 19910 }, { - "epoch": 2.3982663135083073, - "grad_norm": 7.78125, + "epoch": 14.20827389443652, + "grad_norm": 11.625, "learning_rate": 2.2582222222222223e-05, - "loss": 0.5355, + "loss": 0.8058, "step": 19920 }, { - "epoch": 2.3994702624608717, - "grad_norm": 9.5, + "epoch": 14.21540656205421, + "grad_norm": 8.4375, "learning_rate": 2.253777777777778e-05, - "loss": 0.6201, + "loss": 0.7907, "step": 19930 }, { - "epoch": 2.400674211413436, - "grad_norm": 9.4375, + "epoch": 14.222539229671897, + "grad_norm": 7.3125, "learning_rate": 2.2493333333333337e-05, - "loss": 0.5864, + "loss": 0.7557, "step": 19940 }, { - "epoch": 2.4018781603660004, - "grad_norm": 7.96875, + "epoch": 14.229671897289586, + "grad_norm": 6.25, "learning_rate": 2.244888888888889e-05, - "loss": 0.5624, + "loss": 0.7859, "step": 19950 }, { - "epoch": 2.403082109318565, - "grad_norm": 8.4375, + "epoch": 14.236804564907276, + "grad_norm": 9.25, "learning_rate": 2.2404444444444444e-05, - "loss": 0.6663, + "loss": 0.734, "step": 19960 }, { - "epoch": 2.404286058271129, - "grad_norm": 7.71875, + "epoch": 14.243937232524964, + "grad_norm": 7.78125, "learning_rate": 2.236e-05, - "loss": 0.5064, + "loss": 0.7623, "step": 19970 }, { - "epoch": 2.4054900072236935, - "grad_norm": 11.3125, + "epoch": 14.251069900142653, + "grad_norm": 7.15625, "learning_rate": 2.2315555555555555e-05, - "loss": 0.5555, + "loss": 0.6912, "step": 19980 }, { - "epoch": 2.406693956176258, - "grad_norm": 7.96875, + "epoch": 14.258202567760343, + "grad_norm": 6.5, "learning_rate": 2.2271111111111113e-05, - "loss": 0.5172, + "loss": 0.7367, "step": 19990 }, { - "epoch": 2.4078979051288227, - "grad_norm": 10.25, + "epoch": 14.265335235378032, + "grad_norm": 7.375, "learning_rate": 2.222666666666667e-05, - "loss": 0.6456, + "loss": 0.7353, "step": 20000 }, { - "epoch": 2.4078979051288227, - "eval/acc": 39.53488540649414, + "epoch": 14.265335235378032, + "eval/acc": 51.16279220581055, "step": 20000 }, { - "epoch": 2.4078979051288227, - "eval_loss": 2.097860336303711, - "eval_runtime": 0.2169, - "eval_samples_per_second": 198.258, - "eval_steps_per_second": 4.611, + "epoch": 14.265335235378032, + "eval_loss": 2.5749382972717285, + "eval_runtime": 0.2259, + "eval_samples_per_second": 190.358, + "eval_steps_per_second": 4.427, "step": 20000 } ], "logging_steps": 10, "max_steps": 25000, "num_input_tokens_seen": 0, - "num_train_epochs": 4, + "num_train_epochs": 18, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/training_args.bin b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a1d680c060a6648e26d420ee8365324dd3d5fee7 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-20000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24358e453225b271a5d5308309aefca4acc94bf8e5e69d279887179f7a31b1b7 +size 6161 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/config.json b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/config.json similarity index 100% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/config.json rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/config.json diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/model.safetensors b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b98dce5180ec2d612b70665db845bd9c69293da --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bbab43b76cdbed4ee5364e337787e088ac7a5b381ebe2f680cc9ee3fbf04b17 +size 298041696 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/optimizer.pt b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0242fb53042e5a94a518245b82c050c5e6350fbd --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bff7e107bf5d1efad55fa123a28edf876fc0a79e6504a35e8436b491f3bce835 +size 596170443 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/rng_state_0.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..672cff9ce8725ede76269af2d5c1218a49590bc5 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a25b1ca1a6812b6542e9e1ab355d92720b67020d48c39e80dfa44e2613e6782 +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/rng_state_1.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..290ff36b5a81e5bbe52cf035192e44421766663b --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:062ea9ef2b29cc9252ab2bc026f7af9083d7a47fc2921720ed578a42d1a098b1 +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/rng_state_2.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..a3a4ba33f202109616def348f833a4eeb9c23f2f --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35fa19b4084f36e88e23466f3e38f8923ef64d0e637be7a81e9c16350b86e72a +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/rng_state_3.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..929ceb2abab1e52e36f2ce15aab552dbf7064596 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fcddbf722dbad693b2a4a95db5330dca78f263e2b699c99b8acc90a117bd68e +size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/scheduler.pt b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/scheduler.pt similarity index 100% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/scheduler.pt rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/scheduler.pt diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/trainer_state.json b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/trainer_state.json similarity index 58% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/trainer_state.json rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/trainer_state.json index d259f56148a3878e7001039f4657f86b1c9c8e28..e3c9bda054d9b8656b9bf8c3e2543818c385a3bf 100644 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/trainer_state.json +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/trainer_state.json @@ -2,7 +2,7 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 3.009872381411028, + "epoch": 17.83166904422254, "eval_steps": 100, "global_step": 25000, "is_hyper_param_search": false, @@ -10,20760 +10,20760 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0012039489525644113, - "grad_norm": 29.25, + "epoch": 0.007132667617689016, + "grad_norm": 19.75, "learning_rate": 3.6e-07, - "loss": 5.6475, + "loss": 5.6319, "step": 10 }, { - "epoch": 0.0024078979051288226, - "grad_norm": 13.6875, + "epoch": 0.014265335235378032, + "grad_norm": 19.375, "learning_rate": 7.6e-07, - "loss": 5.6394, + "loss": 5.5914, "step": 20 }, { - "epoch": 0.003611846857693234, - "grad_norm": 36.0, + "epoch": 0.021398002853067047, + "grad_norm": 51.25, "learning_rate": 1.16e-06, - "loss": 5.6168, + "loss": 5.6495, "step": 30 }, { - "epoch": 0.004815795810257645, - "grad_norm": 17.0, + "epoch": 0.028530670470756064, + "grad_norm": 19.0, "learning_rate": 1.56e-06, - "loss": 5.6346, + "loss": 5.6581, "step": 40 }, { - "epoch": 0.006019744762822056, - "grad_norm": 16.5, + "epoch": 0.03566333808844508, + "grad_norm": 23.75, "learning_rate": 1.96e-06, - "loss": 5.6391, + "loss": 5.6366, "step": 50 }, { - "epoch": 0.007223693715386468, - "grad_norm": 16.5, + "epoch": 0.042796005706134094, + "grad_norm": 18.0, "learning_rate": 2.36e-06, - "loss": 5.6272, + "loss": 5.6411, "step": 60 }, { - "epoch": 0.00842764266795088, - "grad_norm": 14.8125, + "epoch": 0.04992867332382311, + "grad_norm": 14.4375, "learning_rate": 2.7600000000000003e-06, - "loss": 5.5979, + "loss": 5.5919, "step": 70 }, { - "epoch": 0.00963159162051529, - "grad_norm": 22.375, + "epoch": 0.05706134094151213, + "grad_norm": 24.125, "learning_rate": 3.1600000000000007e-06, - "loss": 5.6515, + "loss": 5.6083, "step": 80 }, { - "epoch": 0.010835540573079701, - "grad_norm": 17.125, + "epoch": 0.06419400855920114, + "grad_norm": 18.25, "learning_rate": 3.5600000000000002e-06, - "loss": 5.6018, + "loss": 5.6599, "step": 90 }, { - "epoch": 0.012039489525644112, - "grad_norm": 14.9375, + "epoch": 0.07132667617689016, + "grad_norm": 18.25, "learning_rate": 3.96e-06, - "loss": 5.6342, + "loss": 5.6652, "step": 100 }, { - "epoch": 0.012039489525644112, - "eval/acc": 3.4883720874786377, + "epoch": 0.07132667617689016, + "eval/acc": 2.3255813121795654, "step": 100 }, { - "epoch": 0.012039489525644112, - "eval_loss": 5.140806198120117, - "eval_runtime": 2.4165, - "eval_samples_per_second": 17.794, - "eval_steps_per_second": 0.414, + "epoch": 0.07132667617689016, + "eval_loss": 5.090479850769043, + "eval_runtime": 2.3284, + "eval_samples_per_second": 18.467, + "eval_steps_per_second": 0.429, "step": 100 }, { - "epoch": 0.013243438478208525, - "grad_norm": 13.0, + "epoch": 0.07845934379457917, + "grad_norm": 21.0, "learning_rate": 4.360000000000001e-06, - "loss": 5.6124, + "loss": 5.6402, "step": 110 }, { - "epoch": 0.014447387430772935, - "grad_norm": 18.625, + "epoch": 0.08559201141226819, + "grad_norm": 16.875, "learning_rate": 4.76e-06, - "loss": 5.6127, + "loss": 5.6535, "step": 120 }, { - "epoch": 0.015651336383337346, - "grad_norm": 14.375, + "epoch": 0.09272467902995721, + "grad_norm": 21.5, "learning_rate": 5.1600000000000006e-06, - "loss": 5.5663, + "loss": 5.5821, "step": 130 }, { - "epoch": 0.01685528533590176, - "grad_norm": 11.9375, + "epoch": 0.09985734664764621, + "grad_norm": 18.5, "learning_rate": 5.56e-06, - "loss": 5.55, + "loss": 5.6184, "step": 140 }, { - "epoch": 0.018059234288466168, - "grad_norm": 14.5, + "epoch": 0.10699001426533523, + "grad_norm": 14.9375, "learning_rate": 5.9600000000000005e-06, - "loss": 5.5839, + "loss": 5.5743, "step": 150 }, { - "epoch": 0.01926318324103058, - "grad_norm": 15.0625, + "epoch": 0.11412268188302425, + "grad_norm": 16.875, "learning_rate": 6.360000000000001e-06, - "loss": 5.5259, + "loss": 5.5684, "step": 160 }, { - "epoch": 0.020467132193594993, - "grad_norm": 14.8125, + "epoch": 0.12125534950071326, + "grad_norm": 22.125, "learning_rate": 6.76e-06, - "loss": 5.4812, + "loss": 5.535, "step": 170 }, { - "epoch": 0.021671081146159402, - "grad_norm": 15.375, + "epoch": 0.12838801711840228, + "grad_norm": 15.9375, "learning_rate": 7.16e-06, - "loss": 5.4964, + "loss": 5.4357, "step": 180 }, { - "epoch": 0.022875030098723815, - "grad_norm": 14.0625, + "epoch": 0.1355206847360913, + "grad_norm": 16.375, "learning_rate": 7.5600000000000005e-06, - "loss": 5.4023, + "loss": 5.3766, "step": 190 }, { - "epoch": 0.024078979051288224, - "grad_norm": 18.625, + "epoch": 0.14265335235378032, + "grad_norm": 15.3125, "learning_rate": 7.96e-06, - "loss": 5.3778, + "loss": 5.4437, "step": 200 }, { - "epoch": 0.024078979051288224, - "eval/acc": 5.232558250427246, + "epoch": 0.14265335235378032, + "eval/acc": 2.3255813121795654, "step": 200 }, { - "epoch": 0.024078979051288224, - "eval_loss": 4.991551399230957, - "eval_runtime": 0.2363, - "eval_samples_per_second": 181.988, - "eval_steps_per_second": 4.232, + "epoch": 0.14265335235378032, + "eval_loss": 4.956757545471191, + "eval_runtime": 0.2941, + "eval_samples_per_second": 146.185, + "eval_steps_per_second": 3.4, "step": 200 }, { - "epoch": 0.025282928003852637, - "grad_norm": 16.25, + "epoch": 0.14978601997146934, + "grad_norm": 16.75, "learning_rate": 8.36e-06, - "loss": 5.3983, + "loss": 5.4744, "step": 210 }, { - "epoch": 0.02648687695641705, - "grad_norm": 17.25, + "epoch": 0.15691868758915833, + "grad_norm": 43.25, "learning_rate": 8.76e-06, - "loss": 5.2953, + "loss": 5.381, "step": 220 }, { - "epoch": 0.02769082590898146, - "grad_norm": 15.9375, + "epoch": 0.16405135520684735, + "grad_norm": 21.0, "learning_rate": 9.16e-06, - "loss": 5.2266, + "loss": 5.3092, "step": 230 }, { - "epoch": 0.02889477486154587, - "grad_norm": 21.875, + "epoch": 0.17118402282453637, + "grad_norm": 26.75, "learning_rate": 9.560000000000002e-06, - "loss": 5.139, + "loss": 5.2752, "step": 240 }, { - "epoch": 0.03009872381411028, - "grad_norm": 17.875, + "epoch": 0.1783166904422254, + "grad_norm": 26.875, "learning_rate": 9.96e-06, - "loss": 5.0639, + "loss": 5.2194, "step": 250 }, { - "epoch": 0.03130267276667469, - "grad_norm": 18.875, + "epoch": 0.18544935805991442, + "grad_norm": 20.875, "learning_rate": 1.036e-05, - "loss": 5.0118, + "loss": 5.0657, "step": 260 }, { - "epoch": 0.032506621719239105, - "grad_norm": 26.0, + "epoch": 0.19258202567760344, + "grad_norm": 25.125, "learning_rate": 1.076e-05, - "loss": 4.8959, + "loss": 4.967, "step": 270 }, { - "epoch": 0.03371057067180352, - "grad_norm": 18.5, + "epoch": 0.19971469329529243, + "grad_norm": 30.125, "learning_rate": 1.1160000000000002e-05, - "loss": 4.8454, + "loss": 4.9544, "step": 280 }, { - "epoch": 0.03491451962436792, - "grad_norm": 28.0, + "epoch": 0.20684736091298145, + "grad_norm": 24.625, "learning_rate": 1.156e-05, - "loss": 4.6846, + "loss": 4.7585, "step": 290 }, { - "epoch": 0.036118468576932336, - "grad_norm": 25.5, + "epoch": 0.21398002853067047, + "grad_norm": 21.375, "learning_rate": 1.196e-05, - "loss": 4.5211, + "loss": 4.635, "step": 300 }, { - "epoch": 0.036118468576932336, - "eval/acc": 6.395349025726318, + "epoch": 0.21398002853067047, + "eval/acc": 9.302325248718262, "step": 300 }, { - "epoch": 0.036118468576932336, - "eval_loss": 4.604515075683594, - "eval_runtime": 0.2156, - "eval_samples_per_second": 199.428, - "eval_steps_per_second": 4.638, + "epoch": 0.21398002853067047, + "eval_loss": 4.364280700683594, + "eval_runtime": 0.2145, + "eval_samples_per_second": 200.42, + "eval_steps_per_second": 4.661, "step": 300 }, { - "epoch": 0.03732241752949675, - "grad_norm": 28.0, + "epoch": 0.2211126961483595, + "grad_norm": 30.125, "learning_rate": 1.236e-05, - "loss": 4.3466, + "loss": 4.5333, "step": 310 }, { - "epoch": 0.03852636648206116, - "grad_norm": 27.125, + "epoch": 0.2282453637660485, + "grad_norm": 28.125, "learning_rate": 1.276e-05, - "loss": 4.1005, + "loss": 4.2888, "step": 320 }, { - "epoch": 0.039730315434625574, - "grad_norm": 33.0, + "epoch": 0.23537803138373753, + "grad_norm": 30.5, "learning_rate": 1.316e-05, - "loss": 3.7904, + "loss": 4.1744, "step": 330 }, { - "epoch": 0.040934264387189986, - "grad_norm": 32.75, + "epoch": 0.24251069900142652, + "grad_norm": 35.0, "learning_rate": 1.356e-05, - "loss": 3.4061, + "loss": 3.8812, "step": 340 }, { - "epoch": 0.04213821333975439, - "grad_norm": 31.125, + "epoch": 0.24964336661911554, + "grad_norm": 30.75, "learning_rate": 1.396e-05, - "loss": 3.2838, + "loss": 3.6772, "step": 350 }, { - "epoch": 0.043342162292318805, - "grad_norm": 23.75, + "epoch": 0.25677603423680456, + "grad_norm": 25.875, "learning_rate": 1.4360000000000001e-05, - "loss": 2.9101, + "loss": 3.3797, "step": 360 }, { - "epoch": 0.04454611124488322, - "grad_norm": 44.75, + "epoch": 0.26390870185449355, + "grad_norm": 31.375, "learning_rate": 1.4760000000000001e-05, - "loss": 2.6306, + "loss": 3.2338, "step": 370 }, { - "epoch": 0.04575006019744763, - "grad_norm": 33.25, + "epoch": 0.2710413694721826, + "grad_norm": 72.0, "learning_rate": 1.5160000000000002e-05, - "loss": 2.5454, + "loss": 2.976, "step": 380 }, { - "epoch": 0.04695400915001204, - "grad_norm": 31.375, + "epoch": 0.2781740370898716, + "grad_norm": 22.375, "learning_rate": 1.556e-05, - "loss": 2.5867, + "loss": 2.8207, "step": 390 }, { - "epoch": 0.04815795810257645, - "grad_norm": 18.5, + "epoch": 0.28530670470756064, + "grad_norm": 21.25, "learning_rate": 1.596e-05, - "loss": 2.3251, + "loss": 2.8341, "step": 400 }, { - "epoch": 0.04815795810257645, - "eval/acc": 12.209301948547363, + "epoch": 0.28530670470756064, + "eval/acc": 9.302325248718262, "step": 400 }, { - "epoch": 0.04815795810257645, - "eval_loss": 3.941906452178955, - "eval_runtime": 0.2265, - "eval_samples_per_second": 189.814, - "eval_steps_per_second": 4.414, + "epoch": 0.28530670470756064, + "eval_loss": 3.6011083126068115, + "eval_runtime": 0.2115, + "eval_samples_per_second": 203.312, + "eval_steps_per_second": 4.728, "step": 400 }, { - "epoch": 0.04936190705514086, - "grad_norm": 18.0, + "epoch": 0.29243937232524964, + "grad_norm": 21.0, "learning_rate": 1.636e-05, - "loss": 2.394, + "loss": 2.6431, "step": 410 }, { - "epoch": 0.05056585600770527, - "grad_norm": 22.375, + "epoch": 0.2995720399429387, + "grad_norm": 20.875, "learning_rate": 1.6760000000000002e-05, - "loss": 2.2856, + "loss": 2.6506, "step": 420 }, { - "epoch": 0.051769804960269686, - "grad_norm": 17.25, + "epoch": 0.3067047075606277, + "grad_norm": 21.125, "learning_rate": 1.7160000000000002e-05, - "loss": 2.3414, + "loss": 2.491, "step": 430 }, { - "epoch": 0.0529737539128341, - "grad_norm": 15.25, + "epoch": 0.31383737517831667, + "grad_norm": 31.75, "learning_rate": 1.756e-05, - "loss": 2.156, + "loss": 2.423, "step": 440 }, { - "epoch": 0.054177702865398504, - "grad_norm": 15.75, + "epoch": 0.3209700427960057, + "grad_norm": 19.375, "learning_rate": 1.796e-05, - "loss": 2.0164, + "loss": 2.5108, "step": 450 }, { - "epoch": 0.05538165181796292, - "grad_norm": 28.5, + "epoch": 0.3281027104136947, + "grad_norm": 17.375, "learning_rate": 1.8360000000000004e-05, - "loss": 1.9555, + "loss": 2.4584, "step": 460 }, { - "epoch": 0.05658560077052733, - "grad_norm": 19.25, + "epoch": 0.33523537803138376, + "grad_norm": 22.625, "learning_rate": 1.876e-05, - "loss": 2.0277, + "loss": 2.3526, "step": 470 }, { - "epoch": 0.05778954972309174, - "grad_norm": 15.375, + "epoch": 0.34236804564907275, + "grad_norm": 30.25, "learning_rate": 1.916e-05, - "loss": 2.1719, + "loss": 2.3634, "step": 480 }, { - "epoch": 0.058993498675656154, - "grad_norm": 18.875, + "epoch": 0.34950071326676174, + "grad_norm": 15.5625, "learning_rate": 1.956e-05, - "loss": 2.013, + "loss": 2.3339, "step": 490 }, { - "epoch": 0.06019744762822056, - "grad_norm": 18.625, + "epoch": 0.3566333808844508, + "grad_norm": 19.5, "learning_rate": 1.9960000000000002e-05, - "loss": 1.8574, + "loss": 2.268, "step": 500 }, { - "epoch": 0.06019744762822056, - "eval/acc": 20.930233001708984, + "epoch": 0.3566333808844508, + "eval/acc": 16.279069900512695, "step": 500 }, { - "epoch": 0.06019744762822056, - "eval_loss": 3.6547293663024902, - "eval_runtime": 0.2139, - "eval_samples_per_second": 201.002, - "eval_steps_per_second": 4.674, + "epoch": 0.3566333808844508, + "eval_loss": 3.2489778995513916, + "eval_runtime": 0.2558, + "eval_samples_per_second": 168.115, + "eval_steps_per_second": 3.91, "step": 500 }, { - "epoch": 0.06140139658078497, - "grad_norm": 19.875, + "epoch": 0.3637660485021398, + "grad_norm": 29.375, "learning_rate": 2.036e-05, - "loss": 1.9431, + "loss": 2.2728, "step": 510 }, { - "epoch": 0.06260534553334939, - "grad_norm": 14.625, + "epoch": 0.37089871611982883, + "grad_norm": 21.25, "learning_rate": 2.076e-05, - "loss": 1.8311, + "loss": 2.1346, "step": 520 }, { - "epoch": 0.0638092944859138, - "grad_norm": 20.0, + "epoch": 0.3780313837375178, + "grad_norm": 14.8125, "learning_rate": 2.116e-05, - "loss": 2.0005, + "loss": 2.2719, "step": 530 }, { - "epoch": 0.06501324343847821, - "grad_norm": 16.0, + "epoch": 0.38516405135520687, + "grad_norm": 27.75, "learning_rate": 2.1560000000000004e-05, - "loss": 1.7374, + "loss": 2.145, "step": 540 }, { - "epoch": 0.06621719239104262, - "grad_norm": 13.0625, + "epoch": 0.39229671897289586, + "grad_norm": 16.125, "learning_rate": 2.196e-05, - "loss": 1.7838, + "loss": 2.0912, "step": 550 }, { - "epoch": 0.06742114134360704, - "grad_norm": 16.5, + "epoch": 0.39942938659058486, + "grad_norm": 20.25, "learning_rate": 2.236e-05, - "loss": 1.8264, + "loss": 2.0302, "step": 560 }, { - "epoch": 0.06862509029617145, - "grad_norm": 20.5, + "epoch": 0.4065620542082739, + "grad_norm": 17.75, "learning_rate": 2.2760000000000002e-05, - "loss": 1.658, + "loss": 2.1832, "step": 570 }, { - "epoch": 0.06982903924873585, - "grad_norm": 25.75, + "epoch": 0.4136947218259629, + "grad_norm": 14.5, "learning_rate": 2.3160000000000002e-05, - "loss": 1.7826, + "loss": 1.9652, "step": 580 }, { - "epoch": 0.07103298820130026, - "grad_norm": 19.375, + "epoch": 0.42082738944365194, + "grad_norm": 17.0, "learning_rate": 2.356e-05, - "loss": 1.6539, + "loss": 1.8911, "step": 590 }, { - "epoch": 0.07223693715386467, - "grad_norm": 19.25, + "epoch": 0.42796005706134094, + "grad_norm": 20.0, "learning_rate": 2.396e-05, - "loss": 1.6278, + "loss": 2.0266, "step": 600 }, { - "epoch": 0.07223693715386467, - "eval/acc": 20.930233001708984, + "epoch": 0.42796005706134094, + "eval/acc": 30.23255729675293, "step": 600 }, { - "epoch": 0.07223693715386467, - "eval_loss": 3.387899398803711, - "eval_runtime": 0.2536, - "eval_samples_per_second": 169.572, - "eval_steps_per_second": 3.944, + "epoch": 0.42796005706134094, + "eval_loss": 2.946028470993042, + "eval_runtime": 0.2139, + "eval_samples_per_second": 201.065, + "eval_steps_per_second": 4.676, "step": 600 }, { - "epoch": 0.07344088610642908, - "grad_norm": 12.0625, + "epoch": 0.43509272467902993, + "grad_norm": 25.5, "learning_rate": 2.4360000000000004e-05, - "loss": 1.5342, + "loss": 1.9116, "step": 610 }, { - "epoch": 0.0746448350589935, - "grad_norm": 15.625, + "epoch": 0.442225392296719, + "grad_norm": 25.375, "learning_rate": 2.476e-05, - "loss": 1.5919, + "loss": 1.7644, "step": 620 }, { - "epoch": 0.07584878401155791, - "grad_norm": 25.5, + "epoch": 0.44935805991440797, + "grad_norm": 15.5, "learning_rate": 2.516e-05, - "loss": 1.5713, + "loss": 1.9008, "step": 630 }, { - "epoch": 0.07705273296412232, - "grad_norm": 14.8125, + "epoch": 0.456490727532097, + "grad_norm": 16.875, "learning_rate": 2.556e-05, - "loss": 1.4714, + "loss": 1.619, "step": 640 }, { - "epoch": 0.07825668191668674, - "grad_norm": 21.5, + "epoch": 0.463623395149786, + "grad_norm": 37.25, "learning_rate": 2.5960000000000002e-05, - "loss": 1.5835, + "loss": 1.7725, "step": 650 }, { - "epoch": 0.07946063086925115, - "grad_norm": 58.0, + "epoch": 0.47075606276747506, + "grad_norm": 16.5, "learning_rate": 2.6360000000000002e-05, - "loss": 1.5369, + "loss": 1.7405, "step": 660 }, { - "epoch": 0.08066457982181556, - "grad_norm": 45.0, + "epoch": 0.47788873038516405, + "grad_norm": 16.25, "learning_rate": 2.676e-05, - "loss": 1.4629, + "loss": 1.5825, "step": 670 }, { - "epoch": 0.08186852877437997, - "grad_norm": 14.1875, + "epoch": 0.48502139800285304, + "grad_norm": 68.5, "learning_rate": 2.716e-05, - "loss": 1.4288, + "loss": 1.8379, "step": 680 }, { - "epoch": 0.08307247772694437, - "grad_norm": 40.25, + "epoch": 0.4921540656205421, + "grad_norm": 50.0, "learning_rate": 2.7560000000000004e-05, - "loss": 1.4729, + "loss": 1.7989, "step": 690 }, { - "epoch": 0.08427642667950878, - "grad_norm": 13.625, + "epoch": 0.4992867332382311, + "grad_norm": 16.25, "learning_rate": 2.7960000000000003e-05, - "loss": 1.4883, + "loss": 1.7058, "step": 700 }, { - "epoch": 0.08427642667950878, - "eval/acc": 23.255813598632812, + "epoch": 0.4992867332382311, + "eval/acc": 30.23255729675293, "step": 700 }, { - "epoch": 0.08427642667950878, - "eval_loss": 3.206946611404419, - "eval_runtime": 0.4188, - "eval_samples_per_second": 102.684, - "eval_steps_per_second": 2.388, + "epoch": 0.4992867332382311, + "eval_loss": 2.8809142112731934, + "eval_runtime": 0.2095, + "eval_samples_per_second": 205.208, + "eval_steps_per_second": 4.772, "step": 700 }, { - "epoch": 0.0854803756320732, - "grad_norm": 15.75, + "epoch": 0.5064194008559201, + "grad_norm": 14.625, "learning_rate": 2.8360000000000003e-05, - "loss": 1.5656, + "loss": 1.6542, "step": 710 }, { - "epoch": 0.08668432458463761, - "grad_norm": 22.25, + "epoch": 0.5135520684736091, + "grad_norm": 71.0, "learning_rate": 2.8760000000000002e-05, - "loss": 1.6742, + "loss": 1.6763, "step": 720 }, { - "epoch": 0.08788827353720202, - "grad_norm": 12.3125, + "epoch": 0.5206847360912982, + "grad_norm": 17.125, "learning_rate": 2.9160000000000005e-05, - "loss": 1.35, + "loss": 1.6858, "step": 730 }, { - "epoch": 0.08909222248976643, - "grad_norm": 13.8125, + "epoch": 0.5278174037089871, + "grad_norm": 19.75, "learning_rate": 2.9559999999999998e-05, - "loss": 1.4435, + "loss": 1.6718, "step": 740 }, { - "epoch": 0.09029617144233085, - "grad_norm": 13.1875, + "epoch": 0.5349500713266762, + "grad_norm": 13.375, "learning_rate": 2.9959999999999998e-05, - "loss": 1.3843, + "loss": 1.6164, "step": 750 }, { - "epoch": 0.09150012039489526, - "grad_norm": 13.3125, + "epoch": 0.5420827389443652, + "grad_norm": 14.1875, "learning_rate": 3.036e-05, - "loss": 1.3327, + "loss": 1.6049, "step": 760 }, { - "epoch": 0.09270406934745967, - "grad_norm": 18.875, + "epoch": 0.5492154065620543, + "grad_norm": 35.75, "learning_rate": 3.076e-05, - "loss": 1.4628, + "loss": 1.5453, "step": 770 }, { - "epoch": 0.09390801830002408, - "grad_norm": 14.5625, + "epoch": 0.5563480741797432, + "grad_norm": 28.75, "learning_rate": 3.116e-05, - "loss": 1.3306, + "loss": 1.4818, "step": 780 }, { - "epoch": 0.09511196725258848, - "grad_norm": 18.75, + "epoch": 0.5634807417974322, + "grad_norm": 17.375, "learning_rate": 3.156e-05, - "loss": 1.4936, + "loss": 1.5647, "step": 790 }, { - "epoch": 0.0963159162051529, - "grad_norm": 11.5, + "epoch": 0.5706134094151213, + "grad_norm": 13.6875, "learning_rate": 3.196e-05, - "loss": 1.3515, + "loss": 1.5206, "step": 800 }, { - "epoch": 0.0963159162051529, - "eval/acc": 22.674419403076172, + "epoch": 0.5706134094151213, + "eval/acc": 30.23255729675293, "step": 800 }, { - "epoch": 0.0963159162051529, - "eval_loss": 3.1510462760925293, - "eval_runtime": 0.2676, - "eval_samples_per_second": 160.701, - "eval_steps_per_second": 3.737, + "epoch": 0.5706134094151213, + "eval_loss": 2.8829538822174072, + "eval_runtime": 0.2122, + "eval_samples_per_second": 202.686, + "eval_steps_per_second": 4.714, "step": 800 }, { - "epoch": 0.09751986515771731, - "grad_norm": 11.6875, + "epoch": 0.5777460770328102, + "grad_norm": 17.125, "learning_rate": 3.236e-05, - "loss": 1.4593, + "loss": 1.6124, "step": 810 }, { - "epoch": 0.09872381411028172, - "grad_norm": 10.5625, + "epoch": 0.5848787446504993, + "grad_norm": 14.8125, "learning_rate": 3.2760000000000005e-05, - "loss": 1.3453, + "loss": 1.4254, "step": 820 }, { - "epoch": 0.09992776306284613, - "grad_norm": 11.625, + "epoch": 0.5920114122681883, + "grad_norm": 15.0, "learning_rate": 3.316e-05, - "loss": 1.4041, + "loss": 1.7124, "step": 830 }, { - "epoch": 0.10113171201541055, - "grad_norm": 13.0, + "epoch": 0.5991440798858774, + "grad_norm": 14.75, "learning_rate": 3.3560000000000004e-05, - "loss": 1.2766, + "loss": 1.5384, "step": 840 }, { - "epoch": 0.10233566096797496, - "grad_norm": 40.0, + "epoch": 0.6062767475035663, + "grad_norm": 31.5, "learning_rate": 3.396e-05, - "loss": 1.2678, + "loss": 1.4899, "step": 850 }, { - "epoch": 0.10353960992053937, - "grad_norm": 13.75, + "epoch": 0.6134094151212554, + "grad_norm": 13.875, "learning_rate": 3.436e-05, - "loss": 1.2514, + "loss": 1.5377, "step": 860 }, { - "epoch": 0.10474355887310378, - "grad_norm": 11.75, + "epoch": 0.6205420827389444, + "grad_norm": 14.9375, "learning_rate": 3.4760000000000006e-05, - "loss": 1.3518, + "loss": 1.4892, "step": 870 }, { - "epoch": 0.1059475078256682, - "grad_norm": 11.875, + "epoch": 0.6276747503566333, + "grad_norm": 37.25, "learning_rate": 3.516e-05, - "loss": 1.2675, + "loss": 1.4872, "step": 880 }, { - "epoch": 0.10715145677823261, - "grad_norm": 13.0, + "epoch": 0.6348074179743224, + "grad_norm": 18.875, "learning_rate": 3.5560000000000005e-05, - "loss": 1.294, + "loss": 1.536, "step": 890 }, { - "epoch": 0.10835540573079701, - "grad_norm": 13.0, + "epoch": 0.6419400855920114, + "grad_norm": 18.625, "learning_rate": 3.596e-05, - "loss": 1.1209, + "loss": 1.5208, "step": 900 }, { - "epoch": 0.10835540573079701, - "eval/acc": 25.581396102905273, + "epoch": 0.6419400855920114, + "eval/acc": 30.23255729675293, "step": 900 }, { - "epoch": 0.10835540573079701, - "eval_loss": 3.0571491718292236, - "eval_runtime": 0.3097, - "eval_samples_per_second": 138.846, - "eval_steps_per_second": 3.229, + "epoch": 0.6419400855920114, + "eval_loss": 2.8081133365631104, + "eval_runtime": 0.2319, + "eval_samples_per_second": 185.434, + "eval_steps_per_second": 4.312, "step": 900 }, { - "epoch": 0.10955935468336142, - "grad_norm": 12.75, + "epoch": 0.6490727532097005, + "grad_norm": 19.875, "learning_rate": 3.636e-05, - "loss": 1.2681, + "loss": 1.4606, "step": 910 }, { - "epoch": 0.11076330363592583, - "grad_norm": 17.0, + "epoch": 0.6562054208273894, + "grad_norm": 12.625, "learning_rate": 3.676e-05, - "loss": 1.2606, + "loss": 1.4728, "step": 920 }, { - "epoch": 0.11196725258849025, - "grad_norm": 11.375, + "epoch": 0.6633380884450785, + "grad_norm": 15.0, "learning_rate": 3.716e-05, - "loss": 1.2194, + "loss": 1.449, "step": 930 }, { - "epoch": 0.11317120154105466, - "grad_norm": 12.125, + "epoch": 0.6704707560627675, + "grad_norm": 19.0, "learning_rate": 3.756e-05, - "loss": 1.2905, + "loss": 1.5292, "step": 940 }, { - "epoch": 0.11437515049361907, - "grad_norm": 18.125, + "epoch": 0.6776034236804565, + "grad_norm": 111.5, "learning_rate": 3.796e-05, - "loss": 1.2563, + "loss": 1.4891, "step": 950 }, { - "epoch": 0.11557909944618348, - "grad_norm": 17.125, + "epoch": 0.6847360912981455, + "grad_norm": 14.75, "learning_rate": 3.836e-05, - "loss": 1.1894, + "loss": 1.4202, "step": 960 }, { - "epoch": 0.1167830483987479, - "grad_norm": 11.875, + "epoch": 0.6918687589158345, + "grad_norm": 20.25, "learning_rate": 3.876e-05, - "loss": 1.2441, + "loss": 1.5258, "step": 970 }, { - "epoch": 0.11798699735131231, - "grad_norm": 15.8125, + "epoch": 0.6990014265335235, + "grad_norm": 48.0, "learning_rate": 3.9160000000000005e-05, - "loss": 1.2627, + "loss": 1.3912, "step": 980 }, { - "epoch": 0.11919094630387672, - "grad_norm": 17.375, + "epoch": 0.7061340941512125, + "grad_norm": 13.0, "learning_rate": 3.956e-05, - "loss": 1.3929, + "loss": 1.4859, "step": 990 }, { - "epoch": 0.12039489525644112, - "grad_norm": 11.125, + "epoch": 0.7132667617689016, + "grad_norm": 15.5625, "learning_rate": 3.9960000000000004e-05, - "loss": 1.1332, + "loss": 1.4614, "step": 1000 }, { - "epoch": 0.12039489525644112, - "eval/acc": 26.162790298461914, + "epoch": 0.7132667617689016, + "eval/acc": 37.20930099487305, "step": 1000 }, { - "epoch": 0.12039489525644112, - "eval_loss": 2.9910976886749268, - "eval_runtime": 0.2826, - "eval_samples_per_second": 152.17, - "eval_steps_per_second": 3.539, + "epoch": 0.7132667617689016, + "eval_loss": 2.743621587753296, + "eval_runtime": 0.5811, + "eval_samples_per_second": 73.997, + "eval_steps_per_second": 1.721, "step": 1000 }, { - "epoch": 0.12159884420900553, - "grad_norm": 13.75, + "epoch": 0.7203994293865906, + "grad_norm": 16.625, "learning_rate": 4.0360000000000007e-05, - "loss": 1.2314, + "loss": 1.56, "step": 1010 }, { - "epoch": 0.12280279316156995, - "grad_norm": 11.875, + "epoch": 0.7275320970042796, + "grad_norm": 15.5625, "learning_rate": 4.076e-05, - "loss": 1.2654, + "loss": 1.4469, "step": 1020 }, { - "epoch": 0.12400674211413436, - "grad_norm": 12.8125, + "epoch": 0.7346647646219686, + "grad_norm": 15.0, "learning_rate": 4.1160000000000006e-05, - "loss": 1.1432, + "loss": 1.381, "step": 1030 }, { - "epoch": 0.12521069106669877, - "grad_norm": 13.9375, + "epoch": 0.7417974322396577, + "grad_norm": 13.625, "learning_rate": 4.156e-05, - "loss": 1.1669, + "loss": 1.3749, "step": 1040 }, { - "epoch": 0.1264146400192632, - "grad_norm": 19.25, + "epoch": 0.7489300998573466, + "grad_norm": 12.0625, "learning_rate": 4.196e-05, - "loss": 1.1836, + "loss": 1.3919, "step": 1050 }, { - "epoch": 0.1276185889718276, - "grad_norm": 11.375, + "epoch": 0.7560627674750356, + "grad_norm": 16.25, "learning_rate": 4.236e-05, - "loss": 1.2449, + "loss": 1.4208, "step": 1060 }, { - "epoch": 0.128822537924392, - "grad_norm": 10.6875, + "epoch": 0.7631954350927247, + "grad_norm": 27.75, "learning_rate": 4.276e-05, - "loss": 1.1361, + "loss": 1.3714, "step": 1070 }, { - "epoch": 0.13002648687695642, - "grad_norm": 11.5, + "epoch": 0.7703281027104137, + "grad_norm": 13.125, "learning_rate": 4.316e-05, - "loss": 1.1989, + "loss": 1.3344, "step": 1080 }, { - "epoch": 0.13123043582952082, - "grad_norm": 13.0, + "epoch": 0.7774607703281027, + "grad_norm": 11.9375, "learning_rate": 4.356e-05, - "loss": 1.1004, + "loss": 1.3291, "step": 1090 }, { - "epoch": 0.13243438478208525, - "grad_norm": 10.125, + "epoch": 0.7845934379457917, + "grad_norm": 17.125, "learning_rate": 4.396e-05, - "loss": 1.1308, + "loss": 1.3536, "step": 1100 }, { - "epoch": 0.13243438478208525, + "epoch": 0.7845934379457917, "eval/acc": 27.9069766998291, "step": 1100 }, { - "epoch": 0.13243438478208525, - "eval_loss": 3.0177316665649414, - "eval_runtime": 0.2801, - "eval_samples_per_second": 153.54, - "eval_steps_per_second": 3.571, + "epoch": 0.7845934379457917, + "eval_loss": 2.8128697872161865, + "eval_runtime": 0.484, + "eval_samples_per_second": 88.836, + "eval_steps_per_second": 2.066, "step": 1100 }, { - "epoch": 0.13363833373464964, - "grad_norm": 9.5, + "epoch": 0.7917261055634808, + "grad_norm": 13.3125, "learning_rate": 4.436e-05, - "loss": 1.1862, + "loss": 1.4598, "step": 1110 }, { - "epoch": 0.13484228268721407, - "grad_norm": 13.75, + "epoch": 0.7988587731811697, + "grad_norm": 15.25, "learning_rate": 4.4760000000000005e-05, - "loss": 1.1764, + "loss": 1.3795, "step": 1120 }, { - "epoch": 0.13604623163977847, - "grad_norm": 30.625, + "epoch": 0.8059914407988588, + "grad_norm": 12.0625, "learning_rate": 4.516e-05, - "loss": 1.0422, + "loss": 1.2518, "step": 1130 }, { - "epoch": 0.1372501805923429, - "grad_norm": 9.875, + "epoch": 0.8131241084165478, + "grad_norm": 16.625, "learning_rate": 4.5560000000000004e-05, - "loss": 1.1796, + "loss": 1.3104, "step": 1140 }, { - "epoch": 0.1384541295449073, - "grad_norm": 13.1875, + "epoch": 0.8202567760342369, + "grad_norm": 11.875, "learning_rate": 4.596e-05, - "loss": 1.0483, + "loss": 1.2996, "step": 1150 }, { - "epoch": 0.1396580784974717, - "grad_norm": 11.75, + "epoch": 0.8273894436519258, + "grad_norm": 24.125, "learning_rate": 4.636e-05, - "loss": 1.1647, + "loss": 1.2067, "step": 1160 }, { - "epoch": 0.14086202745003612, - "grad_norm": 13.375, + "epoch": 0.8345221112696148, + "grad_norm": 11.0, "learning_rate": 4.6760000000000006e-05, - "loss": 1.2839, + "loss": 1.3035, "step": 1170 }, { - "epoch": 0.14206597640260052, - "grad_norm": 42.0, + "epoch": 0.8416547788873039, + "grad_norm": 13.125, "learning_rate": 4.716e-05, - "loss": 1.1594, + "loss": 1.2859, "step": 1180 }, { - "epoch": 0.14326992535516495, - "grad_norm": 15.625, + "epoch": 0.8487874465049928, + "grad_norm": 11.0, "learning_rate": 4.7560000000000005e-05, - "loss": 1.1073, + "loss": 1.3982, "step": 1190 }, { - "epoch": 0.14447387430772934, - "grad_norm": 11.5, + "epoch": 0.8559201141226819, + "grad_norm": 12.875, "learning_rate": 4.796e-05, - "loss": 1.1593, + "loss": 1.299, "step": 1200 }, { - "epoch": 0.14447387430772934, - "eval/acc": 26.162790298461914, + "epoch": 0.8559201141226819, + "eval/acc": 34.88372039794922, "step": 1200 }, { - "epoch": 0.14447387430772934, - "eval_loss": 3.0329606533050537, - "eval_runtime": 0.2185, - "eval_samples_per_second": 196.829, - "eval_steps_per_second": 4.577, + "epoch": 0.8559201141226819, + "eval_loss": 2.7250428199768066, + "eval_runtime": 0.3522, + "eval_samples_per_second": 122.084, + "eval_steps_per_second": 2.839, "step": 1200 }, { - "epoch": 0.14567782326029377, - "grad_norm": 12.5625, + "epoch": 0.8630527817403709, + "grad_norm": 11.25, "learning_rate": 4.836e-05, - "loss": 1.1088, + "loss": 1.3549, "step": 1210 }, { - "epoch": 0.14688177221285817, - "grad_norm": 10.4375, + "epoch": 0.8701854493580599, + "grad_norm": 15.25, "learning_rate": 4.876e-05, - "loss": 1.1565, + "loss": 1.3649, "step": 1220 }, { - "epoch": 0.1480857211654226, - "grad_norm": 11.3125, + "epoch": 0.8773181169757489, + "grad_norm": 22.0, "learning_rate": 4.9160000000000004e-05, - "loss": 1.0596, + "loss": 1.2441, "step": 1230 }, { - "epoch": 0.149289670117987, - "grad_norm": 11.375, + "epoch": 0.884450784593438, + "grad_norm": 12.375, "learning_rate": 4.956e-05, - "loss": 1.2416, + "loss": 1.2196, "step": 1240 }, { - "epoch": 0.15049361907055142, - "grad_norm": 10.3125, + "epoch": 0.891583452211127, + "grad_norm": 14.25, "learning_rate": 4.996e-05, - "loss": 1.0492, + "loss": 1.3274, "step": 1250 }, { - "epoch": 0.15169756802311582, - "grad_norm": 10.9375, + "epoch": 0.8987161198288159, + "grad_norm": 10.0625, "learning_rate": 5.0360000000000006e-05, - "loss": 1.0263, + "loss": 1.2896, "step": 1260 }, { - "epoch": 0.15290151697568022, - "grad_norm": 11.0625, + "epoch": 0.905848787446505, + "grad_norm": 16.875, "learning_rate": 5.076000000000001e-05, - "loss": 1.1197, + "loss": 1.3019, "step": 1270 }, { - "epoch": 0.15410546592824464, - "grad_norm": 33.25, + "epoch": 0.912981455064194, + "grad_norm": 26.375, "learning_rate": 5.1160000000000005e-05, - "loss": 1.0614, + "loss": 1.3756, "step": 1280 }, { - "epoch": 0.15530941488080904, - "grad_norm": 11.3125, + "epoch": 0.920114122681883, + "grad_norm": 18.25, "learning_rate": 5.1559999999999994e-05, - "loss": 1.0948, + "loss": 1.327, "step": 1290 }, { - "epoch": 0.15651336383337347, - "grad_norm": 24.5, + "epoch": 0.927246790299572, + "grad_norm": 11.3125, "learning_rate": 5.196e-05, - "loss": 1.1113, + "loss": 1.3237, "step": 1300 }, { - "epoch": 0.15651336383337347, - "eval/acc": 25.581396102905273, + "epoch": 0.927246790299572, + "eval/acc": 39.53488540649414, "step": 1300 }, { - "epoch": 0.15651336383337347, - "eval_loss": 2.944797992706299, - "eval_runtime": 0.3019, - "eval_samples_per_second": 142.434, - "eval_steps_per_second": 3.312, + "epoch": 0.927246790299572, + "eval_loss": 2.733259916305542, + "eval_runtime": 0.51, + "eval_samples_per_second": 84.32, + "eval_steps_per_second": 1.961, "step": 1300 }, { - "epoch": 0.15771731278593787, - "grad_norm": 12.4375, + "epoch": 0.9343794579172611, + "grad_norm": 18.125, "learning_rate": 5.236e-05, - "loss": 0.9531, + "loss": 1.256, "step": 1310 }, { - "epoch": 0.1589212617385023, - "grad_norm": 12.3125, + "epoch": 0.9415121255349501, + "grad_norm": 10.25, "learning_rate": 5.2759999999999996e-05, - "loss": 1.0079, + "loss": 1.1386, "step": 1320 }, { - "epoch": 0.1601252106910667, - "grad_norm": 13.1875, + "epoch": 0.948644793152639, + "grad_norm": 11.1875, "learning_rate": 5.316e-05, - "loss": 1.0674, + "loss": 1.3115, "step": 1330 }, { - "epoch": 0.16132915964363112, - "grad_norm": 16.875, + "epoch": 0.9557774607703281, + "grad_norm": 10.875, "learning_rate": 5.356e-05, - "loss": 1.1194, + "loss": 1.2315, "step": 1340 }, { - "epoch": 0.16253310859619552, - "grad_norm": 10.625, + "epoch": 0.9629101283880172, + "grad_norm": 12.0, "learning_rate": 5.396e-05, - "loss": 1.0057, + "loss": 1.3327, "step": 1350 }, { - "epoch": 0.16373705754875995, - "grad_norm": 9.125, + "epoch": 0.9700427960057061, + "grad_norm": 11.75, "learning_rate": 5.436e-05, - "loss": 1.1257, + "loss": 1.4052, "step": 1360 }, { - "epoch": 0.16494100650132434, - "grad_norm": 8.5, + "epoch": 0.9771754636233951, + "grad_norm": 11.4375, "learning_rate": 5.476e-05, - "loss": 0.9545, + "loss": 1.1349, "step": 1370 }, { - "epoch": 0.16614495545388874, - "grad_norm": 10.25, + "epoch": 0.9843081312410842, + "grad_norm": 15.125, "learning_rate": 5.516e-05, - "loss": 1.0648, + "loss": 1.3803, "step": 1380 }, { - "epoch": 0.16734890440645317, - "grad_norm": 14.9375, + "epoch": 0.9914407988587732, + "grad_norm": 16.75, "learning_rate": 5.556e-05, - "loss": 1.0364, + "loss": 1.3536, "step": 1390 }, { - "epoch": 0.16855285335901757, - "grad_norm": 138.0, + "epoch": 0.9985734664764622, + "grad_norm": 10.625, "learning_rate": 5.596e-05, - "loss": 1.0255, + "loss": 1.2981, "step": 1400 }, { - "epoch": 0.16855285335901757, - "eval/acc": 27.9069766998291, + "epoch": 0.9985734664764622, + "eval/acc": 39.53488540649414, "step": 1400 }, { - "epoch": 0.16855285335901757, - "eval_loss": 2.763101100921631, - "eval_runtime": 0.2759, - "eval_samples_per_second": 155.826, - "eval_steps_per_second": 3.624, + "epoch": 0.9985734664764622, + "eval_loss": 2.597245693206787, + "eval_runtime": 0.2116, + "eval_samples_per_second": 203.214, + "eval_steps_per_second": 4.726, "step": 1400 }, { - "epoch": 0.169756802311582, - "grad_norm": 11.8125, + "epoch": 1.005706134094151, + "grad_norm": 15.0, "learning_rate": 5.636e-05, - "loss": 0.9813, + "loss": 1.2173, "step": 1410 }, { - "epoch": 0.1709607512641464, - "grad_norm": 9.1875, + "epoch": 1.0128388017118402, + "grad_norm": 15.4375, "learning_rate": 5.6760000000000005e-05, - "loss": 0.9929, + "loss": 1.1965, "step": 1420 }, { - "epoch": 0.17216470021671082, - "grad_norm": 10.875, + "epoch": 1.0199714693295292, + "grad_norm": 21.625, "learning_rate": 5.716e-05, - "loss": 0.9113, + "loss": 1.2494, "step": 1430 }, { - "epoch": 0.17336864916927522, - "grad_norm": 19.375, + "epoch": 1.0271041369472182, + "grad_norm": 13.0, "learning_rate": 5.7560000000000005e-05, - "loss": 1.0711, + "loss": 1.1948, "step": 1440 }, { - "epoch": 0.17457259812183964, - "grad_norm": 9.8125, + "epoch": 1.0342368045649073, + "grad_norm": 11.0, "learning_rate": 5.796e-05, - "loss": 0.9322, + "loss": 1.2641, "step": 1450 }, { - "epoch": 0.17577654707440404, - "grad_norm": 10.5, + "epoch": 1.0413694721825963, + "grad_norm": 13.1875, "learning_rate": 5.8360000000000004e-05, - "loss": 1.0316, + "loss": 1.2526, "step": 1460 }, { - "epoch": 0.17698049602696847, - "grad_norm": 10.25, + "epoch": 1.0485021398002854, + "grad_norm": 46.0, "learning_rate": 5.876000000000001e-05, - "loss": 1.0165, + "loss": 1.0786, "step": 1470 }, { - "epoch": 0.17818444497953287, - "grad_norm": 10.4375, + "epoch": 1.0556348074179742, + "grad_norm": 11.0, "learning_rate": 5.916e-05, - "loss": 1.0229, + "loss": 1.3154, "step": 1480 }, { - "epoch": 0.17938839393209727, - "grad_norm": 14.4375, + "epoch": 1.0627674750356633, + "grad_norm": 18.75, "learning_rate": 5.9560000000000006e-05, - "loss": 0.9684, + "loss": 1.257, "step": 1490 }, { - "epoch": 0.1805923428846617, - "grad_norm": 8.375, + "epoch": 1.0699001426533523, + "grad_norm": 11.5625, "learning_rate": 5.996e-05, - "loss": 0.9948, + "loss": 1.2636, "step": 1500 }, { - "epoch": 0.1805923428846617, - "eval/acc": 34.88372039794922, + "epoch": 1.0699001426533523, + "eval/acc": 32.55813980102539, "step": 1500 }, { - "epoch": 0.1805923428846617, - "eval_loss": 2.8177433013916016, - "eval_runtime": 0.208, - "eval_samples_per_second": 206.732, - "eval_steps_per_second": 4.808, + "epoch": 1.0699001426533523, + "eval_loss": 2.9036648273468018, + "eval_runtime": 0.4893, + "eval_samples_per_second": 87.889, + "eval_steps_per_second": 2.044, "step": 1500 }, { - "epoch": 0.1817962918372261, - "grad_norm": 19.25, + "epoch": 1.0770328102710414, + "grad_norm": 13.75, "learning_rate": 6.0360000000000005e-05, - "loss": 0.9897, + "loss": 1.2602, "step": 1510 }, { - "epoch": 0.18300024078979052, - "grad_norm": 32.5, + "epoch": 1.0841654778887304, + "grad_norm": 11.625, "learning_rate": 6.076000000000001e-05, - "loss": 0.9217, + "loss": 1.0823, "step": 1520 }, { - "epoch": 0.18420418974235492, - "grad_norm": 9.5, + "epoch": 1.0912981455064195, + "grad_norm": 9.0, "learning_rate": 6.116e-05, - "loss": 1.0494, + "loss": 1.3059, "step": 1530 }, { - "epoch": 0.18540813869491934, - "grad_norm": 9.25, + "epoch": 1.0984308131241085, + "grad_norm": 10.4375, "learning_rate": 6.156e-05, - "loss": 0.9359, + "loss": 1.2006, "step": 1540 }, { - "epoch": 0.18661208764748374, - "grad_norm": 11.375, + "epoch": 1.1055634807417973, + "grad_norm": 15.75, "learning_rate": 6.196000000000001e-05, - "loss": 0.9112, + "loss": 1.3731, "step": 1550 }, { - "epoch": 0.18781603660004817, - "grad_norm": 12.6875, + "epoch": 1.1126961483594864, + "grad_norm": 9.5, "learning_rate": 6.236e-05, - "loss": 1.07, + "loss": 1.1925, "step": 1560 }, { - "epoch": 0.18901998555261257, - "grad_norm": 11.1875, + "epoch": 1.1198288159771754, + "grad_norm": 9.3125, "learning_rate": 6.276e-05, - "loss": 0.9853, + "loss": 1.1554, "step": 1570 }, { - "epoch": 0.19022393450517697, - "grad_norm": 8.375, + "epoch": 1.1269614835948645, + "grad_norm": 12.0625, "learning_rate": 6.316000000000001e-05, - "loss": 0.9579, + "loss": 1.0875, "step": 1580 }, { - "epoch": 0.1914278834577414, - "grad_norm": 20.875, + "epoch": 1.1340941512125535, + "grad_norm": 10.875, "learning_rate": 6.356000000000001e-05, - "loss": 0.9401, + "loss": 1.1895, "step": 1590 }, { - "epoch": 0.1926318324103058, - "grad_norm": 8.9375, + "epoch": 1.1412268188302426, + "grad_norm": 12.0625, "learning_rate": 6.396e-05, - "loss": 1.0279, + "loss": 1.2354, "step": 1600 }, { - "epoch": 0.1926318324103058, - "eval/acc": 30.23255729675293, + "epoch": 1.1412268188302426, + "eval/acc": 34.88372039794922, "step": 1600 }, { - "epoch": 0.1926318324103058, - "eval_loss": 2.8526248931884766, - "eval_runtime": 0.3114, - "eval_samples_per_second": 138.103, - "eval_steps_per_second": 3.212, + "epoch": 1.1412268188302426, + "eval_loss": 2.9267771244049072, + "eval_runtime": 0.2334, + "eval_samples_per_second": 184.258, + "eval_steps_per_second": 4.285, "step": 1600 }, { - "epoch": 0.19383578136287022, - "grad_norm": 7.78125, + "epoch": 1.1483594864479316, + "grad_norm": 12.375, "learning_rate": 6.436e-05, - "loss": 0.8743, + "loss": 1.2167, "step": 1610 }, { - "epoch": 0.19503973031543462, - "grad_norm": 9.8125, + "epoch": 1.1554921540656204, + "grad_norm": 10.375, "learning_rate": 6.476e-05, - "loss": 0.8702, + "loss": 1.1638, "step": 1620 }, { - "epoch": 0.19624367926799904, - "grad_norm": 12.4375, + "epoch": 1.1626248216833095, + "grad_norm": 9.8125, "learning_rate": 6.515999999999999e-05, - "loss": 1.0028, + "loss": 1.1666, "step": 1630 }, { - "epoch": 0.19744762822056344, - "grad_norm": 10.125, + "epoch": 1.1697574893009985, + "grad_norm": 12.6875, "learning_rate": 6.556e-05, - "loss": 0.9377, + "loss": 1.1961, "step": 1640 }, { - "epoch": 0.19865157717312787, - "grad_norm": 8.9375, + "epoch": 1.1768901569186876, + "grad_norm": 9.875, "learning_rate": 6.596e-05, - "loss": 1.031, + "loss": 1.2558, "step": 1650 }, { - "epoch": 0.19985552612569227, - "grad_norm": 8.5625, + "epoch": 1.1840228245363766, + "grad_norm": 10.375, "learning_rate": 6.636e-05, - "loss": 1.0162, + "loss": 1.1728, "step": 1660 }, { - "epoch": 0.2010594750782567, - "grad_norm": 33.75, + "epoch": 1.1911554921540657, + "grad_norm": 10.1875, "learning_rate": 6.676e-05, - "loss": 0.9448, + "loss": 1.2947, "step": 1670 }, { - "epoch": 0.2022634240308211, - "grad_norm": 9.625, + "epoch": 1.1982881597717547, + "grad_norm": 11.3125, "learning_rate": 6.716e-05, - "loss": 1.0077, + "loss": 1.2151, "step": 1680 }, { - "epoch": 0.2034673729833855, - "grad_norm": 8.6875, + "epoch": 1.2054208273894436, + "grad_norm": 10.5, "learning_rate": 6.756e-05, - "loss": 0.9654, + "loss": 1.0612, "step": 1690 }, { - "epoch": 0.20467132193594992, - "grad_norm": 12.625, + "epoch": 1.2125534950071326, + "grad_norm": 11.9375, "learning_rate": 6.796e-05, - "loss": 0.8899, + "loss": 1.1079, "step": 1700 }, { - "epoch": 0.20467132193594992, - "eval/acc": 32.55813980102539, + "epoch": 1.2125534950071326, + "eval/acc": 37.20930099487305, "step": 1700 }, { - "epoch": 0.20467132193594992, - "eval_loss": 2.7813549041748047, - "eval_runtime": 0.2132, - "eval_samples_per_second": 201.701, - "eval_steps_per_second": 4.691, + "epoch": 1.2125534950071326, + "eval_loss": 2.9951517581939697, + "eval_runtime": 0.2214, + "eval_samples_per_second": 194.192, + "eval_steps_per_second": 4.516, "step": 1700 }, { - "epoch": 0.20587527088851432, - "grad_norm": 12.0, + "epoch": 1.2196861626248217, + "grad_norm": 11.25, "learning_rate": 6.836e-05, - "loss": 1.0412, + "loss": 1.1541, "step": 1710 }, { - "epoch": 0.20707921984107874, - "grad_norm": 11.75, + "epoch": 1.2268188302425107, + "grad_norm": 8.125, "learning_rate": 6.876e-05, - "loss": 0.9239, + "loss": 1.0772, "step": 1720 }, { - "epoch": 0.20828316879364314, - "grad_norm": 11.375, + "epoch": 1.2339514978601998, + "grad_norm": 18.125, "learning_rate": 6.916000000000001e-05, - "loss": 0.9243, + "loss": 1.1623, "step": 1730 }, { - "epoch": 0.20948711774620757, - "grad_norm": 12.0, + "epoch": 1.2410841654778888, + "grad_norm": 10.125, "learning_rate": 6.956e-05, - "loss": 1.0204, + "loss": 1.182, "step": 1740 }, { - "epoch": 0.21069106669877197, - "grad_norm": 13.0625, + "epoch": 1.2482168330955776, + "grad_norm": 9.75, "learning_rate": 6.996e-05, - "loss": 0.8811, + "loss": 1.0796, "step": 1750 }, { - "epoch": 0.2118950156513364, - "grad_norm": 17.0, + "epoch": 1.2553495007132667, + "grad_norm": 10.5, "learning_rate": 7.036e-05, - "loss": 0.8755, + "loss": 1.2374, "step": 1760 }, { - "epoch": 0.2130989646039008, - "grad_norm": 11.25, + "epoch": 1.2624821683309557, + "grad_norm": 20.875, "learning_rate": 7.076000000000001e-05, - "loss": 0.858, + "loss": 1.2718, "step": 1770 }, { - "epoch": 0.21430291355646522, - "grad_norm": 9.625, + "epoch": 1.2696148359486448, + "grad_norm": 10.3125, "learning_rate": 7.116e-05, - "loss": 0.9076, + "loss": 1.0922, "step": 1780 }, { - "epoch": 0.21550686250902962, - "grad_norm": 10.4375, + "epoch": 1.2767475035663338, + "grad_norm": 8.6875, "learning_rate": 7.156e-05, - "loss": 0.8817, + "loss": 1.0637, "step": 1790 }, { - "epoch": 0.21671081146159402, - "grad_norm": 12.8125, + "epoch": 1.2838801711840229, + "grad_norm": 9.5, "learning_rate": 7.196000000000001e-05, - "loss": 0.9121, + "loss": 1.1661, "step": 1800 }, { - "epoch": 0.21671081146159402, - "eval/acc": 30.813953399658203, + "epoch": 1.2838801711840229, + "eval/acc": 39.53488540649414, "step": 1800 }, { - "epoch": 0.21671081146159402, - "eval_loss": 2.6508796215057373, - "eval_runtime": 0.2185, - "eval_samples_per_second": 196.798, - "eval_steps_per_second": 4.577, + "epoch": 1.2838801711840229, + "eval_loss": 2.763897180557251, + "eval_runtime": 0.2111, + "eval_samples_per_second": 203.712, + "eval_steps_per_second": 4.737, "step": 1800 }, { - "epoch": 0.21791476041415844, - "grad_norm": 16.5, + "epoch": 1.291012838801712, + "grad_norm": 14.3125, "learning_rate": 7.236e-05, - "loss": 0.9044, + "loss": 1.1139, "step": 1810 }, { - "epoch": 0.21911870936672284, - "grad_norm": 15.1875, + "epoch": 1.298145506419401, + "grad_norm": 41.5, "learning_rate": 7.276e-05, - "loss": 0.9552, + "loss": 1.0817, "step": 1820 }, { - "epoch": 0.22032265831928727, - "grad_norm": 11.375, + "epoch": 1.3052781740370898, + "grad_norm": 15.125, "learning_rate": 7.316000000000001e-05, - "loss": 0.9264, + "loss": 1.2462, "step": 1830 }, { - "epoch": 0.22152660727185167, - "grad_norm": 8.8125, + "epoch": 1.3124108416547788, + "grad_norm": 33.25, "learning_rate": 7.356000000000001e-05, - "loss": 0.8928, + "loss": 1.1143, "step": 1840 }, { - "epoch": 0.2227305562244161, - "grad_norm": 9.625, + "epoch": 1.3195435092724679, + "grad_norm": 13.625, "learning_rate": 7.396e-05, - "loss": 0.9515, + "loss": 1.1783, "step": 1850 }, { - "epoch": 0.2239345051769805, - "grad_norm": 31.0, + "epoch": 1.326676176890157, + "grad_norm": 18.375, "learning_rate": 7.436000000000001e-05, - "loss": 0.8989, + "loss": 1.2101, "step": 1860 }, { - "epoch": 0.22513845412954492, - "grad_norm": 9.5, + "epoch": 1.333808844507846, + "grad_norm": 13.875, "learning_rate": 7.476000000000001e-05, - "loss": 1.0206, + "loss": 1.1348, "step": 1870 }, { - "epoch": 0.22634240308210932, - "grad_norm": 8.625, + "epoch": 1.340941512125535, + "grad_norm": 13.9375, "learning_rate": 7.516e-05, - "loss": 0.8961, + "loss": 1.0747, "step": 1880 }, { - "epoch": 0.22754635203467374, - "grad_norm": 9.0, + "epoch": 1.3480741797432239, + "grad_norm": 29.75, "learning_rate": 7.556000000000002e-05, - "loss": 0.9421, + "loss": 1.1895, "step": 1890 }, { - "epoch": 0.22875030098723814, - "grad_norm": 12.0625, + "epoch": 1.355206847360913, + "grad_norm": 17.25, "learning_rate": 7.596000000000001e-05, - "loss": 0.9049, + "loss": 1.2512, "step": 1900 }, { - "epoch": 0.22875030098723814, - "eval/acc": 36.046512603759766, + "epoch": 1.355206847360913, + "eval/acc": 39.53488540649414, "step": 1900 }, { - "epoch": 0.22875030098723814, - "eval_loss": 2.636018753051758, - "eval_runtime": 0.2084, - "eval_samples_per_second": 206.343, - "eval_steps_per_second": 4.799, + "epoch": 1.355206847360913, + "eval_loss": 2.9442026615142822, + "eval_runtime": 0.2199, + "eval_samples_per_second": 195.51, + "eval_steps_per_second": 4.547, "step": 1900 }, { - "epoch": 0.22995424993980254, - "grad_norm": 8.0625, + "epoch": 1.362339514978602, + "grad_norm": 21.125, "learning_rate": 7.636e-05, - "loss": 0.8983, + "loss": 1.1306, "step": 1910 }, { - "epoch": 0.23115819889236697, - "grad_norm": 11.875, + "epoch": 1.369472182596291, + "grad_norm": 9.0625, "learning_rate": 7.676e-05, - "loss": 0.9293, + "loss": 1.1139, "step": 1920 }, { - "epoch": 0.23236214784493137, - "grad_norm": 11.75, + "epoch": 1.37660485021398, + "grad_norm": 30.25, "learning_rate": 7.716e-05, - "loss": 0.8602, + "loss": 1.1595, "step": 1930 }, { - "epoch": 0.2335660967974958, - "grad_norm": 11.5625, + "epoch": 1.383737517831669, + "grad_norm": 13.6875, "learning_rate": 7.756e-05, - "loss": 0.8078, + "loss": 1.2437, "step": 1940 }, { - "epoch": 0.2347700457500602, - "grad_norm": 9.125, + "epoch": 1.3908701854493581, + "grad_norm": 12.3125, "learning_rate": 7.796e-05, - "loss": 0.8773, + "loss": 1.1005, "step": 1950 }, { - "epoch": 0.23597399470262462, - "grad_norm": 10.6875, + "epoch": 1.3980028530670472, + "grad_norm": 9.8125, "learning_rate": 7.836e-05, - "loss": 0.8464, + "loss": 1.0748, "step": 1960 }, { - "epoch": 0.23717794365518902, - "grad_norm": 18.25, + "epoch": 1.405135520684736, + "grad_norm": 9.125, "learning_rate": 7.876e-05, - "loss": 0.8779, + "loss": 1.1576, "step": 1970 }, { - "epoch": 0.23838189260775344, - "grad_norm": 10.875, + "epoch": 1.412268188302425, + "grad_norm": 11.375, "learning_rate": 7.916e-05, - "loss": 0.9351, + "loss": 1.0982, "step": 1980 }, { - "epoch": 0.23958584156031784, - "grad_norm": 11.0, + "epoch": 1.4194008559201141, + "grad_norm": 10.375, "learning_rate": 7.956e-05, - "loss": 0.8581, + "loss": 1.132, "step": 1990 }, { - "epoch": 0.24078979051288224, - "grad_norm": 8.875, + "epoch": 1.4265335235378032, + "grad_norm": 16.375, "learning_rate": 7.996e-05, - "loss": 0.9799, + "loss": 1.121, "step": 2000 }, { - "epoch": 0.24078979051288224, - "eval/acc": 36.046512603759766, + "epoch": 1.4265335235378032, + "eval/acc": 39.53488540649414, "step": 2000 }, { - "epoch": 0.24078979051288224, - "eval_loss": 2.716654062271118, - "eval_runtime": 0.21, - "eval_samples_per_second": 204.721, - "eval_steps_per_second": 4.761, + "epoch": 1.4265335235378032, + "eval_loss": 2.900298595428467, + "eval_runtime": 0.2112, + "eval_samples_per_second": 203.622, + "eval_steps_per_second": 4.735, "step": 2000 }, { - "epoch": 0.24199373946544667, - "grad_norm": 11.0625, + "epoch": 1.4336661911554922, + "grad_norm": 9.125, "learning_rate": 8.036e-05, - "loss": 0.8678, + "loss": 1.2079, "step": 2010 }, { - "epoch": 0.24319768841801107, + "epoch": 1.440798858773181, "grad_norm": 12.125, "learning_rate": 8.076e-05, - "loss": 0.8832, + "loss": 1.1098, "step": 2020 }, { - "epoch": 0.2444016373705755, - "grad_norm": 8.25, + "epoch": 1.44793152639087, + "grad_norm": 8.8125, "learning_rate": 8.116e-05, - "loss": 0.8689, + "loss": 0.9849, "step": 2030 }, { - "epoch": 0.2456055863231399, - "grad_norm": 6.53125, + "epoch": 1.4550641940085591, + "grad_norm": 9.0, "learning_rate": 8.156e-05, - "loss": 0.8829, + "loss": 1.0905, "step": 2040 }, { - "epoch": 0.24680953527570432, - "grad_norm": 9.5625, + "epoch": 1.4621968616262482, + "grad_norm": 15.4375, "learning_rate": 8.196000000000001e-05, - "loss": 0.9181, + "loss": 1.2211, "step": 2050 }, { - "epoch": 0.24801348422826872, - "grad_norm": 22.875, + "epoch": 1.4693295292439372, + "grad_norm": 9.4375, "learning_rate": 8.236e-05, - "loss": 0.8011, + "loss": 1.0968, "step": 2060 }, { - "epoch": 0.24921743318083314, - "grad_norm": 14.4375, + "epoch": 1.4764621968616263, + "grad_norm": 9.0, "learning_rate": 8.276e-05, - "loss": 0.9163, + "loss": 1.0973, "step": 2070 }, { - "epoch": 0.25042138213339754, - "grad_norm": 10.625, + "epoch": 1.4835948644793153, + "grad_norm": 8.0625, "learning_rate": 8.316000000000001e-05, - "loss": 0.7869, + "loss": 1.1012, "step": 2080 }, { - "epoch": 0.25162533108596197, - "grad_norm": 11.0, + "epoch": 1.4907275320970044, + "grad_norm": 31.0, "learning_rate": 8.356e-05, - "loss": 0.8779, + "loss": 1.0437, "step": 2090 }, { - "epoch": 0.2528292800385264, - "grad_norm": 12.625, + "epoch": 1.4978601997146934, + "grad_norm": 10.8125, "learning_rate": 8.396e-05, - "loss": 0.889, + "loss": 1.0934, "step": 2100 }, { - "epoch": 0.2528292800385264, - "eval/acc": 37.20930099487305, + "epoch": 1.4978601997146934, + "eval/acc": 41.86046600341797, "step": 2100 }, { - "epoch": 0.2528292800385264, - "eval_loss": 2.626293182373047, - "eval_runtime": 0.2735, - "eval_samples_per_second": 157.235, - "eval_steps_per_second": 3.657, + "epoch": 1.4978601997146934, + "eval_loss": 2.9042038917541504, + "eval_runtime": 0.2146, + "eval_samples_per_second": 200.363, + "eval_steps_per_second": 4.66, "step": 2100 }, { - "epoch": 0.25403322899109076, - "grad_norm": 8.3125, + "epoch": 1.5049928673323825, + "grad_norm": 15.3125, "learning_rate": 8.436000000000001e-05, - "loss": 0.8363, + "loss": 1.0862, "step": 2110 }, { - "epoch": 0.2552371779436552, - "grad_norm": 8.625, + "epoch": 1.5121255349500713, + "grad_norm": 10.1875, "learning_rate": 8.476000000000001e-05, - "loss": 0.8762, + "loss": 1.0786, "step": 2120 }, { - "epoch": 0.2564411268962196, - "grad_norm": 7.4375, + "epoch": 1.5192582025677603, + "grad_norm": 8.25, "learning_rate": 8.516e-05, - "loss": 0.7925, + "loss": 1.1496, "step": 2130 }, { - "epoch": 0.257645075848784, - "grad_norm": 9.1875, + "epoch": 1.5263908701854494, + "grad_norm": 12.6875, "learning_rate": 8.556e-05, - "loss": 0.9575, + "loss": 1.1132, "step": 2140 }, { - "epoch": 0.2588490248013484, - "grad_norm": 9.8125, + "epoch": 1.5335235378031382, + "grad_norm": 21.375, "learning_rate": 8.596000000000001e-05, - "loss": 0.7551, + "loss": 1.1043, "step": 2150 }, { - "epoch": 0.26005297375391284, - "grad_norm": 7.15625, + "epoch": 1.5406562054208273, + "grad_norm": 8.5625, "learning_rate": 8.636e-05, - "loss": 0.808, + "loss": 1.2549, "step": 2160 }, { - "epoch": 0.26125692270647727, - "grad_norm": 8.3125, + "epoch": 1.5477888730385163, + "grad_norm": 8.6875, "learning_rate": 8.676e-05, - "loss": 0.9449, + "loss": 1.115, "step": 2170 }, { - "epoch": 0.26246087165904164, - "grad_norm": 11.5, + "epoch": 1.5549215406562054, + "grad_norm": 8.375, "learning_rate": 8.716000000000001e-05, - "loss": 0.8712, + "loss": 1.1963, "step": 2180 }, { - "epoch": 0.26366482061160607, - "grad_norm": 8.0, + "epoch": 1.5620542082738944, + "grad_norm": 8.3125, "learning_rate": 8.756000000000001e-05, - "loss": 0.9389, + "loss": 1.1697, "step": 2190 }, { - "epoch": 0.2648687695641705, - "grad_norm": 13.5, + "epoch": 1.5691868758915835, + "grad_norm": 7.40625, "learning_rate": 8.796e-05, - "loss": 0.7875, + "loss": 0.9716, "step": 2200 }, { - "epoch": 0.2648687695641705, - "eval/acc": 35.46511459350586, + "epoch": 1.5691868758915835, + "eval/acc": 39.53488540649414, "step": 2200 }, { - "epoch": 0.2648687695641705, - "eval_loss": 2.5862526893615723, - "eval_runtime": 0.2151, - "eval_samples_per_second": 199.927, - "eval_steps_per_second": 4.649, + "epoch": 1.5691868758915835, + "eval_loss": 3.021289587020874, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.958, + "eval_steps_per_second": 4.72, "step": 2200 }, { - "epoch": 0.26607271851673486, - "grad_norm": 11.5625, + "epoch": 1.5763195435092725, + "grad_norm": 10.0, "learning_rate": 8.836000000000001e-05, - "loss": 0.9947, + "loss": 1.0254, "step": 2210 }, { - "epoch": 0.2672766674692993, - "grad_norm": 8.25, + "epoch": 1.5834522111269616, + "grad_norm": 12.625, "learning_rate": 8.876e-05, - "loss": 0.717, + "loss": 1.1672, "step": 2220 }, { - "epoch": 0.2684806164218637, - "grad_norm": 26.25, + "epoch": 1.5905848787446506, + "grad_norm": 11.5, "learning_rate": 8.916e-05, - "loss": 0.8688, + "loss": 1.0656, "step": 2230 }, { - "epoch": 0.26968456537442814, - "grad_norm": 11.5, + "epoch": 1.5977175463623396, + "grad_norm": 8.8125, "learning_rate": 8.956e-05, - "loss": 0.9134, + "loss": 1.035, "step": 2240 }, { - "epoch": 0.2708885143269925, - "grad_norm": 6.875, + "epoch": 1.6048502139800287, + "grad_norm": 9.25, "learning_rate": 8.996e-05, - "loss": 0.8592, + "loss": 1.0972, "step": 2250 }, { - "epoch": 0.27209246327955694, - "grad_norm": 7.21875, + "epoch": 1.6119828815977175, + "grad_norm": 7.71875, "learning_rate": 9.036e-05, - "loss": 0.6548, + "loss": 1.0148, "step": 2260 }, { - "epoch": 0.27329641223212137, - "grad_norm": 12.25, + "epoch": 1.6191155492154066, + "grad_norm": 13.5, "learning_rate": 9.076e-05, - "loss": 0.8613, + "loss": 1.1202, "step": 2270 }, { - "epoch": 0.2745003611846858, - "grad_norm": 8.875, + "epoch": 1.6262482168330956, + "grad_norm": 9.125, "learning_rate": 9.116e-05, - "loss": 0.7455, + "loss": 1.1134, "step": 2280 }, { - "epoch": 0.27570431013725016, - "grad_norm": 12.5625, + "epoch": 1.6333808844507844, + "grad_norm": 15.25, "learning_rate": 9.156e-05, - "loss": 0.8458, + "loss": 1.0373, "step": 2290 }, { - "epoch": 0.2769082590898146, - "grad_norm": 8.8125, + "epoch": 1.6405135520684735, + "grad_norm": 9.125, "learning_rate": 9.196000000000001e-05, - "loss": 0.8003, + "loss": 1.0654, "step": 2300 }, { - "epoch": 0.2769082590898146, - "eval/acc": 32.55813980102539, + "epoch": 1.6405135520684735, + "eval/acc": 37.20930099487305, "step": 2300 }, { - "epoch": 0.2769082590898146, - "eval_loss": 2.6594340801239014, - "eval_runtime": 0.2129, - "eval_samples_per_second": 201.965, - "eval_steps_per_second": 4.697, + "epoch": 1.6405135520684735, + "eval_loss": 2.929560422897339, + "eval_runtime": 0.2167, + "eval_samples_per_second": 198.467, + "eval_steps_per_second": 4.616, "step": 2300 }, { - "epoch": 0.278112208042379, - "grad_norm": 10.6875, + "epoch": 1.6476462196861625, + "grad_norm": 8.25, "learning_rate": 9.236e-05, - "loss": 0.812, + "loss": 1.0218, "step": 2310 }, { - "epoch": 0.2793161569949434, - "grad_norm": 12.1875, + "epoch": 1.6547788873038516, + "grad_norm": 9.5625, "learning_rate": 9.276e-05, - "loss": 0.781, + "loss": 1.106, "step": 2320 }, { - "epoch": 0.2805201059475078, - "grad_norm": 8.125, + "epoch": 1.6619115549215406, + "grad_norm": 8.25, "learning_rate": 9.316000000000001e-05, - "loss": 0.9682, + "loss": 1.0558, "step": 2330 }, { - "epoch": 0.28172405490007224, - "grad_norm": 8.8125, + "epoch": 1.6690442225392297, + "grad_norm": 8.5625, "learning_rate": 9.356e-05, - "loss": 0.7531, + "loss": 0.9931, "step": 2340 }, { - "epoch": 0.28292800385263667, - "grad_norm": 7.375, + "epoch": 1.6761768901569187, + "grad_norm": 11.5625, "learning_rate": 9.396e-05, - "loss": 0.7235, + "loss": 1.0683, "step": 2350 }, { - "epoch": 0.28413195280520104, - "grad_norm": 7.8125, + "epoch": 1.6833095577746078, + "grad_norm": 10.0625, "learning_rate": 9.436e-05, - "loss": 0.9204, + "loss": 1.0631, "step": 2360 }, { - "epoch": 0.28533590175776546, - "grad_norm": 6.65625, + "epoch": 1.6904422253922968, + "grad_norm": 9.5625, "learning_rate": 9.476000000000001e-05, - "loss": 0.7636, + "loss": 1.049, "step": 2370 }, { - "epoch": 0.2865398507103299, - "grad_norm": 9.625, + "epoch": 1.6975748930099859, + "grad_norm": 12.8125, "learning_rate": 9.516e-05, - "loss": 0.855, + "loss": 1.0259, "step": 2380 }, { - "epoch": 0.2877437996628943, - "grad_norm": 9.6875, + "epoch": 1.7047075606276747, + "grad_norm": 9.0625, "learning_rate": 9.556e-05, - "loss": 0.8643, + "loss": 1.0085, "step": 2390 }, { - "epoch": 0.2889477486154587, - "grad_norm": 7.1875, + "epoch": 1.7118402282453637, + "grad_norm": 131.0, "learning_rate": 9.596000000000001e-05, - "loss": 0.8258, + "loss": 0.944, "step": 2400 }, { - "epoch": 0.2889477486154587, - "eval/acc": 36.627906799316406, + "epoch": 1.7118402282453637, + "eval/acc": 37.20930099487305, "step": 2400 }, { - "epoch": 0.2889477486154587, - "eval_loss": 2.7174084186553955, - "eval_runtime": 0.2111, - "eval_samples_per_second": 203.672, - "eval_steps_per_second": 4.737, + "epoch": 1.7118402282453637, + "eval_loss": 3.0231707096099854, + "eval_runtime": 0.2075, + "eval_samples_per_second": 207.206, + "eval_steps_per_second": 4.819, "step": 2400 }, { - "epoch": 0.2901516975680231, - "grad_norm": 7.65625, + "epoch": 1.7189728958630528, + "grad_norm": 8.375, "learning_rate": 9.636e-05, - "loss": 0.8752, + "loss": 1.0069, "step": 2410 }, { - "epoch": 0.29135564652058754, - "grad_norm": 8.75, + "epoch": 1.7261055634807418, + "grad_norm": 8.3125, "learning_rate": 9.676e-05, - "loss": 0.8082, + "loss": 1.0648, "step": 2420 }, { - "epoch": 0.2925595954731519, - "grad_norm": 10.4375, + "epoch": 1.7332382310984307, + "grad_norm": 11.0625, "learning_rate": 9.716000000000001e-05, - "loss": 0.7538, + "loss": 1.0594, "step": 2430 }, { - "epoch": 0.29376354442571634, - "grad_norm": 6.4375, + "epoch": 1.7403708987161197, + "grad_norm": 8.75, "learning_rate": 9.756000000000001e-05, - "loss": 0.7766, + "loss": 1.2082, "step": 2440 }, { - "epoch": 0.29496749337828077, - "grad_norm": 7.96875, + "epoch": 1.7475035663338088, + "grad_norm": 9.875, "learning_rate": 9.796e-05, - "loss": 0.844, + "loss": 1.0225, "step": 2450 }, { - "epoch": 0.2961714423308452, - "grad_norm": 7.75, + "epoch": 1.7546362339514978, + "grad_norm": 9.5625, "learning_rate": 9.836000000000001e-05, - "loss": 0.7127, + "loss": 0.9975, "step": 2460 }, { - "epoch": 0.29737539128340956, - "grad_norm": 11.5, + "epoch": 1.7617689015691869, + "grad_norm": 21.0, "learning_rate": 9.876000000000001e-05, - "loss": 0.8363, + "loss": 0.9533, "step": 2470 }, { - "epoch": 0.298579340235974, - "grad_norm": 6.4375, + "epoch": 1.768901569186876, + "grad_norm": 7.65625, "learning_rate": 9.916e-05, - "loss": 0.7429, + "loss": 0.9619, "step": 2480 }, { - "epoch": 0.2997832891885384, - "grad_norm": 11.5, + "epoch": 1.776034236804565, + "grad_norm": 13.625, "learning_rate": 9.956e-05, - "loss": 0.736, + "loss": 0.9425, "step": 2490 }, { - "epoch": 0.30098723814110284, - "grad_norm": 9.25, + "epoch": 1.783166904422254, + "grad_norm": 12.375, "learning_rate": 9.996000000000001e-05, - "loss": 0.8365, + "loss": 0.9893, "step": 2500 }, { - "epoch": 0.30098723814110284, + "epoch": 1.783166904422254, "eval/acc": 39.53488540649414, "step": 2500 }, { - "epoch": 0.30098723814110284, - "eval_loss": 2.713433027267456, - "eval_runtime": 0.2088, - "eval_samples_per_second": 205.919, - "eval_steps_per_second": 4.789, + "epoch": 1.783166904422254, + "eval_loss": 2.8728344440460205, + "eval_runtime": 0.2215, + "eval_samples_per_second": 194.132, + "eval_steps_per_second": 4.515, "step": 2500 }, { - "epoch": 0.3021911870936672, - "grad_norm": 7.03125, + "epoch": 1.790299572039943, + "grad_norm": 10.0, "learning_rate": 9.996000000000001e-05, - "loss": 0.7664, + "loss": 1.0137, "step": 2510 }, { - "epoch": 0.30339513604623164, - "grad_norm": 7.75, + "epoch": 1.797432239657632, + "grad_norm": 10.125, "learning_rate": 9.991555555555556e-05, - "loss": 0.9128, + "loss": 1.059, "step": 2520 }, { - "epoch": 0.30459908499879607, - "grad_norm": 9.0, + "epoch": 1.804564907275321, + "grad_norm": 32.0, "learning_rate": 9.987111111111111e-05, - "loss": 0.8045, + "loss": 1.0498, "step": 2530 }, { - "epoch": 0.30580303395136044, - "grad_norm": 8.9375, + "epoch": 1.81169757489301, + "grad_norm": 10.125, "learning_rate": 9.982666666666667e-05, - "loss": 0.8292, + "loss": 1.1431, "step": 2540 }, { - "epoch": 0.30700698290392486, - "grad_norm": 7.40625, + "epoch": 1.818830242510699, + "grad_norm": 7.90625, "learning_rate": 9.978222222222223e-05, - "loss": 0.7557, + "loss": 1.0715, "step": 2550 }, { - "epoch": 0.3082109318564893, - "grad_norm": 7.625, + "epoch": 1.825962910128388, + "grad_norm": 10.9375, "learning_rate": 9.973777777777778e-05, - "loss": 0.683, + "loss": 1.0446, "step": 2560 }, { - "epoch": 0.3094148808090537, - "grad_norm": 8.1875, + "epoch": 1.833095577746077, + "grad_norm": 13.0, "learning_rate": 9.969333333333334e-05, - "loss": 0.8052, + "loss": 1.0291, "step": 2570 }, { - "epoch": 0.3106188297616181, - "grad_norm": 8.4375, + "epoch": 1.840228245363766, + "grad_norm": 9.75, "learning_rate": 9.964888888888889e-05, - "loss": 0.7819, + "loss": 0.9713, "step": 2580 }, { - "epoch": 0.3118227787141825, - "grad_norm": 10.8125, + "epoch": 1.847360912981455, + "grad_norm": 10.5625, "learning_rate": 9.960444444444444e-05, - "loss": 0.8452, + "loss": 1.2157, "step": 2590 }, { - "epoch": 0.31302672766674694, - "grad_norm": 6.21875, + "epoch": 1.854493580599144, + "grad_norm": 9.3125, "learning_rate": 9.956e-05, - "loss": 0.7478, + "loss": 1.0455, "step": 2600 }, { - "epoch": 0.31302672766674694, - "eval/acc": 34.88372039794922, + "epoch": 1.854493580599144, + "eval/acc": 37.20930099487305, "step": 2600 }, { - "epoch": 0.31302672766674694, - "eval_loss": 2.6625020503997803, - "eval_runtime": 0.2061, - "eval_samples_per_second": 208.644, - "eval_steps_per_second": 4.852, + "epoch": 1.854493580599144, + "eval_loss": 2.951470375061035, + "eval_runtime": 0.2899, + "eval_samples_per_second": 148.338, + "eval_steps_per_second": 3.45, "step": 2600 }, { - "epoch": 0.31423067661931137, - "grad_norm": 7.375, + "epoch": 1.861626248216833, + "grad_norm": 10.5, "learning_rate": 9.951555555555556e-05, - "loss": 0.7623, + "loss": 1.0604, "step": 2610 }, { - "epoch": 0.31543462557187574, - "grad_norm": 9.0, + "epoch": 1.8687589158345221, + "grad_norm": 9.375, "learning_rate": 9.947111111111111e-05, - "loss": 0.8223, + "loss": 0.8715, "step": 2620 }, { - "epoch": 0.31663857452444016, - "grad_norm": 6.75, + "epoch": 1.8758915834522112, + "grad_norm": 10.4375, "learning_rate": 9.942666666666667e-05, - "loss": 0.7797, + "loss": 1.0034, "step": 2630 }, { - "epoch": 0.3178425234770046, - "grad_norm": 9.125, + "epoch": 1.8830242510699002, + "grad_norm": 8.0625, "learning_rate": 9.938222222222224e-05, - "loss": 0.6746, + "loss": 1.0557, "step": 2640 }, { - "epoch": 0.31904647242956896, - "grad_norm": 8.5, + "epoch": 1.8901569186875893, + "grad_norm": 7.21875, "learning_rate": 9.933777777777779e-05, - "loss": 0.8434, + "loss": 0.974, "step": 2650 }, { - "epoch": 0.3202504213821334, - "grad_norm": 10.3125, + "epoch": 1.8972895863052783, + "grad_norm": 10.875, "learning_rate": 9.929333333333333e-05, - "loss": 0.8625, + "loss": 1.1366, "step": 2660 }, { - "epoch": 0.3214543703346978, - "grad_norm": 8.125, + "epoch": 1.9044222539229672, + "grad_norm": 28.75, "learning_rate": 9.92488888888889e-05, - "loss": 0.8003, + "loss": 1.0135, "step": 2670 }, { - "epoch": 0.32265831928726224, - "grad_norm": 8.5625, + "epoch": 1.9115549215406562, + "grad_norm": 10.5625, "learning_rate": 9.920444444444444e-05, - "loss": 0.8145, + "loss": 1.0263, "step": 2680 }, { - "epoch": 0.3238622682398266, - "grad_norm": 8.0, + "epoch": 1.9186875891583453, + "grad_norm": 6.65625, "learning_rate": 9.916e-05, - "loss": 0.6519, + "loss": 0.9952, "step": 2690 }, { - "epoch": 0.32506621719239104, - "grad_norm": 8.5625, + "epoch": 1.925820256776034, + "grad_norm": 8.8125, "learning_rate": 9.911555555555557e-05, - "loss": 0.7627, + "loss": 1.0438, "step": 2700 }, { - "epoch": 0.32506621719239104, - "eval/acc": 38.953487396240234, + "epoch": 1.925820256776034, + "eval/acc": 39.53488540649414, "step": 2700 }, { - "epoch": 0.32506621719239104, - "eval_loss": 2.629239082336426, - "eval_runtime": 0.2162, - "eval_samples_per_second": 198.931, - "eval_steps_per_second": 4.626, + "epoch": 1.925820256776034, + "eval_loss": 2.8668925762176514, + "eval_runtime": 0.2154, + "eval_samples_per_second": 199.649, + "eval_steps_per_second": 4.643, "step": 2700 }, { - "epoch": 0.32627016614495546, - "grad_norm": 7.625, + "epoch": 1.9329529243937231, + "grad_norm": 7.1875, "learning_rate": 9.907111111111112e-05, - "loss": 0.7265, + "loss": 0.9522, "step": 2710 }, { - "epoch": 0.3274741150975199, - "grad_norm": 7.15625, + "epoch": 1.9400855920114122, + "grad_norm": 8.6875, "learning_rate": 9.902666666666666e-05, - "loss": 0.7468, + "loss": 0.9729, "step": 2720 }, { - "epoch": 0.32867806405008426, - "grad_norm": 8.5, + "epoch": 1.9472182596291012, + "grad_norm": 10.0625, "learning_rate": 9.898222222222223e-05, - "loss": 0.7816, + "loss": 1.0528, "step": 2730 }, { - "epoch": 0.3298820130026487, - "grad_norm": 6.8125, + "epoch": 1.9543509272467903, + "grad_norm": 8.8125, "learning_rate": 9.893777777777779e-05, - "loss": 0.7828, + "loss": 1.1212, "step": 2740 }, { - "epoch": 0.3310859619552131, - "grad_norm": 8.5625, + "epoch": 1.9614835948644793, + "grad_norm": 9.1875, "learning_rate": 9.889333333333334e-05, - "loss": 0.8273, + "loss": 0.9866, "step": 2750 }, { - "epoch": 0.3322899109077775, - "grad_norm": 7.28125, + "epoch": 1.9686162624821684, + "grad_norm": 8.25, "learning_rate": 9.884888888888889e-05, - "loss": 0.6265, + "loss": 0.8616, "step": 2760 }, { - "epoch": 0.3334938598603419, - "grad_norm": 7.78125, + "epoch": 1.9757489300998574, + "grad_norm": 8.5625, "learning_rate": 9.880444444444445e-05, - "loss": 0.8716, + "loss": 0.9972, "step": 2770 }, { - "epoch": 0.33469780881290634, - "grad_norm": 6.0, + "epoch": 1.9828815977175465, + "grad_norm": 10.0625, "learning_rate": 9.876000000000001e-05, - "loss": 0.7587, + "loss": 0.9781, "step": 2780 }, { - "epoch": 0.33590175776547077, - "grad_norm": 11.8125, + "epoch": 1.9900142653352355, + "grad_norm": 10.75, "learning_rate": 9.871555555555556e-05, - "loss": 0.836, + "loss": 1.0579, "step": 2790 }, { - "epoch": 0.33710570671803514, - "grad_norm": 8.3125, + "epoch": 1.9971469329529246, + "grad_norm": 8.25, "learning_rate": 9.867111111111112e-05, - "loss": 0.7196, + "loss": 1.0323, "step": 2800 }, { - "epoch": 0.33710570671803514, - "eval/acc": 34.88372039794922, + "epoch": 1.9971469329529246, + "eval/acc": 37.20930099487305, "step": 2800 }, { - "epoch": 0.33710570671803514, - "eval_loss": 2.5979089736938477, - "eval_runtime": 0.212, - "eval_samples_per_second": 202.843, - "eval_steps_per_second": 4.717, + "epoch": 1.9971469329529246, + "eval_loss": 2.9081883430480957, + "eval_runtime": 0.2058, + "eval_samples_per_second": 208.905, + "eval_steps_per_second": 4.858, "step": 2800 }, { - "epoch": 0.33830965567059956, - "grad_norm": 8.125, + "epoch": 2.0042796005706136, + "grad_norm": 10.25, "learning_rate": 9.862666666666667e-05, - "loss": 0.7128, + "loss": 1.0597, "step": 2810 }, { - "epoch": 0.339513604623164, - "grad_norm": 7.0, + "epoch": 2.011412268188302, + "grad_norm": 7.0625, "learning_rate": 9.858222222222223e-05, - "loss": 0.8709, + "loss": 0.9582, "step": 2820 }, { - "epoch": 0.3407175535757284, - "grad_norm": 10.875, + "epoch": 2.0185449358059913, + "grad_norm": 7.0625, "learning_rate": 9.853777777777778e-05, - "loss": 0.6885, + "loss": 1.0058, "step": 2830 }, { - "epoch": 0.3419215025282928, - "grad_norm": 6.625, + "epoch": 2.0256776034236803, + "grad_norm": 7.09375, "learning_rate": 9.849333333333334e-05, - "loss": 0.8262, + "loss": 1.009, "step": 2840 }, { - "epoch": 0.3431254514808572, - "grad_norm": 9.0625, + "epoch": 2.0328102710413694, + "grad_norm": 8.9375, "learning_rate": 9.844888888888889e-05, - "loss": 0.6365, + "loss": 0.93, "step": 2850 }, { - "epoch": 0.34432940043342164, - "grad_norm": 7.96875, + "epoch": 2.0399429386590584, + "grad_norm": 8.1875, "learning_rate": 9.840444444444445e-05, - "loss": 0.8177, + "loss": 1.0953, "step": 2860 }, { - "epoch": 0.345533349385986, - "grad_norm": 6.71875, + "epoch": 2.0470756062767475, + "grad_norm": 7.78125, "learning_rate": 9.836000000000001e-05, - "loss": 0.7043, + "loss": 1.0437, "step": 2870 }, { - "epoch": 0.34673729833855044, - "grad_norm": 10.4375, + "epoch": 2.0542082738944365, + "grad_norm": 8.75, "learning_rate": 9.831555555555556e-05, - "loss": 0.7503, + "loss": 0.9873, "step": 2880 }, { - "epoch": 0.34794124729111486, - "grad_norm": 7.375, + "epoch": 2.0613409415121255, + "grad_norm": 8.375, "learning_rate": 9.827111111111111e-05, - "loss": 0.7532, + "loss": 0.9414, "step": 2890 }, { - "epoch": 0.3491451962436793, - "grad_norm": 7.65625, + "epoch": 2.0684736091298146, + "grad_norm": 9.0, "learning_rate": 9.822666666666667e-05, - "loss": 0.6942, + "loss": 0.9625, "step": 2900 }, { - "epoch": 0.3491451962436793, - "eval/acc": 37.79069900512695, + "epoch": 2.0684736091298146, + "eval/acc": 51.16279220581055, "step": 2900 }, { - "epoch": 0.3491451962436793, - "eval_loss": 2.698911190032959, - "eval_runtime": 1.2554, - "eval_samples_per_second": 34.253, - "eval_steps_per_second": 0.797, + "epoch": 2.0684736091298146, + "eval_loss": 1.884637713432312, + "eval_runtime": 1.4017, + "eval_samples_per_second": 30.678, + "eval_steps_per_second": 0.713, "step": 2900 }, { - "epoch": 0.35034914519624366, - "grad_norm": 7.1875, + "epoch": 2.0756062767475036, + "grad_norm": 9.5625, "learning_rate": 9.818222222222223e-05, - "loss": 0.7651, + "loss": 1.0246, "step": 2910 }, { - "epoch": 0.3515530941488081, - "grad_norm": 6.0, + "epoch": 2.0827389443651927, + "grad_norm": 8.125, "learning_rate": 9.813777777777778e-05, - "loss": 0.7786, + "loss": 0.9646, "step": 2920 }, { - "epoch": 0.3527570431013725, - "grad_norm": 9.375, + "epoch": 2.0898716119828817, + "grad_norm": 8.3125, "learning_rate": 9.809333333333333e-05, - "loss": 0.8285, + "loss": 1.0022, "step": 2930 }, { - "epoch": 0.35396099205393694, - "grad_norm": 6.4375, + "epoch": 2.097004279600571, + "grad_norm": 8.625, "learning_rate": 9.80488888888889e-05, - "loss": 0.7339, + "loss": 0.9834, "step": 2940 }, { - "epoch": 0.3551649410065013, - "grad_norm": 8.8125, + "epoch": 2.10413694721826, + "grad_norm": 45.25, "learning_rate": 9.800444444444446e-05, - "loss": 0.6948, + "loss": 0.9159, "step": 2950 }, { - "epoch": 0.35636888995906574, - "grad_norm": 11.4375, + "epoch": 2.1112696148359484, + "grad_norm": 9.375, "learning_rate": 9.796e-05, - "loss": 0.8455, + "loss": 1.0598, "step": 2960 }, { - "epoch": 0.35757283891163016, - "grad_norm": 8.5625, + "epoch": 2.1184022824536375, + "grad_norm": 6.90625, "learning_rate": 9.791555555555557e-05, - "loss": 0.791, + "loss": 0.8848, "step": 2970 }, { - "epoch": 0.35877678786419454, - "grad_norm": 7.84375, + "epoch": 2.1255349500713265, + "grad_norm": 7.5625, "learning_rate": 9.787111111111111e-05, - "loss": 0.8574, + "loss": 0.942, "step": 2980 }, { - "epoch": 0.35998073681675896, - "grad_norm": 9.4375, + "epoch": 2.1326676176890156, + "grad_norm": 8.6875, "learning_rate": 9.782666666666666e-05, - "loss": 0.7923, + "loss": 0.9583, "step": 2990 }, { - "epoch": 0.3611846857693234, - "grad_norm": 8.0625, + "epoch": 2.1398002853067046, + "grad_norm": 9.0, "learning_rate": 9.778222222222222e-05, - "loss": 0.863, + "loss": 0.9836, "step": 3000 }, { - "epoch": 0.3611846857693234, - "eval/acc": 41.86046600341797, + "epoch": 2.1398002853067046, + "eval/acc": 48.83720779418945, "step": 3000 }, { - "epoch": 0.3611846857693234, - "eval_loss": 2.5240559577941895, - "eval_runtime": 0.2105, - "eval_samples_per_second": 204.269, - "eval_steps_per_second": 4.75, + "epoch": 2.1398002853067046, + "eval_loss": 1.9625970125198364, + "eval_runtime": 0.2298, + "eval_samples_per_second": 187.09, + "eval_steps_per_second": 4.351, "step": 3000 }, { - "epoch": 0.3623886347218878, - "grad_norm": 6.71875, + "epoch": 2.1469329529243937, + "grad_norm": 8.4375, "learning_rate": 9.773777777777779e-05, - "loss": 0.7726, + "loss": 1.028, "step": 3010 }, { - "epoch": 0.3635925836744522, - "grad_norm": 8.125, + "epoch": 2.1540656205420827, + "grad_norm": 10.4375, "learning_rate": 9.769333333333334e-05, - "loss": 0.8234, + "loss": 0.9209, "step": 3020 }, { - "epoch": 0.3647965326270166, - "grad_norm": 7.90625, + "epoch": 2.1611982881597718, + "grad_norm": 9.3125, "learning_rate": 9.764888888888888e-05, - "loss": 0.8125, + "loss": 0.9999, "step": 3030 }, { - "epoch": 0.36600048157958104, - "grad_norm": 5.875, + "epoch": 2.168330955777461, + "grad_norm": 8.375, "learning_rate": 9.760444444444446e-05, - "loss": 0.739, + "loss": 0.9576, "step": 3040 }, { - "epoch": 0.3672044305321454, - "grad_norm": 32.75, + "epoch": 2.17546362339515, + "grad_norm": 7.4375, "learning_rate": 9.756000000000001e-05, - "loss": 0.8773, + "loss": 0.8832, "step": 3050 }, { - "epoch": 0.36840837948470984, - "grad_norm": 8.625, + "epoch": 2.182596291012839, + "grad_norm": 8.125, "learning_rate": 9.751555555555556e-05, - "loss": 0.6411, + "loss": 0.933, "step": 3060 }, { - "epoch": 0.36961232843727426, - "grad_norm": 10.0625, + "epoch": 2.189728958630528, + "grad_norm": 8.9375, "learning_rate": 9.747111111111112e-05, - "loss": 0.7757, + "loss": 0.9962, "step": 3070 }, { - "epoch": 0.3708162773898387, - "grad_norm": 7.78125, + "epoch": 2.196861626248217, + "grad_norm": 7.1875, "learning_rate": 9.742666666666667e-05, - "loss": 0.8144, + "loss": 1.003, "step": 3080 }, { - "epoch": 0.37202022634240306, - "grad_norm": 8.25, + "epoch": 2.2039942938659056, + "grad_norm": 8.1875, "learning_rate": 9.738222222222223e-05, - "loss": 0.7915, + "loss": 0.9441, "step": 3090 }, { - "epoch": 0.3732241752949675, - "grad_norm": 9.5, + "epoch": 2.2111269614835947, + "grad_norm": 7.21875, "learning_rate": 9.733777777777778e-05, - "loss": 0.7808, + "loss": 1.0335, "step": 3100 }, { - "epoch": 0.3732241752949675, - "eval/acc": 39.53488540649414, + "epoch": 2.2111269614835947, + "eval/acc": 51.16279220581055, "step": 3100 }, { - "epoch": 0.3732241752949675, - "eval_loss": 2.6263325214385986, - "eval_runtime": 0.2107, - "eval_samples_per_second": 204.065, - "eval_steps_per_second": 4.746, + "epoch": 2.2111269614835947, + "eval_loss": 1.8829365968704224, + "eval_runtime": 0.248, + "eval_samples_per_second": 173.381, + "eval_steps_per_second": 4.032, "step": 3100 }, { - "epoch": 0.3744281242475319, - "grad_norm": 7.34375, + "epoch": 2.2182596291012837, + "grad_norm": 9.1875, "learning_rate": 9.729333333333334e-05, - "loss": 0.6467, + "loss": 0.9694, "step": 3110 }, { - "epoch": 0.37563207320009634, - "grad_norm": 10.5625, + "epoch": 2.2253922967189728, + "grad_norm": 6.9375, "learning_rate": 9.724888888888889e-05, - "loss": 0.7271, + "loss": 1.0386, "step": 3120 }, { - "epoch": 0.3768360221526607, - "grad_norm": 19.375, + "epoch": 2.232524964336662, + "grad_norm": 8.6875, "learning_rate": 9.720444444444445e-05, - "loss": 0.8248, + "loss": 0.9614, "step": 3130 }, { - "epoch": 0.37803997110522514, - "grad_norm": 11.6875, + "epoch": 2.239657631954351, + "grad_norm": 8.3125, "learning_rate": 9.716000000000001e-05, - "loss": 0.7468, + "loss": 1.0643, "step": 3140 }, { - "epoch": 0.37924392005778956, - "grad_norm": 6.71875, + "epoch": 2.24679029957204, + "grad_norm": 8.125, "learning_rate": 9.711555555555556e-05, - "loss": 0.8189, + "loss": 0.9243, "step": 3150 }, { - "epoch": 0.38044786901035393, - "grad_norm": 7.15625, + "epoch": 2.253922967189729, + "grad_norm": 9.125, "learning_rate": 9.707111111111111e-05, - "loss": 0.7265, + "loss": 0.8419, "step": 3160 }, { - "epoch": 0.38165181796291836, - "grad_norm": 11.9375, + "epoch": 2.261055634807418, + "grad_norm": 9.125, "learning_rate": 9.702666666666667e-05, - "loss": 0.7502, + "loss": 0.9961, "step": 3170 }, { - "epoch": 0.3828557669154828, - "grad_norm": 7.78125, + "epoch": 2.268188302425107, + "grad_norm": 6.3125, "learning_rate": 9.698222222222223e-05, - "loss": 0.8412, + "loss": 0.8931, "step": 3180 }, { - "epoch": 0.3840597158680472, - "grad_norm": 6.75, + "epoch": 2.275320970042796, + "grad_norm": 7.875, "learning_rate": 9.693777777777778e-05, - "loss": 0.8689, + "loss": 1.0057, "step": 3190 }, { - "epoch": 0.3852636648206116, - "grad_norm": 7.6875, + "epoch": 2.282453637660485, + "grad_norm": 6.90625, "learning_rate": 9.689333333333333e-05, - "loss": 0.8053, + "loss": 0.9606, "step": 3200 }, { - "epoch": 0.3852636648206116, - "eval/acc": 39.53488540649414, + "epoch": 2.282453637660485, + "eval/acc": 48.83720779418945, "step": 3200 }, { - "epoch": 0.3852636648206116, - "eval_loss": 2.6145706176757812, - "eval_runtime": 0.2093, - "eval_samples_per_second": 205.398, - "eval_steps_per_second": 4.777, + "epoch": 2.282453637660485, + "eval_loss": 1.823419451713562, + "eval_runtime": 0.2149, + "eval_samples_per_second": 200.139, + "eval_steps_per_second": 4.654, "step": 3200 }, { - "epoch": 0.386467613773176, - "grad_norm": 7.65625, + "epoch": 2.289586305278174, + "grad_norm": 11.8125, "learning_rate": 9.684888888888889e-05, - "loss": 0.7601, + "loss": 0.9218, "step": 3210 }, { - "epoch": 0.38767156272574044, - "grad_norm": 19.25, + "epoch": 2.2967189728958632, + "grad_norm": 8.9375, "learning_rate": 9.680444444444445e-05, - "loss": 0.7944, + "loss": 1.0111, "step": 3220 }, { - "epoch": 0.38887551167830486, - "grad_norm": 9.375, + "epoch": 2.3038516405135523, + "grad_norm": 8.625, "learning_rate": 9.676e-05, - "loss": 0.839, + "loss": 1.0968, "step": 3230 }, { - "epoch": 0.39007946063086923, - "grad_norm": 8.5, + "epoch": 2.310984308131241, + "grad_norm": 7.1875, "learning_rate": 9.671555555555556e-05, - "loss": 0.7794, + "loss": 1.0236, "step": 3240 }, { - "epoch": 0.39128340958343366, - "grad_norm": 7.78125, + "epoch": 2.31811697574893, + "grad_norm": 6.84375, "learning_rate": 9.667111111111111e-05, - "loss": 0.753, + "loss": 0.92, "step": 3250 }, { - "epoch": 0.3924873585359981, - "grad_norm": 7.15625, + "epoch": 2.325249643366619, + "grad_norm": 8.75, "learning_rate": 9.662666666666667e-05, - "loss": 0.7326, + "loss": 0.8205, "step": 3260 }, { - "epoch": 0.39369130748856246, - "grad_norm": 13.4375, + "epoch": 2.332382310984308, + "grad_norm": 30.75, "learning_rate": 9.658222222222222e-05, - "loss": 0.6754, + "loss": 0.9676, "step": 3270 }, { - "epoch": 0.3948952564411269, - "grad_norm": 6.71875, + "epoch": 2.339514978601997, + "grad_norm": 13.0, "learning_rate": 9.653777777777778e-05, - "loss": 0.757, + "loss": 0.9086, "step": 3280 }, { - "epoch": 0.3960992053936913, - "grad_norm": 7.5625, + "epoch": 2.346647646219686, + "grad_norm": 9.375, "learning_rate": 9.649333333333333e-05, - "loss": 0.9203, + "loss": 1.0504, "step": 3290 }, { - "epoch": 0.39730315434625574, - "grad_norm": 8.375, + "epoch": 2.353780313837375, + "grad_norm": 39.0, "learning_rate": 9.64488888888889e-05, - "loss": 0.8552, + "loss": 0.9481, "step": 3300 }, { - "epoch": 0.39730315434625574, - "eval/acc": 44.1860466003418, + "epoch": 2.353780313837375, + "eval/acc": 46.511627197265625, "step": 3300 }, { - "epoch": 0.39730315434625574, - "eval_loss": 2.571866273880005, - "eval_runtime": 0.2083, - "eval_samples_per_second": 206.479, - "eval_steps_per_second": 4.802, + "epoch": 2.353780313837375, + "eval_loss": 1.9555243253707886, + "eval_runtime": 0.6637, + "eval_samples_per_second": 64.792, + "eval_steps_per_second": 1.507, "step": 3300 }, { - "epoch": 0.3985071032988201, - "grad_norm": 7.5625, + "epoch": 2.3609129814550642, + "grad_norm": 11.4375, "learning_rate": 9.640444444444446e-05, - "loss": 0.7811, + "loss": 0.9641, "step": 3310 }, { - "epoch": 0.39971105225138454, - "grad_norm": 11.75, + "epoch": 2.3680456490727533, + "grad_norm": 9.0625, "learning_rate": 9.636e-05, - "loss": 0.6717, + "loss": 0.9624, "step": 3320 }, { - "epoch": 0.40091500120394896, - "grad_norm": 8.1875, + "epoch": 2.3751783166904423, + "grad_norm": 12.625, "learning_rate": 9.631555555555555e-05, - "loss": 0.838, + "loss": 1.0082, "step": 3330 }, { - "epoch": 0.4021189501565134, - "grad_norm": 6.40625, + "epoch": 2.3823109843081314, + "grad_norm": 7.25, "learning_rate": 9.627111111111112e-05, - "loss": 0.8568, + "loss": 1.0249, "step": 3340 }, { - "epoch": 0.40332289910907776, - "grad_norm": 7.3125, + "epoch": 2.3894436519258204, + "grad_norm": 13.375, "learning_rate": 9.622666666666668e-05, - "loss": 0.6742, + "loss": 1.0153, "step": 3350 }, { - "epoch": 0.4045268480616422, - "grad_norm": 7.875, + "epoch": 2.3965763195435095, + "grad_norm": 6.6875, "learning_rate": 9.618222222222223e-05, - "loss": 0.7849, + "loss": 0.9533, "step": 3360 }, { - "epoch": 0.4057307970142066, - "grad_norm": 8.5625, + "epoch": 2.403708987161198, + "grad_norm": 9.25, "learning_rate": 9.613777777777779e-05, - "loss": 0.7537, + "loss": 1.1051, "step": 3370 }, { - "epoch": 0.406934745966771, - "grad_norm": 8.5625, + "epoch": 2.410841654778887, + "grad_norm": 9.5625, "learning_rate": 9.609333333333334e-05, - "loss": 0.6935, + "loss": 1.0551, "step": 3380 }, { - "epoch": 0.4081386949193354, - "grad_norm": 6.3125, + "epoch": 2.417974322396576, + "grad_norm": 7.21875, "learning_rate": 9.604888888888889e-05, - "loss": 0.8065, + "loss": 0.9032, "step": 3390 }, { - "epoch": 0.40934264387189984, - "grad_norm": 26.25, + "epoch": 2.425106990014265, + "grad_norm": 8.5625, "learning_rate": 9.600444444444445e-05, - "loss": 0.6558, + "loss": 1.1008, "step": 3400 }, { - "epoch": 0.40934264387189984, - "eval/acc": 37.20930099487305, + "epoch": 2.425106990014265, + "eval/acc": 51.16279220581055, "step": 3400 }, { - "epoch": 0.40934264387189984, - "eval_loss": 2.7212982177734375, - "eval_runtime": 0.2094, - "eval_samples_per_second": 205.345, - "eval_steps_per_second": 4.775, + "epoch": 2.425106990014265, + "eval_loss": 1.7766540050506592, + "eval_runtime": 0.2273, + "eval_samples_per_second": 189.16, + "eval_steps_per_second": 4.399, "step": 3400 }, { - "epoch": 0.41054659282446426, - "grad_norm": 6.84375, + "epoch": 2.4322396576319543, + "grad_norm": 10.375, "learning_rate": 9.596000000000001e-05, - "loss": 0.7642, + "loss": 0.9562, "step": 3410 }, { - "epoch": 0.41175054177702863, - "grad_norm": 7.0625, + "epoch": 2.4393723252496433, + "grad_norm": 8.9375, "learning_rate": 9.591555555555556e-05, - "loss": 0.7185, + "loss": 1.0756, "step": 3420 }, { - "epoch": 0.41295449072959306, - "grad_norm": 7.15625, + "epoch": 2.4465049928673324, + "grad_norm": 9.125, "learning_rate": 9.58711111111111e-05, - "loss": 0.6634, + "loss": 0.9554, "step": 3430 }, { - "epoch": 0.4141584396821575, - "grad_norm": 4.96875, + "epoch": 2.4536376604850214, + "grad_norm": 8.9375, "learning_rate": 9.582666666666668e-05, - "loss": 0.6383, + "loss": 0.9122, "step": 3440 }, { - "epoch": 0.4153623886347219, - "grad_norm": 7.15625, + "epoch": 2.4607703281027105, + "grad_norm": 8.625, "learning_rate": 9.578222222222223e-05, - "loss": 0.8032, + "loss": 0.9311, "step": 3450 }, { - "epoch": 0.4165663375872863, - "grad_norm": 9.0625, + "epoch": 2.4679029957203995, + "grad_norm": 6.65625, "learning_rate": 9.573777777777778e-05, - "loss": 0.7294, + "loss": 1.0023, "step": 3460 }, { - "epoch": 0.4177702865398507, - "grad_norm": 9.5, + "epoch": 2.4750356633380886, + "grad_norm": 8.125, "learning_rate": 9.569333333333334e-05, - "loss": 0.802, + "loss": 0.9172, "step": 3470 }, { - "epoch": 0.41897423549241514, - "grad_norm": 7.0, + "epoch": 2.4821683309557776, + "grad_norm": 7.375, "learning_rate": 9.56488888888889e-05, - "loss": 0.7307, + "loss": 0.9407, "step": 3480 }, { - "epoch": 0.4201781844449795, - "grad_norm": 6.34375, + "epoch": 2.4893009985734667, + "grad_norm": 10.25, "learning_rate": 9.560444444444445e-05, - "loss": 0.7239, + "loss": 0.9433, "step": 3490 }, { - "epoch": 0.42138213339754393, - "grad_norm": 6.5, + "epoch": 2.4964336661911553, + "grad_norm": 8.625, "learning_rate": 9.556e-05, - "loss": 0.6711, + "loss": 0.9934, "step": 3500 }, { - "epoch": 0.42138213339754393, - "eval/acc": 39.53488540649414, + "epoch": 2.4964336661911553, + "eval/acc": 53.488372802734375, "step": 3500 }, { - "epoch": 0.42138213339754393, - "eval_loss": 2.569326400756836, - "eval_runtime": 0.2066, - "eval_samples_per_second": 208.137, - "eval_steps_per_second": 4.84, + "epoch": 2.4964336661911553, + "eval_loss": 1.9511696100234985, + "eval_runtime": 0.2213, + "eval_samples_per_second": 194.329, + "eval_steps_per_second": 4.519, "step": 3500 }, { - "epoch": 0.42258608235010836, - "grad_norm": 8.125, + "epoch": 2.5035663338088447, + "grad_norm": 7.625, "learning_rate": 9.551555555555556e-05, - "loss": 0.695, + "loss": 0.9157, "step": 3510 }, { - "epoch": 0.4237900313026728, - "grad_norm": 8.3125, + "epoch": 2.5106990014265333, + "grad_norm": 7.4375, "learning_rate": 9.547111111111111e-05, - "loss": 0.8691, + "loss": 0.9202, "step": 3520 }, { - "epoch": 0.42499398025523716, - "grad_norm": 8.6875, + "epoch": 2.5178316690442224, + "grad_norm": 9.25, "learning_rate": 9.542666666666667e-05, - "loss": 0.7582, + "loss": 0.8526, "step": 3530 }, { - "epoch": 0.4261979292078016, - "grad_norm": 7.25, + "epoch": 2.5249643366619114, + "grad_norm": 7.71875, "learning_rate": 9.538222222222223e-05, - "loss": 0.7143, + "loss": 0.9562, "step": 3540 }, { - "epoch": 0.427401878160366, - "grad_norm": 8.6875, + "epoch": 2.5320970042796005, + "grad_norm": 9.75, "learning_rate": 9.533777777777778e-05, - "loss": 0.6754, + "loss": 0.9927, "step": 3550 }, { - "epoch": 0.42860582711293044, - "grad_norm": 7.8125, + "epoch": 2.5392296718972895, + "grad_norm": 8.1875, "learning_rate": 9.529333333333333e-05, - "loss": 0.7153, + "loss": 0.9263, "step": 3560 }, { - "epoch": 0.4298097760654948, - "grad_norm": 7.5625, + "epoch": 2.5463623395149786, + "grad_norm": 6.9375, "learning_rate": 9.52488888888889e-05, - "loss": 0.7293, + "loss": 0.9367, "step": 3570 }, { - "epoch": 0.43101372501805923, - "grad_norm": 7.5625, + "epoch": 2.5534950071326676, + "grad_norm": 9.5625, "learning_rate": 9.520444444444446e-05, - "loss": 0.7066, + "loss": 0.9284, "step": 3580 }, { - "epoch": 0.43221767397062366, - "grad_norm": 8.1875, + "epoch": 2.5606276747503567, + "grad_norm": 8.5625, "learning_rate": 9.516e-05, - "loss": 0.691, + "loss": 0.8394, "step": 3590 }, { - "epoch": 0.43342162292318803, - "grad_norm": 7.125, + "epoch": 2.5677603423680457, + "grad_norm": 10.25, "learning_rate": 9.511555555555555e-05, - "loss": 0.8239, + "loss": 0.9336, "step": 3600 }, { - "epoch": 0.43342162292318803, - "eval/acc": 44.1860466003418, + "epoch": 2.5677603423680457, + "eval/acc": 46.511627197265625, "step": 3600 }, { - "epoch": 0.43342162292318803, - "eval_loss": 2.4877374172210693, - "eval_runtime": 0.3957, - "eval_samples_per_second": 108.658, - "eval_steps_per_second": 2.527, + "epoch": 2.5677603423680457, + "eval_loss": 1.9759221076965332, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.935, + "eval_steps_per_second": 4.719, "step": 3600 }, { - "epoch": 0.43462557187575246, - "grad_norm": 6.375, + "epoch": 2.574893009985735, + "grad_norm": 10.0625, "learning_rate": 9.507111111111111e-05, - "loss": 0.6782, + "loss": 1.0005, "step": 3610 }, { - "epoch": 0.4358295208283169, - "grad_norm": 7.1875, + "epoch": 2.582025677603424, + "grad_norm": 8.375, "learning_rate": 9.502666666666668e-05, - "loss": 0.7602, + "loss": 0.9319, "step": 3620 }, { - "epoch": 0.4370334697808813, - "grad_norm": 8.125, + "epoch": 2.5891583452211124, + "grad_norm": 8.5, "learning_rate": 9.498222222222222e-05, - "loss": 0.7232, + "loss": 0.9125, "step": 3630 }, { - "epoch": 0.4382374187334457, - "grad_norm": 7.84375, + "epoch": 2.596291012838802, + "grad_norm": 7.71875, "learning_rate": 9.493777777777779e-05, - "loss": 0.729, + "loss": 0.9279, "step": 3640 }, { - "epoch": 0.4394413676860101, - "grad_norm": 8.375, + "epoch": 2.6034236804564905, + "grad_norm": 11.875, "learning_rate": 9.489333333333334e-05, - "loss": 0.8222, + "loss": 0.952, "step": 3650 }, { - "epoch": 0.44064531663857454, - "grad_norm": 8.125, + "epoch": 2.6105563480741796, + "grad_norm": 7.5625, "learning_rate": 9.48488888888889e-05, - "loss": 0.6918, + "loss": 1.0043, "step": 3660 }, { - "epoch": 0.44184926559113896, - "grad_norm": 8.1875, + "epoch": 2.6176890156918686, + "grad_norm": 11.5625, "learning_rate": 9.480444444444445e-05, - "loss": 0.6761, + "loss": 0.8932, "step": 3670 }, { - "epoch": 0.44305321454370333, - "grad_norm": 5.65625, + "epoch": 2.6248216833095577, + "grad_norm": 7.84375, "learning_rate": 9.476000000000001e-05, - "loss": 0.7532, + "loss": 0.8775, "step": 3680 }, { - "epoch": 0.44425716349626776, - "grad_norm": 8.8125, + "epoch": 2.6319543509272467, + "grad_norm": 9.0, "learning_rate": 9.471555555555556e-05, - "loss": 0.7072, + "loss": 0.9756, "step": 3690 }, { - "epoch": 0.4454611124488322, - "grad_norm": 6.5625, + "epoch": 2.6390870185449358, + "grad_norm": 7.375, "learning_rate": 9.46711111111111e-05, - "loss": 0.8405, + "loss": 0.9345, "step": 3700 }, { - "epoch": 0.4454611124488322, - "eval/acc": 39.53488540649414, + "epoch": 2.6390870185449358, + "eval/acc": 51.16279220581055, "step": 3700 }, { - "epoch": 0.4454611124488322, - "eval_loss": 2.615053176879883, - "eval_runtime": 4.8304, - "eval_samples_per_second": 8.902, - "eval_steps_per_second": 0.207, + "epoch": 2.6390870185449358, + "eval_loss": 1.9134849309921265, + "eval_runtime": 0.2322, + "eval_samples_per_second": 185.194, + "eval_steps_per_second": 4.307, "step": 3700 }, { - "epoch": 0.44666506140139656, - "grad_norm": 8.6875, + "epoch": 2.646219686162625, + "grad_norm": 8.4375, "learning_rate": 9.462666666666668e-05, - "loss": 0.7249, + "loss": 0.9851, "step": 3710 }, { - "epoch": 0.447869010353961, - "grad_norm": 8.4375, + "epoch": 2.653352353780314, + "grad_norm": 31.75, "learning_rate": 9.458222222222223e-05, - "loss": 0.8561, + "loss": 0.9712, "step": 3720 }, { - "epoch": 0.4490729593065254, - "grad_norm": 7.3125, + "epoch": 2.660485021398003, + "grad_norm": 6.75, "learning_rate": 9.453777777777778e-05, - "loss": 0.7884, + "loss": 0.8641, "step": 3730 }, { - "epoch": 0.45027690825908984, - "grad_norm": 7.34375, + "epoch": 2.667617689015692, + "grad_norm": 6.5625, "learning_rate": 9.449333333333334e-05, - "loss": 0.7169, + "loss": 0.945, "step": 3740 }, { - "epoch": 0.4514808572116542, - "grad_norm": 5.5, + "epoch": 2.674750356633381, + "grad_norm": 6.0625, "learning_rate": 9.44488888888889e-05, - "loss": 0.7542, + "loss": 0.9535, "step": 3750 }, { - "epoch": 0.45268480616421863, - "grad_norm": 6.09375, + "epoch": 2.68188302425107, + "grad_norm": 7.90625, "learning_rate": 9.440444444444445e-05, - "loss": 0.6292, + "loss": 0.8844, "step": 3760 }, { - "epoch": 0.45388875511678306, - "grad_norm": 8.9375, + "epoch": 2.689015691868759, + "grad_norm": 9.8125, "learning_rate": 9.436e-05, - "loss": 0.6682, + "loss": 0.9064, "step": 3770 }, { - "epoch": 0.4550927040693475, - "grad_norm": 5.09375, + "epoch": 2.6961483594864477, + "grad_norm": 8.4375, "learning_rate": 9.431555555555556e-05, - "loss": 0.6499, + "loss": 1.0119, "step": 3780 }, { - "epoch": 0.45629665302191186, - "grad_norm": 8.5, + "epoch": 2.703281027104137, + "grad_norm": 7.15625, "learning_rate": 9.427111111111112e-05, - "loss": 0.7859, + "loss": 0.9655, "step": 3790 }, { - "epoch": 0.4575006019744763, - "grad_norm": 14.5, + "epoch": 2.710413694721826, + "grad_norm": 9.4375, "learning_rate": 9.422666666666667e-05, - "loss": 0.7987, + "loss": 0.9187, "step": 3800 }, { - "epoch": 0.4575006019744763, - "eval/acc": 39.53488540649414, + "epoch": 2.710413694721826, + "eval/acc": 51.16279220581055, "step": 3800 }, { - "epoch": 0.4575006019744763, - "eval_loss": 2.645066022872925, - "eval_runtime": 0.6165, - "eval_samples_per_second": 69.745, - "eval_steps_per_second": 1.622, + "epoch": 2.710413694721826, + "eval_loss": 1.9277268648147583, + "eval_runtime": 0.2166, + "eval_samples_per_second": 198.481, + "eval_steps_per_second": 4.616, "step": 3800 }, { - "epoch": 0.4587045509270407, - "grad_norm": 6.25, + "epoch": 2.717546362339515, + "grad_norm": 9.25, "learning_rate": 9.418222222222223e-05, - "loss": 0.7035, + "loss": 0.8689, "step": 3810 }, { - "epoch": 0.4599084998796051, - "grad_norm": 6.46875, + "epoch": 2.724679029957204, + "grad_norm": 8.0625, "learning_rate": 9.413777777777778e-05, - "loss": 0.6329, + "loss": 0.9138, "step": 3820 }, { - "epoch": 0.4611124488321695, - "grad_norm": 8.875, + "epoch": 2.731811697574893, + "grad_norm": 14.3125, "learning_rate": 9.409333333333333e-05, - "loss": 0.7553, + "loss": 0.9129, "step": 3830 }, { - "epoch": 0.46231639778473393, - "grad_norm": 9.3125, + "epoch": 2.738944365192582, + "grad_norm": 6.78125, "learning_rate": 9.404888888888889e-05, - "loss": 0.6551, + "loss": 0.8666, "step": 3840 }, { - "epoch": 0.46352034673729836, - "grad_norm": 11.0625, + "epoch": 2.746077032810271, + "grad_norm": 7.4375, "learning_rate": 9.400444444444445e-05, - "loss": 0.6634, + "loss": 0.9474, "step": 3850 }, { - "epoch": 0.46472429568986273, - "grad_norm": 6.71875, + "epoch": 2.75320970042796, + "grad_norm": 7.46875, "learning_rate": 9.396e-05, - "loss": 0.6527, + "loss": 0.9312, "step": 3860 }, { - "epoch": 0.46592824464242716, - "grad_norm": 6.75, + "epoch": 2.760342368045649, + "grad_norm": 7.84375, "learning_rate": 9.391555555555555e-05, - "loss": 0.8268, + "loss": 0.943, "step": 3870 }, { - "epoch": 0.4671321935949916, - "grad_norm": 7.78125, + "epoch": 2.767475035663338, + "grad_norm": 8.125, "learning_rate": 9.387111111111113e-05, - "loss": 0.742, + "loss": 0.9471, "step": 3880 }, { - "epoch": 0.468336142547556, - "grad_norm": 6.53125, + "epoch": 2.7746077032810272, + "grad_norm": 10.5625, "learning_rate": 9.382666666666667e-05, - "loss": 0.7446, + "loss": 0.9785, "step": 3890 }, { - "epoch": 0.4695400915001204, - "grad_norm": 7.0625, + "epoch": 2.7817403708987163, + "grad_norm": 10.5, "learning_rate": 9.378222222222222e-05, - "loss": 0.7764, + "loss": 1.0151, "step": 3900 }, { - "epoch": 0.4695400915001204, - "eval/acc": 37.79069900512695, + "epoch": 2.7817403708987163, + "eval/acc": 44.1860466003418, "step": 3900 }, { - "epoch": 0.4695400915001204, - "eval_loss": 2.6463897228240967, - "eval_runtime": 1.4145, - "eval_samples_per_second": 30.4, - "eval_steps_per_second": 0.707, + "epoch": 2.7817403708987163, + "eval_loss": 1.970198154449463, + "eval_runtime": 0.2139, + "eval_samples_per_second": 201.028, + "eval_steps_per_second": 4.675, "step": 3900 }, { - "epoch": 0.4707440404526848, - "grad_norm": 5.625, + "epoch": 2.788873038516405, + "grad_norm": 9.75, "learning_rate": 9.373777777777778e-05, - "loss": 0.7248, + "loss": 0.9148, "step": 3910 }, { - "epoch": 0.47194798940524924, - "grad_norm": 7.09375, + "epoch": 2.7960057061340944, + "grad_norm": 9.1875, "learning_rate": 9.369333333333333e-05, - "loss": 0.6977, + "loss": 1.0314, "step": 3920 }, { - "epoch": 0.4731519383578136, - "grad_norm": 7.53125, + "epoch": 2.803138373751783, + "grad_norm": 8.375, "learning_rate": 9.36488888888889e-05, - "loss": 0.6496, + "loss": 0.9076, "step": 3930 }, { - "epoch": 0.47435588731037803, - "grad_norm": 11.0, + "epoch": 2.810271041369472, + "grad_norm": 6.46875, "learning_rate": 9.360444444444444e-05, - "loss": 0.7309, + "loss": 0.8218, "step": 3940 }, { - "epoch": 0.47555983626294246, - "grad_norm": 10.5625, + "epoch": 2.817403708987161, + "grad_norm": 7.96875, "learning_rate": 9.356e-05, - "loss": 0.7837, + "loss": 0.9415, "step": 3950 }, { - "epoch": 0.4767637852155069, - "grad_norm": 6.9375, + "epoch": 2.82453637660485, + "grad_norm": 7.53125, "learning_rate": 9.351555555555555e-05, - "loss": 0.6769, + "loss": 0.9593, "step": 3960 }, { - "epoch": 0.47796773416807126, - "grad_norm": 6.84375, + "epoch": 2.831669044222539, + "grad_norm": 5.96875, "learning_rate": 9.347111111111112e-05, - "loss": 0.642, + "loss": 0.9134, "step": 3970 }, { - "epoch": 0.4791716831206357, - "grad_norm": 9.125, + "epoch": 2.8388017118402282, + "grad_norm": 8.25, "learning_rate": 9.342666666666668e-05, - "loss": 0.6947, + "loss": 0.9339, "step": 3980 }, { - "epoch": 0.4803756320732001, - "grad_norm": 7.4375, + "epoch": 2.8459343794579173, + "grad_norm": 9.625, "learning_rate": 9.338222222222223e-05, - "loss": 0.5902, + "loss": 1.0018, "step": 3990 }, { - "epoch": 0.4815795810257645, - "grad_norm": 8.1875, + "epoch": 2.8530670470756063, + "grad_norm": 7.8125, "learning_rate": 9.333777777777777e-05, - "loss": 0.6075, + "loss": 0.9302, "step": 4000 }, { - "epoch": 0.4815795810257645, - "eval/acc": 34.88372039794922, + "epoch": 2.8530670470756063, + "eval/acc": 46.511627197265625, "step": 4000 }, { - "epoch": 0.4815795810257645, - "eval_loss": 2.6985960006713867, - "eval_runtime": 0.2767, - "eval_samples_per_second": 155.399, - "eval_steps_per_second": 3.614, + "epoch": 2.8530670470756063, + "eval_loss": 1.830549716949463, + "eval_runtime": 0.2202, + "eval_samples_per_second": 195.244, + "eval_steps_per_second": 4.541, "step": 4000 }, { - "epoch": 0.4827835299783289, - "grad_norm": 6.8125, + "epoch": 2.8601997146932954, + "grad_norm": 6.65625, "learning_rate": 9.329333333333334e-05, - "loss": 0.7166, + "loss": 0.9375, "step": 4010 }, { - "epoch": 0.48398747893089333, - "grad_norm": 6.375, + "epoch": 2.8673323823109844, + "grad_norm": 11.875, "learning_rate": 9.32488888888889e-05, - "loss": 0.6136, + "loss": 0.8406, "step": 4020 }, { - "epoch": 0.48519142788345776, - "grad_norm": 6.09375, + "epoch": 2.8744650499286735, + "grad_norm": 8.1875, "learning_rate": 9.320444444444445e-05, - "loss": 0.7948, + "loss": 0.8863, "step": 4030 }, { - "epoch": 0.48639537683602213, - "grad_norm": 7.5625, + "epoch": 2.881597717546362, + "grad_norm": 6.9375, "learning_rate": 9.316000000000001e-05, - "loss": 0.7253, + "loss": 0.9546, "step": 4040 }, { - "epoch": 0.48759932578858656, - "grad_norm": 7.1875, + "epoch": 2.8887303851640516, + "grad_norm": 8.625, "learning_rate": 9.311555555555556e-05, - "loss": 0.7386, + "loss": 1.0175, "step": 4050 }, { - "epoch": 0.488803274741151, - "grad_norm": 7.71875, + "epoch": 2.89586305278174, + "grad_norm": 45.0, "learning_rate": 9.307111111111112e-05, - "loss": 0.7222, + "loss": 0.9058, "step": 4060 }, { - "epoch": 0.4900072236937154, - "grad_norm": 10.8125, + "epoch": 2.9029957203994297, + "grad_norm": 13.625, "learning_rate": 9.302666666666667e-05, - "loss": 0.6298, + "loss": 0.9137, "step": 4070 }, { - "epoch": 0.4912111726462798, - "grad_norm": 14.25, + "epoch": 2.9101283880171183, + "grad_norm": 6.8125, "learning_rate": 9.298222222222223e-05, - "loss": 0.6551, + "loss": 0.8862, "step": 4080 }, { - "epoch": 0.4924151215988442, - "grad_norm": 7.75, + "epoch": 2.9172610556348073, + "grad_norm": 13.8125, "learning_rate": 9.293777777777778e-05, - "loss": 0.7201, + "loss": 0.9152, "step": 4090 }, { - "epoch": 0.49361907055140863, - "grad_norm": 9.0625, + "epoch": 2.9243937232524964, + "grad_norm": 13.3125, "learning_rate": 9.289333333333334e-05, - "loss": 0.708, + "loss": 0.9623, "step": 4100 }, { - "epoch": 0.49361907055140863, - "eval/acc": 34.88372039794922, + "epoch": 2.9243937232524964, + "eval/acc": 46.511627197265625, "step": 4100 }, { - "epoch": 0.49361907055140863, - "eval_loss": 2.7673676013946533, - "eval_runtime": 0.3468, - "eval_samples_per_second": 124.003, - "eval_steps_per_second": 2.884, + "epoch": 2.9243937232524964, + "eval_loss": 1.9800893068313599, + "eval_runtime": 0.2162, + "eval_samples_per_second": 198.931, + "eval_steps_per_second": 4.626, "step": 4100 }, { - "epoch": 0.494823019503973, - "grad_norm": 7.9375, + "epoch": 2.9315263908701854, + "grad_norm": 7.1875, "learning_rate": 9.28488888888889e-05, - "loss": 0.6997, + "loss": 0.9088, "step": 4110 }, { - "epoch": 0.49602696845653743, - "grad_norm": 6.84375, + "epoch": 2.9386590584878745, + "grad_norm": 8.3125, "learning_rate": 9.280444444444445e-05, - "loss": 0.6195, + "loss": 0.9927, "step": 4120 }, { - "epoch": 0.49723091740910186, - "grad_norm": 7.40625, + "epoch": 2.9457917261055635, + "grad_norm": 75.0, "learning_rate": 9.276e-05, - "loss": 0.765, + "loss": 0.912, "step": 4130 }, { - "epoch": 0.4984348663616663, - "grad_norm": 7.8125, + "epoch": 2.9529243937232525, + "grad_norm": 9.125, "learning_rate": 9.271555555555556e-05, - "loss": 0.7097, + "loss": 0.9878, "step": 4140 }, { - "epoch": 0.49963881531423066, - "grad_norm": 7.75, + "epoch": 2.9600570613409416, + "grad_norm": 7.125, "learning_rate": 9.267111111111112e-05, - "loss": 0.7067, + "loss": 0.8785, "step": 4150 }, { - "epoch": 0.5008427642667951, - "grad_norm": 27.875, + "epoch": 2.9671897289586306, + "grad_norm": 8.25, "learning_rate": 9.262666666666667e-05, - "loss": 0.7989, + "loss": 0.9296, "step": 4160 }, { - "epoch": 0.5020467132193595, - "grad_norm": 8.0, + "epoch": 2.9743223965763197, + "grad_norm": 8.75, "learning_rate": 9.258222222222222e-05, - "loss": 0.6744, + "loss": 0.9284, "step": 4170 }, { - "epoch": 0.5032506621719239, - "grad_norm": 7.96875, + "epoch": 2.9814550641940087, + "grad_norm": 8.8125, "learning_rate": 9.253777777777778e-05, - "loss": 0.738, + "loss": 0.9566, "step": 4180 }, { - "epoch": 0.5044546111244883, - "grad_norm": 7.21875, + "epoch": 2.9885877318116973, + "grad_norm": 6.90625, "learning_rate": 9.249333333333334e-05, - "loss": 0.7021, + "loss": 0.8368, "step": 4190 }, { - "epoch": 0.5056585600770528, - "grad_norm": 9.6875, + "epoch": 2.995720399429387, + "grad_norm": 9.875, "learning_rate": 9.244888888888889e-05, - "loss": 0.7133, + "loss": 1.0306, "step": 4200 }, { - "epoch": 0.5056585600770528, - "eval/acc": 32.55813980102539, + "epoch": 2.995720399429387, + "eval/acc": 51.16279220581055, "step": 4200 }, { - "epoch": 0.5056585600770528, - "eval_loss": 2.7288577556610107, - "eval_runtime": 0.2266, - "eval_samples_per_second": 189.803, - "eval_steps_per_second": 4.414, + "epoch": 2.995720399429387, + "eval_loss": 1.8740426301956177, + "eval_runtime": 0.2157, + "eval_samples_per_second": 199.368, + "eval_steps_per_second": 4.636, "step": 4200 }, { - "epoch": 0.5068625090296172, - "grad_norm": 10.5, + "epoch": 3.0028530670470754, + "grad_norm": 9.5625, "learning_rate": 9.240444444444445e-05, - "loss": 0.6886, + "loss": 0.957, "step": 4210 }, { - "epoch": 0.5080664579821815, - "grad_norm": 9.0625, + "epoch": 3.0099857346647645, + "grad_norm": 13.5625, "learning_rate": 9.236e-05, - "loss": 0.7944, + "loss": 0.884, "step": 4220 }, { - "epoch": 0.509270406934746, - "grad_norm": 7.78125, + "epoch": 3.0171184022824535, + "grad_norm": 8.0625, "learning_rate": 9.231555555555555e-05, - "loss": 0.7869, + "loss": 0.9064, "step": 4230 }, { - "epoch": 0.5104743558873104, - "grad_norm": 6.375, + "epoch": 3.0242510699001426, + "grad_norm": 8.0, "learning_rate": 9.227111111111111e-05, - "loss": 0.6245, + "loss": 0.9164, "step": 4240 }, { - "epoch": 0.5116783048398748, - "grad_norm": 9.9375, + "epoch": 3.0313837375178316, + "grad_norm": 8.4375, "learning_rate": 9.222666666666668e-05, - "loss": 0.7006, + "loss": 0.9787, "step": 4250 }, { - "epoch": 0.5128822537924392, - "grad_norm": 6.1875, + "epoch": 3.0385164051355207, + "grad_norm": 7.6875, "learning_rate": 9.218222222222222e-05, - "loss": 0.7588, + "loss": 0.8852, "step": 4260 }, { - "epoch": 0.5140862027450036, - "grad_norm": 10.6875, + "epoch": 3.0456490727532097, + "grad_norm": 7.15625, "learning_rate": 9.213777777777777e-05, - "loss": 0.737, + "loss": 1.0092, "step": 4270 }, { - "epoch": 0.515290151697568, - "grad_norm": 6.15625, + "epoch": 3.0527817403708988, + "grad_norm": 6.65625, "learning_rate": 9.209333333333335e-05, - "loss": 0.6774, + "loss": 0.9972, "step": 4280 }, { - "epoch": 0.5164941006501325, - "grad_norm": 8.8125, + "epoch": 3.059914407988588, + "grad_norm": 7.25, "learning_rate": 9.20488888888889e-05, - "loss": 0.6972, + "loss": 0.9237, "step": 4290 }, { - "epoch": 0.5176980496026968, - "grad_norm": 6.40625, + "epoch": 3.067047075606277, + "grad_norm": 6.4375, "learning_rate": 9.200444444444445e-05, - "loss": 0.6423, + "loss": 0.9096, "step": 4300 }, { - "epoch": 0.5176980496026968, - "eval/acc": 38.953487396240234, + "epoch": 3.067047075606277, + "eval/acc": 39.53488540649414, "step": 4300 }, { - "epoch": 0.5176980496026968, - "eval_loss": 2.7444300651550293, - "eval_runtime": 0.2708, - "eval_samples_per_second": 158.776, - "eval_steps_per_second": 3.692, + "epoch": 3.067047075606277, + "eval_loss": 2.0920908451080322, + "eval_runtime": 1.7384, + "eval_samples_per_second": 24.736, + "eval_steps_per_second": 0.575, "step": 4300 }, { - "epoch": 0.5189019985552613, - "grad_norm": 6.8125, + "epoch": 3.074179743223966, + "grad_norm": 8.4375, "learning_rate": 9.196000000000001e-05, - "loss": 0.7705, + "loss": 0.9697, "step": 4310 }, { - "epoch": 0.5201059475078257, - "grad_norm": 5.90625, + "epoch": 3.081312410841655, + "grad_norm": 8.4375, "learning_rate": 9.191555555555556e-05, - "loss": 0.7534, + "loss": 0.8379, "step": 4320 }, { - "epoch": 0.52130989646039, - "grad_norm": 9.25, + "epoch": 3.088445078459344, + "grad_norm": 8.125, "learning_rate": 9.187111111111112e-05, - "loss": 0.6586, + "loss": 0.8576, "step": 4330 }, { - "epoch": 0.5225138454129545, - "grad_norm": 7.53125, + "epoch": 3.0955777460770326, + "grad_norm": 10.75, "learning_rate": 9.182666666666667e-05, - "loss": 0.7459, + "loss": 0.9616, "step": 4340 }, { - "epoch": 0.5237177943655189, - "grad_norm": 6.09375, + "epoch": 3.1027104136947217, + "grad_norm": 6.84375, "learning_rate": 9.178222222222223e-05, - "loss": 0.7088, + "loss": 0.7674, "step": 4350 }, { - "epoch": 0.5249217433180833, - "grad_norm": 8.5, + "epoch": 3.1098430813124107, + "grad_norm": 8.375, "learning_rate": 9.173777777777778e-05, - "loss": 0.7313, + "loss": 0.8712, "step": 4360 }, { - "epoch": 0.5261256922706478, - "grad_norm": 8.8125, + "epoch": 3.1169757489300998, + "grad_norm": 8.375, "learning_rate": 9.169333333333334e-05, - "loss": 0.7364, + "loss": 0.8599, "step": 4370 }, { - "epoch": 0.5273296412232121, - "grad_norm": 7.09375, + "epoch": 3.124108416547789, + "grad_norm": 7.1875, "learning_rate": 9.16488888888889e-05, - "loss": 0.6962, + "loss": 0.9736, "step": 4380 }, { - "epoch": 0.5285335901757765, - "grad_norm": 6.28125, + "epoch": 3.131241084165478, + "grad_norm": 7.75, "learning_rate": 9.160444444444445e-05, - "loss": 0.6817, + "loss": 0.8663, "step": 4390 }, { - "epoch": 0.529737539128341, - "grad_norm": 8.25, + "epoch": 3.138373751783167, + "grad_norm": 7.53125, "learning_rate": 9.156e-05, - "loss": 0.6786, + "loss": 0.9221, "step": 4400 }, { - "epoch": 0.529737539128341, - "eval/acc": 34.88372039794922, + "epoch": 3.138373751783167, + "eval/acc": 39.53488540649414, "step": 4400 }, { - "epoch": 0.529737539128341, - "eval_loss": 2.728501081466675, - "eval_runtime": 0.3599, - "eval_samples_per_second": 119.474, - "eval_steps_per_second": 2.778, + "epoch": 3.138373751783167, + "eval_loss": 2.127244710922241, + "eval_runtime": 0.2239, + "eval_samples_per_second": 192.022, + "eval_steps_per_second": 4.466, "step": 4400 }, { - "epoch": 0.5309414880809054, - "grad_norm": 7.59375, + "epoch": 3.145506419400856, + "grad_norm": 8.125, "learning_rate": 9.151555555555556e-05, - "loss": 0.6744, + "loss": 0.9144, "step": 4410 }, { - "epoch": 0.5321454370334697, - "grad_norm": 8.0625, + "epoch": 3.152639087018545, + "grad_norm": 7.46875, "learning_rate": 9.147111111111112e-05, - "loss": 0.8287, + "loss": 0.9445, "step": 4420 }, { - "epoch": 0.5333493859860342, - "grad_norm": 8.1875, + "epoch": 3.159771754636234, + "grad_norm": 6.9375, "learning_rate": 9.142666666666667e-05, - "loss": 0.7069, + "loss": 0.8308, "step": 4430 }, { - "epoch": 0.5345533349385986, - "grad_norm": 8.125, + "epoch": 3.166904422253923, + "grad_norm": 7.53125, "learning_rate": 9.138222222222222e-05, - "loss": 0.662, + "loss": 0.8428, "step": 4440 }, { - "epoch": 0.5357572838911631, - "grad_norm": 7.46875, + "epoch": 3.174037089871612, + "grad_norm": 7.96875, "learning_rate": 9.133777777777778e-05, - "loss": 0.7424, + "loss": 0.9022, "step": 4450 }, { - "epoch": 0.5369612328437274, - "grad_norm": 6.96875, + "epoch": 3.181169757489301, + "grad_norm": 6.875, "learning_rate": 9.129333333333334e-05, - "loss": 0.7308, + "loss": 0.9955, "step": 4460 }, { - "epoch": 0.5381651817962918, - "grad_norm": 8.3125, + "epoch": 3.18830242510699, + "grad_norm": 9.5625, "learning_rate": 9.124888888888889e-05, - "loss": 0.7524, + "loss": 0.9493, "step": 4470 }, { - "epoch": 0.5393691307488563, - "grad_norm": 6.40625, + "epoch": 3.195435092724679, + "grad_norm": 9.0625, "learning_rate": 9.120444444444445e-05, - "loss": 0.7523, + "loss": 0.9608, "step": 4480 }, { - "epoch": 0.5405730797014207, - "grad_norm": 7.65625, + "epoch": 3.202567760342368, + "grad_norm": 8.625, "learning_rate": 9.116e-05, - "loss": 0.647, + "loss": 0.821, "step": 4490 }, { - "epoch": 0.541777028653985, - "grad_norm": 6.875, + "epoch": 3.209700427960057, + "grad_norm": 8.125, "learning_rate": 9.111555555555556e-05, - "loss": 0.6547, + "loss": 0.9175, "step": 4500 }, { - "epoch": 0.541777028653985, - "eval/acc": 37.20930099487305, + "epoch": 3.209700427960057, + "eval/acc": 39.53488540649414, "step": 4500 }, { - "epoch": 0.541777028653985, - "eval_loss": 2.8390543460845947, - "eval_runtime": 0.2096, - "eval_samples_per_second": 205.2, - "eval_steps_per_second": 4.772, + "epoch": 3.209700427960057, + "eval_loss": 2.062082529067993, + "eval_runtime": 0.2236, + "eval_samples_per_second": 192.267, + "eval_steps_per_second": 4.471, "step": 4500 }, { - "epoch": 0.5429809776065495, - "grad_norm": 9.375, + "epoch": 3.216833095577746, + "grad_norm": 8.0625, "learning_rate": 9.107111111111111e-05, - "loss": 0.6773, + "loss": 0.9169, "step": 4510 }, { - "epoch": 0.5441849265591139, - "grad_norm": 10.1875, + "epoch": 3.223965763195435, + "grad_norm": 8.3125, "learning_rate": 9.102666666666667e-05, - "loss": 0.704, + "loss": 0.8001, "step": 4520 }, { - "epoch": 0.5453888755116783, - "grad_norm": 5.0625, + "epoch": 3.231098430813124, + "grad_norm": 7.3125, "learning_rate": 9.098222222222222e-05, - "loss": 0.6303, + "loss": 0.8513, "step": 4530 }, { - "epoch": 0.5465928244642427, - "grad_norm": 8.25, + "epoch": 3.238231098430813, + "grad_norm": 7.625, "learning_rate": 9.093777777777777e-05, - "loss": 0.7469, + "loss": 0.912, "step": 4540 }, { - "epoch": 0.5477967734168071, - "grad_norm": 7.375, + "epoch": 3.245363766048502, + "grad_norm": 6.46875, "learning_rate": 9.089333333333335e-05, - "loss": 0.6995, + "loss": 0.9418, "step": 4550 }, { - "epoch": 0.5490007223693716, - "grad_norm": 7.78125, + "epoch": 3.2524964336661912, + "grad_norm": 11.9375, "learning_rate": 9.08488888888889e-05, - "loss": 0.6965, + "loss": 0.871, "step": 4560 }, { - "epoch": 0.550204671321936, - "grad_norm": 13.625, + "epoch": 3.2596291012838803, + "grad_norm": 7.03125, "learning_rate": 9.080444444444444e-05, - "loss": 0.759, + "loss": 0.8507, "step": 4570 }, { - "epoch": 0.5514086202745003, - "grad_norm": 6.875, + "epoch": 3.2667617689015693, + "grad_norm": 7.21875, "learning_rate": 9.076e-05, - "loss": 0.7284, + "loss": 0.8058, "step": 4580 }, { - "epoch": 0.5526125692270648, - "grad_norm": 5.875, + "epoch": 3.2738944365192584, + "grad_norm": 8.4375, "learning_rate": 9.071555555555557e-05, - "loss": 0.6721, + "loss": 0.7959, "step": 4590 }, { - "epoch": 0.5538165181796292, - "grad_norm": 5.46875, + "epoch": 3.281027104136947, + "grad_norm": 6.375, "learning_rate": 9.067111111111112e-05, - "loss": 0.6522, + "loss": 0.9206, "step": 4600 }, { - "epoch": 0.5538165181796292, - "eval/acc": 39.53488540649414, + "epoch": 3.281027104136947, + "eval/acc": 37.20930099487305, "step": 4600 }, { - "epoch": 0.5538165181796292, - "eval_loss": 2.801618814468384, - "eval_runtime": 0.2155, - "eval_samples_per_second": 199.501, - "eval_steps_per_second": 4.64, + "epoch": 3.281027104136947, + "eval_loss": 2.1272616386413574, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.747, + "eval_steps_per_second": 4.529, "step": 4600 }, { - "epoch": 0.5550204671321936, - "grad_norm": 8.5625, + "epoch": 3.2881597717546365, + "grad_norm": 7.65625, "learning_rate": 9.062666666666666e-05, - "loss": 0.6399, + "loss": 0.8306, "step": 4610 }, { - "epoch": 0.556224416084758, - "grad_norm": 7.40625, + "epoch": 3.295292439372325, + "grad_norm": 6.9375, "learning_rate": 9.058222222222223e-05, - "loss": 0.7303, + "loss": 0.8958, "step": 4620 }, { - "epoch": 0.5574283650373224, - "grad_norm": 6.96875, + "epoch": 3.302425106990014, + "grad_norm": 7.96875, "learning_rate": 9.053777777777777e-05, - "loss": 0.7126, + "loss": 0.8919, "step": 4630 }, { - "epoch": 0.5586323139898868, - "grad_norm": 7.15625, + "epoch": 3.309557774607703, + "grad_norm": 6.9375, "learning_rate": 9.049333333333334e-05, - "loss": 0.702, + "loss": 0.8844, "step": 4640 }, { - "epoch": 0.5598362629424513, - "grad_norm": 6.625, + "epoch": 3.316690442225392, + "grad_norm": 7.21875, "learning_rate": 9.04488888888889e-05, - "loss": 0.6957, + "loss": 0.8335, "step": 4650 }, { - "epoch": 0.5610402118950156, - "grad_norm": 7.90625, + "epoch": 3.3238231098430813, + "grad_norm": 7.6875, "learning_rate": 9.040444444444445e-05, - "loss": 0.703, + "loss": 0.9337, "step": 4660 }, { - "epoch": 0.5622441608475801, - "grad_norm": 7.75, + "epoch": 3.3309557774607703, + "grad_norm": 9.25, "learning_rate": 9.036e-05, - "loss": 0.7195, + "loss": 1.0282, "step": 4670 }, { - "epoch": 0.5634481098001445, - "grad_norm": 6.59375, + "epoch": 3.3380884450784594, + "grad_norm": 7.96875, "learning_rate": 9.031555555555557e-05, - "loss": 0.6445, + "loss": 0.9401, "step": 4680 }, { - "epoch": 0.5646520587527089, - "grad_norm": 25.125, + "epoch": 3.3452211126961484, + "grad_norm": 7.25, "learning_rate": 9.027111111111112e-05, - "loss": 0.699, + "loss": 0.908, "step": 4690 }, { - "epoch": 0.5658560077052733, - "grad_norm": 8.125, + "epoch": 3.3523537803138375, + "grad_norm": 7.8125, "learning_rate": 9.022666666666667e-05, - "loss": 0.716, + "loss": 0.9262, "step": 4700 }, { - "epoch": 0.5658560077052733, - "eval/acc": 34.88372039794922, + "epoch": 3.3523537803138375, + "eval/acc": 37.20930099487305, "step": 4700 }, { - "epoch": 0.5658560077052733, - "eval_loss": 2.777444839477539, - "eval_runtime": 0.218, - "eval_samples_per_second": 197.287, - "eval_steps_per_second": 4.588, + "epoch": 3.3523537803138375, + "eval_loss": 2.07535719871521, + "eval_runtime": 0.2246, + "eval_samples_per_second": 191.478, + "eval_steps_per_second": 4.453, "step": 4700 }, { - "epoch": 0.5670599566578377, - "grad_norm": 7.0, + "epoch": 3.3594864479315265, + "grad_norm": 13.0, "learning_rate": 9.018222222222223e-05, - "loss": 0.693, + "loss": 0.9692, "step": 4710 }, { - "epoch": 0.5682639056104021, - "grad_norm": 8.8125, + "epoch": 3.3666191155492156, + "grad_norm": 5.875, "learning_rate": 9.013777777777779e-05, - "loss": 0.7, + "loss": 0.9071, "step": 4720 }, { - "epoch": 0.5694678545629666, - "grad_norm": 7.0, + "epoch": 3.3737517831669046, + "grad_norm": 7.71875, "learning_rate": 9.009333333333334e-05, - "loss": 0.6616, + "loss": 0.8528, "step": 4730 }, { - "epoch": 0.5706718035155309, - "grad_norm": 7.75, + "epoch": 3.3808844507845937, + "grad_norm": 7.46875, "learning_rate": 9.004888888888889e-05, - "loss": 0.7987, + "loss": 0.9408, "step": 4740 }, { - "epoch": 0.5718757524680953, - "grad_norm": 6.53125, + "epoch": 3.3880171184022823, + "grad_norm": 7.8125, "learning_rate": 9.000444444444445e-05, - "loss": 0.7162, + "loss": 1.0017, "step": 4750 }, { - "epoch": 0.5730797014206598, - "grad_norm": 8.6875, + "epoch": 3.3951497860199713, + "grad_norm": 6.15625, "learning_rate": 8.996e-05, - "loss": 0.673, + "loss": 0.9107, "step": 4760 }, { - "epoch": 0.5742836503732242, - "grad_norm": 6.5625, + "epoch": 3.4022824536376604, + "grad_norm": 6.8125, "learning_rate": 8.991555555555556e-05, - "loss": 0.7389, + "loss": 0.9387, "step": 4770 }, { - "epoch": 0.5754875993257886, - "grad_norm": 7.25, + "epoch": 3.4094151212553494, + "grad_norm": 8.8125, "learning_rate": 8.987111111111112e-05, - "loss": 0.6674, + "loss": 0.9775, "step": 4780 }, { - "epoch": 0.576691548278353, - "grad_norm": 8.8125, + "epoch": 3.4165477888730384, + "grad_norm": 8.375, "learning_rate": 8.982666666666667e-05, - "loss": 0.7464, + "loss": 0.8173, "step": 4790 }, { - "epoch": 0.5778954972309174, - "grad_norm": 7.65625, + "epoch": 3.4236804564907275, + "grad_norm": 11.3125, "learning_rate": 8.978222222222222e-05, - "loss": 0.6979, + "loss": 0.9068, "step": 4800 }, { - "epoch": 0.5778954972309174, - "eval/acc": 37.20930099487305, + "epoch": 3.4236804564907275, + "eval/acc": 39.53488540649414, "step": 4800 }, { - "epoch": 0.5778954972309174, - "eval_loss": 2.7990331649780273, - "eval_runtime": 0.207, - "eval_samples_per_second": 207.72, - "eval_steps_per_second": 4.831, + "epoch": 3.4236804564907275, + "eval_loss": 2.123344898223877, + "eval_runtime": 0.2238, + "eval_samples_per_second": 192.17, + "eval_steps_per_second": 4.469, "step": 4800 }, { - "epoch": 0.5790994461834819, - "grad_norm": 6.90625, + "epoch": 3.4308131241084165, + "grad_norm": 6.65625, "learning_rate": 8.973777777777778e-05, - "loss": 0.7292, + "loss": 0.8262, "step": 4810 }, { - "epoch": 0.5803033951360462, - "grad_norm": 7.34375, + "epoch": 3.4379457917261056, + "grad_norm": 9.125, "learning_rate": 8.969333333333334e-05, - "loss": 0.6484, + "loss": 0.9207, "step": 4820 }, { - "epoch": 0.5815073440886106, - "grad_norm": 7.96875, + "epoch": 3.4450784593437946, + "grad_norm": 6.78125, "learning_rate": 8.964888888888889e-05, - "loss": 0.6246, + "loss": 1.0115, "step": 4830 }, { - "epoch": 0.5827112930411751, - "grad_norm": 5.4375, + "epoch": 3.4522111269614837, + "grad_norm": 7.5625, "learning_rate": 8.960444444444444e-05, - "loss": 0.6978, + "loss": 0.9031, "step": 4840 }, { - "epoch": 0.5839152419937395, - "grad_norm": 7.25, + "epoch": 3.4593437945791727, + "grad_norm": 7.875, "learning_rate": 8.956e-05, - "loss": 0.6848, + "loss": 0.9626, "step": 4850 }, { - "epoch": 0.5851191909463038, - "grad_norm": 8.9375, + "epoch": 3.466476462196862, + "grad_norm": 4.625, "learning_rate": 8.951555555555557e-05, - "loss": 0.7541, + "loss": 0.7793, "step": 4860 }, { - "epoch": 0.5863231398988683, - "grad_norm": 8.6875, + "epoch": 3.473609129814551, + "grad_norm": 7.40625, "learning_rate": 8.947111111111111e-05, - "loss": 0.6872, + "loss": 0.8733, "step": 4870 }, { - "epoch": 0.5875270888514327, - "grad_norm": 6.375, + "epoch": 3.4807417974322394, + "grad_norm": 9.6875, "learning_rate": 8.942666666666668e-05, - "loss": 0.7521, + "loss": 0.8448, "step": 4880 }, { - "epoch": 0.5887310378039972, - "grad_norm": 7.34375, + "epoch": 3.4878744650499285, + "grad_norm": 8.625, "learning_rate": 8.938222222222222e-05, - "loss": 0.6741, + "loss": 0.815, "step": 4890 }, { - "epoch": 0.5899349867565615, - "grad_norm": 9.25, + "epoch": 3.4950071326676175, + "grad_norm": 8.6875, "learning_rate": 8.933777777777779e-05, - "loss": 0.7085, + "loss": 0.7837, "step": 4900 }, { - "epoch": 0.5899349867565615, - "eval/acc": 32.55813980102539, + "epoch": 3.4950071326676175, + "eval/acc": 37.20930099487305, "step": 4900 }, { - "epoch": 0.5899349867565615, - "eval_loss": 2.822793483734131, - "eval_runtime": 0.2077, - "eval_samples_per_second": 206.985, - "eval_steps_per_second": 4.814, + "epoch": 3.4950071326676175, + "eval_loss": 2.080425262451172, + "eval_runtime": 0.2476, + "eval_samples_per_second": 173.654, + "eval_steps_per_second": 4.038, "step": 4900 }, { - "epoch": 0.5911389357091259, - "grad_norm": 6.75, + "epoch": 3.5021398002853066, + "grad_norm": 7.25, "learning_rate": 8.929333333333333e-05, - "loss": 0.6908, + "loss": 0.9082, "step": 4910 }, { - "epoch": 0.5923428846616904, - "grad_norm": 14.3125, + "epoch": 3.5092724679029956, + "grad_norm": 9.0, "learning_rate": 8.92488888888889e-05, - "loss": 0.6954, + "loss": 0.8041, "step": 4920 }, { - "epoch": 0.5935468336142548, - "grad_norm": 5.03125, + "epoch": 3.5164051355206847, + "grad_norm": 7.5625, "learning_rate": 8.920444444444444e-05, - "loss": 0.6255, + "loss": 0.878, "step": 4930 }, { - "epoch": 0.5947507825668191, - "grad_norm": 7.3125, + "epoch": 3.5235378031383737, + "grad_norm": 8.3125, "learning_rate": 8.916e-05, - "loss": 0.6094, + "loss": 0.8609, "step": 4940 }, { - "epoch": 0.5959547315193836, - "grad_norm": 6.875, + "epoch": 3.5306704707560628, + "grad_norm": 6.9375, "learning_rate": 8.911555555555557e-05, - "loss": 0.6488, + "loss": 0.8203, "step": 4950 }, { - "epoch": 0.597158680471948, - "grad_norm": 6.90625, + "epoch": 3.537803138373752, + "grad_norm": 6.4375, "learning_rate": 8.907111111111112e-05, - "loss": 0.6333, + "loss": 0.8976, "step": 4960 }, { - "epoch": 0.5983626294245123, - "grad_norm": 7.0, + "epoch": 3.544935805991441, + "grad_norm": 15.0, "learning_rate": 8.902666666666667e-05, - "loss": 0.6687, + "loss": 0.8585, "step": 4970 }, { - "epoch": 0.5995665783770768, - "grad_norm": 8.9375, + "epoch": 3.55206847360913, + "grad_norm": 6.21875, "learning_rate": 8.898222222222223e-05, - "loss": 0.6762, + "loss": 0.9642, "step": 4980 }, { - "epoch": 0.6007705273296412, - "grad_norm": 7.53125, + "epoch": 3.559201141226819, + "grad_norm": 9.8125, "learning_rate": 8.893777777777779e-05, - "loss": 0.6007, + "loss": 0.9241, "step": 4990 }, { - "epoch": 0.6019744762822057, - "grad_norm": 5.78125, + "epoch": 3.566333808844508, + "grad_norm": 9.25, "learning_rate": 8.889333333333334e-05, - "loss": 0.682, + "loss": 0.7841, "step": 5000 }, { - "epoch": 0.6019744762822057, - "eval/acc": 32.55813980102539, + "epoch": 3.566333808844508, + "eval/acc": 37.20930099487305, "step": 5000 }, { - "epoch": 0.6019744762822057, - "eval_loss": 2.827073097229004, - "eval_runtime": 0.2073, - "eval_samples_per_second": 207.385, - "eval_steps_per_second": 4.823, + "epoch": 3.566333808844508, + "eval_loss": 2.0360865592956543, + "eval_runtime": 0.225, + "eval_samples_per_second": 191.102, + "eval_steps_per_second": 4.444, "step": 5000 }, { - "epoch": 0.60317842523477, - "grad_norm": 8.25, + "epoch": 3.5734664764621966, + "grad_norm": 7.53125, "learning_rate": 8.884888888888889e-05, - "loss": 0.6711, + "loss": 0.8513, "step": 5010 }, { - "epoch": 0.6043823741873344, - "grad_norm": 7.34375, + "epoch": 3.580599144079886, + "grad_norm": 7.3125, "learning_rate": 8.880444444444445e-05, - "loss": 0.6916, + "loss": 0.9502, "step": 5020 }, { - "epoch": 0.6055863231398989, - "grad_norm": 6.6875, + "epoch": 3.5877318116975747, + "grad_norm": 7.375, "learning_rate": 8.876e-05, - "loss": 0.6601, + "loss": 0.9329, "step": 5030 }, { - "epoch": 0.6067902720924633, - "grad_norm": 6.34375, + "epoch": 3.5948644793152638, + "grad_norm": 7.3125, "learning_rate": 8.871555555555556e-05, - "loss": 0.6945, + "loss": 0.8648, "step": 5040 }, { - "epoch": 0.6079942210450276, - "grad_norm": 6.9375, + "epoch": 3.601997146932953, + "grad_norm": 6.5, "learning_rate": 8.867111111111112e-05, - "loss": 0.6492, + "loss": 0.8019, "step": 5050 }, { - "epoch": 0.6091981699975921, - "grad_norm": 7.1875, + "epoch": 3.609129814550642, + "grad_norm": 9.0, "learning_rate": 8.862666666666667e-05, - "loss": 0.5963, + "loss": 0.8829, "step": 5060 }, { - "epoch": 0.6104021189501565, - "grad_norm": 7.1875, + "epoch": 3.616262482168331, + "grad_norm": 6.46875, "learning_rate": 8.858222222222222e-05, - "loss": 0.6715, + "loss": 0.8419, "step": 5070 }, { - "epoch": 0.6116060679027209, - "grad_norm": 9.25, + "epoch": 3.62339514978602, + "grad_norm": 8.9375, "learning_rate": 8.853777777777778e-05, - "loss": 0.7572, + "loss": 0.9345, "step": 5080 }, { - "epoch": 0.6128100168552854, - "grad_norm": 6.3125, + "epoch": 3.630527817403709, + "grad_norm": 7.09375, "learning_rate": 8.849333333333334e-05, - "loss": 0.7521, + "loss": 0.8204, "step": 5090 }, { - "epoch": 0.6140139658078497, - "grad_norm": 6.9375, + "epoch": 3.637660485021398, + "grad_norm": 7.71875, "learning_rate": 8.844888888888889e-05, - "loss": 0.6313, + "loss": 0.9305, "step": 5100 }, { - "epoch": 0.6140139658078497, - "eval/acc": 34.88372039794922, + "epoch": 3.637660485021398, + "eval/acc": 39.53488540649414, "step": 5100 }, { - "epoch": 0.6140139658078497, - "eval_loss": 2.9495913982391357, - "eval_runtime": 0.2063, - "eval_samples_per_second": 208.439, - "eval_steps_per_second": 4.847, + "epoch": 3.637660485021398, + "eval_loss": 2.0034291744232178, + "eval_runtime": 0.2213, + "eval_samples_per_second": 194.329, + "eval_steps_per_second": 4.519, "step": 5100 }, { - "epoch": 0.6152179147604142, - "grad_norm": 9.0, + "epoch": 3.644793152639087, + "grad_norm": 6.09375, "learning_rate": 8.840444444444444e-05, - "loss": 0.7974, + "loss": 0.9168, "step": 5110 }, { - "epoch": 0.6164218637129786, - "grad_norm": 5.46875, + "epoch": 3.651925820256776, + "grad_norm": 8.25, "learning_rate": 8.836000000000001e-05, - "loss": 0.6245, + "loss": 0.8155, "step": 5120 }, { - "epoch": 0.617625812665543, - "grad_norm": 9.4375, + "epoch": 3.659058487874465, + "grad_norm": 7.84375, "learning_rate": 8.831555555555556e-05, - "loss": 0.7513, + "loss": 0.8641, "step": 5130 }, { - "epoch": 0.6188297616181074, - "grad_norm": 8.125, + "epoch": 3.666191155492154, + "grad_norm": 6.5, "learning_rate": 8.827111111111111e-05, - "loss": 0.6427, + "loss": 0.8623, "step": 5140 }, { - "epoch": 0.6200337105706718, - "grad_norm": 5.78125, + "epoch": 3.6733238231098433, + "grad_norm": 21.125, "learning_rate": 8.822666666666667e-05, - "loss": 0.6801, + "loss": 0.8205, "step": 5150 }, { - "epoch": 0.6212376595232362, - "grad_norm": 8.8125, + "epoch": 3.680456490727532, + "grad_norm": 7.28125, "learning_rate": 8.818222222222222e-05, - "loss": 0.5978, + "loss": 0.7993, "step": 5160 }, { - "epoch": 0.6224416084758007, - "grad_norm": 8.0, + "epoch": 3.6875891583452214, + "grad_norm": 36.0, "learning_rate": 8.813777777777778e-05, - "loss": 0.6697, + "loss": 0.9083, "step": 5170 }, { - "epoch": 0.623645557428365, - "grad_norm": 8.1875, + "epoch": 3.69472182596291, + "grad_norm": 8.125, "learning_rate": 8.809333333333333e-05, - "loss": 0.7621, + "loss": 0.9264, "step": 5180 }, { - "epoch": 0.6248495063809294, - "grad_norm": 6.4375, + "epoch": 3.701854493580599, + "grad_norm": 10.75, "learning_rate": 8.80488888888889e-05, - "loss": 0.6934, + "loss": 0.8496, "step": 5190 }, { - "epoch": 0.6260534553334939, - "grad_norm": 7.8125, + "epoch": 3.708987161198288, + "grad_norm": 7.78125, "learning_rate": 8.800444444444444e-05, - "loss": 0.7008, + "loss": 0.8718, "step": 5200 }, { - "epoch": 0.6260534553334939, - "eval/acc": 34.88372039794922, + "epoch": 3.708987161198288, + "eval/acc": 39.53488540649414, "step": 5200 }, { - "epoch": 0.6260534553334939, - "eval_loss": 2.8201522827148438, - "eval_runtime": 0.2114, - "eval_samples_per_second": 203.368, - "eval_steps_per_second": 4.729, + "epoch": 3.708987161198288, + "eval_loss": 2.0305864810943604, + "eval_runtime": 0.222, + "eval_samples_per_second": 193.683, + "eval_steps_per_second": 4.504, "step": 5200 }, { - "epoch": 0.6272574042860583, - "grad_norm": 5.78125, + "epoch": 3.716119828815977, + "grad_norm": 9.3125, "learning_rate": 8.796e-05, - "loss": 0.7211, + "loss": 1.0077, "step": 5210 }, { - "epoch": 0.6284613532386227, - "grad_norm": 6.25, + "epoch": 3.723252496433666, + "grad_norm": 11.4375, "learning_rate": 8.791555555555557e-05, - "loss": 0.654, + "loss": 0.8364, "step": 5220 }, { - "epoch": 0.6296653021911871, - "grad_norm": 9.0625, + "epoch": 3.7303851640513552, + "grad_norm": 15.125, "learning_rate": 8.787111111111112e-05, - "loss": 0.6348, + "loss": 0.8557, "step": 5230 }, { - "epoch": 0.6308692511437515, - "grad_norm": 7.59375, + "epoch": 3.7375178316690443, + "grad_norm": 7.875, "learning_rate": 8.782666666666666e-05, - "loss": 0.6363, + "loss": 0.8674, "step": 5240 }, { - "epoch": 0.632073200096316, - "grad_norm": 6.25, + "epoch": 3.7446504992867333, + "grad_norm": 7.84375, "learning_rate": 8.778222222222223e-05, - "loss": 0.629, + "loss": 0.8788, "step": 5250 }, { - "epoch": 0.6332771490488803, - "grad_norm": 12.375, + "epoch": 3.7517831669044224, + "grad_norm": 7.59375, "learning_rate": 8.773777777777779e-05, - "loss": 0.771, + "loss": 0.8098, "step": 5260 }, { - "epoch": 0.6344810980014447, - "grad_norm": 5.96875, + "epoch": 3.7589158345221114, + "grad_norm": 7.40625, "learning_rate": 8.769333333333334e-05, - "loss": 0.589, + "loss": 0.8895, "step": 5270 }, { - "epoch": 0.6356850469540092, - "grad_norm": 7.1875, + "epoch": 3.7660485021398005, + "grad_norm": 6.78125, "learning_rate": 8.76488888888889e-05, - "loss": 0.5794, + "loss": 0.823, "step": 5280 }, { - "epoch": 0.6368889959065736, - "grad_norm": 7.09375, + "epoch": 3.773181169757489, + "grad_norm": 8.125, "learning_rate": 8.760444444444445e-05, - "loss": 0.6449, + "loss": 0.8418, "step": 5290 }, { - "epoch": 0.6380929448591379, - "grad_norm": 11.1875, + "epoch": 3.7803138373751786, + "grad_norm": 8.4375, "learning_rate": 8.756000000000001e-05, - "loss": 0.6708, + "loss": 0.8202, "step": 5300 }, { - "epoch": 0.6380929448591379, - "eval/acc": 36.627906799316406, + "epoch": 3.7803138373751786, + "eval/acc": 41.86046600341797, "step": 5300 }, { - "epoch": 0.6380929448591379, - "eval_loss": 2.902387857437134, - "eval_runtime": 0.2113, - "eval_samples_per_second": 203.456, - "eval_steps_per_second": 4.732, + "epoch": 3.7803138373751786, + "eval_loss": 2.100001811981201, + "eval_runtime": 0.2237, + "eval_samples_per_second": 192.218, + "eval_steps_per_second": 4.47, "step": 5300 }, { - "epoch": 0.6392968938117024, - "grad_norm": 8.625, + "epoch": 3.787446504992867, + "grad_norm": 7.78125, "learning_rate": 8.751555555555556e-05, - "loss": 0.5895, + "loss": 0.9786, "step": 5310 }, { - "epoch": 0.6405008427642668, - "grad_norm": 8.625, + "epoch": 3.794579172610556, + "grad_norm": 14.125, "learning_rate": 8.747111111111112e-05, - "loss": 0.6012, + "loss": 1.0893, "step": 5320 }, { - "epoch": 0.6417047917168313, - "grad_norm": 5.25, + "epoch": 3.8017118402282453, + "grad_norm": 6.71875, "learning_rate": 8.742666666666667e-05, - "loss": 0.6262, + "loss": 0.8484, "step": 5330 }, { - "epoch": 0.6429087406693956, - "grad_norm": 8.5625, + "epoch": 3.8088445078459343, + "grad_norm": 7.53125, "learning_rate": 8.738222222222222e-05, - "loss": 0.7584, + "loss": 0.922, "step": 5340 }, { - "epoch": 0.64411268962196, - "grad_norm": 7.53125, + "epoch": 3.8159771754636234, + "grad_norm": 6.9375, "learning_rate": 8.733777777777779e-05, - "loss": 0.6793, + "loss": 0.87, "step": 5350 }, { - "epoch": 0.6453166385745245, - "grad_norm": 9.625, + "epoch": 3.8231098430813124, + "grad_norm": 6.75, "learning_rate": 8.729333333333334e-05, - "loss": 0.6166, + "loss": 0.9272, "step": 5360 }, { - "epoch": 0.6465205875270889, - "grad_norm": 7.0625, + "epoch": 3.8302425106990015, + "grad_norm": 6.875, "learning_rate": 8.724888888888889e-05, - "loss": 0.667, + "loss": 0.8358, "step": 5370 }, { - "epoch": 0.6477245364796532, - "grad_norm": 6.90625, + "epoch": 3.8373751783166905, + "grad_norm": 7.53125, "learning_rate": 8.720444444444445e-05, - "loss": 0.6427, + "loss": 0.8764, "step": 5380 }, { - "epoch": 0.6489284854322177, + "epoch": 3.8445078459343796, "grad_norm": 7.96875, "learning_rate": 8.716000000000001e-05, - "loss": 0.7689, + "loss": 0.9348, "step": 5390 }, { - "epoch": 0.6501324343847821, - "grad_norm": 8.9375, + "epoch": 3.8516405135520686, + "grad_norm": 7.5625, "learning_rate": 8.711555555555556e-05, - "loss": 0.6957, + "loss": 0.9033, "step": 5400 }, { - "epoch": 0.6501324343847821, - "eval/acc": 34.88372039794922, + "epoch": 3.8516405135520686, + "eval/acc": 39.53488540649414, "step": 5400 }, { - "epoch": 0.6501324343847821, - "eval_loss": 2.8916988372802734, - "eval_runtime": 0.2068, - "eval_samples_per_second": 207.976, - "eval_steps_per_second": 4.837, + "epoch": 3.8516405135520686, + "eval_loss": 2.0633187294006348, + "eval_runtime": 0.2247, + "eval_samples_per_second": 191.324, + "eval_steps_per_second": 4.449, "step": 5400 }, { - "epoch": 0.6513363833373464, - "grad_norm": 6.34375, + "epoch": 3.8587731811697576, + "grad_norm": 6.90625, "learning_rate": 8.707111111111111e-05, - "loss": 0.6811, + "loss": 0.9344, "step": 5410 }, { - "epoch": 0.6525403322899109, - "grad_norm": 6.71875, + "epoch": 3.8659058487874463, + "grad_norm": 7.5, "learning_rate": 8.702666666666667e-05, - "loss": 0.6849, + "loss": 0.9346, "step": 5420 }, { - "epoch": 0.6537442812424753, - "grad_norm": 6.46875, + "epoch": 3.8730385164051357, + "grad_norm": 7.03125, "learning_rate": 8.698222222222223e-05, - "loss": 0.6134, + "loss": 0.8835, "step": 5430 }, { - "epoch": 0.6549482301950398, - "grad_norm": 10.5, + "epoch": 3.8801711840228243, + "grad_norm": 6.3125, "learning_rate": 8.693777777777778e-05, - "loss": 0.6213, + "loss": 0.8434, "step": 5440 }, { - "epoch": 0.6561521791476042, - "grad_norm": 6.25, + "epoch": 3.8873038516405134, + "grad_norm": 7.03125, "learning_rate": 8.689333333333334e-05, - "loss": 0.6892, + "loss": 0.8555, "step": 5450 }, { - "epoch": 0.6573561281001685, - "grad_norm": 7.0, + "epoch": 3.8944365192582024, + "grad_norm": 8.0, "learning_rate": 8.684888888888889e-05, - "loss": 0.6003, + "loss": 0.9287, "step": 5460 }, { - "epoch": 0.658560077052733, - "grad_norm": 7.46875, + "epoch": 3.9015691868758915, + "grad_norm": 8.1875, "learning_rate": 8.680444444444444e-05, - "loss": 0.726, + "loss": 0.8738, "step": 5470 }, { - "epoch": 0.6597640260052974, - "grad_norm": 6.0, + "epoch": 3.9087018544935805, + "grad_norm": 7.96875, "learning_rate": 8.676e-05, - "loss": 0.7526, + "loss": 0.8189, "step": 5480 }, { - "epoch": 0.6609679749578617, - "grad_norm": 9.875, + "epoch": 3.9158345221112696, + "grad_norm": 10.1875, "learning_rate": 8.671555555555556e-05, - "loss": 0.603, + "loss": 0.8983, "step": 5490 }, { - "epoch": 0.6621719239104262, - "grad_norm": 13.6875, + "epoch": 3.9229671897289586, + "grad_norm": 10.375, "learning_rate": 8.667111111111111e-05, - "loss": 0.6759, + "loss": 0.8083, "step": 5500 }, { - "epoch": 0.6621719239104262, - "eval/acc": 34.88372039794922, + "epoch": 3.9229671897289586, + "eval/acc": 39.53488540649414, "step": 5500 }, { - "epoch": 0.6621719239104262, - "eval_loss": 2.915025234222412, - "eval_runtime": 0.2074, - "eval_samples_per_second": 207.294, - "eval_steps_per_second": 4.821, + "epoch": 3.9229671897289586, + "eval_loss": 2.089243173599243, + "eval_runtime": 0.2203, + "eval_samples_per_second": 195.23, + "eval_steps_per_second": 4.54, "step": 5500 }, { - "epoch": 0.6633758728629906, - "grad_norm": 8.8125, + "epoch": 3.9300998573466477, + "grad_norm": 13.125, "learning_rate": 8.662666666666666e-05, - "loss": 0.6582, + "loss": 0.8747, "step": 5510 }, { - "epoch": 0.664579821815555, - "grad_norm": 7.6875, + "epoch": 3.9372325249643367, + "grad_norm": 8.25, "learning_rate": 8.658222222222224e-05, - "loss": 0.6219, + "loss": 0.8609, "step": 5520 }, { - "epoch": 0.6657837707681195, - "grad_norm": 9.25, + "epoch": 3.944365192582026, + "grad_norm": 6.75, "learning_rate": 8.653777777777779e-05, - "loss": 0.742, + "loss": 0.8563, "step": 5530 }, { - "epoch": 0.6669877197206838, - "grad_norm": 6.59375, + "epoch": 3.951497860199715, + "grad_norm": 7.75, "learning_rate": 8.649333333333333e-05, - "loss": 0.653, + "loss": 0.8912, "step": 5540 }, { - "epoch": 0.6681916686732483, - "grad_norm": 9.25, + "epoch": 3.9586305278174034, + "grad_norm": 6.40625, "learning_rate": 8.64488888888889e-05, - "loss": 0.67, + "loss": 0.7477, "step": 5550 }, { - "epoch": 0.6693956176258127, - "grad_norm": 7.59375, + "epoch": 3.965763195435093, + "grad_norm": 7.0, "learning_rate": 8.640444444444444e-05, - "loss": 0.7448, + "loss": 0.8185, "step": 5560 }, { - "epoch": 0.670599566578377, - "grad_norm": 7.125, + "epoch": 3.9728958630527815, + "grad_norm": 5.6875, "learning_rate": 8.636e-05, - "loss": 0.607, + "loss": 0.9497, "step": 5570 }, { - "epoch": 0.6718035155309415, - "grad_norm": 6.59375, + "epoch": 3.980028530670471, + "grad_norm": 8.0, "learning_rate": 8.631555555555556e-05, - "loss": 0.6398, + "loss": 0.8117, "step": 5580 }, { - "epoch": 0.6730074644835059, - "grad_norm": 6.21875, + "epoch": 3.9871611982881596, + "grad_norm": 6.625, "learning_rate": 8.627111111111112e-05, - "loss": 0.6334, + "loss": 0.8245, "step": 5590 }, { - "epoch": 0.6742114134360703, - "grad_norm": 7.0625, + "epoch": 3.9942938659058487, + "grad_norm": 6.96875, "learning_rate": 8.622666666666667e-05, - "loss": 0.6878, + "loss": 0.902, "step": 5600 }, { - "epoch": 0.6742114134360703, - "eval/acc": 32.55813980102539, + "epoch": 3.9942938659058487, + "eval/acc": 39.53488540649414, "step": 5600 }, { - "epoch": 0.6742114134360703, - "eval_loss": 2.8182010650634766, - "eval_runtime": 0.207, - "eval_samples_per_second": 207.724, - "eval_steps_per_second": 4.831, + "epoch": 3.9942938659058487, + "eval_loss": 2.186225652694702, + "eval_runtime": 0.2194, + "eval_samples_per_second": 196.001, + "eval_steps_per_second": 4.558, "step": 5600 }, { - "epoch": 0.6754153623886348, - "grad_norm": 7.9375, + "epoch": 4.001426533523538, + "grad_norm": 6.78125, "learning_rate": 8.618222222222223e-05, - "loss": 0.6577, + "loss": 0.8757, "step": 5610 }, { - "epoch": 0.6766193113411991, - "grad_norm": 7.34375, + "epoch": 4.008559201141227, + "grad_norm": 11.0625, "learning_rate": 8.613777777777779e-05, - "loss": 0.7787, + "loss": 0.885, "step": 5620 }, { - "epoch": 0.6778232602937635, - "grad_norm": 6.96875, + "epoch": 4.015691868758916, + "grad_norm": 6.4375, "learning_rate": 8.609333333333334e-05, - "loss": 0.7849, + "loss": 0.8611, "step": 5630 }, { - "epoch": 0.679027209246328, - "grad_norm": 16.125, + "epoch": 4.022824536376604, + "grad_norm": 14.8125, "learning_rate": 8.604888888888889e-05, - "loss": 0.8503, + "loss": 0.8262, "step": 5640 }, { - "epoch": 0.6802311581988923, - "grad_norm": 7.625, + "epoch": 4.029957203994294, + "grad_norm": 8.0625, "learning_rate": 8.600444444444445e-05, - "loss": 0.6215, + "loss": 0.7549, "step": 5650 }, { - "epoch": 0.6814351071514568, - "grad_norm": 7.28125, + "epoch": 4.0370898716119825, + "grad_norm": 6.84375, "learning_rate": 8.596000000000001e-05, - "loss": 0.6894, + "loss": 0.8725, "step": 5660 }, { - "epoch": 0.6826390561040212, - "grad_norm": 6.28125, + "epoch": 4.044222539229672, + "grad_norm": 8.0, "learning_rate": 8.591555555555556e-05, - "loss": 0.616, + "loss": 0.8846, "step": 5670 }, { - "epoch": 0.6838430050565856, - "grad_norm": 6.125, + "epoch": 4.051355206847361, + "grad_norm": 7.84375, "learning_rate": 8.587111111111111e-05, - "loss": 0.6417, + "loss": 0.9373, "step": 5680 }, { - "epoch": 0.68504695400915, - "grad_norm": 7.78125, + "epoch": 4.05848787446505, + "grad_norm": 6.84375, "learning_rate": 8.582666666666667e-05, - "loss": 0.7842, + "loss": 0.7823, "step": 5690 }, { - "epoch": 0.6862509029617144, - "grad_norm": 9.4375, + "epoch": 4.065620542082739, + "grad_norm": 11.4375, "learning_rate": 8.578222222222223e-05, - "loss": 0.6562, + "loss": 0.9588, "step": 5700 }, { - "epoch": 0.6862509029617144, - "eval/acc": 32.55813980102539, + "epoch": 4.065620542082739, + "eval/acc": 37.20930099487305, "step": 5700 }, { - "epoch": 0.6862509029617144, - "eval_loss": 2.861806869506836, - "eval_runtime": 0.2083, - "eval_samples_per_second": 206.449, - "eval_steps_per_second": 4.801, + "epoch": 4.065620542082739, + "eval_loss": 2.841008424758911, + "eval_runtime": 1.3984, + "eval_samples_per_second": 30.749, + "eval_steps_per_second": 0.715, "step": 5700 }, { - "epoch": 0.6874548519142788, - "grad_norm": 6.46875, + "epoch": 4.072753209700428, + "grad_norm": 5.5625, "learning_rate": 8.573777777777778e-05, - "loss": 0.6165, + "loss": 0.8014, "step": 5710 }, { - "epoch": 0.6886588008668433, - "grad_norm": 7.0625, + "epoch": 4.079885877318117, + "grad_norm": 6.90625, "learning_rate": 8.569333333333334e-05, - "loss": 0.7014, + "loss": 0.818, "step": 5720 }, { - "epoch": 0.6898627498194077, - "grad_norm": 8.0625, + "epoch": 4.087018544935806, + "grad_norm": 8.4375, "learning_rate": 8.564888888888889e-05, - "loss": 0.7459, + "loss": 0.8142, "step": 5730 }, { - "epoch": 0.691066698771972, - "grad_norm": 5.84375, + "epoch": 4.094151212553495, + "grad_norm": 7.75, "learning_rate": 8.560444444444445e-05, - "loss": 0.6708, + "loss": 0.863, "step": 5740 }, { - "epoch": 0.6922706477245365, - "grad_norm": 7.9375, + "epoch": 4.101283880171184, + "grad_norm": 6.90625, "learning_rate": 8.556e-05, - "loss": 0.6487, + "loss": 0.8501, "step": 5750 }, { - "epoch": 0.6934745966771009, - "grad_norm": 8.125, + "epoch": 4.108416547788873, + "grad_norm": 7.15625, "learning_rate": 8.551555555555556e-05, - "loss": 0.6634, + "loss": 0.8293, "step": 5760 }, { - "epoch": 0.6946785456296654, - "grad_norm": 5.0, + "epoch": 4.1155492154065625, + "grad_norm": 8.125, "learning_rate": 8.547111111111111e-05, - "loss": 0.6575, + "loss": 0.8655, "step": 5770 }, { - "epoch": 0.6958824945822297, - "grad_norm": 6.28125, + "epoch": 4.122681883024251, + "grad_norm": 7.75, "learning_rate": 8.542666666666666e-05, - "loss": 0.6661, + "loss": 0.7958, "step": 5780 }, { - "epoch": 0.6970864435347941, - "grad_norm": 6.5, + "epoch": 4.12981455064194, + "grad_norm": 8.3125, "learning_rate": 8.538222222222224e-05, - "loss": 0.6922, + "loss": 0.9186, "step": 5790 }, { - "epoch": 0.6982903924873586, - "grad_norm": 9.0625, + "epoch": 4.136947218259629, + "grad_norm": 7.0625, "learning_rate": 8.533777777777778e-05, - "loss": 0.687, + "loss": 0.9135, "step": 5800 }, { - "epoch": 0.6982903924873586, - "eval/acc": 37.79069900512695, + "epoch": 4.136947218259629, + "eval/acc": 37.20930099487305, "step": 5800 }, { - "epoch": 0.6982903924873586, - "eval_loss": 2.878754138946533, - "eval_runtime": 0.2107, - "eval_samples_per_second": 204.039, - "eval_steps_per_second": 4.745, + "epoch": 4.136947218259629, + "eval_loss": 2.8186914920806885, + "eval_runtime": 0.2153, + "eval_samples_per_second": 199.722, + "eval_steps_per_second": 4.645, "step": 5800 }, { - "epoch": 0.699494341439923, - "grad_norm": 8.875, + "epoch": 4.144079885877318, + "grad_norm": 8.125, "learning_rate": 8.529333333333333e-05, - "loss": 0.7106, + "loss": 0.8248, "step": 5810 }, { - "epoch": 0.7006982903924873, - "grad_norm": 8.3125, + "epoch": 4.151212553495007, + "grad_norm": 7.65625, "learning_rate": 8.52488888888889e-05, - "loss": 0.5969, + "loss": 0.9186, "step": 5820 }, { - "epoch": 0.7019022393450518, - "grad_norm": 6.40625, + "epoch": 4.158345221112696, + "grad_norm": 7.6875, "learning_rate": 8.520444444444446e-05, - "loss": 0.6795, + "loss": 0.8367, "step": 5830 }, { - "epoch": 0.7031061882976162, - "grad_norm": 8.5625, + "epoch": 4.165477888730385, + "grad_norm": 9.75, "learning_rate": 8.516e-05, - "loss": 0.7621, + "loss": 0.8898, "step": 5840 }, { - "epoch": 0.7043101372501805, - "grad_norm": 9.5625, + "epoch": 4.172610556348074, + "grad_norm": 8.5625, "learning_rate": 8.511555555555555e-05, - "loss": 0.7035, + "loss": 0.9218, "step": 5850 }, { - "epoch": 0.705514086202745, - "grad_norm": 11.3125, + "epoch": 4.1797432239657635, + "grad_norm": 6.0, "learning_rate": 8.507111111111112e-05, - "loss": 0.8043, + "loss": 0.8784, "step": 5860 }, { - "epoch": 0.7067180351553094, - "grad_norm": 7.4375, + "epoch": 4.186875891583452, + "grad_norm": 8.5625, "learning_rate": 8.502666666666666e-05, - "loss": 0.6349, + "loss": 0.8361, "step": 5870 }, { - "epoch": 0.7079219841078739, - "grad_norm": 6.28125, + "epoch": 4.194008559201142, + "grad_norm": 7.40625, "learning_rate": 8.498222222222223e-05, - "loss": 0.6593, + "loss": 0.816, "step": 5880 }, { - "epoch": 0.7091259330604383, - "grad_norm": 6.4375, + "epoch": 4.20114122681883, + "grad_norm": 7.84375, "learning_rate": 8.493777777777779e-05, - "loss": 0.6236, + "loss": 0.897, "step": 5890 }, { - "epoch": 0.7103298820130026, - "grad_norm": 7.84375, + "epoch": 4.20827389443652, + "grad_norm": 10.0625, "learning_rate": 8.489333333333334e-05, - "loss": 0.6134, + "loss": 0.7807, "step": 5900 }, { - "epoch": 0.7103298820130026, - "eval/acc": 34.88372039794922, + "epoch": 4.20827389443652, + "eval/acc": 37.20930099487305, "step": 5900 }, { - "epoch": 0.7103298820130026, - "eval_loss": 2.918956756591797, - "eval_runtime": 0.2129, - "eval_samples_per_second": 201.945, - "eval_steps_per_second": 4.696, + "epoch": 4.20827389443652, + "eval_loss": 2.890333890914917, + "eval_runtime": 0.2187, + "eval_samples_per_second": 196.595, + "eval_steps_per_second": 4.572, "step": 5900 }, { - "epoch": 0.7115338309655671, - "grad_norm": 7.40625, + "epoch": 4.215406562054208, + "grad_norm": 7.6875, "learning_rate": 8.484888888888888e-05, - "loss": 0.5883, + "loss": 0.8786, "step": 5910 }, { - "epoch": 0.7127377799181315, - "grad_norm": 7.0625, + "epoch": 4.222539229671897, + "grad_norm": 7.46875, "learning_rate": 8.480444444444445e-05, - "loss": 0.6805, + "loss": 0.8689, "step": 5920 }, { - "epoch": 0.7139417288706958, - "grad_norm": 5.25, + "epoch": 4.229671897289586, + "grad_norm": 14.125, "learning_rate": 8.476000000000001e-05, - "loss": 0.5638, + "loss": 0.83, "step": 5930 }, { - "epoch": 0.7151456778232603, - "grad_norm": 5.84375, + "epoch": 4.236804564907275, + "grad_norm": 6.09375, "learning_rate": 8.471555555555556e-05, - "loss": 0.6112, + "loss": 0.8921, "step": 5940 }, { - "epoch": 0.7163496267758247, - "grad_norm": 6.5625, + "epoch": 4.2439372325249645, + "grad_norm": 8.875, "learning_rate": 8.467111111111112e-05, - "loss": 0.6147, + "loss": 0.9293, "step": 5950 }, { - "epoch": 0.7175535757283891, - "grad_norm": 6.15625, + "epoch": 4.251069900142653, + "grad_norm": 10.5625, "learning_rate": 8.462666666666667e-05, - "loss": 0.7292, + "loss": 0.7955, "step": 5960 }, { - "epoch": 0.7187575246809536, - "grad_norm": 8.25, + "epoch": 4.258202567760343, + "grad_norm": 15.25, "learning_rate": 8.458222222222223e-05, - "loss": 0.6048, + "loss": 0.9267, "step": 5970 }, { - "epoch": 0.7199614736335179, - "grad_norm": 8.0625, + "epoch": 4.265335235378031, + "grad_norm": 8.0, "learning_rate": 8.453777777777778e-05, - "loss": 0.581, + "loss": 0.7665, "step": 5980 }, { - "epoch": 0.7211654225860824, - "grad_norm": 7.90625, + "epoch": 4.272467902995721, + "grad_norm": 6.4375, "learning_rate": 8.449333333333334e-05, - "loss": 0.6918, + "loss": 0.8212, "step": 5990 }, { - "epoch": 0.7223693715386468, - "grad_norm": 5.65625, + "epoch": 4.279600570613409, + "grad_norm": 8.0625, "learning_rate": 8.444888888888889e-05, - "loss": 0.6774, + "loss": 0.8294, "step": 6000 }, { - "epoch": 0.7223693715386468, - "eval/acc": 36.627906799316406, + "epoch": 4.279600570613409, + "eval/acc": 34.88372039794922, "step": 6000 }, { - "epoch": 0.7223693715386468, - "eval_loss": 2.936192512512207, - "eval_runtime": 0.2113, - "eval_samples_per_second": 203.531, - "eval_steps_per_second": 4.733, + "epoch": 4.279600570613409, + "eval_loss": 2.8812708854675293, + "eval_runtime": 0.2262, + "eval_samples_per_second": 190.082, + "eval_steps_per_second": 4.421, "step": 6000 }, { - "epoch": 0.7235733204912111, - "grad_norm": 7.59375, + "epoch": 4.286733238231099, + "grad_norm": 5.625, "learning_rate": 8.440444444444445e-05, - "loss": 0.5982, + "loss": 0.8813, "step": 6010 }, { - "epoch": 0.7247772694437756, - "grad_norm": 9.0625, + "epoch": 4.293865905848787, + "grad_norm": 8.375, "learning_rate": 8.436000000000001e-05, - "loss": 0.6048, + "loss": 0.8792, "step": 6020 }, { - "epoch": 0.72598121839634, - "grad_norm": 7.46875, + "epoch": 4.300998573466477, + "grad_norm": 9.125, "learning_rate": 8.431555555555556e-05, - "loss": 0.7024, + "loss": 0.9509, "step": 6030 }, { - "epoch": 0.7271851673489044, - "grad_norm": 8.0625, + "epoch": 4.3081312410841655, + "grad_norm": 7.34375, "learning_rate": 8.427111111111111e-05, - "loss": 0.7556, + "loss": 0.9452, "step": 6040 }, { - "epoch": 0.7283891163014689, - "grad_norm": 6.78125, + "epoch": 4.315263908701855, + "grad_norm": 8.25, "learning_rate": 8.422666666666667e-05, - "loss": 0.7187, + "loss": 0.8801, "step": 6050 }, { - "epoch": 0.7295930652540332, - "grad_norm": 6.8125, + "epoch": 4.3223965763195435, + "grad_norm": 6.75, "learning_rate": 8.418222222222223e-05, - "loss": 0.5774, + "loss": 0.805, "step": 6060 }, { - "epoch": 0.7307970142065976, - "grad_norm": 6.9375, + "epoch": 4.329529243937232, + "grad_norm": 8.375, "learning_rate": 8.413777777777778e-05, - "loss": 0.6724, + "loss": 0.8176, "step": 6070 }, { - "epoch": 0.7320009631591621, + "epoch": 4.336661911554922, "grad_norm": 6.1875, "learning_rate": 8.409333333333333e-05, - "loss": 0.6109, + "loss": 0.8662, "step": 6080 }, { - "epoch": 0.7332049121117264, - "grad_norm": 5.84375, + "epoch": 4.34379457917261, + "grad_norm": 6.03125, "learning_rate": 8.404888888888889e-05, - "loss": 0.6251, + "loss": 0.9121, "step": 6090 }, { - "epoch": 0.7344088610642908, - "grad_norm": 6.78125, + "epoch": 4.3509272467903, + "grad_norm": 5.6875, "learning_rate": 8.400444444444445e-05, - "loss": 0.6916, + "loss": 0.8697, "step": 6100 }, { - "epoch": 0.7344088610642908, - "eval/acc": 32.55813980102539, + "epoch": 4.3509272467903, + "eval/acc": 39.53488540649414, "step": 6100 }, { - "epoch": 0.7344088610642908, - "eval_loss": 2.947686195373535, - "eval_runtime": 0.2088, - "eval_samples_per_second": 205.91, - "eval_steps_per_second": 4.789, + "epoch": 4.3509272467903, + "eval_loss": 2.7605249881744385, + "eval_runtime": 0.2226, + "eval_samples_per_second": 193.191, + "eval_steps_per_second": 4.493, "step": 6100 }, { - "epoch": 0.7356128100168553, - "grad_norm": 6.96875, + "epoch": 4.358059914407988, + "grad_norm": 8.125, "learning_rate": 8.396e-05, - "loss": 0.6525, + "loss": 0.783, "step": 6110 }, { - "epoch": 0.7368167589694197, - "grad_norm": 9.625, + "epoch": 4.365192582025678, + "grad_norm": 6.71875, "learning_rate": 8.391555555555556e-05, - "loss": 0.6107, + "loss": 0.7273, "step": 6120 }, { - "epoch": 0.7380207079219842, - "grad_norm": 5.84375, + "epoch": 4.372325249643366, + "grad_norm": 7.625, "learning_rate": 8.387111111111111e-05, - "loss": 0.6339, + "loss": 0.9497, "step": 6130 }, { - "epoch": 0.7392246568745485, - "grad_norm": 8.0, + "epoch": 4.379457917261056, + "grad_norm": 7.625, "learning_rate": 8.382666666666667e-05, - "loss": 0.6243, + "loss": 0.9318, "step": 6140 }, { - "epoch": 0.7404286058271129, - "grad_norm": 7.9375, + "epoch": 4.3865905848787445, + "grad_norm": 7.5625, "learning_rate": 8.378222222222222e-05, - "loss": 0.6644, + "loss": 0.7827, "step": 6150 }, { - "epoch": 0.7416325547796774, + "epoch": 4.393723252496434, "grad_norm": 7.4375, "learning_rate": 8.373777777777779e-05, - "loss": 0.6117, + "loss": 0.8471, "step": 6160 }, { - "epoch": 0.7428365037322417, - "grad_norm": 7.28125, + "epoch": 4.400855920114123, + "grad_norm": 5.59375, "learning_rate": 8.369333333333333e-05, - "loss": 0.6253, + "loss": 0.866, "step": 6170 }, { - "epoch": 0.7440404526848061, - "grad_norm": 6.59375, + "epoch": 4.407988587731811, + "grad_norm": 5.34375, "learning_rate": 8.364888888888888e-05, - "loss": 0.5973, + "loss": 0.8237, "step": 6180 }, { - "epoch": 0.7452444016373706, - "grad_norm": 8.5, + "epoch": 4.415121255349501, + "grad_norm": 9.375, "learning_rate": 8.360444444444446e-05, - "loss": 0.5938, + "loss": 0.896, "step": 6190 }, { - "epoch": 0.746448350589935, - "grad_norm": 6.40625, + "epoch": 4.422253922967189, + "grad_norm": 7.78125, "learning_rate": 8.356e-05, - "loss": 0.7276, + "loss": 0.8402, "step": 6200 }, { - "epoch": 0.746448350589935, - "eval/acc": 34.88372039794922, + "epoch": 4.422253922967189, + "eval/acc": 37.20930099487305, "step": 6200 }, { - "epoch": 0.746448350589935, - "eval_loss": 3.0573887825012207, - "eval_runtime": 0.2067, - "eval_samples_per_second": 208.014, - "eval_steps_per_second": 4.838, + "epoch": 4.422253922967189, + "eval_loss": 2.8444175720214844, + "eval_runtime": 0.2217, + "eval_samples_per_second": 193.997, + "eval_steps_per_second": 4.512, "step": 6200 }, { - "epoch": 0.7476522995424993, - "grad_norm": 6.75, + "epoch": 4.429386590584879, + "grad_norm": 7.625, "learning_rate": 8.351555555555555e-05, - "loss": 0.6518, + "loss": 0.8708, "step": 6210 }, { - "epoch": 0.7488562484950638, - "grad_norm": 6.5, + "epoch": 4.436519258202567, + "grad_norm": 7.28125, "learning_rate": 8.347111111111112e-05, - "loss": 0.5737, + "loss": 0.8505, "step": 6220 }, { - "epoch": 0.7500601974476282, - "grad_norm": 7.96875, + "epoch": 4.443651925820257, + "grad_norm": 7.28125, "learning_rate": 8.342666666666668e-05, - "loss": 0.743, + "loss": 0.878, "step": 6230 }, { - "epoch": 0.7512641464001927, - "grad_norm": 8.375, + "epoch": 4.4507845934379455, + "grad_norm": 8.0, "learning_rate": 8.338222222222223e-05, - "loss": 0.6803, + "loss": 0.7568, "step": 6240 }, { - "epoch": 0.752468095352757, - "grad_norm": 10.9375, + "epoch": 4.457917261055635, + "grad_norm": 7.28125, "learning_rate": 8.333777777777778e-05, - "loss": 0.8047, + "loss": 0.7909, "step": 6250 }, { - "epoch": 0.7536720443053214, - "grad_norm": 6.21875, + "epoch": 4.465049928673324, + "grad_norm": 10.625, "learning_rate": 8.329333333333334e-05, - "loss": 0.5941, + "loss": 0.8732, "step": 6260 }, { - "epoch": 0.7548759932578859, - "grad_norm": 7.0, + "epoch": 4.472182596291013, + "grad_norm": 7.40625, "learning_rate": 8.324888888888889e-05, - "loss": 0.673, + "loss": 0.8827, "step": 6270 }, { - "epoch": 0.7560799422104503, - "grad_norm": 5.6875, + "epoch": 4.479315263908702, + "grad_norm": 11.25, "learning_rate": 8.320444444444445e-05, - "loss": 0.6869, + "loss": 0.7889, "step": 6280 }, { - "epoch": 0.7572838911630146, - "grad_norm": 7.46875, + "epoch": 4.486447931526391, + "grad_norm": 7.59375, "learning_rate": 8.316000000000001e-05, - "loss": 0.7399, + "loss": 0.7808, "step": 6290 }, { - "epoch": 0.7584878401155791, - "grad_norm": 7.21875, + "epoch": 4.49358059914408, + "grad_norm": 5.40625, "learning_rate": 8.311555555555556e-05, - "loss": 0.6582, + "loss": 0.8223, "step": 6300 }, { - "epoch": 0.7584878401155791, - "eval/acc": 34.88372039794922, + "epoch": 4.49358059914408, + "eval/acc": 37.20930099487305, "step": 6300 }, { - "epoch": 0.7584878401155791, - "eval_loss": 2.991325616836548, - "eval_runtime": 0.2058, - "eval_samples_per_second": 208.93, - "eval_steps_per_second": 4.859, + "epoch": 4.49358059914408, + "eval_loss": 2.798743963241577, + "eval_runtime": 0.2178, + "eval_samples_per_second": 197.44, + "eval_steps_per_second": 4.592, "step": 6300 }, { - "epoch": 0.7596917890681435, - "grad_norm": 7.5625, + "epoch": 4.500713266761769, + "grad_norm": 7.9375, "learning_rate": 8.307111111111111e-05, - "loss": 0.6455, + "loss": 0.8588, "step": 6310 }, { - "epoch": 0.7608957380207079, - "grad_norm": 5.0625, + "epoch": 4.507845934379458, + "grad_norm": 8.0625, "learning_rate": 8.302666666666667e-05, - "loss": 0.6269, + "loss": 0.9003, "step": 6320 }, { - "epoch": 0.7620996869732723, - "grad_norm": 7.15625, + "epoch": 4.5149786019971465, + "grad_norm": 7.21875, "learning_rate": 8.298222222222223e-05, - "loss": 0.6453, + "loss": 0.8942, "step": 6330 }, { - "epoch": 0.7633036359258367, - "grad_norm": 6.34375, + "epoch": 4.522111269614836, + "grad_norm": 7.625, "learning_rate": 8.293777777777778e-05, - "loss": 0.6721, + "loss": 0.8622, "step": 6340 }, { - "epoch": 0.7645075848784012, - "grad_norm": 7.59375, + "epoch": 4.529243937232525, + "grad_norm": 5.53125, "learning_rate": 8.289333333333333e-05, - "loss": 0.569, + "loss": 0.8048, "step": 6350 }, { - "epoch": 0.7657115338309656, - "grad_norm": 6.78125, + "epoch": 4.536376604850214, + "grad_norm": 9.125, "learning_rate": 8.28488888888889e-05, - "loss": 0.6221, + "loss": 0.8506, "step": 6360 }, { - "epoch": 0.76691548278353, - "grad_norm": 9.875, + "epoch": 4.543509272467903, + "grad_norm": 6.125, "learning_rate": 8.280444444444445e-05, - "loss": 0.6623, + "loss": 0.7767, "step": 6370 }, { - "epoch": 0.7681194317360944, - "grad_norm": 7.125, + "epoch": 4.550641940085592, + "grad_norm": 6.90625, "learning_rate": 8.276e-05, - "loss": 0.7166, + "loss": 0.9143, "step": 6380 }, { - "epoch": 0.7693233806886588, - "grad_norm": 7.59375, + "epoch": 4.557774607703281, + "grad_norm": 5.84375, "learning_rate": 8.271555555555556e-05, - "loss": 0.6984, + "loss": 0.8641, "step": 6390 }, { - "epoch": 0.7705273296412232, - "grad_norm": 9.4375, + "epoch": 4.56490727532097, + "grad_norm": 6.3125, "learning_rate": 8.267111111111111e-05, - "loss": 0.7095, + "loss": 0.8297, "step": 6400 }, { - "epoch": 0.7705273296412232, - "eval/acc": 34.88372039794922, + "epoch": 4.56490727532097, + "eval/acc": 37.20930099487305, "step": 6400 }, { - "epoch": 0.7705273296412232, - "eval_loss": 3.0461771488189697, - "eval_runtime": 0.208, - "eval_samples_per_second": 206.746, - "eval_steps_per_second": 4.808, + "epoch": 4.56490727532097, + "eval_loss": 2.804457426071167, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.742, + "eval_steps_per_second": 4.529, "step": 6400 }, { - "epoch": 0.7717312785937877, - "grad_norm": 9.375, + "epoch": 4.572039942938659, + "grad_norm": 7.15625, "learning_rate": 8.262666666666667e-05, - "loss": 0.6975, + "loss": 0.7398, "step": 6410 }, { - "epoch": 0.772935227546352, - "grad_norm": 6.75, + "epoch": 4.579172610556348, + "grad_norm": 6.125, "learning_rate": 8.258222222222222e-05, - "loss": 0.5826, + "loss": 0.8443, "step": 6420 }, { - "epoch": 0.7741391764989164, - "grad_norm": 8.25, + "epoch": 4.586305278174037, + "grad_norm": 9.25, "learning_rate": 8.253777777777778e-05, - "loss": 0.6596, + "loss": 0.7983, "step": 6430 }, { - "epoch": 0.7753431254514809, - "grad_norm": 6.375, + "epoch": 4.5934379457917265, + "grad_norm": 7.3125, "learning_rate": 8.249333333333333e-05, - "loss": 0.6624, + "loss": 0.9705, "step": 6440 }, { - "epoch": 0.7765470744040452, - "grad_norm": 7.375, + "epoch": 4.600570613409415, + "grad_norm": 7.34375, "learning_rate": 8.24488888888889e-05, - "loss": 0.6221, + "loss": 1.0079, "step": 6450 }, { - "epoch": 0.7777510233566097, - "grad_norm": 8.125, + "epoch": 4.607703281027105, + "grad_norm": 8.875, "learning_rate": 8.240444444444446e-05, - "loss": 0.6819, + "loss": 0.8982, "step": 6460 }, { - "epoch": 0.7789549723091741, - "grad_norm": 4.375, + "epoch": 4.614835948644793, + "grad_norm": 8.375, "learning_rate": 8.236e-05, - "loss": 0.588, + "loss": 0.8417, "step": 6470 }, { - "epoch": 0.7801589212617385, - "grad_norm": 8.875, + "epoch": 4.621968616262482, + "grad_norm": 7.78125, "learning_rate": 8.231555555555555e-05, - "loss": 0.7451, + "loss": 0.8566, "step": 6480 }, { - "epoch": 0.781362870214303, - "grad_norm": 8.5, + "epoch": 4.629101283880171, + "grad_norm": 6.5625, "learning_rate": 8.227111111111111e-05, - "loss": 0.64, + "loss": 0.8155, "step": 6490 }, { - "epoch": 0.7825668191668673, - "grad_norm": 6.59375, + "epoch": 4.63623395149786, + "grad_norm": 5.875, "learning_rate": 8.222666666666668e-05, - "loss": 0.6879, + "loss": 0.9449, "step": 6500 }, { - "epoch": 0.7825668191668673, - "eval/acc": 32.55813980102539, + "epoch": 4.63623395149786, + "eval/acc": 41.86046600341797, "step": 6500 }, { - "epoch": 0.7825668191668673, - "eval_loss": 2.970376491546631, - "eval_runtime": 0.2075, - "eval_samples_per_second": 207.198, - "eval_steps_per_second": 4.819, + "epoch": 4.63623395149786, + "eval_loss": 2.761596918106079, + "eval_runtime": 0.2144, + "eval_samples_per_second": 200.549, + "eval_steps_per_second": 4.664, "step": 6500 }, { - "epoch": 0.7837707681194317, - "grad_norm": 6.96875, + "epoch": 4.643366619115549, + "grad_norm": 7.5, "learning_rate": 8.218222222222223e-05, - "loss": 0.6584, + "loss": 0.8549, "step": 6510 }, { - "epoch": 0.7849747170719962, - "grad_norm": 7.3125, + "epoch": 4.650499286733238, + "grad_norm": 7.0625, "learning_rate": 8.213777777777777e-05, - "loss": 0.6892, + "loss": 0.8473, "step": 6520 }, { - "epoch": 0.7861786660245605, - "grad_norm": 6.28125, + "epoch": 4.6576319543509275, + "grad_norm": 7.1875, "learning_rate": 8.209333333333334e-05, - "loss": 0.6658, + "loss": 0.8773, "step": 6530 }, { - "epoch": 0.7873826149771249, - "grad_norm": 7.3125, + "epoch": 4.664764621968616, + "grad_norm": 7.25, "learning_rate": 8.20488888888889e-05, - "loss": 0.6379, + "loss": 0.789, "step": 6540 }, { - "epoch": 0.7885865639296894, - "grad_norm": 6.09375, + "epoch": 4.671897289586306, + "grad_norm": 7.34375, "learning_rate": 8.200444444444445e-05, - "loss": 0.5797, + "loss": 0.852, "step": 6550 }, { - "epoch": 0.7897905128822538, - "grad_norm": 7.03125, + "epoch": 4.679029957203994, + "grad_norm": 5.65625, "learning_rate": 8.196000000000001e-05, - "loss": 0.6778, + "loss": 0.8291, "step": 6560 }, { - "epoch": 0.7909944618348183, - "grad_norm": 7.46875, + "epoch": 4.686162624821684, + "grad_norm": 5.5625, "learning_rate": 8.191555555555556e-05, - "loss": 0.669, + "loss": 0.7943, "step": 6570 }, { - "epoch": 0.7921984107873826, - "grad_norm": 7.46875, + "epoch": 4.693295292439372, + "grad_norm": 9.25, "learning_rate": 8.18711111111111e-05, - "loss": 0.7272, + "loss": 0.8418, "step": 6580 }, { - "epoch": 0.793402359739947, - "grad_norm": 6.3125, + "epoch": 4.700427960057061, + "grad_norm": 6.75, "learning_rate": 8.182666666666667e-05, - "loss": 0.5767, + "loss": 0.8661, "step": 6590 }, { - "epoch": 0.7946063086925115, - "grad_norm": 7.28125, + "epoch": 4.70756062767475, + "grad_norm": 7.40625, "learning_rate": 8.178222222222223e-05, - "loss": 0.6776, + "loss": 0.768, "step": 6600 }, { - "epoch": 0.7946063086925115, - "eval/acc": 34.88372039794922, + "epoch": 4.70756062767475, + "eval/acc": 41.86046600341797, "step": 6600 }, { - "epoch": 0.7946063086925115, - "eval_loss": 2.941105842590332, - "eval_runtime": 0.2071, - "eval_samples_per_second": 207.595, - "eval_steps_per_second": 4.828, + "epoch": 4.70756062767475, + "eval_loss": 2.8003947734832764, + "eval_runtime": 0.2209, + "eval_samples_per_second": 194.665, + "eval_steps_per_second": 4.527, "step": 6600 }, { - "epoch": 0.7958102576450758, - "grad_norm": 7.125, + "epoch": 4.71469329529244, + "grad_norm": 7.1875, "learning_rate": 8.173777777777778e-05, - "loss": 0.6368, + "loss": 0.9038, "step": 6610 }, { - "epoch": 0.7970142065976402, - "grad_norm": 6.34375, + "epoch": 4.7218259629101285, + "grad_norm": 6.46875, "learning_rate": 8.169333333333334e-05, - "loss": 0.6504, + "loss": 0.7185, "step": 6620 }, { - "epoch": 0.7982181555502047, - "grad_norm": 5.46875, + "epoch": 4.728958630527817, + "grad_norm": 6.3125, "learning_rate": 8.16488888888889e-05, - "loss": 0.6305, + "loss": 0.9515, "step": 6630 }, { - "epoch": 0.7994221045027691, - "grad_norm": 6.3125, + "epoch": 4.736091298145507, + "grad_norm": 6.46875, "learning_rate": 8.160444444444445e-05, - "loss": 0.6538, + "loss": 0.8127, "step": 6640 }, { - "epoch": 0.8006260534553334, - "grad_norm": 9.0625, + "epoch": 4.743223965763195, + "grad_norm": 6.4375, "learning_rate": 8.156e-05, - "loss": 0.6747, + "loss": 0.8914, "step": 6650 }, { - "epoch": 0.8018300024078979, - "grad_norm": 13.0, + "epoch": 4.750356633380885, + "grad_norm": 6.8125, "learning_rate": 8.151555555555556e-05, - "loss": 0.6412, + "loss": 0.8545, "step": 6660 }, { - "epoch": 0.8030339513604623, - "grad_norm": 7.0, + "epoch": 4.757489300998573, + "grad_norm": 7.21875, "learning_rate": 8.147111111111112e-05, - "loss": 0.6479, + "loss": 0.6783, "step": 6670 }, { - "epoch": 0.8042379003130268, - "grad_norm": 7.375, + "epoch": 4.764621968616263, + "grad_norm": 7.03125, "learning_rate": 8.142666666666667e-05, - "loss": 0.6577, + "loss": 0.9337, "step": 6680 }, { - "epoch": 0.8054418492655911, - "grad_norm": 7.625, + "epoch": 4.771754636233951, + "grad_norm": 10.5625, "learning_rate": 8.138222222222223e-05, - "loss": 0.7217, + "loss": 0.8181, "step": 6690 }, { - "epoch": 0.8066457982181555, - "grad_norm": 5.625, + "epoch": 4.778887303851641, + "grad_norm": 7.375, "learning_rate": 8.133777777777778e-05, - "loss": 0.6363, + "loss": 0.8639, "step": 6700 }, { - "epoch": 0.8066457982181555, - "eval/acc": 34.88372039794922, + "epoch": 4.778887303851641, + "eval/acc": 37.20930099487305, "step": 6700 }, { - "epoch": 0.8066457982181555, - "eval_loss": 2.8945717811584473, - "eval_runtime": 0.2054, - "eval_samples_per_second": 209.381, - "eval_steps_per_second": 4.869, + "epoch": 4.778887303851641, + "eval_loss": 2.8262782096862793, + "eval_runtime": 0.2194, + "eval_samples_per_second": 195.949, + "eval_steps_per_second": 4.557, "step": 6700 }, { - "epoch": 0.80784974717072, - "grad_norm": 8.0625, + "epoch": 4.7860199714693294, + "grad_norm": 10.8125, "learning_rate": 8.129333333333333e-05, - "loss": 0.6784, + "loss": 0.8742, "step": 6710 }, { - "epoch": 0.8090536961232844, - "grad_norm": 9.1875, + "epoch": 4.793152639087019, + "grad_norm": 5.53125, "learning_rate": 8.124888888888889e-05, - "loss": 0.6187, + "loss": 0.7438, "step": 6720 }, { - "epoch": 0.8102576450758487, - "grad_norm": 9.1875, + "epoch": 4.8002853067047075, + "grad_norm": 6.65625, "learning_rate": 8.120444444444445e-05, - "loss": 0.6461, + "loss": 0.7859, "step": 6730 }, { - "epoch": 0.8114615940284132, - "grad_norm": 7.375, + "epoch": 4.807417974322396, + "grad_norm": 6.78125, "learning_rate": 8.116e-05, - "loss": 0.7325, + "loss": 0.8942, "step": 6740 }, { - "epoch": 0.8126655429809776, - "grad_norm": 7.71875, + "epoch": 4.814550641940086, + "grad_norm": 8.4375, "learning_rate": 8.111555555555555e-05, - "loss": 0.6758, + "loss": 0.8483, "step": 6750 }, { - "epoch": 0.813869491933542, - "grad_norm": 10.125, + "epoch": 4.821683309557774, + "grad_norm": 6.40625, "learning_rate": 8.107111111111113e-05, - "loss": 0.6223, + "loss": 0.8284, "step": 6760 }, { - "epoch": 0.8150734408861064, - "grad_norm": 7.90625, + "epoch": 4.828815977175464, + "grad_norm": 6.84375, "learning_rate": 8.102666666666667e-05, - "loss": 0.6115, + "loss": 0.8887, "step": 6770 }, { - "epoch": 0.8162773898386708, - "grad_norm": 5.375, + "epoch": 4.835948644793152, + "grad_norm": 8.875, "learning_rate": 8.098222222222222e-05, - "loss": 0.5747, + "loss": 0.8431, "step": 6780 }, { - "epoch": 0.8174813387912353, - "grad_norm": 7.375, + "epoch": 4.843081312410842, + "grad_norm": 6.90625, "learning_rate": 8.093777777777779e-05, - "loss": 0.618, + "loss": 0.8325, "step": 6790 }, { - "epoch": 0.8186852877437997, - "grad_norm": 7.125, + "epoch": 4.85021398002853, + "grad_norm": 7.0, "learning_rate": 8.089333333333333e-05, - "loss": 0.6603, + "loss": 0.7742, "step": 6800 }, { - "epoch": 0.8186852877437997, - "eval/acc": 34.88372039794922, + "epoch": 4.85021398002853, + "eval/acc": 39.53488540649414, "step": 6800 }, { - "epoch": 0.8186852877437997, - "eval_loss": 2.9451656341552734, - "eval_runtime": 1.2476, - "eval_samples_per_second": 34.466, - "eval_steps_per_second": 0.802, + "epoch": 4.85021398002853, + "eval_loss": 2.7403292655944824, + "eval_runtime": 0.5509, + "eval_samples_per_second": 78.059, + "eval_steps_per_second": 1.815, "step": 6800 }, { - "epoch": 0.819889236696364, - "grad_norm": 6.28125, + "epoch": 4.85734664764622, + "grad_norm": 6.625, "learning_rate": 8.08488888888889e-05, - "loss": 0.5918, + "loss": 0.8418, "step": 6810 }, { - "epoch": 0.8210931856489285, - "grad_norm": 8.6875, + "epoch": 4.8644793152639085, + "grad_norm": 7.65625, "learning_rate": 8.080444444444444e-05, - "loss": 0.5911, + "loss": 0.9022, "step": 6820 }, { - "epoch": 0.8222971346014929, - "grad_norm": 6.75, + "epoch": 4.871611982881598, + "grad_norm": 7.75, "learning_rate": 8.076e-05, - "loss": 0.6648, + "loss": 0.8201, "step": 6830 }, { - "epoch": 0.8235010835540573, - "grad_norm": 6.78125, + "epoch": 4.878744650499287, + "grad_norm": 7.84375, "learning_rate": 8.071555555555555e-05, - "loss": 0.6044, + "loss": 0.8144, "step": 6840 }, { - "epoch": 0.8247050325066217, - "grad_norm": 15.1875, + "epoch": 4.885877318116976, + "grad_norm": 8.3125, "learning_rate": 8.067111111111112e-05, - "loss": 0.6896, + "loss": 0.8821, "step": 6850 }, { - "epoch": 0.8259089814591861, - "grad_norm": 7.6875, + "epoch": 4.893009985734665, + "grad_norm": 9.0, "learning_rate": 8.062666666666668e-05, - "loss": 0.5829, + "loss": 0.8572, "step": 6860 }, { - "epoch": 0.8271129304117505, - "grad_norm": 5.21875, + "epoch": 4.900142653352354, + "grad_norm": 10.0, "learning_rate": 8.058222222222223e-05, - "loss": 0.6934, + "loss": 0.7498, "step": 6870 }, { - "epoch": 0.828316879364315, - "grad_norm": 10.375, + "epoch": 4.907275320970043, + "grad_norm": 6.09375, "learning_rate": 8.053777777777778e-05, - "loss": 0.7309, + "loss": 0.8709, "step": 6880 }, { - "epoch": 0.8295208283168793, - "grad_norm": 8.1875, + "epoch": 4.914407988587731, + "grad_norm": 7.84375, "learning_rate": 8.049333333333334e-05, - "loss": 0.7213, + "loss": 0.8045, "step": 6890 }, { - "epoch": 0.8307247772694438, - "grad_norm": 5.15625, + "epoch": 4.921540656205421, + "grad_norm": 7.0625, "learning_rate": 8.04488888888889e-05, - "loss": 0.6034, + "loss": 0.8919, "step": 6900 }, { - "epoch": 0.8307247772694438, - "eval/acc": 32.55813980102539, + "epoch": 4.921540656205421, + "eval/acc": 34.88372039794922, "step": 6900 }, { - "epoch": 0.8307247772694438, - "eval_loss": 2.8601129055023193, - "eval_runtime": 0.2074, - "eval_samples_per_second": 207.302, - "eval_steps_per_second": 4.821, + "epoch": 4.921540656205421, + "eval_loss": 2.8702921867370605, + "eval_runtime": 0.2215, + "eval_samples_per_second": 194.143, + "eval_steps_per_second": 4.515, "step": 6900 }, { - "epoch": 0.8319287262220082, - "grad_norm": 7.25, + "epoch": 4.9286733238231095, + "grad_norm": 18.125, "learning_rate": 8.040444444444445e-05, - "loss": 0.5585, + "loss": 0.8407, "step": 6910 }, { - "epoch": 0.8331326751745726, - "grad_norm": 5.9375, + "epoch": 4.935805991440799, + "grad_norm": 7.8125, "learning_rate": 8.036e-05, - "loss": 0.7539, + "loss": 0.9023, "step": 6920 }, { - "epoch": 0.834336624127137, - "grad_norm": 8.0, + "epoch": 4.942938659058488, + "grad_norm": 6.53125, "learning_rate": 8.031555555555556e-05, - "loss": 0.6104, + "loss": 0.7747, "step": 6930 }, { - "epoch": 0.8355405730797014, - "grad_norm": 7.4375, + "epoch": 4.950071326676177, + "grad_norm": 7.3125, "learning_rate": 8.027111111111112e-05, - "loss": 0.613, + "loss": 0.7357, "step": 6940 }, { - "epoch": 0.8367445220322658, - "grad_norm": 8.1875, + "epoch": 4.957203994293866, + "grad_norm": 5.71875, "learning_rate": 8.022666666666667e-05, - "loss": 0.6647, + "loss": 0.8914, "step": 6950 }, { - "epoch": 0.8379484709848303, - "grad_norm": 7.4375, + "epoch": 4.964336661911555, + "grad_norm": 7.9375, "learning_rate": 8.018222222222223e-05, - "loss": 0.7037, + "loss": 0.8626, "step": 6960 }, { - "epoch": 0.8391524199373946, - "grad_norm": 7.25, + "epoch": 4.971469329529244, + "grad_norm": 6.9375, "learning_rate": 8.013777777777778e-05, - "loss": 0.5853, + "loss": 0.8388, "step": 6970 }, { - "epoch": 0.840356368889959, - "grad_norm": 8.75, + "epoch": 4.978601997146933, + "grad_norm": 6.5, "learning_rate": 8.009333333333334e-05, - "loss": 0.6264, + "loss": 0.8321, "step": 6980 }, { - "epoch": 0.8415603178425235, - "grad_norm": 8.4375, + "epoch": 4.985734664764622, + "grad_norm": 6.6875, "learning_rate": 8.004888888888889e-05, - "loss": 0.6221, + "loss": 0.8276, "step": 6990 }, { - "epoch": 0.8427642667950879, - "grad_norm": 8.3125, + "epoch": 4.9928673323823105, + "grad_norm": 10.5625, "learning_rate": 8.000444444444445e-05, - "loss": 0.6408, + "loss": 0.8847, "step": 7000 }, { - "epoch": 0.8427642667950879, - "eval/acc": 33.72093200683594, + "epoch": 4.9928673323823105, + "eval/acc": 39.53488540649414, "step": 7000 }, { - "epoch": 0.8427642667950879, - "eval_loss": 2.9269802570343018, - "eval_runtime": 0.2045, - "eval_samples_per_second": 210.301, - "eval_steps_per_second": 4.891, + "epoch": 4.9928673323823105, + "eval_loss": 2.7940218448638916, + "eval_runtime": 0.2239, + "eval_samples_per_second": 192.063, + "eval_steps_per_second": 4.467, "step": 7000 }, { - "epoch": 0.8439682157476524, - "grad_norm": 9.125, + "epoch": 5.0, + "grad_norm": 7.1875, "learning_rate": 7.996e-05, - "loss": 0.6321, + "loss": 0.9472, "step": 7010 }, { - "epoch": 0.8451721647002167, - "grad_norm": 7.125, + "epoch": 5.007132667617689, + "grad_norm": 7.25, "learning_rate": 7.991555555555555e-05, - "loss": 0.5927, + "loss": 0.9009, "step": 7020 }, { - "epoch": 0.8463761136527811, - "grad_norm": 7.65625, + "epoch": 5.014265335235378, + "grad_norm": 7.34375, "learning_rate": 7.987111111111112e-05, - "loss": 0.6574, + "loss": 0.8805, "step": 7030 }, { - "epoch": 0.8475800626053456, - "grad_norm": 7.0, + "epoch": 5.021398002853067, + "grad_norm": 5.78125, "learning_rate": 7.982666666666667e-05, - "loss": 0.7185, + "loss": 0.8475, "step": 7040 }, { - "epoch": 0.84878401155791, - "grad_norm": 7.3125, + "epoch": 5.028530670470756, + "grad_norm": 5.53125, "learning_rate": 7.978222222222222e-05, - "loss": 0.7157, + "loss": 0.7598, "step": 7050 }, { - "epoch": 0.8499879605104743, - "grad_norm": 5.6875, + "epoch": 5.035663338088445, + "grad_norm": 6.25, "learning_rate": 7.973777777777778e-05, - "loss": 0.606, + "loss": 0.8605, "step": 7060 }, { - "epoch": 0.8511919094630388, - "grad_norm": 6.28125, + "epoch": 5.042796005706134, + "grad_norm": 7.46875, "learning_rate": 7.969333333333335e-05, - "loss": 0.6493, + "loss": 0.9293, "step": 7070 }, { - "epoch": 0.8523958584156032, - "grad_norm": 7.8125, + "epoch": 5.049928673323823, + "grad_norm": 5.9375, "learning_rate": 7.96488888888889e-05, - "loss": 0.6123, + "loss": 0.7984, "step": 7080 }, { - "epoch": 0.8535998073681675, + "epoch": 5.057061340941512, "grad_norm": 8.375, "learning_rate": 7.960444444444444e-05, - "loss": 0.6035, + "loss": 0.8222, "step": 7090 }, { - "epoch": 0.854803756320732, - "grad_norm": 7.78125, + "epoch": 5.064194008559201, + "grad_norm": 6.9375, "learning_rate": 7.956e-05, - "loss": 0.5902, + "loss": 0.8535, "step": 7100 }, { - "epoch": 0.854803756320732, - "eval/acc": 37.20930099487305, + "epoch": 5.064194008559201, + "eval/acc": 41.86046600341797, "step": 7100 }, { - "epoch": 0.854803756320732, - "eval_loss": 2.926543712615967, - "eval_runtime": 0.2114, - "eval_samples_per_second": 203.37, - "eval_steps_per_second": 4.73, + "epoch": 5.064194008559201, + "eval_loss": 2.631981134414673, + "eval_runtime": 2.5832, + "eval_samples_per_second": 16.646, + "eval_steps_per_second": 0.387, "step": 7100 }, { - "epoch": 0.8560077052732964, - "grad_norm": 6.0625, + "epoch": 5.0713266761768905, + "grad_norm": 6.5625, "learning_rate": 7.951555555555555e-05, - "loss": 0.6464, + "loss": 0.8668, "step": 7110 }, { - "epoch": 0.8572116542258609, + "epoch": 5.078459343794579, "grad_norm": 9.0, "learning_rate": 7.947111111111111e-05, - "loss": 0.7656, + "loss": 0.8142, "step": 7120 }, { - "epoch": 0.8584156031784252, - "grad_norm": 7.21875, + "epoch": 5.085592011412269, + "grad_norm": 8.3125, "learning_rate": 7.942666666666668e-05, - "loss": 0.5546, + "loss": 0.9271, "step": 7130 }, { - "epoch": 0.8596195521309896, - "grad_norm": 8.5, + "epoch": 5.092724679029957, + "grad_norm": 7.875, "learning_rate": 7.938222222222222e-05, - "loss": 0.6538, + "loss": 0.8213, "step": 7140 }, { - "epoch": 0.8608235010835541, - "grad_norm": 8.0625, + "epoch": 5.099857346647646, + "grad_norm": 6.8125, "learning_rate": 7.933777777777777e-05, - "loss": 0.7057, + "loss": 0.8511, "step": 7150 }, { - "epoch": 0.8620274500361185, - "grad_norm": 7.34375, + "epoch": 5.106990014265335, + "grad_norm": 7.53125, "learning_rate": 7.929333333333334e-05, - "loss": 0.6287, + "loss": 0.8525, "step": 7160 }, { - "epoch": 0.8632313989886828, - "grad_norm": 6.53125, + "epoch": 5.114122681883024, + "grad_norm": 7.21875, "learning_rate": 7.92488888888889e-05, - "loss": 0.6231, + "loss": 0.8554, "step": 7170 }, { - "epoch": 0.8644353479412473, - "grad_norm": 18.5, + "epoch": 5.121255349500713, + "grad_norm": 6.84375, "learning_rate": 7.920444444444445e-05, - "loss": 0.664, + "loss": 0.8128, "step": 7180 }, { - "epoch": 0.8656392968938117, - "grad_norm": 8.875, + "epoch": 5.128388017118402, + "grad_norm": 7.84375, "learning_rate": 7.916e-05, - "loss": 0.6286, + "loss": 0.7726, "step": 7190 }, { - "epoch": 0.8668432458463761, - "grad_norm": 6.0625, + "epoch": 5.1355206847360915, + "grad_norm": 7.78125, "learning_rate": 7.911555555555556e-05, - "loss": 0.6808, + "loss": 0.8902, "step": 7200 }, { - "epoch": 0.8668432458463761, + "epoch": 5.1355206847360915, "eval/acc": 37.20930099487305, "step": 7200 }, { - "epoch": 0.8668432458463761, - "eval_loss": 2.9467363357543945, - "eval_runtime": 0.2052, - "eval_samples_per_second": 209.502, - "eval_steps_per_second": 4.872, + "epoch": 5.1355206847360915, + "eval_loss": 2.5633885860443115, + "eval_runtime": 0.2541, + "eval_samples_per_second": 169.248, + "eval_steps_per_second": 3.936, "step": 7200 }, { - "epoch": 0.8680471947989405, - "grad_norm": 7.9375, + "epoch": 5.14265335235378, + "grad_norm": 6.8125, "learning_rate": 7.907111111111112e-05, - "loss": 0.6626, + "loss": 0.7482, "step": 7210 }, { - "epoch": 0.8692511437515049, - "grad_norm": 7.15625, + "epoch": 5.14978601997147, + "grad_norm": 42.0, "learning_rate": 7.902666666666667e-05, - "loss": 0.7685, + "loss": 0.9007, "step": 7220 }, { - "epoch": 0.8704550927040694, - "grad_norm": 10.3125, + "epoch": 5.156918687589158, + "grad_norm": 6.0625, "learning_rate": 7.898222222222223e-05, - "loss": 0.6848, + "loss": 0.8643, "step": 7230 }, { - "epoch": 0.8716590416566338, - "grad_norm": 7.21875, + "epoch": 5.164051355206848, + "grad_norm": 7.03125, "learning_rate": 7.893777777777778e-05, - "loss": 0.6433, + "loss": 0.8899, "step": 7240 }, { - "epoch": 0.8728629906091981, - "grad_norm": 6.34375, + "epoch": 5.171184022824536, + "grad_norm": 7.53125, "learning_rate": 7.889333333333334e-05, - "loss": 0.6121, + "loss": 0.7462, "step": 7250 }, { - "epoch": 0.8740669395617626, - "grad_norm": 7.40625, + "epoch": 5.178316690442226, + "grad_norm": 7.21875, "learning_rate": 7.884888888888889e-05, - "loss": 0.6391, + "loss": 0.9199, "step": 7260 }, { - "epoch": 0.875270888514327, - "grad_norm": 7.96875, + "epoch": 5.185449358059914, + "grad_norm": 8.1875, "learning_rate": 7.880444444444445e-05, - "loss": 0.638, + "loss": 0.7966, "step": 7270 }, { - "epoch": 0.8764748374668914, - "grad_norm": 6.28125, + "epoch": 5.192582025677604, + "grad_norm": 8.0, "learning_rate": 7.876e-05, - "loss": 0.6214, + "loss": 0.9086, "step": 7280 }, { - "epoch": 0.8776787864194558, - "grad_norm": 9.125, + "epoch": 5.1997146932952925, + "grad_norm": 7.46875, "learning_rate": 7.871555555555556e-05, - "loss": 0.7473, + "loss": 0.9184, "step": 7290 }, { - "epoch": 0.8788827353720202, - "grad_norm": 7.5, + "epoch": 5.206847360912981, + "grad_norm": 7.28125, "learning_rate": 7.867111111111112e-05, - "loss": 0.68, + "loss": 0.742, "step": 7300 }, { - "epoch": 0.8788827353720202, - "eval/acc": 34.88372039794922, + "epoch": 5.206847360912981, + "eval/acc": 39.53488540649414, "step": 7300 }, { - "epoch": 0.8788827353720202, - "eval_loss": 2.999979257583618, - "eval_runtime": 0.2126, - "eval_samples_per_second": 202.25, - "eval_steps_per_second": 4.703, + "epoch": 5.206847360912981, + "eval_loss": 2.5178542137145996, + "eval_runtime": 0.2274, + "eval_samples_per_second": 189.112, + "eval_steps_per_second": 4.398, "step": 7300 }, { - "epoch": 0.8800866843245846, - "grad_norm": 7.03125, + "epoch": 5.2139800285306706, + "grad_norm": 10.4375, "learning_rate": 7.862666666666667e-05, - "loss": 0.4952, + "loss": 0.8737, "step": 7310 }, { - "epoch": 0.8812906332771491, - "grad_norm": 7.65625, + "epoch": 5.221112696148359, + "grad_norm": 6.8125, "learning_rate": 7.858222222222222e-05, - "loss": 0.7879, + "loss": 0.8197, "step": 7320 }, { - "epoch": 0.8824945822297134, - "grad_norm": 7.71875, + "epoch": 5.228245363766049, + "grad_norm": 8.125, "learning_rate": 7.853777777777778e-05, - "loss": 0.6093, + "loss": 0.9561, "step": 7330 }, { - "epoch": 0.8836985311822779, - "grad_norm": 8.125, + "epoch": 5.235378031383737, + "grad_norm": 9.5, "learning_rate": 7.849333333333334e-05, - "loss": 0.6522, + "loss": 0.9066, "step": 7340 }, { - "epoch": 0.8849024801348423, - "grad_norm": 8.9375, + "epoch": 5.242510699001427, + "grad_norm": 6.09375, "learning_rate": 7.844888888888889e-05, - "loss": 0.6861, + "loss": 0.839, "step": 7350 }, { - "epoch": 0.8861064290874067, - "grad_norm": 6.9375, + "epoch": 5.249643366619115, + "grad_norm": 8.0625, "learning_rate": 7.840444444444445e-05, - "loss": 0.6023, + "loss": 0.8996, "step": 7360 }, { - "epoch": 0.8873103780399711, - "grad_norm": 8.1875, + "epoch": 5.256776034236805, + "grad_norm": 6.3125, "learning_rate": 7.836e-05, - "loss": 0.5156, + "loss": 0.8253, "step": 7370 }, { - "epoch": 0.8885143269925355, - "grad_norm": 7.125, + "epoch": 5.263908701854493, + "grad_norm": 6.15625, "learning_rate": 7.831555555555556e-05, - "loss": 0.6841, + "loss": 0.7275, "step": 7380 }, { - "epoch": 0.8897182759450999, - "grad_norm": 8.0625, + "epoch": 5.271041369472183, + "grad_norm": 6.375, "learning_rate": 7.827111111111111e-05, - "loss": 0.5521, + "loss": 0.8548, "step": 7390 }, { - "epoch": 0.8909222248976644, - "grad_norm": 7.03125, + "epoch": 5.2781740370898715, + "grad_norm": 8.0625, "learning_rate": 7.822666666666667e-05, - "loss": 0.7556, + "loss": 0.8754, "step": 7400 }, { - "epoch": 0.8909222248976644, - "eval/acc": 32.55813980102539, + "epoch": 5.2781740370898715, + "eval/acc": 39.53488540649414, "step": 7400 }, { - "epoch": 0.8909222248976644, - "eval_loss": 2.882596015930176, - "eval_runtime": 0.2414, - "eval_samples_per_second": 178.131, - "eval_steps_per_second": 4.143, + "epoch": 5.2781740370898715, + "eval_loss": 2.599212408065796, + "eval_runtime": 0.2355, + "eval_samples_per_second": 182.56, + "eval_steps_per_second": 4.246, "step": 7400 }, { - "epoch": 0.8921261738502287, - "grad_norm": 6.8125, + "epoch": 5.285306704707561, + "grad_norm": 8.875, "learning_rate": 7.818222222222222e-05, - "loss": 0.6252, + "loss": 0.8725, "step": 7410 }, { - "epoch": 0.8933301228027931, - "grad_norm": 6.5, + "epoch": 5.29243937232525, + "grad_norm": 8.0625, "learning_rate": 7.813777777777777e-05, - "loss": 0.6405, + "loss": 0.8689, "step": 7420 }, { - "epoch": 0.8945340717553576, - "grad_norm": 7.25, + "epoch": 5.299572039942939, + "grad_norm": 7.59375, "learning_rate": 7.809333333333335e-05, - "loss": 0.5753, + "loss": 0.7615, "step": 7430 }, { - "epoch": 0.895738020707922, - "grad_norm": 8.4375, + "epoch": 5.306704707560628, + "grad_norm": 6.3125, "learning_rate": 7.80488888888889e-05, - "loss": 0.5782, + "loss": 0.8141, "step": 7440 }, { - "epoch": 0.8969419696604864, - "grad_norm": 7.875, + "epoch": 5.313837375178316, + "grad_norm": 6.84375, "learning_rate": 7.800444444444444e-05, - "loss": 0.6364, + "loss": 0.8328, "step": 7450 }, { - "epoch": 0.8981459186130508, - "grad_norm": 6.15625, + "epoch": 5.320970042796006, + "grad_norm": 7.71875, "learning_rate": 7.796e-05, - "loss": 0.6243, + "loss": 0.8158, "step": 7460 }, { - "epoch": 0.8993498675656152, - "grad_norm": 7.5, + "epoch": 5.328102710413694, + "grad_norm": 7.0625, "learning_rate": 7.791555555555557e-05, - "loss": 0.6401, + "loss": 0.7663, "step": 7470 }, { - "epoch": 0.9005538165181797, - "grad_norm": 6.03125, + "epoch": 5.335235378031384, + "grad_norm": 8.1875, "learning_rate": 7.787111111111112e-05, - "loss": 0.5183, + "loss": 0.7704, "step": 7480 }, { - "epoch": 0.901757765470744, - "grad_norm": 6.5, + "epoch": 5.3423680456490725, + "grad_norm": 8.0, "learning_rate": 7.782666666666666e-05, - "loss": 0.6057, + "loss": 0.8511, "step": 7490 }, { - "epoch": 0.9029617144233084, - "grad_norm": 9.0, + "epoch": 5.349500713266762, + "grad_norm": 5.15625, "learning_rate": 7.778222222222223e-05, - "loss": 0.6341, + "loss": 0.783, "step": 7500 }, { - "epoch": 0.9029617144233084, - "eval/acc": 34.30232620239258, + "epoch": 5.349500713266762, + "eval/acc": 39.53488540649414, "step": 7500 }, { - "epoch": 0.9029617144233084, - "eval_loss": 2.997713804244995, - "eval_runtime": 1.0811, - "eval_samples_per_second": 39.775, - "eval_steps_per_second": 0.925, + "epoch": 5.349500713266762, + "eval_loss": 2.6000046730041504, + "eval_runtime": 0.2277, + "eval_samples_per_second": 188.876, + "eval_steps_per_second": 4.392, "step": 7500 }, { - "epoch": 0.9041656633758729, - "grad_norm": 7.03125, + "epoch": 5.356633380884451, + "grad_norm": 7.6875, "learning_rate": 7.773777777777778e-05, - "loss": 0.6595, + "loss": 0.7674, "step": 7510 }, { - "epoch": 0.9053696123284373, - "grad_norm": 7.84375, + "epoch": 5.36376604850214, + "grad_norm": 6.53125, "learning_rate": 7.769333333333334e-05, - "loss": 0.7769, + "loss": 0.8338, "step": 7520 }, { - "epoch": 0.9065735612810016, - "grad_norm": 6.78125, + "epoch": 5.370898716119829, + "grad_norm": 5.8125, "learning_rate": 7.76488888888889e-05, - "loss": 0.6876, + "loss": 0.8279, "step": 7530 }, { - "epoch": 0.9077775102335661, - "grad_norm": 9.375, + "epoch": 5.378031383737518, + "grad_norm": 7.0625, "learning_rate": 7.760444444444445e-05, - "loss": 0.6271, + "loss": 0.7954, "step": 7540 }, { - "epoch": 0.9089814591861305, - "grad_norm": 6.96875, + "epoch": 5.385164051355207, + "grad_norm": 8.0, "learning_rate": 7.756e-05, - "loss": 0.6117, + "loss": 0.8632, "step": 7550 }, { - "epoch": 0.910185408138695, - "grad_norm": 6.28125, + "epoch": 5.392296718972895, + "grad_norm": 6.84375, "learning_rate": 7.751555555555556e-05, - "loss": 0.6461, + "loss": 0.8191, "step": 7560 }, { - "epoch": 0.9113893570912593, - "grad_norm": 7.96875, + "epoch": 5.399429386590585, + "grad_norm": 7.375, "learning_rate": 7.747111111111112e-05, - "loss": 0.6543, + "loss": 0.708, "step": 7570 }, { - "epoch": 0.9125933060438237, - "grad_norm": 10.0, + "epoch": 5.4065620542082735, + "grad_norm": 7.15625, "learning_rate": 7.742666666666667e-05, - "loss": 0.686, + "loss": 0.6851, "step": 7580 }, { - "epoch": 0.9137972549963882, - "grad_norm": 7.90625, + "epoch": 5.413694721825963, + "grad_norm": 7.25, "learning_rate": 7.738222222222222e-05, - "loss": 0.6634, + "loss": 0.8769, "step": 7590 }, { - "epoch": 0.9150012039489526, - "grad_norm": 11.5625, + "epoch": 5.420827389443652, + "grad_norm": 7.6875, "learning_rate": 7.733777777777779e-05, - "loss": 0.6627, + "loss": 0.8316, "step": 7600 }, { - "epoch": 0.9150012039489526, - "eval/acc": 37.20930099487305, + "epoch": 5.420827389443652, + "eval/acc": 39.53488540649414, "step": 7600 }, { - "epoch": 0.9150012039489526, - "eval_loss": 2.908363103866577, - "eval_runtime": 2.6366, - "eval_samples_per_second": 16.309, - "eval_steps_per_second": 0.379, + "epoch": 5.420827389443652, + "eval_loss": 2.583944797515869, + "eval_runtime": 0.2212, + "eval_samples_per_second": 194.433, + "eval_steps_per_second": 4.522, "step": 7600 }, { - "epoch": 0.9162051529015169, - "grad_norm": 5.65625, + "epoch": 5.427960057061341, + "grad_norm": 7.625, "learning_rate": 7.729333333333334e-05, - "loss": 0.5503, + "loss": 0.8444, "step": 7610 }, { - "epoch": 0.9174091018540814, - "grad_norm": 7.15625, + "epoch": 5.43509272467903, + "grad_norm": 6.6875, "learning_rate": 7.724888888888889e-05, - "loss": 0.5263, + "loss": 0.8101, "step": 7620 }, { - "epoch": 0.9186130508066458, - "grad_norm": 5.96875, + "epoch": 5.442225392296719, + "grad_norm": 6.375, "learning_rate": 7.720444444444445e-05, - "loss": 0.6969, + "loss": 0.8094, "step": 7630 }, { - "epoch": 0.9198169997592102, - "grad_norm": 8.0625, + "epoch": 5.449358059914408, + "grad_norm": 7.09375, "learning_rate": 7.716e-05, - "loss": 0.6371, + "loss": 0.9292, "step": 7640 }, { - "epoch": 0.9210209487117746, - "grad_norm": 7.5625, + "epoch": 5.456490727532097, + "grad_norm": 8.0, "learning_rate": 7.711555555555556e-05, - "loss": 0.6406, + "loss": 0.8544, "step": 7650 }, { - "epoch": 0.922224897664339, - "grad_norm": 10.6875, + "epoch": 5.463623395149786, + "grad_norm": 5.625, "learning_rate": 7.707111111111111e-05, - "loss": 0.7058, + "loss": 0.787, "step": 7660 }, { - "epoch": 0.9234288466169035, - "grad_norm": 12.5625, + "epoch": 5.470756062767475, + "grad_norm": 8.375, "learning_rate": 7.702666666666667e-05, - "loss": 0.7067, + "loss": 0.8763, "step": 7670 }, { - "epoch": 0.9246327955694679, - "grad_norm": 7.21875, + "epoch": 5.477888730385164, + "grad_norm": 12.9375, "learning_rate": 7.698222222222222e-05, - "loss": 0.5543, + "loss": 0.8317, "step": 7680 }, { - "epoch": 0.9258367445220322, - "grad_norm": 10.125, + "epoch": 5.4850213980028535, + "grad_norm": 8.125, "learning_rate": 7.693777777777778e-05, - "loss": 0.6719, + "loss": 0.8156, "step": 7690 }, { - "epoch": 0.9270406934745967, - "grad_norm": 7.03125, + "epoch": 5.492154065620542, + "grad_norm": 6.96875, "learning_rate": 7.689333333333334e-05, - "loss": 0.5764, + "loss": 0.8998, "step": 7700 }, { - "epoch": 0.9270406934745967, - "eval/acc": 35.46511459350586, + "epoch": 5.492154065620542, + "eval/acc": 39.53488540649414, "step": 7700 }, { - "epoch": 0.9270406934745967, - "eval_loss": 2.8986358642578125, - "eval_runtime": 4.4935, - "eval_samples_per_second": 9.569, - "eval_steps_per_second": 0.223, + "epoch": 5.492154065620542, + "eval_loss": 2.6069791316986084, + "eval_runtime": 0.2244, + "eval_samples_per_second": 191.632, + "eval_steps_per_second": 4.457, "step": 7700 }, { - "epoch": 0.9282446424271611, - "grad_norm": 10.25, + "epoch": 5.499286733238231, + "grad_norm": 7.5625, "learning_rate": 7.68488888888889e-05, - "loss": 0.6302, + "loss": 0.7881, "step": 7710 }, { - "epoch": 0.9294485913797255, - "grad_norm": 9.8125, + "epoch": 5.50641940085592, + "grad_norm": 6.65625, "learning_rate": 7.680444444444444e-05, - "loss": 0.6236, + "loss": 0.8379, "step": 7720 }, { - "epoch": 0.93065254033229, - "grad_norm": 6.5, + "epoch": 5.513552068473609, + "grad_norm": 6.34375, "learning_rate": 7.676e-05, - "loss": 0.7159, + "loss": 0.844, "step": 7730 }, { - "epoch": 0.9318564892848543, - "grad_norm": 7.1875, + "epoch": 5.520684736091298, + "grad_norm": 8.3125, "learning_rate": 7.671555555555557e-05, - "loss": 0.6257, + "loss": 0.8762, "step": 7740 }, { - "epoch": 0.9330604382374187, - "grad_norm": 7.3125, + "epoch": 5.527817403708987, + "grad_norm": 7.09375, "learning_rate": 7.667111111111111e-05, - "loss": 0.5247, + "loss": 0.8621, "step": 7750 }, { - "epoch": 0.9342643871899832, - "grad_norm": 5.0, + "epoch": 5.534950071326676, + "grad_norm": 8.5625, "learning_rate": 7.662666666666666e-05, - "loss": 0.5185, + "loss": 1.0092, "step": 7760 }, { - "epoch": 0.9354683361425475, - "grad_norm": 13.375, + "epoch": 5.542082738944365, + "grad_norm": 6.3125, "learning_rate": 7.658222222222222e-05, - "loss": 0.8069, + "loss": 0.8743, "step": 7770 }, { - "epoch": 0.936672285095112, - "grad_norm": 10.3125, + "epoch": 5.5492154065620545, + "grad_norm": 6.0625, "learning_rate": 7.653777777777779e-05, - "loss": 0.6619, + "loss": 0.754, "step": 7780 }, { - "epoch": 0.9378762340476764, - "grad_norm": 7.1875, + "epoch": 5.556348074179743, + "grad_norm": 7.6875, "learning_rate": 7.649333333333334e-05, - "loss": 0.785, + "loss": 0.8504, "step": 7790 }, { - "epoch": 0.9390801830002408, - "grad_norm": 6.0625, + "epoch": 5.563480741797433, + "grad_norm": 8.3125, "learning_rate": 7.64488888888889e-05, - "loss": 0.6064, + "loss": 0.7512, "step": 7800 }, { - "epoch": 0.9390801830002408, - "eval/acc": 40.69767379760742, + "epoch": 5.563480741797433, + "eval/acc": 37.20930099487305, "step": 7800 }, { - "epoch": 0.9390801830002408, - "eval_loss": 2.9625086784362793, - "eval_runtime": 1.0058, - "eval_samples_per_second": 42.753, - "eval_steps_per_second": 0.994, + "epoch": 5.563480741797433, + "eval_loss": 2.610304594039917, + "eval_runtime": 0.2338, + "eval_samples_per_second": 183.899, + "eval_steps_per_second": 4.277, "step": 7800 }, { - "epoch": 0.9402841319528052, - "grad_norm": 7.0, + "epoch": 5.570613409415121, + "grad_norm": 6.71875, "learning_rate": 7.640444444444445e-05, - "loss": 0.5744, + "loss": 0.8204, "step": 7810 }, { - "epoch": 0.9414880809053696, - "grad_norm": 7.78125, + "epoch": 5.57774607703281, + "grad_norm": 6.625, "learning_rate": 7.636e-05, - "loss": 0.6294, + "loss": 0.734, "step": 7820 }, { - "epoch": 0.942692029857934, - "grad_norm": 6.46875, + "epoch": 5.584878744650499, + "grad_norm": 5.65625, "learning_rate": 7.631555555555556e-05, - "loss": 0.7608, + "loss": 0.8047, "step": 7830 }, { - "epoch": 0.9438959788104985, - "grad_norm": 6.71875, + "epoch": 5.592011412268189, + "grad_norm": 6.40625, "learning_rate": 7.627111111111112e-05, - "loss": 0.6084, + "loss": 0.7179, "step": 7840 }, { - "epoch": 0.9450999277630628, - "grad_norm": 7.15625, + "epoch": 5.599144079885877, + "grad_norm": 6.78125, "learning_rate": 7.622666666666667e-05, - "loss": 0.5791, + "loss": 0.849, "step": 7850 }, { - "epoch": 0.9463038767156272, - "grad_norm": 10.1875, + "epoch": 5.606276747503566, + "grad_norm": 8.8125, "learning_rate": 7.618222222222221e-05, - "loss": 0.683, + "loss": 0.8817, "step": 7860 }, { - "epoch": 0.9475078256681917, - "grad_norm": 7.59375, + "epoch": 5.6134094151212555, + "grad_norm": 6.375, "learning_rate": 7.613777777777779e-05, - "loss": 0.6413, + "loss": 0.8812, "step": 7870 }, { - "epoch": 0.9487117746207561, - "grad_norm": 5.71875, + "epoch": 5.620542082738944, + "grad_norm": 13.125, "learning_rate": 7.609333333333334e-05, - "loss": 0.5985, + "loss": 0.8522, "step": 7880 }, { - "epoch": 0.9499157235733204, - "grad_norm": 8.625, + "epoch": 5.627674750356634, + "grad_norm": 7.0625, "learning_rate": 7.604888888888889e-05, - "loss": 0.572, + "loss": 0.731, "step": 7890 }, { - "epoch": 0.9511196725258849, - "grad_norm": 15.75, + "epoch": 5.634807417974322, + "grad_norm": 7.21875, "learning_rate": 7.600444444444445e-05, - "loss": 0.674, + "loss": 0.8841, "step": 7900 }, { - "epoch": 0.9511196725258849, - "eval/acc": 34.88372039794922, + "epoch": 5.634807417974322, + "eval/acc": 39.53488540649414, "step": 7900 }, { - "epoch": 0.9511196725258849, - "eval_loss": 2.9794013500213623, - "eval_runtime": 0.209, - "eval_samples_per_second": 205.705, - "eval_steps_per_second": 4.784, + "epoch": 5.634807417974322, + "eval_loss": 2.6105217933654785, + "eval_runtime": 0.2306, + "eval_samples_per_second": 186.447, + "eval_steps_per_second": 4.336, "step": 7900 }, { - "epoch": 0.9523236214784493, - "grad_norm": 8.25, + "epoch": 5.641940085592012, + "grad_norm": 7.625, "learning_rate": 7.596000000000001e-05, - "loss": 0.7455, + "loss": 0.8654, "step": 7910 }, { - "epoch": 0.9535275704310138, - "grad_norm": 7.40625, + "epoch": 5.6490727532097, + "grad_norm": 26.75, "learning_rate": 7.591555555555556e-05, - "loss": 0.5746, + "loss": 0.8103, "step": 7920 }, { - "epoch": 0.9547315193835781, - "grad_norm": 7.78125, + "epoch": 5.65620542082739, + "grad_norm": 7.375, "learning_rate": 7.587111111111112e-05, - "loss": 0.6232, + "loss": 0.7461, "step": 7930 }, { - "epoch": 0.9559354683361425, - "grad_norm": 10.9375, + "epoch": 5.663338088445078, + "grad_norm": 6.09375, "learning_rate": 7.582666666666667e-05, - "loss": 0.7393, + "loss": 0.9693, "step": 7940 }, { - "epoch": 0.957139417288707, - "grad_norm": 8.6875, + "epoch": 5.670470756062768, + "grad_norm": 7.09375, "learning_rate": 7.578222222222222e-05, - "loss": 0.6138, + "loss": 0.8595, "step": 7950 }, { - "epoch": 0.9583433662412714, - "grad_norm": 7.625, + "epoch": 5.6776034236804565, + "grad_norm": 7.3125, "learning_rate": 7.573777777777778e-05, - "loss": 0.637, + "loss": 0.8541, "step": 7960 }, { - "epoch": 0.9595473151938357, - "grad_norm": 6.90625, + "epoch": 5.684736091298145, + "grad_norm": 7.90625, "learning_rate": 7.569333333333334e-05, - "loss": 0.606, + "loss": 0.8774, "step": 7970 }, { - "epoch": 0.9607512641464002, - "grad_norm": 8.8125, + "epoch": 5.6918687589158345, + "grad_norm": 9.0, "learning_rate": 7.564888888888889e-05, - "loss": 0.7135, + "loss": 0.8823, "step": 7980 }, { - "epoch": 0.9619552130989646, - "grad_norm": 6.84375, + "epoch": 5.699001426533523, + "grad_norm": 6.09375, "learning_rate": 7.560444444444444e-05, - "loss": 0.6138, + "loss": 0.7302, "step": 7990 }, { - "epoch": 0.963159162051529, - "grad_norm": 8.25, + "epoch": 5.706134094151213, + "grad_norm": 7.21875, "learning_rate": 7.556000000000002e-05, - "loss": 0.7128, + "loss": 0.8339, "step": 8000 }, { - "epoch": 0.963159162051529, - "eval/acc": 34.88372039794922, + "epoch": 5.706134094151213, + "eval/acc": 37.20930099487305, "step": 8000 }, { - "epoch": 0.963159162051529, - "eval_loss": 2.9879119396209717, - "eval_runtime": 0.2124, - "eval_samples_per_second": 202.451, - "eval_steps_per_second": 4.708, + "epoch": 5.706134094151213, + "eval_loss": 2.576781988143921, + "eval_runtime": 0.2231, + "eval_samples_per_second": 192.779, + "eval_steps_per_second": 4.483, "step": 8000 }, { - "epoch": 0.9643631110040934, - "grad_norm": 9.875, + "epoch": 5.713266761768901, + "grad_norm": 7.75, "learning_rate": 7.551555555555556e-05, - "loss": 0.5835, + "loss": 0.7642, "step": 8010 }, { - "epoch": 0.9655670599566578, - "grad_norm": 8.8125, + "epoch": 5.720399429386591, + "grad_norm": 7.8125, "learning_rate": 7.547111111111111e-05, - "loss": 0.6138, + "loss": 0.9188, "step": 8020 }, { - "epoch": 0.9667710089092223, - "grad_norm": 8.3125, + "epoch": 5.727532097004279, + "grad_norm": 7.28125, "learning_rate": 7.542666666666667e-05, - "loss": 0.6638, + "loss": 0.8202, "step": 8030 }, { - "epoch": 0.9679749578617867, - "grad_norm": 7.0, + "epoch": 5.734664764621969, + "grad_norm": 9.0, "learning_rate": 7.538222222222222e-05, - "loss": 0.6484, + "loss": 0.8286, "step": 8040 }, { - "epoch": 0.969178906814351, - "grad_norm": 8.25, + "epoch": 5.741797432239657, + "grad_norm": 7.25, "learning_rate": 7.533777777777778e-05, - "loss": 0.6291, + "loss": 0.7856, "step": 8050 }, { - "epoch": 0.9703828557669155, - "grad_norm": 9.75, + "epoch": 5.748930099857347, + "grad_norm": 6.90625, "learning_rate": 7.529333333333333e-05, - "loss": 0.71, + "loss": 0.8832, "step": 8060 }, { - "epoch": 0.9715868047194799, - "grad_norm": 6.375, + "epoch": 5.7560627674750355, + "grad_norm": 6.09375, "learning_rate": 7.52488888888889e-05, - "loss": 0.5791, + "loss": 0.7606, "step": 8070 }, { - "epoch": 0.9727907536720443, - "grad_norm": 7.40625, + "epoch": 5.763195435092725, + "grad_norm": 6.625, "learning_rate": 7.520444444444444e-05, - "loss": 0.6359, + "loss": 0.8706, "step": 8080 }, { - "epoch": 0.9739947026246087, - "grad_norm": 8.125, + "epoch": 5.770328102710414, + "grad_norm": 7.25, "learning_rate": 7.516e-05, - "loss": 0.5274, + "loss": 0.8542, "step": 8090 }, { - "epoch": 0.9751986515771731, - "grad_norm": 8.6875, + "epoch": 5.777460770328103, + "grad_norm": 6.84375, "learning_rate": 7.511555555555557e-05, - "loss": 0.5887, + "loss": 0.7988, "step": 8100 }, { - "epoch": 0.9751986515771731, + "epoch": 5.777460770328103, "eval/acc": 37.20930099487305, "step": 8100 }, { - "epoch": 0.9751986515771731, - "eval_loss": 3.0165836811065674, - "eval_runtime": 0.2158, - "eval_samples_per_second": 199.215, - "eval_steps_per_second": 4.633, + "epoch": 5.777460770328103, + "eval_loss": 2.598762273788452, + "eval_runtime": 0.2251, + "eval_samples_per_second": 191.04, + "eval_steps_per_second": 4.443, "step": 8100 }, { - "epoch": 0.9764026005297375, - "grad_norm": 9.125, + "epoch": 5.784593437945792, + "grad_norm": 8.875, "learning_rate": 7.507111111111112e-05, - "loss": 0.6646, + "loss": 0.8825, "step": 8110 }, { - "epoch": 0.977606549482302, - "grad_norm": 6.4375, + "epoch": 5.79172610556348, + "grad_norm": 7.375, "learning_rate": 7.502666666666666e-05, - "loss": 0.6757, + "loss": 0.8316, "step": 8120 }, { - "epoch": 0.9788104984348663, - "grad_norm": 6.90625, + "epoch": 5.79885877318117, + "grad_norm": 8.125, "learning_rate": 7.498222222222223e-05, - "loss": 0.6722, + "loss": 0.8567, "step": 8130 }, { - "epoch": 0.9800144473874308, - "grad_norm": 5.34375, + "epoch": 5.805991440798858, + "grad_norm": 6.3125, "learning_rate": 7.493777777777779e-05, - "loss": 0.5574, + "loss": 0.8415, "step": 8140 }, { - "epoch": 0.9812183963399952, - "grad_norm": 12.25, + "epoch": 5.813124108416548, + "grad_norm": 8.5, "learning_rate": 7.489333333333334e-05, - "loss": 0.5701, + "loss": 0.8369, "step": 8150 }, { - "epoch": 0.9824223452925596, - "grad_norm": 5.09375, + "epoch": 5.8202567760342365, + "grad_norm": 13.25, "learning_rate": 7.484888888888889e-05, - "loss": 0.7311, + "loss": 0.8692, "step": 8160 }, { - "epoch": 0.983626294245124, - "grad_norm": 9.6875, + "epoch": 5.827389443651926, + "grad_norm": 7.71875, "learning_rate": 7.480444444444445e-05, - "loss": 0.6314, + "loss": 0.8535, "step": 8170 }, { - "epoch": 0.9848302431976884, - "grad_norm": 7.46875, + "epoch": 5.834522111269615, + "grad_norm": 7.6875, "learning_rate": 7.476000000000001e-05, - "loss": 0.6023, + "loss": 0.8701, "step": 8180 }, { - "epoch": 0.9860341921502528, - "grad_norm": 4.53125, + "epoch": 5.841654778887304, + "grad_norm": 5.46875, "learning_rate": 7.471555555555556e-05, - "loss": 0.5998, + "loss": 0.7843, "step": 8190 }, { - "epoch": 0.9872381411028173, - "grad_norm": 7.3125, + "epoch": 5.848787446504993, + "grad_norm": 7.46875, "learning_rate": 7.467111111111112e-05, - "loss": 0.6607, + "loss": 0.7914, "step": 8200 }, { - "epoch": 0.9872381411028173, - "eval/acc": 34.88372039794922, + "epoch": 5.848787446504993, + "eval/acc": 37.20930099487305, "step": 8200 }, { - "epoch": 0.9872381411028173, - "eval_loss": 2.9665606021881104, - "eval_runtime": 0.2143, - "eval_samples_per_second": 200.607, - "eval_steps_per_second": 4.665, + "epoch": 5.848787446504993, + "eval_loss": 2.566337823867798, + "eval_runtime": 0.3566, + "eval_samples_per_second": 120.59, + "eval_steps_per_second": 2.804, "step": 8200 }, { - "epoch": 0.9884420900553816, - "grad_norm": 7.96875, + "epoch": 5.855920114122682, + "grad_norm": 7.03125, "learning_rate": 7.462666666666667e-05, - "loss": 0.7103, + "loss": 0.849, "step": 8210 }, { - "epoch": 0.989646039007946, - "grad_norm": 10.3125, + "epoch": 5.863052781740371, + "grad_norm": 7.5625, "learning_rate": 7.458222222222223e-05, - "loss": 0.5721, + "loss": 0.8066, "step": 8220 }, { - "epoch": 0.9908499879605105, - "grad_norm": 8.5, + "epoch": 5.870185449358059, + "grad_norm": 6.875, "learning_rate": 7.453777777777778e-05, - "loss": 0.7032, + "loss": 0.8556, "step": 8230 }, { - "epoch": 0.9920539369130749, - "grad_norm": 6.21875, + "epoch": 5.877318116975749, + "grad_norm": 8.0, "learning_rate": 7.449333333333334e-05, - "loss": 0.6547, + "loss": 0.9098, "step": 8240 }, { - "epoch": 0.9932578858656393, - "grad_norm": 7.84375, + "epoch": 5.884450784593438, + "grad_norm": 8.375, "learning_rate": 7.444888888888889e-05, - "loss": 0.6587, + "loss": 0.8183, "step": 8250 }, { - "epoch": 0.9944618348182037, - "grad_norm": 6.53125, + "epoch": 5.891583452211127, + "grad_norm": 13.9375, "learning_rate": 7.440444444444444e-05, - "loss": 0.5486, + "loss": 0.8316, "step": 8260 }, { - "epoch": 0.9956657837707681, - "grad_norm": 8.1875, + "epoch": 5.898716119828816, + "grad_norm": 7.25, "learning_rate": 7.436000000000001e-05, - "loss": 0.6284, + "loss": 0.8563, "step": 8270 }, { - "epoch": 0.9968697327233326, - "grad_norm": 7.59375, + "epoch": 5.905848787446505, + "grad_norm": 10.75, "learning_rate": 7.431555555555556e-05, - "loss": 0.7033, + "loss": 0.8473, "step": 8280 }, { - "epoch": 0.9980736816758969, - "grad_norm": 9.0625, + "epoch": 5.912981455064194, + "grad_norm": 14.1875, "learning_rate": 7.427111111111111e-05, - "loss": 0.6621, + "loss": 0.774, "step": 8290 }, { - "epoch": 0.9992776306284613, - "grad_norm": 8.1875, + "epoch": 5.920114122681883, + "grad_norm": 6.8125, "learning_rate": 7.422666666666667e-05, - "loss": 0.6675, + "loss": 0.8783, "step": 8300 }, { - "epoch": 0.9992776306284613, - "eval/acc": 34.30232620239258, + "epoch": 5.920114122681883, + "eval/acc": 34.88372039794922, "step": 8300 }, { - "epoch": 0.9992776306284613, - "eval_loss": 2.9075520038604736, - "eval_runtime": 0.7142, - "eval_samples_per_second": 60.205, - "eval_steps_per_second": 1.4, + "epoch": 5.920114122681883, + "eval_loss": 2.6135735511779785, + "eval_runtime": 0.2367, + "eval_samples_per_second": 181.665, + "eval_steps_per_second": 4.225, "step": 8300 }, { - "epoch": 1.0004815795810258, - "grad_norm": 6.34375, + "epoch": 5.927246790299572, + "grad_norm": 7.5625, "learning_rate": 7.418222222222223e-05, - "loss": 0.6534, + "loss": 0.9057, "step": 8310 }, { - "epoch": 1.0016855285335902, - "grad_norm": 7.5, + "epoch": 5.934379457917261, + "grad_norm": 7.875, "learning_rate": 7.413777777777778e-05, - "loss": 0.5778, + "loss": 0.8854, "step": 8320 }, { - "epoch": 1.0028894774861545, - "grad_norm": 8.25, + "epoch": 5.94151212553495, + "grad_norm": 8.1875, "learning_rate": 7.409333333333333e-05, - "loss": 0.6143, + "loss": 0.8049, "step": 8330 }, { - "epoch": 1.004093426438719, - "grad_norm": 6.40625, + "epoch": 5.948644793152639, + "grad_norm": 6.90625, "learning_rate": 7.404888888888889e-05, - "loss": 0.5399, + "loss": 0.7738, "step": 8340 }, { - "epoch": 1.0052973753912835, - "grad_norm": 8.6875, + "epoch": 5.955777460770328, + "grad_norm": 7.90625, "learning_rate": 7.400444444444444e-05, - "loss": 0.6422, + "loss": 0.8268, "step": 8350 }, { - "epoch": 1.0065013243438479, - "grad_norm": 6.5625, + "epoch": 5.9629101283880175, + "grad_norm": 8.3125, "learning_rate": 7.396e-05, - "loss": 0.5578, + "loss": 0.8336, "step": 8360 }, { - "epoch": 1.0077052732964122, - "grad_norm": 6.15625, + "epoch": 5.970042796005706, + "grad_norm": 7.375, "learning_rate": 7.391555555555557e-05, - "loss": 0.6529, + "loss": 0.8282, "step": 8370 }, { - "epoch": 1.0089092222489766, - "grad_norm": 8.875, + "epoch": 5.977175463623395, + "grad_norm": 6.8125, "learning_rate": 7.387111111111111e-05, - "loss": 0.7195, + "loss": 0.8234, "step": 8380 }, { - "epoch": 1.010113171201541, - "grad_norm": 14.0, + "epoch": 5.984308131241084, + "grad_norm": 7.15625, "learning_rate": 7.382666666666666e-05, - "loss": 0.6301, + "loss": 0.8771, "step": 8390 }, { - "epoch": 1.0113171201541056, - "grad_norm": 7.46875, + "epoch": 5.991440798858774, + "grad_norm": 8.5, "learning_rate": 7.378222222222222e-05, - "loss": 0.6439, + "loss": 0.8572, "step": 8400 }, { - "epoch": 1.0113171201541056, - "eval/acc": 44.1860466003418, + "epoch": 5.991440798858774, + "eval/acc": 34.88372039794922, "step": 8400 }, { - "epoch": 1.0113171201541056, - "eval_loss": 2.8671419620513916, - "eval_runtime": 7.1371, - "eval_samples_per_second": 6.025, - "eval_steps_per_second": 0.14, + "epoch": 5.991440798858774, + "eval_loss": 2.5367989540100098, + "eval_runtime": 0.224, + "eval_samples_per_second": 191.97, + "eval_steps_per_second": 4.464, "step": 8400 }, { - "epoch": 1.01252106910667, - "grad_norm": 11.5625, + "epoch": 5.998573466476462, + "grad_norm": 7.0, "learning_rate": 7.373777777777779e-05, - "loss": 0.618, + "loss": 0.7468, "step": 8410 }, { - "epoch": 1.0137250180592343, - "grad_norm": 6.1875, + "epoch": 6.005706134094151, + "grad_norm": 7.78125, "learning_rate": 7.369333333333333e-05, - "loss": 0.666, + "loss": 0.7882, "step": 8420 }, { - "epoch": 1.0149289670117987, - "grad_norm": 7.34375, + "epoch": 6.01283880171184, + "grad_norm": 9.1875, "learning_rate": 7.364888888888888e-05, - "loss": 0.6237, + "loss": 0.9419, "step": 8430 }, { - "epoch": 1.016132915964363, - "grad_norm": 7.21875, + "epoch": 6.019971469329529, + "grad_norm": 17.625, "learning_rate": 7.360444444444445e-05, - "loss": 0.5974, + "loss": 0.7904, "step": 8440 }, { - "epoch": 1.0173368649169274, - "grad_norm": 8.625, + "epoch": 6.0271041369472185, + "grad_norm": 8.0625, "learning_rate": 7.356000000000001e-05, - "loss": 0.5766, + "loss": 0.8125, "step": 8450 }, { - "epoch": 1.018540813869492, - "grad_norm": 7.71875, + "epoch": 6.034236804564907, + "grad_norm": 7.4375, "learning_rate": 7.351555555555556e-05, - "loss": 0.6754, + "loss": 0.8002, "step": 8460 }, { - "epoch": 1.0197447628220564, - "grad_norm": 6.8125, + "epoch": 6.041369472182597, + "grad_norm": 5.6875, "learning_rate": 7.347111111111112e-05, - "loss": 0.6515, + "loss": 0.7719, "step": 8470 }, { - "epoch": 1.0209487117746208, - "grad_norm": 7.40625, + "epoch": 6.048502139800285, + "grad_norm": 8.9375, "learning_rate": 7.342666666666667e-05, - "loss": 0.6191, + "loss": 0.8122, "step": 8480 }, { - "epoch": 1.0221526607271851, - "grad_norm": 7.34375, + "epoch": 6.055634807417975, + "grad_norm": 9.875, "learning_rate": 7.338222222222223e-05, - "loss": 0.5703, + "loss": 0.8052, "step": 8490 }, { - "epoch": 1.0233566096797495, - "grad_norm": 8.125, + "epoch": 6.062767475035663, + "grad_norm": 9.125, "learning_rate": 7.333777777777778e-05, - "loss": 0.585, + "loss": 0.8171, "step": 8500 }, { - "epoch": 1.0233566096797495, - "eval/acc": 44.1860466003418, + "epoch": 6.062767475035663, + "eval/acc": 46.511627197265625, "step": 8500 }, { - "epoch": 1.0233566096797495, - "eval_loss": 2.8172407150268555, - "eval_runtime": 0.2115, - "eval_samples_per_second": 203.34, - "eval_steps_per_second": 4.729, + "epoch": 6.062767475035663, + "eval_loss": 2.4180805683135986, + "eval_runtime": 1.182, + "eval_samples_per_second": 36.38, + "eval_steps_per_second": 0.846, "step": 8500 }, { - "epoch": 1.0245605586323139, - "grad_norm": 8.375, + "epoch": 6.069900142653353, + "grad_norm": 6.84375, "learning_rate": 7.329333333333334e-05, - "loss": 0.5365, + "loss": 0.9028, "step": 8510 }, { - "epoch": 1.0257645075848785, - "grad_norm": 8.0, + "epoch": 6.077032810271041, + "grad_norm": 23.625, "learning_rate": 7.324888888888889e-05, - "loss": 0.5976, + "loss": 0.8576, "step": 8520 }, { - "epoch": 1.0269684565374428, - "grad_norm": 6.59375, + "epoch": 6.08416547788873, + "grad_norm": 6.96875, "learning_rate": 7.320444444444445e-05, - "loss": 0.6249, + "loss": 0.8407, "step": 8530 }, { - "epoch": 1.0281724054900072, - "grad_norm": 8.625, + "epoch": 6.0912981455064195, + "grad_norm": 8.6875, "learning_rate": 7.316000000000001e-05, - "loss": 0.5953, + "loss": 0.8419, "step": 8540 }, { - "epoch": 1.0293763544425716, - "grad_norm": 4.875, + "epoch": 6.098430813124108, + "grad_norm": 6.90625, "learning_rate": 7.311555555555556e-05, - "loss": 0.5528, + "loss": 0.7802, "step": 8550 }, { - "epoch": 1.030580303395136, - "grad_norm": 5.28125, + "epoch": 6.1055634807417976, + "grad_norm": 6.34375, "learning_rate": 7.307111111111111e-05, - "loss": 0.5181, + "loss": 0.7716, "step": 8560 }, { - "epoch": 1.0317842523477005, - "grad_norm": 9.9375, + "epoch": 6.112696148359486, + "grad_norm": 13.5, "learning_rate": 7.302666666666667e-05, - "loss": 0.5991, + "loss": 0.8538, "step": 8570 }, { - "epoch": 1.032988201300265, - "grad_norm": 5.78125, + "epoch": 6.119828815977176, + "grad_norm": 6.59375, "learning_rate": 7.298222222222223e-05, - "loss": 0.6822, + "loss": 0.6951, "step": 8580 }, { - "epoch": 1.0341921502528293, - "grad_norm": 7.84375, + "epoch": 6.126961483594864, + "grad_norm": 7.0625, "learning_rate": 7.293777777777778e-05, - "loss": 0.671, + "loss": 0.794, "step": 8590 }, { - "epoch": 1.0353960992053937, - "grad_norm": 8.4375, + "epoch": 6.134094151212554, + "grad_norm": 7.15625, "learning_rate": 7.289333333333334e-05, - "loss": 0.6266, + "loss": 0.8058, "step": 8600 }, { - "epoch": 1.0353960992053937, - "eval/acc": 41.27906799316406, + "epoch": 6.134094151212554, + "eval/acc": 46.511627197265625, "step": 8600 }, { - "epoch": 1.0353960992053937, - "eval_loss": 2.89090895652771, - "eval_runtime": 0.2168, - "eval_samples_per_second": 198.358, - "eval_steps_per_second": 4.613, + "epoch": 6.134094151212554, + "eval_loss": 2.5194764137268066, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.736, + "eval_steps_per_second": 4.529, "step": 8600 }, { - "epoch": 1.036600048157958, - "grad_norm": 6.84375, + "epoch": 6.141226818830242, + "grad_norm": 8.0625, "learning_rate": 7.284888888888889e-05, - "loss": 0.5829, + "loss": 0.8754, "step": 8610 }, { - "epoch": 1.0378039971105224, - "grad_norm": 12.6875, + "epoch": 6.148359486447932, + "grad_norm": 4.875, "learning_rate": 7.280444444444445e-05, - "loss": 0.6336, + "loss": 0.7852, "step": 8620 }, { - "epoch": 1.039007946063087, - "grad_norm": 5.78125, + "epoch": 6.1554921540656204, + "grad_norm": 8.0, "learning_rate": 7.276e-05, - "loss": 0.5621, + "loss": 0.8064, "step": 8630 }, { - "epoch": 1.0402118950156514, - "grad_norm": 6.78125, + "epoch": 6.16262482168331, + "grad_norm": 6.3125, "learning_rate": 7.271555555555556e-05, - "loss": 0.5822, + "loss": 0.7643, "step": 8640 }, { - "epoch": 1.0414158439682157, - "grad_norm": 5.40625, + "epoch": 6.1697574893009985, + "grad_norm": 8.875, "learning_rate": 7.267111111111111e-05, - "loss": 0.6402, + "loss": 0.7702, "step": 8650 }, { - "epoch": 1.04261979292078, - "grad_norm": 5.84375, + "epoch": 6.176890156918688, + "grad_norm": 18.5, "learning_rate": 7.262666666666666e-05, - "loss": 0.5793, + "loss": 0.903, "step": 8660 }, { - "epoch": 1.0438237418733445, - "grad_norm": 9.375, + "epoch": 6.184022824536377, + "grad_norm": 9.875, "learning_rate": 7.258222222222224e-05, - "loss": 0.6447, + "loss": 0.788, "step": 8670 }, { - "epoch": 1.045027690825909, - "grad_norm": 8.4375, + "epoch": 6.191155492154065, + "grad_norm": 7.71875, "learning_rate": 7.253777777777778e-05, - "loss": 0.6428, + "loss": 0.7504, "step": 8680 }, { - "epoch": 1.0462316397784734, - "grad_norm": 8.5, + "epoch": 6.198288159771755, + "grad_norm": 7.5, "learning_rate": 7.249333333333333e-05, - "loss": 0.6219, + "loss": 0.8821, "step": 8690 }, { - "epoch": 1.0474355887310378, - "grad_norm": 8.0625, + "epoch": 6.205420827389443, + "grad_norm": 6.71875, "learning_rate": 7.24488888888889e-05, - "loss": 0.5728, + "loss": 0.9166, "step": 8700 }, { - "epoch": 1.0474355887310378, - "eval/acc": 41.86046600341797, + "epoch": 6.205420827389443, + "eval/acc": 48.83720779418945, "step": 8700 }, { - "epoch": 1.0474355887310378, - "eval_loss": 2.881147861480713, - "eval_runtime": 0.2167, - "eval_samples_per_second": 198.476, - "eval_steps_per_second": 4.616, + "epoch": 6.205420827389443, + "eval_loss": 2.488805055618286, + "eval_runtime": 0.2195, + "eval_samples_per_second": 195.91, + "eval_steps_per_second": 4.556, "step": 8700 }, { - "epoch": 1.0486395376836022, - "grad_norm": 7.09375, + "epoch": 6.212553495007133, + "grad_norm": 8.3125, "learning_rate": 7.240444444444446e-05, - "loss": 0.6532, + "loss": 0.7724, "step": 8710 }, { - "epoch": 1.0498434866361666, - "grad_norm": 7.0625, + "epoch": 6.219686162624821, + "grad_norm": 7.84375, "learning_rate": 7.236e-05, - "loss": 0.5758, + "loss": 0.8881, "step": 8720 }, { - "epoch": 1.051047435588731, - "grad_norm": 8.375, + "epoch": 6.226818830242511, + "grad_norm": 7.21875, "learning_rate": 7.231555555555555e-05, - "loss": 0.6071, + "loss": 0.8538, "step": 8730 }, { - "epoch": 1.0522513845412955, - "grad_norm": 7.34375, + "epoch": 6.2339514978601995, + "grad_norm": 7.5, "learning_rate": 7.227111111111112e-05, - "loss": 0.6905, + "loss": 0.8909, "step": 8740 }, { - "epoch": 1.05345533349386, - "grad_norm": 6.59375, + "epoch": 6.241084165477889, + "grad_norm": 7.25, "learning_rate": 7.222666666666666e-05, - "loss": 0.584, + "loss": 0.7965, "step": 8750 }, { - "epoch": 1.0546592824464243, - "grad_norm": 7.4375, + "epoch": 6.248216833095578, + "grad_norm": 7.46875, "learning_rate": 7.218222222222223e-05, - "loss": 0.6222, + "loss": 0.8547, "step": 8760 }, { - "epoch": 1.0558632313989886, - "grad_norm": 7.1875, + "epoch": 6.255349500713267, + "grad_norm": 6.1875, "learning_rate": 7.213777777777779e-05, - "loss": 0.6167, + "loss": 0.7528, "step": 8770 }, { - "epoch": 1.057067180351553, - "grad_norm": 7.875, + "epoch": 6.262482168330956, + "grad_norm": 7.03125, "learning_rate": 7.209333333333334e-05, - "loss": 0.5766, + "loss": 0.8632, "step": 8780 }, { - "epoch": 1.0582711293041176, - "grad_norm": 7.96875, + "epoch": 6.269614835948644, + "grad_norm": 8.375, "learning_rate": 7.204888888888888e-05, - "loss": 0.5747, + "loss": 0.7832, "step": 8790 }, { - "epoch": 1.059475078256682, - "grad_norm": 7.5, + "epoch": 6.276747503566334, + "grad_norm": 8.125, "learning_rate": 7.200444444444445e-05, - "loss": 0.5361, + "loss": 0.7659, "step": 8800 }, { - "epoch": 1.059475078256682, - "eval/acc": 44.1860466003418, + "epoch": 6.276747503566334, + "eval/acc": 48.83720779418945, "step": 8800 }, { - "epoch": 1.059475078256682, - "eval_loss": 2.9378437995910645, - "eval_runtime": 0.2159, - "eval_samples_per_second": 199.123, - "eval_steps_per_second": 4.631, + "epoch": 6.276747503566334, + "eval_loss": 2.4990618228912354, + "eval_runtime": 0.2586, + "eval_samples_per_second": 166.3, + "eval_steps_per_second": 3.867, "step": 8800 }, { - "epoch": 1.0606790272092463, - "grad_norm": 6.46875, + "epoch": 6.283880171184022, + "grad_norm": 7.375, "learning_rate": 7.196000000000001e-05, - "loss": 0.5386, + "loss": 0.7402, "step": 8810 }, { - "epoch": 1.0618829761618107, - "grad_norm": 5.75, + "epoch": 6.291012838801712, + "grad_norm": 7.0, "learning_rate": 7.191555555555556e-05, - "loss": 0.5701, + "loss": 0.8381, "step": 8820 }, { - "epoch": 1.063086925114375, - "grad_norm": 10.5625, + "epoch": 6.2981455064194005, + "grad_norm": 15.75, "learning_rate": 7.18711111111111e-05, - "loss": 0.6061, + "loss": 0.8837, "step": 8830 }, { - "epoch": 1.0642908740669395, - "grad_norm": 6.75, + "epoch": 6.30527817403709, + "grad_norm": 5.46875, "learning_rate": 7.182666666666668e-05, - "loss": 0.6201, + "loss": 0.8638, "step": 8840 }, { - "epoch": 1.065494823019504, - "grad_norm": 9.625, + "epoch": 6.312410841654779, + "grad_norm": 5.46875, "learning_rate": 7.178222222222223e-05, - "loss": 0.6315, + "loss": 0.8348, "step": 8850 }, { - "epoch": 1.0666987719720684, - "grad_norm": 6.15625, + "epoch": 6.319543509272468, + "grad_norm": 7.9375, "learning_rate": 7.173777777777778e-05, - "loss": 0.6142, + "loss": 0.8598, "step": 8860 }, { - "epoch": 1.0679027209246328, - "grad_norm": 8.875, + "epoch": 6.326676176890157, + "grad_norm": 7.15625, "learning_rate": 7.169333333333334e-05, - "loss": 0.6545, + "loss": 0.8124, "step": 8870 }, { - "epoch": 1.0691066698771972, - "grad_norm": 6.5, + "epoch": 6.333808844507846, + "grad_norm": 6.28125, "learning_rate": 7.164888888888889e-05, - "loss": 0.6305, + "loss": 0.8184, "step": 8880 }, { - "epoch": 1.0703106188297615, - "grad_norm": 12.5, + "epoch": 6.340941512125535, + "grad_norm": 7.25, "learning_rate": 7.160444444444445e-05, - "loss": 0.6451, + "loss": 0.8522, "step": 8890 }, { - "epoch": 1.0715145677823261, - "grad_norm": 6.28125, + "epoch": 6.348074179743224, + "grad_norm": 8.4375, "learning_rate": 7.156e-05, - "loss": 0.5406, + "loss": 0.894, "step": 8900 }, { - "epoch": 1.0715145677823261, + "epoch": 6.348074179743224, "eval/acc": 46.511627197265625, "step": 8900 }, { - "epoch": 1.0715145677823261, - "eval_loss": 2.895603656768799, - "eval_runtime": 0.2115, - "eval_samples_per_second": 203.356, - "eval_steps_per_second": 4.729, + "epoch": 6.348074179743224, + "eval_loss": 2.4536728858947754, + "eval_runtime": 0.2168, + "eval_samples_per_second": 198.347, + "eval_steps_per_second": 4.613, "step": 8900 }, { - "epoch": 1.0727185167348905, - "grad_norm": 5.8125, + "epoch": 6.355206847360913, + "grad_norm": 8.3125, "learning_rate": 7.151555555555556e-05, - "loss": 0.6598, + "loss": 0.8331, "step": 8910 }, { - "epoch": 1.0739224656874549, - "grad_norm": 6.9375, + "epoch": 6.362339514978602, + "grad_norm": 13.1875, "learning_rate": 7.147111111111111e-05, - "loss": 0.5678, + "loss": 0.8107, "step": 8920 }, { - "epoch": 1.0751264146400192, - "grad_norm": 7.25, + "epoch": 6.369472182596291, + "grad_norm": 7.0, "learning_rate": 7.142666666666667e-05, - "loss": 0.6032, + "loss": 0.9504, "step": 8930 }, { - "epoch": 1.0763303635925836, - "grad_norm": 7.09375, + "epoch": 6.37660485021398, + "grad_norm": 9.5625, "learning_rate": 7.138222222222223e-05, - "loss": 0.5432, + "loss": 0.766, "step": 8940 }, { - "epoch": 1.077534312545148, - "grad_norm": 8.6875, + "epoch": 6.383737517831669, + "grad_norm": 13.4375, "learning_rate": 7.133777777777778e-05, - "loss": 0.6408, + "loss": 0.7923, "step": 8950 }, { - "epoch": 1.0787382614977126, - "grad_norm": 8.1875, + "epoch": 6.390870185449358, + "grad_norm": 6.6875, "learning_rate": 7.129333333333333e-05, - "loss": 0.5834, + "loss": 0.7777, "step": 8960 }, { - "epoch": 1.079942210450277, - "grad_norm": 8.75, + "epoch": 6.398002853067047, + "grad_norm": 6.09375, "learning_rate": 7.124888888888889e-05, - "loss": 0.5956, + "loss": 0.7729, "step": 8970 }, { - "epoch": 1.0811461594028413, - "grad_norm": 6.90625, + "epoch": 6.405135520684736, + "grad_norm": 6.46875, "learning_rate": 7.120444444444445e-05, - "loss": 0.6124, + "loss": 0.8118, "step": 8980 }, { - "epoch": 1.0823501083554057, - "grad_norm": 9.5625, + "epoch": 6.412268188302425, + "grad_norm": 6.21875, "learning_rate": 7.116e-05, - "loss": 0.6513, + "loss": 0.9006, "step": 8990 }, { - "epoch": 1.08355405730797, - "grad_norm": 8.0, + "epoch": 6.419400855920114, + "grad_norm": 6.5625, "learning_rate": 7.111555555555555e-05, - "loss": 0.6044, + "loss": 0.7092, "step": 9000 }, { - "epoch": 1.08355405730797, + "epoch": 6.419400855920114, "eval/acc": 44.1860466003418, "step": 9000 }, { - "epoch": 1.08355405730797, - "eval_loss": 2.894747257232666, - "eval_runtime": 0.2236, - "eval_samples_per_second": 192.288, - "eval_steps_per_second": 4.472, + "epoch": 6.419400855920114, + "eval_loss": 2.533996343612671, + "eval_runtime": 0.3418, + "eval_samples_per_second": 125.802, + "eval_steps_per_second": 2.926, "step": 9000 }, { - "epoch": 1.0847580062605346, - "grad_norm": 7.1875, + "epoch": 6.426533523537803, + "grad_norm": 7.59375, "learning_rate": 7.107111111111111e-05, - "loss": 0.4939, + "loss": 0.7684, "step": 9010 }, { - "epoch": 1.085961955213099, - "grad_norm": 8.25, + "epoch": 6.433666191155492, + "grad_norm": 6.8125, "learning_rate": 7.102666666666668e-05, - "loss": 0.7751, + "loss": 0.7654, "step": 9020 }, { - "epoch": 1.0871659041656634, - "grad_norm": 6.875, + "epoch": 6.4407988587731815, + "grad_norm": 7.5625, "learning_rate": 7.098222222222222e-05, - "loss": 0.593, + "loss": 0.8404, "step": 9030 }, { - "epoch": 1.0883698531182278, - "grad_norm": 7.5625, + "epoch": 6.44793152639087, + "grad_norm": 8.5, "learning_rate": 7.093777777777779e-05, - "loss": 0.587, + "loss": 0.8519, "step": 9040 }, { - "epoch": 1.0895738020707921, - "grad_norm": 9.5625, + "epoch": 6.45506419400856, + "grad_norm": 6.53125, "learning_rate": 7.089333333333333e-05, - "loss": 0.639, + "loss": 0.8487, "step": 9050 }, { - "epoch": 1.0907777510233565, - "grad_norm": 8.25, + "epoch": 6.462196861626248, + "grad_norm": 7.59375, "learning_rate": 7.084888888888888e-05, - "loss": 0.6537, + "loss": 0.8695, "step": 9060 }, { - "epoch": 1.091981699975921, - "grad_norm": 9.9375, + "epoch": 6.469329529243938, + "grad_norm": 8.4375, "learning_rate": 7.080444444444444e-05, - "loss": 0.6134, + "loss": 0.7864, "step": 9070 }, { - "epoch": 1.0931856489284855, - "grad_norm": 9.375, + "epoch": 6.476462196861626, + "grad_norm": 66.5, "learning_rate": 7.076000000000001e-05, - "loss": 0.5259, + "loss": 0.7726, "step": 9080 }, { - "epoch": 1.0943895978810498, - "grad_norm": 7.90625, + "epoch": 6.483594864479315, + "grad_norm": 6.96875, "learning_rate": 7.071555555555556e-05, - "loss": 0.7362, + "loss": 0.7832, "step": 9090 }, { - "epoch": 1.0955935468336142, - "grad_norm": 7.46875, + "epoch": 6.490727532097004, + "grad_norm": 7.40625, "learning_rate": 7.06711111111111e-05, - "loss": 0.6197, + "loss": 0.8063, "step": 9100 }, { - "epoch": 1.0955935468336142, - "eval/acc": 41.86046600341797, + "epoch": 6.490727532097004, + "eval/acc": 44.1860466003418, "step": 9100 }, { - "epoch": 1.0955935468336142, - "eval_loss": 2.920775890350342, - "eval_runtime": 0.2089, - "eval_samples_per_second": 205.889, - "eval_steps_per_second": 4.788, + "epoch": 6.490727532097004, + "eval_loss": 2.5438809394836426, + "eval_runtime": 0.2154, + "eval_samples_per_second": 199.594, + "eval_steps_per_second": 4.642, "step": 9100 }, { - "epoch": 1.0967974957861786, - "grad_norm": 9.6875, + "epoch": 6.497860199714693, + "grad_norm": 7.21875, "learning_rate": 7.062666666666668e-05, - "loss": 0.5682, + "loss": 0.7605, "step": 9110 }, { - "epoch": 1.0980014447387432, - "grad_norm": 6.5625, + "epoch": 6.5049928673323825, + "grad_norm": 7.90625, "learning_rate": 7.058222222222223e-05, - "loss": 0.572, + "loss": 0.8032, "step": 9120 }, { - "epoch": 1.0992053936913075, - "grad_norm": 7.5, + "epoch": 6.512125534950071, + "grad_norm": 6.9375, "learning_rate": 7.053777777777778e-05, - "loss": 0.5307, + "loss": 0.743, "step": 9130 }, { - "epoch": 1.100409342643872, - "grad_norm": 7.75, + "epoch": 6.519258202567761, + "grad_norm": 5.65625, "learning_rate": 7.049333333333334e-05, - "loss": 0.5432, + "loss": 0.8261, "step": 9140 }, { - "epoch": 1.1016132915964363, - "grad_norm": 6.84375, + "epoch": 6.526390870185449, + "grad_norm": 7.03125, "learning_rate": 7.04488888888889e-05, - "loss": 0.6012, + "loss": 0.8099, "step": 9150 }, { - "epoch": 1.1028172405490007, - "grad_norm": 6.84375, + "epoch": 6.533523537803139, + "grad_norm": 7.15625, "learning_rate": 7.040444444444445e-05, - "loss": 0.5776, + "loss": 0.817, "step": 9160 }, { - "epoch": 1.104021189501565, - "grad_norm": 8.0625, + "epoch": 6.540656205420827, + "grad_norm": 11.625, "learning_rate": 7.036e-05, - "loss": 0.5353, + "loss": 0.782, "step": 9170 }, { - "epoch": 1.1052251384541296, - "grad_norm": 5.65625, + "epoch": 6.547788873038517, + "grad_norm": 7.5625, "learning_rate": 7.031555555555556e-05, - "loss": 0.5664, + "loss": 0.8145, "step": 9180 }, { - "epoch": 1.106429087406694, - "grad_norm": 14.0, + "epoch": 6.554921540656205, + "grad_norm": 7.5625, "learning_rate": 7.027111111111111e-05, - "loss": 0.6547, + "loss": 0.8822, "step": 9190 }, { - "epoch": 1.1076330363592584, - "grad_norm": 7.9375, + "epoch": 6.562054208273894, + "grad_norm": 6.53125, "learning_rate": 7.022666666666667e-05, - "loss": 0.6063, + "loss": 0.8132, "step": 9200 }, { - "epoch": 1.1076330363592584, - "eval/acc": 41.86046600341797, + "epoch": 6.562054208273894, + "eval/acc": 44.1860466003418, "step": 9200 }, { - "epoch": 1.1076330363592584, - "eval_loss": 2.9192073345184326, - "eval_runtime": 0.2086, - "eval_samples_per_second": 206.15, - "eval_steps_per_second": 4.794, + "epoch": 6.562054208273894, + "eval_loss": 2.528564929962158, + "eval_runtime": 0.2169, + "eval_samples_per_second": 198.28, + "eval_steps_per_second": 4.611, "step": 9200 }, { - "epoch": 1.1088369853118227, - "grad_norm": 11.5625, + "epoch": 6.5691868758915835, + "grad_norm": 7.21875, "learning_rate": 7.018222222222223e-05, - "loss": 0.6251, + "loss": 0.7858, "step": 9210 }, { - "epoch": 1.110040934264387, - "grad_norm": 7.34375, + "epoch": 6.576319543509273, + "grad_norm": 6.4375, "learning_rate": 7.013777777777778e-05, - "loss": 0.5408, + "loss": 0.7098, "step": 9220 }, { - "epoch": 1.1112448832169517, - "grad_norm": 6.75, + "epoch": 6.5834522111269616, + "grad_norm": 7.125, "learning_rate": 7.009333333333333e-05, - "loss": 0.6217, + "loss": 0.8362, "step": 9230 }, { - "epoch": 1.112448832169516, - "grad_norm": 8.375, + "epoch": 6.59058487874465, + "grad_norm": 5.78125, "learning_rate": 7.004888888888889e-05, - "loss": 0.6792, + "loss": 0.7737, "step": 9240 }, { - "epoch": 1.1136527811220804, - "grad_norm": 8.375, + "epoch": 6.59771754636234, + "grad_norm": 9.0625, "learning_rate": 7.000444444444445e-05, - "loss": 0.5786, + "loss": 0.857, "step": 9250 }, { - "epoch": 1.1148567300746448, - "grad_norm": 11.0, + "epoch": 6.604850213980028, + "grad_norm": 9.125, "learning_rate": 6.996e-05, - "loss": 0.6588, + "loss": 0.7562, "step": 9260 }, { - "epoch": 1.1160606790272092, - "grad_norm": 6.75, + "epoch": 6.611982881597718, + "grad_norm": 8.3125, "learning_rate": 6.991555555555556e-05, - "loss": 0.6016, + "loss": 0.8619, "step": 9270 }, { - "epoch": 1.1172646279797736, - "grad_norm": 9.1875, + "epoch": 6.619115549215406, + "grad_norm": 6.78125, "learning_rate": 6.987111111111111e-05, - "loss": 0.5728, + "loss": 0.7212, "step": 9280 }, { - "epoch": 1.1184685769323381, - "grad_norm": 8.0625, + "epoch": 6.626248216833096, + "grad_norm": 26.125, "learning_rate": 6.982666666666667e-05, - "loss": 0.669, + "loss": 0.951, "step": 9290 }, { - "epoch": 1.1196725258849025, - "grad_norm": 9.6875, + "epoch": 6.633380884450784, + "grad_norm": 7.03125, "learning_rate": 6.978222222222222e-05, - "loss": 0.625, + "loss": 0.7791, "step": 9300 }, { - "epoch": 1.1196725258849025, + "epoch": 6.633380884450784, "eval/acc": 44.1860466003418, "step": 9300 }, { - "epoch": 1.1196725258849025, - "eval_loss": 2.8807859420776367, - "eval_runtime": 0.2269, - "eval_samples_per_second": 189.503, - "eval_steps_per_second": 4.407, + "epoch": 6.633380884450784, + "eval_loss": 2.587022304534912, + "eval_runtime": 0.2175, + "eval_samples_per_second": 197.663, + "eval_steps_per_second": 4.597, "step": 9300 }, { - "epoch": 1.1208764748374669, - "grad_norm": 7.09375, + "epoch": 6.640513552068474, + "grad_norm": 6.6875, "learning_rate": 6.973777777777778e-05, - "loss": 0.5112, + "loss": 0.8082, "step": 9310 }, { - "epoch": 1.1220804237900313, - "grad_norm": 19.375, + "epoch": 6.6476462196861625, + "grad_norm": 7.625, "learning_rate": 6.969333333333333e-05, - "loss": 0.7337, + "loss": 0.6863, "step": 9320 }, { - "epoch": 1.1232843727425956, - "grad_norm": 8.25, + "epoch": 6.654778887303852, + "grad_norm": 8.625, "learning_rate": 6.96488888888889e-05, - "loss": 0.6687, + "loss": 0.7921, "step": 9330 }, { - "epoch": 1.1244883216951602, - "grad_norm": 8.125, + "epoch": 6.661911554921541, + "grad_norm": 6.5, "learning_rate": 6.960444444444446e-05, - "loss": 0.5604, + "loss": 0.7762, "step": 9340 }, { - "epoch": 1.1256922706477246, - "grad_norm": 9.1875, + "epoch": 6.669044222539229, + "grad_norm": 12.6875, "learning_rate": 6.956e-05, - "loss": 0.6999, + "loss": 0.7977, "step": 9350 }, { - "epoch": 1.126896219600289, - "grad_norm": 8.5, + "epoch": 6.676176890156919, + "grad_norm": 6.84375, "learning_rate": 6.951555555555555e-05, - "loss": 0.5909, + "loss": 0.907, "step": 9360 }, { - "epoch": 1.1281001685528533, - "grad_norm": 7.21875, + "epoch": 6.683309557774607, + "grad_norm": 7.15625, "learning_rate": 6.947111111111112e-05, - "loss": 0.5857, + "loss": 0.792, "step": 9370 }, { - "epoch": 1.1293041175054177, - "grad_norm": 6.84375, + "epoch": 6.690442225392297, + "grad_norm": 8.5, "learning_rate": 6.942666666666668e-05, - "loss": 0.5965, + "loss": 0.7838, "step": 9380 }, { - "epoch": 1.130508066457982, - "grad_norm": 6.59375, + "epoch": 6.697574893009985, + "grad_norm": 8.1875, "learning_rate": 6.938222222222223e-05, - "loss": 0.6098, + "loss": 0.8141, "step": 9390 }, { - "epoch": 1.1317120154105467, - "grad_norm": 9.3125, + "epoch": 6.704707560627675, + "grad_norm": 7.875, "learning_rate": 6.933777777777777e-05, - "loss": 0.5917, + "loss": 0.8348, "step": 9400 }, { - "epoch": 1.1317120154105467, - "eval/acc": 44.1860466003418, + "epoch": 6.704707560627675, + "eval/acc": 39.53488540649414, "step": 9400 }, { - "epoch": 1.1317120154105467, - "eval_loss": 2.906259536743164, - "eval_runtime": 0.2551, - "eval_samples_per_second": 168.571, - "eval_steps_per_second": 3.92, + "epoch": 6.704707560627675, + "eval_loss": 2.6398463249206543, + "eval_runtime": 0.2181, + "eval_samples_per_second": 197.145, + "eval_steps_per_second": 4.585, "step": 9400 }, { - "epoch": 1.132915964363111, - "grad_norm": 8.3125, + "epoch": 6.7118402282453635, + "grad_norm": 6.625, "learning_rate": 6.929333333333334e-05, - "loss": 0.5629, + "loss": 0.889, "step": 9410 }, { - "epoch": 1.1341199133156754, - "grad_norm": 5.21875, + "epoch": 6.718972895863053, + "grad_norm": 7.3125, "learning_rate": 6.92488888888889e-05, - "loss": 0.4913, + "loss": 0.7913, "step": 9420 }, { - "epoch": 1.1353238622682398, - "grad_norm": 7.5625, + "epoch": 6.726105563480742, + "grad_norm": 10.875, "learning_rate": 6.920444444444445e-05, - "loss": 0.5868, + "loss": 0.8099, "step": 9430 }, { - "epoch": 1.1365278112208042, - "grad_norm": 8.8125, + "epoch": 6.733238231098431, + "grad_norm": 23.75, "learning_rate": 6.916000000000001e-05, - "loss": 0.6205, + "loss": 0.7098, "step": 9440 }, { - "epoch": 1.1377317601733687, - "grad_norm": 6.78125, + "epoch": 6.74037089871612, + "grad_norm": 6.625, "learning_rate": 6.911555555555556e-05, - "loss": 0.6569, + "loss": 0.7859, "step": 9450 }, { - "epoch": 1.1389357091259331, - "grad_norm": 7.9375, + "epoch": 6.747503566333809, + "grad_norm": 5.875, "learning_rate": 6.907111111111112e-05, - "loss": 0.5849, + "loss": 0.7947, "step": 9460 }, { - "epoch": 1.1401396580784975, - "grad_norm": 8.6875, + "epoch": 6.754636233951498, + "grad_norm": 7.25, "learning_rate": 6.902666666666667e-05, - "loss": 0.5997, + "loss": 0.927, "step": 9470 }, { - "epoch": 1.1413436070310619, - "grad_norm": 12.75, + "epoch": 6.761768901569187, + "grad_norm": 12.875, "learning_rate": 6.898222222222223e-05, - "loss": 0.6568, + "loss": 0.8474, "step": 9480 }, { - "epoch": 1.1425475559836262, - "grad_norm": 7.6875, + "epoch": 6.768901569186876, + "grad_norm": 6.8125, "learning_rate": 6.893777777777778e-05, - "loss": 0.6542, + "loss": 0.848, "step": 9490 }, { - "epoch": 1.1437515049361906, - "grad_norm": 6.59375, + "epoch": 6.7760342368045645, + "grad_norm": 7.96875, "learning_rate": 6.889333333333333e-05, - "loss": 0.4745, + "loss": 0.8081, "step": 9500 }, { - "epoch": 1.1437515049361906, - "eval/acc": 48.83720779418945, + "epoch": 6.7760342368045645, + "eval/acc": 41.86046600341797, "step": 9500 }, { - "epoch": 1.1437515049361906, - "eval_loss": 2.8177154064178467, - "eval_runtime": 0.2171, - "eval_samples_per_second": 198.092, - "eval_steps_per_second": 4.607, + "epoch": 6.7760342368045645, + "eval_loss": 2.6681759357452393, + "eval_runtime": 0.2247, + "eval_samples_per_second": 191.372, + "eval_steps_per_second": 4.451, "step": 9500 }, { - "epoch": 1.1449554538887552, - "grad_norm": 6.53125, + "epoch": 6.783166904422254, + "grad_norm": 9.0625, "learning_rate": 6.88488888888889e-05, - "loss": 0.664, + "loss": 0.8836, "step": 9510 }, { - "epoch": 1.1461594028413196, - "grad_norm": 7.6875, + "epoch": 6.790299572039943, + "grad_norm": 15.875, "learning_rate": 6.880444444444445e-05, - "loss": 0.5778, + "loss": 0.8696, "step": 9520 }, { - "epoch": 1.147363351793884, - "grad_norm": 6.84375, + "epoch": 6.797432239657632, + "grad_norm": 10.75, "learning_rate": 6.876e-05, - "loss": 0.6596, + "loss": 0.844, "step": 9530 }, { - "epoch": 1.1485673007464483, - "grad_norm": 8.75, + "epoch": 6.804564907275321, + "grad_norm": 23.875, "learning_rate": 6.871555555555556e-05, - "loss": 0.6422, + "loss": 0.823, "step": 9540 }, { - "epoch": 1.1497712496990127, - "grad_norm": 6.5625, + "epoch": 6.81169757489301, + "grad_norm": 7.75, "learning_rate": 6.867111111111112e-05, - "loss": 0.5794, + "loss": 0.8875, "step": 9550 }, { - "epoch": 1.1509751986515773, - "grad_norm": 8.625, + "epoch": 6.818830242510699, + "grad_norm": 6.46875, "learning_rate": 6.862666666666667e-05, - "loss": 0.6171, + "loss": 0.7703, "step": 9560 }, { - "epoch": 1.1521791476041416, - "grad_norm": 6.875, + "epoch": 6.825962910128388, + "grad_norm": 6.375, "learning_rate": 6.858222222222222e-05, - "loss": 0.58, + "loss": 0.8, "step": 9570 }, { - "epoch": 1.153383096556706, - "grad_norm": 14.375, + "epoch": 6.833095577746077, + "grad_norm": 7.96875, "learning_rate": 6.853777777777778e-05, - "loss": 0.6651, + "loss": 0.8139, "step": 9580 }, { - "epoch": 1.1545870455092704, - "grad_norm": 6.65625, + "epoch": 6.840228245363766, + "grad_norm": 11.625, "learning_rate": 6.849333333333333e-05, - "loss": 0.621, + "loss": 0.8042, "step": 9590 }, { - "epoch": 1.1557909944618348, - "grad_norm": 8.25, + "epoch": 6.847360912981455, + "grad_norm": 10.8125, "learning_rate": 6.844888888888889e-05, - "loss": 0.6578, + "loss": 0.8403, "step": 9600 }, { - "epoch": 1.1557909944618348, + "epoch": 6.847360912981455, "eval/acc": 44.1860466003418, "step": 9600 }, { - "epoch": 1.1557909944618348, - "eval_loss": 2.841442108154297, - "eval_runtime": 0.2144, - "eval_samples_per_second": 200.545, - "eval_steps_per_second": 4.664, + "epoch": 6.847360912981455, + "eval_loss": 2.6575427055358887, + "eval_runtime": 0.2186, + "eval_samples_per_second": 196.745, + "eval_steps_per_second": 4.575, "step": 9600 }, { - "epoch": 1.1569949434143991, - "grad_norm": 6.09375, + "epoch": 6.854493580599144, + "grad_norm": 14.6875, "learning_rate": 6.840444444444445e-05, - "loss": 0.5215, + "loss": 0.8426, "step": 9610 }, { - "epoch": 1.1581988923669637, - "grad_norm": 9.1875, + "epoch": 6.861626248216833, + "grad_norm": 7.84375, "learning_rate": 6.836e-05, - "loss": 0.6458, + "loss": 0.8874, "step": 9620 }, { - "epoch": 1.159402841319528, - "grad_norm": 8.8125, + "epoch": 6.868758915834523, + "grad_norm": 8.9375, "learning_rate": 6.831555555555555e-05, - "loss": 0.6037, + "loss": 0.78, "step": 9630 }, { - "epoch": 1.1606067902720925, - "grad_norm": 7.0, + "epoch": 6.875891583452211, + "grad_norm": 6.1875, "learning_rate": 6.827111111111111e-05, - "loss": 0.5408, + "loss": 0.7788, "step": 9640 }, { - "epoch": 1.1618107392246568, - "grad_norm": 5.78125, + "epoch": 6.8830242510699, + "grad_norm": 6.34375, "learning_rate": 6.822666666666668e-05, - "loss": 0.5832, + "loss": 0.7385, "step": 9650 }, { - "epoch": 1.1630146881772212, - "grad_norm": 6.84375, + "epoch": 6.890156918687589, + "grad_norm": 7.59375, "learning_rate": 6.818222222222222e-05, - "loss": 0.5802, + "loss": 0.8938, "step": 9660 }, { - "epoch": 1.1642186371297858, - "grad_norm": 5.75, + "epoch": 6.897289586305278, + "grad_norm": 10.8125, "learning_rate": 6.813777777777777e-05, - "loss": 0.5377, + "loss": 0.8154, "step": 9670 }, { - "epoch": 1.1654225860823502, - "grad_norm": 7.5625, + "epoch": 6.904422253922967, + "grad_norm": 6.90625, "learning_rate": 6.809333333333333e-05, - "loss": 0.5657, + "loss": 0.9273, "step": 9680 }, { - "epoch": 1.1666265350349145, - "grad_norm": 6.15625, + "epoch": 6.911554921540656, + "grad_norm": 8.3125, "learning_rate": 6.80488888888889e-05, - "loss": 0.5107, + "loss": 0.8595, "step": 9690 }, { - "epoch": 1.167830483987479, - "grad_norm": 5.28125, + "epoch": 6.9186875891583455, + "grad_norm": 10.75, "learning_rate": 6.800444444444444e-05, - "loss": 0.5898, + "loss": 0.8569, "step": 9700 }, { - "epoch": 1.167830483987479, - "eval/acc": 46.511627197265625, + "epoch": 6.9186875891583455, + "eval/acc": 39.53488540649414, "step": 9700 }, { - "epoch": 1.167830483987479, - "eval_loss": 2.8665220737457275, - "eval_runtime": 0.211, - "eval_samples_per_second": 203.771, - "eval_steps_per_second": 4.739, + "epoch": 6.9186875891583455, + "eval_loss": 2.6524884700775146, + "eval_runtime": 0.215, + "eval_samples_per_second": 200.025, + "eval_steps_per_second": 4.652, "step": 9700 }, { - "epoch": 1.1690344329400433, - "grad_norm": 7.0, + "epoch": 6.925820256776034, + "grad_norm": 7.90625, "learning_rate": 6.796e-05, - "loss": 0.6016, + "loss": 0.7726, "step": 9710 }, { - "epoch": 1.1702383818926076, - "grad_norm": 7.0, + "epoch": 6.932952924393724, + "grad_norm": 7.71875, "learning_rate": 6.791555555555556e-05, - "loss": 0.6048, + "loss": 0.789, "step": 9720 }, { - "epoch": 1.1714423308451722, - "grad_norm": 7.21875, + "epoch": 6.940085592011412, + "grad_norm": 7.4375, "learning_rate": 6.787111111111112e-05, - "loss": 0.5315, + "loss": 0.7525, "step": 9730 }, { - "epoch": 1.1726462797977366, - "grad_norm": 6.53125, + "epoch": 6.947218259629102, + "grad_norm": 6.96875, "learning_rate": 6.782666666666667e-05, - "loss": 0.5033, + "loss": 0.8183, "step": 9740 }, { - "epoch": 1.173850228750301, - "grad_norm": 6.34375, + "epoch": 6.95435092724679, + "grad_norm": 6.5625, "learning_rate": 6.778222222222223e-05, - "loss": 0.5615, + "loss": 0.8713, "step": 9750 }, { - "epoch": 1.1750541777028654, - "grad_norm": 6.34375, + "epoch": 6.961483594864479, + "grad_norm": 6.59375, "learning_rate": 6.773777777777778e-05, - "loss": 0.5494, + "loss": 0.8089, "step": 9760 }, { - "epoch": 1.1762581266554297, - "grad_norm": 7.3125, + "epoch": 6.968616262482168, + "grad_norm": 7.46875, "learning_rate": 6.769333333333334e-05, - "loss": 0.6047, + "loss": 0.8173, "step": 9770 }, { - "epoch": 1.1774620756079943, - "grad_norm": 6.53125, + "epoch": 6.975748930099857, + "grad_norm": 8.75, "learning_rate": 6.76488888888889e-05, - "loss": 0.6653, + "loss": 0.8359, "step": 9780 }, { - "epoch": 1.1786660245605587, - "grad_norm": 21.75, + "epoch": 6.9828815977175465, + "grad_norm": 6.96875, "learning_rate": 6.760444444444445e-05, - "loss": 0.5944, + "loss": 0.7308, "step": 9790 }, { - "epoch": 1.179869973513123, - "grad_norm": 17.25, + "epoch": 6.990014265335235, + "grad_norm": 8.6875, "learning_rate": 6.756e-05, - "loss": 0.6511, + "loss": 0.7651, "step": 9800 }, { - "epoch": 1.179869973513123, - "eval/acc": 46.511627197265625, + "epoch": 6.990014265335235, + "eval/acc": 44.1860466003418, "step": 9800 }, { - "epoch": 1.179869973513123, - "eval_loss": 2.8695812225341797, - "eval_runtime": 0.2142, - "eval_samples_per_second": 200.781, - "eval_steps_per_second": 4.669, + "epoch": 6.990014265335235, + "eval_loss": 2.581909418106079, + "eval_runtime": 0.217, + "eval_samples_per_second": 198.162, + "eval_steps_per_second": 4.608, "step": 9800 }, { - "epoch": 1.1810739224656874, - "grad_norm": 5.3125, + "epoch": 6.997146932952925, + "grad_norm": 7.6875, "learning_rate": 6.751555555555556e-05, - "loss": 0.6008, + "loss": 0.8653, "step": 9810 }, { - "epoch": 1.1822778714182518, - "grad_norm": 9.4375, + "epoch": 7.004279600570613, + "grad_norm": 8.5, "learning_rate": 6.747111111111112e-05, - "loss": 0.5898, + "loss": 0.8445, "step": 9820 }, { - "epoch": 1.1834818203708162, - "grad_norm": 6.6875, + "epoch": 7.011412268188303, + "grad_norm": 6.375, "learning_rate": 6.742666666666667e-05, - "loss": 0.5976, + "loss": 0.7759, "step": 9830 }, { - "epoch": 1.1846857693233808, - "grad_norm": 7.875, + "epoch": 7.018544935805991, + "grad_norm": 6.375, "learning_rate": 6.738222222222222e-05, - "loss": 0.5604, + "loss": 0.7709, "step": 9840 }, { - "epoch": 1.1858897182759451, - "grad_norm": 6.0625, + "epoch": 7.025677603423681, + "grad_norm": 7.8125, "learning_rate": 6.733777777777778e-05, - "loss": 0.736, + "loss": 0.768, "step": 9850 }, { - "epoch": 1.1870936672285095, - "grad_norm": 8.125, + "epoch": 7.032810271041369, + "grad_norm": 8.4375, "learning_rate": 6.729333333333334e-05, - "loss": 0.5235, + "loss": 0.8725, "step": 9860 }, { - "epoch": 1.1882976161810739, - "grad_norm": 6.46875, + "epoch": 7.039942938659059, + "grad_norm": 7.8125, "learning_rate": 6.724888888888889e-05, - "loss": 0.5716, + "loss": 0.8146, "step": 9870 }, { - "epoch": 1.1895015651336383, - "grad_norm": 6.21875, + "epoch": 7.0470756062767475, + "grad_norm": 70.0, "learning_rate": 6.720444444444445e-05, - "loss": 0.5337, + "loss": 0.8137, "step": 9880 }, { - "epoch": 1.1907055140862028, - "grad_norm": 7.28125, + "epoch": 7.054208273894437, + "grad_norm": 7.03125, "learning_rate": 6.716e-05, - "loss": 0.5203, + "loss": 0.8025, "step": 9890 }, { - "epoch": 1.1919094630387672, - "grad_norm": 8.1875, + "epoch": 7.0613409415121255, + "grad_norm": 7.15625, "learning_rate": 6.711555555555555e-05, - "loss": 0.5532, + "loss": 0.8237, "step": 9900 }, { - "epoch": 1.1919094630387672, - "eval/acc": 46.511627197265625, + "epoch": 7.0613409415121255, + "eval/acc": 62.79069900512695, "step": 9900 }, { - "epoch": 1.1919094630387672, - "eval_loss": 2.864424705505371, - "eval_runtime": 0.2199, - "eval_samples_per_second": 195.51, - "eval_steps_per_second": 4.547, + "epoch": 7.0613409415121255, + "eval_loss": 2.023484706878662, + "eval_runtime": 1.3641, + "eval_samples_per_second": 31.524, + "eval_steps_per_second": 0.733, "step": 9900 }, { - "epoch": 1.1931134119913316, - "grad_norm": 8.5625, + "epoch": 7.068473609129814, + "grad_norm": 10.375, "learning_rate": 6.707111111111111e-05, - "loss": 0.585, + "loss": 0.7141, "step": 9910 }, { - "epoch": 1.194317360943896, - "grad_norm": 8.25, + "epoch": 7.075606276747504, + "grad_norm": 9.25, "learning_rate": 6.702666666666667e-05, - "loss": 0.6533, + "loss": 0.7963, "step": 9920 }, { - "epoch": 1.1955213098964603, - "grad_norm": 8.8125, + "epoch": 7.082738944365192, + "grad_norm": 7.375, "learning_rate": 6.698222222222222e-05, - "loss": 0.5962, + "loss": 0.7935, "step": 9930 }, { - "epoch": 1.1967252588490247, - "grad_norm": 13.0625, + "epoch": 7.089871611982882, + "grad_norm": 6.8125, "learning_rate": 6.693777777777778e-05, - "loss": 0.6169, + "loss": 0.7882, "step": 9940 }, { - "epoch": 1.1979292078015893, - "grad_norm": 7.5625, + "epoch": 7.09700427960057, + "grad_norm": 7.0625, "learning_rate": 6.689333333333335e-05, - "loss": 0.5756, + "loss": 0.7698, "step": 9950 }, { - "epoch": 1.1991331567541537, - "grad_norm": 6.03125, + "epoch": 7.10413694721826, + "grad_norm": 6.9375, "learning_rate": 6.68488888888889e-05, - "loss": 0.5746, + "loss": 0.8595, "step": 9960 }, { - "epoch": 1.200337105706718, - "grad_norm": 4.875, + "epoch": 7.111269614835948, + "grad_norm": 9.5, "learning_rate": 6.680444444444444e-05, - "loss": 0.6586, + "loss": 0.8158, "step": 9970 }, { - "epoch": 1.2015410546592824, - "grad_norm": 7.375, + "epoch": 7.118402282453638, + "grad_norm": 8.375, "learning_rate": 6.676e-05, - "loss": 0.6928, + "loss": 0.7916, "step": 9980 }, { - "epoch": 1.2027450036118468, - "grad_norm": 8.875, + "epoch": 7.1255349500713265, + "grad_norm": 6.3125, "learning_rate": 6.671555555555555e-05, - "loss": 0.6166, + "loss": 0.7455, "step": 9990 }, { - "epoch": 1.2039489525644114, - "grad_norm": 7.96875, + "epoch": 7.132667617689016, + "grad_norm": 7.375, "learning_rate": 6.667111111111112e-05, - "loss": 0.6778, + "loss": 0.7398, "step": 10000 }, { - "epoch": 1.2039489525644114, - "eval/acc": 46.511627197265625, + "epoch": 7.132667617689016, + "eval/acc": 65.11627960205078, "step": 10000 }, { - "epoch": 1.2039489525644114, - "eval_loss": 2.8400421142578125, - "eval_runtime": 0.2085, - "eval_samples_per_second": 206.266, - "eval_steps_per_second": 4.797, + "epoch": 7.132667617689016, + "eval_loss": 2.0408403873443604, + "eval_runtime": 0.2184, + "eval_samples_per_second": 196.923, + "eval_steps_per_second": 4.58, "step": 10000 }, { - "epoch": 1.2051529015169757, - "grad_norm": 6.21875, + "epoch": 7.139800285306705, + "grad_norm": 8.375, "learning_rate": 6.662666666666668e-05, - "loss": 0.5977, + "loss": 0.8887, "step": 10010 }, { - "epoch": 1.20635685046954, - "grad_norm": 7.03125, + "epoch": 7.146932952924394, + "grad_norm": 8.5, "learning_rate": 6.658222222222223e-05, - "loss": 0.5471, + "loss": 0.8945, "step": 10020 }, { - "epoch": 1.2075607994221045, - "grad_norm": 7.3125, + "epoch": 7.154065620542083, + "grad_norm": 22.5, "learning_rate": 6.653777777777777e-05, - "loss": 0.587, + "loss": 0.7934, "step": 10030 }, { - "epoch": 1.2087647483746689, - "grad_norm": 7.28125, + "epoch": 7.161198288159771, + "grad_norm": 7.34375, "learning_rate": 6.649333333333334e-05, - "loss": 0.5015, + "loss": 0.8056, "step": 10040 }, { - "epoch": 1.2099686973272332, - "grad_norm": 8.3125, + "epoch": 7.168330955777461, + "grad_norm": 7.59375, "learning_rate": 6.64488888888889e-05, - "loss": 0.5784, + "loss": 0.7893, "step": 10050 }, { - "epoch": 1.2111726462797978, - "grad_norm": 6.46875, + "epoch": 7.175463623395149, + "grad_norm": 8.5, "learning_rate": 6.640444444444445e-05, - "loss": 0.5528, + "loss": 1.0099, "step": 10060 }, { - "epoch": 1.2123765952323622, - "grad_norm": 4.8125, + "epoch": 7.182596291012839, + "grad_norm": 8.0625, "learning_rate": 6.636e-05, - "loss": 0.6008, + "loss": 0.8701, "step": 10070 }, { - "epoch": 1.2135805441849266, - "grad_norm": 7.46875, + "epoch": 7.1897289586305275, + "grad_norm": 9.25, "learning_rate": 6.631555555555557e-05, - "loss": 0.5804, + "loss": 0.8203, "step": 10080 }, { - "epoch": 1.214784493137491, - "grad_norm": 8.375, + "epoch": 7.196861626248217, + "grad_norm": 7.90625, "learning_rate": 6.627111111111112e-05, - "loss": 0.5645, + "loss": 0.8197, "step": 10090 }, { - "epoch": 1.2159884420900553, - "grad_norm": 12.0, + "epoch": 7.203994293865906, + "grad_norm": 6.03125, "learning_rate": 6.622666666666667e-05, - "loss": 0.5773, + "loss": 0.8087, "step": 10100 }, { - "epoch": 1.2159884420900553, - "eval/acc": 44.1860466003418, + "epoch": 7.203994293865906, + "eval/acc": 60.46511459350586, "step": 10100 }, { - "epoch": 1.2159884420900553, - "eval_loss": 2.8810744285583496, - "eval_runtime": 0.2091, - "eval_samples_per_second": 205.671, - "eval_steps_per_second": 4.783, + "epoch": 7.203994293865906, + "eval_loss": 1.940862774848938, + "eval_runtime": 0.2277, + "eval_samples_per_second": 188.814, + "eval_steps_per_second": 4.391, "step": 10100 }, { - "epoch": 1.21719239104262, - "grad_norm": 9.625, + "epoch": 7.211126961483595, + "grad_norm": 8.1875, "learning_rate": 6.618222222222223e-05, - "loss": 0.6729, + "loss": 0.7932, "step": 10110 }, { - "epoch": 1.2183963399951843, - "grad_norm": 7.15625, + "epoch": 7.218259629101284, + "grad_norm": 7.4375, "learning_rate": 6.613777777777778e-05, - "loss": 0.613, + "loss": 0.7562, "step": 10120 }, { - "epoch": 1.2196002889477486, - "grad_norm": 5.34375, + "epoch": 7.225392296718973, + "grad_norm": 8.4375, "learning_rate": 6.609333333333334e-05, - "loss": 0.5637, + "loss": 0.8474, "step": 10130 }, { - "epoch": 1.220804237900313, - "grad_norm": 8.1875, + "epoch": 7.232524964336662, + "grad_norm": 8.0, "learning_rate": 6.604888888888889e-05, - "loss": 0.6426, + "loss": 0.8197, "step": 10140 }, { - "epoch": 1.2220081868528774, - "grad_norm": 7.34375, + "epoch": 7.239657631954351, + "grad_norm": 8.0625, "learning_rate": 6.600444444444445e-05, - "loss": 0.5698, + "loss": 0.7804, "step": 10150 }, { - "epoch": 1.2232121358054417, - "grad_norm": 9.25, + "epoch": 7.24679029957204, + "grad_norm": 21.25, "learning_rate": 6.596e-05, - "loss": 0.6375, + "loss": 0.8914, "step": 10160 }, { - "epoch": 1.2244160847580063, - "grad_norm": 6.25, + "epoch": 7.2539229671897285, + "grad_norm": 7.125, "learning_rate": 6.591555555555556e-05, - "loss": 0.5693, + "loss": 0.8185, "step": 10170 }, { - "epoch": 1.2256200337105707, - "grad_norm": 6.4375, + "epoch": 7.261055634807418, + "grad_norm": 6.6875, "learning_rate": 6.587111111111112e-05, - "loss": 0.5378, + "loss": 0.7911, "step": 10180 }, { - "epoch": 1.226823982663135, - "grad_norm": 8.375, + "epoch": 7.268188302425107, + "grad_norm": 7.21875, "learning_rate": 6.582666666666667e-05, - "loss": 0.7013, + "loss": 0.8004, "step": 10190 }, { - "epoch": 1.2280279316156995, - "grad_norm": 5.8125, + "epoch": 7.275320970042796, + "grad_norm": 7.0, "learning_rate": 6.578222222222222e-05, - "loss": 0.6519, + "loss": 0.7226, "step": 10200 }, { - "epoch": 1.2280279316156995, - "eval/acc": 46.511627197265625, + "epoch": 7.275320970042796, + "eval/acc": 60.46511459350586, "step": 10200 }, { - "epoch": 1.2280279316156995, - "eval_loss": 2.8267436027526855, - "eval_runtime": 0.2057, - "eval_samples_per_second": 209.0, - "eval_steps_per_second": 4.86, + "epoch": 7.275320970042796, + "eval_loss": 2.004242420196533, + "eval_runtime": 0.2197, + "eval_samples_per_second": 195.738, + "eval_steps_per_second": 4.552, "step": 10200 }, { - "epoch": 1.2292318805682638, - "grad_norm": 7.1875, + "epoch": 7.282453637660485, + "grad_norm": 16.25, "learning_rate": 6.573777777777778e-05, - "loss": 0.5266, + "loss": 0.8735, "step": 10210 }, { - "epoch": 1.2304358295208284, - "grad_norm": 6.875, + "epoch": 7.289586305278174, + "grad_norm": 6.8125, "learning_rate": 6.569333333333334e-05, - "loss": 0.5686, + "loss": 0.8356, "step": 10220 }, { - "epoch": 1.2316397784733928, - "grad_norm": 8.0, + "epoch": 7.296718972895863, + "grad_norm": 5.65625, "learning_rate": 6.564888888888889e-05, - "loss": 0.6414, + "loss": 0.8032, "step": 10230 }, { - "epoch": 1.2328437274259572, - "grad_norm": 6.8125, + "epoch": 7.303851640513552, + "grad_norm": 6.125, "learning_rate": 6.560444444444444e-05, - "loss": 0.6118, + "loss": 0.7803, "step": 10240 }, { - "epoch": 1.2340476763785215, - "grad_norm": 8.625, + "epoch": 7.310984308131241, + "grad_norm": 9.375, "learning_rate": 6.556e-05, - "loss": 0.5839, + "loss": 0.8748, "step": 10250 }, { - "epoch": 1.235251625331086, - "grad_norm": 7.34375, + "epoch": 7.31811697574893, + "grad_norm": 6.625, "learning_rate": 6.551555555555556e-05, - "loss": 0.6561, + "loss": 0.7793, "step": 10260 }, { - "epoch": 1.2364555742836503, - "grad_norm": 11.5625, + "epoch": 7.325249643366619, + "grad_norm": 13.625, "learning_rate": 6.547111111111111e-05, - "loss": 0.6036, + "loss": 0.8052, "step": 10270 }, { - "epoch": 1.2376595232362149, - "grad_norm": 7.875, + "epoch": 7.3323823109843085, + "grad_norm": 8.6875, "learning_rate": 6.542666666666667e-05, - "loss": 0.5566, + "loss": 0.8387, "step": 10280 }, { - "epoch": 1.2388634721887792, - "grad_norm": 7.59375, + "epoch": 7.339514978601997, + "grad_norm": 6.84375, "learning_rate": 6.538222222222222e-05, - "loss": 0.5778, + "loss": 0.8713, "step": 10290 }, { - "epoch": 1.2400674211413436, - "grad_norm": 7.25, + "epoch": 7.346647646219687, + "grad_norm": 9.875, "learning_rate": 6.533777777777777e-05, - "loss": 0.616, + "loss": 0.7266, "step": 10300 }, { - "epoch": 1.2400674211413436, - "eval/acc": 45.930233001708984, + "epoch": 7.346647646219687, + "eval/acc": 62.79069900512695, "step": 10300 }, { - "epoch": 1.2400674211413436, - "eval_loss": 2.851064682006836, - "eval_runtime": 0.2139, - "eval_samples_per_second": 201.029, - "eval_steps_per_second": 4.675, + "epoch": 7.346647646219687, + "eval_loss": 1.9304108619689941, + "eval_runtime": 0.222, + "eval_samples_per_second": 193.684, + "eval_steps_per_second": 4.504, "step": 10300 }, { - "epoch": 1.241271370093908, - "grad_norm": 5.375, + "epoch": 7.353780313837375, + "grad_norm": 9.5625, "learning_rate": 6.529333333333333e-05, - "loss": 0.5576, + "loss": 0.7775, "step": 10310 }, { - "epoch": 1.2424753190464723, - "grad_norm": 9.9375, + "epoch": 7.360912981455064, + "grad_norm": 8.0625, "learning_rate": 6.52488888888889e-05, - "loss": 0.6111, + "loss": 0.7669, "step": 10320 }, { - "epoch": 1.243679267999037, - "grad_norm": 7.34375, + "epoch": 7.368045649072753, + "grad_norm": 7.0625, "learning_rate": 6.520444444444444e-05, - "loss": 0.6349, + "loss": 0.897, "step": 10330 }, { - "epoch": 1.2448832169516013, - "grad_norm": 7.8125, + "epoch": 7.375178316690442, + "grad_norm": 7.4375, "learning_rate": 6.515999999999999e-05, - "loss": 0.5117, + "loss": 0.7859, "step": 10340 }, { - "epoch": 1.2460871659041657, - "grad_norm": 9.9375, + "epoch": 7.382310984308131, + "grad_norm": 10.25, "learning_rate": 6.511555555555557e-05, - "loss": 0.5363, + "loss": 0.9447, "step": 10350 }, { - "epoch": 1.24729111485673, - "grad_norm": 10.0625, + "epoch": 7.38944365192582, + "grad_norm": 7.21875, "learning_rate": 6.507111111111112e-05, - "loss": 0.694, + "loss": 0.78, "step": 10360 }, { - "epoch": 1.2484950638092944, - "grad_norm": 9.625, + "epoch": 7.3965763195435095, + "grad_norm": 8.625, "learning_rate": 6.502666666666667e-05, - "loss": 0.5528, + "loss": 0.9362, "step": 10370 }, { - "epoch": 1.2496990127618588, - "grad_norm": 7.25, + "epoch": 7.403708987161198, + "grad_norm": 8.125, "learning_rate": 6.498222222222223e-05, - "loss": 0.5428, + "loss": 0.7343, "step": 10380 }, { - "epoch": 1.2509029617144234, - "grad_norm": 7.59375, + "epoch": 7.410841654778888, + "grad_norm": 8.125, "learning_rate": 6.493777777777779e-05, - "loss": 0.6291, + "loss": 0.8328, "step": 10390 }, { - "epoch": 1.2521069106669878, - "grad_norm": 7.28125, + "epoch": 7.417974322396576, + "grad_norm": 7.8125, "learning_rate": 6.489333333333334e-05, - "loss": 0.5882, + "loss": 0.8261, "step": 10400 }, { - "epoch": 1.2521069106669878, - "eval/acc": 46.511627197265625, + "epoch": 7.417974322396576, + "eval/acc": 62.79069900512695, "step": 10400 }, { - "epoch": 1.2521069106669878, - "eval_loss": 2.878549098968506, - "eval_runtime": 0.2109, - "eval_samples_per_second": 203.843, - "eval_steps_per_second": 4.741, + "epoch": 7.417974322396576, + "eval_loss": 1.9274901151657104, + "eval_runtime": 0.2217, + "eval_samples_per_second": 193.989, + "eval_steps_per_second": 4.511, "step": 10400 }, { - "epoch": 1.2533108596195521, - "grad_norm": 9.25, + "epoch": 7.425106990014266, + "grad_norm": 48.0, "learning_rate": 6.484888888888889e-05, - "loss": 0.5934, + "loss": 0.8167, "step": 10410 }, { - "epoch": 1.2545148085721165, - "grad_norm": 6.84375, + "epoch": 7.432239657631954, + "grad_norm": 8.4375, "learning_rate": 6.480444444444445e-05, - "loss": 0.714, + "loss": 0.8262, "step": 10420 }, { - "epoch": 1.2557187575246809, - "grad_norm": 8.375, + "epoch": 7.439372325249644, + "grad_norm": 6.90625, "learning_rate": 6.476e-05, - "loss": 0.7209, + "loss": 0.9254, "step": 10430 }, { - "epoch": 1.2569227064772455, - "grad_norm": 7.96875, + "epoch": 7.446504992867332, + "grad_norm": 8.5625, "learning_rate": 6.471555555555556e-05, - "loss": 0.6045, + "loss": 0.7657, "step": 10440 }, { - "epoch": 1.2581266554298098, - "grad_norm": 9.125, + "epoch": 7.453637660485022, + "grad_norm": 6.875, "learning_rate": 6.467111111111112e-05, - "loss": 0.632, + "loss": 0.8123, "step": 10450 }, { - "epoch": 1.2593306043823742, - "grad_norm": 6.9375, + "epoch": 7.4607703281027105, + "grad_norm": 8.5625, "learning_rate": 6.462666666666667e-05, - "loss": 0.5078, + "loss": 0.8951, "step": 10460 }, { - "epoch": 1.2605345533349386, - "grad_norm": 7.375, + "epoch": 7.467902995720399, + "grad_norm": 7.46875, "learning_rate": 6.458222222222222e-05, - "loss": 0.6352, + "loss": 0.8287, "step": 10470 }, { - "epoch": 1.261738502287503, - "grad_norm": 7.375, + "epoch": 7.4750356633380886, + "grad_norm": 6.28125, "learning_rate": 6.453777777777778e-05, - "loss": 0.6588, + "loss": 0.7364, "step": 10480 }, { - "epoch": 1.2629424512400673, - "grad_norm": 8.625, + "epoch": 7.482168330955777, + "grad_norm": 7.625, "learning_rate": 6.449333333333334e-05, - "loss": 0.6612, + "loss": 0.9265, "step": 10490 }, { - "epoch": 1.264146400192632, - "grad_norm": 6.78125, + "epoch": 7.489300998573467, + "grad_norm": 7.15625, "learning_rate": 6.444888888888889e-05, - "loss": 0.5578, + "loss": 0.7547, "step": 10500 }, { - "epoch": 1.264146400192632, - "eval/acc": 46.511627197265625, + "epoch": 7.489300998573467, + "eval/acc": 62.79069900512695, "step": 10500 }, { - "epoch": 1.264146400192632, - "eval_loss": 2.8044533729553223, - "eval_runtime": 0.2198, - "eval_samples_per_second": 195.595, - "eval_steps_per_second": 4.549, + "epoch": 7.489300998573467, + "eval_loss": 1.9239764213562012, + "eval_runtime": 0.2285, + "eval_samples_per_second": 188.187, + "eval_steps_per_second": 4.376, "step": 10500 }, { - "epoch": 1.2653503491451963, - "grad_norm": 7.0625, + "epoch": 7.496433666191155, + "grad_norm": 7.875, "learning_rate": 6.440444444444444e-05, - "loss": 0.5674, + "loss": 0.8612, "step": 10510 }, { - "epoch": 1.2665542980977607, - "grad_norm": 7.5, + "epoch": 7.503566333808845, + "grad_norm": 7.46875, "learning_rate": 6.436e-05, - "loss": 0.5692, + "loss": 0.8751, "step": 10520 }, { - "epoch": 1.267758247050325, - "grad_norm": 6.96875, + "epoch": 7.510699001426533, + "grad_norm": 6.78125, "learning_rate": 6.431555555555556e-05, - "loss": 0.5209, + "loss": 0.7706, "step": 10530 }, { - "epoch": 1.2689621960028894, - "grad_norm": 6.625, + "epoch": 7.517831669044223, + "grad_norm": 6.375, "learning_rate": 6.427111111111111e-05, - "loss": 0.7402, + "loss": 0.7602, "step": 10540 }, { - "epoch": 1.270166144955454, - "grad_norm": 8.5625, + "epoch": 7.5249643366619114, + "grad_norm": 7.1875, "learning_rate": 6.422666666666667e-05, - "loss": 0.6213, + "loss": 0.7953, "step": 10550 }, { - "epoch": 1.2713700939080184, - "grad_norm": 6.625, + "epoch": 7.532097004279601, + "grad_norm": 6.5, "learning_rate": 6.418222222222222e-05, - "loss": 0.587, + "loss": 0.871, "step": 10560 }, { - "epoch": 1.2725740428605827, - "grad_norm": 8.3125, + "epoch": 7.5392296718972895, + "grad_norm": 6.65625, "learning_rate": 6.413777777777778e-05, - "loss": 0.5949, + "loss": 0.7343, "step": 10570 }, { - "epoch": 1.273777991813147, - "grad_norm": 5.9375, + "epoch": 7.546362339514978, + "grad_norm": 6.3125, "learning_rate": 6.409333333333333e-05, - "loss": 0.5501, + "loss": 0.8275, "step": 10580 }, { - "epoch": 1.2749819407657115, - "grad_norm": 4.59375, + "epoch": 7.553495007132668, + "grad_norm": 6.125, "learning_rate": 6.40488888888889e-05, - "loss": 0.5145, + "loss": 0.8243, "step": 10590 }, { - "epoch": 1.2761858897182758, - "grad_norm": 8.6875, + "epoch": 7.560627674750357, + "grad_norm": 7.75, "learning_rate": 6.400444444444444e-05, - "loss": 0.6859, + "loss": 0.8731, "step": 10600 }, { - "epoch": 1.2761858897182758, - "eval/acc": 46.511627197265625, + "epoch": 7.560627674750357, + "eval/acc": 58.13953399658203, "step": 10600 }, { - "epoch": 1.2761858897182758, - "eval_loss": 2.836024045944214, - "eval_runtime": 0.2165, - "eval_samples_per_second": 198.581, - "eval_steps_per_second": 4.618, + "epoch": 7.560627674750357, + "eval_loss": 1.9751547574996948, + "eval_runtime": 0.2209, + "eval_samples_per_second": 194.634, + "eval_steps_per_second": 4.526, "step": 10600 }, { - "epoch": 1.2773898386708404, - "grad_norm": 7.6875, + "epoch": 7.567760342368046, + "grad_norm": 7.34375, "learning_rate": 6.396e-05, - "loss": 0.5479, + "loss": 0.7555, "step": 10610 }, { - "epoch": 1.2785937876234048, - "grad_norm": 6.46875, + "epoch": 7.574893009985734, + "grad_norm": 9.1875, "learning_rate": 6.391555555555557e-05, - "loss": 0.6267, + "loss": 0.7415, "step": 10620 }, { - "epoch": 1.2797977365759692, - "grad_norm": 7.8125, + "epoch": 7.582025677603424, + "grad_norm": 11.875, "learning_rate": 6.387111111111111e-05, - "loss": 0.6473, + "loss": 0.7363, "step": 10630 }, { - "epoch": 1.2810016855285336, - "grad_norm": 8.75, + "epoch": 7.589158345221112, + "grad_norm": 7.90625, "learning_rate": 6.382666666666666e-05, - "loss": 0.7012, + "loss": 0.858, "step": 10640 }, { - "epoch": 1.282205634481098, - "grad_norm": 7.0625, + "epoch": 7.596291012838802, + "grad_norm": 8.25, "learning_rate": 6.378222222222223e-05, - "loss": 0.6147, + "loss": 0.7934, "step": 10650 }, { - "epoch": 1.2834095834336625, - "grad_norm": 8.1875, + "epoch": 7.6034236804564905, + "grad_norm": 6.84375, "learning_rate": 6.373777777777779e-05, - "loss": 0.6508, + "loss": 0.7867, "step": 10660 }, { - "epoch": 1.2846135323862269, - "grad_norm": 7.21875, + "epoch": 7.61055634807418, + "grad_norm": 8.3125, "learning_rate": 6.369333333333334e-05, - "loss": 0.5718, + "loss": 0.8519, "step": 10670 }, { - "epoch": 1.2858174813387913, - "grad_norm": 6.40625, + "epoch": 7.617689015691869, + "grad_norm": 8.25, "learning_rate": 6.36488888888889e-05, - "loss": 0.6092, + "loss": 0.8771, "step": 10680 }, { - "epoch": 1.2870214302913556, - "grad_norm": 8.5625, + "epoch": 7.624821683309558, + "grad_norm": 6.1875, "learning_rate": 6.360444444444445e-05, - "loss": 0.6562, + "loss": 0.8483, "step": 10690 }, { - "epoch": 1.28822537924392, - "grad_norm": 6.71875, + "epoch": 7.631954350927247, + "grad_norm": 34.25, "learning_rate": 6.356000000000001e-05, - "loss": 0.5452, + "loss": 0.8799, "step": 10700 }, { - "epoch": 1.28822537924392, - "eval/acc": 41.27906799316406, + "epoch": 7.631954350927247, + "eval/acc": 62.79069900512695, "step": 10700 }, { - "epoch": 1.28822537924392, - "eval_loss": 2.846574306488037, - "eval_runtime": 0.2094, - "eval_samples_per_second": 205.316, - "eval_steps_per_second": 4.775, + "epoch": 7.631954350927247, + "eval_loss": 1.9270039796829224, + "eval_runtime": 0.2382, + "eval_samples_per_second": 180.545, + "eval_steps_per_second": 4.199, "step": 10700 }, { - "epoch": 1.2894293281964844, - "grad_norm": 7.09375, + "epoch": 7.639087018544936, + "grad_norm": 7.875, "learning_rate": 6.351555555555556e-05, - "loss": 0.5298, + "loss": 0.8321, "step": 10710 }, { - "epoch": 1.290633277149049, - "grad_norm": 7.8125, + "epoch": 7.646219686162625, + "grad_norm": 7.0, "learning_rate": 6.347111111111112e-05, - "loss": 0.6176, + "loss": 0.8192, "step": 10720 }, { - "epoch": 1.2918372261016133, - "grad_norm": 6.0, + "epoch": 7.653352353780313, + "grad_norm": 8.25, "learning_rate": 6.342666666666667e-05, - "loss": 0.527, + "loss": 0.7631, "step": 10730 }, { - "epoch": 1.2930411750541777, - "grad_norm": 20.0, + "epoch": 7.660485021398003, + "grad_norm": 7.1875, "learning_rate": 6.338222222222222e-05, - "loss": 0.6201, + "loss": 0.8088, "step": 10740 }, { - "epoch": 1.294245124006742, - "grad_norm": 8.25, + "epoch": 7.6676176890156915, + "grad_norm": 6.5, "learning_rate": 6.333777777777779e-05, - "loss": 0.6072, + "loss": 0.7612, "step": 10750 }, { - "epoch": 1.2954490729593064, - "grad_norm": 10.3125, + "epoch": 7.674750356633381, + "grad_norm": 7.5, "learning_rate": 6.329333333333334e-05, - "loss": 0.6123, + "loss": 0.8282, "step": 10760 }, { - "epoch": 1.296653021911871, - "grad_norm": 7.0625, + "epoch": 7.68188302425107, + "grad_norm": 6.46875, "learning_rate": 6.324888888888889e-05, - "loss": 0.5529, + "loss": 0.8197, "step": 10770 }, { - "epoch": 1.2978569708644354, - "grad_norm": 7.5625, + "epoch": 7.689015691868759, + "grad_norm": 12.0, "learning_rate": 6.320444444444445e-05, - "loss": 0.5527, + "loss": 0.8304, "step": 10780 }, { - "epoch": 1.2990609198169998, - "grad_norm": 9.375, + "epoch": 7.696148359486448, + "grad_norm": 7.875, "learning_rate": 6.316000000000001e-05, - "loss": 0.562, + "loss": 0.8242, "step": 10790 }, { - "epoch": 1.3002648687695642, - "grad_norm": 6.0, + "epoch": 7.703281027104137, + "grad_norm": 7.34375, "learning_rate": 6.311555555555556e-05, - "loss": 0.5706, + "loss": 0.7904, "step": 10800 }, { - "epoch": 1.3002648687695642, - "eval/acc": 39.53488540649414, + "epoch": 7.703281027104137, + "eval/acc": 60.46511459350586, "step": 10800 }, { - "epoch": 1.3002648687695642, - "eval_loss": 2.8325037956237793, - "eval_runtime": 0.9046, - "eval_samples_per_second": 47.536, - "eval_steps_per_second": 1.105, + "epoch": 7.703281027104137, + "eval_loss": 1.931999683380127, + "eval_runtime": 0.2226, + "eval_samples_per_second": 193.21, + "eval_steps_per_second": 4.493, "step": 10800 }, { - "epoch": 1.3014688177221285, - "grad_norm": 6.6875, + "epoch": 7.710413694721826, + "grad_norm": 8.625, "learning_rate": 6.307111111111111e-05, - "loss": 0.5263, + "loss": 0.861, "step": 10810 }, { - "epoch": 1.302672766674693, - "grad_norm": 7.3125, + "epoch": 7.717546362339515, + "grad_norm": 12.0, "learning_rate": 6.302666666666667e-05, - "loss": 0.5811, + "loss": 0.7917, "step": 10820 }, { - "epoch": 1.3038767156272573, - "grad_norm": 7.65625, + "epoch": 7.724679029957204, + "grad_norm": 6.5, "learning_rate": 6.298222222222222e-05, - "loss": 0.6056, + "loss": 0.709, "step": 10830 }, { - "epoch": 1.3050806645798219, - "grad_norm": 7.1875, + "epoch": 7.731811697574893, + "grad_norm": 6.96875, "learning_rate": 6.293777777777778e-05, - "loss": 0.625, + "loss": 0.8168, "step": 10840 }, { - "epoch": 1.3062846135323862, - "grad_norm": 8.0625, + "epoch": 7.738944365192582, + "grad_norm": 7.625, "learning_rate": 6.289333333333334e-05, - "loss": 0.5916, + "loss": 0.7357, "step": 10850 }, { - "epoch": 1.3074885624849506, - "grad_norm": 7.46875, + "epoch": 7.7460770328102715, + "grad_norm": 17.125, "learning_rate": 6.284888888888889e-05, - "loss": 0.5399, + "loss": 0.7115, "step": 10860 }, { - "epoch": 1.308692511437515, - "grad_norm": 6.21875, + "epoch": 7.75320970042796, + "grad_norm": 6.78125, "learning_rate": 6.280444444444444e-05, - "loss": 0.5895, + "loss": 0.6973, "step": 10870 }, { - "epoch": 1.3098964603900796, - "grad_norm": 15.5, + "epoch": 7.760342368045649, + "grad_norm": 6.75, "learning_rate": 6.276e-05, - "loss": 0.6447, + "loss": 0.7925, "step": 10880 }, { - "epoch": 1.311100409342644, - "grad_norm": 10.625, + "epoch": 7.767475035663338, + "grad_norm": 6.78125, "learning_rate": 6.271555555555556e-05, - "loss": 0.6577, + "loss": 0.7927, "step": 10890 }, { - "epoch": 1.3123043582952083, - "grad_norm": 8.0625, + "epoch": 7.774607703281027, + "grad_norm": 7.375, "learning_rate": 6.267111111111111e-05, - "loss": 0.6119, + "loss": 0.9383, "step": 10900 }, { - "epoch": 1.3123043582952083, - "eval/acc": 44.1860466003418, + "epoch": 7.774607703281027, + "eval/acc": 62.79069900512695, "step": 10900 }, { - "epoch": 1.3123043582952083, - "eval_loss": 2.8269896507263184, - "eval_runtime": 0.2116, - "eval_samples_per_second": 203.171, - "eval_steps_per_second": 4.725, + "epoch": 7.774607703281027, + "eval_loss": 1.947619915008545, + "eval_runtime": 0.2178, + "eval_samples_per_second": 197.442, + "eval_steps_per_second": 4.592, "step": 10900 }, { - "epoch": 1.3135083072477727, - "grad_norm": 6.21875, + "epoch": 7.781740370898716, + "grad_norm": 13.75, "learning_rate": 6.262666666666666e-05, - "loss": 0.5292, + "loss": 0.8463, "step": 10910 }, { - "epoch": 1.314712256200337, - "grad_norm": 7.5, + "epoch": 7.788873038516405, + "grad_norm": 10.875, "learning_rate": 6.258222222222222e-05, - "loss": 0.6174, + "loss": 0.7938, "step": 10920 }, { - "epoch": 1.3159162051529014, - "grad_norm": 7.71875, + "epoch": 7.796005706134094, + "grad_norm": 7.75, "learning_rate": 6.253777777777779e-05, - "loss": 0.6011, + "loss": 0.8174, "step": 10930 }, { - "epoch": 1.3171201541054658, - "grad_norm": 6.375, + "epoch": 7.803138373751783, + "grad_norm": 6.3125, "learning_rate": 6.249333333333333e-05, - "loss": 0.6249, + "loss": 0.7583, "step": 10940 }, { - "epoch": 1.3183241030580304, - "grad_norm": 7.1875, + "epoch": 7.8102710413694725, + "grad_norm": 6.625, "learning_rate": 6.24488888888889e-05, - "loss": 0.5995, + "loss": 0.7677, "step": 10950 }, { - "epoch": 1.3195280520105948, - "grad_norm": 6.46875, + "epoch": 7.817403708987161, + "grad_norm": 7.03125, "learning_rate": 6.240444444444444e-05, - "loss": 0.5469, + "loss": 0.8211, "step": 10960 }, { - "epoch": 1.3207320009631591, - "grad_norm": 8.1875, + "epoch": 7.824536376604851, + "grad_norm": 6.78125, "learning_rate": 6.236e-05, - "loss": 0.6817, + "loss": 0.8165, "step": 10970 }, { - "epoch": 1.3219359499157235, - "grad_norm": 8.875, + "epoch": 7.831669044222539, + "grad_norm": 7.25, "learning_rate": 6.231555555555555e-05, - "loss": 0.6015, + "loss": 0.8452, "step": 10980 }, { - "epoch": 1.323139898868288, - "grad_norm": 9.25, + "epoch": 7.838801711840228, + "grad_norm": 7.78125, "learning_rate": 6.227111111111112e-05, - "loss": 0.5469, + "loss": 0.7316, "step": 10990 }, { - "epoch": 1.3243438478208525, - "grad_norm": 5.4375, + "epoch": 7.845934379457917, + "grad_norm": 7.1875, "learning_rate": 6.222666666666666e-05, - "loss": 0.6355, + "loss": 0.7908, "step": 11000 }, { - "epoch": 1.3243438478208525, - "eval/acc": 44.1860466003418, + "epoch": 7.845934379457917, + "eval/acc": 60.46511459350586, "step": 11000 }, { - "epoch": 1.3243438478208525, - "eval_loss": 2.7861974239349365, - "eval_runtime": 0.2061, - "eval_samples_per_second": 208.615, - "eval_steps_per_second": 4.852, + "epoch": 7.845934379457917, + "eval_loss": 1.9402235746383667, + "eval_runtime": 0.2151, + "eval_samples_per_second": 199.927, + "eval_steps_per_second": 4.649, "step": 11000 }, { - "epoch": 1.3255477967734168, - "grad_norm": 6.5, + "epoch": 7.853067047075607, + "grad_norm": 7.3125, "learning_rate": 6.218222222222223e-05, - "loss": 0.6631, + "loss": 0.8226, "step": 11010 }, { - "epoch": 1.3267517457259812, - "grad_norm": 8.5625, + "epoch": 7.860199714693295, + "grad_norm": 6.1875, "learning_rate": 6.213777777777779e-05, - "loss": 0.7161, + "loss": 0.7946, "step": 11020 }, { - "epoch": 1.3279556946785456, - "grad_norm": 6.78125, + "epoch": 7.867332382310984, + "grad_norm": 6.21875, "learning_rate": 6.209333333333334e-05, - "loss": 0.5578, + "loss": 0.8494, "step": 11030 }, { - "epoch": 1.32915964363111, - "grad_norm": 9.0625, + "epoch": 7.8744650499286735, + "grad_norm": 6.875, "learning_rate": 6.204888888888889e-05, - "loss": 0.5986, + "loss": 0.7066, "step": 11040 }, { - "epoch": 1.3303635925836743, - "grad_norm": 10.25, + "epoch": 7.881597717546362, + "grad_norm": 6.375, "learning_rate": 6.200444444444445e-05, - "loss": 0.5198, + "loss": 0.8499, "step": 11050 }, { - "epoch": 1.331567541536239, - "grad_norm": 3.796875, + "epoch": 7.888730385164052, + "grad_norm": 8.0, "learning_rate": 6.196000000000001e-05, - "loss": 0.5459, + "loss": 0.8761, "step": 11060 }, { - "epoch": 1.3327714904888033, - "grad_norm": 7.125, + "epoch": 7.89586305278174, + "grad_norm": 5.75, "learning_rate": 6.191555555555556e-05, - "loss": 0.5896, + "loss": 0.8536, "step": 11070 }, { - "epoch": 1.3339754394413676, - "grad_norm": 7.34375, + "epoch": 7.90299572039943, + "grad_norm": 7.0, "learning_rate": 6.18711111111111e-05, - "loss": 0.5403, + "loss": 0.9413, "step": 11080 }, { - "epoch": 1.335179388393932, - "grad_norm": 9.125, + "epoch": 7.910128388017118, + "grad_norm": 8.0, "learning_rate": 6.182666666666667e-05, - "loss": 0.6377, + "loss": 0.7626, "step": 11090 }, { - "epoch": 1.3363833373464966, - "grad_norm": 8.8125, + "epoch": 7.917261055634808, + "grad_norm": 6.375, "learning_rate": 6.178222222222223e-05, - "loss": 0.6292, + "loss": 0.8177, "step": 11100 }, { - "epoch": 1.3363833373464966, - "eval/acc": 41.86046600341797, + "epoch": 7.917261055634808, + "eval/acc": 65.11627960205078, "step": 11100 }, { - "epoch": 1.3363833373464966, - "eval_loss": 2.8322744369506836, - "eval_runtime": 0.2115, - "eval_samples_per_second": 203.288, - "eval_steps_per_second": 4.728, + "epoch": 7.917261055634808, + "eval_loss": 1.8976689577102661, + "eval_runtime": 0.2399, + "eval_samples_per_second": 179.237, + "eval_steps_per_second": 4.168, "step": 11100 }, { - "epoch": 1.337587286299061, - "grad_norm": 6.28125, + "epoch": 7.924393723252496, + "grad_norm": 7.4375, "learning_rate": 6.173777777777778e-05, - "loss": 0.6421, + "loss": 0.8178, "step": 11110 }, { - "epoch": 1.3387912352516254, - "grad_norm": 6.21875, + "epoch": 7.931526390870186, + "grad_norm": 35.25, "learning_rate": 6.169333333333334e-05, - "loss": 0.6178, + "loss": 0.7931, "step": 11120 }, { - "epoch": 1.3399951842041897, - "grad_norm": 6.8125, + "epoch": 7.9386590584878745, + "grad_norm": 26.75, "learning_rate": 6.164888888888889e-05, - "loss": 0.704, + "loss": 0.7883, "step": 11130 }, { - "epoch": 1.341199133156754, - "grad_norm": 9.1875, + "epoch": 7.945791726105563, + "grad_norm": 6.375, "learning_rate": 6.160444444444444e-05, - "loss": 0.5763, + "loss": 0.7407, "step": 11140 }, { - "epoch": 1.3424030821093185, - "grad_norm": 9.625, + "epoch": 7.9529243937232525, + "grad_norm": 8.5, "learning_rate": 6.156e-05, - "loss": 0.6345, + "loss": 0.8509, "step": 11150 }, { - "epoch": 1.3436070310618828, - "grad_norm": 7.15625, + "epoch": 7.960057061340941, + "grad_norm": 7.34375, "learning_rate": 6.151555555555556e-05, - "loss": 0.5969, + "loss": 0.7948, "step": 11160 }, { - "epoch": 1.3448109800144474, - "grad_norm": 7.65625, + "epoch": 7.967189728958631, + "grad_norm": 5.90625, "learning_rate": 6.147111111111111e-05, - "loss": 0.6219, + "loss": 0.8066, "step": 11170 }, { - "epoch": 1.3460149289670118, - "grad_norm": 7.46875, + "epoch": 7.974322396576319, + "grad_norm": 6.8125, "learning_rate": 6.142666666666666e-05, - "loss": 0.5902, + "loss": 0.7545, "step": 11180 }, { - "epoch": 1.3472188779195762, - "grad_norm": 8.4375, + "epoch": 7.981455064194009, + "grad_norm": 7.40625, "learning_rate": 6.138222222222223e-05, - "loss": 0.6771, + "loss": 0.8842, "step": 11190 }, { - "epoch": 1.3484228268721405, - "grad_norm": 9.875, + "epoch": 7.988587731811697, + "grad_norm": 8.625, "learning_rate": 6.133777777777778e-05, - "loss": 0.5981, + "loss": 0.8874, "step": 11200 }, { - "epoch": 1.3484228268721405, - "eval/acc": 41.86046600341797, + "epoch": 7.988587731811697, + "eval/acc": 60.46511459350586, "step": 11200 }, { - "epoch": 1.3484228268721405, - "eval_loss": 2.8496346473693848, - "eval_runtime": 0.2147, - "eval_samples_per_second": 200.315, - "eval_steps_per_second": 4.658, + "epoch": 7.988587731811697, + "eval_loss": 1.9585436582565308, + "eval_runtime": 0.2145, + "eval_samples_per_second": 200.499, + "eval_steps_per_second": 4.663, "step": 11200 }, { - "epoch": 1.3496267758247051, - "grad_norm": 8.4375, + "epoch": 7.995720399429387, + "grad_norm": 10.125, "learning_rate": 6.129333333333333e-05, - "loss": 0.6014, + "loss": 0.8723, "step": 11210 }, { - "epoch": 1.3508307247772695, - "grad_norm": 6.625, + "epoch": 8.002853067047075, + "grad_norm": 6.375, "learning_rate": 6.12488888888889e-05, - "loss": 0.5484, + "loss": 0.7986, "step": 11220 }, { - "epoch": 1.3520346737298339, - "grad_norm": 8.8125, + "epoch": 8.009985734664765, + "grad_norm": 7.34375, "learning_rate": 6.120444444444444e-05, - "loss": 0.6505, + "loss": 0.8382, "step": 11230 }, { - "epoch": 1.3532386226823983, - "grad_norm": 9.0, + "epoch": 8.017118402282454, + "grad_norm": 6.21875, "learning_rate": 6.116e-05, - "loss": 0.7428, + "loss": 0.796, "step": 11240 }, { - "epoch": 1.3544425716349626, - "grad_norm": 6.03125, + "epoch": 8.024251069900142, + "grad_norm": 30.5, "learning_rate": 6.111555555555557e-05, - "loss": 0.5092, + "loss": 0.8541, "step": 11250 }, { - "epoch": 1.355646520587527, - "grad_norm": 8.375, + "epoch": 8.031383737517832, + "grad_norm": 7.90625, "learning_rate": 6.107111111111111e-05, - "loss": 0.6589, + "loss": 0.7689, "step": 11260 }, { - "epoch": 1.3568504695400914, - "grad_norm": 5.1875, + "epoch": 8.038516405135521, + "grad_norm": 10.375, "learning_rate": 6.102666666666666e-05, - "loss": 0.7026, + "loss": 0.803, "step": 11270 }, { - "epoch": 1.358054418492656, - "grad_norm": 19.125, + "epoch": 8.045649072753209, + "grad_norm": 8.3125, "learning_rate": 6.098222222222223e-05, - "loss": 0.6705, + "loss": 0.9584, "step": 11280 }, { - "epoch": 1.3592583674452203, - "grad_norm": 5.625, + "epoch": 8.052781740370898, + "grad_norm": 7.8125, "learning_rate": 6.093777777777778e-05, - "loss": 0.6484, + "loss": 0.761, "step": 11290 }, { - "epoch": 1.3604623163977847, - "grad_norm": 9.9375, + "epoch": 8.059914407988588, + "grad_norm": 9.125, "learning_rate": 6.0893333333333335e-05, - "loss": 0.5762, + "loss": 0.7506, "step": 11300 }, { - "epoch": 1.3604623163977847, - "eval/acc": 47.67441940307617, + "epoch": 8.059914407988588, + "eval/acc": 48.83720779418945, "step": 11300 }, { - "epoch": 1.3604623163977847, - "eval_loss": 2.804438591003418, - "eval_runtime": 0.2105, - "eval_samples_per_second": 204.261, - "eval_steps_per_second": 4.75, + "epoch": 8.059914407988588, + "eval_loss": 2.348471164703369, + "eval_runtime": 0.9666, + "eval_samples_per_second": 44.484, + "eval_steps_per_second": 1.035, "step": 11300 }, { - "epoch": 1.361666265350349, - "grad_norm": 11.4375, + "epoch": 8.067047075606277, + "grad_norm": 9.0625, "learning_rate": 6.084888888888889e-05, - "loss": 0.63, + "loss": 0.7246, "step": 11310 }, { - "epoch": 1.3628702143029137, - "grad_norm": 7.625, + "epoch": 8.074179743223965, + "grad_norm": 24.5, "learning_rate": 6.080444444444445e-05, - "loss": 0.5893, + "loss": 0.8399, "step": 11320 }, { - "epoch": 1.364074163255478, - "grad_norm": 7.5625, + "epoch": 8.081312410841655, + "grad_norm": 8.0625, "learning_rate": 6.076000000000001e-05, - "loss": 0.6789, + "loss": 0.8196, "step": 11330 }, { - "epoch": 1.3652781122080424, - "grad_norm": 6.875, + "epoch": 8.088445078459344, + "grad_norm": 7.5625, "learning_rate": 6.0715555555555556e-05, - "loss": 0.5739, + "loss": 0.7496, "step": 11340 }, { - "epoch": 1.3664820611606068, - "grad_norm": 7.9375, + "epoch": 8.095577746077034, + "grad_norm": 10.6875, "learning_rate": 6.067111111111111e-05, - "loss": 0.593, + "loss": 0.791, "step": 11350 }, { - "epoch": 1.3676860101131711, - "grad_norm": 6.03125, + "epoch": 8.102710413694721, + "grad_norm": 7.28125, "learning_rate": 6.062666666666667e-05, - "loss": 0.6003, + "loss": 0.7064, "step": 11360 }, { - "epoch": 1.3688899590657355, - "grad_norm": 7.21875, + "epoch": 8.10984308131241, + "grad_norm": 7.28125, "learning_rate": 6.058222222222223e-05, - "loss": 0.6658, + "loss": 0.8306, "step": 11370 }, { - "epoch": 1.3700939080182999, - "grad_norm": 6.25, + "epoch": 8.1169757489301, + "grad_norm": 7.84375, "learning_rate": 6.0537777777777784e-05, - "loss": 0.5438, + "loss": 0.8394, "step": 11380 }, { - "epoch": 1.3712978569708645, - "grad_norm": 7.21875, + "epoch": 8.12410841654779, + "grad_norm": 6.5625, "learning_rate": 6.049333333333333e-05, - "loss": 0.5269, + "loss": 0.789, "step": 11390 }, { - "epoch": 1.3725018059234289, - "grad_norm": 8.875, + "epoch": 8.131241084165477, + "grad_norm": 7.125, "learning_rate": 6.044888888888889e-05, - "loss": 0.5357, + "loss": 0.7752, "step": 11400 }, { - "epoch": 1.3725018059234289, - "eval/acc": 45.930233001708984, + "epoch": 8.131241084165477, + "eval/acc": 48.83720779418945, "step": 11400 }, { - "epoch": 1.3725018059234289, - "eval_loss": 2.79295015335083, - "eval_runtime": 0.5452, - "eval_samples_per_second": 78.877, - "eval_steps_per_second": 1.834, + "epoch": 8.131241084165477, + "eval_loss": 2.3455872535705566, + "eval_runtime": 0.2144, + "eval_samples_per_second": 200.559, + "eval_steps_per_second": 4.664, "step": 11400 }, { - "epoch": 1.3737057548759932, - "grad_norm": 6.375, + "epoch": 8.138373751783167, + "grad_norm": 6.75, "learning_rate": 6.040444444444445e-05, - "loss": 0.6255, + "loss": 0.7773, "step": 11410 }, { - "epoch": 1.3749097038285576, - "grad_norm": 6.96875, + "epoch": 8.145506419400856, + "grad_norm": 7.8125, "learning_rate": 6.0360000000000005e-05, - "loss": 0.6689, + "loss": 0.7369, "step": 11420 }, { - "epoch": 1.3761136527811222, - "grad_norm": 7.28125, + "epoch": 8.152639087018544, + "grad_norm": 6.4375, "learning_rate": 6.031555555555556e-05, - "loss": 0.5944, + "loss": 0.8158, "step": 11430 }, { - "epoch": 1.3773176017336866, - "grad_norm": 5.90625, + "epoch": 8.159771754636234, + "grad_norm": 7.53125, "learning_rate": 6.027111111111111e-05, - "loss": 0.6054, + "loss": 0.874, "step": 11440 }, { - "epoch": 1.378521550686251, - "grad_norm": 7.09375, + "epoch": 8.166904422253923, + "grad_norm": 8.0625, "learning_rate": 6.0226666666666664e-05, - "loss": 0.5204, + "loss": 0.7564, "step": 11450 }, { - "epoch": 1.3797254996388153, - "grad_norm": 8.125, + "epoch": 8.174037089871613, + "grad_norm": 6.65625, "learning_rate": 6.0182222222222226e-05, - "loss": 0.5088, + "loss": 0.8675, "step": 11460 }, { - "epoch": 1.3809294485913797, - "grad_norm": 7.96875, + "epoch": 8.1811697574893, + "grad_norm": 7.34375, "learning_rate": 6.013777777777778e-05, - "loss": 0.5873, + "loss": 0.8338, "step": 11470 }, { - "epoch": 1.382133397543944, - "grad_norm": 7.8125, + "epoch": 8.18830242510699, + "grad_norm": 8.75, "learning_rate": 6.0093333333333336e-05, - "loss": 0.5889, + "loss": 0.7316, "step": 11480 }, { - "epoch": 1.3833373464965084, - "grad_norm": 7.3125, + "epoch": 8.19543509272468, + "grad_norm": 8.625, "learning_rate": 6.0048888888888885e-05, - "loss": 0.6799, + "loss": 0.8842, "step": 11490 }, { - "epoch": 1.384541295449073, - "grad_norm": 17.0, + "epoch": 8.202567760342369, + "grad_norm": 11.3125, "learning_rate": 6.0004444444444453e-05, - "loss": 0.5965, + "loss": 0.7852, "step": 11500 }, { - "epoch": 1.384541295449073, - "eval/acc": 46.511627197265625, + "epoch": 8.202567760342369, + "eval/acc": 48.83720779418945, "step": 11500 }, { - "epoch": 1.384541295449073, - "eval_loss": 2.8261971473693848, - "eval_runtime": 1.1762, - "eval_samples_per_second": 36.557, - "eval_steps_per_second": 0.85, + "epoch": 8.202567760342369, + "eval_loss": 2.352907657623291, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.909, + "eval_steps_per_second": 4.719, "step": 11500 }, { - "epoch": 1.3857452444016374, - "grad_norm": 7.5, + "epoch": 8.209700427960057, + "grad_norm": 7.9375, "learning_rate": 5.996e-05, - "loss": 0.5685, + "loss": 0.7898, "step": 11510 }, { - "epoch": 1.3869491933542017, - "grad_norm": 7.8125, + "epoch": 8.216833095577746, + "grad_norm": 7.21875, "learning_rate": 5.991555555555556e-05, - "loss": 0.6299, + "loss": 0.7728, "step": 11520 }, { - "epoch": 1.3881531423067661, - "grad_norm": 6.34375, + "epoch": 8.223965763195435, + "grad_norm": 8.1875, "learning_rate": 5.987111111111111e-05, - "loss": 0.6243, + "loss": 0.7456, "step": 11530 }, { - "epoch": 1.3893570912593307, - "grad_norm": 7.84375, + "epoch": 8.231098430813125, + "grad_norm": 7.1875, "learning_rate": 5.982666666666666e-05, - "loss": 0.6387, + "loss": 0.8461, "step": 11540 }, { - "epoch": 1.390561040211895, - "grad_norm": 7.5, + "epoch": 8.238231098430813, + "grad_norm": 7.9375, "learning_rate": 5.978222222222223e-05, - "loss": 0.6561, + "loss": 0.7297, "step": 11550 }, { - "epoch": 1.3917649891644595, - "grad_norm": 9.125, + "epoch": 8.245363766048502, + "grad_norm": 6.75, "learning_rate": 5.973777777777778e-05, - "loss": 0.6064, + "loss": 0.8327, "step": 11560 }, { - "epoch": 1.3929689381170238, - "grad_norm": 8.3125, + "epoch": 8.252496433666192, + "grad_norm": 6.1875, "learning_rate": 5.969333333333333e-05, - "loss": 0.6107, + "loss": 0.8054, "step": 11570 }, { - "epoch": 1.3941728870695882, - "grad_norm": 8.0, + "epoch": 8.25962910128388, + "grad_norm": 8.375, "learning_rate": 5.964888888888889e-05, - "loss": 0.7101, + "loss": 0.7853, "step": 11580 }, { - "epoch": 1.3953768360221526, - "grad_norm": 4.84375, + "epoch": 8.266761768901569, + "grad_norm": 8.9375, "learning_rate": 5.960444444444445e-05, - "loss": 0.5981, + "loss": 0.7891, "step": 11590 }, { - "epoch": 1.396580784974717, - "grad_norm": 5.25, + "epoch": 8.273894436519258, + "grad_norm": 7.8125, "learning_rate": 5.9560000000000006e-05, - "loss": 0.5498, + "loss": 0.7407, "step": 11600 }, { - "epoch": 1.396580784974717, - "eval/acc": 44.1860466003418, + "epoch": 8.273894436519258, + "eval/acc": 55.8139533996582, "step": 11600 }, { - "epoch": 1.396580784974717, - "eval_loss": 2.822185516357422, - "eval_runtime": 2.963, - "eval_samples_per_second": 14.512, - "eval_steps_per_second": 0.337, + "epoch": 8.273894436519258, + "eval_loss": 2.3408679962158203, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.913, + "eval_steps_per_second": 4.719, "step": 11600 }, { - "epoch": 1.3977847339272815, - "grad_norm": 10.875, + "epoch": 8.281027104136948, + "grad_norm": 16.5, "learning_rate": 5.951555555555556e-05, - "loss": 0.65, + "loss": 0.7662, "step": 11610 }, { - "epoch": 1.398988682879846, - "grad_norm": 8.375, + "epoch": 8.288159771754636, + "grad_norm": 9.1875, "learning_rate": 5.947111111111111e-05, - "loss": 0.604, + "loss": 0.8136, "step": 11620 }, { - "epoch": 1.4001926318324103, - "grad_norm": 7.21875, + "epoch": 8.295292439372325, + "grad_norm": 6.5, "learning_rate": 5.942666666666668e-05, - "loss": 0.5246, + "loss": 0.8833, "step": 11630 }, { - "epoch": 1.4013965807849746, - "grad_norm": 7.8125, + "epoch": 8.302425106990015, + "grad_norm": 9.8125, "learning_rate": 5.938222222222223e-05, - "loss": 0.5189, + "loss": 0.7388, "step": 11640 }, { - "epoch": 1.4026005297375392, - "grad_norm": 4.78125, + "epoch": 8.309557774607704, + "grad_norm": 8.375, "learning_rate": 5.933777777777778e-05, - "loss": 0.5959, + "loss": 0.687, "step": 11650 }, { - "epoch": 1.4038044786901036, - "grad_norm": 8.875, + "epoch": 8.316690442225392, + "grad_norm": 6.75, "learning_rate": 5.929333333333334e-05, - "loss": 0.5887, + "loss": 0.7731, "step": 11660 }, { - "epoch": 1.405008427642668, - "grad_norm": 8.625, + "epoch": 8.323823109843081, + "grad_norm": 5.875, "learning_rate": 5.9248888888888886e-05, - "loss": 0.5538, + "loss": 0.8294, "step": 11670 }, { - "epoch": 1.4062123765952323, - "grad_norm": 5.8125, + "epoch": 8.33095577746077, + "grad_norm": 7.25, "learning_rate": 5.9204444444444454e-05, - "loss": 0.5945, + "loss": 0.8312, "step": 11680 }, { - "epoch": 1.4074163255477967, - "grad_norm": 6.9375, + "epoch": 8.338088445078458, + "grad_norm": 6.15625, "learning_rate": 5.916e-05, - "loss": 0.5444, + "loss": 0.8745, "step": 11690 }, { - "epoch": 1.408620274500361, - "grad_norm": 5.375, + "epoch": 8.345221112696148, + "grad_norm": 11.9375, "learning_rate": 5.911555555555556e-05, - "loss": 0.5762, + "loss": 0.8136, "step": 11700 }, { - "epoch": 1.408620274500361, - "eval/acc": 45.930233001708984, + "epoch": 8.345221112696148, + "eval/acc": 53.488372802734375, "step": 11700 }, { - "epoch": 1.408620274500361, - "eval_loss": 2.837869167327881, - "eval_runtime": 4.5845, - "eval_samples_per_second": 9.38, - "eval_steps_per_second": 0.218, + "epoch": 8.345221112696148, + "eval_loss": 2.348762273788452, + "eval_runtime": 1.1232, + "eval_samples_per_second": 38.285, + "eval_steps_per_second": 0.89, "step": 11700 }, { - "epoch": 1.4098242234529255, - "grad_norm": 6.90625, + "epoch": 8.352353780313837, + "grad_norm": 7.6875, "learning_rate": 5.907111111111111e-05, - "loss": 0.5918, + "loss": 0.8979, "step": 11710 }, { - "epoch": 1.41102817240549, - "grad_norm": 12.375, + "epoch": 8.359486447931527, + "grad_norm": 7.75, "learning_rate": 5.9026666666666675e-05, - "loss": 0.5955, + "loss": 0.7527, "step": 11720 }, { - "epoch": 1.4122321213580544, - "grad_norm": 13.4375, + "epoch": 8.366619115549215, + "grad_norm": 7.75, "learning_rate": 5.898222222222223e-05, - "loss": 0.5389, + "loss": 0.7397, "step": 11730 }, { - "epoch": 1.4134360703106188, - "grad_norm": 8.5625, + "epoch": 8.373751783166904, + "grad_norm": 7.125, "learning_rate": 5.893777777777778e-05, - "loss": 0.5927, + "loss": 0.7371, "step": 11740 }, { - "epoch": 1.4146400192631832, - "grad_norm": 7.625, + "epoch": 8.380884450784594, + "grad_norm": 7.09375, "learning_rate": 5.8893333333333334e-05, - "loss": 0.5979, + "loss": 0.7787, "step": 11750 }, { - "epoch": 1.4158439682157478, - "grad_norm": 6.5625, + "epoch": 8.388017118402283, + "grad_norm": 12.75, "learning_rate": 5.884888888888889e-05, - "loss": 0.4657, + "loss": 0.7745, "step": 11760 }, { - "epoch": 1.4170479171683121, - "grad_norm": 7.5, + "epoch": 8.39514978601997, + "grad_norm": 5.96875, "learning_rate": 5.880444444444445e-05, - "loss": 0.6833, + "loss": 0.7675, "step": 11770 }, { - "epoch": 1.4182518661208765, - "grad_norm": 12.5, + "epoch": 8.40228245363766, + "grad_norm": 7.28125, "learning_rate": 5.876000000000001e-05, - "loss": 0.6065, + "loss": 0.7369, "step": 11780 }, { - "epoch": 1.4194558150734409, - "grad_norm": 8.6875, + "epoch": 8.40941512125535, + "grad_norm": 8.5625, "learning_rate": 5.8715555555555555e-05, - "loss": 0.6406, + "loss": 0.7679, "step": 11790 }, { - "epoch": 1.4206597640260052, - "grad_norm": 7.625, + "epoch": 8.41654778887304, + "grad_norm": 6.09375, "learning_rate": 5.867111111111111e-05, - "loss": 0.54, + "loss": 0.7575, "step": 11800 }, { - "epoch": 1.4206597640260052, - "eval/acc": 44.1860466003418, + "epoch": 8.41654778887304, + "eval/acc": 48.83720779418945, "step": 11800 }, { - "epoch": 1.4206597640260052, - "eval_loss": 2.8732845783233643, - "eval_runtime": 0.2132, - "eval_samples_per_second": 201.731, - "eval_steps_per_second": 4.691, + "epoch": 8.41654778887304, + "eval_loss": 2.3886027336120605, + "eval_runtime": 0.2153, + "eval_samples_per_second": 199.752, + "eval_steps_per_second": 4.645, "step": 11800 }, { - "epoch": 1.4218637129785696, - "grad_norm": 13.25, + "epoch": 8.423680456490727, + "grad_norm": 7.75, "learning_rate": 5.862666666666667e-05, - "loss": 0.6655, + "loss": 0.7837, "step": 11810 }, { - "epoch": 1.423067661931134, - "grad_norm": 8.8125, + "epoch": 8.430813124108417, + "grad_norm": 7.03125, "learning_rate": 5.858222222222223e-05, - "loss": 0.5967, + "loss": 0.7153, "step": 11820 }, { - "epoch": 1.4242716108836986, - "grad_norm": 5.84375, + "epoch": 8.437945791726106, + "grad_norm": 9.3125, "learning_rate": 5.853777777777778e-05, - "loss": 0.6698, + "loss": 0.7655, "step": 11830 }, { - "epoch": 1.425475559836263, - "grad_norm": 5.375, + "epoch": 8.445078459343794, + "grad_norm": 7.34375, "learning_rate": 5.849333333333333e-05, - "loss": 0.5963, + "loss": 0.761, "step": 11840 }, { - "epoch": 1.4266795087888273, - "grad_norm": 6.625, + "epoch": 8.452211126961483, + "grad_norm": 8.875, "learning_rate": 5.8448888888888886e-05, - "loss": 0.5941, + "loss": 0.7985, "step": 11850 }, { - "epoch": 1.4278834577413917, - "grad_norm": 6.90625, + "epoch": 8.459343794579173, + "grad_norm": 6.96875, "learning_rate": 5.840444444444445e-05, - "loss": 0.6464, + "loss": 0.7208, "step": 11860 }, { - "epoch": 1.4290874066939563, - "grad_norm": 8.5625, + "epoch": 8.466476462196862, + "grad_norm": 6.25, "learning_rate": 5.8360000000000004e-05, - "loss": 0.5185, + "loss": 0.8474, "step": 11870 }, { - "epoch": 1.4302913556465207, - "grad_norm": 5.46875, + "epoch": 8.47360912981455, + "grad_norm": 5.5625, "learning_rate": 5.831555555555556e-05, - "loss": 0.6194, + "loss": 0.773, "step": 11880 }, { - "epoch": 1.431495304599085, - "grad_norm": 7.03125, + "epoch": 8.48074179743224, + "grad_norm": 19.125, "learning_rate": 5.827111111111111e-05, - "loss": 0.5993, + "loss": 0.7026, "step": 11890 }, { - "epoch": 1.4326992535516494, - "grad_norm": 8.25, + "epoch": 8.487874465049929, + "grad_norm": 8.4375, "learning_rate": 5.8226666666666676e-05, - "loss": 0.5726, + "loss": 0.7825, "step": 11900 }, { - "epoch": 1.4326992535516494, - "eval/acc": 44.1860466003418, + "epoch": 8.487874465049929, + "eval/acc": 48.83720779418945, "step": 11900 }, { - "epoch": 1.4326992535516494, - "eval_loss": 2.9054577350616455, - "eval_runtime": 0.2116, - "eval_samples_per_second": 203.241, - "eval_steps_per_second": 4.727, + "epoch": 8.487874465049929, + "eval_loss": 2.395317316055298, + "eval_runtime": 0.2104, + "eval_samples_per_second": 204.361, + "eval_steps_per_second": 4.753, "step": 11900 }, { - "epoch": 1.4339032025042138, - "grad_norm": 6.4375, + "epoch": 8.495007132667618, + "grad_norm": 8.5625, "learning_rate": 5.8182222222222225e-05, - "loss": 0.5805, + "loss": 0.8574, "step": 11910 }, { - "epoch": 1.4351071514567781, - "grad_norm": 9.5, + "epoch": 8.502139800285306, + "grad_norm": 8.0, "learning_rate": 5.813777777777778e-05, - "loss": 0.604, + "loss": 0.8031, "step": 11920 }, { - "epoch": 1.4363111004093425, - "grad_norm": 5.0, + "epoch": 8.509272467902996, + "grad_norm": 8.125, "learning_rate": 5.8093333333333335e-05, - "loss": 0.4871, + "loss": 0.8578, "step": 11930 }, { - "epoch": 1.437515049361907, - "grad_norm": 7.6875, + "epoch": 8.516405135520685, + "grad_norm": 8.3125, "learning_rate": 5.80488888888889e-05, - "loss": 0.5968, + "loss": 0.854, "step": 11940 }, { - "epoch": 1.4387189983144715, - "grad_norm": 7.0625, + "epoch": 8.523537803138375, + "grad_norm": 23.5, "learning_rate": 5.800444444444445e-05, - "loss": 0.5715, + "loss": 0.8375, "step": 11950 }, { - "epoch": 1.4399229472670358, - "grad_norm": 8.4375, + "epoch": 8.530670470756062, + "grad_norm": 6.625, "learning_rate": 5.796e-05, - "loss": 0.6258, + "loss": 0.7793, "step": 11960 }, { - "epoch": 1.4411268962196002, - "grad_norm": 7.78125, + "epoch": 8.537803138373752, + "grad_norm": 36.25, "learning_rate": 5.7915555555555556e-05, - "loss": 0.6474, + "loss": 0.7395, "step": 11970 }, { - "epoch": 1.4423308451721648, - "grad_norm": 6.625, + "epoch": 8.544935805991441, + "grad_norm": 7.96875, "learning_rate": 5.787111111111111e-05, - "loss": 0.6148, + "loss": 0.8492, "step": 11980 }, { - "epoch": 1.4435347941247292, - "grad_norm": 6.5625, + "epoch": 8.552068473609129, + "grad_norm": 7.3125, "learning_rate": 5.782666666666667e-05, - "loss": 0.5533, + "loss": 0.7591, "step": 11990 }, { - "epoch": 1.4447387430772936, - "grad_norm": 11.0625, + "epoch": 8.559201141226819, + "grad_norm": 13.75, "learning_rate": 5.778222222222223e-05, - "loss": 0.5756, + "loss": 0.7175, "step": 12000 }, { - "epoch": 1.4447387430772936, - "eval/acc": 46.511627197265625, + "epoch": 8.559201141226819, + "eval/acc": 48.83720779418945, "step": 12000 }, { - "epoch": 1.4447387430772936, - "eval_loss": 2.8763856887817383, - "eval_runtime": 0.2112, - "eval_samples_per_second": 203.613, - "eval_steps_per_second": 4.735, + "epoch": 8.559201141226819, + "eval_loss": 2.375894069671631, + "eval_runtime": 0.2121, + "eval_samples_per_second": 202.777, + "eval_steps_per_second": 4.716, "step": 12000 }, { - "epoch": 1.445942692029858, - "grad_norm": 10.625, + "epoch": 8.566333808844508, + "grad_norm": 8.375, "learning_rate": 5.773777777777778e-05, - "loss": 0.6164, + "loss": 0.8009, "step": 12010 }, { - "epoch": 1.4471466409824223, - "grad_norm": 8.3125, + "epoch": 8.573466476462198, + "grad_norm": 10.375, "learning_rate": 5.769333333333333e-05, - "loss": 0.5638, + "loss": 0.7651, "step": 12020 }, { - "epoch": 1.4483505899349867, - "grad_norm": 7.25, + "epoch": 8.580599144079885, + "grad_norm": 10.5, "learning_rate": 5.7648888888888894e-05, - "loss": 0.6082, + "loss": 0.7947, "step": 12030 }, { - "epoch": 1.449554538887551, - "grad_norm": 6.65625, + "epoch": 8.587731811697575, + "grad_norm": 9.375, "learning_rate": 5.760444444444445e-05, - "loss": 0.5113, + "loss": 0.8377, "step": 12040 }, { - "epoch": 1.4507584878401156, - "grad_norm": 8.5625, + "epoch": 8.594864479315264, + "grad_norm": 7.0, "learning_rate": 5.7560000000000005e-05, - "loss": 0.5966, + "loss": 0.7803, "step": 12050 }, { - "epoch": 1.45196243679268, - "grad_norm": 9.0625, + "epoch": 8.601997146932954, + "grad_norm": 7.03125, "learning_rate": 5.751555555555555e-05, - "loss": 0.4791, + "loss": 0.7129, "step": 12060 }, { - "epoch": 1.4531663857452444, - "grad_norm": 7.8125, + "epoch": 8.609129814550641, + "grad_norm": 9.6875, "learning_rate": 5.747111111111111e-05, - "loss": 0.5999, + "loss": 0.9395, "step": 12070 }, { - "epoch": 1.4543703346978087, - "grad_norm": 7.96875, + "epoch": 8.616262482168331, + "grad_norm": 7.1875, "learning_rate": 5.742666666666667e-05, - "loss": 0.5942, + "loss": 0.8461, "step": 12080 }, { - "epoch": 1.4555742836503733, - "grad_norm": 100.0, + "epoch": 8.62339514978602, + "grad_norm": 8.3125, "learning_rate": 5.7382222222222225e-05, - "loss": 0.6591, + "loss": 0.8533, "step": 12090 }, { - "epoch": 1.4567782326029377, - "grad_norm": 10.625, + "epoch": 8.63052781740371, + "grad_norm": 7.75, "learning_rate": 5.733777777777778e-05, - "loss": 0.5924, + "loss": 0.7819, "step": 12100 }, { - "epoch": 1.4567782326029377, - "eval/acc": 45.930233001708984, + "epoch": 8.63052781740371, + "eval/acc": 46.511627197265625, "step": 12100 }, { - "epoch": 1.4567782326029377, - "eval_loss": 2.9057791233062744, - "eval_runtime": 0.2182, - "eval_samples_per_second": 197.109, - "eval_steps_per_second": 4.584, + "epoch": 8.63052781740371, + "eval_loss": 2.367159605026245, + "eval_runtime": 0.35, + "eval_samples_per_second": 122.848, + "eval_steps_per_second": 2.857, "step": 12100 }, { - "epoch": 1.457982181555502, - "grad_norm": 6.21875, + "epoch": 8.637660485021398, + "grad_norm": 8.1875, "learning_rate": 5.729333333333333e-05, - "loss": 0.6354, + "loss": 0.8752, "step": 12110 }, { - "epoch": 1.4591861305080664, - "grad_norm": 7.59375, + "epoch": 8.644793152639087, + "grad_norm": 5.6875, "learning_rate": 5.72488888888889e-05, - "loss": 0.5777, + "loss": 0.8182, "step": 12120 }, { - "epoch": 1.4603900794606308, + "epoch": 8.651925820256777, "grad_norm": 7.09375, "learning_rate": 5.7204444444444446e-05, - "loss": 0.6189, + "loss": 0.8116, "step": 12130 }, { - "epoch": 1.4615940284131952, - "grad_norm": 7.84375, + "epoch": 8.659058487874464, + "grad_norm": 7.65625, "learning_rate": 5.716e-05, - "loss": 0.5161, + "loss": 0.7563, "step": 12140 }, { - "epoch": 1.4627979773657596, - "grad_norm": 9.25, + "epoch": 8.666191155492154, + "grad_norm": 20.75, "learning_rate": 5.711555555555556e-05, - "loss": 0.6892, + "loss": 0.6896, "step": 12150 }, { - "epoch": 1.4640019263183242, - "grad_norm": 7.875, + "epoch": 8.673323823109843, + "grad_norm": 9.25, "learning_rate": 5.7071111111111105e-05, - "loss": 0.4845, + "loss": 0.8233, "step": 12160 }, { - "epoch": 1.4652058752708885, - "grad_norm": 6.625, + "epoch": 8.680456490727533, + "grad_norm": 11.0625, "learning_rate": 5.7026666666666674e-05, - "loss": 0.6342, + "loss": 0.8978, "step": 12170 }, { - "epoch": 1.466409824223453, - "grad_norm": 7.03125, + "epoch": 8.68758915834522, + "grad_norm": 8.1875, "learning_rate": 5.698222222222222e-05, - "loss": 0.5427, + "loss": 0.7671, "step": 12180 }, { - "epoch": 1.4676137731760173, - "grad_norm": 6.84375, + "epoch": 8.69472182596291, + "grad_norm": 13.0625, "learning_rate": 5.693777777777778e-05, - "loss": 0.5672, + "loss": 0.7771, "step": 12190 }, { - "epoch": 1.4688177221285819, - "grad_norm": 9.0, + "epoch": 8.7018544935806, + "grad_norm": 8.25, "learning_rate": 5.689333333333333e-05, - "loss": 0.6318, + "loss": 0.758, "step": 12200 }, { - "epoch": 1.4688177221285819, - "eval/acc": 45.930233001708984, + "epoch": 8.7018544935806, + "eval/acc": 46.511627197265625, "step": 12200 }, { - "epoch": 1.4688177221285819, - "eval_loss": 2.8778676986694336, - "eval_runtime": 0.2132, - "eval_samples_per_second": 201.649, - "eval_steps_per_second": 4.69, + "epoch": 8.7018544935806, + "eval_loss": 2.3872835636138916, + "eval_runtime": 0.2139, + "eval_samples_per_second": 201.006, + "eval_steps_per_second": 4.675, "step": 12200 }, { - "epoch": 1.4700216710811462, - "grad_norm": 7.4375, + "epoch": 8.708987161198289, + "grad_norm": 6.5, "learning_rate": 5.6848888888888895e-05, - "loss": 0.5589, + "loss": 0.8066, "step": 12210 }, { - "epoch": 1.4712256200337106, - "grad_norm": 9.1875, + "epoch": 8.716119828815977, + "grad_norm": 7.21875, "learning_rate": 5.680444444444445e-05, - "loss": 0.6005, + "loss": 0.8287, "step": 12220 }, { - "epoch": 1.472429568986275, - "grad_norm": 7.875, + "epoch": 8.723252496433666, + "grad_norm": 10.625, "learning_rate": 5.6760000000000005e-05, - "loss": 0.5573, + "loss": 0.917, "step": 12230 }, { - "epoch": 1.4736335179388393, - "grad_norm": 7.71875, + "epoch": 8.730385164051356, + "grad_norm": 9.6875, "learning_rate": 5.6715555555555554e-05, - "loss": 0.5431, + "loss": 0.8417, "step": 12240 }, { - "epoch": 1.4748374668914037, - "grad_norm": 9.0625, + "epoch": 8.737517831669045, + "grad_norm": 8.6875, "learning_rate": 5.6671111111111116e-05, - "loss": 0.5939, + "loss": 0.8405, "step": 12250 }, { - "epoch": 1.476041415843968, - "grad_norm": 9.875, + "epoch": 8.744650499286733, + "grad_norm": 6.875, "learning_rate": 5.662666666666667e-05, - "loss": 0.5894, + "loss": 0.7838, "step": 12260 }, { - "epoch": 1.4772453647965327, - "grad_norm": 6.34375, + "epoch": 8.751783166904422, + "grad_norm": 6.25, "learning_rate": 5.6582222222222226e-05, - "loss": 0.5602, + "loss": 0.6897, "step": 12270 }, { - "epoch": 1.478449313749097, - "grad_norm": 7.34375, + "epoch": 8.758915834522112, + "grad_norm": 7.375, "learning_rate": 5.653777777777778e-05, - "loss": 0.6093, + "loss": 0.7716, "step": 12280 }, { - "epoch": 1.4796532627016614, - "grad_norm": 7.65625, + "epoch": 8.7660485021398, + "grad_norm": 7.96875, "learning_rate": 5.649333333333333e-05, - "loss": 0.5755, + "loss": 0.8497, "step": 12290 }, { - "epoch": 1.4808572116542258, - "grad_norm": 7.15625, + "epoch": 8.773181169757489, + "grad_norm": 7.75, "learning_rate": 5.64488888888889e-05, - "loss": 0.5593, + "loss": 0.747, "step": 12300 }, { - "epoch": 1.4808572116542258, - "eval/acc": 46.511627197265625, + "epoch": 8.773181169757489, + "eval/acc": 48.83720779418945, "step": 12300 }, { - "epoch": 1.4808572116542258, - "eval_loss": 2.8776164054870605, - "eval_runtime": 1.0978, - "eval_samples_per_second": 39.169, - "eval_steps_per_second": 0.911, + "epoch": 8.773181169757489, + "eval_loss": 2.3708367347717285, + "eval_runtime": 0.2183, + "eval_samples_per_second": 197.001, + "eval_steps_per_second": 4.581, "step": 12300 }, { - "epoch": 1.4820611606067904, - "grad_norm": 6.96875, + "epoch": 8.780313837375179, + "grad_norm": 7.28125, "learning_rate": 5.640444444444445e-05, - "loss": 0.6139, + "loss": 0.8225, "step": 12310 }, { - "epoch": 1.4832651095593548, - "grad_norm": 6.375, + "epoch": 8.787446504992868, + "grad_norm": 6.8125, "learning_rate": 5.636e-05, - "loss": 0.5185, + "loss": 0.684, "step": 12320 }, { - "epoch": 1.4844690585119191, - "grad_norm": 10.1875, + "epoch": 8.794579172610556, + "grad_norm": 5.84375, "learning_rate": 5.631555555555556e-05, - "loss": 0.5106, + "loss": 0.8008, "step": 12330 }, { - "epoch": 1.4856730074644835, - "grad_norm": 8.875, + "epoch": 8.801711840228245, + "grad_norm": 6.8125, "learning_rate": 5.627111111111112e-05, - "loss": 0.6202, + "loss": 0.7119, "step": 12340 }, { - "epoch": 1.4868769564170479, - "grad_norm": 7.90625, + "epoch": 8.808844507845935, + "grad_norm": 7.625, "learning_rate": 5.6226666666666675e-05, - "loss": 0.5785, + "loss": 0.7878, "step": 12350 }, { - "epoch": 1.4880809053696122, - "grad_norm": 7.625, + "epoch": 8.815977175463622, + "grad_norm": 6.5625, "learning_rate": 5.6182222222222223e-05, - "loss": 0.5529, + "loss": 0.8389, "step": 12360 }, { - "epoch": 1.4892848543221766, - "grad_norm": 6.53125, + "epoch": 8.823109843081312, + "grad_norm": 7.8125, "learning_rate": 5.613777777777778e-05, - "loss": 0.5533, + "loss": 0.8858, "step": 12370 }, { - "epoch": 1.4904888032747412, - "grad_norm": 7.09375, + "epoch": 8.830242510699001, + "grad_norm": 7.0, "learning_rate": 5.6093333333333334e-05, - "loss": 0.6117, + "loss": 0.797, "step": 12380 }, { - "epoch": 1.4916927522273056, - "grad_norm": 6.59375, + "epoch": 8.837375178316691, + "grad_norm": 8.125, "learning_rate": 5.6048888888888896e-05, - "loss": 0.602, + "loss": 0.7154, "step": 12390 }, { - "epoch": 1.49289670117987, - "grad_norm": 10.8125, + "epoch": 8.844507845934379, + "grad_norm": 6.59375, "learning_rate": 5.600444444444445e-05, - "loss": 0.5845, + "loss": 0.8543, "step": 12400 }, { - "epoch": 1.49289670117987, + "epoch": 8.844507845934379, "eval/acc": 46.511627197265625, "step": 12400 }, { - "epoch": 1.49289670117987, - "eval_loss": 2.860626697540283, - "eval_runtime": 0.2153, - "eval_samples_per_second": 199.684, - "eval_steps_per_second": 4.644, + "epoch": 8.844507845934379, + "eval_loss": 2.3827686309814453, + "eval_runtime": 0.2144, + "eval_samples_per_second": 200.553, + "eval_steps_per_second": 4.664, "step": 12400 }, { - "epoch": 1.4941006501324343, - "grad_norm": 7.40625, + "epoch": 8.851640513552068, + "grad_norm": 8.8125, "learning_rate": 5.596e-05, - "loss": 0.5725, + "loss": 0.8071, "step": 12410 }, { - "epoch": 1.495304599084999, - "grad_norm": 7.28125, + "epoch": 8.858773181169758, + "grad_norm": 6.0625, "learning_rate": 5.5915555555555555e-05, - "loss": 0.6233, + "loss": 0.7174, "step": 12420 }, { - "epoch": 1.4965085480375633, - "grad_norm": 7.375, + "epoch": 8.865905848787447, + "grad_norm": 9.8125, "learning_rate": 5.587111111111112e-05, - "loss": 0.6094, + "loss": 0.861, "step": 12430 }, { - "epoch": 1.4977124969901277, - "grad_norm": 8.8125, + "epoch": 8.873038516405135, + "grad_norm": 8.0, "learning_rate": 5.582666666666667e-05, - "loss": 0.6249, + "loss": 0.831, "step": 12440 }, { - "epoch": 1.498916445942692, - "grad_norm": 14.125, + "epoch": 8.880171184022824, + "grad_norm": 5.21875, "learning_rate": 5.578222222222223e-05, - "loss": 0.5612, + "loss": 0.7814, "step": 12450 }, { - "epoch": 1.5001203948952564, - "grad_norm": 9.125, + "epoch": 8.887303851640514, + "grad_norm": 6.78125, "learning_rate": 5.5737777777777776e-05, - "loss": 0.6067, + "loss": 0.6926, "step": 12460 }, { - "epoch": 1.5013243438478208, - "grad_norm": 7.875, + "epoch": 8.894436519258203, + "grad_norm": 8.6875, "learning_rate": 5.569333333333333e-05, - "loss": 0.5496, + "loss": 0.7977, "step": 12470 }, { - "epoch": 1.5025282928003851, - "grad_norm": 9.3125, + "epoch": 8.901569186875891, + "grad_norm": 6.5625, "learning_rate": 5.564888888888889e-05, - "loss": 0.5547, + "loss": 0.7647, "step": 12480 }, { - "epoch": 1.5037322417529497, - "grad_norm": 6.375, + "epoch": 8.90870185449358, + "grad_norm": 10.875, "learning_rate": 5.560444444444445e-05, - "loss": 0.5596, + "loss": 0.8469, "step": 12490 }, { - "epoch": 1.504936190705514, - "grad_norm": 8.125, + "epoch": 8.91583452211127, + "grad_norm": 12.0625, "learning_rate": 5.556e-05, - "loss": 0.5604, + "loss": 0.9152, "step": 12500 }, { - "epoch": 1.504936190705514, - "eval/acc": 41.86046600341797, + "epoch": 8.91583452211127, + "eval/acc": 46.511627197265625, "step": 12500 }, { - "epoch": 1.504936190705514, - "eval_loss": 2.8973793983459473, - "eval_runtime": 0.2142, - "eval_samples_per_second": 200.739, - "eval_steps_per_second": 4.668, + "epoch": 8.91583452211127, + "eval_loss": 2.3970413208007812, + "eval_runtime": 0.2133, + "eval_samples_per_second": 201.589, + "eval_steps_per_second": 4.688, "step": 12500 }, { - "epoch": 1.5061401396580785, - "grad_norm": 6.875, + "epoch": 8.922967189728958, + "grad_norm": 9.875, "learning_rate": 5.551555555555555e-05, - "loss": 0.5898, + "loss": 0.8202, "step": 12510 }, { - "epoch": 1.507344088610643, - "grad_norm": 10.6875, + "epoch": 8.930099857346647, + "grad_norm": 7.625, "learning_rate": 5.547111111111112e-05, - "loss": 0.5065, + "loss": 0.8159, "step": 12520 }, { - "epoch": 1.5085480375632074, - "grad_norm": 8.6875, + "epoch": 8.937232524964337, + "grad_norm": 7.875, "learning_rate": 5.542666666666667e-05, - "loss": 0.6214, + "loss": 0.684, "step": 12530 }, { - "epoch": 1.5097519865157718, - "grad_norm": 7.8125, + "epoch": 8.944365192582026, + "grad_norm": 6.59375, "learning_rate": 5.5382222222222224e-05, - "loss": 0.5012, + "loss": 0.7629, "step": 12540 }, { - "epoch": 1.5109559354683362, - "grad_norm": 7.25, + "epoch": 8.951497860199714, + "grad_norm": 6.90625, "learning_rate": 5.533777777777778e-05, - "loss": 0.5807, + "loss": 0.8227, "step": 12550 }, { - "epoch": 1.5121598844209005, - "grad_norm": 8.625, + "epoch": 8.958630527817403, + "grad_norm": 6.3125, "learning_rate": 5.529333333333334e-05, - "loss": 0.6293, + "loss": 0.8235, "step": 12560 }, { - "epoch": 1.513363833373465, - "grad_norm": 8.125, + "epoch": 8.965763195435093, + "grad_norm": 6.5, "learning_rate": 5.52488888888889e-05, - "loss": 0.5367, + "loss": 0.7865, "step": 12570 }, { - "epoch": 1.5145677823260293, - "grad_norm": 6.53125, + "epoch": 8.972895863052782, + "grad_norm": 5.875, "learning_rate": 5.5204444444444445e-05, - "loss": 0.6308, + "loss": 0.7331, "step": 12580 }, { - "epoch": 1.5157717312785937, - "grad_norm": 6.09375, + "epoch": 8.98002853067047, + "grad_norm": 7.15625, "learning_rate": 5.516e-05, - "loss": 0.571, + "loss": 0.8498, "step": 12590 }, { - "epoch": 1.5169756802311583, - "grad_norm": 9.625, + "epoch": 8.98716119828816, + "grad_norm": 7.75, "learning_rate": 5.5115555555555556e-05, - "loss": 0.6378, + "loss": 0.7825, "step": 12600 }, { - "epoch": 1.5169756802311583, - "eval/acc": 44.1860466003418, + "epoch": 8.98716119828816, + "eval/acc": 51.16279220581055, "step": 12600 }, { - "epoch": 1.5169756802311583, - "eval_loss": 2.856049060821533, - "eval_runtime": 0.2162, - "eval_samples_per_second": 198.899, - "eval_steps_per_second": 4.626, + "epoch": 8.98716119828816, + "eval_loss": 2.3282017707824707, + "eval_runtime": 0.2152, + "eval_samples_per_second": 199.842, + "eval_steps_per_second": 4.647, "step": 12600 }, { - "epoch": 1.5181796291837226, - "grad_norm": 8.0, + "epoch": 8.99429386590585, + "grad_norm": 7.0, "learning_rate": 5.507111111111112e-05, - "loss": 0.5873, + "loss": 0.8485, "step": 12610 }, { - "epoch": 1.519383578136287, - "grad_norm": 6.75, + "epoch": 9.001426533523539, + "grad_norm": 8.1875, "learning_rate": 5.502666666666667e-05, - "loss": 0.6427, + "loss": 0.8691, "step": 12620 }, { - "epoch": 1.5205875270888516, - "grad_norm": 9.1875, + "epoch": 9.008559201141226, + "grad_norm": 8.875, "learning_rate": 5.498222222222222e-05, - "loss": 0.6036, + "loss": 0.8085, "step": 12630 }, { - "epoch": 1.521791476041416, - "grad_norm": 6.3125, + "epoch": 9.015691868758916, + "grad_norm": 10.875, "learning_rate": 5.4937777777777777e-05, - "loss": 0.6524, + "loss": 0.7221, "step": 12640 }, { - "epoch": 1.5229954249939803, - "grad_norm": 4.6875, + "epoch": 9.022824536376605, + "grad_norm": 7.9375, "learning_rate": 5.489333333333334e-05, - "loss": 0.5334, + "loss": 0.8136, "step": 12650 }, { - "epoch": 1.5241993739465447, - "grad_norm": 11.5, + "epoch": 9.029957203994293, + "grad_norm": 6.78125, "learning_rate": 5.4848888888888894e-05, - "loss": 0.5485, + "loss": 0.6211, "step": 12660 }, { - "epoch": 1.525403322899109, - "grad_norm": 7.21875, + "epoch": 9.037089871611983, + "grad_norm": 7.09375, "learning_rate": 5.480444444444445e-05, - "loss": 0.646, + "loss": 0.7893, "step": 12670 }, { - "epoch": 1.5266072718516734, - "grad_norm": 7.5625, + "epoch": 9.044222539229672, + "grad_norm": 7.375, "learning_rate": 5.476e-05, - "loss": 0.5385, + "loss": 0.9348, "step": 12680 }, { - "epoch": 1.5278112208042378, - "grad_norm": 8.375, + "epoch": 9.051355206847362, + "grad_norm": 7.09375, "learning_rate": 5.471555555555555e-05, - "loss": 0.503, + "loss": 0.9088, "step": 12690 }, { - "epoch": 1.5290151697568022, - "grad_norm": 13.0625, + "epoch": 9.05848787446505, + "grad_norm": 7.9375, "learning_rate": 5.4671111111111115e-05, - "loss": 0.5886, + "loss": 0.8116, "step": 12700 }, { - "epoch": 1.5290151697568022, - "eval/acc": 44.76744079589844, + "epoch": 9.05848787446505, + "eval/acc": 32.55813980102539, "step": 12700 }, { - "epoch": 1.5290151697568022, - "eval_loss": 2.8569490909576416, - "eval_runtime": 0.2181, - "eval_samples_per_second": 197.134, - "eval_steps_per_second": 4.585, + "epoch": 9.05848787446505, + "eval_loss": 3.3768653869628906, + "eval_runtime": 1.089, + "eval_samples_per_second": 39.487, + "eval_steps_per_second": 0.918, "step": 12700 }, { - "epoch": 1.5302191187093668, - "grad_norm": 8.6875, + "epoch": 9.065620542082739, + "grad_norm": 7.875, "learning_rate": 5.462666666666667e-05, - "loss": 0.7241, + "loss": 0.7748, "step": 12710 }, { - "epoch": 1.5314230676619311, - "grad_norm": 8.1875, + "epoch": 9.072753209700428, + "grad_norm": 6.96875, "learning_rate": 5.4582222222222225e-05, - "loss": 0.5965, + "loss": 0.872, "step": 12720 }, { - "epoch": 1.5326270166144955, - "grad_norm": 6.96875, + "epoch": 9.079885877318118, + "grad_norm": 7.59375, "learning_rate": 5.4537777777777774e-05, - "loss": 0.5195, + "loss": 0.8539, "step": 12730 }, { - "epoch": 1.5338309655670601, - "grad_norm": 6.65625, + "epoch": 9.087018544935805, + "grad_norm": 6.9375, "learning_rate": 5.449333333333334e-05, - "loss": 0.5949, + "loss": 0.784, "step": 12740 }, { - "epoch": 1.5350349145196245, - "grad_norm": 10.25, + "epoch": 9.094151212553495, + "grad_norm": 6.5625, "learning_rate": 5.444888888888889e-05, - "loss": 0.5962, + "loss": 0.7998, "step": 12750 }, { - "epoch": 1.5362388634721889, - "grad_norm": 10.8125, + "epoch": 9.101283880171184, + "grad_norm": 7.0625, "learning_rate": 5.4404444444444446e-05, - "loss": 0.6544, + "loss": 0.8213, "step": 12760 }, { - "epoch": 1.5374428124247532, - "grad_norm": 9.8125, + "epoch": 9.108416547788874, + "grad_norm": 5.9375, "learning_rate": 5.436e-05, - "loss": 0.5681, + "loss": 0.8233, "step": 12770 }, { - "epoch": 1.5386467613773176, - "grad_norm": 7.875, + "epoch": 9.115549215406562, + "grad_norm": 6.53125, "learning_rate": 5.431555555555555e-05, - "loss": 0.5944, + "loss": 0.7617, "step": 12780 }, { - "epoch": 1.539850710329882, - "grad_norm": 6.75, + "epoch": 9.122681883024251, + "grad_norm": 7.3125, "learning_rate": 5.427111111111112e-05, - "loss": 0.5141, + "loss": 0.8139, "step": 12790 }, { - "epoch": 1.5410546592824463, - "grad_norm": 6.96875, + "epoch": 9.12981455064194, + "grad_norm": 7.625, "learning_rate": 5.422666666666667e-05, - "loss": 0.627, + "loss": 0.7742, "step": 12800 }, { - "epoch": 1.5410546592824463, - "eval/acc": 44.1860466003418, + "epoch": 9.12981455064194, + "eval/acc": 34.88372039794922, "step": 12800 }, { - "epoch": 1.5410546592824463, - "eval_loss": 2.8678698539733887, - "eval_runtime": 0.2124, - "eval_samples_per_second": 202.486, - "eval_steps_per_second": 4.709, + "epoch": 9.12981455064194, + "eval_loss": 3.372913122177124, + "eval_runtime": 0.2597, + "eval_samples_per_second": 165.6, + "eval_steps_per_second": 3.851, "step": 12800 }, { - "epoch": 1.5422586082350107, - "grad_norm": 5.1875, + "epoch": 9.136947218259628, + "grad_norm": 7.90625, "learning_rate": 5.418222222222222e-05, - "loss": 0.601, + "loss": 0.8071, "step": 12810 }, { - "epoch": 1.5434625571875753, - "grad_norm": 10.0625, + "epoch": 9.144079885877318, + "grad_norm": 6.5625, "learning_rate": 5.413777777777778e-05, - "loss": 0.6409, + "loss": 0.7691, "step": 12820 }, { - "epoch": 1.5446665061401397, - "grad_norm": 6.21875, + "epoch": 9.151212553495007, + "grad_norm": 8.375, "learning_rate": 5.409333333333334e-05, - "loss": 0.6065, + "loss": 0.8105, "step": 12830 }, { - "epoch": 1.545870455092704, - "grad_norm": 7.125, + "epoch": 9.158345221112697, + "grad_norm": 7.5, "learning_rate": 5.4048888888888895e-05, - "loss": 0.5369, + "loss": 0.83, "step": 12840 }, { - "epoch": 1.5470744040452686, - "grad_norm": 8.4375, + "epoch": 9.165477888730384, + "grad_norm": 7.21875, "learning_rate": 5.400444444444444e-05, - "loss": 0.6577, + "loss": 0.8158, "step": 12850 }, { - "epoch": 1.548278352997833, - "grad_norm": 7.09375, + "epoch": 9.172610556348074, + "grad_norm": 8.0625, "learning_rate": 5.396e-05, - "loss": 0.6092, + "loss": 0.7359, "step": 12860 }, { - "epoch": 1.5494823019503974, - "grad_norm": 6.5625, + "epoch": 9.179743223965763, + "grad_norm": 7.21875, "learning_rate": 5.391555555555556e-05, - "loss": 0.7309, + "loss": 0.7797, "step": 12870 }, { - "epoch": 1.5506862509029617, - "grad_norm": 6.96875, + "epoch": 9.186875891583453, + "grad_norm": 11.8125, "learning_rate": 5.3871111111111116e-05, - "loss": 0.6047, + "loss": 0.8005, "step": 12880 }, { - "epoch": 1.5518901998555261, - "grad_norm": 8.125, + "epoch": 9.19400855920114, + "grad_norm": 14.0, "learning_rate": 5.382666666666667e-05, - "loss": 0.6257, + "loss": 0.8764, "step": 12890 }, { - "epoch": 1.5530941488080905, - "grad_norm": 7.4375, + "epoch": 9.20114122681883, + "grad_norm": 6.96875, "learning_rate": 5.3782222222222226e-05, - "loss": 0.5871, + "loss": 0.6898, "step": 12900 }, { - "epoch": 1.5530941488080905, - "eval/acc": 44.1860466003418, + "epoch": 9.20114122681883, + "eval/acc": 32.55813980102539, "step": 12900 }, { - "epoch": 1.5530941488080905, - "eval_loss": 2.8619837760925293, - "eval_runtime": 0.2172, - "eval_samples_per_second": 197.932, - "eval_steps_per_second": 4.603, + "epoch": 9.20114122681883, + "eval_loss": 3.383354425430298, + "eval_runtime": 0.2453, + "eval_samples_per_second": 175.301, + "eval_steps_per_second": 4.077, "step": 12900 }, { - "epoch": 1.5542980977606549, - "grad_norm": 9.8125, + "epoch": 9.20827389443652, + "grad_norm": 8.125, "learning_rate": 5.3737777777777775e-05, - "loss": 0.5404, + "loss": 0.8066, "step": 12910 }, { - "epoch": 1.5555020467132192, - "grad_norm": 7.4375, + "epoch": 9.21540656205421, + "grad_norm": 7.96875, "learning_rate": 5.369333333333334e-05, - "loss": 0.5922, + "loss": 0.7809, "step": 12920 }, { - "epoch": 1.5567059956657838, - "grad_norm": 6.53125, + "epoch": 9.222539229671897, + "grad_norm": 7.15625, "learning_rate": 5.364888888888889e-05, - "loss": 0.6612, + "loss": 0.7242, "step": 12930 }, { - "epoch": 1.5579099446183482, - "grad_norm": 8.25, + "epoch": 9.229671897289586, + "grad_norm": 7.65625, "learning_rate": 5.360444444444445e-05, - "loss": 0.6073, + "loss": 0.8201, "step": 12940 }, { - "epoch": 1.5591138935709126, - "grad_norm": 8.875, + "epoch": 9.236804564907276, + "grad_norm": 8.75, "learning_rate": 5.356e-05, - "loss": 0.609, + "loss": 0.8531, "step": 12950 }, { - "epoch": 1.5603178425234772, - "grad_norm": 7.03125, + "epoch": 9.243937232524964, + "grad_norm": 7.3125, "learning_rate": 5.3515555555555564e-05, - "loss": 0.5725, + "loss": 0.8004, "step": 12960 }, { - "epoch": 1.5615217914760415, - "grad_norm": 11.375, + "epoch": 9.251069900142653, + "grad_norm": 9.1875, "learning_rate": 5.347111111111112e-05, - "loss": 0.6808, + "loss": 0.8026, "step": 12970 }, { - "epoch": 1.562725740428606, - "grad_norm": 8.4375, + "epoch": 9.258202567760343, + "grad_norm": 8.75, "learning_rate": 5.342666666666667e-05, - "loss": 0.6652, + "loss": 0.9001, "step": 12980 }, { - "epoch": 1.5639296893811703, - "grad_norm": 12.625, + "epoch": 9.265335235378032, + "grad_norm": 6.75, "learning_rate": 5.338222222222222e-05, - "loss": 0.6361, + "loss": 0.8698, "step": 12990 }, { - "epoch": 1.5651336383337346, - "grad_norm": 5.875, + "epoch": 9.27246790299572, + "grad_norm": 5.75, "learning_rate": 5.333777777777778e-05, - "loss": 0.539, + "loss": 0.7668, "step": 13000 }, { - "epoch": 1.5651336383337346, - "eval/acc": 42.44186019897461, + "epoch": 9.27246790299572, + "eval/acc": 34.88372039794922, "step": 13000 }, { - "epoch": 1.5651336383337346, - "eval_loss": 2.877701759338379, - "eval_runtime": 0.2098, - "eval_samples_per_second": 204.994, - "eval_steps_per_second": 4.767, + "epoch": 9.27246790299572, + "eval_loss": 3.3350794315338135, + "eval_runtime": 0.2244, + "eval_samples_per_second": 191.644, + "eval_steps_per_second": 4.457, "step": 13000 }, { - "epoch": 1.566337587286299, - "grad_norm": 9.1875, + "epoch": 9.27960057061341, + "grad_norm": 7.90625, "learning_rate": 5.329333333333334e-05, - "loss": 0.5718, + "loss": 0.8908, "step": 13010 }, { - "epoch": 1.5675415362388634, - "grad_norm": 6.78125, + "epoch": 9.286733238231099, + "grad_norm": 7.78125, "learning_rate": 5.3248888888888896e-05, - "loss": 0.6074, + "loss": 0.76, "step": 13020 }, { - "epoch": 1.5687454851914278, - "grad_norm": 7.28125, + "epoch": 9.293865905848788, + "grad_norm": 22.125, "learning_rate": 5.3204444444444444e-05, - "loss": 0.5788, + "loss": 0.8205, "step": 13030 }, { - "epoch": 1.5699494341439921, - "grad_norm": 6.78125, + "epoch": 9.300998573466476, + "grad_norm": 9.6875, "learning_rate": 5.316e-05, - "loss": 0.6445, + "loss": 0.7528, "step": 13040 }, { - "epoch": 1.5711533830965567, - "grad_norm": 7.34375, + "epoch": 9.308131241084165, + "grad_norm": 6.8125, "learning_rate": 5.311555555555556e-05, - "loss": 0.6391, + "loss": 0.8987, "step": 13050 }, { - "epoch": 1.572357332049121, - "grad_norm": 8.375, + "epoch": 9.315263908701855, + "grad_norm": 7.71875, "learning_rate": 5.3071111111111116e-05, - "loss": 0.6101, + "loss": 0.8056, "step": 13060 }, { - "epoch": 1.5735612810016857, - "grad_norm": 9.6875, + "epoch": 9.322396576319543, + "grad_norm": 6.78125, "learning_rate": 5.302666666666667e-05, - "loss": 0.6029, + "loss": 0.7962, "step": 13070 }, { - "epoch": 1.57476522995425, - "grad_norm": 7.5625, + "epoch": 9.329529243937232, + "grad_norm": 6.3125, "learning_rate": 5.298222222222222e-05, - "loss": 0.6034, + "loss": 0.846, "step": 13080 }, { - "epoch": 1.5759691789068144, - "grad_norm": 6.90625, + "epoch": 9.336661911554922, + "grad_norm": 8.75, "learning_rate": 5.2937777777777775e-05, - "loss": 0.629, + "loss": 0.8005, "step": 13090 }, { - "epoch": 1.5771731278593788, - "grad_norm": 8.0625, + "epoch": 9.343794579172611, + "grad_norm": 27.25, "learning_rate": 5.289333333333334e-05, - "loss": 0.5272, + "loss": 0.7313, "step": 13100 }, { - "epoch": 1.5771731278593788, - "eval/acc": 42.44186019897461, + "epoch": 9.343794579172611, + "eval/acc": 32.55813980102539, "step": 13100 }, { - "epoch": 1.5771731278593788, - "eval_loss": 2.883392095565796, - "eval_runtime": 0.2099, - "eval_samples_per_second": 204.824, - "eval_steps_per_second": 4.763, + "epoch": 9.343794579172611, + "eval_loss": 3.3405187129974365, + "eval_runtime": 0.2462, + "eval_samples_per_second": 174.636, + "eval_steps_per_second": 4.061, "step": 13100 }, { - "epoch": 1.5783770768119432, - "grad_norm": 7.46875, + "epoch": 9.350927246790299, + "grad_norm": 7.5625, "learning_rate": 5.284888888888889e-05, - "loss": 0.6273, + "loss": 0.8474, "step": 13110 }, { - "epoch": 1.5795810257645075, - "grad_norm": 7.90625, + "epoch": 9.358059914407988, + "grad_norm": 7.25, "learning_rate": 5.280444444444445e-05, - "loss": 0.6752, + "loss": 0.8104, "step": 13120 }, { - "epoch": 1.580784974717072, - "grad_norm": 9.6875, + "epoch": 9.365192582025678, + "grad_norm": 7.71875, "learning_rate": 5.2759999999999996e-05, - "loss": 0.5769, + "loss": 0.8638, "step": 13130 }, { - "epoch": 1.5819889236696363, - "grad_norm": 8.1875, + "epoch": 9.372325249643367, + "grad_norm": 8.25, "learning_rate": 5.2715555555555565e-05, - "loss": 0.5469, + "loss": 0.7968, "step": 13140 }, { - "epoch": 1.5831928726222007, - "grad_norm": 4.96875, + "epoch": 9.379457917261055, + "grad_norm": 9.4375, "learning_rate": 5.2671111111111114e-05, - "loss": 0.5805, + "loss": 0.692, "step": 13150 }, { - "epoch": 1.5843968215747652, - "grad_norm": 6.65625, + "epoch": 9.386590584878745, + "grad_norm": 6.1875, "learning_rate": 5.262666666666667e-05, - "loss": 0.527, + "loss": 0.8222, "step": 13160 }, { - "epoch": 1.5856007705273296, - "grad_norm": 9.375, + "epoch": 9.393723252496434, + "grad_norm": 9.9375, "learning_rate": 5.2582222222222224e-05, - "loss": 0.5363, + "loss": 0.8494, "step": 13170 }, { - "epoch": 1.5868047194798942, - "grad_norm": 6.15625, + "epoch": 9.400855920114124, + "grad_norm": 8.0, "learning_rate": 5.2537777777777786e-05, - "loss": 0.5795, + "loss": 0.8254, "step": 13180 }, { - "epoch": 1.5880086684324586, - "grad_norm": 6.59375, + "epoch": 9.407988587731811, + "grad_norm": 7.375, "learning_rate": 5.249333333333334e-05, - "loss": 0.567, + "loss": 0.8771, "step": 13190 }, { - "epoch": 1.589212617385023, - "grad_norm": 7.46875, + "epoch": 9.4151212553495, + "grad_norm": 7.34375, "learning_rate": 5.244888888888889e-05, - "loss": 0.5838, + "loss": 0.8563, "step": 13200 }, { - "epoch": 1.589212617385023, - "eval/acc": 44.76744079589844, + "epoch": 9.4151212553495, + "eval/acc": 37.20930099487305, "step": 13200 }, { - "epoch": 1.589212617385023, - "eval_loss": 2.885694742202759, - "eval_runtime": 0.2136, - "eval_samples_per_second": 201.314, - "eval_steps_per_second": 4.682, + "epoch": 9.4151212553495, + "eval_loss": 3.293537139892578, + "eval_runtime": 0.219, + "eval_samples_per_second": 196.361, + "eval_steps_per_second": 4.567, "step": 13200 }, { - "epoch": 1.5904165663375873, - "grad_norm": 7.625, + "epoch": 9.42225392296719, + "grad_norm": 7.1875, "learning_rate": 5.2404444444444445e-05, - "loss": 0.5836, + "loss": 0.769, "step": 13210 }, { - "epoch": 1.5916205152901517, - "grad_norm": 7.28125, + "epoch": 9.429386590584878, + "grad_norm": 8.5, "learning_rate": 5.236e-05, - "loss": 0.6374, + "loss": 0.778, "step": 13220 }, { - "epoch": 1.592824464242716, - "grad_norm": 5.59375, + "epoch": 9.436519258202567, + "grad_norm": 7.6875, "learning_rate": 5.231555555555556e-05, - "loss": 0.5608, + "loss": 0.8043, "step": 13230 }, { - "epoch": 1.5940284131952804, - "grad_norm": 6.15625, + "epoch": 9.443651925820257, + "grad_norm": 7.59375, "learning_rate": 5.227111111111112e-05, - "loss": 0.6031, + "loss": 0.7962, "step": 13240 }, { - "epoch": 1.5952323621478448, - "grad_norm": 6.84375, + "epoch": 9.450784593437946, + "grad_norm": 9.6875, "learning_rate": 5.2226666666666666e-05, - "loss": 0.6458, + "loss": 0.8623, "step": 13250 }, { - "epoch": 1.5964363111004092, - "grad_norm": 7.59375, + "epoch": 9.457917261055634, + "grad_norm": 7.125, "learning_rate": 5.218222222222222e-05, - "loss": 0.5275, + "loss": 0.7408, "step": 13260 }, { - "epoch": 1.5976402600529738, - "grad_norm": 9.4375, + "epoch": 9.465049928673324, + "grad_norm": 8.1875, "learning_rate": 5.213777777777778e-05, - "loss": 0.6249, + "loss": 0.7233, "step": 13270 }, { - "epoch": 1.5988442090055381, - "grad_norm": 8.375, + "epoch": 9.472182596291013, + "grad_norm": 9.375, "learning_rate": 5.209333333333334e-05, - "loss": 0.629, + "loss": 0.7349, "step": 13280 }, { - "epoch": 1.6000481579581027, - "grad_norm": 7.21875, + "epoch": 9.479315263908703, + "grad_norm": 6.75, "learning_rate": 5.2048888888888894e-05, - "loss": 0.6004, + "loss": 0.7311, "step": 13290 }, { - "epoch": 1.601252106910667, - "grad_norm": 6.9375, + "epoch": 9.48644793152639, + "grad_norm": 10.25, "learning_rate": 5.200444444444444e-05, - "loss": 0.4867, + "loss": 0.828, "step": 13300 }, { - "epoch": 1.601252106910667, - "eval/acc": 46.511627197265625, + "epoch": 9.48644793152639, + "eval/acc": 34.88372039794922, "step": 13300 }, { - "epoch": 1.601252106910667, - "eval_loss": 2.8910820484161377, - "eval_runtime": 0.212, - "eval_samples_per_second": 202.807, - "eval_steps_per_second": 4.716, + "epoch": 9.48644793152639, + "eval_loss": 3.376410484313965, + "eval_runtime": 0.2205, + "eval_samples_per_second": 194.974, + "eval_steps_per_second": 4.534, "step": 13300 }, { - "epoch": 1.6024560558632315, - "grad_norm": 6.9375, + "epoch": 9.49358059914408, + "grad_norm": 12.1875, "learning_rate": 5.196e-05, - "loss": 0.6888, + "loss": 0.6994, "step": 13310 }, { - "epoch": 1.6036600048157958, - "grad_norm": 7.84375, + "epoch": 9.50071326676177, + "grad_norm": 10.375, "learning_rate": 5.191555555555556e-05, - "loss": 0.5953, + "loss": 0.7658, "step": 13320 }, { - "epoch": 1.6048639537683602, - "grad_norm": 10.0625, + "epoch": 9.507845934379457, + "grad_norm": 7.625, "learning_rate": 5.1871111111111114e-05, - "loss": 0.6347, + "loss": 0.7453, "step": 13330 }, { - "epoch": 1.6060679027209246, - "grad_norm": 10.75, + "epoch": 9.514978601997147, + "grad_norm": 8.0, "learning_rate": 5.182666666666667e-05, - "loss": 0.5822, + "loss": 0.7407, "step": 13340 }, { - "epoch": 1.607271851673489, - "grad_norm": 7.90625, + "epoch": 9.522111269614836, + "grad_norm": 6.96875, "learning_rate": 5.178222222222222e-05, - "loss": 0.5955, + "loss": 0.8234, "step": 13350 }, { - "epoch": 1.6084758006260533, - "grad_norm": 8.3125, + "epoch": 9.529243937232525, + "grad_norm": 6.59375, "learning_rate": 5.173777777777779e-05, - "loss": 0.5097, + "loss": 0.7517, "step": 13360 }, { - "epoch": 1.6096797495786177, - "grad_norm": 7.03125, + "epoch": 9.536376604850213, + "grad_norm": 7.15625, "learning_rate": 5.1693333333333335e-05, - "loss": 0.6034, + "loss": 0.6939, "step": 13370 }, { - "epoch": 1.6108836985311823, - "grad_norm": 7.375, + "epoch": 9.543509272467903, + "grad_norm": 9.6875, "learning_rate": 5.164888888888889e-05, - "loss": 0.4866, + "loss": 0.7602, "step": 13380 }, { - "epoch": 1.6120876474837467, - "grad_norm": 7.59375, + "epoch": 9.550641940085592, + "grad_norm": 7.375, "learning_rate": 5.1604444444444446e-05, - "loss": 0.548, + "loss": 0.8016, "step": 13390 }, { - "epoch": 1.6132915964363113, - "grad_norm": 7.625, + "epoch": 9.557774607703282, + "grad_norm": 6.9375, "learning_rate": 5.1559999999999994e-05, - "loss": 0.5695, + "loss": 0.8258, "step": 13400 }, { - "epoch": 1.6132915964363113, - "eval/acc": 45.930233001708984, + "epoch": 9.557774607703282, + "eval/acc": 34.88372039794922, "step": 13400 }, { - "epoch": 1.6132915964363113, - "eval_loss": 2.88386607170105, - "eval_runtime": 0.215, - "eval_samples_per_second": 199.962, - "eval_steps_per_second": 4.65, + "epoch": 9.557774607703282, + "eval_loss": 3.3766846656799316, + "eval_runtime": 0.2271, + "eval_samples_per_second": 189.368, + "eval_steps_per_second": 4.404, "step": 13400 }, { - "epoch": 1.6144955453888756, - "grad_norm": 8.5, + "epoch": 9.56490727532097, + "grad_norm": 6.875, "learning_rate": 5.151555555555556e-05, - "loss": 0.5547, + "loss": 0.7926, "step": 13410 }, { - "epoch": 1.61569949434144, - "grad_norm": 6.625, + "epoch": 9.572039942938659, + "grad_norm": 6.28125, "learning_rate": 5.147111111111111e-05, - "loss": 0.5789, + "loss": 0.6912, "step": 13420 }, { - "epoch": 1.6169034432940044, - "grad_norm": 13.8125, + "epoch": 9.579172610556348, + "grad_norm": 60.5, "learning_rate": 5.142666666666667e-05, - "loss": 0.6012, + "loss": 0.8117, "step": 13430 }, { - "epoch": 1.6181073922465687, - "grad_norm": 7.59375, + "epoch": 9.586305278174038, + "grad_norm": 10.5, "learning_rate": 5.138222222222222e-05, - "loss": 0.539, + "loss": 0.7794, "step": 13440 }, { - "epoch": 1.6193113411991331, - "grad_norm": 7.0, + "epoch": 9.593437945791726, + "grad_norm": 5.6875, "learning_rate": 5.1337777777777784e-05, - "loss": 0.5513, + "loss": 0.6753, "step": 13450 }, { - "epoch": 1.6205152901516975, - "grad_norm": 6.875, + "epoch": 9.600570613409415, + "grad_norm": 8.4375, "learning_rate": 5.129333333333334e-05, - "loss": 0.5788, + "loss": 0.8676, "step": 13460 }, { - "epoch": 1.6217192391042619, - "grad_norm": 10.0, + "epoch": 9.607703281027105, + "grad_norm": 7.34375, "learning_rate": 5.124888888888889e-05, - "loss": 0.6301, + "loss": 0.7326, "step": 13470 }, { - "epoch": 1.6229231880568262, - "grad_norm": 7.15625, + "epoch": 9.614835948644792, + "grad_norm": 13.9375, "learning_rate": 5.120444444444444e-05, - "loss": 0.5939, + "loss": 0.8177, "step": 13480 }, { - "epoch": 1.6241271370093908, - "grad_norm": 7.6875, + "epoch": 9.621968616262482, + "grad_norm": 8.3125, "learning_rate": 5.1160000000000005e-05, - "loss": 0.575, + "loss": 0.7928, "step": 13490 }, { - "epoch": 1.6253310859619552, - "grad_norm": 9.125, + "epoch": 9.629101283880171, + "grad_norm": 5.3125, "learning_rate": 5.111555555555556e-05, - "loss": 0.7391, + "loss": 0.7693, "step": 13500 }, { - "epoch": 1.6253310859619552, - "eval/acc": 46.511627197265625, + "epoch": 9.629101283880171, + "eval/acc": 37.20930099487305, "step": 13500 }, { - "epoch": 1.6253310859619552, - "eval_loss": 2.8773036003112793, - "eval_runtime": 0.211, - "eval_samples_per_second": 203.837, - "eval_steps_per_second": 4.74, + "epoch": 9.629101283880171, + "eval_loss": 3.340432643890381, + "eval_runtime": 0.2212, + "eval_samples_per_second": 194.437, + "eval_steps_per_second": 4.522, "step": 13500 }, { - "epoch": 1.6265350349145198, - "grad_norm": 6.34375, + "epoch": 9.63623395149786, + "grad_norm": 6.4375, "learning_rate": 5.1071111111111115e-05, - "loss": 0.5839, + "loss": 0.7974, "step": 13510 }, { - "epoch": 1.6277389838670842, - "grad_norm": 8.125, + "epoch": 9.643366619115548, + "grad_norm": 11.375, "learning_rate": 5.1026666666666664e-05, - "loss": 0.6332, + "loss": 0.8533, "step": 13520 }, { - "epoch": 1.6289429328196485, - "grad_norm": 7.53125, + "epoch": 9.650499286733238, + "grad_norm": 8.4375, "learning_rate": 5.098222222222222e-05, - "loss": 0.5676, + "loss": 0.7578, "step": 13530 }, { - "epoch": 1.630146881772213, - "grad_norm": 5.8125, + "epoch": 9.657631954350927, + "grad_norm": 6.96875, "learning_rate": 5.093777777777778e-05, - "loss": 0.5453, + "loss": 0.8348, "step": 13540 }, { - "epoch": 1.6313508307247773, - "grad_norm": 7.15625, + "epoch": 9.664764621968617, + "grad_norm": 6.75, "learning_rate": 5.0893333333333336e-05, - "loss": 0.6429, + "loss": 0.7562, "step": 13550 }, { - "epoch": 1.6325547796773416, - "grad_norm": 9.25, + "epoch": 9.671897289586305, + "grad_norm": 7.4375, "learning_rate": 5.084888888888889e-05, - "loss": 0.6338, + "loss": 0.8667, "step": 13560 }, { - "epoch": 1.633758728629906, - "grad_norm": 9.1875, + "epoch": 9.679029957203994, + "grad_norm": 11.4375, "learning_rate": 5.080444444444445e-05, - "loss": 0.5679, + "loss": 0.7158, "step": 13570 }, { - "epoch": 1.6349626775824704, - "grad_norm": 6.8125, + "epoch": 9.686162624821684, + "grad_norm": 6.15625, "learning_rate": 5.076000000000001e-05, - "loss": 0.5705, + "loss": 0.7153, "step": 13580 }, { - "epoch": 1.6361666265350348, - "grad_norm": 10.375, + "epoch": 9.693295292439373, + "grad_norm": 10.25, "learning_rate": 5.0715555555555564e-05, - "loss": 0.6313, + "loss": 0.7698, "step": 13590 }, { - "epoch": 1.6373705754875993, - "grad_norm": 6.6875, + "epoch": 9.70042796005706, + "grad_norm": 12.0, "learning_rate": 5.067111111111111e-05, - "loss": 0.5634, + "loss": 0.8033, "step": 13600 }, { - "epoch": 1.6373705754875993, - "eval/acc": 46.511627197265625, + "epoch": 9.70042796005706, + "eval/acc": 37.20930099487305, "step": 13600 }, { - "epoch": 1.6373705754875993, - "eval_loss": 2.8538527488708496, - "eval_runtime": 0.2095, - "eval_samples_per_second": 205.241, - "eval_steps_per_second": 4.773, + "epoch": 9.70042796005706, + "eval_loss": 3.325901985168457, + "eval_runtime": 0.2235, + "eval_samples_per_second": 192.369, + "eval_steps_per_second": 4.474, "step": 13600 }, { - "epoch": 1.6385745244401637, - "grad_norm": 7.25, + "epoch": 9.70756062767475, + "grad_norm": 11.5, "learning_rate": 5.062666666666667e-05, - "loss": 0.656, + "loss": 0.7757, "step": 13610 }, { - "epoch": 1.6397784733927283, - "grad_norm": 9.875, + "epoch": 9.71469329529244, + "grad_norm": 7.0625, "learning_rate": 5.058222222222222e-05, - "loss": 0.6261, + "loss": 0.7335, "step": 13620 }, { - "epoch": 1.6409824223452927, - "grad_norm": 8.25, + "epoch": 9.721825962910128, + "grad_norm": 7.09375, "learning_rate": 5.0537777777777785e-05, - "loss": 0.5124, + "loss": 0.7219, "step": 13630 }, { - "epoch": 1.642186371297857, - "grad_norm": 7.5625, + "epoch": 9.728958630527817, + "grad_norm": 28.625, "learning_rate": 5.049333333333334e-05, - "loss": 0.6025, + "loss": 0.7445, "step": 13640 }, { - "epoch": 1.6433903202504214, - "grad_norm": 7.875, + "epoch": 9.736091298145507, + "grad_norm": 6.34375, "learning_rate": 5.044888888888889e-05, - "loss": 0.6363, + "loss": 0.7203, "step": 13650 }, { - "epoch": 1.6445942692029858, - "grad_norm": 8.625, + "epoch": 9.743223965763196, + "grad_norm": 12.25, "learning_rate": 5.0404444444444444e-05, - "loss": 0.6095, + "loss": 0.815, "step": 13660 }, { - "epoch": 1.6457982181555502, - "grad_norm": 7.53125, + "epoch": 9.750356633380884, + "grad_norm": 8.125, "learning_rate": 5.0360000000000006e-05, - "loss": 0.5122, + "loss": 0.6969, "step": 13670 }, { - "epoch": 1.6470021671081145, - "grad_norm": 7.625, + "epoch": 9.757489300998573, + "grad_norm": 8.9375, "learning_rate": 5.031555555555556e-05, - "loss": 0.6545, + "loss": 0.742, "step": 13680 }, { - "epoch": 1.648206116060679, - "grad_norm": 8.4375, + "epoch": 9.764621968616263, + "grad_norm": 17.125, "learning_rate": 5.0271111111111116e-05, - "loss": 0.6044, + "loss": 0.8526, "step": 13690 }, { - "epoch": 1.6494100650132433, - "grad_norm": 7.03125, + "epoch": 9.771754636233952, + "grad_norm": 9.3125, "learning_rate": 5.0226666666666665e-05, - "loss": 0.5725, + "loss": 0.795, "step": 13700 }, { - "epoch": 1.6494100650132433, - "eval/acc": 46.511627197265625, + "epoch": 9.771754636233952, + "eval/acc": 37.20930099487305, "step": 13700 }, { - "epoch": 1.6494100650132433, - "eval_loss": 2.8802239894866943, - "eval_runtime": 0.2091, - "eval_samples_per_second": 205.668, - "eval_steps_per_second": 4.783, + "epoch": 9.771754636233952, + "eval_loss": 3.3737363815307617, + "eval_runtime": 0.2349, + "eval_samples_per_second": 183.026, + "eval_steps_per_second": 4.256, "step": 13700 }, { - "epoch": 1.6506140139658079, - "grad_norm": 7.84375, + "epoch": 9.77888730385164, + "grad_norm": 7.28125, "learning_rate": 5.018222222222222e-05, - "loss": 0.6112, + "loss": 0.7804, "step": 13710 }, { - "epoch": 1.6518179629183722, - "grad_norm": 12.0625, + "epoch": 9.78601997146933, + "grad_norm": 8.25, "learning_rate": 5.013777777777778e-05, - "loss": 0.5524, + "loss": 0.8201, "step": 13720 }, { - "epoch": 1.6530219118709368, - "grad_norm": 7.84375, + "epoch": 9.793152639087019, + "grad_norm": 7.125, "learning_rate": 5.009333333333334e-05, - "loss": 0.6066, + "loss": 0.7495, "step": 13730 }, { - "epoch": 1.6542258608235012, - "grad_norm": 6.15625, + "epoch": 9.800285306704708, + "grad_norm": 7.96875, "learning_rate": 5.004888888888889e-05, - "loss": 0.5683, + "loss": 0.7827, "step": 13740 }, { - "epoch": 1.6554298097760656, - "grad_norm": 7.03125, + "epoch": 9.807417974322396, + "grad_norm": 6.5625, "learning_rate": 5.000444444444444e-05, - "loss": 0.6051, + "loss": 0.8317, "step": 13750 }, { - "epoch": 1.65663375872863, - "grad_norm": 7.75, + "epoch": 9.814550641940086, + "grad_norm": 7.8125, "learning_rate": 4.996e-05, - "loss": 0.5289, + "loss": 0.8547, "step": 13760 }, { - "epoch": 1.6578377076811943, - "grad_norm": 6.5, + "epoch": 9.821683309557775, + "grad_norm": 7.15625, "learning_rate": 4.991555555555556e-05, - "loss": 0.5031, + "loss": 0.8679, "step": 13770 }, { - "epoch": 1.6590416566337587, - "grad_norm": 7.34375, + "epoch": 9.828815977175463, + "grad_norm": 7.8125, "learning_rate": 4.987111111111111e-05, - "loss": 0.6406, + "loss": 0.7479, "step": 13780 }, { - "epoch": 1.660245605586323, - "grad_norm": 8.1875, + "epoch": 9.835948644793152, + "grad_norm": 15.5, "learning_rate": 4.982666666666667e-05, - "loss": 0.5593, + "loss": 0.8501, "step": 13790 }, { - "epoch": 1.6614495545388874, - "grad_norm": 12.25, + "epoch": 9.843081312410842, + "grad_norm": 8.1875, "learning_rate": 4.9782222222222224e-05, - "loss": 0.4899, + "loss": 0.7662, "step": 13800 }, { - "epoch": 1.6614495545388874, - "eval/acc": 46.511627197265625, + "epoch": 9.843081312410842, + "eval/acc": 37.20930099487305, "step": 13800 }, { - "epoch": 1.6614495545388874, - "eval_loss": 2.8814125061035156, - "eval_runtime": 0.2176, - "eval_samples_per_second": 197.608, - "eval_steps_per_second": 4.596, + "epoch": 9.843081312410842, + "eval_loss": 3.3852930068969727, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.716, + "eval_steps_per_second": 4.528, "step": 13800 }, { - "epoch": 1.6626535034914518, - "grad_norm": 5.40625, + "epoch": 9.850213980028531, + "grad_norm": 8.3125, "learning_rate": 4.973777777777778e-05, - "loss": 0.5799, + "loss": 0.8303, "step": 13810 }, { - "epoch": 1.6638574524440164, - "grad_norm": 7.65625, + "epoch": 9.857346647646219, + "grad_norm": 8.8125, "learning_rate": 4.9693333333333334e-05, - "loss": 0.6122, + "loss": 0.7875, "step": 13820 }, { - "epoch": 1.6650614013965808, - "grad_norm": 8.1875, + "epoch": 9.864479315263909, + "grad_norm": 7.625, "learning_rate": 4.964888888888889e-05, - "loss": 0.593, + "loss": 0.7952, "step": 13830 }, { - "epoch": 1.6662653503491454, - "grad_norm": 6.5625, + "epoch": 9.871611982881598, + "grad_norm": 6.96875, "learning_rate": 4.9604444444444445e-05, - "loss": 0.6101, + "loss": 0.8041, "step": 13840 }, { - "epoch": 1.6674692993017097, - "grad_norm": 11.625, + "epoch": 9.878744650499288, + "grad_norm": 6.375, "learning_rate": 4.956e-05, - "loss": 0.6803, + "loss": 0.6869, "step": 13850 }, { - "epoch": 1.668673248254274, - "grad_norm": 7.5625, + "epoch": 9.885877318116975, + "grad_norm": 7.125, "learning_rate": 4.951555555555556e-05, - "loss": 0.6574, + "loss": 0.7707, "step": 13860 }, { - "epoch": 1.6698771972068385, - "grad_norm": 6.09375, + "epoch": 9.893009985734665, + "grad_norm": 8.125, "learning_rate": 4.947111111111111e-05, - "loss": 0.6698, + "loss": 0.7512, "step": 13870 }, { - "epoch": 1.6710811461594028, - "grad_norm": 7.75, + "epoch": 9.900142653352354, + "grad_norm": 8.8125, "learning_rate": 4.942666666666667e-05, - "loss": 0.6114, + "loss": 0.8059, "step": 13880 }, { - "epoch": 1.6722850951119672, - "grad_norm": 7.25, + "epoch": 9.907275320970044, + "grad_norm": 7.90625, "learning_rate": 4.938222222222223e-05, - "loss": 0.6604, + "loss": 0.729, "step": 13890 }, { - "epoch": 1.6734890440645316, - "grad_norm": 8.25, + "epoch": 9.914407988587731, + "grad_norm": 6.625, "learning_rate": 4.933777777777778e-05, - "loss": 0.6482, + "loss": 0.7958, "step": 13900 }, { - "epoch": 1.6734890440645316, - "eval/acc": 46.511627197265625, + "epoch": 9.914407988587731, + "eval/acc": 37.20930099487305, "step": 13900 }, { - "epoch": 1.6734890440645316, - "eval_loss": 2.856815814971924, - "eval_runtime": 0.212, - "eval_samples_per_second": 202.827, - "eval_steps_per_second": 4.717, + "epoch": 9.914407988587731, + "eval_loss": 3.400364875793457, + "eval_runtime": 0.2295, + "eval_samples_per_second": 187.329, + "eval_steps_per_second": 4.356, "step": 13900 }, { - "epoch": 1.674692993017096, - "grad_norm": 7.15625, + "epoch": 9.921540656205421, + "grad_norm": 7.0625, "learning_rate": 4.929333333333334e-05, - "loss": 0.6165, + "loss": 0.7314, "step": 13910 }, { - "epoch": 1.6758969419696603, - "grad_norm": 7.65625, + "epoch": 9.92867332382311, + "grad_norm": 6.78125, "learning_rate": 4.9248888888888886e-05, - "loss": 0.5861, + "loss": 0.7581, "step": 13920 }, { - "epoch": 1.677100890922225, - "grad_norm": 6.8125, + "epoch": 9.935805991440798, + "grad_norm": 8.6875, "learning_rate": 4.920444444444445e-05, - "loss": 0.6019, + "loss": 0.7865, "step": 13930 }, { - "epoch": 1.6783048398747893, - "grad_norm": 7.1875, + "epoch": 9.942938659058488, + "grad_norm": 7.78125, "learning_rate": 4.9160000000000004e-05, - "loss": 0.5639, + "loss": 0.7174, "step": 13940 }, { - "epoch": 1.6795087888273539, - "grad_norm": 8.875, + "epoch": 9.950071326676177, + "grad_norm": 7.6875, "learning_rate": 4.911555555555556e-05, - "loss": 0.6077, + "loss": 0.855, "step": 13950 }, { - "epoch": 1.6807127377799183, - "grad_norm": 7.75, + "epoch": 9.957203994293867, + "grad_norm": 7.46875, "learning_rate": 4.9071111111111114e-05, - "loss": 0.5095, + "loss": 0.7511, "step": 13960 }, { - "epoch": 1.6819166867324826, - "grad_norm": 6.75, + "epoch": 9.964336661911554, + "grad_norm": 6.34375, "learning_rate": 4.902666666666667e-05, - "loss": 0.6097, + "loss": 0.6901, "step": 13970 }, { - "epoch": 1.683120635685047, - "grad_norm": 10.0625, + "epoch": 9.971469329529244, + "grad_norm": 19.125, "learning_rate": 4.8982222222222225e-05, - "loss": 0.5662, + "loss": 0.7621, "step": 13980 }, { - "epoch": 1.6843245846376114, - "grad_norm": 9.25, + "epoch": 9.978601997146933, + "grad_norm": 9.375, "learning_rate": 4.893777777777778e-05, - "loss": 0.6319, + "loss": 0.7466, "step": 13990 }, { - "epoch": 1.6855285335901757, - "grad_norm": 6.28125, + "epoch": 9.985734664764623, + "grad_norm": 7.1875, "learning_rate": 4.8893333333333335e-05, - "loss": 0.5154, + "loss": 0.749, "step": 14000 }, { - "epoch": 1.6855285335901757, - "eval/acc": 46.511627197265625, + "epoch": 9.985734664764623, + "eval/acc": 37.20930099487305, "step": 14000 }, { - "epoch": 1.6855285335901757, - "eval_loss": 2.8956446647644043, - "eval_runtime": 0.2162, - "eval_samples_per_second": 198.844, - "eval_steps_per_second": 4.624, + "epoch": 9.985734664764623, + "eval_loss": 3.3502047061920166, + "eval_runtime": 0.2251, + "eval_samples_per_second": 191.024, + "eval_steps_per_second": 4.442, "step": 14000 }, { - "epoch": 1.68673248254274, - "grad_norm": 8.8125, + "epoch": 9.99286733238231, + "grad_norm": 9.5625, "learning_rate": 4.884888888888889e-05, - "loss": 0.5645, + "loss": 0.6932, "step": 14010 }, { - "epoch": 1.6879364314953045, - "grad_norm": 8.8125, + "epoch": 10.0, + "grad_norm": 5.875, "learning_rate": 4.8804444444444445e-05, - "loss": 0.6886, + "loss": 0.7939, "step": 14020 }, { - "epoch": 1.6891403804478688, - "grad_norm": 9.3125, + "epoch": 10.00713266761769, + "grad_norm": 7.15625, "learning_rate": 4.876e-05, - "loss": 0.5767, + "loss": 0.8124, "step": 14030 }, { - "epoch": 1.6903443294004334, - "grad_norm": 11.625, + "epoch": 10.014265335235377, + "grad_norm": 6.0625, "learning_rate": 4.8715555555555556e-05, - "loss": 0.542, + "loss": 0.6855, "step": 14040 }, { - "epoch": 1.6915482783529978, - "grad_norm": 6.5625, + "epoch": 10.021398002853067, + "grad_norm": 50.75, "learning_rate": 4.867111111111111e-05, - "loss": 0.538, + "loss": 0.8354, "step": 14050 }, { - "epoch": 1.6927522273055624, - "grad_norm": 6.9375, + "epoch": 10.028530670470756, + "grad_norm": 7.46875, "learning_rate": 4.862666666666667e-05, - "loss": 0.5314, + "loss": 0.8605, "step": 14060 }, { - "epoch": 1.6939561762581268, - "grad_norm": 7.9375, + "epoch": 10.035663338088446, + "grad_norm": 9.625, "learning_rate": 4.858222222222222e-05, - "loss": 0.5909, + "loss": 0.8626, "step": 14070 }, { - "epoch": 1.6951601252106911, - "grad_norm": 7.09375, + "epoch": 10.042796005706133, + "grad_norm": 6.125, "learning_rate": 4.8537777777777784e-05, - "loss": 0.5809, + "loss": 0.7302, "step": 14080 }, { - "epoch": 1.6963640741632555, - "grad_norm": 7.9375, + "epoch": 10.049928673323823, + "grad_norm": 8.0625, "learning_rate": 4.849333333333333e-05, - "loss": 0.576, + "loss": 0.9058, "step": 14090 }, { - "epoch": 1.69756802311582, - "grad_norm": 6.1875, + "epoch": 10.057061340941512, + "grad_norm": 7.3125, "learning_rate": 4.8448888888888894e-05, - "loss": 0.5162, + "loss": 0.7981, "step": 14100 }, { - "epoch": 1.69756802311582, - "eval/acc": 45.930233001708984, + "epoch": 10.057061340941512, + "eval/acc": 46.511627197265625, "step": 14100 }, { - "epoch": 1.69756802311582, - "eval_loss": 2.8892974853515625, - "eval_runtime": 0.2094, - "eval_samples_per_second": 205.308, - "eval_steps_per_second": 4.775, + "epoch": 10.057061340941512, + "eval_loss": 2.7371480464935303, + "eval_runtime": 1.1832, + "eval_samples_per_second": 36.344, + "eval_steps_per_second": 0.845, "step": 14100 }, { - "epoch": 1.6987719720683843, - "grad_norm": 6.84375, + "epoch": 10.064194008559202, + "grad_norm": 12.625, "learning_rate": 4.840444444444445e-05, - "loss": 0.6473, + "loss": 0.7461, "step": 14110 }, { - "epoch": 1.6999759210209486, - "grad_norm": 6.3125, + "epoch": 10.07132667617689, + "grad_norm": 7.375, "learning_rate": 4.836e-05, - "loss": 0.5388, + "loss": 0.8649, "step": 14120 }, { - "epoch": 1.701179869973513, - "grad_norm": 9.25, + "epoch": 10.078459343794579, + "grad_norm": 10.6875, "learning_rate": 4.831555555555556e-05, - "loss": 0.6188, + "loss": 0.8143, "step": 14130 }, { - "epoch": 1.7023838189260774, - "grad_norm": 8.875, + "epoch": 10.085592011412269, + "grad_norm": 43.5, "learning_rate": 4.827111111111111e-05, - "loss": 0.5347, + "loss": 0.8249, "step": 14140 }, { - "epoch": 1.703587767878642, - "grad_norm": 7.40625, + "epoch": 10.092724679029958, + "grad_norm": 6.6875, "learning_rate": 4.822666666666667e-05, - "loss": 0.5254, + "loss": 0.6324, "step": 14150 }, { - "epoch": 1.7047917168312063, - "grad_norm": 8.125, + "epoch": 10.099857346647646, + "grad_norm": 9.6875, "learning_rate": 4.8182222222222225e-05, - "loss": 0.5787, + "loss": 0.7795, "step": 14160 }, { - "epoch": 1.705995665783771, - "grad_norm": 9.875, + "epoch": 10.106990014265335, + "grad_norm": 7.8125, "learning_rate": 4.813777777777778e-05, - "loss": 0.5102, + "loss": 0.8453, "step": 14170 }, { - "epoch": 1.7071996147363353, - "grad_norm": 7.34375, + "epoch": 10.114122681883025, + "grad_norm": 6.21875, "learning_rate": 4.8093333333333336e-05, - "loss": 0.5871, + "loss": 0.735, "step": 14180 }, { - "epoch": 1.7084035636888997, - "grad_norm": 5.71875, + "epoch": 10.121255349500712, + "grad_norm": 8.1875, "learning_rate": 4.804888888888889e-05, - "loss": 0.6621, + "loss": 0.6646, "step": 14190 }, { - "epoch": 1.709607512641464, - "grad_norm": 8.625, + "epoch": 10.128388017118402, + "grad_norm": 38.0, "learning_rate": 4.8004444444444446e-05, - "loss": 0.5733, + "loss": 0.7963, "step": 14200 }, { - "epoch": 1.709607512641464, + "epoch": 10.128388017118402, "eval/acc": 44.1860466003418, "step": 14200 }, { - "epoch": 1.709607512641464, - "eval_loss": 2.9033682346343994, - "eval_runtime": 0.2132, - "eval_samples_per_second": 201.727, - "eval_steps_per_second": 4.691, + "epoch": 10.128388017118402, + "eval_loss": 2.7271535396575928, + "eval_runtime": 1.4583, + "eval_samples_per_second": 29.486, + "eval_steps_per_second": 0.686, "step": 14200 }, { - "epoch": 1.7108114615940284, - "grad_norm": 7.375, + "epoch": 10.135520684736091, + "grad_norm": 5.78125, "learning_rate": 4.796e-05, - "loss": 0.5375, + "loss": 0.693, "step": 14210 }, { - "epoch": 1.7120154105465928, - "grad_norm": 6.875, + "epoch": 10.142653352353781, + "grad_norm": 11.875, "learning_rate": 4.791555555555556e-05, - "loss": 0.5126, + "loss": 0.7578, "step": 14220 }, { - "epoch": 1.7132193594991572, - "grad_norm": 6.625, + "epoch": 10.149786019971469, + "grad_norm": 9.25, "learning_rate": 4.787111111111111e-05, - "loss": 0.5148, + "loss": 0.8127, "step": 14230 }, { - "epoch": 1.7144233084517215, - "grad_norm": 8.6875, + "epoch": 10.156918687589158, + "grad_norm": 9.1875, "learning_rate": 4.782666666666667e-05, - "loss": 0.6544, + "loss": 0.6935, "step": 14240 }, { - "epoch": 1.715627257404286, - "grad_norm": 7.28125, + "epoch": 10.164051355206848, + "grad_norm": 8.25, "learning_rate": 4.778222222222222e-05, - "loss": 0.5869, + "loss": 0.7233, "step": 14250 }, { - "epoch": 1.7168312063568505, - "grad_norm": 6.96875, + "epoch": 10.171184022824537, + "grad_norm": 6.71875, "learning_rate": 4.7737777777777785e-05, - "loss": 0.4857, + "loss": 0.8749, "step": 14260 }, { - "epoch": 1.7180351553094149, - "grad_norm": 6.90625, + "epoch": 10.178316690442225, + "grad_norm": 7.84375, "learning_rate": 4.769333333333333e-05, - "loss": 0.5416, + "loss": 0.7786, "step": 14270 }, { - "epoch": 1.7192391042619795, - "grad_norm": 9.5, + "epoch": 10.185449358059914, + "grad_norm": 9.1875, "learning_rate": 4.7648888888888895e-05, - "loss": 0.6318, + "loss": 0.7024, "step": 14280 }, { - "epoch": 1.7204430532145438, - "grad_norm": 8.125, + "epoch": 10.192582025677604, + "grad_norm": 7.78125, "learning_rate": 4.7604444444444443e-05, - "loss": 0.5763, + "loss": 0.8525, "step": 14290 }, { - "epoch": 1.7216470021671082, - "grad_norm": 6.6875, + "epoch": 10.199714693295292, + "grad_norm": 6.90625, "learning_rate": 4.7560000000000005e-05, - "loss": 0.568, + "loss": 0.8181, "step": 14300 }, { - "epoch": 1.7216470021671082, - "eval/acc": 47.093021392822266, + "epoch": 10.199714693295292, + "eval/acc": 46.511627197265625, "step": 14300 }, { - "epoch": 1.7216470021671082, - "eval_loss": 2.8553571701049805, - "eval_runtime": 0.2112, - "eval_samples_per_second": 203.63, - "eval_steps_per_second": 4.736, + "epoch": 10.199714693295292, + "eval_loss": 2.766357898712158, + "eval_runtime": 2.3027, + "eval_samples_per_second": 18.674, + "eval_steps_per_second": 0.434, "step": 14300 }, { - "epoch": 1.7228509511196726, - "grad_norm": 9.4375, + "epoch": 10.206847360912981, + "grad_norm": 5.90625, "learning_rate": 4.751555555555556e-05, - "loss": 0.6341, + "loss": 0.8026, "step": 14310 }, { - "epoch": 1.724054900072237, - "grad_norm": 6.90625, + "epoch": 10.21398002853067, + "grad_norm": 7.3125, "learning_rate": 4.747111111111111e-05, - "loss": 0.5031, + "loss": 0.8758, "step": 14320 }, { - "epoch": 1.7252588490248013, - "grad_norm": 7.59375, + "epoch": 10.22111269614836, + "grad_norm": 9.4375, "learning_rate": 4.742666666666667e-05, - "loss": 0.5438, + "loss": 0.7889, "step": 14330 }, { - "epoch": 1.7264627979773657, - "grad_norm": 6.09375, + "epoch": 10.228245363766048, + "grad_norm": 8.0, "learning_rate": 4.738222222222222e-05, - "loss": 0.6364, + "loss": 0.7343, "step": 14340 }, { - "epoch": 1.72766674692993, - "grad_norm": 7.0625, + "epoch": 10.235378031383737, + "grad_norm": 6.59375, "learning_rate": 4.733777777777778e-05, - "loss": 0.6025, + "loss": 0.788, "step": 14350 }, { - "epoch": 1.7288706958824944, - "grad_norm": 11.3125, + "epoch": 10.242510699001427, + "grad_norm": 9.1875, "learning_rate": 4.729333333333334e-05, - "loss": 0.5846, + "loss": 0.8068, "step": 14360 }, { - "epoch": 1.730074644835059, - "grad_norm": 5.5625, + "epoch": 10.249643366619116, + "grad_norm": 7.53125, "learning_rate": 4.724888888888889e-05, - "loss": 0.5983, + "loss": 0.8188, "step": 14370 }, { - "epoch": 1.7312785937876234, - "grad_norm": 8.6875, + "epoch": 10.256776034236804, + "grad_norm": 7.1875, "learning_rate": 4.720444444444445e-05, - "loss": 0.5374, + "loss": 0.7643, "step": 14380 }, { - "epoch": 1.732482542740188, - "grad_norm": 7.4375, + "epoch": 10.263908701854493, + "grad_norm": 9.125, "learning_rate": 4.716e-05, - "loss": 0.5893, + "loss": 0.7052, "step": 14390 }, { - "epoch": 1.7336864916927524, - "grad_norm": 6.90625, + "epoch": 10.271041369472183, + "grad_norm": 10.5625, "learning_rate": 4.711555555555556e-05, - "loss": 0.5874, + "loss": 0.762, "step": 14400 }, { - "epoch": 1.7336864916927524, - "eval/acc": 44.1860466003418, + "epoch": 10.271041369472183, + "eval/acc": 46.511627197265625, "step": 14400 }, { - "epoch": 1.7336864916927524, - "eval_loss": 2.897413969039917, - "eval_runtime": 0.2186, - "eval_samples_per_second": 196.696, - "eval_steps_per_second": 4.574, + "epoch": 10.271041369472183, + "eval_loss": 2.774780750274658, + "eval_runtime": 1.2232, + "eval_samples_per_second": 35.152, + "eval_steps_per_second": 0.817, "step": 14400 }, { - "epoch": 1.7348904406453167, - "grad_norm": 8.0625, + "epoch": 10.278174037089872, + "grad_norm": 7.5625, "learning_rate": 4.707111111111111e-05, - "loss": 0.6197, + "loss": 0.8322, "step": 14410 }, { - "epoch": 1.736094389597881, - "grad_norm": 7.875, + "epoch": 10.28530670470756, + "grad_norm": 35.25, "learning_rate": 4.702666666666667e-05, - "loss": 0.5427, + "loss": 0.8043, "step": 14420 }, { - "epoch": 1.7372983385504455, - "grad_norm": 10.5625, + "epoch": 10.29243937232525, + "grad_norm": 7.09375, "learning_rate": 4.6982222222222223e-05, - "loss": 0.5801, + "loss": 0.7257, "step": 14430 }, { - "epoch": 1.7385022875030098, - "grad_norm": 11.5625, + "epoch": 10.29957203994294, + "grad_norm": 15.375, "learning_rate": 4.693777777777778e-05, - "loss": 0.5667, + "loss": 0.7922, "step": 14440 }, { - "epoch": 1.7397062364555742, - "grad_norm": 7.71875, + "epoch": 10.306704707560627, + "grad_norm": 7.09375, "learning_rate": 4.6893333333333334e-05, - "loss": 0.6626, + "loss": 0.694, "step": 14450 }, { - "epoch": 1.7409101854081386, - "grad_norm": 5.90625, + "epoch": 10.313837375178316, + "grad_norm": 7.0625, "learning_rate": 4.684888888888889e-05, - "loss": 0.6022, + "loss": 0.7734, "step": 14460 }, { - "epoch": 1.742114134360703, - "grad_norm": 9.9375, + "epoch": 10.320970042796006, + "grad_norm": 6.75, "learning_rate": 4.6804444444444444e-05, - "loss": 0.6492, + "loss": 0.7469, "step": 14470 }, { - "epoch": 1.7433180833132675, - "grad_norm": 6.6875, + "epoch": 10.328102710413695, + "grad_norm": 5.9375, "learning_rate": 4.6760000000000006e-05, - "loss": 0.6251, + "loss": 0.6948, "step": 14480 }, { - "epoch": 1.744522032265832, - "grad_norm": 5.65625, + "epoch": 10.335235378031383, + "grad_norm": 7.15625, "learning_rate": 4.6715555555555555e-05, - "loss": 0.5951, + "loss": 0.7593, "step": 14490 }, { - "epoch": 1.7457259812183965, - "grad_norm": 7.53125, + "epoch": 10.342368045649073, + "grad_norm": 26.875, "learning_rate": 4.667111111111112e-05, - "loss": 0.5074, + "loss": 0.7302, "step": 14500 }, { - "epoch": 1.7457259812183965, - "eval/acc": 45.930233001708984, + "epoch": 10.342368045649073, + "eval/acc": 44.1860466003418, "step": 14500 }, { - "epoch": 1.7457259812183965, - "eval_loss": 2.843892812728882, - "eval_runtime": 0.2078, - "eval_samples_per_second": 206.911, - "eval_steps_per_second": 4.812, + "epoch": 10.342368045649073, + "eval_loss": 2.7937443256378174, + "eval_runtime": 0.2689, + "eval_samples_per_second": 159.899, + "eval_steps_per_second": 3.719, "step": 14500 }, { - "epoch": 1.7469299301709609, - "grad_norm": 8.8125, + "epoch": 10.349500713266762, + "grad_norm": 53.75, "learning_rate": 4.6626666666666665e-05, - "loss": 0.5739, + "loss": 0.8025, "step": 14510 }, { - "epoch": 1.7481338791235252, - "grad_norm": 9.5, + "epoch": 10.356633380884452, + "grad_norm": 10.4375, "learning_rate": 4.658222222222223e-05, - "loss": 0.5531, + "loss": 0.6807, "step": 14520 }, { - "epoch": 1.7493378280760896, - "grad_norm": 8.4375, + "epoch": 10.36376604850214, + "grad_norm": 17.5, "learning_rate": 4.653777777777778e-05, - "loss": 0.5929, + "loss": 0.7773, "step": 14530 }, { - "epoch": 1.750541777028654, - "grad_norm": 6.78125, + "epoch": 10.370898716119829, + "grad_norm": 9.0625, "learning_rate": 4.649333333333333e-05, - "loss": 0.6202, + "loss": 0.7322, "step": 14540 }, { - "epoch": 1.7517457259812184, - "grad_norm": 7.28125, + "epoch": 10.378031383737518, + "grad_norm": 7.5, "learning_rate": 4.644888888888889e-05, - "loss": 0.6164, + "loss": 0.801, "step": 14550 }, { - "epoch": 1.7529496749337827, - "grad_norm": 9.0625, + "epoch": 10.385164051355208, + "grad_norm": 7.03125, "learning_rate": 4.640444444444445e-05, - "loss": 0.6379, + "loss": 0.7887, "step": 14560 }, { - "epoch": 1.754153623886347, - "grad_norm": 8.9375, + "epoch": 10.392296718972895, + "grad_norm": 5.78125, "learning_rate": 4.636e-05, - "loss": 0.5701, + "loss": 0.75, "step": 14570 }, { - "epoch": 1.7553575728389115, - "grad_norm": 8.375, + "epoch": 10.399429386590585, + "grad_norm": 11.8125, "learning_rate": 4.631555555555556e-05, - "loss": 0.6824, + "loss": 0.7594, "step": 14580 }, { - "epoch": 1.756561521791476, - "grad_norm": 7.78125, + "epoch": 10.406562054208274, + "grad_norm": 26.375, "learning_rate": 4.6271111111111114e-05, - "loss": 0.7127, + "loss": 0.7863, "step": 14590 }, { - "epoch": 1.7577654707440404, - "grad_norm": 6.96875, + "epoch": 10.413694721825962, + "grad_norm": 11.875, "learning_rate": 4.622666666666667e-05, - "loss": 0.5165, + "loss": 0.7701, "step": 14600 }, { - "epoch": 1.7577654707440404, - "eval/acc": 46.511627197265625, + "epoch": 10.413694721825962, + "eval/acc": 44.1860466003418, "step": 14600 }, { - "epoch": 1.7577654707440404, - "eval_loss": 2.8514623641967773, - "eval_runtime": 0.2109, - "eval_samples_per_second": 203.924, - "eval_steps_per_second": 4.742, + "epoch": 10.413694721825962, + "eval_loss": 2.7834675312042236, + "eval_runtime": 0.4675, + "eval_samples_per_second": 91.973, + "eval_steps_per_second": 2.139, "step": 14600 }, { - "epoch": 1.758969419696605, - "grad_norm": 8.25, + "epoch": 10.420827389443652, + "grad_norm": 12.5625, "learning_rate": 4.6182222222222224e-05, - "loss": 0.6424, + "loss": 0.7568, "step": 14610 }, { - "epoch": 1.7601733686491694, - "grad_norm": 7.09375, + "epoch": 10.427960057061341, + "grad_norm": 7.40625, "learning_rate": 4.613777777777778e-05, - "loss": 0.5774, + "loss": 0.7547, "step": 14620 }, { - "epoch": 1.7613773176017338, - "grad_norm": 7.3125, + "epoch": 10.43509272467903, + "grad_norm": 8.875, "learning_rate": 4.6093333333333335e-05, - "loss": 0.5932, + "loss": 0.7594, "step": 14630 }, { - "epoch": 1.7625812665542981, - "grad_norm": 7.84375, + "epoch": 10.442225392296718, + "grad_norm": 25.0, "learning_rate": 4.604888888888889e-05, - "loss": 0.5451, + "loss": 0.8313, "step": 14640 }, { - "epoch": 1.7637852155068625, - "grad_norm": 6.875, + "epoch": 10.449358059914408, + "grad_norm": 7.9375, "learning_rate": 4.6004444444444445e-05, - "loss": 0.6025, + "loss": 0.8017, "step": 14650 }, { - "epoch": 1.7649891644594269, - "grad_norm": 6.71875, + "epoch": 10.456490727532097, + "grad_norm": 7.59375, "learning_rate": 4.596e-05, - "loss": 0.5298, + "loss": 0.7648, "step": 14660 }, { - "epoch": 1.7661931134119913, - "grad_norm": 8.75, + "epoch": 10.463623395149787, + "grad_norm": 8.5625, "learning_rate": 4.5915555555555556e-05, - "loss": 0.6542, + "loss": 0.6931, "step": 14670 }, { - "epoch": 1.7673970623645556, - "grad_norm": 8.875, + "epoch": 10.470756062767475, + "grad_norm": 9.8125, "learning_rate": 4.587111111111112e-05, - "loss": 0.5926, + "loss": 0.7128, "step": 14680 }, { - "epoch": 1.76860101131712, - "grad_norm": 8.9375, + "epoch": 10.477888730385164, + "grad_norm": 8.0, "learning_rate": 4.5826666666666666e-05, - "loss": 0.5747, + "loss": 0.8199, "step": 14690 }, { - "epoch": 1.7698049602696846, - "grad_norm": 7.34375, + "epoch": 10.485021398002853, + "grad_norm": 7.53125, "learning_rate": 4.578222222222223e-05, - "loss": 0.5349, + "loss": 0.8027, "step": 14700 }, { - "epoch": 1.7698049602696846, - "eval/acc": 44.1860466003418, + "epoch": 10.485021398002853, + "eval/acc": 48.83720779418945, "step": 14700 }, { - "epoch": 1.7698049602696846, - "eval_loss": 2.8648550510406494, - "eval_runtime": 0.2015, - "eval_samples_per_second": 213.416, - "eval_steps_per_second": 4.963, + "epoch": 10.485021398002853, + "eval_loss": 2.773456573486328, + "eval_runtime": 0.2477, + "eval_samples_per_second": 173.569, + "eval_steps_per_second": 4.036, "step": 14700 }, { - "epoch": 1.771008909222249, - "grad_norm": 7.5625, + "epoch": 10.492154065620543, + "grad_norm": 13.375, "learning_rate": 4.5737777777777777e-05, - "loss": 0.4603, + "loss": 0.8284, "step": 14710 }, { - "epoch": 1.7722128581748133, - "grad_norm": 8.4375, + "epoch": 10.49928673323823, + "grad_norm": 7.875, "learning_rate": 4.569333333333334e-05, - "loss": 0.5674, + "loss": 0.7522, "step": 14720 }, { - "epoch": 1.773416807127378, - "grad_norm": 6.625, + "epoch": 10.50641940085592, + "grad_norm": 6.375, "learning_rate": 4.5648888888888894e-05, - "loss": 0.5988, + "loss": 0.672, "step": 14730 }, { - "epoch": 1.7746207560799423, - "grad_norm": 8.4375, + "epoch": 10.51355206847361, + "grad_norm": 6.40625, "learning_rate": 4.560444444444444e-05, - "loss": 0.6072, + "loss": 0.8234, "step": 14740 }, { - "epoch": 1.7758247050325067, - "grad_norm": 7.84375, + "epoch": 10.520684736091297, + "grad_norm": 9.1875, "learning_rate": 4.5560000000000004e-05, - "loss": 0.5524, + "loss": 0.7505, "step": 14750 }, { - "epoch": 1.777028653985071, - "grad_norm": 6.0, + "epoch": 10.527817403708987, + "grad_norm": 7.25, "learning_rate": 4.551555555555555e-05, - "loss": 0.5633, + "loss": 0.7694, "step": 14760 }, { - "epoch": 1.7782326029376354, - "grad_norm": 12.625, + "epoch": 10.534950071326676, + "grad_norm": 6.3125, "learning_rate": 4.5471111111111115e-05, - "loss": 0.5669, + "loss": 0.7743, "step": 14770 }, { - "epoch": 1.7794365518901998, - "grad_norm": 6.5, + "epoch": 10.542082738944366, + "grad_norm": 10.0, "learning_rate": 4.542666666666667e-05, - "loss": 0.4503, + "loss": 0.8179, "step": 14780 }, { - "epoch": 1.7806405008427642, - "grad_norm": 7.46875, + "epoch": 10.549215406562054, + "grad_norm": 9.875, "learning_rate": 4.5382222222222225e-05, - "loss": 0.6596, + "loss": 0.9151, "step": 14790 }, { - "epoch": 1.7818444497953285, - "grad_norm": 6.125, + "epoch": 10.556348074179743, + "grad_norm": 8.6875, "learning_rate": 4.533777777777778e-05, - "loss": 0.6978, + "loss": 0.8133, "step": 14800 }, { - "epoch": 1.7818444497953285, - "eval/acc": 48.83720779418945, + "epoch": 10.556348074179743, + "eval/acc": 46.511627197265625, "step": 14800 }, { - "epoch": 1.7818444497953285, - "eval_loss": 2.870887279510498, - "eval_runtime": 0.9511, - "eval_samples_per_second": 45.209, - "eval_steps_per_second": 1.051, + "epoch": 10.556348074179743, + "eval_loss": 2.8140347003936768, + "eval_runtime": 0.2939, + "eval_samples_per_second": 146.299, + "eval_steps_per_second": 3.402, "step": 14800 }, { - "epoch": 1.7830483987478931, - "grad_norm": 8.375, + "epoch": 10.563480741797433, + "grad_norm": 7.0, "learning_rate": 4.5293333333333336e-05, - "loss": 0.581, + "loss": 0.7129, "step": 14810 }, { - "epoch": 1.7842523477004575, - "grad_norm": 6.1875, + "epoch": 10.570613409415122, + "grad_norm": 8.5, "learning_rate": 4.524888888888889e-05, - "loss": 0.526, + "loss": 0.7667, "step": 14820 }, { - "epoch": 1.7854562966530219, - "grad_norm": 6.40625, + "epoch": 10.57774607703281, + "grad_norm": 7.4375, "learning_rate": 4.5204444444444446e-05, - "loss": 0.5246, + "loss": 0.7692, "step": 14830 }, { - "epoch": 1.7866602456055865, - "grad_norm": 7.125, + "epoch": 10.5848787446505, + "grad_norm": 8.0625, "learning_rate": 4.516e-05, - "loss": 0.6338, + "loss": 0.7613, "step": 14840 }, { - "epoch": 1.7878641945581508, - "grad_norm": 10.3125, + "epoch": 10.592011412268189, + "grad_norm": 7.96875, "learning_rate": 4.5115555555555557e-05, - "loss": 0.5313, + "loss": 0.6925, "step": 14850 }, { - "epoch": 1.7890681435107152, - "grad_norm": 8.4375, + "epoch": 10.599144079885878, + "grad_norm": 14.375, "learning_rate": 4.507111111111111e-05, - "loss": 0.6848, + "loss": 0.84, "step": 14860 }, { - "epoch": 1.7902720924632796, - "grad_norm": 20.125, + "epoch": 10.606276747503566, + "grad_norm": 11.4375, "learning_rate": 4.502666666666667e-05, - "loss": 0.5839, + "loss": 0.8508, "step": 14870 }, { - "epoch": 1.791476041415844, - "grad_norm": 9.1875, + "epoch": 10.613409415121255, + "grad_norm": 8.375, "learning_rate": 4.498222222222222e-05, - "loss": 0.5869, + "loss": 0.7863, "step": 14880 }, { - "epoch": 1.7926799903684083, - "grad_norm": 9.8125, + "epoch": 10.620542082738945, + "grad_norm": 7.625, "learning_rate": 4.493777777777778e-05, - "loss": 0.5319, + "loss": 0.7177, "step": 14890 }, { - "epoch": 1.7938839393209727, - "grad_norm": 7.03125, + "epoch": 10.627674750356633, + "grad_norm": 10.375, "learning_rate": 4.489333333333334e-05, - "loss": 0.6254, + "loss": 0.7795, "step": 14900 }, { - "epoch": 1.7938839393209727, - "eval/acc": 46.511627197265625, + "epoch": 10.627674750356633, + "eval/acc": 44.1860466003418, "step": 14900 }, { - "epoch": 1.7938839393209727, - "eval_loss": 2.8519837856292725, - "eval_runtime": 0.2174, - "eval_samples_per_second": 197.755, - "eval_steps_per_second": 4.599, + "epoch": 10.627674750356633, + "eval_loss": 2.830230951309204, + "eval_runtime": 0.2428, + "eval_samples_per_second": 177.067, + "eval_steps_per_second": 4.118, "step": 14900 }, { - "epoch": 1.795087888273537, - "grad_norm": 8.9375, + "epoch": 10.634807417974322, + "grad_norm": 10.875, "learning_rate": 4.484888888888889e-05, - "loss": 0.613, + "loss": 0.7878, "step": 14910 }, { - "epoch": 1.7962918372261016, - "grad_norm": 9.1875, + "epoch": 10.641940085592012, + "grad_norm": 9.0, "learning_rate": 4.480444444444445e-05, - "loss": 0.6735, + "loss": 0.8517, "step": 14920 }, { - "epoch": 1.797495786178666, - "grad_norm": 7.4375, + "epoch": 10.649072753209701, + "grad_norm": 6.9375, "learning_rate": 4.4760000000000005e-05, - "loss": 0.5792, + "loss": 0.8469, "step": 14930 }, { - "epoch": 1.7986997351312304, - "grad_norm": 6.375, + "epoch": 10.656205420827389, + "grad_norm": 7.28125, "learning_rate": 4.4715555555555554e-05, - "loss": 0.5137, + "loss": 0.7262, "step": 14940 }, { - "epoch": 1.799903684083795, - "grad_norm": 8.0, + "epoch": 10.663338088445078, + "grad_norm": 6.15625, "learning_rate": 4.4671111111111116e-05, - "loss": 0.5431, + "loss": 0.739, "step": 14950 }, { - "epoch": 1.8011076330363593, - "grad_norm": 6.65625, + "epoch": 10.670470756062768, + "grad_norm": 7.84375, "learning_rate": 4.4626666666666664e-05, - "loss": 0.6689, + "loss": 0.7671, "step": 14960 }, { - "epoch": 1.8023115819889237, - "grad_norm": 7.5625, + "epoch": 10.677603423680456, + "grad_norm": 7.53125, "learning_rate": 4.4582222222222226e-05, - "loss": 0.6171, + "loss": 0.8059, "step": 14970 }, { - "epoch": 1.803515530941488, - "grad_norm": 6.9375, + "epoch": 10.684736091298145, + "grad_norm": 8.0, "learning_rate": 4.453777777777778e-05, - "loss": 0.6821, + "loss": 0.8167, "step": 14980 }, { - "epoch": 1.8047194798940525, - "grad_norm": 7.6875, + "epoch": 10.691868758915835, + "grad_norm": 7.4375, "learning_rate": 4.4493333333333337e-05, - "loss": 0.5024, + "loss": 0.7768, "step": 14990 }, { - "epoch": 1.8059234288466168, - "grad_norm": 7.90625, + "epoch": 10.699001426533524, + "grad_norm": 9.0625, "learning_rate": 4.444888888888889e-05, - "loss": 0.6338, + "loss": 0.7805, "step": 15000 }, { - "epoch": 1.8059234288466168, + "epoch": 10.699001426533524, "eval/acc": 46.511627197265625, "step": 15000 }, { - "epoch": 1.8059234288466168, - "eval_loss": 2.8593192100524902, - "eval_runtime": 0.2111, - "eval_samples_per_second": 203.704, - "eval_steps_per_second": 4.737, + "epoch": 10.699001426533524, + "eval_loss": 2.8128726482391357, + "eval_runtime": 0.239, + "eval_samples_per_second": 179.883, + "eval_steps_per_second": 4.183, "step": 15000 }, { - "epoch": 1.8071273777991812, - "grad_norm": 8.3125, + "epoch": 10.706134094151212, + "grad_norm": 6.59375, "learning_rate": 4.440444444444445e-05, - "loss": 0.5786, + "loss": 0.7292, "step": 15010 }, { - "epoch": 1.8083313267517456, - "grad_norm": 7.78125, + "epoch": 10.713266761768901, + "grad_norm": 7.625, "learning_rate": 4.436e-05, - "loss": 0.6018, + "loss": 0.7543, "step": 15020 }, { - "epoch": 1.8095352757043102, - "grad_norm": 9.625, + "epoch": 10.72039942938659, + "grad_norm": 7.6875, "learning_rate": 4.431555555555556e-05, - "loss": 0.5862, + "loss": 0.812, "step": 15030 }, { - "epoch": 1.8107392246568745, - "grad_norm": 7.59375, + "epoch": 10.72753209700428, + "grad_norm": 8.375, "learning_rate": 4.427111111111111e-05, - "loss": 0.5643, + "loss": 0.8101, "step": 15040 }, { - "epoch": 1.811943173609439, - "grad_norm": 9.6875, + "epoch": 10.734664764621968, + "grad_norm": 9.3125, "learning_rate": 4.422666666666667e-05, - "loss": 0.5234, + "loss": 0.7984, "step": 15050 }, { - "epoch": 1.8131471225620035, - "grad_norm": 5.15625, + "epoch": 10.741797432239657, + "grad_norm": 8.0, "learning_rate": 4.418222222222222e-05, - "loss": 0.5302, + "loss": 0.7325, "step": 15060 }, { - "epoch": 1.8143510715145679, - "grad_norm": 8.5625, + "epoch": 10.748930099857347, + "grad_norm": 7.125, "learning_rate": 4.413777777777778e-05, - "loss": 0.5529, + "loss": 0.823, "step": 15070 }, { - "epoch": 1.8155550204671322, - "grad_norm": 8.8125, + "epoch": 10.756062767475036, + "grad_norm": 7.40625, "learning_rate": 4.4093333333333334e-05, - "loss": 0.5429, + "loss": 0.8095, "step": 15080 }, { - "epoch": 1.8167589694196966, - "grad_norm": 8.1875, + "epoch": 10.763195435092724, + "grad_norm": 7.875, "learning_rate": 4.404888888888889e-05, - "loss": 0.6234, + "loss": 0.7326, "step": 15090 }, { - "epoch": 1.817962918372261, - "grad_norm": 10.6875, + "epoch": 10.770328102710414, + "grad_norm": 31.0, "learning_rate": 4.400444444444445e-05, - "loss": 0.5107, + "loss": 0.8735, "step": 15100 }, { - "epoch": 1.817962918372261, + "epoch": 10.770328102710414, "eval/acc": 46.511627197265625, "step": 15100 }, { - "epoch": 1.817962918372261, - "eval_loss": 2.8768866062164307, - "eval_runtime": 0.2124, - "eval_samples_per_second": 202.465, - "eval_steps_per_second": 4.708, + "epoch": 10.770328102710414, + "eval_loss": 2.815082311630249, + "eval_runtime": 0.2824, + "eval_samples_per_second": 152.248, + "eval_steps_per_second": 3.541, "step": 15100 }, { - "epoch": 1.8191668673248254, - "grad_norm": 7.34375, + "epoch": 10.777460770328103, + "grad_norm": 9.625, "learning_rate": 4.396e-05, - "loss": 0.5985, + "loss": 0.8593, "step": 15110 }, { - "epoch": 1.8203708162773897, - "grad_norm": 8.4375, + "epoch": 10.78459343794579, + "grad_norm": 8.0, "learning_rate": 4.391555555555556e-05, - "loss": 0.5498, + "loss": 0.8309, "step": 15120 }, { - "epoch": 1.821574765229954, - "grad_norm": 10.5, + "epoch": 10.79172610556348, + "grad_norm": 7.53125, "learning_rate": 4.387111111111111e-05, - "loss": 0.7284, + "loss": 0.6465, "step": 15130 }, { - "epoch": 1.8227787141825187, - "grad_norm": 9.3125, + "epoch": 10.79885877318117, + "grad_norm": 9.25, "learning_rate": 4.382666666666667e-05, - "loss": 0.6616, + "loss": 0.8751, "step": 15140 }, { - "epoch": 1.823982663135083, - "grad_norm": 6.90625, + "epoch": 10.80599144079886, + "grad_norm": 7.6875, "learning_rate": 4.378222222222223e-05, - "loss": 0.541, + "loss": 0.7533, "step": 15150 }, { - "epoch": 1.8251866120876474, - "grad_norm": 8.25, + "epoch": 10.813124108416547, + "grad_norm": 8.5625, "learning_rate": 4.3737777777777775e-05, - "loss": 0.6453, + "loss": 0.7803, "step": 15160 }, { - "epoch": 1.826390561040212, - "grad_norm": 10.875, + "epoch": 10.820256776034237, + "grad_norm": 6.09375, "learning_rate": 4.369333333333334e-05, - "loss": 0.5483, + "loss": 0.6925, "step": 15170 }, { - "epoch": 1.8275945099927764, - "grad_norm": 6.9375, + "epoch": 10.827389443651926, + "grad_norm": 8.1875, "learning_rate": 4.3648888888888886e-05, - "loss": 0.4767, + "loss": 0.8491, "step": 15180 }, { - "epoch": 1.8287984589453408, - "grad_norm": 8.5, + "epoch": 10.834522111269616, + "grad_norm": 13.125, "learning_rate": 4.360444444444445e-05, - "loss": 0.5885, + "loss": 0.7565, "step": 15190 }, { - "epoch": 1.8300024078979051, - "grad_norm": 8.5625, + "epoch": 10.841654778887303, + "grad_norm": 9.25, "learning_rate": 4.356e-05, - "loss": 0.5084, + "loss": 0.9506, "step": 15200 }, { - "epoch": 1.8300024078979051, - "eval/acc": 46.511627197265625, + "epoch": 10.841654778887303, + "eval/acc": 41.86046600341797, "step": 15200 }, { - "epoch": 1.8300024078979051, - "eval_loss": 2.888690233230591, - "eval_runtime": 0.2472, - "eval_samples_per_second": 173.974, - "eval_steps_per_second": 4.046, + "epoch": 10.841654778887303, + "eval_loss": 2.834817409515381, + "eval_runtime": 0.2334, + "eval_samples_per_second": 184.267, + "eval_steps_per_second": 4.285, "step": 15200 }, { - "epoch": 1.8312063568504695, - "grad_norm": 7.5, + "epoch": 10.848787446504993, + "grad_norm": 6.71875, "learning_rate": 4.351555555555556e-05, - "loss": 0.6484, + "loss": 0.8631, "step": 15210 }, { - "epoch": 1.8324103058030339, - "grad_norm": 7.75, + "epoch": 10.855920114122682, + "grad_norm": 7.25, "learning_rate": 4.3471111111111114e-05, - "loss": 0.5007, + "loss": 0.8258, "step": 15220 }, { - "epoch": 1.8336142547555982, - "grad_norm": 13.5625, + "epoch": 10.863052781740372, + "grad_norm": 5.59375, "learning_rate": 4.342666666666667e-05, - "loss": 0.5155, + "loss": 0.7895, "step": 15230 }, { - "epoch": 1.8348182037081626, - "grad_norm": 12.75, + "epoch": 10.87018544935806, + "grad_norm": 6.65625, "learning_rate": 4.3382222222222224e-05, - "loss": 0.6569, + "loss": 0.8473, "step": 15240 }, { - "epoch": 1.8360221526607272, - "grad_norm": 8.5625, + "epoch": 10.877318116975749, + "grad_norm": 6.59375, "learning_rate": 4.333777777777778e-05, - "loss": 0.591, + "loss": 0.8323, "step": 15250 }, { - "epoch": 1.8372261016132916, - "grad_norm": 8.9375, + "epoch": 10.884450784593438, + "grad_norm": 9.3125, "learning_rate": 4.3293333333333334e-05, - "loss": 0.6035, + "loss": 0.7446, "step": 15260 }, { - "epoch": 1.838430050565856, - "grad_norm": 8.875, + "epoch": 10.891583452211126, + "grad_norm": 15.125, "learning_rate": 4.324888888888889e-05, - "loss": 0.6398, + "loss": 0.8885, "step": 15270 }, { - "epoch": 1.8396339995184205, - "grad_norm": 6.40625, + "epoch": 10.898716119828816, + "grad_norm": 21.25, "learning_rate": 4.3204444444444445e-05, - "loss": 0.6009, + "loss": 0.7624, "step": 15280 }, { - "epoch": 1.840837948470985, - "grad_norm": 6.875, + "epoch": 10.905848787446505, + "grad_norm": 7.125, "learning_rate": 4.316e-05, - "loss": 0.4908, + "loss": 0.6841, "step": 15290 }, { - "epoch": 1.8420418974235493, - "grad_norm": 7.0, + "epoch": 10.912981455064195, + "grad_norm": 8.5625, "learning_rate": 4.311555555555556e-05, - "loss": 0.613, + "loss": 0.8645, "step": 15300 }, { - "epoch": 1.8420418974235493, + "epoch": 10.912981455064195, "eval/acc": 46.511627197265625, "step": 15300 }, { - "epoch": 1.8420418974235493, - "eval_loss": 2.8745079040527344, - "eval_runtime": 0.2086, - "eval_samples_per_second": 206.14, - "eval_steps_per_second": 4.794, + "epoch": 10.912981455064195, + "eval_loss": 2.790827512741089, + "eval_runtime": 0.2527, + "eval_samples_per_second": 170.168, + "eval_steps_per_second": 3.957, "step": 15300 }, { - "epoch": 1.8432458463761137, - "grad_norm": 7.21875, + "epoch": 10.920114122681882, + "grad_norm": 9.0625, "learning_rate": 4.307111111111111e-05, - "loss": 0.5939, + "loss": 0.779, "step": 15310 }, { - "epoch": 1.844449795328678, - "grad_norm": 6.8125, + "epoch": 10.927246790299572, + "grad_norm": 5.96875, "learning_rate": 4.302666666666667e-05, - "loss": 0.5942, + "loss": 0.7987, "step": 15320 }, { - "epoch": 1.8456537442812424, - "grad_norm": 7.46875, + "epoch": 10.934379457917261, + "grad_norm": 7.25, "learning_rate": 4.298222222222222e-05, - "loss": 0.5722, + "loss": 0.8278, "step": 15330 }, { - "epoch": 1.8468576932338068, - "grad_norm": 9.6875, + "epoch": 10.94151212553495, + "grad_norm": 7.625, "learning_rate": 4.293777777777778e-05, - "loss": 0.6481, + "loss": 0.8022, "step": 15340 }, { - "epoch": 1.8480616421863711, - "grad_norm": 6.96875, + "epoch": 10.948644793152638, + "grad_norm": 6.6875, "learning_rate": 4.289333333333334e-05, - "loss": 0.5667, + "loss": 0.7333, "step": 15350 }, { - "epoch": 1.8492655911389357, - "grad_norm": 6.6875, + "epoch": 10.955777460770328, + "grad_norm": 7.4375, "learning_rate": 4.284888888888889e-05, - "loss": 0.553, + "loss": 0.7398, "step": 15360 }, { - "epoch": 1.8504695400915, - "grad_norm": 11.4375, + "epoch": 10.962910128388017, + "grad_norm": 10.1875, "learning_rate": 4.280444444444445e-05, - "loss": 0.6384, + "loss": 0.7397, "step": 15370 }, { - "epoch": 1.8516734890440645, - "grad_norm": 9.3125, + "epoch": 10.970042796005707, + "grad_norm": 7.375, "learning_rate": 4.276e-05, - "loss": 0.591, + "loss": 0.7993, "step": 15380 }, { - "epoch": 1.852877437996629, - "grad_norm": 8.4375, + "epoch": 10.977175463623395, + "grad_norm": 7.59375, "learning_rate": 4.271555555555556e-05, - "loss": 0.5953, + "loss": 0.7811, "step": 15390 }, { - "epoch": 1.8540813869491934, - "grad_norm": 6.4375, + "epoch": 10.984308131241084, + "grad_norm": 7.46875, "learning_rate": 4.2671111111111114e-05, - "loss": 0.6149, + "loss": 0.7611, "step": 15400 }, { - "epoch": 1.8540813869491934, + "epoch": 10.984308131241084, "eval/acc": 46.511627197265625, "step": 15400 }, { - "epoch": 1.8540813869491934, - "eval_loss": 2.8363993167877197, - "eval_runtime": 0.3986, - "eval_samples_per_second": 107.885, - "eval_steps_per_second": 2.509, + "epoch": 10.984308131241084, + "eval_loss": 2.8109776973724365, + "eval_runtime": 0.2365, + "eval_samples_per_second": 181.849, + "eval_steps_per_second": 4.229, "step": 15400 }, { - "epoch": 1.8552853359017578, + "epoch": 10.991440798858774, "grad_norm": 7.46875, "learning_rate": 4.262666666666667e-05, - "loss": 0.5669, + "loss": 0.8165, "step": 15410 }, { - "epoch": 1.8564892848543222, - "grad_norm": 8.0625, + "epoch": 10.998573466476461, + "grad_norm": 8.625, "learning_rate": 4.2582222222222225e-05, - "loss": 0.618, + "loss": 0.8449, "step": 15420 }, { - "epoch": 1.8576932338068866, - "grad_norm": 7.21875, + "epoch": 11.00570613409415, + "grad_norm": 7.28125, "learning_rate": 4.253777777777778e-05, - "loss": 0.5657, + "loss": 0.7556, "step": 15430 }, { - "epoch": 1.858897182759451, - "grad_norm": 6.90625, + "epoch": 11.01283880171184, + "grad_norm": 30.375, "learning_rate": 4.2493333333333335e-05, - "loss": 0.6795, + "loss": 0.8541, "step": 15440 }, { - "epoch": 1.8601011317120153, - "grad_norm": 7.71875, + "epoch": 11.01997146932953, + "grad_norm": 8.375, "learning_rate": 4.244888888888889e-05, - "loss": 0.5312, + "loss": 0.8616, "step": 15450 }, { - "epoch": 1.8613050806645797, - "grad_norm": 8.4375, + "epoch": 11.027104136947218, + "grad_norm": 8.5, "learning_rate": 4.2404444444444446e-05, - "loss": 0.6856, + "loss": 0.7837, "step": 15460 }, { - "epoch": 1.8625090296171443, - "grad_norm": 9.6875, + "epoch": 11.034236804564907, + "grad_norm": 9.125, "learning_rate": 4.236e-05, - "loss": 0.6227, + "loss": 0.6939, "step": 15470 }, { - "epoch": 1.8637129785697086, - "grad_norm": 18.75, + "epoch": 11.041369472182597, + "grad_norm": 8.1875, "learning_rate": 4.2315555555555556e-05, - "loss": 0.5966, + "loss": 0.6788, "step": 15480 }, { - "epoch": 1.864916927522273, - "grad_norm": 8.9375, + "epoch": 11.048502139800286, + "grad_norm": 8.1875, "learning_rate": 4.227111111111111e-05, - "loss": 0.6483, + "loss": 0.7654, "step": 15490 }, { - "epoch": 1.8661208764748376, - "grad_norm": 7.625, + "epoch": 11.055634807417974, + "grad_norm": 8.375, "learning_rate": 4.222666666666667e-05, - "loss": 0.6927, + "loss": 0.7765, "step": 15500 }, { - "epoch": 1.8661208764748376, - "eval/acc": 44.1860466003418, + "epoch": 11.055634807417974, + "eval/acc": 41.86046600341797, "step": 15500 }, { - "epoch": 1.8661208764748376, - "eval_loss": 2.852205991744995, - "eval_runtime": 4.5374, - "eval_samples_per_second": 9.477, - "eval_steps_per_second": 0.22, + "epoch": 11.055634807417974, + "eval_loss": 2.01023268699646, + "eval_runtime": 5.0492, + "eval_samples_per_second": 8.516, + "eval_steps_per_second": 0.198, "step": 15500 }, { - "epoch": 1.867324825427402, - "grad_norm": 6.09375, + "epoch": 11.062767475035663, + "grad_norm": 9.125, "learning_rate": 4.218222222222222e-05, - "loss": 0.5317, + "loss": 0.8758, "step": 15510 }, { - "epoch": 1.8685287743799663, - "grad_norm": 6.09375, + "epoch": 11.069900142653353, + "grad_norm": 8.375, "learning_rate": 4.2137777777777784e-05, - "loss": 0.5278, + "loss": 0.8337, "step": 15520 }, { - "epoch": 1.8697327233325307, - "grad_norm": 15.375, + "epoch": 11.077032810271042, + "grad_norm": 7.78125, "learning_rate": 4.209333333333333e-05, - "loss": 0.6451, + "loss": 0.8168, "step": 15530 }, { - "epoch": 1.870936672285095, - "grad_norm": 9.5, + "epoch": 11.08416547788873, + "grad_norm": 7.03125, "learning_rate": 4.2048888888888894e-05, - "loss": 0.5186, + "loss": 0.8345, "step": 15540 }, { - "epoch": 1.8721406212376595, - "grad_norm": 7.375, + "epoch": 11.09129814550642, + "grad_norm": 8.8125, "learning_rate": 4.200444444444445e-05, - "loss": 0.5882, + "loss": 0.7392, "step": 15550 }, { - "epoch": 1.8733445701902238, - "grad_norm": 5.1875, + "epoch": 11.098430813124109, + "grad_norm": 8.9375, "learning_rate": 4.196e-05, - "loss": 0.5213, + "loss": 0.7623, "step": 15560 }, { - "epoch": 1.8745485191427882, - "grad_norm": 9.3125, + "epoch": 11.105563480741797, + "grad_norm": 9.875, "learning_rate": 4.191555555555556e-05, - "loss": 0.58, + "loss": 0.6797, "step": 15570 }, { - "epoch": 1.8757524680953528, - "grad_norm": 6.75, + "epoch": 11.112696148359486, + "grad_norm": 7.96875, "learning_rate": 4.187111111111111e-05, - "loss": 0.5244, + "loss": 0.7957, "step": 15580 }, { - "epoch": 1.8769564170479172, - "grad_norm": 11.625, + "epoch": 11.119828815977176, + "grad_norm": 7.0625, "learning_rate": 4.182666666666667e-05, - "loss": 0.5446, + "loss": 0.7701, "step": 15590 }, { - "epoch": 1.8781603660004815, - "grad_norm": 7.96875, + "epoch": 11.126961483594865, + "grad_norm": 6.90625, "learning_rate": 4.1782222222222226e-05, - "loss": 0.5606, + "loss": 0.8514, "step": 15600 }, { - "epoch": 1.8781603660004815, - "eval/acc": 46.511627197265625, + "epoch": 11.126961483594865, + "eval/acc": 39.53488540649414, "step": 15600 }, { - "epoch": 1.8781603660004815, - "eval_loss": 2.845609664916992, - "eval_runtime": 4.5649, - "eval_samples_per_second": 9.42, - "eval_steps_per_second": 0.219, + "epoch": 11.126961483594865, + "eval_loss": 2.0217747688293457, + "eval_runtime": 0.2343, + "eval_samples_per_second": 183.512, + "eval_steps_per_second": 4.268, "step": 15600 }, { - "epoch": 1.8793643149530461, - "grad_norm": 8.875, + "epoch": 11.134094151212553, + "grad_norm": 7.3125, "learning_rate": 4.173777777777778e-05, - "loss": 0.6415, + "loss": 0.7757, "step": 15610 }, { - "epoch": 1.8805682639056105, - "grad_norm": 7.75, + "epoch": 11.141226818830242, + "grad_norm": 6.6875, "learning_rate": 4.1693333333333336e-05, - "loss": 0.5137, + "loss": 0.6947, "step": 15620 }, { - "epoch": 1.8817722128581749, - "grad_norm": 7.5, + "epoch": 11.148359486447932, + "grad_norm": 7.3125, "learning_rate": 4.164888888888889e-05, - "loss": 0.5945, + "loss": 0.8118, "step": 15630 }, { - "epoch": 1.8829761618107392, - "grad_norm": 9.25, + "epoch": 11.155492154065621, + "grad_norm": 8.0625, "learning_rate": 4.160444444444445e-05, - "loss": 0.5678, + "loss": 0.848, "step": 15640 }, { - "epoch": 1.8841801107633036, - "grad_norm": 8.125, + "epoch": 11.162624821683309, + "grad_norm": 6.6875, "learning_rate": 4.156e-05, - "loss": 0.5011, + "loss": 0.7256, "step": 15650 }, { - "epoch": 1.885384059715868, - "grad_norm": 6.125, + "epoch": 11.169757489300999, + "grad_norm": 6.3125, "learning_rate": 4.151555555555556e-05, - "loss": 0.5921, + "loss": 0.7898, "step": 15660 }, { - "epoch": 1.8865880086684323, - "grad_norm": 7.15625, + "epoch": 11.176890156918688, + "grad_norm": 5.53125, "learning_rate": 4.147111111111111e-05, - "loss": 0.5706, + "loss": 0.7057, "step": 15670 }, { - "epoch": 1.8877919576209967, - "grad_norm": 7.15625, + "epoch": 11.184022824536376, + "grad_norm": 8.0, "learning_rate": 4.142666666666667e-05, - "loss": 0.6199, + "loss": 0.7065, "step": 15680 }, { - "epoch": 1.8889959065735613, - "grad_norm": 8.375, + "epoch": 11.191155492154065, + "grad_norm": 7.0, "learning_rate": 4.138222222222222e-05, - "loss": 0.6039, + "loss": 0.812, "step": 15690 }, { - "epoch": 1.8901998555261257, - "grad_norm": 9.0, + "epoch": 11.198288159771755, + "grad_norm": 35.0, "learning_rate": 4.133777777777778e-05, - "loss": 0.5823, + "loss": 0.7953, "step": 15700 }, { - "epoch": 1.8901998555261257, - "eval/acc": 47.093021392822266, + "epoch": 11.198288159771755, + "eval/acc": 39.53488540649414, "step": 15700 }, { - "epoch": 1.8901998555261257, - "eval_loss": 2.8587894439697266, - "eval_runtime": 0.2205, - "eval_samples_per_second": 195.01, - "eval_steps_per_second": 4.535, + "epoch": 11.198288159771755, + "eval_loss": 2.0371451377868652, + "eval_runtime": 0.2311, + "eval_samples_per_second": 186.077, + "eval_steps_per_second": 4.327, "step": 15700 }, { - "epoch": 1.89140380447869, - "grad_norm": 6.59375, + "epoch": 11.205420827389444, + "grad_norm": 6.8125, "learning_rate": 4.129333333333333e-05, - "loss": 0.6721, + "loss": 0.8272, "step": 15710 }, { - "epoch": 1.8926077534312546, - "grad_norm": 6.1875, + "epoch": 11.212553495007132, + "grad_norm": 9.625, "learning_rate": 4.1248888888888895e-05, - "loss": 0.5404, + "loss": 0.782, "step": 15720 }, { - "epoch": 1.893811702383819, - "grad_norm": 8.0, + "epoch": 11.219686162624821, + "grad_norm": 7.59375, "learning_rate": 4.1204444444444444e-05, - "loss": 0.6024, + "loss": 0.8176, "step": 15730 }, { - "epoch": 1.8950156513363834, - "grad_norm": 7.15625, + "epoch": 11.226818830242511, + "grad_norm": 7.40625, "learning_rate": 4.1160000000000006e-05, - "loss": 0.6063, + "loss": 0.7592, "step": 15740 }, { - "epoch": 1.8962196002889478, - "grad_norm": 6.125, + "epoch": 11.2339514978602, + "grad_norm": 9.1875, "learning_rate": 4.1115555555555554e-05, - "loss": 0.6106, + "loss": 0.7587, "step": 15750 }, { - "epoch": 1.8974235492415121, - "grad_norm": 7.09375, + "epoch": 11.241084165477888, + "grad_norm": 17.375, "learning_rate": 4.1071111111111116e-05, - "loss": 0.5872, + "loss": 0.7165, "step": 15760 }, { - "epoch": 1.8986274981940765, - "grad_norm": 4.59375, + "epoch": 11.248216833095578, + "grad_norm": 8.5625, "learning_rate": 4.102666666666667e-05, - "loss": 0.5415, + "loss": 0.7391, "step": 15770 }, { - "epoch": 1.8998314471466409, - "grad_norm": 7.3125, + "epoch": 11.255349500713267, + "grad_norm": 9.75, "learning_rate": 4.098222222222222e-05, - "loss": 0.6141, + "loss": 0.745, "step": 15780 }, { - "epoch": 1.9010353960992052, - "grad_norm": 9.5, + "epoch": 11.262482168330957, + "grad_norm": 8.75, "learning_rate": 4.093777777777778e-05, - "loss": 0.6448, + "loss": 0.7635, "step": 15790 }, { - "epoch": 1.9022393450517698, - "grad_norm": 7.1875, + "epoch": 11.269614835948644, + "grad_norm": 6.5, "learning_rate": 4.089333333333333e-05, - "loss": 0.5461, + "loss": 0.8273, "step": 15800 }, { - "epoch": 1.9022393450517698, - "eval/acc": 48.83720779418945, + "epoch": 11.269614835948644, + "eval/acc": 41.86046600341797, "step": 15800 }, { - "epoch": 1.9022393450517698, - "eval_loss": 2.854886293411255, - "eval_runtime": 0.2206, - "eval_samples_per_second": 194.945, - "eval_steps_per_second": 4.534, + "epoch": 11.269614835948644, + "eval_loss": 2.0061967372894287, + "eval_runtime": 0.2356, + "eval_samples_per_second": 182.538, + "eval_steps_per_second": 4.245, "step": 15800 }, { - "epoch": 1.9034432940043342, - "grad_norm": 20.625, + "epoch": 11.276747503566334, + "grad_norm": 7.0, "learning_rate": 4.084888888888889e-05, - "loss": 0.631, + "loss": 0.7411, "step": 15810 }, { - "epoch": 1.9046472429568986, - "grad_norm": 6.875, + "epoch": 11.283880171184023, + "grad_norm": 11.375, "learning_rate": 4.080444444444445e-05, - "loss": 0.5455, + "loss": 0.7564, "step": 15820 }, { - "epoch": 1.9058511919094632, - "grad_norm": 6.4375, + "epoch": 11.291012838801711, + "grad_norm": 8.5, "learning_rate": 4.076e-05, - "loss": 0.5827, + "loss": 0.8688, "step": 15830 }, { - "epoch": 1.9070551408620275, - "grad_norm": 6.875, + "epoch": 11.2981455064194, + "grad_norm": 7.03125, "learning_rate": 4.071555555555556e-05, - "loss": 0.5239, + "loss": 0.7351, "step": 15840 }, { - "epoch": 1.908259089814592, - "grad_norm": 12.8125, + "epoch": 11.30527817403709, + "grad_norm": 9.0, "learning_rate": 4.067111111111111e-05, - "loss": 0.5824, + "loss": 0.7432, "step": 15850 }, { - "epoch": 1.9094630387671563, - "grad_norm": 7.65625, + "epoch": 11.31241084165478, + "grad_norm": 9.875, "learning_rate": 4.062666666666667e-05, - "loss": 0.53, + "loss": 0.7984, "step": 15860 }, { - "epoch": 1.9106669877197207, - "grad_norm": 7.5, + "epoch": 11.319543509272467, + "grad_norm": 7.1875, "learning_rate": 4.0582222222222224e-05, - "loss": 0.6126, + "loss": 0.8125, "step": 15870 }, { - "epoch": 1.911870936672285, - "grad_norm": 7.84375, + "epoch": 11.326676176890157, + "grad_norm": 7.5, "learning_rate": 4.053777777777778e-05, - "loss": 0.5268, + "loss": 0.7995, "step": 15880 }, { - "epoch": 1.9130748856248494, - "grad_norm": 8.75, + "epoch": 11.333808844507846, + "grad_norm": 9.0, "learning_rate": 4.0493333333333334e-05, - "loss": 0.6154, + "loss": 0.7915, "step": 15890 }, { - "epoch": 1.9142788345774138, - "grad_norm": 10.0625, + "epoch": 11.340941512125536, + "grad_norm": 9.5625, "learning_rate": 4.044888888888889e-05, - "loss": 0.6189, + "loss": 0.8598, "step": 15900 }, { - "epoch": 1.9142788345774138, - "eval/acc": 44.1860466003418, + "epoch": 11.340941512125536, + "eval/acc": 37.20930099487305, "step": 15900 }, { - "epoch": 1.9142788345774138, - "eval_loss": 2.848970890045166, - "eval_runtime": 0.216, - "eval_samples_per_second": 199.097, - "eval_steps_per_second": 4.63, + "epoch": 11.340941512125536, + "eval_loss": 2.0159339904785156, + "eval_runtime": 0.7376, + "eval_samples_per_second": 58.299, + "eval_steps_per_second": 1.356, "step": 15900 }, { - "epoch": 1.9154827835299784, - "grad_norm": 7.5625, + "epoch": 11.348074179743223, + "grad_norm": 8.25, "learning_rate": 4.0404444444444445e-05, - "loss": 0.6334, + "loss": 0.8052, "step": 15910 }, { - "epoch": 1.9166867324825427, - "grad_norm": 6.1875, + "epoch": 11.355206847360913, + "grad_norm": 10.75, "learning_rate": 4.0360000000000007e-05, - "loss": 0.5638, + "loss": 0.8008, "step": 15920 }, { - "epoch": 1.917890681435107, - "grad_norm": 15.0, + "epoch": 11.362339514978602, + "grad_norm": 6.8125, "learning_rate": 4.0315555555555555e-05, - "loss": 0.5426, + "loss": 0.7922, "step": 15930 }, { - "epoch": 1.9190946303876717, - "grad_norm": 7.125, + "epoch": 11.36947218259629, + "grad_norm": 11.8125, "learning_rate": 4.027111111111112e-05, - "loss": 0.524, + "loss": 0.7762, "step": 15940 }, { - "epoch": 1.920298579340236, - "grad_norm": 11.6875, + "epoch": 11.37660485021398, + "grad_norm": 8.4375, "learning_rate": 4.0226666666666666e-05, - "loss": 0.5846, + "loss": 0.7765, "step": 15950 }, { - "epoch": 1.9215025282928004, - "grad_norm": 8.3125, + "epoch": 11.383737517831669, + "grad_norm": 8.25, "learning_rate": 4.018222222222223e-05, - "loss": 0.5928, + "loss": 0.7922, "step": 15960 }, { - "epoch": 1.9227064772453648, - "grad_norm": 7.40625, + "epoch": 11.390870185449359, + "grad_norm": 9.5, "learning_rate": 4.013777777777778e-05, - "loss": 0.5476, + "loss": 0.7839, "step": 15970 }, { - "epoch": 1.9239104261979292, - "grad_norm": 5.5625, + "epoch": 11.398002853067046, + "grad_norm": 8.125, "learning_rate": 4.009333333333333e-05, - "loss": 0.5344, + "loss": 0.6813, "step": 15980 }, { - "epoch": 1.9251143751504936, - "grad_norm": 6.5, + "epoch": 11.405135520684736, + "grad_norm": 6.28125, "learning_rate": 4.004888888888889e-05, - "loss": 0.4896, + "loss": 0.8077, "step": 15990 }, { - "epoch": 1.926318324103058, - "grad_norm": 6.96875, + "epoch": 11.412268188302425, + "grad_norm": 8.3125, "learning_rate": 4.000444444444444e-05, - "loss": 0.5144, + "loss": 0.7049, "step": 16000 }, { - "epoch": 1.926318324103058, - "eval/acc": 44.76744079589844, + "epoch": 11.412268188302425, + "eval/acc": 41.86046600341797, "step": 16000 }, { - "epoch": 1.926318324103058, - "eval_loss": 2.851822853088379, - "eval_runtime": 0.2165, - "eval_samples_per_second": 198.64, - "eval_steps_per_second": 4.62, + "epoch": 11.412268188302425, + "eval_loss": 2.0035645961761475, + "eval_runtime": 0.3154, + "eval_samples_per_second": 136.335, + "eval_steps_per_second": 3.171, "step": 16000 }, { - "epoch": 1.9275222730556223, - "grad_norm": 12.0625, + "epoch": 11.419400855920115, + "grad_norm": 6.75, "learning_rate": 3.9960000000000004e-05, - "loss": 0.5676, + "loss": 0.7572, "step": 16010 }, { - "epoch": 1.9287262220081869, - "grad_norm": 8.75, + "epoch": 11.426533523537802, + "grad_norm": 7.9375, "learning_rate": 3.991555555555556e-05, - "loss": 0.4875, + "loss": 0.7396, "step": 16020 }, { - "epoch": 1.9299301709607513, - "grad_norm": 7.5, + "epoch": 11.433666191155492, + "grad_norm": 8.5, "learning_rate": 3.9871111111111114e-05, - "loss": 0.6563, + "loss": 0.7213, "step": 16030 }, { - "epoch": 1.9311341199133156, - "grad_norm": 6.25, + "epoch": 11.440798858773181, + "grad_norm": 8.75, "learning_rate": 3.982666666666667e-05, - "loss": 0.5485, + "loss": 0.7403, "step": 16040 }, { - "epoch": 1.9323380688658802, - "grad_norm": 7.53125, + "epoch": 11.447931526390871, + "grad_norm": 6.625, "learning_rate": 3.9782222222222225e-05, - "loss": 0.5059, + "loss": 0.7124, "step": 16050 }, { - "epoch": 1.9335420178184446, - "grad_norm": 6.9375, + "epoch": 11.455064194008559, + "grad_norm": 8.75, "learning_rate": 3.973777777777778e-05, - "loss": 0.5767, + "loss": 0.7377, "step": 16060 }, { - "epoch": 1.934745966771009, - "grad_norm": 5.78125, + "epoch": 11.462196861626248, + "grad_norm": 9.1875, "learning_rate": 3.9693333333333335e-05, - "loss": 0.6159, + "loss": 0.8129, "step": 16070 }, { - "epoch": 1.9359499157235733, - "grad_norm": 9.6875, + "epoch": 11.469329529243938, + "grad_norm": 8.5, "learning_rate": 3.964888888888889e-05, - "loss": 0.5748, + "loss": 0.8099, "step": 16080 }, { - "epoch": 1.9371538646761377, - "grad_norm": 7.8125, + "epoch": 11.476462196861625, + "grad_norm": 8.75, "learning_rate": 3.9604444444444445e-05, - "loss": 0.5972, + "loss": 0.8657, "step": 16090 }, { - "epoch": 1.938357813628702, - "grad_norm": 7.6875, + "epoch": 11.483594864479315, + "grad_norm": 6.9375, "learning_rate": 3.956e-05, - "loss": 0.4898, + "loss": 0.8028, "step": 16100 }, { - "epoch": 1.938357813628702, - "eval/acc": 41.86046600341797, + "epoch": 11.483594864479315, + "eval/acc": 39.53488540649414, "step": 16100 }, { - "epoch": 1.938357813628702, - "eval_loss": 2.8877687454223633, - "eval_runtime": 0.5978, - "eval_samples_per_second": 71.93, - "eval_steps_per_second": 1.673, + "epoch": 11.483594864479315, + "eval_loss": 2.01182222366333, + "eval_runtime": 0.2322, + "eval_samples_per_second": 185.159, + "eval_steps_per_second": 4.306, "step": 16100 }, { - "epoch": 1.9395617625812664, - "grad_norm": 7.96875, + "epoch": 11.490727532097004, + "grad_norm": 22.0, "learning_rate": 3.9515555555555556e-05, - "loss": 0.5813, + "loss": 0.9008, "step": 16110 }, { - "epoch": 1.9407657115338308, - "grad_norm": 7.0625, + "epoch": 11.497860199714694, + "grad_norm": 7.96875, "learning_rate": 3.947111111111111e-05, - "loss": 0.5686, + "loss": 0.749, "step": 16120 }, { - "epoch": 1.9419696604863954, - "grad_norm": 7.15625, + "epoch": 11.504992867332382, + "grad_norm": 9.625, "learning_rate": 3.9426666666666666e-05, - "loss": 0.5711, + "loss": 0.7008, "step": 16130 }, { - "epoch": 1.9431736094389598, - "grad_norm": 9.25, + "epoch": 11.512125534950071, + "grad_norm": 7.90625, "learning_rate": 3.938222222222223e-05, - "loss": 0.5875, + "loss": 0.6725, "step": 16140 }, { - "epoch": 1.9443775583915242, - "grad_norm": 6.65625, + "epoch": 11.51925820256776, + "grad_norm": 6.9375, "learning_rate": 3.933777777777778e-05, - "loss": 0.5032, + "loss": 0.7104, "step": 16150 }, { - "epoch": 1.9455815073440887, - "grad_norm": 8.4375, + "epoch": 11.52639087018545, + "grad_norm": 9.3125, "learning_rate": 3.929333333333334e-05, - "loss": 0.5379, + "loss": 0.7202, "step": 16160 }, { - "epoch": 1.9467854562966531, - "grad_norm": 11.25, + "epoch": 11.533523537803138, + "grad_norm": 8.5625, "learning_rate": 3.924888888888889e-05, - "loss": 0.5714, + "loss": 0.8841, "step": 16170 }, { - "epoch": 1.9479894052492175, - "grad_norm": 7.59375, + "epoch": 11.540656205420827, + "grad_norm": 8.625, "learning_rate": 3.920444444444444e-05, - "loss": 0.7453, + "loss": 0.8151, "step": 16180 }, { - "epoch": 1.9491933542017819, - "grad_norm": 8.9375, + "epoch": 11.547788873038517, + "grad_norm": 8.6875, "learning_rate": 3.9160000000000005e-05, - "loss": 0.6535, + "loss": 0.6946, "step": 16190 }, { - "epoch": 1.9503973031543462, - "grad_norm": 6.71875, + "epoch": 11.554921540656206, + "grad_norm": 5.46875, "learning_rate": 3.911555555555555e-05, - "loss": 0.535, + "loss": 0.8014, "step": 16200 }, { - "epoch": 1.9503973031543462, - "eval/acc": 42.44186019897461, + "epoch": 11.554921540656206, + "eval/acc": 37.20930099487305, "step": 16200 }, { - "epoch": 1.9503973031543462, - "eval_loss": 2.87386417388916, - "eval_runtime": 0.2107, - "eval_samples_per_second": 204.07, - "eval_steps_per_second": 4.746, + "epoch": 11.554921540656206, + "eval_loss": 2.008047580718994, + "eval_runtime": 0.2377, + "eval_samples_per_second": 180.881, + "eval_steps_per_second": 4.207, "step": 16200 }, { - "epoch": 1.9516012521069106, - "grad_norm": 10.0625, + "epoch": 11.562054208273894, + "grad_norm": 6.5, "learning_rate": 3.9071111111111115e-05, - "loss": 0.5891, + "loss": 0.8634, "step": 16210 }, { - "epoch": 1.952805201059475, - "grad_norm": 37.0, + "epoch": 11.569186875891583, + "grad_norm": 10.0625, "learning_rate": 3.902666666666667e-05, - "loss": 0.584, + "loss": 0.8836, "step": 16220 }, { - "epoch": 1.9540091500120393, - "grad_norm": 8.3125, + "epoch": 11.576319543509273, + "grad_norm": 6.34375, "learning_rate": 3.8982222222222225e-05, - "loss": 0.5307, + "loss": 0.6787, "step": 16230 }, { - "epoch": 1.955213098964604, - "grad_norm": 16.125, + "epoch": 11.58345221112696, + "grad_norm": 7.8125, "learning_rate": 3.893777777777778e-05, - "loss": 0.5957, + "loss": 0.7925, "step": 16240 }, { - "epoch": 1.9564170479171683, - "grad_norm": 7.65625, + "epoch": 11.59058487874465, + "grad_norm": 10.6875, "learning_rate": 3.8893333333333336e-05, - "loss": 0.592, + "loss": 0.7393, "step": 16250 }, { - "epoch": 1.9576209968697327, - "grad_norm": 8.8125, + "epoch": 11.59771754636234, + "grad_norm": 6.65625, "learning_rate": 3.884888888888889e-05, - "loss": 0.468, + "loss": 0.7407, "step": 16260 }, { - "epoch": 1.9588249458222973, - "grad_norm": 7.25, + "epoch": 11.60485021398003, + "grad_norm": 6.5625, "learning_rate": 3.8804444444444446e-05, - "loss": 0.5946, + "loss": 0.8039, "step": 16270 }, { - "epoch": 1.9600288947748616, - "grad_norm": 7.84375, + "epoch": 11.611982881597717, + "grad_norm": 7.3125, "learning_rate": 3.876e-05, - "loss": 0.5809, + "loss": 0.8564, "step": 16280 }, { - "epoch": 1.961232843727426, - "grad_norm": 7.15625, + "epoch": 11.619115549215406, + "grad_norm": 6.6875, "learning_rate": 3.871555555555556e-05, - "loss": 0.5817, + "loss": 0.7674, "step": 16290 }, { - "epoch": 1.9624367926799904, - "grad_norm": 8.125, + "epoch": 11.626248216833096, + "grad_norm": 7.8125, "learning_rate": 3.867111111111111e-05, - "loss": 0.5724, + "loss": 0.8431, "step": 16300 }, { - "epoch": 1.9624367926799904, - "eval/acc": 44.1860466003418, + "epoch": 11.626248216833096, + "eval/acc": 41.86046600341797, "step": 16300 }, { - "epoch": 1.9624367926799904, - "eval_loss": 2.8631114959716797, - "eval_runtime": 0.2127, - "eval_samples_per_second": 202.139, - "eval_steps_per_second": 4.701, + "epoch": 11.626248216833096, + "eval_loss": 1.9884032011032104, + "eval_runtime": 0.2237, + "eval_samples_per_second": 192.225, + "eval_steps_per_second": 4.47, "step": 16300 }, { - "epoch": 1.9636407416325548, - "grad_norm": 8.5, + "epoch": 11.633380884450785, + "grad_norm": 6.84375, "learning_rate": 3.862666666666667e-05, - "loss": 0.5632, + "loss": 0.7772, "step": 16310 }, { - "epoch": 1.9648446905851191, - "grad_norm": 5.46875, + "epoch": 11.640513552068473, + "grad_norm": 6.40625, "learning_rate": 3.858222222222222e-05, - "loss": 0.5247, + "loss": 0.8256, "step": 16320 }, { - "epoch": 1.9660486395376835, - "grad_norm": 7.34375, + "epoch": 11.647646219686163, + "grad_norm": 7.96875, "learning_rate": 3.853777777777778e-05, - "loss": 0.5958, + "loss": 0.7004, "step": 16330 }, { - "epoch": 1.9672525884902479, - "grad_norm": 5.625, + "epoch": 11.654778887303852, + "grad_norm": 8.125, "learning_rate": 3.849333333333334e-05, - "loss": 0.6062, + "loss": 0.8883, "step": 16340 }, { - "epoch": 1.9684565374428125, - "grad_norm": 15.625, + "epoch": 11.661911554921542, + "grad_norm": 17.5, "learning_rate": 3.844888888888889e-05, - "loss": 0.4598, + "loss": 0.7894, "step": 16350 }, { - "epoch": 1.9696604863953768, - "grad_norm": 9.625, + "epoch": 11.66904422253923, + "grad_norm": 8.0, "learning_rate": 3.840444444444445e-05, - "loss": 0.5428, + "loss": 0.8491, "step": 16360 }, { - "epoch": 1.9708644353479412, - "grad_norm": 7.5625, + "epoch": 11.676176890156919, + "grad_norm": 6.78125, "learning_rate": 3.836e-05, - "loss": 0.5846, + "loss": 0.8265, "step": 16370 }, { - "epoch": 1.9720683843005058, - "grad_norm": 7.21875, + "epoch": 11.683309557774608, + "grad_norm": 8.25, "learning_rate": 3.831555555555556e-05, - "loss": 0.5767, + "loss": 0.7288, "step": 16380 }, { - "epoch": 1.9732723332530702, - "grad_norm": 7.875, + "epoch": 11.690442225392296, + "grad_norm": 7.96875, "learning_rate": 3.8271111111111116e-05, - "loss": 0.5815, + "loss": 0.7468, "step": 16390 }, { - "epoch": 1.9744762822056345, + "epoch": 11.697574893009985, "grad_norm": 7.28125, "learning_rate": 3.8226666666666664e-05, - "loss": 0.6038, + "loss": 0.7557, "step": 16400 }, { - "epoch": 1.9744762822056345, - "eval/acc": 44.1860466003418, + "epoch": 11.697574893009985, + "eval/acc": 39.53488540649414, "step": 16400 }, { - "epoch": 1.9744762822056345, - "eval_loss": 2.8775668144226074, - "eval_runtime": 0.2112, - "eval_samples_per_second": 203.634, - "eval_steps_per_second": 4.736, + "epoch": 11.697574893009985, + "eval_loss": 1.9937853813171387, + "eval_runtime": 0.2332, + "eval_samples_per_second": 184.37, + "eval_steps_per_second": 4.288, "step": 16400 }, { - "epoch": 1.975680231158199, - "grad_norm": 6.53125, + "epoch": 11.704707560627675, + "grad_norm": 9.5625, "learning_rate": 3.8182222222222226e-05, - "loss": 0.5933, + "loss": 0.8767, "step": 16410 }, { - "epoch": 1.9768841801107633, - "grad_norm": 7.78125, + "epoch": 11.711840228245364, + "grad_norm": 7.96875, "learning_rate": 3.8137777777777775e-05, - "loss": 0.5237, + "loss": 0.6997, "step": 16420 }, { - "epoch": 1.9780881290633276, - "grad_norm": 7.34375, + "epoch": 11.718972895863052, + "grad_norm": 6.75, "learning_rate": 3.809333333333334e-05, - "loss": 0.5757, + "loss": 0.8482, "step": 16430 }, { - "epoch": 1.979292078015892, - "grad_norm": 11.3125, + "epoch": 11.726105563480742, + "grad_norm": 9.25, "learning_rate": 3.804888888888889e-05, - "loss": 0.6774, + "loss": 0.8189, "step": 16440 }, { - "epoch": 1.9804960269684564, - "grad_norm": 5.75, + "epoch": 11.733238231098431, + "grad_norm": 7.21875, "learning_rate": 3.800444444444445e-05, - "loss": 0.5364, + "loss": 0.7069, "step": 16450 }, { - "epoch": 1.981699975921021, - "grad_norm": 6.28125, + "epoch": 11.74037089871612, + "grad_norm": 8.125, "learning_rate": 3.796e-05, - "loss": 0.6244, + "loss": 0.8172, "step": 16460 }, { - "epoch": 1.9829039248735854, - "grad_norm": 7.75, + "epoch": 11.747503566333808, + "grad_norm": 8.0625, "learning_rate": 3.791555555555556e-05, - "loss": 0.497, + "loss": 0.7332, "step": 16470 }, { - "epoch": 1.9841078738261497, - "grad_norm": 10.1875, + "epoch": 11.754636233951498, + "grad_norm": 7.75, "learning_rate": 3.787111111111111e-05, - "loss": 0.5933, + "loss": 0.7746, "step": 16480 }, { - "epoch": 1.9853118227787143, - "grad_norm": 6.40625, + "epoch": 11.761768901569187, + "grad_norm": 8.8125, "learning_rate": 3.782666666666667e-05, - "loss": 0.5994, + "loss": 0.7679, "step": 16490 }, { - "epoch": 1.9865157717312787, - "grad_norm": 7.21875, + "epoch": 11.768901569186877, + "grad_norm": 7.25, "learning_rate": 3.778222222222222e-05, - "loss": 0.7191, + "loss": 0.9753, "step": 16500 }, { - "epoch": 1.9865157717312787, - "eval/acc": 44.1860466003418, + "epoch": 11.768901569186877, + "eval/acc": 39.53488540649414, "step": 16500 }, { - "epoch": 1.9865157717312787, - "eval_loss": 2.8654353618621826, - "eval_runtime": 0.2125, - "eval_samples_per_second": 202.312, - "eval_steps_per_second": 4.705, + "epoch": 11.768901569186877, + "eval_loss": 1.9911694526672363, + "eval_runtime": 0.2278, + "eval_samples_per_second": 188.761, + "eval_steps_per_second": 4.39, "step": 16500 }, { - "epoch": 1.987719720683843, - "grad_norm": 9.25, + "epoch": 11.776034236804565, + "grad_norm": 7.375, "learning_rate": 3.773777777777778e-05, - "loss": 0.6151, + "loss": 0.7303, "step": 16510 }, { - "epoch": 1.9889236696364074, - "grad_norm": 7.34375, + "epoch": 11.783166904422254, + "grad_norm": 6.65625, "learning_rate": 3.7693333333333334e-05, - "loss": 0.5837, + "loss": 0.7036, "step": 16520 }, { - "epoch": 1.9901276185889718, - "grad_norm": 7.03125, + "epoch": 11.790299572039943, + "grad_norm": 7.8125, "learning_rate": 3.764888888888889e-05, - "loss": 0.5674, + "loss": 0.6873, "step": 16530 }, { - "epoch": 1.9913315675415362, - "grad_norm": 25.0, + "epoch": 11.797432239657631, + "grad_norm": 13.3125, "learning_rate": 3.760444444444445e-05, - "loss": 0.5812, + "loss": 0.8784, "step": 16540 }, { - "epoch": 1.9925355164941005, - "grad_norm": 19.125, + "epoch": 11.80456490727532, + "grad_norm": 10.5625, "learning_rate": 3.756e-05, - "loss": 0.6122, + "loss": 0.8149, "step": 16550 }, { - "epoch": 1.993739465446665, - "grad_norm": 10.0625, + "epoch": 11.81169757489301, + "grad_norm": 9.75, "learning_rate": 3.751555555555556e-05, - "loss": 0.623, + "loss": 0.7988, "step": 16560 }, { - "epoch": 1.9949434143992295, - "grad_norm": 8.1875, + "epoch": 11.8188302425107, + "grad_norm": 8.5625, "learning_rate": 3.747111111111111e-05, - "loss": 0.6221, + "loss": 0.8117, "step": 16570 }, { - "epoch": 1.9961473633517939, - "grad_norm": 7.59375, + "epoch": 11.825962910128387, + "grad_norm": 9.8125, "learning_rate": 3.742666666666667e-05, - "loss": 0.6054, + "loss": 0.7908, "step": 16580 }, { - "epoch": 1.9973513123043583, - "grad_norm": 10.25, + "epoch": 11.833095577746077, + "grad_norm": 8.0, "learning_rate": 3.738222222222223e-05, - "loss": 0.539, + "loss": 0.8379, "step": 16590 }, { - "epoch": 1.9985552612569228, - "grad_norm": 8.0, + "epoch": 11.840228245363766, + "grad_norm": 7.21875, "learning_rate": 3.7337777777777776e-05, - "loss": 0.5606, + "loss": 0.7278, "step": 16600 }, { - "epoch": 1.9985552612569228, - "eval/acc": 44.1860466003418, + "epoch": 11.840228245363766, + "eval/acc": 37.20930099487305, "step": 16600 }, { - "epoch": 1.9985552612569228, - "eval_loss": 2.8806099891662598, - "eval_runtime": 0.2152, - "eval_samples_per_second": 199.797, - "eval_steps_per_second": 4.646, + "epoch": 11.840228245363766, + "eval_loss": 1.9659804105758667, + "eval_runtime": 0.2325, + "eval_samples_per_second": 184.932, + "eval_steps_per_second": 4.301, "step": 16600 }, { - "epoch": 1.9997592102094872, - "grad_norm": 9.4375, + "epoch": 11.847360912981456, + "grad_norm": 8.0, "learning_rate": 3.729333333333334e-05, - "loss": 0.5944, + "loss": 0.773, "step": 16610 }, { - "epoch": 2.0009631591620516, - "grad_norm": 6.6875, + "epoch": 11.854493580599144, + "grad_norm": 5.78125, "learning_rate": 3.7248888888888886e-05, - "loss": 0.5375, + "loss": 0.7377, "step": 16620 }, { - "epoch": 2.002167108114616, - "grad_norm": 6.15625, + "epoch": 11.861626248216833, + "grad_norm": 8.875, "learning_rate": 3.720444444444445e-05, - "loss": 0.5144, + "loss": 0.6644, "step": 16630 }, { - "epoch": 2.0033710570671803, - "grad_norm": 7.375, + "epoch": 11.868758915834523, + "grad_norm": 7.125, "learning_rate": 3.716e-05, - "loss": 0.5942, + "loss": 0.8759, "step": 16640 }, { - "epoch": 2.0045750060197447, - "grad_norm": 7.125, + "epoch": 11.87589158345221, + "grad_norm": 7.0625, "learning_rate": 3.711555555555556e-05, - "loss": 0.5861, + "loss": 0.8503, "step": 16650 }, { - "epoch": 2.005778954972309, - "grad_norm": 9.1875, + "epoch": 11.8830242510699, + "grad_norm": 5.75, "learning_rate": 3.7071111111111114e-05, - "loss": 0.6177, + "loss": 0.7204, "step": 16660 }, { - "epoch": 2.0069829039248734, - "grad_norm": 6.15625, + "epoch": 11.89015691868759, + "grad_norm": 7.4375, "learning_rate": 3.702666666666667e-05, - "loss": 0.6381, + "loss": 0.8646, "step": 16670 }, { - "epoch": 2.008186852877438, - "grad_norm": 5.8125, + "epoch": 11.897289586305279, + "grad_norm": 7.875, "learning_rate": 3.6982222222222224e-05, - "loss": 0.5626, + "loss": 0.7951, "step": 16680 }, { - "epoch": 2.0093908018300026, - "grad_norm": 8.0625, + "epoch": 11.904422253922966, + "grad_norm": 7.75, "learning_rate": 3.693777777777778e-05, - "loss": 0.5352, + "loss": 0.7474, "step": 16690 }, { - "epoch": 2.010594750782567, - "grad_norm": 9.1875, + "epoch": 11.911554921540656, + "grad_norm": 8.0625, "learning_rate": 3.6893333333333335e-05, - "loss": 0.6053, + "loss": 0.8184, "step": 16700 }, { - "epoch": 2.010594750782567, - "eval/acc": 39.53488540649414, + "epoch": 11.911554921540656, + "eval/acc": 41.86046600341797, "step": 16700 }, { - "epoch": 2.010594750782567, - "eval_loss": 2.075368881225586, - "eval_runtime": 6.8777, - "eval_samples_per_second": 6.252, - "eval_steps_per_second": 0.145, + "epoch": 11.911554921540656, + "eval_loss": 1.9796830415725708, + "eval_runtime": 0.2288, + "eval_samples_per_second": 187.97, + "eval_steps_per_second": 4.371, "step": 16700 }, { - "epoch": 2.0117986997351314, - "grad_norm": 7.59375, + "epoch": 11.918687589158345, + "grad_norm": 8.5625, "learning_rate": 3.684888888888889e-05, - "loss": 0.5228, + "loss": 0.7233, "step": 16710 }, { - "epoch": 2.0130026486876957, - "grad_norm": 5.3125, + "epoch": 11.925820256776035, + "grad_norm": 6.84375, "learning_rate": 3.6804444444444445e-05, - "loss": 0.4805, + "loss": 0.8783, "step": 16720 }, { - "epoch": 2.01420659764026, - "grad_norm": 6.34375, + "epoch": 11.932952924393723, + "grad_norm": 8.0625, "learning_rate": 3.676e-05, - "loss": 0.5505, + "loss": 0.7848, "step": 16730 }, { - "epoch": 2.0154105465928245, - "grad_norm": 8.25, + "epoch": 11.940085592011412, + "grad_norm": 7.0, "learning_rate": 3.6715555555555556e-05, - "loss": 0.5271, + "loss": 0.7663, "step": 16740 }, { - "epoch": 2.016614495545389, - "grad_norm": 6.90625, + "epoch": 11.947218259629102, + "grad_norm": 8.25, "learning_rate": 3.667111111111111e-05, - "loss": 0.5843, + "loss": 0.7711, "step": 16750 }, { - "epoch": 2.0178184444979532, - "grad_norm": 6.25, + "epoch": 11.95435092724679, + "grad_norm": 8.5625, "learning_rate": 3.662666666666667e-05, - "loss": 0.5916, + "loss": 0.7848, "step": 16760 }, { - "epoch": 2.0190223934505176, - "grad_norm": 7.59375, + "epoch": 11.961483594864479, + "grad_norm": 13.375, "learning_rate": 3.658222222222222e-05, - "loss": 0.5564, + "loss": 0.8355, "step": 16770 }, { - "epoch": 2.020226342403082, - "grad_norm": 7.8125, + "epoch": 11.968616262482168, + "grad_norm": 8.1875, "learning_rate": 3.653777777777778e-05, - "loss": 0.5907, + "loss": 0.8452, "step": 16780 }, { - "epoch": 2.0214302913556463, - "grad_norm": 11.875, + "epoch": 11.975748930099858, + "grad_norm": 7.53125, "learning_rate": 3.649333333333333e-05, - "loss": 0.5908, + "loss": 0.8508, "step": 16790 }, { - "epoch": 2.022634240308211, - "grad_norm": 8.0, + "epoch": 11.982881597717546, + "grad_norm": 19.5, "learning_rate": 3.644888888888889e-05, - "loss": 0.5578, + "loss": 0.8187, "step": 16800 }, { - "epoch": 2.022634240308211, - "eval/acc": 41.86046600341797, + "epoch": 11.982881597717546, + "eval/acc": 39.53488540649414, "step": 16800 }, { - "epoch": 2.022634240308211, - "eval_loss": 2.0900626182556152, - "eval_runtime": 0.2159, - "eval_samples_per_second": 199.144, - "eval_steps_per_second": 4.631, + "epoch": 11.982881597717546, + "eval_loss": 1.9583516120910645, + "eval_runtime": 0.2249, + "eval_samples_per_second": 191.207, + "eval_steps_per_second": 4.447, "step": 16800 }, { - "epoch": 2.0238381892607755, - "grad_norm": 9.0, + "epoch": 11.990014265335235, + "grad_norm": 10.125, "learning_rate": 3.640444444444445e-05, - "loss": 0.6217, + "loss": 0.8547, "step": 16810 }, { - "epoch": 2.02504213821334, - "grad_norm": 9.1875, + "epoch": 11.997146932952925, + "grad_norm": 15.125, "learning_rate": 3.636e-05, - "loss": 0.6252, + "loss": 0.7173, "step": 16820 }, { - "epoch": 2.0262460871659043, - "grad_norm": 11.5, + "epoch": 12.004279600570614, + "grad_norm": 6.53125, "learning_rate": 3.631555555555556e-05, - "loss": 0.5154, + "loss": 0.7688, "step": 16830 }, { - "epoch": 2.0274500361184686, - "grad_norm": 8.3125, + "epoch": 12.011412268188302, + "grad_norm": 78.5, "learning_rate": 3.627111111111111e-05, - "loss": 0.505, + "loss": 0.8184, "step": 16840 }, { - "epoch": 2.028653985071033, - "grad_norm": 6.28125, + "epoch": 12.018544935805991, + "grad_norm": 8.5625, "learning_rate": 3.622666666666667e-05, - "loss": 0.4879, + "loss": 0.8062, "step": 16850 }, { - "epoch": 2.0298579340235974, - "grad_norm": 7.65625, + "epoch": 12.02567760342368, + "grad_norm": 9.25, "learning_rate": 3.6182222222222225e-05, - "loss": 0.5616, + "loss": 0.839, "step": 16860 }, { - "epoch": 2.0310618829761617, - "grad_norm": 6.9375, + "epoch": 12.03281027104137, + "grad_norm": 9.375, "learning_rate": 3.613777777777778e-05, - "loss": 0.5375, + "loss": 0.84, "step": 16870 }, { - "epoch": 2.032265831928726, - "grad_norm": 6.90625, + "epoch": 12.039942938659058, + "grad_norm": 7.46875, "learning_rate": 3.6093333333333336e-05, - "loss": 0.6451, + "loss": 0.7653, "step": 16880 }, { - "epoch": 2.0334697808812905, - "grad_norm": 5.0, + "epoch": 12.047075606276747, + "grad_norm": 14.875, "learning_rate": 3.604888888888889e-05, - "loss": 0.5616, + "loss": 0.7917, "step": 16890 }, { - "epoch": 2.034673729833855, - "grad_norm": 8.0, + "epoch": 12.054208273894437, + "grad_norm": 11.0, "learning_rate": 3.6004444444444446e-05, - "loss": 0.6754, + "loss": 0.7125, "step": 16900 }, { - "epoch": 2.034673729833855, - "eval/acc": 39.53488540649414, + "epoch": 12.054208273894437, + "eval/acc": 37.20930099487305, "step": 16900 }, { - "epoch": 2.034673729833855, - "eval_loss": 2.1007204055786133, - "eval_runtime": 0.2204, - "eval_samples_per_second": 195.141, - "eval_steps_per_second": 4.538, + "epoch": 12.054208273894437, + "eval_loss": 3.0164332389831543, + "eval_runtime": 5.2863, + "eval_samples_per_second": 8.134, + "eval_steps_per_second": 0.189, "step": 16900 }, { - "epoch": 2.0358776787864192, - "grad_norm": 7.5625, + "epoch": 12.061340941512125, + "grad_norm": 7.34375, "learning_rate": 3.596e-05, - "loss": 0.5048, + "loss": 0.8146, "step": 16910 }, { - "epoch": 2.037081627738984, - "grad_norm": 9.875, + "epoch": 12.068473609129814, + "grad_norm": 6.96875, "learning_rate": 3.5915555555555557e-05, - "loss": 0.5794, + "loss": 0.7891, "step": 16920 }, { - "epoch": 2.0382855766915484, - "grad_norm": 6.21875, + "epoch": 12.075606276747504, + "grad_norm": 6.78125, "learning_rate": 3.587111111111111e-05, - "loss": 0.5909, + "loss": 0.7922, "step": 16930 }, { - "epoch": 2.039489525644113, - "grad_norm": 7.09375, + "epoch": 12.082738944365193, + "grad_norm": 11.8125, "learning_rate": 3.582666666666667e-05, - "loss": 0.6029, + "loss": 0.7896, "step": 16940 }, { - "epoch": 2.040693474596677, - "grad_norm": 7.375, + "epoch": 12.08987161198288, + "grad_norm": 6.25, "learning_rate": 3.578222222222222e-05, - "loss": 0.5345, + "loss": 0.7579, "step": 16950 }, { - "epoch": 2.0418974235492415, - "grad_norm": 7.53125, + "epoch": 12.09700427960057, + "grad_norm": 7.9375, "learning_rate": 3.5737777777777784e-05, - "loss": 0.5384, + "loss": 0.8181, "step": 16960 }, { - "epoch": 2.043101372501806, - "grad_norm": 6.125, + "epoch": 12.10413694721826, + "grad_norm": 12.75, "learning_rate": 3.569333333333333e-05, - "loss": 0.5809, + "loss": 0.7577, "step": 16970 }, { - "epoch": 2.0443053214543703, - "grad_norm": 8.0625, + "epoch": 12.11126961483595, + "grad_norm": 50.75, "learning_rate": 3.5648888888888895e-05, - "loss": 0.5626, + "loss": 0.7314, "step": 16980 }, { - "epoch": 2.0455092704069346, - "grad_norm": 7.71875, + "epoch": 12.118402282453637, + "grad_norm": 8.625, "learning_rate": 3.560444444444444e-05, - "loss": 0.5675, + "loss": 0.7212, "step": 16990 }, { - "epoch": 2.046713219359499, - "grad_norm": 6.96875, + "epoch": 12.125534950071327, + "grad_norm": 7.90625, "learning_rate": 3.5560000000000005e-05, - "loss": 0.4864, + "loss": 0.8013, "step": 17000 }, { - "epoch": 2.046713219359499, - "eval/acc": 39.53488540649414, + "epoch": 12.125534950071327, + "eval/acc": 37.20930099487305, "step": 17000 }, { - "epoch": 2.046713219359499, - "eval_loss": 2.098133087158203, - "eval_runtime": 0.2107, - "eval_samples_per_second": 204.073, - "eval_steps_per_second": 4.746, + "epoch": 12.125534950071327, + "eval_loss": 3.037986993789673, + "eval_runtime": 0.2607, + "eval_samples_per_second": 164.968, + "eval_steps_per_second": 3.836, "step": 17000 }, { - "epoch": 2.0479171683120634, - "grad_norm": 7.875, + "epoch": 12.132667617689016, + "grad_norm": 8.0, "learning_rate": 3.551555555555556e-05, - "loss": 0.5531, + "loss": 0.7848, "step": 17010 }, { - "epoch": 2.0491211172646278, - "grad_norm": 7.09375, + "epoch": 12.139800285306706, + "grad_norm": 8.9375, "learning_rate": 3.547111111111111e-05, - "loss": 0.6313, + "loss": 0.7902, "step": 17020 }, { - "epoch": 2.0503250662171926, - "grad_norm": 6.3125, + "epoch": 12.146932952924393, + "grad_norm": 8.625, "learning_rate": 3.542666666666667e-05, - "loss": 0.5699, + "loss": 0.8201, "step": 17030 }, { - "epoch": 2.051529015169757, - "grad_norm": 7.09375, + "epoch": 12.154065620542083, + "grad_norm": 8.0625, "learning_rate": 3.538222222222222e-05, - "loss": 0.5396, + "loss": 0.7456, "step": 17040 }, { - "epoch": 2.0527329641223213, - "grad_norm": 12.625, + "epoch": 12.161198288159772, + "grad_norm": 6.375, "learning_rate": 3.533777777777778e-05, - "loss": 0.5393, + "loss": 0.7404, "step": 17050 }, { - "epoch": 2.0539369130748857, - "grad_norm": 7.5, + "epoch": 12.16833095577746, + "grad_norm": 10.5, "learning_rate": 3.5293333333333336e-05, - "loss": 0.5704, + "loss": 0.6491, "step": 17060 }, { - "epoch": 2.05514086202745, - "grad_norm": 7.59375, + "epoch": 12.17546362339515, + "grad_norm": 6.03125, "learning_rate": 3.524888888888889e-05, - "loss": 0.5339, + "loss": 0.8083, "step": 17070 }, { - "epoch": 2.0563448109800144, - "grad_norm": 8.5, + "epoch": 12.182596291012839, + "grad_norm": 7.15625, "learning_rate": 3.520444444444445e-05, - "loss": 0.5485, + "loss": 0.7394, "step": 17080 }, { - "epoch": 2.057548759932579, - "grad_norm": 8.1875, + "epoch": 12.189728958630528, + "grad_norm": 8.0, "learning_rate": 3.516e-05, - "loss": 0.5944, + "loss": 0.7967, "step": 17090 }, { - "epoch": 2.058752708885143, - "grad_norm": 8.9375, + "epoch": 12.196861626248216, + "grad_norm": 7.40625, "learning_rate": 3.511555555555556e-05, - "loss": 0.5787, + "loss": 0.7497, "step": 17100 }, { - "epoch": 2.058752708885143, + "epoch": 12.196861626248216, "eval/acc": 37.20930099487305, "step": 17100 }, { - "epoch": 2.058752708885143, - "eval_loss": 2.1219515800476074, - "eval_runtime": 0.2166, - "eval_samples_per_second": 198.5, - "eval_steps_per_second": 4.616, + "epoch": 12.196861626248216, + "eval_loss": 3.029151678085327, + "eval_runtime": 0.2337, + "eval_samples_per_second": 183.992, + "eval_steps_per_second": 4.279, "step": 17100 }, { - "epoch": 2.0599566578377075, - "grad_norm": 9.625, + "epoch": 12.203994293865906, + "grad_norm": 7.75, "learning_rate": 3.507111111111111e-05, - "loss": 0.6232, + "loss": 0.697, "step": 17110 }, { - "epoch": 2.061160606790272, - "grad_norm": 10.375, + "epoch": 12.211126961483595, + "grad_norm": 12.0625, "learning_rate": 3.502666666666667e-05, - "loss": 0.7104, + "loss": 0.7795, "step": 17120 }, { - "epoch": 2.0623645557428363, - "grad_norm": 11.1875, + "epoch": 12.218259629101285, + "grad_norm": 7.1875, "learning_rate": 3.498222222222222e-05, - "loss": 0.5668, + "loss": 0.7974, "step": 17130 }, { - "epoch": 2.063568504695401, - "grad_norm": 8.25, + "epoch": 12.225392296718972, + "grad_norm": 8.875, "learning_rate": 3.493777777777778e-05, - "loss": 0.5739, + "loss": 0.8272, "step": 17140 }, { - "epoch": 2.0647724536479655, - "grad_norm": 7.34375, + "epoch": 12.232524964336662, + "grad_norm": 15.4375, "learning_rate": 3.4893333333333334e-05, - "loss": 0.5701, + "loss": 0.8373, "step": 17150 }, { - "epoch": 2.06597640260053, - "grad_norm": 6.03125, + "epoch": 12.239657631954351, + "grad_norm": 6.9375, "learning_rate": 3.484888888888889e-05, - "loss": 0.5174, + "loss": 0.7922, "step": 17160 }, { - "epoch": 2.067180351553094, - "grad_norm": 6.3125, + "epoch": 12.24679029957204, + "grad_norm": 7.53125, "learning_rate": 3.4804444444444444e-05, - "loss": 0.5081, + "loss": 0.8099, "step": 17170 }, { - "epoch": 2.0683843005056586, - "grad_norm": 9.5, + "epoch": 12.253922967189729, + "grad_norm": 6.28125, "learning_rate": 3.4760000000000006e-05, - "loss": 0.5513, + "loss": 0.7522, "step": 17180 }, { - "epoch": 2.069588249458223, - "grad_norm": 12.875, + "epoch": 12.261055634807418, + "grad_norm": 8.8125, "learning_rate": 3.4715555555555554e-05, - "loss": 0.6219, + "loss": 0.7338, "step": 17190 }, { - "epoch": 2.0707921984107873, - "grad_norm": 16.125, + "epoch": 12.268188302425107, + "grad_norm": 6.34375, "learning_rate": 3.4671111111111116e-05, - "loss": 0.5014, + "loss": 0.7782, "step": 17200 }, { - "epoch": 2.0707921984107873, - "eval/acc": 39.53488540649414, + "epoch": 12.268188302425107, + "eval/acc": 37.20930099487305, "step": 17200 }, { - "epoch": 2.0707921984107873, - "eval_loss": 2.13092303276062, - "eval_runtime": 0.2186, - "eval_samples_per_second": 196.676, - "eval_steps_per_second": 4.574, + "epoch": 12.268188302425107, + "eval_loss": 3.063300848007202, + "eval_runtime": 0.2346, + "eval_samples_per_second": 183.302, + "eval_steps_per_second": 4.263, "step": 17200 }, { - "epoch": 2.0719961473633517, - "grad_norm": 8.3125, + "epoch": 12.275320970042795, + "grad_norm": 7.1875, "learning_rate": 3.462666666666667e-05, - "loss": 0.5354, + "loss": 0.7274, "step": 17210 }, { - "epoch": 2.073200096315916, - "grad_norm": 9.5, + "epoch": 12.282453637660485, + "grad_norm": 6.28125, "learning_rate": 3.458222222222222e-05, - "loss": 0.5422, + "loss": 0.7543, "step": 17220 }, { - "epoch": 2.0744040452684804, - "grad_norm": 8.9375, + "epoch": 12.289586305278174, + "grad_norm": 12.75, "learning_rate": 3.453777777777778e-05, - "loss": 0.5771, + "loss": 0.8159, "step": 17230 }, { - "epoch": 2.075607994221045, - "grad_norm": 24.375, + "epoch": 12.296718972895864, + "grad_norm": 7.03125, "learning_rate": 3.449333333333333e-05, - "loss": 0.5147, + "loss": 0.7927, "step": 17240 }, { - "epoch": 2.0768119431736096, - "grad_norm": 6.28125, + "epoch": 12.303851640513551, + "grad_norm": 7.09375, "learning_rate": 3.444888888888889e-05, - "loss": 0.6286, + "loss": 0.7862, "step": 17250 }, { - "epoch": 2.078015892126174, - "grad_norm": 6.65625, + "epoch": 12.310984308131241, + "grad_norm": 11.375, "learning_rate": 3.440444444444445e-05, - "loss": 0.5529, + "loss": 0.8505, "step": 17260 }, { - "epoch": 2.0792198410787384, - "grad_norm": 8.0, + "epoch": 12.31811697574893, + "grad_norm": 6.40625, "learning_rate": 3.436e-05, - "loss": 0.6149, + "loss": 0.7432, "step": 17270 }, { - "epoch": 2.0804237900313027, - "grad_norm": 8.4375, + "epoch": 12.32524964336662, + "grad_norm": 7.5, "learning_rate": 3.431555555555556e-05, - "loss": 0.5663, + "loss": 0.8234, "step": 17280 }, { - "epoch": 2.081627738983867, - "grad_norm": 4.5625, + "epoch": 12.332382310984308, + "grad_norm": 8.0, "learning_rate": 3.4271111111111114e-05, - "loss": 0.5788, + "loss": 0.8929, "step": 17290 }, { - "epoch": 2.0828316879364315, + "epoch": 12.339514978601997, "grad_norm": 8.8125, "learning_rate": 3.422666666666667e-05, - "loss": 0.6019, + "loss": 0.7827, "step": 17300 }, { - "epoch": 2.0828316879364315, - "eval/acc": 41.86046600341797, + "epoch": 12.339514978601997, + "eval/acc": 37.20930099487305, "step": 17300 }, { - "epoch": 2.0828316879364315, - "eval_loss": 2.138639211654663, - "eval_runtime": 0.2194, - "eval_samples_per_second": 195.991, - "eval_steps_per_second": 4.558, + "epoch": 12.339514978601997, + "eval_loss": 3.0306241512298584, + "eval_runtime": 0.2382, + "eval_samples_per_second": 180.496, + "eval_steps_per_second": 4.198, "step": 17300 }, { - "epoch": 2.084035636888996, - "grad_norm": 4.8125, + "epoch": 12.346647646219687, + "grad_norm": 7.125, "learning_rate": 3.4182222222222224e-05, - "loss": 0.5519, + "loss": 0.8023, "step": 17310 }, { - "epoch": 2.08523958584156, - "grad_norm": 8.0625, + "epoch": 12.353780313837376, + "grad_norm": 6.375, "learning_rate": 3.413777777777778e-05, - "loss": 0.5612, + "loss": 0.7275, "step": 17320 }, { - "epoch": 2.0864435347941246, - "grad_norm": 8.375, + "epoch": 12.360912981455064, + "grad_norm": 9.75, "learning_rate": 3.4093333333333334e-05, - "loss": 0.5622, + "loss": 0.8026, "step": 17330 }, { - "epoch": 2.087647483746689, - "grad_norm": 8.6875, + "epoch": 12.368045649072753, + "grad_norm": 6.75, "learning_rate": 3.404888888888889e-05, - "loss": 0.6023, + "loss": 0.7214, "step": 17340 }, { - "epoch": 2.0888514326992533, + "epoch": 12.375178316690443, "grad_norm": 9.0, "learning_rate": 3.4004444444444445e-05, - "loss": 0.5416, + "loss": 0.808, "step": 17350 }, { - "epoch": 2.090055381651818, - "grad_norm": 6.875, + "epoch": 12.38231098430813, + "grad_norm": 6.53125, "learning_rate": 3.396e-05, - "loss": 0.5268, + "loss": 0.7779, "step": 17360 }, { - "epoch": 2.0912593306043825, - "grad_norm": 8.1875, + "epoch": 12.38944365192582, + "grad_norm": 9.625, "learning_rate": 3.3915555555555555e-05, - "loss": 0.6577, + "loss": 0.6953, "step": 17370 }, { - "epoch": 2.092463279556947, - "grad_norm": 8.25, + "epoch": 12.39657631954351, + "grad_norm": 7.1875, "learning_rate": 3.387111111111112e-05, - "loss": 0.6687, + "loss": 0.7577, "step": 17380 }, { - "epoch": 2.0936672285095113, - "grad_norm": 6.8125, + "epoch": 12.403708987161199, + "grad_norm": 10.375, "learning_rate": 3.3826666666666666e-05, - "loss": 0.5323, + "loss": 0.7793, "step": 17390 }, { - "epoch": 2.0948711774620756, - "grad_norm": 7.53125, + "epoch": 12.410841654778887, + "grad_norm": 8.0, "learning_rate": 3.378222222222223e-05, - "loss": 0.5394, + "loss": 0.8273, "step": 17400 }, { - "epoch": 2.0948711774620756, + "epoch": 12.410841654778887, "eval/acc": 37.20930099487305, "step": 17400 }, { - "epoch": 2.0948711774620756, - "eval_loss": 2.1255204677581787, - "eval_runtime": 0.22, - "eval_samples_per_second": 195.45, - "eval_steps_per_second": 4.545, + "epoch": 12.410841654778887, + "eval_loss": 3.02496075630188, + "eval_runtime": 0.2305, + "eval_samples_per_second": 186.586, + "eval_steps_per_second": 4.339, "step": 17400 }, { - "epoch": 2.09607512641464, - "grad_norm": 8.6875, + "epoch": 12.417974322396576, + "grad_norm": 6.0625, "learning_rate": 3.3737777777777776e-05, - "loss": 0.575, + "loss": 0.7008, "step": 17410 }, { - "epoch": 2.0972790753672044, - "grad_norm": 7.90625, + "epoch": 12.425106990014266, + "grad_norm": 8.0, "learning_rate": 3.369333333333333e-05, - "loss": 0.5201, + "loss": 0.7961, "step": 17420 }, { - "epoch": 2.0984830243197687, - "grad_norm": 7.78125, + "epoch": 12.432239657631955, + "grad_norm": 8.9375, "learning_rate": 3.3648888888888893e-05, - "loss": 0.4728, + "loss": 0.7806, "step": 17430 }, { - "epoch": 2.099686973272333, - "grad_norm": 6.34375, + "epoch": 12.439372325249643, + "grad_norm": 8.75, "learning_rate": 3.360444444444444e-05, - "loss": 0.6064, + "loss": 0.6974, "step": 17440 }, { - "epoch": 2.1008909222248975, - "grad_norm": 6.71875, + "epoch": 12.446504992867332, + "grad_norm": 5.59375, "learning_rate": 3.3560000000000004e-05, - "loss": 0.5533, + "loss": 0.6685, "step": 17450 }, { - "epoch": 2.102094871177462, - "grad_norm": 8.3125, + "epoch": 12.453637660485022, + "grad_norm": 5.8125, "learning_rate": 3.351555555555555e-05, - "loss": 0.6109, + "loss": 0.7812, "step": 17460 }, { - "epoch": 2.1032988201300267, - "grad_norm": 8.75, + "epoch": 12.46077032810271, + "grad_norm": 8.8125, "learning_rate": 3.3471111111111114e-05, - "loss": 0.6461, + "loss": 0.8677, "step": 17470 }, { - "epoch": 2.104502769082591, - "grad_norm": 6.53125, + "epoch": 12.467902995720399, + "grad_norm": 8.3125, "learning_rate": 3.342666666666667e-05, - "loss": 0.6019, + "loss": 0.7837, "step": 17480 }, { - "epoch": 2.1057067180351554, - "grad_norm": 9.1875, + "epoch": 12.475035663338089, + "grad_norm": 11.0625, "learning_rate": 3.3382222222222225e-05, - "loss": 0.5926, + "loss": 0.7833, "step": 17490 }, { - "epoch": 2.10691066698772, - "grad_norm": 9.625, + "epoch": 12.482168330955778, + "grad_norm": 7.75, "learning_rate": 3.333777777777778e-05, - "loss": 0.5351, + "loss": 0.8063, "step": 17500 }, { - "epoch": 2.10691066698772, + "epoch": 12.482168330955778, "eval/acc": 37.20930099487305, "step": 17500 }, { - "epoch": 2.10691066698772, - "eval_loss": 2.1321325302124023, - "eval_runtime": 0.2113, - "eval_samples_per_second": 203.543, - "eval_steps_per_second": 4.734, + "epoch": 12.482168330955778, + "eval_loss": 3.0436007976531982, + "eval_runtime": 0.2352, + "eval_samples_per_second": 182.855, + "eval_steps_per_second": 4.252, "step": 17500 }, { - "epoch": 2.108114615940284, - "grad_norm": 7.4375, + "epoch": 12.489300998573466, + "grad_norm": 5.46875, "learning_rate": 3.3293333333333335e-05, - "loss": 0.5186, + "loss": 0.8146, "step": 17510 }, { - "epoch": 2.1093185648928485, - "grad_norm": 5.65625, + "epoch": 12.496433666191155, + "grad_norm": 7.6875, "learning_rate": 3.324888888888889e-05, - "loss": 0.532, + "loss": 0.7964, "step": 17520 }, { - "epoch": 2.110522513845413, - "grad_norm": 7.3125, + "epoch": 12.503566333808845, + "grad_norm": 6.03125, "learning_rate": 3.3204444444444446e-05, - "loss": 0.578, + "loss": 0.7377, "step": 17530 }, { - "epoch": 2.1117264627979773, - "grad_norm": 8.75, + "epoch": 12.510699001426534, + "grad_norm": 7.90625, "learning_rate": 3.316e-05, - "loss": 0.5241, + "loss": 0.7658, "step": 17540 }, { - "epoch": 2.1129304117505416, - "grad_norm": 5.65625, + "epoch": 12.517831669044222, + "grad_norm": 8.0625, "learning_rate": 3.3115555555555556e-05, - "loss": 0.5194, + "loss": 0.7035, "step": 17550 }, { - "epoch": 2.114134360703106, - "grad_norm": 8.375, + "epoch": 12.524964336661911, + "grad_norm": 7.75, "learning_rate": 3.307111111111111e-05, - "loss": 0.5815, + "loss": 0.794, "step": 17560 }, { - "epoch": 2.1153383096556704, - "grad_norm": 5.75, + "epoch": 12.532097004279601, + "grad_norm": 9.125, "learning_rate": 3.302666666666667e-05, - "loss": 0.5333, + "loss": 0.8046, "step": 17570 }, { - "epoch": 2.116542258608235, - "grad_norm": 8.125, + "epoch": 12.539229671897289, + "grad_norm": 8.8125, "learning_rate": 3.298222222222223e-05, - "loss": 0.5816, + "loss": 0.7632, "step": 17580 }, { - "epoch": 2.1177462075607996, - "grad_norm": 7.15625, + "epoch": 12.546362339514978, + "grad_norm": 9.375, "learning_rate": 3.293777777777778e-05, - "loss": 0.6058, + "loss": 0.8554, "step": 17590 }, { - "epoch": 2.118950156513364, - "grad_norm": 7.09375, + "epoch": 12.553495007132668, + "grad_norm": 7.0625, "learning_rate": 3.289333333333334e-05, - "loss": 0.6255, + "loss": 0.647, "step": 17600 }, { - "epoch": 2.118950156513364, - "eval/acc": 39.53488540649414, + "epoch": 12.553495007132668, + "eval/acc": 37.20930099487305, "step": 17600 }, { - "epoch": 2.118950156513364, - "eval_loss": 2.1110424995422363, - "eval_runtime": 0.2184, - "eval_samples_per_second": 196.909, - "eval_steps_per_second": 4.579, + "epoch": 12.553495007132668, + "eval_loss": 3.04180908203125, + "eval_runtime": 0.2372, + "eval_samples_per_second": 181.25, + "eval_steps_per_second": 4.215, "step": 17600 }, { - "epoch": 2.1201541054659283, - "grad_norm": 9.6875, + "epoch": 12.560627674750357, + "grad_norm": 7.4375, "learning_rate": 3.284888888888889e-05, - "loss": 0.619, + "loss": 0.7701, "step": 17610 }, { - "epoch": 2.1213580544184927, - "grad_norm": 7.09375, + "epoch": 12.567760342368045, + "grad_norm": 81.0, "learning_rate": 3.280444444444445e-05, - "loss": 0.4951, + "loss": 0.7707, "step": 17620 }, { - "epoch": 2.122562003371057, - "grad_norm": 6.625, + "epoch": 12.574893009985734, + "grad_norm": 6.75, "learning_rate": 3.2760000000000005e-05, - "loss": 0.5048, + "loss": 0.7871, "step": 17630 }, { - "epoch": 2.1237659523236214, - "grad_norm": 7.21875, + "epoch": 12.582025677603424, + "grad_norm": 6.28125, "learning_rate": 3.271555555555555e-05, - "loss": 0.6324, + "loss": 0.8382, "step": 17640 }, { - "epoch": 2.124969901276186, - "grad_norm": 7.125, + "epoch": 12.589158345221113, + "grad_norm": 8.875, "learning_rate": 3.2671111111111115e-05, - "loss": 0.5301, + "loss": 0.76, "step": 17650 }, { - "epoch": 2.12617385022875, - "grad_norm": 7.90625, + "epoch": 12.596291012838801, + "grad_norm": 7.75, "learning_rate": 3.2626666666666664e-05, - "loss": 0.5618, + "loss": 0.794, "step": 17660 }, { - "epoch": 2.1273777991813145, - "grad_norm": 7.84375, + "epoch": 12.60342368045649, + "grad_norm": 7.15625, "learning_rate": 3.2582222222222226e-05, - "loss": 0.57, + "loss": 0.7451, "step": 17670 }, { - "epoch": 2.128581748133879, - "grad_norm": 10.6875, + "epoch": 12.61055634807418, + "grad_norm": 6.75, "learning_rate": 3.253777777777778e-05, - "loss": 0.5702, + "loss": 0.7608, "step": 17680 }, { - "epoch": 2.1297856970864437, - "grad_norm": 6.625, + "epoch": 12.61768901569187, + "grad_norm": 8.8125, "learning_rate": 3.2493333333333336e-05, - "loss": 0.4928, + "loss": 0.8062, "step": 17690 }, { - "epoch": 2.130989646039008, - "grad_norm": 8.5, + "epoch": 12.624821683309557, + "grad_norm": 9.5625, "learning_rate": 3.244888888888889e-05, - "loss": 0.5074, + "loss": 0.6761, "step": 17700 }, { - "epoch": 2.130989646039008, - "eval/acc": 39.53488540649414, + "epoch": 12.624821683309557, + "eval/acc": 37.20930099487305, "step": 17700 }, { - "epoch": 2.130989646039008, - "eval_loss": 2.114666700363159, - "eval_runtime": 0.2147, - "eval_samples_per_second": 200.316, - "eval_steps_per_second": 4.659, + "epoch": 12.624821683309557, + "eval_loss": 3.029242515563965, + "eval_runtime": 0.2291, + "eval_samples_per_second": 187.698, + "eval_steps_per_second": 4.365, "step": 17700 }, { - "epoch": 2.1321935949915725, - "grad_norm": 9.6875, + "epoch": 12.631954350927247, + "grad_norm": 24.125, "learning_rate": 3.240444444444445e-05, - "loss": 0.6014, + "loss": 0.6484, "step": 17710 }, { - "epoch": 2.133397543944137, - "grad_norm": 7.1875, + "epoch": 12.639087018544936, + "grad_norm": 5.125, "learning_rate": 3.236e-05, - "loss": 0.5963, + "loss": 0.716, "step": 17720 }, { - "epoch": 2.134601492896701, - "grad_norm": 7.65625, + "epoch": 12.646219686162624, + "grad_norm": 10.0625, "learning_rate": 3.231555555555556e-05, - "loss": 0.5963, + "loss": 0.7955, "step": 17730 }, { - "epoch": 2.1358054418492656, - "grad_norm": 13.75, + "epoch": 12.653352353780313, + "grad_norm": 10.6875, "learning_rate": 3.227111111111111e-05, - "loss": 0.6004, + "loss": 0.8146, "step": 17740 }, { - "epoch": 2.13700939080183, - "grad_norm": 6.9375, + "epoch": 12.660485021398003, + "grad_norm": 8.0, "learning_rate": 3.222666666666667e-05, - "loss": 0.5469, + "loss": 0.7711, "step": 17750 }, { - "epoch": 2.1382133397543943, - "grad_norm": 15.125, + "epoch": 12.667617689015692, + "grad_norm": 8.0, "learning_rate": 3.218222222222222e-05, - "loss": 0.53, + "loss": 0.8463, "step": 17760 }, { - "epoch": 2.1394172887069587, - "grad_norm": 6.5, + "epoch": 12.67475035663338, + "grad_norm": 9.0, "learning_rate": 3.213777777777778e-05, - "loss": 0.6509, + "loss": 0.8483, "step": 17770 }, { - "epoch": 2.140621237659523, - "grad_norm": 8.5, + "epoch": 12.68188302425107, + "grad_norm": 7.78125, "learning_rate": 3.209333333333333e-05, - "loss": 0.5594, + "loss": 0.8471, "step": 17780 }, { - "epoch": 2.1418251866120874, - "grad_norm": 15.375, + "epoch": 12.689015691868759, + "grad_norm": 7.9375, "learning_rate": 3.204888888888889e-05, - "loss": 0.611, + "loss": 0.7851, "step": 17790 }, { - "epoch": 2.1430291355646522, - "grad_norm": 7.96875, + "epoch": 12.696148359486449, + "grad_norm": 7.59375, "learning_rate": 3.200444444444445e-05, - "loss": 0.5779, + "loss": 0.7442, "step": 17800 }, { - "epoch": 2.1430291355646522, - "eval/acc": 39.53488540649414, + "epoch": 12.696148359486449, + "eval/acc": 37.20930099487305, "step": 17800 }, { - "epoch": 2.1430291355646522, - "eval_loss": 2.0950489044189453, - "eval_runtime": 0.2125, - "eval_samples_per_second": 202.365, - "eval_steps_per_second": 4.706, + "epoch": 12.696148359486449, + "eval_loss": 3.0417492389678955, + "eval_runtime": 0.2273, + "eval_samples_per_second": 189.151, + "eval_steps_per_second": 4.399, "step": 17800 }, { - "epoch": 2.1442330845172166, - "grad_norm": 6.90625, + "epoch": 12.703281027104136, + "grad_norm": 7.5, "learning_rate": 3.196e-05, - "loss": 0.5222, + "loss": 0.8628, "step": 17810 }, { - "epoch": 2.145437033469781, - "grad_norm": 12.0625, + "epoch": 12.710413694721826, + "grad_norm": 9.5, "learning_rate": 3.191555555555556e-05, - "loss": 0.6021, + "loss": 0.7934, "step": 17820 }, { - "epoch": 2.1466409824223454, - "grad_norm": 6.8125, + "epoch": 12.717546362339515, + "grad_norm": 7.15625, "learning_rate": 3.187111111111111e-05, - "loss": 0.5762, + "loss": 0.7312, "step": 17830 }, { - "epoch": 2.1478449313749097, - "grad_norm": 6.84375, + "epoch": 12.724679029957205, + "grad_norm": 8.375, "learning_rate": 3.1826666666666665e-05, - "loss": 0.5999, + "loss": 0.7718, "step": 17840 }, { - "epoch": 2.149048880327474, - "grad_norm": 9.75, + "epoch": 12.731811697574893, + "grad_norm": 13.75, "learning_rate": 3.178222222222223e-05, - "loss": 0.6187, + "loss": 0.7138, "step": 17850 }, { - "epoch": 2.1502528292800385, - "grad_norm": 7.0, + "epoch": 12.738944365192582, + "grad_norm": 7.5, "learning_rate": 3.1737777777777775e-05, - "loss": 0.5659, + "loss": 0.8093, "step": 17860 }, { - "epoch": 2.151456778232603, - "grad_norm": 9.8125, + "epoch": 12.746077032810271, + "grad_norm": 30.5, "learning_rate": 3.169333333333334e-05, - "loss": 0.4921, + "loss": 0.8289, "step": 17870 }, { - "epoch": 2.152660727185167, - "grad_norm": 7.96875, + "epoch": 12.75320970042796, + "grad_norm": 5.625, "learning_rate": 3.164888888888889e-05, - "loss": 0.5766, + "loss": 0.7022, "step": 17880 }, { - "epoch": 2.1538646761377316, - "grad_norm": 7.03125, + "epoch": 12.760342368045649, + "grad_norm": 6.40625, "learning_rate": 3.160444444444445e-05, - "loss": 0.6072, + "loss": 0.6812, "step": 17890 }, { - "epoch": 2.155068625090296, - "grad_norm": 9.625, + "epoch": 12.767475035663338, + "grad_norm": 9.0, "learning_rate": 3.156e-05, - "loss": 0.5249, + "loss": 0.717, "step": 17900 }, { - "epoch": 2.155068625090296, - "eval/acc": 38.372093200683594, + "epoch": 12.767475035663338, + "eval/acc": 37.20930099487305, "step": 17900 }, { - "epoch": 2.155068625090296, - "eval_loss": 2.1039204597473145, - "eval_runtime": 0.2141, - "eval_samples_per_second": 200.859, - "eval_steps_per_second": 4.671, + "epoch": 12.767475035663338, + "eval_loss": 3.0246851444244385, + "eval_runtime": 0.2524, + "eval_samples_per_second": 170.366, + "eval_steps_per_second": 3.962, "step": 17900 }, { - "epoch": 2.1562725740428608, - "grad_norm": 7.4375, + "epoch": 12.774607703281028, + "grad_norm": 8.25, "learning_rate": 3.151555555555556e-05, - "loss": 0.5919, + "loss": 0.7882, "step": 17910 }, { - "epoch": 2.157476522995425, - "grad_norm": 5.21875, + "epoch": 12.781740370898715, + "grad_norm": 7.125, "learning_rate": 3.147111111111111e-05, - "loss": 0.5935, + "loss": 0.7156, "step": 17920 }, { - "epoch": 2.1586804719479895, - "grad_norm": 5.75, + "epoch": 12.788873038516405, + "grad_norm": 7.46875, "learning_rate": 3.142666666666667e-05, - "loss": 0.5401, + "loss": 0.7049, "step": 17930 }, { - "epoch": 2.159884420900554, - "grad_norm": 10.9375, + "epoch": 12.796005706134094, + "grad_norm": 8.6875, "learning_rate": 3.1382222222222224e-05, - "loss": 0.5899, + "loss": 0.9339, "step": 17940 }, { - "epoch": 2.1610883698531183, - "grad_norm": 7.6875, + "epoch": 12.803138373751784, + "grad_norm": 7.3125, "learning_rate": 3.133777777777778e-05, - "loss": 0.6101, + "loss": 0.6531, "step": 17950 }, { - "epoch": 2.1622923188056826, - "grad_norm": 9.0, + "epoch": 12.810271041369472, + "grad_norm": 6.84375, "learning_rate": 3.1293333333333334e-05, - "loss": 0.6088, + "loss": 0.7576, "step": 17960 }, { - "epoch": 2.163496267758247, - "grad_norm": 9.875, + "epoch": 12.817403708987161, + "grad_norm": 7.25, "learning_rate": 3.124888888888889e-05, - "loss": 0.5768, + "loss": 0.7153, "step": 17970 }, { - "epoch": 2.1647002167108114, - "grad_norm": 6.5625, + "epoch": 12.82453637660485, + "grad_norm": 7.1875, "learning_rate": 3.1204444444444445e-05, - "loss": 0.6349, + "loss": 0.7138, "step": 17980 }, { - "epoch": 2.1659041656633757, - "grad_norm": 9.3125, + "epoch": 12.83166904422254, + "grad_norm": 8.25, "learning_rate": 3.116e-05, - "loss": 0.4973, + "loss": 0.738, "step": 17990 }, { - "epoch": 2.16710811461594, - "grad_norm": 6.28125, + "epoch": 12.838801711840228, + "grad_norm": 9.5, "learning_rate": 3.111555555555556e-05, - "loss": 0.6212, + "loss": 0.7525, "step": 18000 }, { - "epoch": 2.16710811461594, - "eval/acc": 39.53488540649414, + "epoch": 12.838801711840228, + "eval/acc": 37.20930099487305, "step": 18000 }, { - "epoch": 2.16710811461594, - "eval_loss": 2.1069729328155518, - "eval_runtime": 0.2112, - "eval_samples_per_second": 203.644, - "eval_steps_per_second": 4.736, + "epoch": 12.838801711840228, + "eval_loss": 3.0545501708984375, + "eval_runtime": 0.2299, + "eval_samples_per_second": 187.055, + "eval_steps_per_second": 4.35, "step": 18000 }, { - "epoch": 2.1683120635685045, - "grad_norm": 10.3125, + "epoch": 12.845934379457917, + "grad_norm": 7.4375, "learning_rate": 3.107111111111111e-05, - "loss": 0.5577, + "loss": 0.8798, "step": 18010 }, { - "epoch": 2.1695160125210693, - "grad_norm": 7.75, + "epoch": 12.853067047075607, + "grad_norm": 8.625, "learning_rate": 3.102666666666667e-05, - "loss": 0.5199, + "loss": 0.9301, "step": 18020 }, { - "epoch": 2.1707199614736337, - "grad_norm": 9.4375, + "epoch": 12.860199714693294, + "grad_norm": 8.0625, "learning_rate": 3.098222222222222e-05, - "loss": 0.6411, + "loss": 0.808, "step": 18030 }, { - "epoch": 2.171923910426198, - "grad_norm": 9.0625, + "epoch": 12.867332382310984, + "grad_norm": 6.75, "learning_rate": 3.0937777777777776e-05, - "loss": 0.574, + "loss": 0.848, "step": 18040 }, { - "epoch": 2.1731278593787624, - "grad_norm": 9.125, + "epoch": 12.874465049928673, + "grad_norm": 6.1875, "learning_rate": 3.089333333333334e-05, - "loss": 0.5277, + "loss": 0.667, "step": 18050 }, { - "epoch": 2.1743318083313268, - "grad_norm": 23.875, + "epoch": 12.881597717546363, + "grad_norm": 17.125, "learning_rate": 3.0848888888888886e-05, - "loss": 0.5406, + "loss": 0.778, "step": 18060 }, { - "epoch": 2.175535757283891, - "grad_norm": 9.0, + "epoch": 12.88873038516405, + "grad_norm": 7.28125, "learning_rate": 3.080444444444445e-05, - "loss": 0.5573, + "loss": 0.7401, "step": 18070 }, { - "epoch": 2.1767397062364555, - "grad_norm": 8.1875, + "epoch": 12.89586305278174, + "grad_norm": 10.25, "learning_rate": 3.076e-05, - "loss": 0.5394, + "loss": 0.8231, "step": 18080 }, { - "epoch": 2.17794365518902, - "grad_norm": 9.3125, + "epoch": 12.90299572039943, + "grad_norm": 6.9375, "learning_rate": 3.071555555555556e-05, - "loss": 0.6329, + "loss": 0.8827, "step": 18090 }, { - "epoch": 2.1791476041415843, - "grad_norm": 17.875, + "epoch": 12.91012838801712, + "grad_norm": 7.4375, "learning_rate": 3.0671111111111114e-05, - "loss": 0.527, + "loss": 0.7568, "step": 18100 }, { - "epoch": 2.1791476041415843, - "eval/acc": 41.86046600341797, + "epoch": 12.91012838801712, + "eval/acc": 37.20930099487305, "step": 18100 }, { - "epoch": 2.1791476041415843, - "eval_loss": 2.1123554706573486, - "eval_runtime": 0.2126, - "eval_samples_per_second": 202.299, - "eval_steps_per_second": 4.705, + "epoch": 12.91012838801712, + "eval_loss": 3.0568392276763916, + "eval_runtime": 0.2321, + "eval_samples_per_second": 185.239, + "eval_steps_per_second": 4.308, "step": 18100 }, { - "epoch": 2.1803515530941486, - "grad_norm": 7.34375, + "epoch": 12.917261055634807, + "grad_norm": 8.5, "learning_rate": 3.062666666666667e-05, - "loss": 0.5937, + "loss": 0.7822, "step": 18110 }, { - "epoch": 2.181555502046713, - "grad_norm": 3.9375, + "epoch": 12.924393723252496, + "grad_norm": 7.375, "learning_rate": 3.0582222222222225e-05, - "loss": 0.6157, + "loss": 0.8247, "step": 18120 }, { - "epoch": 2.182759450999278, - "grad_norm": 6.4375, + "epoch": 12.931526390870186, + "grad_norm": 8.8125, "learning_rate": 3.053777777777778e-05, - "loss": 0.5808, + "loss": 0.8443, "step": 18130 }, { - "epoch": 2.183963399951842, - "grad_norm": 7.90625, + "epoch": 12.938659058487875, + "grad_norm": 6.28125, "learning_rate": 3.0493333333333335e-05, - "loss": 0.5375, + "loss": 0.803, "step": 18140 }, { - "epoch": 2.1851673489044066, - "grad_norm": 6.09375, + "epoch": 12.945791726105563, + "grad_norm": 38.75, "learning_rate": 3.0448888888888887e-05, - "loss": 0.5572, + "loss": 0.7168, "step": 18150 }, { - "epoch": 2.186371297856971, - "grad_norm": 9.875, + "epoch": 12.952924393723253, + "grad_norm": 8.0, "learning_rate": 3.0404444444444445e-05, - "loss": 0.5788, + "loss": 0.8827, "step": 18160 }, { - "epoch": 2.1875752468095353, - "grad_norm": 7.53125, + "epoch": 12.960057061340942, + "grad_norm": 17.75, "learning_rate": 3.036e-05, - "loss": 0.7093, + "loss": 0.822, "step": 18170 }, { - "epoch": 2.1887791957620997, - "grad_norm": 8.375, + "epoch": 12.96718972895863, + "grad_norm": 7.1875, "learning_rate": 3.031555555555556e-05, - "loss": 0.513, + "loss": 0.7958, "step": 18180 }, { - "epoch": 2.189983144714664, - "grad_norm": 12.125, + "epoch": 12.97432239657632, + "grad_norm": 6.125, "learning_rate": 3.027111111111111e-05, - "loss": 0.5108, + "loss": 0.676, "step": 18190 }, { - "epoch": 2.1911870936672284, - "grad_norm": 8.5, + "epoch": 12.981455064194009, + "grad_norm": 7.8125, "learning_rate": 3.022666666666667e-05, - "loss": 0.5063, + "loss": 0.8696, "step": 18200 }, { - "epoch": 2.1911870936672284, - "eval/acc": 39.53488540649414, + "epoch": 12.981455064194009, + "eval/acc": 37.20930099487305, "step": 18200 }, { - "epoch": 2.1911870936672284, - "eval_loss": 2.0905356407165527, - "eval_runtime": 0.2118, - "eval_samples_per_second": 202.993, - "eval_steps_per_second": 4.721, + "epoch": 12.981455064194009, + "eval_loss": 3.040698528289795, + "eval_runtime": 0.2626, + "eval_samples_per_second": 163.753, + "eval_steps_per_second": 3.808, "step": 18200 }, { - "epoch": 2.192391042619793, - "grad_norm": 8.0, + "epoch": 12.988587731811698, + "grad_norm": 11.1875, "learning_rate": 3.018222222222222e-05, - "loss": 0.6047, + "loss": 0.7538, "step": 18210 }, { - "epoch": 2.193594991572357, - "grad_norm": 7.53125, + "epoch": 12.995720399429386, + "grad_norm": 12.0, "learning_rate": 3.013777777777778e-05, - "loss": 0.5624, + "loss": 0.7873, "step": 18220 }, { - "epoch": 2.1947989405249215, - "grad_norm": 5.15625, + "epoch": 13.002853067047075, + "grad_norm": 8.4375, "learning_rate": 3.0093333333333335e-05, - "loss": 0.4844, + "loss": 0.7741, "step": 18230 }, { - "epoch": 2.1960028894774863, - "grad_norm": 7.53125, + "epoch": 13.009985734664765, + "grad_norm": 7.5, "learning_rate": 3.0048888888888894e-05, - "loss": 0.5534, + "loss": 0.7644, "step": 18240 }, { - "epoch": 2.1972068384300507, - "grad_norm": 6.75, + "epoch": 13.017118402282454, + "grad_norm": 6.84375, "learning_rate": 3.0004444444444446e-05, - "loss": 0.5302, + "loss": 0.769, "step": 18250 }, { - "epoch": 2.198410787382615, - "grad_norm": 8.5625, + "epoch": 13.024251069900142, + "grad_norm": 6.375, "learning_rate": 2.9959999999999998e-05, - "loss": 0.5995, + "loss": 0.7497, "step": 18260 }, { - "epoch": 2.1996147363351795, - "grad_norm": 7.28125, + "epoch": 13.031383737517832, + "grad_norm": 7.84375, "learning_rate": 2.9915555555555556e-05, - "loss": 0.5813, + "loss": 0.7643, "step": 18270 }, { - "epoch": 2.200818685287744, - "grad_norm": 6.28125, + "epoch": 13.038516405135521, + "grad_norm": 7.40625, "learning_rate": 2.987111111111111e-05, - "loss": 0.5663, + "loss": 0.7955, "step": 18280 }, { - "epoch": 2.202022634240308, - "grad_norm": 8.8125, + "epoch": 13.045649072753209, + "grad_norm": 14.125, "learning_rate": 2.982666666666667e-05, - "loss": 0.6051, + "loss": 0.7396, "step": 18290 }, { - "epoch": 2.2032265831928726, - "grad_norm": 7.15625, + "epoch": 13.052781740370898, + "grad_norm": 6.6875, "learning_rate": 2.9782222222222222e-05, - "loss": 0.5273, + "loss": 0.7616, "step": 18300 }, { - "epoch": 2.2032265831928726, - "eval/acc": 39.53488540649414, + "epoch": 13.052781740370898, + "eval/acc": 44.1860466003418, "step": 18300 }, { - "epoch": 2.2032265831928726, - "eval_loss": 2.106367588043213, - "eval_runtime": 0.2176, - "eval_samples_per_second": 197.621, - "eval_steps_per_second": 4.596, + "epoch": 13.052781740370898, + "eval_loss": 2.3592934608459473, + "eval_runtime": 4.7062, + "eval_samples_per_second": 9.137, + "eval_steps_per_second": 0.212, "step": 18300 }, { - "epoch": 2.204430532145437, - "grad_norm": 8.625, + "epoch": 13.059914407988588, + "grad_norm": 6.875, "learning_rate": 2.973777777777778e-05, - "loss": 0.5102, + "loss": 0.7691, "step": 18310 }, { - "epoch": 2.2056344810980013, - "grad_norm": 9.0, + "epoch": 13.067047075606277, + "grad_norm": 12.4375, "learning_rate": 2.9693333333333333e-05, - "loss": 0.6453, + "loss": 0.8092, "step": 18320 }, { - "epoch": 2.2068384300505657, - "grad_norm": 6.15625, + "epoch": 13.074179743223965, + "grad_norm": 7.65625, "learning_rate": 2.964888888888889e-05, - "loss": 0.5051, + "loss": 0.7653, "step": 18330 }, { - "epoch": 2.20804237900313, - "grad_norm": 7.875, + "epoch": 13.081312410841655, + "grad_norm": 9.9375, "learning_rate": 2.9604444444444446e-05, - "loss": 0.5706, + "loss": 0.7936, "step": 18340 }, { - "epoch": 2.209246327955695, - "grad_norm": 7.84375, + "epoch": 13.088445078459344, + "grad_norm": 11.0625, "learning_rate": 2.9559999999999998e-05, - "loss": 0.5046, + "loss": 0.7666, "step": 18350 }, { - "epoch": 2.2104502769082592, - "grad_norm": 7.34375, + "epoch": 13.095577746077034, + "grad_norm": 12.75, "learning_rate": 2.9515555555555557e-05, - "loss": 0.5694, + "loss": 0.874, "step": 18360 }, { - "epoch": 2.2116542258608236, - "grad_norm": 10.625, + "epoch": 13.102710413694721, + "grad_norm": 6.6875, "learning_rate": 2.9471111111111112e-05, - "loss": 0.5679, + "loss": 0.6979, "step": 18370 }, { - "epoch": 2.212858174813388, - "grad_norm": 6.90625, + "epoch": 13.10984308131241, + "grad_norm": 7.6875, "learning_rate": 2.942666666666667e-05, - "loss": 0.6488, + "loss": 0.8215, "step": 18380 }, { - "epoch": 2.2140621237659524, - "grad_norm": 11.625, + "epoch": 13.1169757489301, + "grad_norm": 6.75, "learning_rate": 2.9382222222222222e-05, - "loss": 0.6556, + "loss": 0.6941, "step": 18390 }, { - "epoch": 2.2152660727185167, - "grad_norm": 6.96875, + "epoch": 13.12410841654779, + "grad_norm": 8.5625, "learning_rate": 2.933777777777778e-05, - "loss": 0.4951, + "loss": 0.7365, "step": 18400 }, { - "epoch": 2.2152660727185167, - "eval/acc": 39.53488540649414, + "epoch": 13.12410841654779, + "eval/acc": 44.1860466003418, "step": 18400 }, { - "epoch": 2.2152660727185167, - "eval_loss": 2.115079402923584, - "eval_runtime": 0.2123, - "eval_samples_per_second": 202.524, - "eval_steps_per_second": 4.71, + "epoch": 13.12410841654779, + "eval_loss": 2.368854284286499, + "eval_runtime": 7.2366, + "eval_samples_per_second": 5.942, + "eval_steps_per_second": 0.138, "step": 18400 }, { - "epoch": 2.216470021671081, - "grad_norm": 6.9375, + "epoch": 13.131241084165477, + "grad_norm": 9.0625, "learning_rate": 2.9293333333333333e-05, - "loss": 0.5073, + "loss": 0.7201, "step": 18410 }, { - "epoch": 2.2176739706236455, - "grad_norm": 9.8125, + "epoch": 13.138373751783167, + "grad_norm": 8.4375, "learning_rate": 2.924888888888889e-05, - "loss": 0.6202, + "loss": 0.9061, "step": 18420 }, { - "epoch": 2.21887791957621, - "grad_norm": 7.40625, + "epoch": 13.145506419400856, + "grad_norm": 5.875, "learning_rate": 2.9204444444444447e-05, - "loss": 0.6417, + "loss": 0.7767, "step": 18430 }, { - "epoch": 2.220081868528774, - "grad_norm": 7.0, + "epoch": 13.152639087018544, + "grad_norm": 6.53125, "learning_rate": 2.9160000000000005e-05, - "loss": 0.5751, + "loss": 0.8086, "step": 18440 }, { - "epoch": 2.2212858174813386, - "grad_norm": 16.375, + "epoch": 13.159771754636234, + "grad_norm": 8.1875, "learning_rate": 2.9115555555555557e-05, - "loss": 0.6271, + "loss": 0.7938, "step": 18450 }, { - "epoch": 2.2224897664339034, - "grad_norm": 8.875, + "epoch": 13.166904422253923, + "grad_norm": 7.84375, "learning_rate": 2.907111111111111e-05, - "loss": 0.5437, + "loss": 0.8435, "step": 18460 }, { - "epoch": 2.2236937153864678, - "grad_norm": 7.75, + "epoch": 13.174037089871613, + "grad_norm": 8.3125, "learning_rate": 2.9026666666666668e-05, - "loss": 0.5829, + "loss": 0.7333, "step": 18470 }, { - "epoch": 2.224897664339032, + "epoch": 13.1811697574893, "grad_norm": 8.1875, "learning_rate": 2.8982222222222223e-05, - "loss": 0.5081, + "loss": 0.7546, "step": 18480 }, { - "epoch": 2.2261016132915965, - "grad_norm": 9.125, + "epoch": 13.18830242510699, + "grad_norm": 7.09375, "learning_rate": 2.893777777777778e-05, - "loss": 0.5637, + "loss": 0.7321, "step": 18490 }, { - "epoch": 2.227305562244161, - "grad_norm": 8.4375, + "epoch": 13.19543509272468, + "grad_norm": 9.375, "learning_rate": 2.8893333333333333e-05, - "loss": 0.5615, + "loss": 0.8419, "step": 18500 }, { - "epoch": 2.227305562244161, - "eval/acc": 41.27906799316406, + "epoch": 13.19543509272468, + "eval/acc": 44.1860466003418, "step": 18500 }, { - "epoch": 2.227305562244161, - "eval_loss": 2.0838871002197266, - "eval_runtime": 0.2124, - "eval_samples_per_second": 202.487, - "eval_steps_per_second": 4.709, + "epoch": 13.19543509272468, + "eval_loss": 2.343879461288452, + "eval_runtime": 0.2875, + "eval_samples_per_second": 149.591, + "eval_steps_per_second": 3.479, "step": 18500 }, { - "epoch": 2.2285095111967252, - "grad_norm": 7.875, + "epoch": 13.202567760342369, + "grad_norm": 9.8125, "learning_rate": 2.8848888888888892e-05, - "loss": 0.5499, + "loss": 0.7606, "step": 18510 }, { - "epoch": 2.2297134601492896, - "grad_norm": 9.6875, + "epoch": 13.209700427960057, + "grad_norm": 6.125, "learning_rate": 2.8804444444444444e-05, - "loss": 0.5668, + "loss": 0.7617, "step": 18520 }, { - "epoch": 2.230917409101854, - "grad_norm": 6.21875, + "epoch": 13.216833095577746, + "grad_norm": 6.4375, "learning_rate": 2.8760000000000002e-05, - "loss": 0.4969, + "loss": 0.6793, "step": 18530 }, { - "epoch": 2.2321213580544184, - "grad_norm": 8.125, + "epoch": 13.223965763195435, + "grad_norm": 8.5625, "learning_rate": 2.8715555555555558e-05, - "loss": 0.6084, + "loss": 0.7353, "step": 18540 }, { - "epoch": 2.2333253070069827, - "grad_norm": 6.21875, + "epoch": 13.231098430813125, + "grad_norm": 6.53125, "learning_rate": 2.8671111111111116e-05, - "loss": 0.5255, + "loss": 0.7745, "step": 18550 }, { - "epoch": 2.234529255959547, - "grad_norm": 5.71875, + "epoch": 13.238231098430813, + "grad_norm": 10.3125, "learning_rate": 2.8626666666666668e-05, - "loss": 0.5353, + "loss": 0.7891, "step": 18560 }, { - "epoch": 2.235733204912112, - "grad_norm": 26.875, + "epoch": 13.245363766048502, + "grad_norm": 6.03125, "learning_rate": 2.858222222222222e-05, - "loss": 0.6619, + "loss": 0.7913, "step": 18570 }, { - "epoch": 2.2369371538646763, - "grad_norm": 8.75, + "epoch": 13.252496433666192, + "grad_norm": 6.5, "learning_rate": 2.853777777777778e-05, - "loss": 0.5125, + "loss": 0.7704, "step": 18580 }, { - "epoch": 2.2381411028172407, - "grad_norm": 6.4375, + "epoch": 13.25962910128388, + "grad_norm": 11.5, "learning_rate": 2.8493333333333334e-05, - "loss": 0.576, + "loss": 0.7956, "step": 18590 }, { - "epoch": 2.239345051769805, - "grad_norm": 6.875, + "epoch": 13.266761768901569, + "grad_norm": 7.34375, "learning_rate": 2.8448888888888892e-05, - "loss": 0.5807, + "loss": 0.7904, "step": 18600 }, { - "epoch": 2.239345051769805, + "epoch": 13.266761768901569, "eval/acc": 39.53488540649414, "step": 18600 }, { - "epoch": 2.239345051769805, - "eval_loss": 2.097263813018799, - "eval_runtime": 0.2153, - "eval_samples_per_second": 199.684, - "eval_steps_per_second": 4.644, + "epoch": 13.266761768901569, + "eval_loss": 2.3613929748535156, + "eval_runtime": 0.2243, + "eval_samples_per_second": 191.727, + "eval_steps_per_second": 4.459, "step": 18600 }, { - "epoch": 2.2405490007223694, - "grad_norm": 8.5625, + "epoch": 13.273894436519258, + "grad_norm": 7.46875, "learning_rate": 2.8404444444444444e-05, - "loss": 0.5527, + "loss": 0.7528, "step": 18610 }, { - "epoch": 2.2417529496749338, - "grad_norm": 5.53125, + "epoch": 13.281027104136948, + "grad_norm": 8.0625, "learning_rate": 2.8360000000000003e-05, - "loss": 0.4939, + "loss": 0.7475, "step": 18620 }, { - "epoch": 2.242956898627498, - "grad_norm": 8.3125, + "epoch": 13.288159771754636, + "grad_norm": 9.9375, "learning_rate": 2.8315555555555555e-05, - "loss": 0.5648, + "loss": 0.7382, "step": 18630 }, { - "epoch": 2.2441608475800625, - "grad_norm": 5.71875, + "epoch": 13.295292439372325, + "grad_norm": 7.28125, "learning_rate": 2.8271111111111113e-05, - "loss": 0.5207, + "loss": 0.8196, "step": 18640 }, { - "epoch": 2.245364796532627, - "grad_norm": 12.9375, + "epoch": 13.302425106990015, + "grad_norm": 8.5, "learning_rate": 2.822666666666667e-05, - "loss": 0.5526, + "loss": 0.9212, "step": 18650 }, { - "epoch": 2.2465687454851913, - "grad_norm": 5.75, + "epoch": 13.309557774607704, + "grad_norm": 7.71875, "learning_rate": 2.818222222222222e-05, - "loss": 0.5586, + "loss": 0.7357, "step": 18660 }, { - "epoch": 2.2477726944377556, - "grad_norm": 13.5625, + "epoch": 13.316690442225392, + "grad_norm": 6.5, "learning_rate": 2.813777777777778e-05, - "loss": 0.5714, + "loss": 0.7228, "step": 18670 }, { - "epoch": 2.2489766433903204, - "grad_norm": 7.65625, + "epoch": 13.323823109843081, + "grad_norm": 6.5625, "learning_rate": 2.8093333333333334e-05, - "loss": 0.5244, + "loss": 0.8229, "step": 18680 }, { - "epoch": 2.250180592342885, - "grad_norm": 10.125, + "epoch": 13.33095577746077, + "grad_norm": 18.25, "learning_rate": 2.8048888888888893e-05, - "loss": 0.6128, + "loss": 0.8386, "step": 18690 }, { - "epoch": 2.251384541295449, - "grad_norm": 7.65625, + "epoch": 13.338088445078458, + "grad_norm": 11.0625, "learning_rate": 2.8004444444444445e-05, - "loss": 0.5419, + "loss": 0.7779, "step": 18700 }, { - "epoch": 2.251384541295449, - "eval/acc": 40.11627960205078, + "epoch": 13.338088445078458, + "eval/acc": 39.53488540649414, "step": 18700 }, { - "epoch": 2.251384541295449, - "eval_loss": 2.088261127471924, - "eval_runtime": 0.2127, - "eval_samples_per_second": 202.124, - "eval_steps_per_second": 4.701, + "epoch": 13.338088445078458, + "eval_loss": 2.363105535507202, + "eval_runtime": 0.2271, + "eval_samples_per_second": 189.351, + "eval_steps_per_second": 4.404, "step": 18700 }, { - "epoch": 2.2525884902480136, - "grad_norm": 8.25, + "epoch": 13.345221112696148, + "grad_norm": 6.75, "learning_rate": 2.7960000000000003e-05, - "loss": 0.5735, + "loss": 0.7851, "step": 18710 }, { - "epoch": 2.253792439200578, - "grad_norm": 20.25, + "epoch": 13.352353780313837, + "grad_norm": 7.28125, "learning_rate": 2.7915555555555555e-05, - "loss": 0.5479, + "loss": 0.7634, "step": 18720 }, { - "epoch": 2.2549963881531423, - "grad_norm": 9.0, + "epoch": 13.359486447931527, + "grad_norm": 7.0, "learning_rate": 2.7871111111111114e-05, - "loss": 0.549, + "loss": 0.8063, "step": 18730 }, { - "epoch": 2.2562003371057067, - "grad_norm": 10.25, + "epoch": 13.366619115549215, + "grad_norm": 7.15625, "learning_rate": 2.782666666666667e-05, - "loss": 0.637, + "loss": 0.7746, "step": 18740 }, { - "epoch": 2.257404286058271, - "grad_norm": 7.5625, + "epoch": 13.373751783166904, + "grad_norm": 7.75, "learning_rate": 2.7782222222222228e-05, - "loss": 0.5333, + "loss": 0.7927, "step": 18750 }, { - "epoch": 2.2586082350108354, - "grad_norm": 8.5625, + "epoch": 13.380884450784594, + "grad_norm": 10.4375, "learning_rate": 2.773777777777778e-05, - "loss": 0.5804, + "loss": 0.7496, "step": 18760 }, { - "epoch": 2.2598121839634, - "grad_norm": 8.0, + "epoch": 13.388017118402283, + "grad_norm": 13.0625, "learning_rate": 2.769333333333333e-05, - "loss": 0.4839, + "loss": 0.7797, "step": 18770 }, { - "epoch": 2.261016132915964, - "grad_norm": 6.53125, + "epoch": 13.39514978601997, + "grad_norm": 7.15625, "learning_rate": 2.764888888888889e-05, - "loss": 0.5463, + "loss": 0.7104, "step": 18780 }, { - "epoch": 2.262220081868529, - "grad_norm": 7.03125, + "epoch": 13.40228245363766, + "grad_norm": 123.0, "learning_rate": 2.7604444444444445e-05, - "loss": 0.6316, + "loss": 0.7854, "step": 18790 }, { - "epoch": 2.2634240308210933, - "grad_norm": 10.0625, + "epoch": 13.40941512125535, + "grad_norm": 8.9375, "learning_rate": 2.7560000000000004e-05, - "loss": 0.5341, + "loss": 0.7275, "step": 18800 }, { - "epoch": 2.2634240308210933, - "eval/acc": 40.11627960205078, + "epoch": 13.40941512125535, + "eval/acc": 46.511627197265625, "step": 18800 }, { - "epoch": 2.2634240308210933, - "eval_loss": 2.0978806018829346, - "eval_runtime": 0.2109, - "eval_samples_per_second": 203.871, - "eval_steps_per_second": 4.741, + "epoch": 13.40941512125535, + "eval_loss": 2.3632330894470215, + "eval_runtime": 0.223, + "eval_samples_per_second": 192.833, + "eval_steps_per_second": 4.484, "step": 18800 }, { - "epoch": 2.2646279797736577, - "grad_norm": 6.1875, + "epoch": 13.41654778887304, + "grad_norm": 8.0625, "learning_rate": 2.7515555555555556e-05, - "loss": 0.6241, + "loss": 0.7429, "step": 18810 }, { - "epoch": 2.265831928726222, - "grad_norm": 9.375, + "epoch": 13.423680456490727, + "grad_norm": 11.0, "learning_rate": 2.7471111111111114e-05, - "loss": 0.5504, + "loss": 0.7845, "step": 18820 }, { - "epoch": 2.2670358776787864, - "grad_norm": 6.96875, + "epoch": 13.430813124108417, + "grad_norm": 6.6875, "learning_rate": 2.7426666666666666e-05, - "loss": 0.5541, + "loss": 0.6946, "step": 18830 }, { - "epoch": 2.268239826631351, - "grad_norm": 8.875, + "epoch": 13.437945791726106, + "grad_norm": 6.875, "learning_rate": 2.7382222222222225e-05, - "loss": 0.5631, + "loss": 0.8386, "step": 18840 }, { - "epoch": 2.269443775583915, - "grad_norm": 7.25, + "epoch": 13.445078459343794, + "grad_norm": 10.375, "learning_rate": 2.733777777777778e-05, - "loss": 0.5903, + "loss": 0.7235, "step": 18850 }, { - "epoch": 2.2706477245364796, - "grad_norm": 7.625, + "epoch": 13.452211126961483, + "grad_norm": 7.90625, "learning_rate": 2.7293333333333332e-05, - "loss": 0.6442, + "loss": 0.7586, "step": 18860 }, { - "epoch": 2.271851673489044, - "grad_norm": 6.65625, + "epoch": 13.459343794579173, + "grad_norm": 7.34375, "learning_rate": 2.724888888888889e-05, - "loss": 0.5116, + "loss": 0.7546, "step": 18870 }, { - "epoch": 2.2730556224416083, - "grad_norm": 9.0625, + "epoch": 13.466476462196862, + "grad_norm": 5.9375, "learning_rate": 2.7204444444444442e-05, - "loss": 0.5303, + "loss": 0.7515, "step": 18880 }, { - "epoch": 2.2742595713941727, - "grad_norm": 8.4375, + "epoch": 13.47360912981455, + "grad_norm": 8.1875, "learning_rate": 2.716e-05, - "loss": 0.5667, + "loss": 0.7242, "step": 18890 }, { - "epoch": 2.2754635203467375, - "grad_norm": 8.75, + "epoch": 13.48074179743224, + "grad_norm": 6.53125, "learning_rate": 2.7115555555555556e-05, - "loss": 0.5621, + "loss": 0.7571, "step": 18900 }, { - "epoch": 2.2754635203467375, - "eval/acc": 38.953487396240234, + "epoch": 13.48074179743224, + "eval/acc": 44.1860466003418, "step": 18900 }, { - "epoch": 2.2754635203467375, - "eval_loss": 2.1006851196289062, - "eval_runtime": 0.2174, - "eval_samples_per_second": 197.827, - "eval_steps_per_second": 4.601, + "epoch": 13.48074179743224, + "eval_loss": 2.3733906745910645, + "eval_runtime": 0.2277, + "eval_samples_per_second": 188.866, + "eval_steps_per_second": 4.392, "step": 18900 }, { - "epoch": 2.276667469299302, - "grad_norm": 6.0625, + "epoch": 13.487874465049929, + "grad_norm": 8.125, "learning_rate": 2.7071111111111115e-05, - "loss": 0.5304, + "loss": 0.813, "step": 18910 }, { - "epoch": 2.2778714182518662, - "grad_norm": 8.25, + "epoch": 13.495007132667618, + "grad_norm": 9.4375, "learning_rate": 2.7026666666666667e-05, - "loss": 0.5421, + "loss": 0.7889, "step": 18920 }, { - "epoch": 2.2790753672044306, - "grad_norm": 5.34375, + "epoch": 13.502139800285306, + "grad_norm": 6.875, "learning_rate": 2.6982222222222225e-05, - "loss": 0.5085, + "loss": 0.6809, "step": 18930 }, { - "epoch": 2.280279316156995, - "grad_norm": 9.8125, + "epoch": 13.509272467902996, + "grad_norm": 17.875, "learning_rate": 2.6937777777777777e-05, - "loss": 0.4657, + "loss": 0.7368, "step": 18940 }, { - "epoch": 2.2814832651095593, - "grad_norm": 7.78125, + "epoch": 13.516405135520685, + "grad_norm": 7.3125, "learning_rate": 2.6893333333333336e-05, - "loss": 0.5507, + "loss": 0.7115, "step": 18950 }, { - "epoch": 2.2826872140621237, - "grad_norm": 6.40625, + "epoch": 13.523537803138375, + "grad_norm": 7.84375, "learning_rate": 2.684888888888889e-05, - "loss": 0.6328, + "loss": 0.6828, "step": 18960 }, { - "epoch": 2.283891163014688, - "grad_norm": 9.6875, + "epoch": 13.530670470756062, + "grad_norm": 8.4375, "learning_rate": 2.6804444444444443e-05, - "loss": 0.4663, + "loss": 0.7184, "step": 18970 }, { - "epoch": 2.2850951119672525, - "grad_norm": 6.84375, + "epoch": 13.537803138373752, + "grad_norm": 65.0, "learning_rate": 2.676e-05, - "loss": 0.5825, + "loss": 0.7808, "step": 18980 }, { - "epoch": 2.286299060919817, - "grad_norm": 7.25, + "epoch": 13.544935805991441, + "grad_norm": 8.75, "learning_rate": 2.6715555555555553e-05, - "loss": 0.5971, + "loss": 0.9181, "step": 18990 }, { - "epoch": 2.287503009872381, - "grad_norm": 8.875, + "epoch": 13.552068473609129, + "grad_norm": 9.3125, "learning_rate": 2.6671111111111112e-05, - "loss": 0.5891, + "loss": 0.7868, "step": 19000 }, { - "epoch": 2.287503009872381, - "eval/acc": 41.86046600341797, + "epoch": 13.552068473609129, + "eval/acc": 39.53488540649414, "step": 19000 }, { - "epoch": 2.287503009872381, - "eval_loss": 2.106297492980957, - "eval_runtime": 0.2175, - "eval_samples_per_second": 197.744, - "eval_steps_per_second": 4.599, + "epoch": 13.552068473609129, + "eval_loss": 2.3711330890655518, + "eval_runtime": 0.2397, + "eval_samples_per_second": 179.384, + "eval_steps_per_second": 4.172, "step": 19000 }, { - "epoch": 2.288706958824946, - "grad_norm": 6.25, + "epoch": 13.559201141226819, + "grad_norm": 23.625, "learning_rate": 2.6626666666666667e-05, - "loss": 0.5987, + "loss": 0.8008, "step": 19010 }, { - "epoch": 2.2899109077775104, - "grad_norm": 7.25, + "epoch": 13.566333808844508, + "grad_norm": 7.75, "learning_rate": 2.6582222222222226e-05, - "loss": 0.5449, + "loss": 0.7326, "step": 19020 }, { - "epoch": 2.2911148567300748, - "grad_norm": 9.125, + "epoch": 13.573466476462198, + "grad_norm": 9.375, "learning_rate": 2.6537777777777777e-05, - "loss": 0.52, + "loss": 0.7914, "step": 19030 }, { - "epoch": 2.292318805682639, - "grad_norm": 6.9375, + "epoch": 13.580599144079885, + "grad_norm": 5.875, "learning_rate": 2.6493333333333336e-05, - "loss": 0.5207, + "loss": 0.7849, "step": 19040 }, { - "epoch": 2.2935227546352035, - "grad_norm": 9.625, + "epoch": 13.587731811697575, + "grad_norm": 7.96875, "learning_rate": 2.644888888888889e-05, - "loss": 0.5534, + "loss": 0.7314, "step": 19050 }, { - "epoch": 2.294726703587768, - "grad_norm": 8.3125, + "epoch": 13.594864479315264, + "grad_norm": 7.90625, "learning_rate": 2.640444444444445e-05, - "loss": 0.6109, + "loss": 0.8637, "step": 19060 }, { - "epoch": 2.2959306525403322, - "grad_norm": 9.625, + "epoch": 13.601997146932954, + "grad_norm": 8.5625, "learning_rate": 2.6360000000000002e-05, - "loss": 0.6587, + "loss": 0.8337, "step": 19070 }, { - "epoch": 2.2971346014928966, - "grad_norm": 8.5625, + "epoch": 13.609129814550641, + "grad_norm": 8.75, "learning_rate": 2.6315555555555554e-05, - "loss": 0.6906, + "loss": 0.7362, "step": 19080 }, { - "epoch": 2.298338550445461, - "grad_norm": 6.375, + "epoch": 13.616262482168331, + "grad_norm": 7.3125, "learning_rate": 2.6271111111111112e-05, - "loss": 0.6105, + "loss": 0.7999, "step": 19090 }, { - "epoch": 2.2995424993980254, - "grad_norm": 7.40625, + "epoch": 13.62339514978602, + "grad_norm": 7.21875, "learning_rate": 2.6226666666666667e-05, - "loss": 0.5454, + "loss": 0.8105, "step": 19100 }, { - "epoch": 2.2995424993980254, - "eval/acc": 40.11627960205078, + "epoch": 13.62339514978602, + "eval/acc": 41.86046600341797, "step": 19100 }, { - "epoch": 2.2995424993980254, - "eval_loss": 2.120891571044922, - "eval_runtime": 0.2196, - "eval_samples_per_second": 195.827, - "eval_steps_per_second": 4.554, + "epoch": 13.62339514978602, + "eval_loss": 2.3755757808685303, + "eval_runtime": 0.2319, + "eval_samples_per_second": 185.454, + "eval_steps_per_second": 4.313, "step": 19100 }, { - "epoch": 2.3007464483505897, - "grad_norm": 8.75, + "epoch": 13.63052781740371, + "grad_norm": 30.5, "learning_rate": 2.6182222222222226e-05, - "loss": 0.6018, + "loss": 0.7981, "step": 19110 }, { - "epoch": 2.3019503973031545, - "grad_norm": 8.5625, + "epoch": 13.637660485021398, + "grad_norm": 7.0, "learning_rate": 2.6137777777777778e-05, - "loss": 0.6087, + "loss": 0.7317, "step": 19120 }, { - "epoch": 2.303154346255719, - "grad_norm": 8.375, + "epoch": 13.644793152639087, + "grad_norm": 7.40625, "learning_rate": 2.6093333333333336e-05, - "loss": 0.5712, + "loss": 0.808, "step": 19130 }, { - "epoch": 2.3043582952082833, - "grad_norm": 10.1875, + "epoch": 13.651925820256777, + "grad_norm": 8.5, "learning_rate": 2.604888888888889e-05, - "loss": 0.6849, + "loss": 0.8222, "step": 19140 }, { - "epoch": 2.3055622441608477, - "grad_norm": 5.625, + "epoch": 13.659058487874464, + "grad_norm": 8.625, "learning_rate": 2.6004444444444447e-05, - "loss": 0.5341, + "loss": 0.7333, "step": 19150 }, { - "epoch": 2.306766193113412, - "grad_norm": 5.65625, + "epoch": 13.666191155492154, + "grad_norm": 8.1875, "learning_rate": 2.5960000000000002e-05, - "loss": 0.5575, + "loss": 0.9, "step": 19160 }, { - "epoch": 2.3079701420659764, - "grad_norm": 7.4375, + "epoch": 13.673323823109843, + "grad_norm": 10.25, "learning_rate": 2.5915555555555554e-05, - "loss": 0.5442, + "loss": 0.7297, "step": 19170 }, { - "epoch": 2.3091740910185408, - "grad_norm": 13.4375, + "epoch": 13.680456490727533, + "grad_norm": 8.875, "learning_rate": 2.5871111111111113e-05, - "loss": 0.6246, + "loss": 0.8401, "step": 19180 }, { - "epoch": 2.310378039971105, - "grad_norm": 7.5, + "epoch": 13.68758915834522, + "grad_norm": 8.375, "learning_rate": 2.5826666666666664e-05, - "loss": 0.5112, + "loss": 0.803, "step": 19190 }, { - "epoch": 2.3115819889236695, - "grad_norm": 7.625, + "epoch": 13.69472182596291, + "grad_norm": 7.09375, "learning_rate": 2.5782222222222223e-05, - "loss": 0.5412, + "loss": 0.703, "step": 19200 }, { - "epoch": 2.3115819889236695, - "eval/acc": 39.53488540649414, + "epoch": 13.69472182596291, + "eval/acc": 41.86046600341797, "step": 19200 }, { - "epoch": 2.3115819889236695, - "eval_loss": 2.1130447387695312, - "eval_runtime": 0.3633, - "eval_samples_per_second": 118.375, - "eval_steps_per_second": 2.753, + "epoch": 13.69472182596291, + "eval_loss": 2.3849244117736816, + "eval_runtime": 0.2258, + "eval_samples_per_second": 190.453, + "eval_steps_per_second": 4.429, "step": 19200 }, { - "epoch": 2.312785937876234, - "grad_norm": 7.125, + "epoch": 13.7018544935806, + "grad_norm": 7.71875, "learning_rate": 2.573777777777778e-05, - "loss": 0.5188, + "loss": 0.7636, "step": 19210 }, { - "epoch": 2.3139898868287982, - "grad_norm": 7.84375, + "epoch": 13.708987161198289, + "grad_norm": 9.8125, "learning_rate": 2.5693333333333337e-05, - "loss": 0.5767, + "loss": 0.8003, "step": 19220 }, { - "epoch": 2.315193835781363, - "grad_norm": 7.71875, + "epoch": 13.716119828815977, + "grad_norm": 5.90625, "learning_rate": 2.564888888888889e-05, - "loss": 0.6596, + "loss": 0.7509, "step": 19230 }, { - "epoch": 2.3163977847339274, - "grad_norm": 9.6875, + "epoch": 13.723252496433666, + "grad_norm": 8.8125, "learning_rate": 2.5604444444444447e-05, - "loss": 0.6366, + "loss": 0.8054, "step": 19240 }, { - "epoch": 2.317601733686492, - "grad_norm": 6.21875, + "epoch": 13.730385164051356, + "grad_norm": 8.0, "learning_rate": 2.556e-05, - "loss": 0.5676, + "loss": 0.8303, "step": 19250 }, { - "epoch": 2.318805682639056, - "grad_norm": 6.125, + "epoch": 13.737517831669045, + "grad_norm": 15.6875, "learning_rate": 2.5515555555555558e-05, - "loss": 0.5419, + "loss": 0.8224, "step": 19260 }, { - "epoch": 2.3200096315916205, - "grad_norm": 8.125, + "epoch": 13.744650499286733, + "grad_norm": 17.25, "learning_rate": 2.5471111111111113e-05, - "loss": 0.5438, + "loss": 0.7978, "step": 19270 }, { - "epoch": 2.321213580544185, - "grad_norm": 8.5, + "epoch": 13.751783166904422, + "grad_norm": 9.625, "learning_rate": 2.5426666666666665e-05, - "loss": 0.5033, + "loss": 0.7964, "step": 19280 }, { - "epoch": 2.3224175294967493, - "grad_norm": 6.0, + "epoch": 13.758915834522112, + "grad_norm": 6.875, "learning_rate": 2.5382222222222224e-05, - "loss": 0.5529, + "loss": 0.8014, "step": 19290 }, { - "epoch": 2.3236214784493137, - "grad_norm": 7.34375, + "epoch": 13.7660485021398, + "grad_norm": 7.09375, "learning_rate": 2.5337777777777775e-05, - "loss": 0.5337, + "loss": 0.7804, "step": 19300 }, { - "epoch": 2.3236214784493137, - "eval/acc": 39.53488540649414, + "epoch": 13.7660485021398, + "eval/acc": 44.1860466003418, "step": 19300 }, { - "epoch": 2.3236214784493137, - "eval_loss": 2.0968053340911865, - "eval_runtime": 0.2285, - "eval_samples_per_second": 188.168, - "eval_steps_per_second": 4.376, + "epoch": 13.7660485021398, + "eval_loss": 2.3624305725097656, + "eval_runtime": 0.262, + "eval_samples_per_second": 164.141, + "eval_steps_per_second": 3.817, "step": 19300 }, { - "epoch": 2.324825427401878, - "grad_norm": 9.0625, + "epoch": 13.773181169757489, + "grad_norm": 6.46875, "learning_rate": 2.5293333333333334e-05, - "loss": 0.5345, + "loss": 0.7614, "step": 19310 }, { - "epoch": 2.3260293763544424, - "grad_norm": 7.125, + "epoch": 13.780313837375179, + "grad_norm": 7.8125, "learning_rate": 2.524888888888889e-05, - "loss": 0.5493, + "loss": 0.7323, "step": 19320 }, { - "epoch": 2.3272333253070068, - "grad_norm": 8.5625, + "epoch": 13.787446504992868, + "grad_norm": 6.375, "learning_rate": 2.5204444444444448e-05, - "loss": 0.5026, + "loss": 0.6908, "step": 19330 }, { - "epoch": 2.3284372742595716, - "grad_norm": 8.3125, + "epoch": 13.794579172610556, + "grad_norm": 10.4375, "learning_rate": 2.516e-05, - "loss": 0.6307, + "loss": 0.7742, "step": 19340 }, { - "epoch": 2.329641223212136, - "grad_norm": 7.84375, + "epoch": 13.801711840228245, + "grad_norm": 7.4375, "learning_rate": 2.5115555555555558e-05, - "loss": 0.5954, + "loss": 0.7686, "step": 19350 }, { - "epoch": 2.3308451721647003, - "grad_norm": 8.8125, + "epoch": 13.808844507845935, + "grad_norm": 8.875, "learning_rate": 2.5071111111111114e-05, - "loss": 0.5993, + "loss": 0.8334, "step": 19360 }, { - "epoch": 2.3320491211172647, - "grad_norm": 6.03125, + "epoch": 13.815977175463622, + "grad_norm": 7.5, "learning_rate": 2.5026666666666672e-05, - "loss": 0.5018, + "loss": 0.7758, "step": 19370 }, { - "epoch": 2.333253070069829, - "grad_norm": 8.0, + "epoch": 13.823109843081312, + "grad_norm": 7.53125, "learning_rate": 2.4982222222222224e-05, - "loss": 0.5332, + "loss": 0.7814, "step": 19380 }, { - "epoch": 2.3344570190223934, - "grad_norm": 7.84375, + "epoch": 13.830242510699001, + "grad_norm": 14.5625, "learning_rate": 2.493777777777778e-05, - "loss": 0.6697, + "loss": 0.8628, "step": 19390 }, { - "epoch": 2.335660967974958, - "grad_norm": 7.40625, + "epoch": 13.837375178316691, + "grad_norm": 6.5625, "learning_rate": 2.4893333333333334e-05, - "loss": 0.5305, + "loss": 0.7898, "step": 19400 }, { - "epoch": 2.335660967974958, - "eval/acc": 40.11627960205078, + "epoch": 13.837375178316691, + "eval/acc": 41.86046600341797, "step": 19400 }, { - "epoch": 2.335660967974958, - "eval_loss": 2.104180335998535, - "eval_runtime": 0.3958, - "eval_samples_per_second": 108.651, - "eval_steps_per_second": 2.527, + "epoch": 13.837375178316691, + "eval_loss": 2.359168767929077, + "eval_runtime": 0.2248, + "eval_samples_per_second": 191.308, + "eval_steps_per_second": 4.449, "step": 19400 }, { - "epoch": 2.336864916927522, - "grad_norm": 8.375, + "epoch": 13.844507845934379, + "grad_norm": 15.375, "learning_rate": 2.484888888888889e-05, - "loss": 0.5422, + "loss": 0.786, "step": 19410 }, { - "epoch": 2.3380688658800866, - "grad_norm": 16.875, + "epoch": 13.851640513552068, + "grad_norm": 5.59375, "learning_rate": 2.4804444444444448e-05, - "loss": 0.6836, + "loss": 0.8072, "step": 19420 }, { - "epoch": 2.339272814832651, - "grad_norm": 7.09375, + "epoch": 13.858773181169758, + "grad_norm": 8.1875, "learning_rate": 2.476e-05, - "loss": 0.6242, + "loss": 0.7777, "step": 19430 }, { - "epoch": 2.3404767637852153, - "grad_norm": 7.28125, + "epoch": 13.865905848787447, + "grad_norm": 8.8125, "learning_rate": 2.4715555555555555e-05, - "loss": 0.616, + "loss": 0.8148, "step": 19440 }, { - "epoch": 2.34168071273778, - "grad_norm": 7.9375, + "epoch": 13.873038516405135, + "grad_norm": 10.9375, "learning_rate": 2.467111111111111e-05, - "loss": 0.4962, + "loss": 0.7659, "step": 19450 }, { - "epoch": 2.3428846616903445, - "grad_norm": 6.4375, + "epoch": 13.880171184022824, + "grad_norm": 5.59375, "learning_rate": 2.4626666666666666e-05, - "loss": 0.4496, + "loss": 0.7819, "step": 19460 }, { - "epoch": 2.344088610642909, - "grad_norm": 8.6875, + "epoch": 13.887303851640514, + "grad_norm": 8.125, "learning_rate": 2.4582222222222224e-05, - "loss": 0.5492, + "loss": 0.8459, "step": 19470 }, { - "epoch": 2.3452925595954732, - "grad_norm": 40.5, + "epoch": 13.894436519258203, + "grad_norm": 7.1875, "learning_rate": 2.453777777777778e-05, - "loss": 0.6082, + "loss": 0.7448, "step": 19480 }, { - "epoch": 2.3464965085480376, - "grad_norm": 7.53125, + "epoch": 13.901569186875891, + "grad_norm": 7.6875, "learning_rate": 2.4493333333333335e-05, - "loss": 0.4591, + "loss": 0.8096, "step": 19490 }, { - "epoch": 2.347700457500602, - "grad_norm": 9.4375, + "epoch": 13.90870185449358, + "grad_norm": 12.0, "learning_rate": 2.444888888888889e-05, - "loss": 0.5899, + "loss": 0.7402, "step": 19500 }, { - "epoch": 2.347700457500602, + "epoch": 13.90870185449358, "eval/acc": 39.53488540649414, "step": 19500 }, { - "epoch": 2.347700457500602, - "eval_loss": 2.1160874366760254, - "eval_runtime": 4.1188, - "eval_samples_per_second": 10.44, - "eval_steps_per_second": 0.243, + "epoch": 13.90870185449358, + "eval_loss": 2.3664777278900146, + "eval_runtime": 0.2287, + "eval_samples_per_second": 188.004, + "eval_steps_per_second": 4.372, "step": 19500 }, { - "epoch": 2.3489044064531663, - "grad_norm": 7.625, + "epoch": 13.91583452211127, + "grad_norm": 8.5, "learning_rate": 2.4404444444444445e-05, - "loss": 0.5605, + "loss": 0.796, "step": 19510 }, { - "epoch": 2.3501083554057307, - "grad_norm": 7.59375, + "epoch": 13.922967189728958, + "grad_norm": 10.3125, "learning_rate": 2.4360000000000004e-05, - "loss": 0.5862, + "loss": 0.7331, "step": 19520 }, { - "epoch": 2.351312304358295, - "grad_norm": 8.0625, + "epoch": 13.930099857346647, + "grad_norm": 9.0, "learning_rate": 2.431555555555556e-05, - "loss": 0.583, + "loss": 0.6982, "step": 19530 }, { - "epoch": 2.3525162533108595, - "grad_norm": 7.09375, + "epoch": 13.937232524964337, + "grad_norm": 8.0625, "learning_rate": 2.427111111111111e-05, - "loss": 0.5206, + "loss": 0.7831, "step": 19540 }, { - "epoch": 2.353720202263424, - "grad_norm": 9.1875, + "epoch": 13.944365192582026, + "grad_norm": 8.1875, "learning_rate": 2.4226666666666666e-05, - "loss": 0.6646, + "loss": 0.7795, "step": 19550 }, { - "epoch": 2.3549241512159886, - "grad_norm": 9.5625, + "epoch": 13.951497860199714, + "grad_norm": 6.25, "learning_rate": 2.418222222222222e-05, - "loss": 0.5658, + "loss": 0.7416, "step": 19560 }, { - "epoch": 2.356128100168553, - "grad_norm": 6.5625, + "epoch": 13.958630527817403, + "grad_norm": 7.03125, "learning_rate": 2.413777777777778e-05, - "loss": 0.5273, + "loss": 0.7553, "step": 19570 }, { - "epoch": 2.3573320491211174, - "grad_norm": 6.15625, + "epoch": 13.965763195435093, + "grad_norm": 7.4375, "learning_rate": 2.4093333333333335e-05, - "loss": 0.647, + "loss": 0.7573, "step": 19580 }, { - "epoch": 2.3585359980736817, - "grad_norm": 7.9375, + "epoch": 13.972895863052782, + "grad_norm": 7.75, "learning_rate": 2.404888888888889e-05, - "loss": 0.5751, + "loss": 0.7041, "step": 19590 }, { - "epoch": 2.359739947026246, - "grad_norm": 10.125, + "epoch": 13.98002853067047, + "grad_norm": 8.75, "learning_rate": 2.4004444444444446e-05, - "loss": 0.5891, + "loss": 0.8235, "step": 19600 }, { - "epoch": 2.359739947026246, - "eval/acc": 40.11627960205078, + "epoch": 13.98002853067047, + "eval/acc": 44.1860466003418, "step": 19600 }, { - "epoch": 2.359739947026246, - "eval_loss": 2.0887560844421387, - "eval_runtime": 3.2754, - "eval_samples_per_second": 13.128, - "eval_steps_per_second": 0.305, + "epoch": 13.98002853067047, + "eval_loss": 2.3475139141082764, + "eval_runtime": 0.2238, + "eval_samples_per_second": 192.095, + "eval_steps_per_second": 4.467, "step": 19600 }, { - "epoch": 2.3609438959788105, - "grad_norm": 8.0625, + "epoch": 13.98716119828816, + "grad_norm": 8.9375, "learning_rate": 2.396e-05, - "loss": 0.5342, + "loss": 0.8462, "step": 19610 }, { - "epoch": 2.362147844931375, + "epoch": 13.99429386590585, "grad_norm": 7.9375, "learning_rate": 2.3915555555555556e-05, - "loss": 0.535, + "loss": 0.7937, "step": 19620 }, { - "epoch": 2.3633517938839392, - "grad_norm": 6.96875, + "epoch": 14.001426533523539, + "grad_norm": 13.1875, "learning_rate": 2.3871111111111115e-05, - "loss": 0.5114, + "loss": 0.7584, "step": 19630 }, { - "epoch": 2.3645557428365036, - "grad_norm": 8.4375, + "epoch": 14.008559201141226, + "grad_norm": 6.25, "learning_rate": 2.3826666666666667e-05, - "loss": 0.5442, + "loss": 0.7309, "step": 19640 }, { - "epoch": 2.365759691789068, - "grad_norm": 8.0, + "epoch": 14.015691868758916, + "grad_norm": 6.9375, "learning_rate": 2.3782222222222222e-05, - "loss": 0.4812, + "loss": 0.7802, "step": 19650 }, { - "epoch": 2.3669636407416323, - "grad_norm": 7.25, + "epoch": 14.022824536376605, + "grad_norm": 7.4375, "learning_rate": 2.3737777777777777e-05, - "loss": 0.6373, + "loss": 0.6789, "step": 19660 }, { - "epoch": 2.368167589694197, - "grad_norm": 7.0, + "epoch": 14.029957203994293, + "grad_norm": 8.25, "learning_rate": 2.3693333333333332e-05, - "loss": 0.5311, + "loss": 0.6285, "step": 19670 }, { - "epoch": 2.3693715386467615, - "grad_norm": 8.25, + "epoch": 14.037089871611983, + "grad_norm": 15.9375, "learning_rate": 2.364888888888889e-05, - "loss": 0.6208, + "loss": 0.8654, "step": 19680 }, { - "epoch": 2.370575487599326, - "grad_norm": 7.15625, + "epoch": 14.044222539229672, + "grad_norm": 8.625, "learning_rate": 2.3604444444444446e-05, - "loss": 0.5762, + "loss": 0.7778, "step": 19690 }, { - "epoch": 2.3717794365518903, - "grad_norm": 11.4375, + "epoch": 14.051355206847362, + "grad_norm": 8.0625, "learning_rate": 2.356e-05, - "loss": 0.5301, + "loss": 0.8099, "step": 19700 }, { - "epoch": 2.3717794365518903, - "eval/acc": 41.86046600341797, + "epoch": 14.051355206847362, + "eval/acc": 51.16279220581055, "step": 19700 }, { - "epoch": 2.3717794365518903, - "eval_loss": 2.0873513221740723, - "eval_runtime": 0.2195, - "eval_samples_per_second": 195.888, - "eval_steps_per_second": 4.556, + "epoch": 14.051355206847362, + "eval_loss": 2.568962574005127, + "eval_runtime": 7.2029, + "eval_samples_per_second": 5.97, + "eval_steps_per_second": 0.139, "step": 19700 }, { - "epoch": 2.3729833855044546, - "grad_norm": 7.65625, + "epoch": 14.05848787446505, + "grad_norm": 10.5, "learning_rate": 2.3515555555555557e-05, - "loss": 0.5212, + "loss": 0.7666, "step": 19710 }, { - "epoch": 2.374187334457019, - "grad_norm": 8.1875, + "epoch": 14.065620542082739, + "grad_norm": 10.9375, "learning_rate": 2.3471111111111112e-05, - "loss": 0.5576, + "loss": 0.7339, "step": 19720 }, { - "epoch": 2.3753912834095834, - "grad_norm": 8.625, + "epoch": 14.072753209700428, + "grad_norm": 7.21875, "learning_rate": 2.342666666666667e-05, - "loss": 0.5025, + "loss": 0.7559, "step": 19730 }, { - "epoch": 2.3765952323621478, - "grad_norm": 7.34375, + "epoch": 14.079885877318118, + "grad_norm": 15.0625, "learning_rate": 2.3382222222222222e-05, - "loss": 0.4975, + "loss": 0.7539, "step": 19740 }, { - "epoch": 2.377799181314712, - "grad_norm": 7.59375, + "epoch": 14.087018544935805, + "grad_norm": 6.90625, "learning_rate": 2.3337777777777778e-05, - "loss": 0.5568, + "loss": 0.744, "step": 19750 }, { - "epoch": 2.3790031302672765, - "grad_norm": 5.03125, + "epoch": 14.094151212553495, + "grad_norm": 7.28125, "learning_rate": 2.3293333333333333e-05, - "loss": 0.508, + "loss": 0.8038, "step": 19760 }, { - "epoch": 2.380207079219841, - "grad_norm": 6.53125, + "epoch": 14.101283880171184, + "grad_norm": 8.8125, "learning_rate": 2.3248888888888888e-05, - "loss": 0.5257, + "loss": 0.819, "step": 19770 }, { - "epoch": 2.3814110281724057, - "grad_norm": 8.125, + "epoch": 14.108416547788874, + "grad_norm": 7.1875, "learning_rate": 2.3204444444444447e-05, - "loss": 0.5114, + "loss": 0.7756, "step": 19780 }, { - "epoch": 2.38261497712497, - "grad_norm": 8.3125, + "epoch": 14.115549215406562, + "grad_norm": 7.4375, "learning_rate": 2.3160000000000002e-05, - "loss": 0.572, + "loss": 0.8243, "step": 19790 }, { - "epoch": 2.3838189260775344, - "grad_norm": 5.9375, + "epoch": 14.122681883024251, + "grad_norm": 6.21875, "learning_rate": 2.3115555555555557e-05, - "loss": 0.548, + "loss": 0.7094, "step": 19800 }, { - "epoch": 2.3838189260775344, - "eval/acc": 41.86046600341797, + "epoch": 14.122681883024251, + "eval/acc": 51.16279220581055, "step": 19800 }, { - "epoch": 2.3838189260775344, - "eval_loss": 2.076712131500244, - "eval_runtime": 0.2153, - "eval_samples_per_second": 199.681, - "eval_steps_per_second": 4.644, + "epoch": 14.122681883024251, + "eval_loss": 2.565817356109619, + "eval_runtime": 0.2181, + "eval_samples_per_second": 197.132, + "eval_steps_per_second": 4.584, "step": 19800 }, { - "epoch": 2.385022875030099, - "grad_norm": 9.5625, + "epoch": 14.12981455064194, + "grad_norm": 6.9375, "learning_rate": 2.3071111111111112e-05, - "loss": 0.5948, + "loss": 0.7453, "step": 19810 }, { - "epoch": 2.386226823982663, - "grad_norm": 12.6875, + "epoch": 14.136947218259628, + "grad_norm": 7.21875, "learning_rate": 2.3026666666666668e-05, - "loss": 0.5362, + "loss": 0.8192, "step": 19820 }, { - "epoch": 2.3874307729352275, - "grad_norm": 8.625, + "epoch": 14.144079885877318, + "grad_norm": 7.9375, "learning_rate": 2.2982222222222223e-05, - "loss": 0.5482, + "loss": 0.7633, "step": 19830 }, { - "epoch": 2.388634721887792, - "grad_norm": 5.40625, + "epoch": 14.151212553495007, + "grad_norm": 6.875, "learning_rate": 2.293777777777778e-05, - "loss": 0.5819, + "loss": 0.7686, "step": 19840 }, { - "epoch": 2.3898386708403563, - "grad_norm": 6.96875, + "epoch": 14.158345221112697, + "grad_norm": 9.5, "learning_rate": 2.2893333333333333e-05, - "loss": 0.5025, + "loss": 0.7597, "step": 19850 }, { - "epoch": 2.3910426197929207, - "grad_norm": 5.75, + "epoch": 14.165477888730384, + "grad_norm": 7.03125, "learning_rate": 2.284888888888889e-05, - "loss": 0.5603, + "loss": 0.7675, "step": 19860 }, { - "epoch": 2.392246568745485, - "grad_norm": 9.875, + "epoch": 14.172610556348074, + "grad_norm": 9.625, "learning_rate": 2.2804444444444444e-05, - "loss": 0.602, + "loss": 0.7373, "step": 19870 }, { - "epoch": 2.3934505176980494, - "grad_norm": 7.5625, + "epoch": 14.179743223965763, + "grad_norm": 7.0, "learning_rate": 2.2760000000000002e-05, - "loss": 0.5969, + "loss": 0.801, "step": 19880 }, { - "epoch": 2.394654466650614, - "grad_norm": 8.0, + "epoch": 14.186875891583453, + "grad_norm": 11.875, "learning_rate": 2.2715555555555558e-05, - "loss": 0.5586, + "loss": 0.7575, "step": 19890 }, { - "epoch": 2.3958584156031786, - "grad_norm": 7.25, + "epoch": 14.19400855920114, + "grad_norm": 8.375, "learning_rate": 2.2671111111111113e-05, - "loss": 0.6009, + "loss": 0.8073, "step": 19900 }, { - "epoch": 2.3958584156031786, - "eval/acc": 41.27906799316406, + "epoch": 14.19400855920114, + "eval/acc": 53.488372802734375, "step": 19900 }, { - "epoch": 2.3958584156031786, - "eval_loss": 2.0865280628204346, - "eval_runtime": 0.2179, - "eval_samples_per_second": 197.344, - "eval_steps_per_second": 4.589, + "epoch": 14.19400855920114, + "eval_loss": 2.5913121700286865, + "eval_runtime": 0.2824, + "eval_samples_per_second": 152.262, + "eval_steps_per_second": 3.541, "step": 19900 }, { - "epoch": 2.397062364555743, - "grad_norm": 8.5625, + "epoch": 14.20114122681883, + "grad_norm": 7.71875, "learning_rate": 2.2626666666666668e-05, - "loss": 0.574, + "loss": 0.7899, "step": 19910 }, { - "epoch": 2.3982663135083073, - "grad_norm": 7.78125, + "epoch": 14.20827389443652, + "grad_norm": 11.625, "learning_rate": 2.2582222222222223e-05, - "loss": 0.5355, + "loss": 0.8058, "step": 19920 }, { - "epoch": 2.3994702624608717, - "grad_norm": 9.5, + "epoch": 14.21540656205421, + "grad_norm": 8.4375, "learning_rate": 2.253777777777778e-05, - "loss": 0.6201, + "loss": 0.7907, "step": 19930 }, { - "epoch": 2.400674211413436, - "grad_norm": 9.4375, + "epoch": 14.222539229671897, + "grad_norm": 7.3125, "learning_rate": 2.2493333333333337e-05, - "loss": 0.5864, + "loss": 0.7557, "step": 19940 }, { - "epoch": 2.4018781603660004, - "grad_norm": 7.96875, + "epoch": 14.229671897289586, + "grad_norm": 6.25, "learning_rate": 2.244888888888889e-05, - "loss": 0.5624, + "loss": 0.7859, "step": 19950 }, { - "epoch": 2.403082109318565, - "grad_norm": 8.4375, + "epoch": 14.236804564907276, + "grad_norm": 9.25, "learning_rate": 2.2404444444444444e-05, - "loss": 0.6663, + "loss": 0.734, "step": 19960 }, { - "epoch": 2.404286058271129, - "grad_norm": 7.71875, + "epoch": 14.243937232524964, + "grad_norm": 7.78125, "learning_rate": 2.236e-05, - "loss": 0.5064, + "loss": 0.7623, "step": 19970 }, { - "epoch": 2.4054900072236935, - "grad_norm": 11.3125, + "epoch": 14.251069900142653, + "grad_norm": 7.15625, "learning_rate": 2.2315555555555555e-05, - "loss": 0.5555, + "loss": 0.6912, "step": 19980 }, { - "epoch": 2.406693956176258, - "grad_norm": 7.96875, + "epoch": 14.258202567760343, + "grad_norm": 6.5, "learning_rate": 2.2271111111111113e-05, - "loss": 0.5172, + "loss": 0.7367, "step": 19990 }, { - "epoch": 2.4078979051288227, - "grad_norm": 10.25, + "epoch": 14.265335235378032, + "grad_norm": 7.375, "learning_rate": 2.222666666666667e-05, - "loss": 0.6456, + "loss": 0.7353, "step": 20000 }, { - "epoch": 2.4078979051288227, - "eval/acc": 39.53488540649414, + "epoch": 14.265335235378032, + "eval/acc": 51.16279220581055, "step": 20000 }, { - "epoch": 2.4078979051288227, - "eval_loss": 2.097860336303711, - "eval_runtime": 0.2169, - "eval_samples_per_second": 198.258, - "eval_steps_per_second": 4.611, + "epoch": 14.265335235378032, + "eval_loss": 2.5749382972717285, + "eval_runtime": 0.2259, + "eval_samples_per_second": 190.358, + "eval_steps_per_second": 4.427, "step": 20000 }, { - "epoch": 2.409101854081387, - "grad_norm": 8.125, + "epoch": 14.27246790299572, + "grad_norm": 8.8125, "learning_rate": 2.2182222222222224e-05, - "loss": 0.5288, + "loss": 0.7647, "step": 20010 }, { - "epoch": 2.4103058030339515, - "grad_norm": 6.53125, + "epoch": 14.27960057061341, + "grad_norm": 8.25, "learning_rate": 2.213777777777778e-05, - "loss": 0.5817, + "loss": 0.7528, "step": 20020 }, { - "epoch": 2.411509751986516, - "grad_norm": 6.53125, + "epoch": 14.286733238231099, + "grad_norm": 8.125, "learning_rate": 2.2093333333333334e-05, - "loss": 0.4742, + "loss": 0.6316, "step": 20030 }, { - "epoch": 2.41271370093908, - "grad_norm": 8.4375, + "epoch": 14.293865905848788, + "grad_norm": 40.75, "learning_rate": 2.2048888888888893e-05, - "loss": 0.6367, + "loss": 0.6989, "step": 20040 }, { - "epoch": 2.4139176498916446, - "grad_norm": 9.0625, + "epoch": 14.300998573466476, + "grad_norm": 7.1875, "learning_rate": 2.2004444444444445e-05, - "loss": 0.5633, + "loss": 0.7497, "step": 20050 }, { - "epoch": 2.415121598844209, - "grad_norm": 9.875, + "epoch": 14.308131241084165, + "grad_norm": 7.46875, "learning_rate": 2.196e-05, - "loss": 0.5933, + "loss": 0.7738, "step": 20060 }, { - "epoch": 2.4163255477967733, - "grad_norm": 7.0625, + "epoch": 14.315263908701855, + "grad_norm": 10.25, "learning_rate": 2.1915555555555555e-05, - "loss": 0.4554, + "loss": 0.7183, "step": 20070 }, { - "epoch": 2.4175294967493377, - "grad_norm": 9.3125, + "epoch": 14.322396576319543, + "grad_norm": 13.125, "learning_rate": 2.187111111111111e-05, - "loss": 0.5836, + "loss": 0.7628, "step": 20080 }, { - "epoch": 2.418733445701902, - "grad_norm": 8.1875, + "epoch": 14.329529243937232, + "grad_norm": 6.96875, "learning_rate": 2.182666666666667e-05, - "loss": 0.568, + "loss": 0.7416, "step": 20090 }, { - "epoch": 2.4199373946544664, - "grad_norm": 8.9375, + "epoch": 14.336661911554922, + "grad_norm": 6.59375, "learning_rate": 2.1782222222222224e-05, - "loss": 0.548, + "loss": 0.7937, "step": 20100 }, { - "epoch": 2.4199373946544664, - "eval/acc": 39.53488540649414, + "epoch": 14.336661911554922, + "eval/acc": 51.16279220581055, "step": 20100 }, { - "epoch": 2.4199373946544664, - "eval_loss": 2.1182146072387695, - "eval_runtime": 0.965, - "eval_samples_per_second": 44.561, - "eval_steps_per_second": 1.036, + "epoch": 14.336661911554922, + "eval_loss": 2.5889506340026855, + "eval_runtime": 0.2201, + "eval_samples_per_second": 195.39, + "eval_steps_per_second": 4.544, "step": 20100 }, { - "epoch": 2.4211413436070313, - "grad_norm": 8.25, + "epoch": 14.343794579172611, + "grad_norm": 7.1875, "learning_rate": 2.173777777777778e-05, - "loss": 0.6409, + "loss": 0.8359, "step": 20110 }, { - "epoch": 2.4223452925595956, - "grad_norm": 9.0, + "epoch": 14.350927246790299, + "grad_norm": 6.84375, "learning_rate": 2.1693333333333335e-05, - "loss": 0.5349, + "loss": 0.782, "step": 20120 }, { - "epoch": 2.42354924151216, - "grad_norm": 8.125, + "epoch": 14.358059914407988, + "grad_norm": 6.78125, "learning_rate": 2.164888888888889e-05, - "loss": 0.5239, + "loss": 0.7893, "step": 20130 }, { - "epoch": 2.4247531904647244, - "grad_norm": 7.46875, + "epoch": 14.365192582025678, + "grad_norm": 9.125, "learning_rate": 2.1604444444444445e-05, - "loss": 0.5654, + "loss": 0.7187, "step": 20140 }, { - "epoch": 2.4259571394172887, - "grad_norm": 12.125, + "epoch": 14.372325249643367, + "grad_norm": 8.125, "learning_rate": 2.1560000000000004e-05, - "loss": 0.545, + "loss": 0.7507, "step": 20150 }, { - "epoch": 2.427161088369853, - "grad_norm": 8.6875, + "epoch": 14.379457917261055, + "grad_norm": 9.1875, "learning_rate": 2.1515555555555555e-05, - "loss": 0.5703, + "loss": 0.7716, "step": 20160 }, { - "epoch": 2.4283650373224175, - "grad_norm": 9.3125, + "epoch": 14.386590584878745, + "grad_norm": 7.53125, "learning_rate": 2.147111111111111e-05, - "loss": 0.505, + "loss": 0.7996, "step": 20170 }, { - "epoch": 2.429568986274982, - "grad_norm": 11.625, + "epoch": 14.393723252496434, + "grad_norm": 7.84375, "learning_rate": 2.1426666666666666e-05, - "loss": 0.5989, + "loss": 0.7752, "step": 20180 }, { - "epoch": 2.4307729352275462, - "grad_norm": 7.5, + "epoch": 14.400855920114124, + "grad_norm": 8.0, "learning_rate": 2.1382222222222225e-05, - "loss": 0.5705, + "loss": 0.8248, "step": 20190 }, { - "epoch": 2.4319768841801106, - "grad_norm": 8.9375, + "epoch": 14.407988587731811, + "grad_norm": 7.1875, "learning_rate": 2.133777777777778e-05, - "loss": 0.5854, + "loss": 0.7153, "step": 20200 }, { - "epoch": 2.4319768841801106, - "eval/acc": 39.53488540649414, + "epoch": 14.407988587731811, + "eval/acc": 53.488372802734375, "step": 20200 }, { - "epoch": 2.4319768841801106, - "eval_loss": 2.110692262649536, - "eval_runtime": 0.2209, - "eval_samples_per_second": 194.678, - "eval_steps_per_second": 4.527, + "epoch": 14.407988587731811, + "eval_loss": 2.5689048767089844, + "eval_runtime": 0.7977, + "eval_samples_per_second": 53.904, + "eval_steps_per_second": 1.254, "step": 20200 }, { - "epoch": 2.433180833132675, - "grad_norm": 6.84375, + "epoch": 14.4151212553495, + "grad_norm": 8.375, "learning_rate": 2.1293333333333335e-05, - "loss": 0.4891, + "loss": 0.809, "step": 20210 }, { - "epoch": 2.43438478208524, - "grad_norm": 7.1875, + "epoch": 14.42225392296719, + "grad_norm": 20.875, "learning_rate": 2.124888888888889e-05, - "loss": 0.5072, + "loss": 0.7955, "step": 20220 }, { - "epoch": 2.435588731037804, - "grad_norm": 9.125, + "epoch": 14.429386590584878, + "grad_norm": 7.9375, "learning_rate": 2.1204444444444445e-05, - "loss": 0.5481, + "loss": 0.7775, "step": 20230 }, { - "epoch": 2.4367926799903685, - "grad_norm": 8.75, + "epoch": 14.436519258202567, + "grad_norm": 8.6875, "learning_rate": 2.116e-05, - "loss": 0.6341, + "loss": 0.8455, "step": 20240 }, { - "epoch": 2.437996628942933, - "grad_norm": 9.625, + "epoch": 14.443651925820257, + "grad_norm": 7.125, "learning_rate": 2.111555555555556e-05, - "loss": 0.5346, + "loss": 0.815, "step": 20250 }, { - "epoch": 2.4392005778954973, - "grad_norm": 9.0, + "epoch": 14.450784593437946, + "grad_norm": 8.125, "learning_rate": 2.107111111111111e-05, - "loss": 0.6777, + "loss": 0.8153, "step": 20260 }, { - "epoch": 2.4404045268480616, - "grad_norm": 7.34375, + "epoch": 14.457917261055634, + "grad_norm": 6.09375, "learning_rate": 2.1026666666666666e-05, - "loss": 0.5555, + "loss": 0.7566, "step": 20270 }, { - "epoch": 2.441608475800626, - "grad_norm": 8.4375, + "epoch": 14.465049928673324, + "grad_norm": 8.9375, "learning_rate": 2.098222222222222e-05, - "loss": 0.5907, + "loss": 0.8227, "step": 20280 }, { - "epoch": 2.4428124247531904, - "grad_norm": 7.375, + "epoch": 14.472182596291013, + "grad_norm": 6.28125, "learning_rate": 2.0937777777777777e-05, - "loss": 0.6841, + "loss": 0.7402, "step": 20290 }, { - "epoch": 2.4440163737057548, - "grad_norm": 12.9375, + "epoch": 14.479315263908703, + "grad_norm": 6.84375, "learning_rate": 2.0893333333333335e-05, - "loss": 0.5502, + "loss": 0.826, "step": 20300 }, { - "epoch": 2.4440163737057548, - "eval/acc": 39.53488540649414, + "epoch": 14.479315263908703, + "eval/acc": 51.16279220581055, "step": 20300 }, { - "epoch": 2.4440163737057548, - "eval_loss": 2.098959445953369, - "eval_runtime": 0.2118, - "eval_samples_per_second": 203.049, - "eval_steps_per_second": 4.722, + "epoch": 14.479315263908703, + "eval_loss": 2.570920467376709, + "eval_runtime": 0.2167, + "eval_samples_per_second": 198.423, + "eval_steps_per_second": 4.614, "step": 20300 }, { - "epoch": 2.445220322658319, - "grad_norm": 8.5625, + "epoch": 14.48644793152639, + "grad_norm": 9.5625, "learning_rate": 2.084888888888889e-05, - "loss": 0.6012, + "loss": 0.7123, "step": 20310 }, { - "epoch": 2.4464242716108835, - "grad_norm": 4.875, + "epoch": 14.49358059914408, + "grad_norm": 7.625, "learning_rate": 2.0804444444444446e-05, - "loss": 0.526, + "loss": 0.7538, "step": 20320 }, { - "epoch": 2.4476282205634483, - "grad_norm": 7.15625, + "epoch": 14.50071326676177, + "grad_norm": 6.34375, "learning_rate": 2.076e-05, - "loss": 0.6189, + "loss": 0.7735, "step": 20330 }, { - "epoch": 2.4488321695160127, - "grad_norm": 7.90625, + "epoch": 14.507845934379457, + "grad_norm": 6.5625, "learning_rate": 2.0715555555555556e-05, - "loss": 0.5845, + "loss": 0.6973, "step": 20340 }, { - "epoch": 2.450036118468577, - "grad_norm": 7.8125, + "epoch": 14.514978601997147, + "grad_norm": 8.75, "learning_rate": 2.0671111111111115e-05, - "loss": 0.5502, + "loss": 0.8565, "step": 20350 }, { - "epoch": 2.4512400674211414, - "grad_norm": 8.375, + "epoch": 14.522111269614836, + "grad_norm": 7.6875, "learning_rate": 2.0626666666666667e-05, - "loss": 0.5771, + "loss": 0.8176, "step": 20360 }, { - "epoch": 2.452444016373706, - "grad_norm": 10.8125, + "epoch": 14.529243937232525, + "grad_norm": 6.875, "learning_rate": 2.0582222222222222e-05, - "loss": 0.5761, + "loss": 0.8526, "step": 20370 }, { - "epoch": 2.45364796532627, - "grad_norm": 7.4375, + "epoch": 14.536376604850213, + "grad_norm": 12.75, "learning_rate": 2.0537777777777777e-05, - "loss": 0.546, + "loss": 0.7997, "step": 20380 }, { - "epoch": 2.4548519142788345, - "grad_norm": 22.5, + "epoch": 14.543509272467903, + "grad_norm": 8.25, "learning_rate": 2.0493333333333333e-05, - "loss": 0.5746, + "loss": 0.7924, "step": 20390 }, { - "epoch": 2.456055863231399, - "grad_norm": 6.9375, + "epoch": 14.550641940085592, + "grad_norm": 7.84375, "learning_rate": 2.044888888888889e-05, - "loss": 0.5803, + "loss": 0.7639, "step": 20400 }, { - "epoch": 2.456055863231399, - "eval/acc": 40.11627960205078, + "epoch": 14.550641940085592, + "eval/acc": 51.16279220581055, "step": 20400 }, { - "epoch": 2.456055863231399, - "eval_loss": 2.1054108142852783, - "eval_runtime": 0.2157, - "eval_samples_per_second": 199.357, - "eval_steps_per_second": 4.636, + "epoch": 14.550641940085592, + "eval_loss": 2.5919346809387207, + "eval_runtime": 0.2218, + "eval_samples_per_second": 193.859, + "eval_steps_per_second": 4.508, "step": 20400 }, { - "epoch": 2.4572598121839633, - "grad_norm": 6.09375, + "epoch": 14.557774607703282, + "grad_norm": 8.1875, "learning_rate": 2.0404444444444446e-05, - "loss": 0.5956, + "loss": 0.7856, "step": 20410 }, { - "epoch": 2.4584637611365276, - "grad_norm": 7.78125, + "epoch": 14.56490727532097, + "grad_norm": 9.0, "learning_rate": 2.036e-05, - "loss": 0.5267, + "loss": 0.7842, "step": 20420 }, { - "epoch": 2.459667710089092, - "grad_norm": 9.0, + "epoch": 14.572039942938659, + "grad_norm": 6.75, "learning_rate": 2.0315555555555557e-05, - "loss": 0.5699, + "loss": 0.7689, "step": 20430 }, { - "epoch": 2.460871659041657, - "grad_norm": 9.375, + "epoch": 14.579172610556348, + "grad_norm": 7.59375, "learning_rate": 2.0271111111111112e-05, - "loss": 0.5736, + "loss": 0.7918, "step": 20440 }, { - "epoch": 2.462075607994221, - "grad_norm": 7.90625, + "epoch": 14.586305278174038, + "grad_norm": 7.1875, "learning_rate": 2.0226666666666667e-05, - "loss": 0.5772, + "loss": 0.7329, "step": 20450 }, { - "epoch": 2.4632795569467856, - "grad_norm": 16.875, + "epoch": 14.593437945791726, + "grad_norm": 7.59375, "learning_rate": 2.0182222222222222e-05, - "loss": 0.5439, + "loss": 0.7926, "step": 20460 }, { - "epoch": 2.46448350589935, - "grad_norm": 7.90625, + "epoch": 14.600570613409415, + "grad_norm": 7.21875, "learning_rate": 2.0137777777777778e-05, - "loss": 0.6314, + "loss": 0.786, "step": 20470 }, { - "epoch": 2.4656874548519143, - "grad_norm": 7.03125, + "epoch": 14.607703281027105, + "grad_norm": 10.6875, "learning_rate": 2.0093333333333333e-05, - "loss": 0.606, + "loss": 0.7662, "step": 20480 }, { - "epoch": 2.4668914038044787, - "grad_norm": 8.0, + "epoch": 14.614835948644792, + "grad_norm": 7.375, "learning_rate": 2.0048888888888888e-05, - "loss": 0.6585, + "loss": 0.7225, "step": 20490 }, { - "epoch": 2.468095352757043, - "grad_norm": 7.15625, + "epoch": 14.621968616262482, + "grad_norm": 9.0625, "learning_rate": 2.0004444444444447e-05, - "loss": 0.5018, + "loss": 0.7034, "step": 20500 }, { - "epoch": 2.468095352757043, - "eval/acc": 40.11627960205078, + "epoch": 14.621968616262482, + "eval/acc": 51.16279220581055, "step": 20500 }, { - "epoch": 2.468095352757043, - "eval_loss": 2.0760912895202637, - "eval_runtime": 0.2135, - "eval_samples_per_second": 201.409, - "eval_steps_per_second": 4.684, + "epoch": 14.621968616262482, + "eval_loss": 2.564755916595459, + "eval_runtime": 0.2647, + "eval_samples_per_second": 162.462, + "eval_steps_per_second": 3.778, "step": 20500 }, { - "epoch": 2.4692993017096074, - "grad_norm": 9.0, + "epoch": 14.629101283880171, + "grad_norm": 34.25, "learning_rate": 1.9960000000000002e-05, - "loss": 0.6822, + "loss": 0.7003, "step": 20510 }, { - "epoch": 2.470503250662172, - "grad_norm": 7.03125, + "epoch": 14.63623395149786, + "grad_norm": 7.53125, "learning_rate": 1.9915555555555557e-05, - "loss": 0.6248, + "loss": 0.6718, "step": 20520 }, { - "epoch": 2.471707199614736, - "grad_norm": 5.78125, + "epoch": 14.643366619115548, + "grad_norm": 8.375, "learning_rate": 1.9871111111111112e-05, - "loss": 0.6094, + "loss": 0.7539, "step": 20530 }, { - "epoch": 2.4729111485673005, - "grad_norm": 5.625, + "epoch": 14.650499286733238, + "grad_norm": 6.90625, "learning_rate": 1.9826666666666668e-05, - "loss": 0.5175, + "loss": 0.7997, "step": 20540 }, { - "epoch": 2.4741150975198654, - "grad_norm": 8.0, + "epoch": 14.657631954350927, + "grad_norm": 7.53125, "learning_rate": 1.9782222222222223e-05, - "loss": 0.5757, + "loss": 0.7655, "step": 20550 }, { - "epoch": 2.4753190464724297, - "grad_norm": 8.0, + "epoch": 14.664764621968617, + "grad_norm": 8.25, "learning_rate": 1.973777777777778e-05, - "loss": 0.4875, + "loss": 0.7282, "step": 20560 }, { - "epoch": 2.476522995424994, - "grad_norm": 8.25, + "epoch": 14.671897289586305, + "grad_norm": 6.875, "learning_rate": 1.9693333333333333e-05, - "loss": 0.5353, + "loss": 0.8961, "step": 20570 }, { - "epoch": 2.4777269443775585, - "grad_norm": 6.1875, + "epoch": 14.679029957203994, + "grad_norm": 7.71875, "learning_rate": 1.964888888888889e-05, - "loss": 0.5453, + "loss": 0.8167, "step": 20580 }, { - "epoch": 2.478930893330123, - "grad_norm": 7.375, + "epoch": 14.686162624821684, + "grad_norm": 7.4375, "learning_rate": 1.9604444444444444e-05, - "loss": 0.4804, + "loss": 0.7412, "step": 20590 }, { - "epoch": 2.480134842282687, - "grad_norm": 8.0, + "epoch": 14.693295292439373, + "grad_norm": 7.625, "learning_rate": 1.956e-05, - "loss": 0.5725, + "loss": 0.7148, "step": 20600 }, { - "epoch": 2.480134842282687, - "eval/acc": 43.02325439453125, + "epoch": 14.693295292439373, + "eval/acc": 53.488372802734375, "step": 20600 }, { - "epoch": 2.480134842282687, - "eval_loss": 2.0933635234832764, - "eval_runtime": 0.2138, - "eval_samples_per_second": 201.08, - "eval_steps_per_second": 4.676, + "epoch": 14.693295292439373, + "eval_loss": 2.580390691757202, + "eval_runtime": 0.2183, + "eval_samples_per_second": 196.951, + "eval_steps_per_second": 4.58, "step": 20600 }, { - "epoch": 2.4813387912352516, - "grad_norm": 9.0625, + "epoch": 14.70042796005706, + "grad_norm": 6.5, "learning_rate": 1.9515555555555558e-05, - "loss": 0.5613, + "loss": 0.7357, "step": 20610 }, { - "epoch": 2.482542740187816, - "grad_norm": 7.125, + "epoch": 14.70756062767475, + "grad_norm": 5.625, "learning_rate": 1.9471111111111113e-05, - "loss": 0.5453, + "loss": 0.771, "step": 20620 }, { - "epoch": 2.4837466891403803, - "grad_norm": 17.0, + "epoch": 14.71469329529244, + "grad_norm": 8.625, "learning_rate": 1.9426666666666668e-05, - "loss": 0.5077, + "loss": 0.8076, "step": 20630 }, { - "epoch": 2.4849506380929447, - "grad_norm": 7.125, + "epoch": 14.721825962910128, + "grad_norm": 7.625, "learning_rate": 1.9382222222222223e-05, - "loss": 0.4887, + "loss": 0.7406, "step": 20640 }, { - "epoch": 2.486154587045509, - "grad_norm": 7.0625, + "epoch": 14.728958630527817, + "grad_norm": 7.0, "learning_rate": 1.933777777777778e-05, - "loss": 0.5831, + "loss": 0.703, "step": 20650 }, { - "epoch": 2.487358535998074, - "grad_norm": 9.3125, + "epoch": 14.736091298145507, + "grad_norm": 7.5, "learning_rate": 1.9293333333333334e-05, - "loss": 0.5799, + "loss": 0.8349, "step": 20660 }, { - "epoch": 2.4885624849506383, - "grad_norm": 8.0, + "epoch": 14.743223965763196, + "grad_norm": 7.34375, "learning_rate": 1.924888888888889e-05, - "loss": 0.5701, + "loss": 0.7207, "step": 20670 }, { - "epoch": 2.4897664339032026, - "grad_norm": 8.0, + "epoch": 14.750356633380884, + "grad_norm": 6.78125, "learning_rate": 1.9204444444444444e-05, - "loss": 0.6202, + "loss": 0.7787, "step": 20680 }, { - "epoch": 2.490970382855767, + "epoch": 14.757489300998573, "grad_norm": 7.875, "learning_rate": 1.916e-05, - "loss": 0.5042, + "loss": 0.7411, "step": 20690 }, { - "epoch": 2.4921743318083314, - "grad_norm": 9.5625, + "epoch": 14.764621968616263, + "grad_norm": 7.5, "learning_rate": 1.9115555555555555e-05, - "loss": 0.5556, + "loss": 0.7974, "step": 20700 }, { - "epoch": 2.4921743318083314, - "eval/acc": 40.11627960205078, + "epoch": 14.764621968616263, + "eval/acc": 53.488372802734375, "step": 20700 }, { - "epoch": 2.4921743318083314, - "eval_loss": 2.110666275024414, - "eval_runtime": 1.2457, - "eval_samples_per_second": 34.519, - "eval_steps_per_second": 0.803, + "epoch": 14.764621968616263, + "eval_loss": 2.5736501216888428, + "eval_runtime": 0.2193, + "eval_samples_per_second": 196.101, + "eval_steps_per_second": 4.56, "step": 20700 }, { - "epoch": 2.4933782807608957, - "grad_norm": 8.3125, + "epoch": 14.771754636233952, + "grad_norm": 8.1875, "learning_rate": 1.9071111111111113e-05, - "loss": 0.5422, + "loss": 0.7605, "step": 20710 }, { - "epoch": 2.49458222971346, - "grad_norm": 5.59375, + "epoch": 14.77888730385164, + "grad_norm": 7.90625, "learning_rate": 1.902666666666667e-05, - "loss": 0.5558, + "loss": 0.9048, "step": 20720 }, { - "epoch": 2.4957861786660245, - "grad_norm": 7.9375, + "epoch": 14.78601997146933, + "grad_norm": 7.25, "learning_rate": 1.8982222222222224e-05, - "loss": 0.5513, + "loss": 0.7619, "step": 20730 }, { - "epoch": 2.496990127618589, - "grad_norm": 7.09375, + "epoch": 14.793152639087019, + "grad_norm": 6.65625, "learning_rate": 1.893777777777778e-05, - "loss": 0.552, + "loss": 0.7385, "step": 20740 }, { - "epoch": 2.498194076571153, - "grad_norm": 7.125, + "epoch": 14.800285306704708, + "grad_norm": 7.96875, "learning_rate": 1.8893333333333334e-05, - "loss": 0.5945, + "loss": 0.7685, "step": 20750 }, { - "epoch": 2.4993980255237176, - "grad_norm": 9.1875, + "epoch": 14.807417974322396, + "grad_norm": 8.375, "learning_rate": 1.884888888888889e-05, - "loss": 0.5635, + "loss": 0.7639, "step": 20760 }, { - "epoch": 2.5006019744762824, - "grad_norm": 6.65625, + "epoch": 14.814550641940086, + "grad_norm": 9.5625, "learning_rate": 1.8804444444444445e-05, - "loss": 0.5884, + "loss": 0.9127, "step": 20770 }, { - "epoch": 2.501805923428847, - "grad_norm": 7.1875, + "epoch": 14.821683309557775, + "grad_norm": 8.0625, "learning_rate": 1.876e-05, - "loss": 0.4558, + "loss": 0.6761, "step": 20780 }, { - "epoch": 2.503009872381411, - "grad_norm": 7.78125, + "epoch": 14.828815977175463, + "grad_norm": 7.0, "learning_rate": 1.8715555555555555e-05, - "loss": 0.594, + "loss": 0.7239, "step": 20790 }, { - "epoch": 2.5042138213339755, - "grad_norm": 11.25, + "epoch": 14.835948644793152, + "grad_norm": 7.15625, "learning_rate": 1.867111111111111e-05, - "loss": 0.5612, + "loss": 0.721, "step": 20800 }, { - "epoch": 2.5042138213339755, - "eval/acc": 40.11627960205078, + "epoch": 14.835948644793152, + "eval/acc": 51.16279220581055, "step": 20800 }, { - "epoch": 2.5042138213339755, - "eval_loss": 2.0993802547454834, - "eval_runtime": 0.224, - "eval_samples_per_second": 191.923, - "eval_steps_per_second": 4.463, + "epoch": 14.835948644793152, + "eval_loss": 2.592449426651001, + "eval_runtime": 0.2341, + "eval_samples_per_second": 183.686, + "eval_steps_per_second": 4.272, "step": 20800 }, { - "epoch": 2.50541777028654, - "grad_norm": 7.40625, + "epoch": 14.843081312410842, + "grad_norm": 7.84375, "learning_rate": 1.8626666666666666e-05, - "loss": 0.5589, + "loss": 0.7113, "step": 20810 }, { - "epoch": 2.5066217192391043, - "grad_norm": 7.03125, + "epoch": 14.850213980028531, + "grad_norm": 7.09375, "learning_rate": 1.8582222222222224e-05, - "loss": 0.5615, + "loss": 0.7946, "step": 20820 }, { - "epoch": 2.5078256681916686, - "grad_norm": 9.5625, + "epoch": 14.857346647646219, + "grad_norm": 8.8125, "learning_rate": 1.853777777777778e-05, - "loss": 0.5802, + "loss": 0.8066, "step": 20830 }, { - "epoch": 2.509029617144233, - "grad_norm": 5.6875, + "epoch": 14.864479315263909, + "grad_norm": 5.375, "learning_rate": 1.8493333333333335e-05, - "loss": 0.5528, + "loss": 0.7468, "step": 20840 }, { - "epoch": 2.5102335660967974, - "grad_norm": 8.5625, + "epoch": 14.871611982881598, + "grad_norm": 8.125, "learning_rate": 1.844888888888889e-05, - "loss": 0.5713, + "loss": 0.772, "step": 20850 }, { - "epoch": 2.5114375150493617, - "grad_norm": 7.1875, + "epoch": 14.878744650499288, + "grad_norm": 11.0625, "learning_rate": 1.8404444444444445e-05, - "loss": 0.5984, + "loss": 0.8239, "step": 20860 }, { - "epoch": 2.512641464001926, - "grad_norm": 8.8125, + "epoch": 14.885877318116975, + "grad_norm": 9.0, "learning_rate": 1.8360000000000004e-05, - "loss": 0.6409, + "loss": 0.8001, "step": 20870 }, { - "epoch": 2.513845412954491, - "grad_norm": 5.8125, + "epoch": 14.893009985734665, + "grad_norm": 6.71875, "learning_rate": 1.8315555555555556e-05, - "loss": 0.5423, + "loss": 0.7234, "step": 20880 }, { - "epoch": 2.5150493619070553, - "grad_norm": 10.3125, + "epoch": 14.900142653352354, + "grad_norm": 7.125, "learning_rate": 1.827111111111111e-05, - "loss": 0.6115, + "loss": 0.7657, "step": 20890 }, { - "epoch": 2.5162533108596197, - "grad_norm": 7.5, + "epoch": 14.907275320970044, + "grad_norm": 7.0625, "learning_rate": 1.8226666666666666e-05, - "loss": 0.604, + "loss": 0.8357, "step": 20900 }, { - "epoch": 2.5162533108596197, - "eval/acc": 43.604652404785156, + "epoch": 14.907275320970044, + "eval/acc": 51.16279220581055, "step": 20900 }, { - "epoch": 2.5162533108596197, - "eval_loss": 2.079679012298584, - "eval_runtime": 0.2208, - "eval_samples_per_second": 194.777, - "eval_steps_per_second": 4.53, + "epoch": 14.907275320970044, + "eval_loss": 2.567725658416748, + "eval_runtime": 0.2255, + "eval_samples_per_second": 190.652, + "eval_steps_per_second": 4.434, "step": 20900 }, { - "epoch": 2.517457259812184, - "grad_norm": 6.65625, + "epoch": 14.914407988587731, + "grad_norm": 6.28125, "learning_rate": 1.818222222222222e-05, - "loss": 0.5547, + "loss": 0.7335, "step": 20910 }, { - "epoch": 2.5186612087647484, - "grad_norm": 7.71875, + "epoch": 14.921540656205421, + "grad_norm": 9.25, "learning_rate": 1.813777777777778e-05, - "loss": 0.5287, + "loss": 0.7241, "step": 20920 }, { - "epoch": 2.519865157717313, - "grad_norm": 8.125, + "epoch": 14.92867332382311, + "grad_norm": 8.1875, "learning_rate": 1.8093333333333335e-05, - "loss": 0.5621, + "loss": 0.8071, "step": 20930 }, { - "epoch": 2.521069106669877, - "grad_norm": 7.125, + "epoch": 14.935805991440798, + "grad_norm": 7.6875, "learning_rate": 1.804888888888889e-05, - "loss": 0.6234, + "loss": 0.7734, "step": 20940 }, { - "epoch": 2.5222730556224415, - "grad_norm": 7.5625, + "epoch": 14.942938659058488, + "grad_norm": 11.125, "learning_rate": 1.8004444444444446e-05, - "loss": 0.5737, + "loss": 0.8113, "step": 20950 }, { - "epoch": 2.523477004575006, - "grad_norm": 9.625, + "epoch": 14.950071326676177, + "grad_norm": 15.875, "learning_rate": 1.796e-05, - "loss": 0.5585, + "loss": 0.747, "step": 20960 }, { - "epoch": 2.5246809535275703, - "grad_norm": 9.5625, + "epoch": 14.957203994293867, + "grad_norm": 8.0625, "learning_rate": 1.7915555555555556e-05, - "loss": 0.5995, + "loss": 0.7377, "step": 20970 }, { - "epoch": 2.5258849024801346, - "grad_norm": 7.15625, + "epoch": 14.964336661911554, + "grad_norm": 7.0, "learning_rate": 1.787111111111111e-05, - "loss": 0.5816, + "loss": 0.7357, "step": 20980 }, { - "epoch": 2.5270888514326995, - "grad_norm": 6.15625, + "epoch": 14.971469329529244, + "grad_norm": 7.6875, "learning_rate": 1.7826666666666667e-05, - "loss": 0.515, + "loss": 0.7443, "step": 20990 }, { - "epoch": 2.528292800385264, - "grad_norm": 7.28125, + "epoch": 14.978601997146933, + "grad_norm": 7.3125, "learning_rate": 1.7782222222222222e-05, - "loss": 0.6282, + "loss": 0.7437, "step": 21000 }, { - "epoch": 2.528292800385264, - "eval/acc": 39.53488540649414, + "epoch": 14.978601997146933, + "eval/acc": 51.16279220581055, "step": 21000 }, { - "epoch": 2.528292800385264, - "eval_loss": 2.0975944995880127, - "eval_runtime": 0.2204, - "eval_samples_per_second": 195.105, - "eval_steps_per_second": 4.537, + "epoch": 14.978601997146933, + "eval_loss": 2.595672369003296, + "eval_runtime": 0.2171, + "eval_samples_per_second": 198.079, + "eval_steps_per_second": 4.606, "step": 21000 }, { - "epoch": 2.529496749337828, - "grad_norm": 8.125, + "epoch": 14.985734664764623, + "grad_norm": 14.0625, "learning_rate": 1.7737777777777777e-05, - "loss": 0.5334, + "loss": 0.7814, "step": 21010 }, { - "epoch": 2.5307006982903926, - "grad_norm": 10.0625, + "epoch": 14.99286733238231, + "grad_norm": 6.75, "learning_rate": 1.7693333333333336e-05, - "loss": 0.6316, + "loss": 0.7794, "step": 21020 }, { - "epoch": 2.531904647242957, - "grad_norm": 5.6875, + "epoch": 15.0, + "grad_norm": 6.96875, "learning_rate": 1.764888888888889e-05, - "loss": 0.5891, + "loss": 0.7931, "step": 21030 }, { - "epoch": 2.5331085961955213, - "grad_norm": 11.0, + "epoch": 15.00713266761769, + "grad_norm": 7.8125, "learning_rate": 1.7604444444444446e-05, - "loss": 0.5301, + "loss": 0.8088, "step": 21040 }, { - "epoch": 2.5343125451480857, - "grad_norm": 7.71875, + "epoch": 15.014265335235377, + "grad_norm": 7.0625, "learning_rate": 1.756e-05, - "loss": 0.6381, + "loss": 0.7975, "step": 21050 }, { - "epoch": 2.53551649410065, - "grad_norm": 5.59375, + "epoch": 15.021398002853067, + "grad_norm": 7.125, "learning_rate": 1.7515555555555557e-05, - "loss": 0.495, + "loss": 0.819, "step": 21060 }, { - "epoch": 2.5367204430532144, - "grad_norm": 10.0625, + "epoch": 15.028530670470756, + "grad_norm": 6.5625, "learning_rate": 1.7471111111111112e-05, - "loss": 0.6769, + "loss": 0.6872, "step": 21070 }, { - "epoch": 2.537924392005779, - "grad_norm": 6.5, + "epoch": 15.035663338088446, + "grad_norm": 6.90625, "learning_rate": 1.7426666666666667e-05, - "loss": 0.5117, + "loss": 0.7137, "step": 21080 }, { - "epoch": 2.539128340958343, - "grad_norm": 7.96875, + "epoch": 15.042796005706133, + "grad_norm": 7.25, "learning_rate": 1.7382222222222222e-05, - "loss": 0.5569, + "loss": 0.7424, "step": 21090 }, { - "epoch": 2.540332289910908, - "grad_norm": 7.625, + "epoch": 15.049928673323823, + "grad_norm": 8.6875, "learning_rate": 1.7337777777777777e-05, - "loss": 0.5198, + "loss": 0.7837, "step": 21100 }, { - "epoch": 2.540332289910908, - "eval/acc": 41.27906799316406, + "epoch": 15.049928673323823, + "eval/acc": 51.16279220581055, "step": 21100 }, { - "epoch": 2.540332289910908, - "eval_loss": 2.090000867843628, - "eval_runtime": 0.2106, - "eval_samples_per_second": 204.174, - "eval_steps_per_second": 4.748, + "epoch": 15.049928673323823, + "eval_loss": 2.105516195297241, + "eval_runtime": 6.7681, + "eval_samples_per_second": 6.353, + "eval_steps_per_second": 0.148, "step": 21100 }, { - "epoch": 2.5415362388634724, - "grad_norm": 4.84375, + "epoch": 15.057061340941512, + "grad_norm": 7.96875, "learning_rate": 1.7293333333333333e-05, - "loss": 0.5165, + "loss": 0.7133, "step": 21110 }, { - "epoch": 2.5427401878160367, - "grad_norm": 6.53125, + "epoch": 15.064194008559202, + "grad_norm": 5.75, "learning_rate": 1.7248888888888888e-05, - "loss": 0.5745, + "loss": 0.6913, "step": 21120 }, { - "epoch": 2.543944136768601, - "grad_norm": 7.75, + "epoch": 15.07132667617689, + "grad_norm": 18.5, "learning_rate": 1.7204444444444446e-05, - "loss": 0.4919, + "loss": 0.7564, "step": 21130 }, { - "epoch": 2.5451480857211655, - "grad_norm": 6.75, + "epoch": 15.078459343794579, + "grad_norm": 8.625, "learning_rate": 1.7160000000000002e-05, - "loss": 0.5593, + "loss": 0.7526, "step": 21140 }, { - "epoch": 2.54635203467373, - "grad_norm": 7.71875, + "epoch": 15.085592011412269, + "grad_norm": 10.6875, "learning_rate": 1.7115555555555557e-05, - "loss": 0.5654, + "loss": 0.7741, "step": 21150 }, { - "epoch": 2.547555983626294, - "grad_norm": 7.1875, + "epoch": 15.092724679029958, + "grad_norm": 7.09375, "learning_rate": 1.7071111111111112e-05, - "loss": 0.6021, + "loss": 0.7387, "step": 21160 }, { - "epoch": 2.5487599325788586, - "grad_norm": 8.125, + "epoch": 15.099857346647646, + "grad_norm": 7.1875, "learning_rate": 1.7026666666666667e-05, - "loss": 0.5313, + "loss": 0.7139, "step": 21170 }, { - "epoch": 2.549963881531423, - "grad_norm": 7.03125, + "epoch": 15.106990014265335, + "grad_norm": 9.75, "learning_rate": 1.6982222222222226e-05, - "loss": 0.525, + "loss": 0.7564, "step": 21180 }, { - "epoch": 2.5511678304839873, - "grad_norm": 7.65625, + "epoch": 15.114122681883025, + "grad_norm": 7.1875, "learning_rate": 1.6937777777777778e-05, - "loss": 0.5401, + "loss": 0.7592, "step": 21190 }, { - "epoch": 2.5523717794365517, - "grad_norm": 6.53125, + "epoch": 15.121255349500712, + "grad_norm": 11.4375, "learning_rate": 1.6893333333333333e-05, - "loss": 0.5812, + "loss": 0.7444, "step": 21200 }, { - "epoch": 2.5523717794365517, - "eval/acc": 39.53488540649414, + "epoch": 15.121255349500712, + "eval/acc": 51.16279220581055, "step": 21200 }, { - "epoch": 2.5523717794365517, - "eval_loss": 2.094308614730835, - "eval_runtime": 0.2117, - "eval_samples_per_second": 203.099, - "eval_steps_per_second": 4.723, + "epoch": 15.121255349500712, + "eval_loss": 2.099931478500366, + "eval_runtime": 0.2305, + "eval_samples_per_second": 186.541, + "eval_steps_per_second": 4.338, "step": 21200 }, { - "epoch": 2.5535757283891165, - "grad_norm": 4.96875, + "epoch": 15.128388017118402, + "grad_norm": 8.5625, "learning_rate": 1.684888888888889e-05, - "loss": 0.5184, + "loss": 0.7643, "step": 21210 }, { - "epoch": 2.554779677341681, - "grad_norm": 7.21875, + "epoch": 15.135520684736091, + "grad_norm": 7.40625, "learning_rate": 1.6804444444444444e-05, - "loss": 0.5366, + "loss": 0.7471, "step": 21220 }, { - "epoch": 2.5559836262942452, - "grad_norm": 13.75, + "epoch": 15.142653352353781, + "grad_norm": 6.0, "learning_rate": 1.6760000000000002e-05, - "loss": 0.5492, + "loss": 0.8479, "step": 21230 }, { - "epoch": 2.5571875752468096, - "grad_norm": 12.875, + "epoch": 15.149786019971469, + "grad_norm": 8.3125, "learning_rate": 1.6715555555555557e-05, - "loss": 0.6225, + "loss": 0.8082, "step": 21240 }, { - "epoch": 2.558391524199374, - "grad_norm": 3.921875, + "epoch": 15.156918687589158, + "grad_norm": 7.375, "learning_rate": 1.6671111111111113e-05, - "loss": 0.5316, + "loss": 0.7453, "step": 21250 }, { - "epoch": 2.5595954731519384, - "grad_norm": 10.25, + "epoch": 15.164051355206848, + "grad_norm": 6.0, "learning_rate": 1.6626666666666668e-05, - "loss": 0.5819, + "loss": 0.784, "step": 21260 }, { - "epoch": 2.5607994221045027, - "grad_norm": 6.90625, + "epoch": 15.171184022824537, + "grad_norm": 9.5, "learning_rate": 1.6582222222222223e-05, - "loss": 0.5081, + "loss": 0.7564, "step": 21270 }, { - "epoch": 2.562003371057067, - "grad_norm": 6.1875, + "epoch": 15.178316690442225, + "grad_norm": 7.90625, "learning_rate": 1.6537777777777778e-05, - "loss": 0.5896, + "loss": 0.7713, "step": 21280 }, { - "epoch": 2.5632073200096315, - "grad_norm": 11.9375, + "epoch": 15.185449358059914, + "grad_norm": 7.0, "learning_rate": 1.6493333333333334e-05, - "loss": 0.6342, + "loss": 0.818, "step": 21290 }, { - "epoch": 2.564411268962196, - "grad_norm": 6.71875, + "epoch": 15.192582025677604, + "grad_norm": 7.15625, "learning_rate": 1.644888888888889e-05, - "loss": 0.617, + "loss": 0.7564, "step": 21300 }, { - "epoch": 2.564411268962196, - "eval/acc": 40.11627960205078, + "epoch": 15.192582025677604, + "eval/acc": 48.83720779418945, "step": 21300 }, { - "epoch": 2.564411268962196, - "eval_loss": 2.0883092880249023, - "eval_runtime": 0.214, - "eval_samples_per_second": 200.937, - "eval_steps_per_second": 4.673, + "epoch": 15.192582025677604, + "eval_loss": 2.1149563789367676, + "eval_runtime": 0.2268, + "eval_samples_per_second": 189.56, + "eval_steps_per_second": 4.408, "step": 21300 }, { - "epoch": 2.56561521791476, - "grad_norm": 8.1875, + "epoch": 15.199714693295292, + "grad_norm": 8.0, "learning_rate": 1.6404444444444444e-05, - "loss": 0.5297, + "loss": 0.7391, "step": 21310 }, { - "epoch": 2.566819166867325, - "grad_norm": 12.4375, + "epoch": 15.206847360912981, + "grad_norm": 9.5625, "learning_rate": 1.636e-05, - "loss": 0.5513, + "loss": 0.7907, "step": 21320 }, { - "epoch": 2.5680231158198894, - "grad_norm": 7.25, + "epoch": 15.21398002853067, + "grad_norm": 6.25, "learning_rate": 1.6315555555555558e-05, - "loss": 0.5183, + "loss": 0.7595, "step": 21330 }, { - "epoch": 2.5692270647724538, - "grad_norm": 7.6875, + "epoch": 15.22111269614836, + "grad_norm": 10.25, "learning_rate": 1.6271111111111113e-05, - "loss": 0.5934, + "loss": 0.7629, "step": 21340 }, { - "epoch": 2.570431013725018, - "grad_norm": 9.0625, + "epoch": 15.228245363766048, + "grad_norm": 8.6875, "learning_rate": 1.6226666666666668e-05, - "loss": 0.5486, + "loss": 0.7305, "step": 21350 }, { - "epoch": 2.5716349626775825, - "grad_norm": 7.84375, + "epoch": 15.235378031383737, + "grad_norm": 137.0, "learning_rate": 1.6182222222222224e-05, - "loss": 0.5434, + "loss": 0.7605, "step": 21360 }, { - "epoch": 2.572838911630147, - "grad_norm": 4.875, + "epoch": 15.242510699001427, + "grad_norm": 6.75, "learning_rate": 1.613777777777778e-05, - "loss": 0.6069, + "loss": 0.7498, "step": 21370 }, { - "epoch": 2.5740428605827113, - "grad_norm": 10.9375, + "epoch": 15.249643366619116, + "grad_norm": 6.46875, "learning_rate": 1.6093333333333334e-05, - "loss": 0.5589, + "loss": 0.773, "step": 21380 }, { - "epoch": 2.5752468095352756, - "grad_norm": 10.125, + "epoch": 15.256776034236804, + "grad_norm": 9.75, "learning_rate": 1.604888888888889e-05, - "loss": 0.5808, + "loss": 0.7547, "step": 21390 }, { - "epoch": 2.57645075848784, - "grad_norm": 9.625, + "epoch": 15.263908701854493, + "grad_norm": 6.71875, "learning_rate": 1.6004444444444444e-05, - "loss": 0.5528, + "loss": 0.7922, "step": 21400 }, { - "epoch": 2.57645075848784, - "eval/acc": 38.953487396240234, + "epoch": 15.263908701854493, + "eval/acc": 51.16279220581055, "step": 21400 }, { - "epoch": 2.57645075848784, - "eval_loss": 2.0963029861450195, - "eval_runtime": 0.2131, - "eval_samples_per_second": 201.756, - "eval_steps_per_second": 4.692, + "epoch": 15.263908701854493, + "eval_loss": 2.0905489921569824, + "eval_runtime": 0.2297, + "eval_samples_per_second": 187.229, + "eval_steps_per_second": 4.354, "step": 21400 }, { - "epoch": 2.5776547074404044, - "grad_norm": 6.53125, + "epoch": 15.271041369472183, + "grad_norm": 9.125, "learning_rate": 1.596e-05, - "loss": 0.6053, + "loss": 0.7484, "step": 21410 }, { - "epoch": 2.5788586563929687, - "grad_norm": 12.8125, + "epoch": 15.278174037089872, + "grad_norm": 7.4375, "learning_rate": 1.5915555555555555e-05, - "loss": 0.5655, + "loss": 0.7577, "step": 21420 }, { - "epoch": 2.5800626053455336, - "grad_norm": 7.34375, + "epoch": 15.28530670470756, + "grad_norm": 6.5, "learning_rate": 1.587111111111111e-05, - "loss": 0.5695, + "loss": 0.7756, "step": 21430 }, { - "epoch": 2.581266554298098, - "grad_norm": 9.0, + "epoch": 15.29243937232525, + "grad_norm": 7.65625, "learning_rate": 1.582666666666667e-05, - "loss": 0.5667, + "loss": 0.7254, "step": 21440 }, { - "epoch": 2.5824705032506623, - "grad_norm": 7.21875, + "epoch": 15.29957203994294, + "grad_norm": 6.96875, "learning_rate": 1.5782222222222224e-05, - "loss": 0.6134, + "loss": 0.7657, "step": 21450 }, { - "epoch": 2.5836744522032267, - "grad_norm": 9.875, + "epoch": 15.306704707560627, + "grad_norm": 7.5, "learning_rate": 1.573777777777778e-05, - "loss": 0.6478, + "loss": 0.8066, "step": 21460 }, { - "epoch": 2.584878401155791, - "grad_norm": 8.75, + "epoch": 15.313837375178316, + "grad_norm": 8.5625, "learning_rate": 1.5693333333333334e-05, - "loss": 0.5954, + "loss": 0.7305, "step": 21470 }, { - "epoch": 2.5860823501083554, - "grad_norm": 7.28125, + "epoch": 15.320970042796006, + "grad_norm": 16.5, "learning_rate": 1.564888888888889e-05, - "loss": 0.4527, + "loss": 0.769, "step": 21480 }, { - "epoch": 2.58728629906092, - "grad_norm": 9.6875, + "epoch": 15.328102710413695, + "grad_norm": 8.4375, "learning_rate": 1.5604444444444445e-05, - "loss": 0.6129, + "loss": 0.8408, "step": 21490 }, { - "epoch": 2.588490248013484, - "grad_norm": 12.0, + "epoch": 15.335235378031383, + "grad_norm": 8.375, "learning_rate": 1.556e-05, - "loss": 0.5081, + "loss": 0.7811, "step": 21500 }, { - "epoch": 2.588490248013484, - "eval/acc": 38.953487396240234, + "epoch": 15.335235378031383, + "eval/acc": 51.16279220581055, "step": 21500 }, { - "epoch": 2.588490248013484, - "eval_loss": 2.1097323894500732, - "eval_runtime": 0.211, - "eval_samples_per_second": 203.783, - "eval_steps_per_second": 4.739, + "epoch": 15.335235378031383, + "eval_loss": 2.1078341007232666, + "eval_runtime": 0.2375, + "eval_samples_per_second": 181.063, + "eval_steps_per_second": 4.211, "step": 21500 }, { - "epoch": 2.5896941969660485, - "grad_norm": 6.09375, + "epoch": 15.342368045649073, + "grad_norm": 8.3125, "learning_rate": 1.5515555555555555e-05, - "loss": 0.5519, + "loss": 0.7697, "step": 21510 }, { - "epoch": 2.590898145918613, - "grad_norm": 5.78125, + "epoch": 15.349500713266762, + "grad_norm": 8.3125, "learning_rate": 1.547111111111111e-05, - "loss": 0.622, + "loss": 0.7481, "step": 21520 }, { - "epoch": 2.5921020948711773, - "grad_norm": 6.4375, + "epoch": 15.356633380884452, + "grad_norm": 6.3125, "learning_rate": 1.5426666666666666e-05, - "loss": 0.5524, + "loss": 0.8076, "step": 21530 }, { - "epoch": 2.593306043823742, - "grad_norm": 8.8125, + "epoch": 15.36376604850214, + "grad_norm": 6.40625, "learning_rate": 1.5382222222222224e-05, - "loss": 0.563, + "loss": 0.803, "step": 21540 }, { - "epoch": 2.5945099927763065, - "grad_norm": 7.6875, + "epoch": 15.370898716119829, + "grad_norm": 10.0625, "learning_rate": 1.533777777777778e-05, - "loss": 0.6177, + "loss": 0.7999, "step": 21550 }, { - "epoch": 2.595713941728871, - "grad_norm": 6.9375, + "epoch": 15.378031383737518, + "grad_norm": 7.5, "learning_rate": 1.5293333333333335e-05, - "loss": 0.5798, + "loss": 0.7886, "step": 21560 }, { - "epoch": 2.596917890681435, - "grad_norm": 11.1875, + "epoch": 15.385164051355208, + "grad_norm": 9.375, "learning_rate": 1.524888888888889e-05, - "loss": 0.5977, + "loss": 0.7446, "step": 21570 }, { - "epoch": 2.5981218396339996, - "grad_norm": 9.8125, + "epoch": 15.392296718972895, + "grad_norm": 6.40625, "learning_rate": 1.5204444444444445e-05, - "loss": 0.5134, + "loss": 0.7721, "step": 21580 }, { - "epoch": 2.599325788586564, - "grad_norm": 7.28125, + "epoch": 15.399429386590585, + "grad_norm": 6.0, "learning_rate": 1.5160000000000002e-05, - "loss": 0.4897, + "loss": 0.8235, "step": 21590 }, { - "epoch": 2.6005297375391283, - "grad_norm": 8.6875, + "epoch": 15.406562054208274, + "grad_norm": 9.625, "learning_rate": 1.5115555555555556e-05, - "loss": 0.6154, + "loss": 0.7322, "step": 21600 }, { - "epoch": 2.6005297375391283, - "eval/acc": 39.53488540649414, + "epoch": 15.406562054208274, + "eval/acc": 48.83720779418945, "step": 21600 }, { - "epoch": 2.6005297375391283, - "eval_loss": 2.0928361415863037, - "eval_runtime": 0.2119, - "eval_samples_per_second": 202.892, - "eval_steps_per_second": 4.718, + "epoch": 15.406562054208274, + "eval_loss": 2.0801358222961426, + "eval_runtime": 0.2459, + "eval_samples_per_second": 174.833, + "eval_steps_per_second": 4.066, "step": 21600 }, { - "epoch": 2.6017336864916927, - "grad_norm": 8.75, + "epoch": 15.413694721825962, + "grad_norm": 6.46875, "learning_rate": 1.5071111111111111e-05, - "loss": 0.5544, + "loss": 0.7196, "step": 21610 }, { - "epoch": 2.602937635444257, - "grad_norm": 6.9375, + "epoch": 15.420827389443652, + "grad_norm": 7.03125, "learning_rate": 1.5026666666666666e-05, - "loss": 0.5312, + "loss": 0.7608, "step": 21620 }, { - "epoch": 2.6041415843968214, - "grad_norm": 8.9375, + "epoch": 15.427960057061341, + "grad_norm": 8.1875, "learning_rate": 1.4982222222222223e-05, - "loss": 0.4817, + "loss": 0.7963, "step": 21630 }, { - "epoch": 2.605345533349386, - "grad_norm": 6.125, + "epoch": 15.43509272467903, + "grad_norm": 9.8125, "learning_rate": 1.4937777777777778e-05, - "loss": 0.4783, + "loss": 0.8916, "step": 21640 }, { - "epoch": 2.6065494823019506, - "grad_norm": 8.75, + "epoch": 15.442225392296718, + "grad_norm": 8.0, "learning_rate": 1.4893333333333334e-05, - "loss": 0.5539, + "loss": 0.7113, "step": 21650 }, { - "epoch": 2.6077534312545145, - "grad_norm": 9.375, + "epoch": 15.449358059914408, + "grad_norm": 7.71875, "learning_rate": 1.484888888888889e-05, - "loss": 0.6077, + "loss": 0.784, "step": 21660 }, { - "epoch": 2.6089573802070793, - "grad_norm": 8.3125, + "epoch": 15.456490727532097, + "grad_norm": 13.0, "learning_rate": 1.4804444444444446e-05, - "loss": 0.5824, + "loss": 0.6864, "step": 21670 }, { - "epoch": 2.6101613291596437, - "grad_norm": 9.25, + "epoch": 15.463623395149787, + "grad_norm": 6.65625, "learning_rate": 1.4760000000000001e-05, - "loss": 0.5692, + "loss": 0.7334, "step": 21680 }, { - "epoch": 2.611365278112208, - "grad_norm": 6.15625, + "epoch": 15.470756062767475, + "grad_norm": 8.25, "learning_rate": 1.4715555555555558e-05, - "loss": 0.525, + "loss": 0.6251, "step": 21690 }, { - "epoch": 2.6125692270647725, - "grad_norm": 7.6875, + "epoch": 15.477888730385164, + "grad_norm": 7.21875, "learning_rate": 1.467111111111111e-05, - "loss": 0.6187, + "loss": 0.6724, "step": 21700 }, { - "epoch": 2.6125692270647725, - "eval/acc": 39.53488540649414, + "epoch": 15.477888730385164, + "eval/acc": 48.83720779418945, "step": 21700 }, - { - "epoch": 2.6125692270647725, - "eval_loss": 2.0849361419677734, - "eval_runtime": 0.2146, - "eval_samples_per_second": 200.385, - "eval_steps_per_second": 4.66, + { + "epoch": 15.477888730385164, + "eval_loss": 2.120603322982788, + "eval_runtime": 0.2216, + "eval_samples_per_second": 194.055, + "eval_steps_per_second": 4.513, "step": 21700 }, { - "epoch": 2.613773176017337, - "grad_norm": 7.0625, + "epoch": 15.485021398002853, + "grad_norm": 7.1875, "learning_rate": 1.4626666666666667e-05, - "loss": 0.5477, + "loss": 0.6897, "step": 21710 }, { - "epoch": 2.614977124969901, - "grad_norm": 6.96875, + "epoch": 15.492154065620543, + "grad_norm": 7.59375, "learning_rate": 1.4582222222222222e-05, - "loss": 0.5632, + "loss": 0.758, "step": 21720 }, { - "epoch": 2.6161810739224656, - "grad_norm": 6.5, + "epoch": 15.49928673323823, + "grad_norm": 16.5, "learning_rate": 1.4537777777777777e-05, - "loss": 0.5551, + "loss": 0.6712, "step": 21730 }, { - "epoch": 2.61738502287503, - "grad_norm": 8.875, + "epoch": 15.50641940085592, + "grad_norm": 6.09375, "learning_rate": 1.4493333333333334e-05, - "loss": 0.4995, + "loss": 0.8117, "step": 21740 }, { - "epoch": 2.6185889718275943, - "grad_norm": 6.875, + "epoch": 15.51355206847361, + "grad_norm": 8.8125, "learning_rate": 1.444888888888889e-05, - "loss": 0.5208, + "loss": 0.7811, "step": 21750 }, { - "epoch": 2.619792920780159, - "grad_norm": 8.125, + "epoch": 15.520684736091297, + "grad_norm": 6.78125, "learning_rate": 1.4404444444444446e-05, - "loss": 0.5138, + "loss": 0.8605, "step": 21760 }, { - "epoch": 2.620996869732723, - "grad_norm": 12.1875, + "epoch": 15.527817403708987, + "grad_norm": 17.25, "learning_rate": 1.4360000000000001e-05, - "loss": 0.5332, + "loss": 0.7169, "step": 21770 }, { - "epoch": 2.622200818685288, - "grad_norm": 8.0, + "epoch": 15.534950071326676, + "grad_norm": 9.625, "learning_rate": 1.4315555555555557e-05, - "loss": 0.54, + "loss": 0.7355, "step": 21780 }, { - "epoch": 2.6234047676378522, - "grad_norm": 8.3125, + "epoch": 15.542082738944366, + "grad_norm": 7.3125, "learning_rate": 1.4271111111111114e-05, - "loss": 0.5671, + "loss": 0.7565, "step": 21790 }, { - "epoch": 2.6246087165904166, - "grad_norm": 11.125, + "epoch": 15.549215406562054, + "grad_norm": 17.375, "learning_rate": 1.4226666666666669e-05, - "loss": 0.624, + "loss": 0.7719, "step": 21800 }, { - "epoch": 2.6246087165904166, - "eval/acc": 40.11627960205078, + "epoch": 15.549215406562054, + "eval/acc": 51.16279220581055, "step": 21800 }, { - "epoch": 2.6246087165904166, - "eval_loss": 2.100468158721924, - "eval_runtime": 0.2114, - "eval_samples_per_second": 203.378, - "eval_steps_per_second": 4.73, + "epoch": 15.549215406562054, + "eval_loss": 2.1045022010803223, + "eval_runtime": 0.2298, + "eval_samples_per_second": 187.156, + "eval_steps_per_second": 4.352, "step": 21800 }, { - "epoch": 2.625812665542981, - "grad_norm": 10.5, + "epoch": 15.556348074179743, + "grad_norm": 10.1875, "learning_rate": 1.4182222222222222e-05, - "loss": 0.4953, + "loss": 0.8614, "step": 21810 }, { - "epoch": 2.6270166144955454, - "grad_norm": 8.5625, + "epoch": 15.563480741797433, + "grad_norm": 7.15625, "learning_rate": 1.4137777777777778e-05, - "loss": 0.6391, + "loss": 0.6956, "step": 21820 }, { - "epoch": 2.6282205634481097, - "grad_norm": 6.0, + "epoch": 15.570613409415122, + "grad_norm": 7.625, "learning_rate": 1.4093333333333333e-05, - "loss": 0.5605, + "loss": 0.7024, "step": 21830 }, { - "epoch": 2.629424512400674, - "grad_norm": 12.25, + "epoch": 15.57774607703281, + "grad_norm": 15.875, "learning_rate": 1.404888888888889e-05, - "loss": 0.5091, + "loss": 0.7581, "step": 21840 }, { - "epoch": 2.6306284613532385, - "grad_norm": 7.8125, + "epoch": 15.5848787446505, + "grad_norm": 6.625, "learning_rate": 1.4004444444444445e-05, - "loss": 0.5609, + "loss": 0.6875, "step": 21850 }, { - "epoch": 2.631832410305803, - "grad_norm": 14.4375, + "epoch": 15.592011412268189, + "grad_norm": 8.8125, "learning_rate": 1.396e-05, - "loss": 0.579, + "loss": 0.706, "step": 21860 }, { - "epoch": 2.6330363592583677, - "grad_norm": 10.0625, + "epoch": 15.599144079885878, + "grad_norm": 8.25, "learning_rate": 1.3915555555555557e-05, - "loss": 0.6067, + "loss": 0.7664, "step": 21870 }, { - "epoch": 2.6342403082109316, - "grad_norm": 5.71875, + "epoch": 15.606276747503566, + "grad_norm": 8.125, "learning_rate": 1.3871111111111112e-05, - "loss": 0.6023, + "loss": 0.8298, "step": 21880 }, { - "epoch": 2.6354442571634964, - "grad_norm": 6.875, + "epoch": 15.613409415121255, + "grad_norm": 7.5, "learning_rate": 1.3826666666666668e-05, - "loss": 0.589, + "loss": 0.8146, "step": 21890 }, { - "epoch": 2.6366482061160608, - "grad_norm": 7.90625, + "epoch": 15.620542082738945, + "grad_norm": 17.75, "learning_rate": 1.3782222222222224e-05, - "loss": 0.6006, + "loss": 0.7299, "step": 21900 }, { - "epoch": 2.6366482061160608, - "eval/acc": 40.11627960205078, + "epoch": 15.620542082738945, + "eval/acc": 51.16279220581055, "step": 21900 }, { - "epoch": 2.6366482061160608, - "eval_loss": 2.0932207107543945, - "eval_runtime": 0.2132, - "eval_samples_per_second": 201.725, - "eval_steps_per_second": 4.691, + "epoch": 15.620542082738945, + "eval_loss": 2.1079678535461426, + "eval_runtime": 0.2212, + "eval_samples_per_second": 194.362, + "eval_steps_per_second": 4.52, "step": 21900 }, { - "epoch": 2.637852155068625, - "grad_norm": 8.3125, + "epoch": 15.627674750356633, + "grad_norm": 7.21875, "learning_rate": 1.3737777777777778e-05, - "loss": 0.4965, + "loss": 0.7489, "step": 21910 }, { - "epoch": 2.6390561040211895, - "grad_norm": 5.78125, + "epoch": 15.634807417974322, + "grad_norm": 7.5, "learning_rate": 1.3693333333333333e-05, - "loss": 0.5194, + "loss": 0.7586, "step": 21920 }, { - "epoch": 2.640260052973754, - "grad_norm": 6.75, + "epoch": 15.641940085592012, + "grad_norm": 7.375, "learning_rate": 1.3648888888888888e-05, - "loss": 0.4789, + "loss": 0.6981, "step": 21930 }, { - "epoch": 2.6414640019263183, - "grad_norm": 41.5, + "epoch": 15.649072753209701, + "grad_norm": 8.25, "learning_rate": 1.3604444444444445e-05, - "loss": 0.5768, + "loss": 0.7119, "step": 21940 }, { - "epoch": 2.6426679508788826, - "grad_norm": 8.75, + "epoch": 15.656205420827389, + "grad_norm": 11.6875, "learning_rate": 1.356e-05, - "loss": 0.521, + "loss": 0.8477, "step": 21950 }, { - "epoch": 2.643871899831447, - "grad_norm": 7.53125, + "epoch": 15.663338088445078, + "grad_norm": 5.78125, "learning_rate": 1.3515555555555556e-05, - "loss": 0.5664, + "loss": 0.7083, "step": 21960 }, { - "epoch": 2.6450758487840114, - "grad_norm": 8.9375, + "epoch": 15.670470756062768, + "grad_norm": 6.40625, "learning_rate": 1.3471111111111113e-05, - "loss": 0.5055, + "loss": 0.7326, "step": 21970 }, { - "epoch": 2.646279797736576, - "grad_norm": 7.9375, + "epoch": 15.677603423680456, + "grad_norm": 13.75, "learning_rate": 1.3426666666666668e-05, - "loss": 0.5264, + "loss": 0.8506, "step": 21980 }, { - "epoch": 2.64748374668914, - "grad_norm": 5.3125, + "epoch": 15.684736091298145, + "grad_norm": 10.1875, "learning_rate": 1.3382222222222223e-05, - "loss": 0.5359, + "loss": 0.8144, "step": 21990 }, { - "epoch": 2.648687695641705, - "grad_norm": 6.0625, + "epoch": 15.691868758915835, + "grad_norm": 6.78125, "learning_rate": 1.333777777777778e-05, - "loss": 0.5527, + "loss": 0.735, "step": 22000 }, { - "epoch": 2.648687695641705, - "eval/acc": 39.53488540649414, + "epoch": 15.691868758915835, + "eval/acc": 51.16279220581055, "step": 22000 }, { - "epoch": 2.648687695641705, - "eval_loss": 2.0784599781036377, - "eval_runtime": 0.2153, - "eval_samples_per_second": 199.748, - "eval_steps_per_second": 4.645, + "epoch": 15.691868758915835, + "eval_loss": 2.1133952140808105, + "eval_runtime": 0.2219, + "eval_samples_per_second": 193.737, + "eval_steps_per_second": 4.506, "step": 22000 }, { - "epoch": 2.6498916445942693, - "grad_norm": 14.3125, + "epoch": 15.699001426533524, + "grad_norm": 21.625, "learning_rate": 1.3293333333333332e-05, - "loss": 0.4998, + "loss": 0.7367, "step": 22010 }, { - "epoch": 2.6510955935468337, - "grad_norm": 7.6875, + "epoch": 15.706134094151212, + "grad_norm": 6.25, "learning_rate": 1.3248888888888889e-05, - "loss": 0.522, + "loss": 0.8047, "step": 22020 }, { - "epoch": 2.652299542499398, - "grad_norm": 8.1875, + "epoch": 15.713266761768901, + "grad_norm": 15.8125, "learning_rate": 1.3204444444444444e-05, - "loss": 0.56, + "loss": 0.7082, "step": 22030 }, { - "epoch": 2.6535034914519624, - "grad_norm": 8.5, + "epoch": 15.72039942938659, + "grad_norm": 7.25, "learning_rate": 1.316e-05, - "loss": 0.5443, + "loss": 0.7995, "step": 22040 }, { - "epoch": 2.6547074404045268, - "grad_norm": 5.96875, + "epoch": 15.72753209700428, + "grad_norm": 7.65625, "learning_rate": 1.3115555555555556e-05, - "loss": 0.5608, + "loss": 0.7402, "step": 22050 }, { - "epoch": 2.655911389357091, - "grad_norm": 9.1875, + "epoch": 15.734664764621968, + "grad_norm": 7.5, "learning_rate": 1.3071111111111112e-05, - "loss": 0.511, + "loss": 0.7339, "step": 22060 }, { - "epoch": 2.6571153383096555, - "grad_norm": 5.6875, + "epoch": 15.741797432239657, + "grad_norm": 9.0, "learning_rate": 1.3026666666666667e-05, - "loss": 0.4692, + "loss": 0.7367, "step": 22070 }, { - "epoch": 2.65831928726222, - "grad_norm": 8.5625, + "epoch": 15.748930099857347, + "grad_norm": 16.875, "learning_rate": 1.2982222222222224e-05, - "loss": 0.5169, + "loss": 0.7468, "step": 22080 }, { - "epoch": 2.6595232362147847, - "grad_norm": 23.25, + "epoch": 15.756062767475036, + "grad_norm": 6.53125, "learning_rate": 1.2937777777777779e-05, - "loss": 0.5214, + "loss": 0.6916, "step": 22090 }, { - "epoch": 2.6607271851673486, - "grad_norm": 6.8125, + "epoch": 15.763195435092724, + "grad_norm": 6.625, "learning_rate": 1.2893333333333336e-05, - "loss": 0.6334, + "loss": 0.765, "step": 22100 }, { - "epoch": 2.6607271851673486, - "eval/acc": 41.86046600341797, + "epoch": 15.763195435092724, + "eval/acc": 48.83720779418945, "step": 22100 }, { - "epoch": 2.6607271851673486, - "eval_loss": 2.094914436340332, - "eval_runtime": 0.2133, - "eval_samples_per_second": 201.617, - "eval_steps_per_second": 4.689, + "epoch": 15.763195435092724, + "eval_loss": 2.126471996307373, + "eval_runtime": 0.2307, + "eval_samples_per_second": 186.402, + "eval_steps_per_second": 4.335, "step": 22100 }, { - "epoch": 2.6619311341199134, - "grad_norm": 7.0625, + "epoch": 15.770328102710414, + "grad_norm": 22.75, "learning_rate": 1.2848888888888891e-05, - "loss": 0.5501, + "loss": 0.8085, "step": 22110 }, { - "epoch": 2.663135083072478, - "grad_norm": 8.3125, + "epoch": 15.777460770328103, + "grad_norm": 9.4375, "learning_rate": 1.2804444444444445e-05, - "loss": 0.5178, + "loss": 0.7648, "step": 22120 }, { - "epoch": 2.664339032025042, - "grad_norm": 8.0, + "epoch": 15.78459343794579, + "grad_norm": 6.625, "learning_rate": 1.276e-05, - "loss": 0.5664, + "loss": 0.7852, "step": 22130 }, { - "epoch": 2.6655429809776066, - "grad_norm": 6.65625, + "epoch": 15.79172610556348, + "grad_norm": 10.0, "learning_rate": 1.2715555555555555e-05, - "loss": 0.5001, + "loss": 0.8195, "step": 22140 }, { - "epoch": 2.666746929930171, + "epoch": 15.79885877318117, "grad_norm": 6.59375, "learning_rate": 1.2671111111111112e-05, - "loss": 0.4662, + "loss": 0.7586, "step": 22150 }, { - "epoch": 2.6679508788827353, - "grad_norm": 9.5625, + "epoch": 15.80599144079886, + "grad_norm": 6.84375, "learning_rate": 1.2626666666666667e-05, - "loss": 0.5473, + "loss": 0.6953, "step": 22160 }, { - "epoch": 2.6691548278352997, - "grad_norm": 6.5625, + "epoch": 15.813124108416547, + "grad_norm": 7.90625, "learning_rate": 1.2582222222222222e-05, - "loss": 0.6705, + "loss": 0.7339, "step": 22170 }, { - "epoch": 2.670358776787864, - "grad_norm": 7.75, + "epoch": 15.820256776034237, + "grad_norm": 9.25, "learning_rate": 1.253777777777778e-05, - "loss": 0.5361, + "loss": 0.7394, "step": 22180 }, { - "epoch": 2.6715627257404284, - "grad_norm": 7.59375, + "epoch": 15.827389443651926, + "grad_norm": 26.0, "learning_rate": 1.2493333333333333e-05, - "loss": 0.5734, + "loss": 0.6945, "step": 22190 }, { - "epoch": 2.6727666746929932, - "grad_norm": 10.75, + "epoch": 15.834522111269616, + "grad_norm": 7.96875, "learning_rate": 1.244888888888889e-05, - "loss": 0.5943, + "loss": 0.7714, "step": 22200 }, { - "epoch": 2.6727666746929932, - "eval/acc": 39.53488540649414, + "epoch": 15.834522111269616, + "eval/acc": 51.16279220581055, "step": 22200 }, { - "epoch": 2.6727666746929932, - "eval_loss": 2.097273349761963, - "eval_runtime": 0.2158, - "eval_samples_per_second": 199.222, - "eval_steps_per_second": 4.633, + "epoch": 15.834522111269616, + "eval_loss": 2.120994806289673, + "eval_runtime": 0.2302, + "eval_samples_per_second": 186.806, + "eval_steps_per_second": 4.344, "step": 22200 }, { - "epoch": 2.673970623645557, - "grad_norm": 6.34375, + "epoch": 15.841654778887303, + "grad_norm": 5.5, "learning_rate": 1.2404444444444445e-05, - "loss": 0.4849, + "loss": 0.7303, "step": 22210 }, { - "epoch": 2.675174572598122, - "grad_norm": 9.875, + "epoch": 15.848787446504993, + "grad_norm": 4.625, "learning_rate": 1.236e-05, - "loss": 0.5778, + "loss": 0.6918, "step": 22220 }, { - "epoch": 2.6763785215506863, - "grad_norm": 10.625, + "epoch": 15.855920114122682, + "grad_norm": 10.25, "learning_rate": 1.2315555555555557e-05, - "loss": 0.6242, + "loss": 0.676, "step": 22230 }, { - "epoch": 2.6775824705032507, - "grad_norm": 7.6875, + "epoch": 15.863052781740372, + "grad_norm": 9.0, "learning_rate": 1.2271111111111112e-05, - "loss": 0.5268, + "loss": 0.7605, "step": 22240 }, { - "epoch": 2.678786419455815, - "grad_norm": 9.5, + "epoch": 15.87018544935806, + "grad_norm": 5.78125, "learning_rate": 1.2226666666666668e-05, - "loss": 0.6594, + "loss": 0.7128, "step": 22250 }, { - "epoch": 2.6799903684083795, - "grad_norm": 8.0, + "epoch": 15.877318116975749, + "grad_norm": 6.8125, "learning_rate": 1.2182222222222223e-05, - "loss": 0.576, + "loss": 0.7533, "step": 22260 }, { - "epoch": 2.681194317360944, - "grad_norm": 6.59375, + "epoch": 15.884450784593438, + "grad_norm": 34.0, "learning_rate": 1.2137777777777778e-05, - "loss": 0.5592, + "loss": 0.8246, "step": 22270 }, { - "epoch": 2.682398266313508, - "grad_norm": 8.0, + "epoch": 15.891583452211126, + "grad_norm": 8.4375, "learning_rate": 1.2093333333333335e-05, - "loss": 0.498, + "loss": 0.7453, "step": 22280 }, { - "epoch": 2.6836022152660726, - "grad_norm": 6.46875, + "epoch": 15.898716119828816, + "grad_norm": 9.5, "learning_rate": 1.204888888888889e-05, - "loss": 0.4487, + "loss": 0.7955, "step": 22290 }, { - "epoch": 2.684806164218637, - "grad_norm": 9.125, + "epoch": 15.905848787446505, + "grad_norm": 9.5, "learning_rate": 1.2004444444444444e-05, - "loss": 0.5582, + "loss": 0.794, "step": 22300 }, { - "epoch": 2.684806164218637, - "eval/acc": 40.11627960205078, + "epoch": 15.905848787446505, + "eval/acc": 51.16279220581055, "step": 22300 }, { - "epoch": 2.684806164218637, - "eval_loss": 2.089059829711914, - "eval_runtime": 0.2142, - "eval_samples_per_second": 200.726, - "eval_steps_per_second": 4.668, + "epoch": 15.905848787446505, + "eval_loss": 2.1340696811676025, + "eval_runtime": 7.8966, + "eval_samples_per_second": 5.445, + "eval_steps_per_second": 0.127, "step": 22300 }, { - "epoch": 2.6860101131712018, - "grad_norm": 9.0625, + "epoch": 15.912981455064195, + "grad_norm": 6.8125, "learning_rate": 1.196e-05, - "loss": 0.5141, + "loss": 0.7258, "step": 22310 }, { - "epoch": 2.6872140621237657, - "grad_norm": 7.71875, + "epoch": 15.920114122681882, + "grad_norm": 8.1875, "learning_rate": 1.1915555555555556e-05, - "loss": 0.568, + "loss": 0.7714, "step": 22320 }, { - "epoch": 2.6884180110763305, - "grad_norm": 6.65625, + "epoch": 15.927246790299572, + "grad_norm": 24.0, "learning_rate": 1.1871111111111113e-05, - "loss": 0.491, + "loss": 0.8007, "step": 22330 }, { - "epoch": 2.689621960028895, - "grad_norm": 8.25, + "epoch": 15.934379457917261, + "grad_norm": 8.125, "learning_rate": 1.1826666666666668e-05, - "loss": 0.6707, + "loss": 0.7889, "step": 22340 }, { - "epoch": 2.6908259089814592, - "grad_norm": 14.625, + "epoch": 15.94151212553495, + "grad_norm": 7.625, "learning_rate": 1.1782222222222222e-05, - "loss": 0.563, + "loss": 0.8811, "step": 22350 }, { - "epoch": 2.6920298579340236, - "grad_norm": 9.25, + "epoch": 15.948644793152638, + "grad_norm": 7.78125, "learning_rate": 1.1737777777777779e-05, - "loss": 0.5388, + "loss": 0.7475, "step": 22360 }, { - "epoch": 2.693233806886588, - "grad_norm": 6.125, + "epoch": 15.955777460770328, + "grad_norm": 7.59375, "learning_rate": 1.1693333333333334e-05, - "loss": 0.5783, + "loss": 0.8789, "step": 22370 }, { - "epoch": 2.6944377558391523, - "grad_norm": 10.25, + "epoch": 15.962910128388017, + "grad_norm": 10.125, "learning_rate": 1.1648888888888889e-05, - "loss": 0.5629, + "loss": 0.695, "step": 22380 }, { - "epoch": 2.6956417047917167, - "grad_norm": 14.75, + "epoch": 15.970042796005707, + "grad_norm": 8.125, "learning_rate": 1.1604444444444446e-05, - "loss": 0.5353, + "loss": 0.8267, "step": 22390 }, { - "epoch": 2.696845653744281, - "grad_norm": 8.8125, + "epoch": 15.977175463623395, + "grad_norm": 17.125, "learning_rate": 1.156e-05, - "loss": 0.6026, + "loss": 0.786, "step": 22400 }, { - "epoch": 2.696845653744281, - "eval/acc": 41.86046600341797, + "epoch": 15.977175463623395, + "eval/acc": 51.16279220581055, "step": 22400 }, { - "epoch": 2.696845653744281, - "eval_loss": 2.0810489654541016, - "eval_runtime": 0.2107, - "eval_samples_per_second": 204.128, - "eval_steps_per_second": 4.747, + "epoch": 15.977175463623395, + "eval_loss": 2.109499216079712, + "eval_runtime": 0.2295, + "eval_samples_per_second": 187.377, + "eval_steps_per_second": 4.358, "step": 22400 }, { - "epoch": 2.6980496026968455, - "grad_norm": 6.6875, + "epoch": 15.984308131241084, + "grad_norm": 9.8125, "learning_rate": 1.1515555555555556e-05, - "loss": 0.5557, + "loss": 0.8732, "step": 22410 }, { - "epoch": 2.6992535516494103, - "grad_norm": 8.25, + "epoch": 15.991440798858774, + "grad_norm": 59.25, "learning_rate": 1.1471111111111112e-05, - "loss": 0.5954, + "loss": 0.7541, "step": 22420 }, { - "epoch": 2.700457500601974, - "grad_norm": 5.59375, + "epoch": 15.998573466476461, + "grad_norm": 9.9375, "learning_rate": 1.1426666666666667e-05, - "loss": 0.5131, + "loss": 0.7847, "step": 22430 }, { - "epoch": 2.701661449554539, - "grad_norm": 8.125, + "epoch": 16.00570613409415, + "grad_norm": 7.5625, "learning_rate": 1.1382222222222224e-05, - "loss": 0.7013, + "loss": 0.7616, "step": 22440 }, { - "epoch": 2.7028653985071034, - "grad_norm": 7.46875, + "epoch": 16.01283880171184, + "grad_norm": 7.90625, "learning_rate": 1.1337777777777777e-05, - "loss": 0.5817, + "loss": 0.7538, "step": 22450 }, { - "epoch": 2.7040693474596678, - "grad_norm": 6.46875, + "epoch": 16.01997146932953, + "grad_norm": 5.1875, "learning_rate": 1.1293333333333334e-05, - "loss": 0.4625, + "loss": 0.6573, "step": 22460 }, { - "epoch": 2.705273296412232, - "grad_norm": 13.4375, + "epoch": 16.02710413694722, + "grad_norm": 6.4375, "learning_rate": 1.124888888888889e-05, - "loss": 0.5075, + "loss": 0.6845, "step": 22470 }, { - "epoch": 2.7064772453647965, - "grad_norm": 9.0625, + "epoch": 16.03423680456491, + "grad_norm": 6.875, "learning_rate": 1.1204444444444445e-05, - "loss": 0.6094, + "loss": 0.7176, "step": 22480 }, { - "epoch": 2.707681194317361, - "grad_norm": 10.875, + "epoch": 16.041369472182595, + "grad_norm": 7.8125, "learning_rate": 1.1160000000000002e-05, - "loss": 0.5529, + "loss": 0.6968, "step": 22490 }, { - "epoch": 2.7088851432699252, - "grad_norm": 5.90625, + "epoch": 16.048502139800284, + "grad_norm": 7.3125, "learning_rate": 1.1115555555555555e-05, - "loss": 0.5454, + "loss": 0.7667, "step": 22500 }, { - "epoch": 2.7088851432699252, - "eval/acc": 40.11627960205078, + "epoch": 16.048502139800284, + "eval/acc": 46.511627197265625, "step": 22500 }, { - "epoch": 2.7088851432699252, - "eval_loss": 2.0772321224212646, - "eval_runtime": 0.2135, - "eval_samples_per_second": 201.414, - "eval_steps_per_second": 4.684, + "epoch": 16.048502139800284, + "eval_loss": 2.3885626792907715, + "eval_runtime": 7.6399, + "eval_samples_per_second": 5.628, + "eval_steps_per_second": 0.131, "step": 22500 }, { - "epoch": 2.7100890922224896, - "grad_norm": 8.1875, + "epoch": 16.055634807417974, + "grad_norm": 8.125, "learning_rate": 1.1071111111111112e-05, - "loss": 0.5451, + "loss": 0.6981, "step": 22510 }, { - "epoch": 2.711293041175054, - "grad_norm": 8.125, + "epoch": 16.062767475035663, + "grad_norm": 8.0, "learning_rate": 1.1026666666666667e-05, - "loss": 0.533, + "loss": 0.8857, "step": 22520 }, { - "epoch": 2.712496990127619, - "grad_norm": 7.125, + "epoch": 16.069900142653353, + "grad_norm": 7.21875, "learning_rate": 1.0982222222222222e-05, - "loss": 0.5324, + "loss": 0.7247, "step": 22530 }, { - "epoch": 2.7137009390801827, - "grad_norm": 6.625, + "epoch": 16.077032810271042, + "grad_norm": 8.8125, "learning_rate": 1.093777777777778e-05, - "loss": 0.5044, + "loss": 0.7655, "step": 22540 }, { - "epoch": 2.7149048880327475, - "grad_norm": 5.90625, + "epoch": 16.08416547788873, + "grad_norm": 8.3125, "learning_rate": 1.0893333333333333e-05, - "loss": 0.4005, + "loss": 0.7946, "step": 22550 }, { - "epoch": 2.716108836985312, - "grad_norm": 8.3125, + "epoch": 16.091298145506418, + "grad_norm": 7.09375, "learning_rate": 1.0848888888888888e-05, - "loss": 0.524, + "loss": 0.7194, "step": 22560 }, { - "epoch": 2.7173127859378763, - "grad_norm": 7.71875, + "epoch": 16.098430813124107, + "grad_norm": 6.9375, "learning_rate": 1.0804444444444445e-05, - "loss": 0.586, + "loss": 0.719, "step": 22570 }, { - "epoch": 2.7185167348904407, - "grad_norm": 8.5, + "epoch": 16.105563480741797, + "grad_norm": 8.0, "learning_rate": 1.076e-05, - "loss": 0.6012, + "loss": 0.7125, "step": 22580 }, { - "epoch": 2.719720683843005, - "grad_norm": 6.25, + "epoch": 16.112696148359486, + "grad_norm": 8.125, "learning_rate": 1.0715555555555557e-05, - "loss": 0.4667, + "loss": 0.8118, "step": 22590 }, { - "epoch": 2.7209246327955694, - "grad_norm": 6.78125, + "epoch": 16.119828815977176, + "grad_norm": 8.375, "learning_rate": 1.0671111111111112e-05, - "loss": 0.5742, + "loss": 0.7747, "step": 22600 }, { - "epoch": 2.7209246327955694, - "eval/acc": 43.604652404785156, + "epoch": 16.119828815977176, + "eval/acc": 46.511627197265625, "step": 22600 }, { - "epoch": 2.7209246327955694, - "eval_loss": 2.0858654975891113, - "eval_runtime": 0.2105, - "eval_samples_per_second": 204.323, - "eval_steps_per_second": 4.752, + "epoch": 16.119828815977176, + "eval_loss": 2.3585238456726074, + "eval_runtime": 0.2224, + "eval_samples_per_second": 193.314, + "eval_steps_per_second": 4.496, "step": 22600 }, { - "epoch": 2.7221285817481338, - "grad_norm": 8.1875, + "epoch": 16.126961483594865, + "grad_norm": 8.3125, "learning_rate": 1.0626666666666666e-05, - "loss": 0.6255, + "loss": 0.7209, "step": 22610 }, { - "epoch": 2.723332530700698, - "grad_norm": 8.125, + "epoch": 16.134094151212555, + "grad_norm": 6.9375, "learning_rate": 1.0582222222222223e-05, - "loss": 0.5842, + "loss": 0.7399, "step": 22620 }, { - "epoch": 2.7245364796532625, - "grad_norm": 7.375, + "epoch": 16.141226818830244, + "grad_norm": 8.25, "learning_rate": 1.0537777777777778e-05, - "loss": 0.4976, + "loss": 0.7628, "step": 22630 }, { - "epoch": 2.7257404286058273, - "grad_norm": 6.15625, + "epoch": 16.14835948644793, + "grad_norm": 7.03125, "learning_rate": 1.0493333333333333e-05, - "loss": 0.5504, + "loss": 0.7365, "step": 22640 }, { - "epoch": 2.7269443775583913, - "grad_norm": 12.5625, + "epoch": 16.15549215406562, + "grad_norm": 7.625, "learning_rate": 1.044888888888889e-05, - "loss": 0.6735, + "loss": 0.7585, "step": 22650 }, { - "epoch": 2.728148326510956, - "grad_norm": 7.71875, + "epoch": 16.16262482168331, + "grad_norm": 7.40625, "learning_rate": 1.0404444444444444e-05, - "loss": 0.5324, + "loss": 0.7341, "step": 22660 }, { - "epoch": 2.7293522754635204, - "grad_norm": 9.125, + "epoch": 16.169757489301, + "grad_norm": 6.875, "learning_rate": 1.036e-05, - "loss": 0.5544, + "loss": 0.8297, "step": 22670 }, { - "epoch": 2.730556224416085, - "grad_norm": 10.6875, + "epoch": 16.176890156918688, + "grad_norm": 8.9375, "learning_rate": 1.0315555555555556e-05, - "loss": 0.6203, + "loss": 0.8216, "step": 22680 }, { - "epoch": 2.731760173368649, - "grad_norm": 6.9375, + "epoch": 16.184022824536378, + "grad_norm": 12.625, "learning_rate": 1.0271111111111111e-05, - "loss": 0.5019, + "loss": 0.6936, "step": 22690 }, { - "epoch": 2.7329641223212136, - "grad_norm": 8.0625, + "epoch": 16.191155492154067, + "grad_norm": 5.53125, "learning_rate": 1.0226666666666668e-05, - "loss": 0.5176, + "loss": 0.8264, "step": 22700 }, { - "epoch": 2.7329641223212136, - "eval/acc": 39.53488540649414, + "epoch": 16.191155492154067, + "eval/acc": 46.511627197265625, "step": 22700 }, { - "epoch": 2.7329641223212136, - "eval_loss": 2.092508554458618, - "eval_runtime": 0.2145, - "eval_samples_per_second": 200.459, - "eval_steps_per_second": 4.662, + "epoch": 16.191155492154067, + "eval_loss": 2.3709049224853516, + "eval_runtime": 0.2215, + "eval_samples_per_second": 194.132, + "eval_steps_per_second": 4.515, "step": 22700 }, { - "epoch": 2.734168071273778, - "grad_norm": 14.125, + "epoch": 16.198288159771753, + "grad_norm": 7.3125, "learning_rate": 1.0182222222222222e-05, - "loss": 0.5801, + "loss": 0.7458, "step": 22710 }, { - "epoch": 2.7353720202263423, - "grad_norm": 8.9375, + "epoch": 16.205420827389442, + "grad_norm": 8.625, "learning_rate": 1.0137777777777779e-05, - "loss": 0.6386, + "loss": 0.8079, "step": 22720 }, { - "epoch": 2.7365759691789067, - "grad_norm": 8.5625, + "epoch": 16.212553495007132, + "grad_norm": 7.53125, "learning_rate": 1.0093333333333334e-05, - "loss": 0.5935, + "loss": 0.7028, "step": 22730 }, { - "epoch": 2.737779918131471, - "grad_norm": 8.375, + "epoch": 16.21968616262482, + "grad_norm": 8.1875, "learning_rate": 1.0048888888888889e-05, - "loss": 0.53, + "loss": 0.76, "step": 22740 }, { - "epoch": 2.738983867084036, - "grad_norm": 7.09375, + "epoch": 16.22681883024251, + "grad_norm": 9.0625, "learning_rate": 1.0004444444444446e-05, - "loss": 0.5095, + "loss": 0.8226, "step": 22750 }, { - "epoch": 2.7401878160365998, - "grad_norm": 9.8125, + "epoch": 16.2339514978602, + "grad_norm": 7.53125, "learning_rate": 9.96e-06, - "loss": 0.5161, + "loss": 0.7254, "step": 22760 }, { - "epoch": 2.7413917649891646, - "grad_norm": 7.90625, + "epoch": 16.24108416547789, + "grad_norm": 42.5, "learning_rate": 9.915555555555556e-06, - "loss": 0.6324, + "loss": 0.7925, "step": 22770 }, { - "epoch": 2.742595713941729, - "grad_norm": 8.8125, + "epoch": 16.24821683309558, + "grad_norm": 13.0, "learning_rate": 9.871111111111112e-06, - "loss": 0.5563, + "loss": 0.8108, "step": 22780 }, { - "epoch": 2.7437996628942933, + "epoch": 16.255349500713265, "grad_norm": 6.875, "learning_rate": 9.826666666666667e-06, - "loss": 0.5627, + "loss": 0.8118, "step": 22790 }, { - "epoch": 2.7450036118468577, - "grad_norm": 7.21875, + "epoch": 16.262482168330955, + "grad_norm": 8.25, "learning_rate": 9.782222222222224e-06, - "loss": 0.4989, + "loss": 0.7531, "step": 22800 }, { - "epoch": 2.7450036118468577, - "eval/acc": 41.86046600341797, + "epoch": 16.262482168330955, + "eval/acc": 44.1860466003418, "step": 22800 }, { - "epoch": 2.7450036118468577, - "eval_loss": 2.08249831199646, - "eval_runtime": 0.2048, - "eval_samples_per_second": 209.969, - "eval_steps_per_second": 4.883, + "epoch": 16.262482168330955, + "eval_loss": 2.376155376434326, + "eval_runtime": 0.238, + "eval_samples_per_second": 180.665, + "eval_steps_per_second": 4.202, "step": 22800 }, { - "epoch": 2.746207560799422, - "grad_norm": 6.84375, + "epoch": 16.269614835948644, + "grad_norm": 8.5, "learning_rate": 9.737777777777777e-06, - "loss": 0.5058, + "loss": 0.829, "step": 22810 }, { - "epoch": 2.7474115097519864, - "grad_norm": 7.78125, + "epoch": 16.276747503566334, + "grad_norm": 7.21875, "learning_rate": 9.693333333333334e-06, - "loss": 0.5595, + "loss": 0.6652, "step": 22820 }, { - "epoch": 2.748615458704551, - "grad_norm": 8.0625, + "epoch": 16.283880171184023, + "grad_norm": 8.125, "learning_rate": 9.64888888888889e-06, - "loss": 0.5438, + "loss": 0.7132, "step": 22830 }, { - "epoch": 2.749819407657115, - "grad_norm": 7.78125, + "epoch": 16.291012838801713, + "grad_norm": 9.3125, "learning_rate": 9.604444444444445e-06, - "loss": 0.547, + "loss": 0.6487, "step": 22840 }, { - "epoch": 2.7510233566096796, - "grad_norm": 7.6875, + "epoch": 16.298145506419402, + "grad_norm": 10.0625, "learning_rate": 9.560000000000002e-06, - "loss": 0.5997, + "loss": 0.7387, "step": 22850 }, { - "epoch": 2.7522273055622444, - "grad_norm": 6.9375, + "epoch": 16.30527817403709, + "grad_norm": 48.5, "learning_rate": 9.515555555555555e-06, - "loss": 0.5666, + "loss": 0.7641, "step": 22860 }, { - "epoch": 2.7534312545148083, - "grad_norm": 5.375, + "epoch": 16.312410841654778, + "grad_norm": 9.4375, "learning_rate": 9.47111111111111e-06, - "loss": 0.5716, + "loss": 0.7797, "step": 22870 }, { - "epoch": 2.754635203467373, - "grad_norm": 8.0625, + "epoch": 16.319543509272467, + "grad_norm": 6.84375, "learning_rate": 9.426666666666667e-06, - "loss": 0.5906, + "loss": 0.7955, "step": 22880 }, { - "epoch": 2.7558391524199375, - "grad_norm": 9.8125, + "epoch": 16.326676176890157, + "grad_norm": 6.6875, "learning_rate": 9.382222222222223e-06, - "loss": 0.5665, + "loss": 0.6976, "step": 22890 }, { - "epoch": 2.757043101372502, - "grad_norm": 7.71875, + "epoch": 16.333808844507846, + "grad_norm": 7.90625, "learning_rate": 9.337777777777778e-06, - "loss": 0.514, + "loss": 0.7808, "step": 22900 }, { - "epoch": 2.757043101372502, - "eval/acc": 39.53488540649414, + "epoch": 16.333808844507846, + "eval/acc": 46.511627197265625, "step": 22900 }, { - "epoch": 2.757043101372502, - "eval_loss": 2.084805727005005, - "eval_runtime": 0.213, - "eval_samples_per_second": 201.852, - "eval_steps_per_second": 4.694, + "epoch": 16.333808844507846, + "eval_loss": 2.3756563663482666, + "eval_runtime": 0.2253, + "eval_samples_per_second": 190.817, + "eval_steps_per_second": 4.438, "step": 22900 }, { - "epoch": 2.7582470503250662, - "grad_norm": 8.875, + "epoch": 16.340941512125536, + "grad_norm": 8.25, "learning_rate": 9.293333333333335e-06, - "loss": 0.5587, + "loss": 0.743, "step": 22910 }, { - "epoch": 2.7594509992776306, - "grad_norm": 8.125, + "epoch": 16.348074179743225, + "grad_norm": 7.4375, "learning_rate": 9.248888888888888e-06, - "loss": 0.549, + "loss": 0.7825, "step": 22920 }, { - "epoch": 2.760654948230195, - "grad_norm": 9.0, + "epoch": 16.355206847360915, + "grad_norm": 32.25, "learning_rate": 9.204444444444445e-06, - "loss": 0.5719, + "loss": 0.7654, "step": 22930 }, { - "epoch": 2.7618588971827593, - "grad_norm": 8.0, + "epoch": 16.3623395149786, + "grad_norm": 7.6875, "learning_rate": 9.16e-06, - "loss": 0.5532, + "loss": 0.7186, "step": 22940 }, { - "epoch": 2.7630628461353237, - "grad_norm": 8.25, + "epoch": 16.36947218259629, + "grad_norm": 8.125, "learning_rate": 9.115555555555556e-06, - "loss": 0.5176, + "loss": 0.7137, "step": 22950 }, { - "epoch": 2.764266795087888, - "grad_norm": 8.25, + "epoch": 16.37660485021398, + "grad_norm": 7.375, "learning_rate": 9.071111111111113e-06, - "loss": 0.5235, + "loss": 0.7046, "step": 22960 }, { - "epoch": 2.765470744040453, - "grad_norm": 10.0625, + "epoch": 16.38373751783167, + "grad_norm": 7.59375, "learning_rate": 9.026666666666666e-06, - "loss": 0.5128, + "loss": 0.8367, "step": 22970 }, { - "epoch": 2.766674692993017, - "grad_norm": 6.90625, + "epoch": 16.39087018544936, + "grad_norm": 14.4375, "learning_rate": 8.982222222222223e-06, - "loss": 0.5114, + "loss": 0.8183, "step": 22980 }, { - "epoch": 2.7678786419455816, - "grad_norm": 7.59375, + "epoch": 16.398002853067048, + "grad_norm": 6.8125, "learning_rate": 8.937777777777778e-06, - "loss": 0.5895, + "loss": 0.6925, "step": 22990 }, { - "epoch": 2.769082590898146, - "grad_norm": 8.0, + "epoch": 16.405135520684738, + "grad_norm": 8.6875, "learning_rate": 8.893333333333333e-06, - "loss": 0.6213, + "loss": 0.7486, "step": 23000 }, { - "epoch": 2.769082590898146, - "eval/acc": 41.27906799316406, + "epoch": 16.405135520684738, + "eval/acc": 46.511627197265625, "step": 23000 }, { - "epoch": 2.769082590898146, - "eval_loss": 2.093411445617676, - "eval_runtime": 0.2469, - "eval_samples_per_second": 174.137, - "eval_steps_per_second": 4.05, + "epoch": 16.405135520684738, + "eval_loss": 2.3669910430908203, + "eval_runtime": 0.2182, + "eval_samples_per_second": 197.042, + "eval_steps_per_second": 4.582, "step": 23000 }, { - "epoch": 2.7702865398507104, - "grad_norm": 7.625, + "epoch": 16.412268188302424, + "grad_norm": 7.15625, "learning_rate": 8.84888888888889e-06, - "loss": 0.5952, + "loss": 0.7187, "step": 23010 }, { - "epoch": 2.7714904888032748, - "grad_norm": 11.375, + "epoch": 16.419400855920113, + "grad_norm": 7.6875, "learning_rate": 8.804444444444444e-06, - "loss": 0.5705, + "loss": 0.8066, "step": 23020 }, { - "epoch": 2.772694437755839, - "grad_norm": 7.65625, + "epoch": 16.426533523537802, + "grad_norm": 6.6875, "learning_rate": 8.76e-06, - "loss": 0.5177, + "loss": 0.7569, "step": 23030 }, { - "epoch": 2.7738983867084035, - "grad_norm": 8.5625, + "epoch": 16.433666191155492, + "grad_norm": 7.28125, "learning_rate": 8.715555555555556e-06, - "loss": 0.5716, + "loss": 0.8201, "step": 23040 }, { - "epoch": 2.775102335660968, - "grad_norm": 6.28125, + "epoch": 16.44079885877318, + "grad_norm": 7.03125, "learning_rate": 8.671111111111111e-06, - "loss": 0.5841, + "loss": 0.7451, "step": 23050 }, { - "epoch": 2.7763062846135322, - "grad_norm": 9.8125, + "epoch": 16.44793152639087, + "grad_norm": 7.3125, "learning_rate": 8.626666666666668e-06, - "loss": 0.6857, + "loss": 0.8503, "step": 23060 }, { - "epoch": 2.7775102335660966, - "grad_norm": 7.03125, + "epoch": 16.45506419400856, + "grad_norm": 6.90625, "learning_rate": 8.582222222222222e-06, - "loss": 0.5511, + "loss": 0.7842, "step": 23070 }, { - "epoch": 2.7787141825186614, - "grad_norm": 8.75, + "epoch": 16.46219686162625, + "grad_norm": 6.15625, "learning_rate": 8.537777777777779e-06, - "loss": 0.6073, + "loss": 0.7893, "step": 23080 }, { - "epoch": 2.7799181314712254, - "grad_norm": 8.75, + "epoch": 16.469329529243936, + "grad_norm": 8.1875, "learning_rate": 8.493333333333334e-06, - "loss": 0.5348, + "loss": 0.7195, "step": 23090 }, { - "epoch": 2.78112208042379, - "grad_norm": 7.3125, + "epoch": 16.476462196861625, + "grad_norm": 10.3125, "learning_rate": 8.448888888888889e-06, - "loss": 0.5881, + "loss": 0.7708, "step": 23100 }, { - "epoch": 2.78112208042379, - "eval/acc": 41.86046600341797, + "epoch": 16.476462196861625, + "eval/acc": 46.511627197265625, "step": 23100 }, { - "epoch": 2.78112208042379, - "eval_loss": 2.075512170791626, - "eval_runtime": 0.2315, - "eval_samples_per_second": 185.726, - "eval_steps_per_second": 4.319, + "epoch": 16.476462196861625, + "eval_loss": 2.35764741897583, + "eval_runtime": 0.2204, + "eval_samples_per_second": 195.143, + "eval_steps_per_second": 4.538, "step": 23100 }, { - "epoch": 2.7823260293763545, - "grad_norm": 7.84375, + "epoch": 16.483594864479315, + "grad_norm": 7.40625, "learning_rate": 8.404444444444446e-06, - "loss": 0.5666, + "loss": 0.6914, "step": 23110 }, { - "epoch": 2.783529978328919, - "grad_norm": 8.0, + "epoch": 16.490727532097004, + "grad_norm": 9.75, "learning_rate": 8.36e-06, - "loss": 0.5699, + "loss": 0.7454, "step": 23120 }, { - "epoch": 2.7847339272814833, - "grad_norm": 8.875, + "epoch": 16.497860199714694, + "grad_norm": 10.4375, "learning_rate": 8.315555555555555e-06, - "loss": 0.6619, + "loss": 0.793, "step": 23130 }, { - "epoch": 2.7859378762340476, - "grad_norm": 9.625, + "epoch": 16.504992867332383, + "grad_norm": 7.3125, "learning_rate": 8.271111111111112e-06, - "loss": 0.562, + "loss": 0.7618, "step": 23140 }, { - "epoch": 2.787141825186612, - "grad_norm": 7.9375, + "epoch": 16.512125534950073, + "grad_norm": 8.3125, "learning_rate": 8.226666666666667e-06, - "loss": 0.5356, + "loss": 0.8735, "step": 23150 }, { - "epoch": 2.7883457741391764, - "grad_norm": 4.53125, + "epoch": 16.51925820256776, + "grad_norm": 6.5625, "learning_rate": 8.182222222222224e-06, - "loss": 0.5491, + "loss": 0.7696, "step": 23160 }, { - "epoch": 2.7895497230917408, - "grad_norm": 9.375, + "epoch": 16.52639087018545, + "grad_norm": 7.0, "learning_rate": 8.137777777777777e-06, - "loss": 0.646, + "loss": 0.7254, "step": 23170 }, { - "epoch": 2.790753672044305, - "grad_norm": 6.5, + "epoch": 16.533523537803138, + "grad_norm": 6.09375, "learning_rate": 8.093333333333333e-06, - "loss": 0.5323, + "loss": 0.8109, "step": 23180 }, { - "epoch": 2.79195762099687, - "grad_norm": 11.1875, + "epoch": 16.540656205420827, + "grad_norm": 6.84375, "learning_rate": 8.04888888888889e-06, - "loss": 0.5733, + "loss": 0.7784, "step": 23190 }, { - "epoch": 2.793161569949434, - "grad_norm": 6.9375, + "epoch": 16.547788873038517, + "grad_norm": 8.0, "learning_rate": 8.004444444444445e-06, - "loss": 0.6876, + "loss": 0.7301, "step": 23200 }, { - "epoch": 2.793161569949434, - "eval/acc": 39.53488540649414, + "epoch": 16.547788873038517, + "eval/acc": 46.511627197265625, "step": 23200 }, { - "epoch": 2.793161569949434, - "eval_loss": 2.0778017044067383, - "eval_runtime": 0.2074, - "eval_samples_per_second": 207.358, - "eval_steps_per_second": 4.822, + "epoch": 16.547788873038517, + "eval_loss": 2.3700337409973145, + "eval_runtime": 0.2463, + "eval_samples_per_second": 174.56, + "eval_steps_per_second": 4.06, "step": 23200 }, { - "epoch": 2.7943655189019987, - "grad_norm": 7.8125, + "epoch": 16.554921540656206, + "grad_norm": 7.96875, "learning_rate": 7.96e-06, - "loss": 0.5402, + "loss": 0.7905, "step": 23210 }, { - "epoch": 2.795569467854563, - "grad_norm": 6.375, + "epoch": 16.562054208273896, + "grad_norm": 7.9375, "learning_rate": 7.915555555555557e-06, - "loss": 0.5113, + "loss": 0.7493, "step": 23220 }, { - "epoch": 2.7967734168071274, - "grad_norm": 8.75, + "epoch": 16.56918687589158, + "grad_norm": 7.03125, "learning_rate": 7.87111111111111e-06, - "loss": 0.6413, + "loss": 0.7384, "step": 23230 }, { - "epoch": 2.797977365759692, - "grad_norm": 7.96875, + "epoch": 16.57631954350927, + "grad_norm": 9.1875, "learning_rate": 7.826666666666667e-06, - "loss": 0.5623, + "loss": 0.8211, "step": 23240 }, { - "epoch": 2.799181314712256, - "grad_norm": 7.5625, + "epoch": 16.58345221112696, + "grad_norm": 53.75, "learning_rate": 7.782222222222223e-06, - "loss": 0.5519, + "loss": 0.8433, "step": 23250 }, { - "epoch": 2.8003852636648205, - "grad_norm": 11.375, + "epoch": 16.59058487874465, + "grad_norm": 8.125, "learning_rate": 7.737777777777778e-06, - "loss": 0.6279, + "loss": 0.7178, "step": 23260 }, { - "epoch": 2.801589212617385, - "grad_norm": 6.40625, + "epoch": 16.59771754636234, + "grad_norm": 6.34375, "learning_rate": 7.693333333333335e-06, - "loss": 0.5625, + "loss": 0.7261, "step": 23270 }, { - "epoch": 2.8027931615699493, - "grad_norm": 8.5625, + "epoch": 16.60485021398003, + "grad_norm": 6.59375, "learning_rate": 7.648888888888888e-06, - "loss": 0.5104, + "loss": 0.7469, "step": 23280 }, { - "epoch": 2.8039971105225137, - "grad_norm": 8.6875, + "epoch": 16.61198288159772, + "grad_norm": 5.15625, "learning_rate": 7.604444444444444e-06, - "loss": 0.4718, + "loss": 0.6811, "step": 23290 }, { - "epoch": 2.8052010594750785, - "grad_norm": 6.5625, + "epoch": 16.619115549215408, + "grad_norm": 19.875, "learning_rate": 7.5600000000000005e-06, - "loss": 0.5333, + "loss": 0.7753, "step": 23300 }, { - "epoch": 2.8052010594750785, - "eval/acc": 39.53488540649414, + "epoch": 16.619115549215408, + "eval/acc": 46.511627197265625, "step": 23300 }, { - "epoch": 2.8052010594750785, - "eval_loss": 2.085590362548828, - "eval_runtime": 0.259, - "eval_samples_per_second": 166.022, - "eval_steps_per_second": 3.861, + "epoch": 16.619115549215408, + "eval_loss": 2.3815970420837402, + "eval_runtime": 0.2316, + "eval_samples_per_second": 185.668, + "eval_steps_per_second": 4.318, "step": 23300 }, { - "epoch": 2.8064050084276424, - "grad_norm": 10.375, + "epoch": 16.626248216833094, + "grad_norm": 18.875, "learning_rate": 7.5155555555555565e-06, - "loss": 0.501, + "loss": 0.7564, "step": 23310 }, { - "epoch": 2.807608957380207, - "grad_norm": 8.75, + "epoch": 16.633380884450784, + "grad_norm": 4.875, "learning_rate": 7.471111111111112e-06, - "loss": 0.586, + "loss": 0.6863, "step": 23320 }, { - "epoch": 2.8088129063327716, - "grad_norm": 6.78125, + "epoch": 16.640513552068473, + "grad_norm": 7.25, "learning_rate": 7.426666666666666e-06, - "loss": 0.5707, + "loss": 0.8129, "step": 23330 }, { - "epoch": 2.810016855285336, - "grad_norm": 8.6875, + "epoch": 16.647646219686163, + "grad_norm": 8.0, "learning_rate": 7.382222222222222e-06, - "loss": 0.607, + "loss": 0.7198, "step": 23340 }, { - "epoch": 2.8112208042379003, - "grad_norm": 8.25, + "epoch": 16.654778887303852, + "grad_norm": 5.78125, "learning_rate": 7.337777777777778e-06, - "loss": 0.5406, + "loss": 0.7266, "step": 23350 }, { - "epoch": 2.8124247531904647, - "grad_norm": 8.1875, + "epoch": 16.66191155492154, + "grad_norm": 7.53125, "learning_rate": 7.293333333333334e-06, - "loss": 0.6398, + "loss": 0.7642, "step": 23360 }, { - "epoch": 2.813628702143029, - "grad_norm": 4.5625, + "epoch": 16.66904422253923, + "grad_norm": 7.40625, "learning_rate": 7.24888888888889e-06, - "loss": 0.5254, + "loss": 0.7696, "step": 23370 }, { - "epoch": 2.8148326510955934, - "grad_norm": 9.375, + "epoch": 16.676176890156917, + "grad_norm": 10.75, "learning_rate": 7.204444444444444e-06, - "loss": 0.6341, + "loss": 0.7654, "step": 23380 }, { - "epoch": 2.816036600048158, - "grad_norm": 7.9375, + "epoch": 16.683309557774606, + "grad_norm": 9.625, "learning_rate": 7.16e-06, - "loss": 0.5915, + "loss": 0.7238, "step": 23390 }, { - "epoch": 2.817240549000722, - "grad_norm": 6.96875, + "epoch": 16.690442225392296, + "grad_norm": 8.625, "learning_rate": 7.115555555555556e-06, - "loss": 0.589, + "loss": 0.7249, "step": 23400 }, { - "epoch": 2.817240549000722, - "eval/acc": 41.27906799316406, + "epoch": 16.690442225392296, + "eval/acc": 46.511627197265625, "step": 23400 }, { - "epoch": 2.817240549000722, - "eval_loss": 2.0793814659118652, - "eval_runtime": 0.2125, - "eval_samples_per_second": 202.328, - "eval_steps_per_second": 4.705, + "epoch": 16.690442225392296, + "eval_loss": 2.3753066062927246, + "eval_runtime": 0.2235, + "eval_samples_per_second": 192.376, + "eval_steps_per_second": 4.474, "step": 23400 }, { - "epoch": 2.818444497953287, - "grad_norm": 6.46875, + "epoch": 16.697574893009985, + "grad_norm": 6.6875, "learning_rate": 7.071111111111111e-06, - "loss": 0.5408, + "loss": 0.6913, "step": 23410 }, { - "epoch": 2.819648446905851, - "grad_norm": 6.09375, + "epoch": 16.704707560627675, + "grad_norm": 7.96875, "learning_rate": 7.0266666666666674e-06, - "loss": 0.6025, + "loss": 0.7034, "step": 23420 }, { - "epoch": 2.8208523958584157, - "grad_norm": 6.96875, + "epoch": 16.711840228245364, + "grad_norm": 16.625, "learning_rate": 6.982222222222222e-06, - "loss": 0.5497, + "loss": 0.7675, "step": 23430 }, { - "epoch": 2.82205634481098, - "grad_norm": 8.8125, + "epoch": 16.718972895863054, + "grad_norm": 8.5625, "learning_rate": 6.937777777777778e-06, - "loss": 0.5231, + "loss": 0.7343, "step": 23440 }, { - "epoch": 2.8232602937635445, - "grad_norm": 7.96875, + "epoch": 16.726105563480743, + "grad_norm": 7.75, "learning_rate": 6.893333333333334e-06, - "loss": 0.6037, + "loss": 0.8078, "step": 23450 }, { - "epoch": 2.824464242716109, - "grad_norm": 9.625, + "epoch": 16.73323823109843, + "grad_norm": 7.5, "learning_rate": 6.848888888888889e-06, - "loss": 0.6026, + "loss": 0.8483, "step": 23460 }, { - "epoch": 2.8256681916686732, - "grad_norm": 19.125, + "epoch": 16.74037089871612, + "grad_norm": 9.0625, "learning_rate": 6.804444444444445e-06, - "loss": 0.6064, + "loss": 0.7703, "step": 23470 }, { - "epoch": 2.8268721406212376, - "grad_norm": 8.125, + "epoch": 16.74750356633381, + "grad_norm": 8.3125, "learning_rate": 6.76e-06, - "loss": 0.5266, + "loss": 0.8339, "step": 23480 }, { - "epoch": 2.828076089573802, - "grad_norm": 7.375, + "epoch": 16.754636233951498, + "grad_norm": 7.53125, "learning_rate": 6.715555555555556e-06, - "loss": 0.5067, + "loss": 0.7384, "step": 23490 }, { - "epoch": 2.8292800385263663, - "grad_norm": 7.53125, + "epoch": 16.761768901569187, + "grad_norm": 6.96875, "learning_rate": 6.671111111111111e-06, - "loss": 0.5133, + "loss": 0.7953, "step": 23500 }, { - "epoch": 2.8292800385263663, - "eval/acc": 41.86046600341797, + "epoch": 16.761768901569187, + "eval/acc": 44.1860466003418, "step": 23500 }, { - "epoch": 2.8292800385263663, - "eval_loss": 2.0831339359283447, - "eval_runtime": 4.029, - "eval_samples_per_second": 10.673, - "eval_steps_per_second": 0.248, + "epoch": 16.761768901569187, + "eval_loss": 2.3848800659179688, + "eval_runtime": 0.2328, + "eval_samples_per_second": 184.731, + "eval_steps_per_second": 4.296, "step": 23500 }, { - "epoch": 2.8304839874789307, - "grad_norm": 6.6875, + "epoch": 16.768901569186877, + "grad_norm": 11.625, "learning_rate": 6.626666666666667e-06, - "loss": 0.4857, + "loss": 0.712, "step": 23510 }, { - "epoch": 2.8316879364314955, - "grad_norm": 6.0, + "epoch": 16.776034236804566, + "grad_norm": 8.5625, "learning_rate": 6.582222222222223e-06, - "loss": 0.5116, + "loss": 0.7337, "step": 23520 }, { - "epoch": 2.8328918853840594, - "grad_norm": 7.375, + "epoch": 16.783166904422252, + "grad_norm": 6.34375, "learning_rate": 6.537777777777779e-06, - "loss": 0.541, + "loss": 0.6829, "step": 23530 }, { - "epoch": 2.8340958343366243, - "grad_norm": 22.0, + "epoch": 16.79029957203994, + "grad_norm": 6.40625, "learning_rate": 6.4933333333333336e-06, - "loss": 0.4968, + "loss": 0.7174, "step": 23540 }, { - "epoch": 2.8352997832891886, - "grad_norm": 7.3125, + "epoch": 16.79743223965763, + "grad_norm": 8.1875, "learning_rate": 6.448888888888889e-06, - "loss": 0.5946, + "loss": 0.7812, "step": 23550 }, { - "epoch": 2.836503732241753, - "grad_norm": 8.0, + "epoch": 16.80456490727532, + "grad_norm": 7.21875, "learning_rate": 6.404444444444445e-06, - "loss": 0.5645, + "loss": 0.6941, "step": 23560 }, { - "epoch": 2.8377076811943174, - "grad_norm": 7.46875, + "epoch": 16.81169757489301, + "grad_norm": 7.75, "learning_rate": 6.360000000000001e-06, - "loss": 0.5638, + "loss": 0.8053, "step": 23570 }, { - "epoch": 2.8389116301468817, - "grad_norm": 6.03125, + "epoch": 16.8188302425107, + "grad_norm": 7.46875, "learning_rate": 6.315555555555556e-06, - "loss": 0.4892, + "loss": 0.8218, "step": 23580 }, { - "epoch": 2.840115579099446, - "grad_norm": 7.09375, + "epoch": 16.82596291012839, + "grad_norm": 5.84375, "learning_rate": 6.2711111111111105e-06, - "loss": 0.5778, + "loss": 0.7829, "step": 23590 }, { - "epoch": 2.8413195280520105, - "grad_norm": 9.0625, + "epoch": 16.83309557774608, + "grad_norm": 7.0, "learning_rate": 6.226666666666667e-06, - "loss": 0.5691, + "loss": 0.7756, "step": 23600 }, { - "epoch": 2.8413195280520105, - "eval/acc": 41.86046600341797, + "epoch": 16.83309557774608, + "eval/acc": 46.511627197265625, "step": 23600 }, { - "epoch": 2.8413195280520105, - "eval_loss": 2.0883255004882812, - "eval_runtime": 3.7876, - "eval_samples_per_second": 11.353, - "eval_steps_per_second": 0.264, + "epoch": 16.83309557774608, + "eval_loss": 2.3751766681671143, + "eval_runtime": 0.2292, + "eval_samples_per_second": 187.613, + "eval_steps_per_second": 4.363, "step": 23600 }, { - "epoch": 2.842523477004575, - "grad_norm": 7.21875, + "epoch": 16.840228245363765, + "grad_norm": 8.9375, "learning_rate": 6.182222222222223e-06, - "loss": 0.5736, + "loss": 0.859, "step": 23610 }, { - "epoch": 2.8437274259571392, - "grad_norm": 6.40625, + "epoch": 16.847360912981454, + "grad_norm": 5.96875, "learning_rate": 6.137777777777779e-06, - "loss": 0.6002, + "loss": 0.7727, "step": 23620 }, { - "epoch": 2.844931374909704, - "grad_norm": 10.5, + "epoch": 16.854493580599144, + "grad_norm": 10.6875, "learning_rate": 6.093333333333333e-06, - "loss": 0.6384, + "loss": 0.7321, "step": 23630 }, { - "epoch": 2.846135323862268, - "grad_norm": 6.84375, + "epoch": 16.861626248216833, + "grad_norm": 8.125, "learning_rate": 6.048888888888889e-06, - "loss": 0.542, + "loss": 0.7517, "step": 23640 }, { - "epoch": 2.847339272814833, - "grad_norm": 7.9375, + "epoch": 16.868758915834523, + "grad_norm": 8.3125, "learning_rate": 6.0044444444444445e-06, - "loss": 0.5721, + "loss": 0.7328, "step": 23650 }, { - "epoch": 2.848543221767397, - "grad_norm": 8.6875, + "epoch": 16.875891583452212, + "grad_norm": 8.75, "learning_rate": 5.9600000000000005e-06, - "loss": 0.5942, + "loss": 0.8173, "step": 23660 }, { - "epoch": 2.8497471707199615, - "grad_norm": 8.3125, + "epoch": 16.8830242510699, + "grad_norm": 9.0, "learning_rate": 5.915555555555556e-06, - "loss": 0.5052, + "loss": 0.7312, "step": 23670 }, { - "epoch": 2.850951119672526, - "grad_norm": 8.1875, + "epoch": 16.890156918687588, + "grad_norm": 17.25, "learning_rate": 5.871111111111111e-06, - "loss": 0.5816, + "loss": 0.7407, "step": 23680 }, { - "epoch": 2.8521550686250903, - "grad_norm": 9.9375, + "epoch": 16.897289586305277, + "grad_norm": 6.875, "learning_rate": 5.826666666666667e-06, - "loss": 0.6122, + "loss": 0.8061, "step": 23690 }, { - "epoch": 2.8533590175776546, - "grad_norm": 17.125, + "epoch": 16.904422253922966, + "grad_norm": 9.875, "learning_rate": 5.782222222222222e-06, - "loss": 0.5431, + "loss": 0.6627, "step": 23700 }, { - "epoch": 2.8533590175776546, - "eval/acc": 39.53488540649414, + "epoch": 16.904422253922966, + "eval/acc": 46.511627197265625, "step": 23700 }, { - "epoch": 2.8533590175776546, - "eval_loss": 2.0767838954925537, - "eval_runtime": 0.2156, - "eval_samples_per_second": 199.446, - "eval_steps_per_second": 4.638, + "epoch": 16.904422253922966, + "eval_loss": 2.367297887802124, + "eval_runtime": 0.2662, + "eval_samples_per_second": 161.545, + "eval_steps_per_second": 3.757, "step": 23700 }, { - "epoch": 2.854562966530219, - "grad_norm": 11.5, + "epoch": 16.911554921540656, + "grad_norm": 11.75, "learning_rate": 5.737777777777778e-06, - "loss": 0.5637, + "loss": 0.7706, "step": 23710 }, { - "epoch": 2.8557669154827834, - "grad_norm": 8.9375, + "epoch": 16.918687589158345, + "grad_norm": 8.5625, "learning_rate": 5.693333333333334e-06, - "loss": 0.5828, + "loss": 0.8145, "step": 23720 }, { - "epoch": 2.8569708644353478, - "grad_norm": 15.4375, + "epoch": 16.925820256776035, + "grad_norm": 14.25, "learning_rate": 5.648888888888889e-06, - "loss": 0.6337, + "loss": 0.7464, "step": 23730 }, { - "epoch": 2.8581748133879126, - "grad_norm": 4.9375, + "epoch": 16.932952924393724, + "grad_norm": 8.3125, "learning_rate": 5.604444444444445e-06, - "loss": 0.5116, + "loss": 0.8126, "step": 23740 }, { - "epoch": 2.8593787623404765, - "grad_norm": 6.8125, + "epoch": 16.940085592011414, + "grad_norm": 5.78125, "learning_rate": 5.56e-06, - "loss": 0.5515, + "loss": 0.7277, "step": 23750 }, { - "epoch": 2.8605827112930413, - "grad_norm": 6.75, + "epoch": 16.9472182596291, + "grad_norm": 5.71875, "learning_rate": 5.515555555555555e-06, - "loss": 0.6308, + "loss": 0.795, "step": 23760 }, { - "epoch": 2.8617866602456057, - "grad_norm": 7.9375, + "epoch": 16.95435092724679, + "grad_norm": 7.71875, "learning_rate": 5.4711111111111114e-06, - "loss": 0.5898, + "loss": 0.846, "step": 23770 }, { - "epoch": 2.86299060919817, - "grad_norm": 6.15625, + "epoch": 16.96148359486448, + "grad_norm": 5.84375, "learning_rate": 5.426666666666667e-06, - "loss": 0.5869, + "loss": 0.7186, "step": 23780 }, { - "epoch": 2.8641945581507344, - "grad_norm": 7.28125, + "epoch": 16.96861626248217, + "grad_norm": 7.21875, "learning_rate": 5.382222222222223e-06, - "loss": 0.5071, + "loss": 0.698, "step": 23790 }, { - "epoch": 2.865398507103299, - "grad_norm": 7.84375, + "epoch": 16.975748930099858, + "grad_norm": 7.46875, "learning_rate": 5.337777777777778e-06, - "loss": 0.5897, + "loss": 0.7792, "step": 23800 }, { - "epoch": 2.865398507103299, - "eval/acc": 39.53488540649414, + "epoch": 16.975748930099858, + "eval/acc": 46.511627197265625, "step": 23800 }, { - "epoch": 2.865398507103299, - "eval_loss": 2.083738327026367, - "eval_runtime": 0.2164, - "eval_samples_per_second": 198.733, - "eval_steps_per_second": 4.622, + "epoch": 16.975748930099858, + "eval_loss": 2.380526065826416, + "eval_runtime": 0.218, + "eval_samples_per_second": 197.26, + "eval_steps_per_second": 4.587, "step": 23800 }, { - "epoch": 2.866602456055863, - "grad_norm": 7.71875, + "epoch": 16.982881597717547, + "grad_norm": 7.21875, "learning_rate": 5.293333333333333e-06, - "loss": 0.5182, + "loss": 0.7599, "step": 23810 }, { - "epoch": 2.8678064050084275, - "grad_norm": 8.75, + "epoch": 16.990014265335237, + "grad_norm": 6.34375, "learning_rate": 5.248888888888889e-06, - "loss": 0.5786, + "loss": 0.7043, "step": 23820 }, { - "epoch": 2.869010353960992, - "grad_norm": 7.46875, + "epoch": 16.997146932952923, + "grad_norm": 8.25, "learning_rate": 5.2044444444444445e-06, - "loss": 0.6149, + "loss": 0.7336, "step": 23830 }, { - "epoch": 2.8702143029135563, - "grad_norm": 5.90625, + "epoch": 17.004279600570612, + "grad_norm": 7.1875, "learning_rate": 5.1600000000000006e-06, - "loss": 0.5871, + "loss": 0.7175, "step": 23840 }, { - "epoch": 2.871418251866121, - "grad_norm": 13.25, + "epoch": 17.0114122681883, + "grad_norm": 7.75, "learning_rate": 5.115555555555556e-06, - "loss": 0.4958, + "loss": 0.7258, "step": 23850 }, { - "epoch": 2.872622200818685, - "grad_norm": 7.25, + "epoch": 17.01854493580599, + "grad_norm": 6.90625, "learning_rate": 5.071111111111111e-06, - "loss": 0.5782, + "loss": 0.7136, "step": 23860 }, { - "epoch": 2.87382614977125, - "grad_norm": 7.6875, + "epoch": 17.02567760342368, + "grad_norm": 9.625, "learning_rate": 5.026666666666667e-06, - "loss": 0.5951, + "loss": 0.7191, "step": 23870 }, { - "epoch": 2.875030098723814, - "grad_norm": 7.0625, + "epoch": 17.03281027104137, + "grad_norm": 6.15625, "learning_rate": 4.982222222222222e-06, - "loss": 0.4959, + "loss": 0.7656, "step": 23880 }, { - "epoch": 2.8762340476763786, - "grad_norm": 9.875, + "epoch": 17.03994293865906, + "grad_norm": 7.65625, "learning_rate": 4.9377777777777776e-06, - "loss": 0.5914, + "loss": 0.6739, "step": 23890 }, { - "epoch": 2.877437996628943, - "grad_norm": 10.9375, + "epoch": 17.04707560627675, + "grad_norm": 8.25, "learning_rate": 4.893333333333334e-06, - "loss": 0.546, + "loss": 0.7209, "step": 23900 }, { - "epoch": 2.877437996628943, - "eval/acc": 39.53488540649414, + "epoch": 17.04707560627675, + "eval/acc": 51.16279220581055, "step": 23900 }, { - "epoch": 2.877437996628943, - "eval_loss": 2.0773050785064697, - "eval_runtime": 0.2412, - "eval_samples_per_second": 178.25, - "eval_steps_per_second": 4.145, + "epoch": 17.04707560627675, + "eval_loss": 1.9211745262145996, + "eval_runtime": 7.2436, + "eval_samples_per_second": 5.936, + "eval_steps_per_second": 0.138, "step": 23900 }, { - "epoch": 2.8786419455815073, - "grad_norm": 7.65625, + "epoch": 17.054208273894435, + "grad_norm": 7.40625, "learning_rate": 4.848888888888889e-06, - "loss": 0.5514, + "loss": 0.7848, "step": 23910 }, { - "epoch": 2.8798458945340717, - "grad_norm": 9.3125, + "epoch": 17.061340941512125, + "grad_norm": 8.75, "learning_rate": 4.804444444444445e-06, - "loss": 0.6168, + "loss": 0.7272, "step": 23920 }, { - "epoch": 2.881049843486636, - "grad_norm": 9.375, + "epoch": 17.068473609129814, + "grad_norm": 6.78125, "learning_rate": 4.76e-06, - "loss": 0.6102, + "loss": 0.6871, "step": 23930 }, { - "epoch": 2.8822537924392004, - "grad_norm": 6.9375, + "epoch": 17.075606276747504, + "grad_norm": 8.375, "learning_rate": 4.715555555555555e-06, - "loss": 0.5477, + "loss": 0.7285, "step": 23940 }, { - "epoch": 2.883457741391765, - "grad_norm": 7.84375, + "epoch": 17.082738944365193, + "grad_norm": 7.15625, "learning_rate": 4.6711111111111115e-06, - "loss": 0.4297, + "loss": 0.7626, "step": 23950 }, { - "epoch": 2.8846616903443296, - "grad_norm": 8.4375, + "epoch": 17.089871611982883, + "grad_norm": 8.5, "learning_rate": 4.626666666666667e-06, - "loss": 0.5617, + "loss": 0.8326, "step": 23960 }, { - "epoch": 2.8858656392968935, - "grad_norm": 9.375, + "epoch": 17.097004279600572, + "grad_norm": 6.78125, "learning_rate": 4.582222222222223e-06, - "loss": 0.5151, + "loss": 0.7334, "step": 23970 }, { - "epoch": 2.8870695882494584, - "grad_norm": 9.5625, + "epoch": 17.104136947218258, + "grad_norm": 10.75, "learning_rate": 4.537777777777778e-06, - "loss": 0.543, + "loss": 0.7101, "step": 23980 }, { - "epoch": 2.8882735372020227, - "grad_norm": 9.1875, + "epoch": 17.111269614835948, + "grad_norm": 7.40625, "learning_rate": 4.493333333333333e-06, - "loss": 0.5912, + "loss": 0.7165, "step": 23990 }, { - "epoch": 2.889477486154587, - "grad_norm": 8.8125, + "epoch": 17.118402282453637, + "grad_norm": 7.0, "learning_rate": 4.448888888888889e-06, - "loss": 0.5468, + "loss": 0.7806, "step": 24000 }, { - "epoch": 2.889477486154587, - "eval/acc": 41.27906799316406, + "epoch": 17.118402282453637, + "eval/acc": 48.83720779418945, "step": 24000 }, { - "epoch": 2.889477486154587, - "eval_loss": 2.0759472846984863, - "eval_runtime": 0.215, - "eval_samples_per_second": 200.023, - "eval_steps_per_second": 4.652, + "epoch": 17.118402282453637, + "eval_loss": 1.911737322807312, + "eval_runtime": 0.2294, + "eval_samples_per_second": 187.449, + "eval_steps_per_second": 4.359, "step": 24000 }, { - "epoch": 2.8906814351071515, - "grad_norm": 7.15625, + "epoch": 17.125534950071327, + "grad_norm": 6.75, "learning_rate": 4.4044444444444445e-06, - "loss": 0.66, + "loss": 0.6447, "step": 24010 }, { - "epoch": 2.891885384059716, - "grad_norm": 10.3125, + "epoch": 17.132667617689016, + "grad_norm": 30.75, "learning_rate": 4.360000000000001e-06, - "loss": 0.5861, + "loss": 0.7323, "step": 24020 }, { - "epoch": 2.89308933301228, - "grad_norm": 6.15625, + "epoch": 17.139800285306706, + "grad_norm": 8.0625, "learning_rate": 4.315555555555556e-06, - "loss": 0.5752, + "loss": 0.8296, "step": 24030 }, { - "epoch": 2.8942932819648446, - "grad_norm": 11.8125, + "epoch": 17.146932952924395, + "grad_norm": 7.9375, "learning_rate": 4.271111111111111e-06, - "loss": 0.5533, + "loss": 0.7555, "step": 24040 }, { - "epoch": 2.895497230917409, - "grad_norm": 8.1875, + "epoch": 17.154065620542085, + "grad_norm": 5.875, "learning_rate": 4.226666666666667e-06, - "loss": 0.4891, + "loss": 0.7072, "step": 24050 }, { - "epoch": 2.8967011798699733, - "grad_norm": 12.3125, + "epoch": 17.16119828815977, + "grad_norm": 7.03125, "learning_rate": 4.182222222222222e-06, - "loss": 0.6023, + "loss": 0.633, "step": 24060 }, { - "epoch": 2.897905128822538, - "grad_norm": 11.125, + "epoch": 17.16833095577746, + "grad_norm": 7.65625, "learning_rate": 4.137777777777778e-06, - "loss": 0.4778, + "loss": 0.7191, "step": 24070 }, { - "epoch": 2.899109077775102, - "grad_norm": 9.875, + "epoch": 17.17546362339515, + "grad_norm": 6.46875, "learning_rate": 4.093333333333334e-06, - "loss": 0.6184, + "loss": 0.7804, "step": 24080 }, { - "epoch": 2.900313026727667, - "grad_norm": 6.3125, + "epoch": 17.18259629101284, + "grad_norm": 7.53125, "learning_rate": 4.048888888888889e-06, - "loss": 0.607, + "loss": 0.8225, "step": 24090 }, { - "epoch": 2.9015169756802313, - "grad_norm": 6.96875, + "epoch": 17.18972895863053, + "grad_norm": 21.25, "learning_rate": 4.004444444444445e-06, - "loss": 0.6359, + "loss": 0.7856, "step": 24100 }, { - "epoch": 2.9015169756802313, - "eval/acc": 40.69767379760742, + "epoch": 17.18972895863053, + "eval/acc": 48.83720779418945, "step": 24100 }, { - "epoch": 2.9015169756802313, - "eval_loss": 2.08762264251709, - "eval_runtime": 0.2198, - "eval_samples_per_second": 195.668, - "eval_steps_per_second": 4.55, + "epoch": 17.18972895863053, + "eval_loss": 1.914453387260437, + "eval_runtime": 0.2272, + "eval_samples_per_second": 189.227, + "eval_steps_per_second": 4.401, "step": 24100 }, { - "epoch": 2.9027209246327956, - "grad_norm": 5.09375, + "epoch": 17.196861626248218, + "grad_norm": 16.0, "learning_rate": 3.96e-06, - "loss": 0.5914, + "loss": 0.6726, "step": 24110 }, { - "epoch": 2.90392487358536, - "grad_norm": 7.6875, + "epoch": 17.203994293865907, + "grad_norm": 12.1875, "learning_rate": 3.9155555555555554e-06, - "loss": 0.4959, + "loss": 0.8519, "step": 24120 }, { - "epoch": 2.9051288225379244, - "grad_norm": 6.59375, + "epoch": 17.211126961483593, + "grad_norm": 7.53125, "learning_rate": 3.8711111111111115e-06, - "loss": 0.5595, + "loss": 0.6615, "step": 24130 }, { - "epoch": 2.9063327714904887, - "grad_norm": 7.3125, + "epoch": 17.218259629101283, + "grad_norm": 7.53125, "learning_rate": 3.826666666666667e-06, - "loss": 0.6159, + "loss": 0.7664, "step": 24140 }, { - "epoch": 2.907536720443053, - "grad_norm": 7.90625, + "epoch": 17.225392296718972, + "grad_norm": 16.125, "learning_rate": 3.7822222222222224e-06, - "loss": 0.5159, + "loss": 0.7134, "step": 24150 }, { - "epoch": 2.9087406693956175, - "grad_norm": 7.96875, + "epoch": 17.232524964336662, + "grad_norm": 7.84375, "learning_rate": 3.737777777777778e-06, - "loss": 0.5206, + "loss": 0.8229, "step": 24160 }, { - "epoch": 2.909944618348182, - "grad_norm": 10.75, + "epoch": 17.23965763195435, + "grad_norm": 6.25, "learning_rate": 3.6933333333333333e-06, - "loss": 0.6031, + "loss": 0.7486, "step": 24170 }, { - "epoch": 2.9111485673007467, - "grad_norm": 6.4375, + "epoch": 17.24679029957204, + "grad_norm": 7.875, "learning_rate": 3.6488888888888893e-06, - "loss": 0.6091, + "loss": 0.7403, "step": 24180 }, { - "epoch": 2.9123525162533106, - "grad_norm": 7.75, + "epoch": 17.25392296718973, + "grad_norm": 8.5, "learning_rate": 3.604444444444444e-06, - "loss": 0.5488, + "loss": 0.7794, "step": 24190 }, { - "epoch": 2.9135564652058754, - "grad_norm": 6.40625, + "epoch": 17.261055634807416, + "grad_norm": 8.0625, "learning_rate": 3.5600000000000002e-06, - "loss": 0.6035, + "loss": 0.8832, "step": 24200 }, { - "epoch": 2.9135564652058754, - "eval/acc": 40.11627960205078, + "epoch": 17.261055634807416, + "eval/acc": 46.511627197265625, "step": 24200 }, { - "epoch": 2.9135564652058754, - "eval_loss": 2.0946478843688965, - "eval_runtime": 0.2147, - "eval_samples_per_second": 200.315, - "eval_steps_per_second": 4.658, + "epoch": 17.261055634807416, + "eval_loss": 1.9235628843307495, + "eval_runtime": 0.2238, + "eval_samples_per_second": 192.12, + "eval_steps_per_second": 4.468, "step": 24200 }, { - "epoch": 2.91476041415844, - "grad_norm": 6.0625, + "epoch": 17.268188302425106, + "grad_norm": 7.75, "learning_rate": 3.515555555555556e-06, - "loss": 0.5716, + "loss": 0.8659, "step": 24210 }, { - "epoch": 2.915964363111004, - "grad_norm": 9.75, + "epoch": 17.275320970042795, + "grad_norm": 7.65625, "learning_rate": 3.471111111111111e-06, - "loss": 0.597, + "loss": 0.7829, "step": 24220 }, { - "epoch": 2.9171683120635685, - "grad_norm": 7.0625, + "epoch": 17.282453637660485, + "grad_norm": 10.375, "learning_rate": 3.4266666666666668e-06, - "loss": 0.6404, + "loss": 0.7556, "step": 24230 }, { - "epoch": 2.918372261016133, - "grad_norm": 12.75, + "epoch": 17.289586305278174, + "grad_norm": 8.25, "learning_rate": 3.382222222222222e-06, - "loss": 0.6015, + "loss": 0.7601, "step": 24240 }, { - "epoch": 2.9195762099686973, - "grad_norm": 9.8125, + "epoch": 17.296718972895864, + "grad_norm": 7.53125, "learning_rate": 3.337777777777778e-06, - "loss": 0.6017, + "loss": 0.7801, "step": 24250 }, { - "epoch": 2.9207801589212616, - "grad_norm": 6.875, + "epoch": 17.303851640513553, + "grad_norm": 5.65625, "learning_rate": 3.2933333333333337e-06, - "loss": 0.5049, + "loss": 0.6873, "step": 24260 }, { - "epoch": 2.921984107873826, - "grad_norm": 7.5, + "epoch": 17.310984308131243, + "grad_norm": 7.03125, "learning_rate": 3.248888888888889e-06, - "loss": 0.5466, + "loss": 0.701, "step": 24270 }, { - "epoch": 2.9231880568263904, - "grad_norm": 8.5, + "epoch": 17.31811697574893, + "grad_norm": 6.25, "learning_rate": 3.2044444444444446e-06, - "loss": 0.4995, + "loss": 0.7061, "step": 24280 }, { - "epoch": 2.924392005778955, - "grad_norm": 9.75, + "epoch": 17.325249643366618, + "grad_norm": 5.78125, "learning_rate": 3.1600000000000007e-06, - "loss": 0.5926, + "loss": 0.7596, "step": 24290 }, { - "epoch": 2.925595954731519, - "grad_norm": 8.8125, + "epoch": 17.332382310984308, + "grad_norm": 7.5625, "learning_rate": 3.1155555555555555e-06, - "loss": 0.5745, + "loss": 0.813, "step": 24300 }, { - "epoch": 2.925595954731519, - "eval/acc": 39.53488540649414, + "epoch": 17.332382310984308, + "eval/acc": 48.83720779418945, "step": 24300 }, { - "epoch": 2.925595954731519, - "eval_loss": 2.0802857875823975, - "eval_runtime": 0.2211, - "eval_samples_per_second": 194.476, - "eval_steps_per_second": 4.523, + "epoch": 17.332382310984308, + "eval_loss": 1.9356474876403809, + "eval_runtime": 0.2214, + "eval_samples_per_second": 194.237, + "eval_steps_per_second": 4.517, "step": 24300 }, { - "epoch": 2.926799903684084, - "grad_norm": 6.78125, + "epoch": 17.339514978601997, + "grad_norm": 6.96875, "learning_rate": 3.0711111111111115e-06, - "loss": 0.5833, + "loss": 0.6874, "step": 24310 }, { - "epoch": 2.9280038526366483, - "grad_norm": 10.625, + "epoch": 17.346647646219687, + "grad_norm": 12.375, "learning_rate": 3.0266666666666668e-06, - "loss": 0.5886, + "loss": 0.7283, "step": 24320 }, { - "epoch": 2.9292078015892127, - "grad_norm": 8.625, + "epoch": 17.353780313837376, + "grad_norm": 25.875, "learning_rate": 2.9822222222222224e-06, - "loss": 0.5853, + "loss": 0.7118, "step": 24330 }, { - "epoch": 2.930411750541777, - "grad_norm": 7.53125, + "epoch": 17.360912981455066, + "grad_norm": 8.4375, "learning_rate": 2.9377777777777776e-06, - "loss": 0.516, + "loss": 0.8043, "step": 24340 }, { - "epoch": 2.9316156994943414, - "grad_norm": 11.0, + "epoch": 17.36804564907275, + "grad_norm": 7.5, "learning_rate": 2.8933333333333333e-06, - "loss": 0.6331, + "loss": 0.7448, "step": 24350 }, { - "epoch": 2.932819648446906, - "grad_norm": 7.71875, + "epoch": 17.37517831669044, + "grad_norm": 7.9375, "learning_rate": 2.848888888888889e-06, - "loss": 0.5055, + "loss": 0.7841, "step": 24360 }, { - "epoch": 2.93402359739947, - "grad_norm": 7.09375, + "epoch": 17.38231098430813, + "grad_norm": 8.0625, "learning_rate": 2.8044444444444446e-06, - "loss": 0.5795, + "loss": 0.751, "step": 24370 }, { - "epoch": 2.9352275463520345, - "grad_norm": 11.5625, + "epoch": 17.38944365192582, + "grad_norm": 8.5625, "learning_rate": 2.7600000000000003e-06, - "loss": 0.6956, + "loss": 0.7078, "step": 24380 }, { - "epoch": 2.936431495304599, - "grad_norm": 7.09375, + "epoch": 17.39657631954351, + "grad_norm": 9.5625, "learning_rate": 2.7155555555555555e-06, - "loss": 0.5626, + "loss": 0.7847, "step": 24390 }, { - "epoch": 2.9376354442571637, - "grad_norm": 9.125, + "epoch": 17.4037089871612, + "grad_norm": 6.03125, "learning_rate": 2.6711111111111116e-06, - "loss": 0.5959, + "loss": 0.7537, "step": 24400 }, { - "epoch": 2.9376354442571637, - "eval/acc": 39.53488540649414, + "epoch": 17.4037089871612, + "eval/acc": 48.83720779418945, "step": 24400 }, { - "epoch": 2.9376354442571637, - "eval_loss": 2.0959184169769287, - "eval_runtime": 0.2131, - "eval_samples_per_second": 201.821, - "eval_steps_per_second": 4.694, + "epoch": 17.4037089871612, + "eval_loss": 1.9303936958312988, + "eval_runtime": 1.066, + "eval_samples_per_second": 40.339, + "eval_steps_per_second": 0.938, "step": 24400 }, { - "epoch": 2.9388393932097276, - "grad_norm": 38.5, + "epoch": 17.41084165477889, + "grad_norm": 9.375, "learning_rate": 2.6266666666666668e-06, - "loss": 0.5759, + "loss": 0.8343, "step": 24410 }, { - "epoch": 2.9400433421622925, - "grad_norm": 8.3125, + "epoch": 17.417974322396578, + "grad_norm": 7.3125, "learning_rate": 2.5822222222222224e-06, - "loss": 0.5872, + "loss": 0.7967, "step": 24420 }, { - "epoch": 2.941247291114857, - "grad_norm": 6.8125, + "epoch": 17.425106990014264, + "grad_norm": 8.1875, "learning_rate": 2.5377777777777777e-06, - "loss": 0.4913, + "loss": 0.7392, "step": 24430 }, { - "epoch": 2.942451240067421, + "epoch": 17.432239657631953, "grad_norm": 9.5625, "learning_rate": 2.4933333333333333e-06, - "loss": 0.5069, + "loss": 0.7619, "step": 24440 }, { - "epoch": 2.9436551890199856, - "grad_norm": 7.21875, + "epoch": 17.439372325249643, + "grad_norm": 6.5, "learning_rate": 2.448888888888889e-06, - "loss": 0.5332, + "loss": 0.7465, "step": 24450 }, { - "epoch": 2.94485913797255, - "grad_norm": 6.625, + "epoch": 17.446504992867332, + "grad_norm": 8.625, "learning_rate": 2.4044444444444446e-06, - "loss": 0.3653, + "loss": 0.718, "step": 24460 }, { - "epoch": 2.9460630869251143, - "grad_norm": 11.3125, + "epoch": 17.453637660485022, + "grad_norm": 7.375, "learning_rate": 2.36e-06, - "loss": 0.5127, + "loss": 0.7457, "step": 24470 }, { - "epoch": 2.9472670358776787, - "grad_norm": 5.09375, + "epoch": 17.46077032810271, + "grad_norm": 7.53125, "learning_rate": 2.3155555555555555e-06, - "loss": 0.5264, + "loss": 0.6436, "step": 24480 }, { - "epoch": 2.948470984830243, - "grad_norm": 8.25, + "epoch": 17.4679029957204, + "grad_norm": 6.5625, "learning_rate": 2.271111111111111e-06, - "loss": 0.6099, + "loss": 0.7747, "step": 24490 }, { - "epoch": 2.9496749337828074, - "grad_norm": 7.34375, + "epoch": 17.475035663338087, + "grad_norm": 8.125, "learning_rate": 2.226666666666667e-06, - "loss": 0.4439, + "loss": 0.7697, "step": 24500 }, { - "epoch": 2.9496749337828074, - "eval/acc": 41.86046600341797, + "epoch": 17.475035663338087, + "eval/acc": 48.83720779418945, "step": 24500 }, { - "epoch": 2.9496749337828074, - "eval_loss": 2.083280563354492, - "eval_runtime": 1.9817, - "eval_samples_per_second": 21.698, - "eval_steps_per_second": 0.505, + "epoch": 17.475035663338087, + "eval_loss": 1.9065417051315308, + "eval_runtime": 0.2191, + "eval_samples_per_second": 196.239, + "eval_steps_per_second": 4.564, "step": 24500 }, { - "epoch": 2.9508788827353722, - "grad_norm": 8.25, + "epoch": 17.482168330955776, + "grad_norm": 6.90625, "learning_rate": 2.1822222222222225e-06, - "loss": 0.6018, + "loss": 0.7558, "step": 24510 }, { - "epoch": 2.952082831687936, - "grad_norm": 7.3125, + "epoch": 17.489300998573466, + "grad_norm": 19.375, "learning_rate": 2.1377777777777777e-06, - "loss": 0.5782, + "loss": 0.8242, "step": 24520 }, { - "epoch": 2.953286780640501, - "grad_norm": 8.875, + "epoch": 17.496433666191155, + "grad_norm": 6.8125, "learning_rate": 2.0933333333333338e-06, - "loss": 0.5007, + "loss": 0.8656, "step": 24530 }, { - "epoch": 2.9544907295930654, - "grad_norm": 6.03125, + "epoch": 17.503566333808845, + "grad_norm": 6.0, "learning_rate": 2.048888888888889e-06, - "loss": 0.4743, + "loss": 0.6613, "step": 24540 }, { - "epoch": 2.9556946785456297, - "grad_norm": 7.53125, + "epoch": 17.510699001426534, + "grad_norm": 6.90625, "learning_rate": 2.0044444444444446e-06, - "loss": 0.5644, + "loss": 0.7427, "step": 24550 }, { - "epoch": 2.956898627498194, - "grad_norm": 7.3125, + "epoch": 17.517831669044224, + "grad_norm": 8.875, "learning_rate": 1.96e-06, - "loss": 0.5382, + "loss": 0.719, "step": 24560 }, { - "epoch": 2.9581025764507585, - "grad_norm": 9.375, + "epoch": 17.524964336661913, + "grad_norm": 6.59375, "learning_rate": 1.9155555555555555e-06, - "loss": 0.5599, + "loss": 0.7669, "step": 24570 }, { - "epoch": 2.959306525403323, - "grad_norm": 14.5625, + "epoch": 17.5320970042796, + "grad_norm": 7.78125, "learning_rate": 1.8711111111111114e-06, - "loss": 0.5903, + "loss": 0.7332, "step": 24580 }, { - "epoch": 2.960510474355887, - "grad_norm": 5.125, + "epoch": 17.53922967189729, + "grad_norm": 7.8125, "learning_rate": 1.8266666666666668e-06, - "loss": 0.5542, + "loss": 0.712, "step": 24590 }, { - "epoch": 2.9617144233084516, - "grad_norm": 7.4375, + "epoch": 17.546362339514978, + "grad_norm": 8.375, "learning_rate": 1.7822222222222223e-06, - "loss": 0.5337, + "loss": 0.876, "step": 24600 }, { - "epoch": 2.9617144233084516, - "eval/acc": 41.86046600341797, + "epoch": 17.546362339514978, + "eval/acc": 48.83720779418945, "step": 24600 }, { - "epoch": 2.9617144233084516, - "eval_loss": 2.083366870880127, - "eval_runtime": 0.2128, - "eval_samples_per_second": 202.084, - "eval_steps_per_second": 4.7, + "epoch": 17.546362339514978, + "eval_loss": 1.9253615140914917, + "eval_runtime": 0.2214, + "eval_samples_per_second": 194.229, + "eval_steps_per_second": 4.517, "step": 24600 }, { - "epoch": 2.962918372261016, - "grad_norm": 7.75, + "epoch": 17.553495007132668, + "grad_norm": 11.9375, "learning_rate": 1.7377777777777777e-06, - "loss": 0.5617, + "loss": 0.6975, "step": 24610 }, { - "epoch": 2.9641223212135808, - "grad_norm": 6.78125, + "epoch": 17.560627674750357, + "grad_norm": 7.96875, "learning_rate": 1.6933333333333336e-06, - "loss": 0.5855, + "loss": 0.7823, "step": 24620 }, { - "epoch": 2.9653262701661447, - "grad_norm": 11.0625, + "epoch": 17.567760342368047, + "grad_norm": 8.125, "learning_rate": 1.648888888888889e-06, - "loss": 0.5385, + "loss": 0.7674, "step": 24630 }, { - "epoch": 2.9665302191187095, - "grad_norm": 17.75, + "epoch": 17.574893009985736, + "grad_norm": 7.0, "learning_rate": 1.6044444444444444e-06, - "loss": 0.5655, + "loss": 0.7347, "step": 24640 }, { - "epoch": 2.967734168071274, - "grad_norm": 7.21875, + "epoch": 17.582025677603422, + "grad_norm": 7.625, "learning_rate": 1.56e-06, - "loss": 0.5487, + "loss": 0.6577, "step": 24650 }, { - "epoch": 2.9689381170238383, - "grad_norm": 7.1875, + "epoch": 17.58915834522111, + "grad_norm": 7.5, "learning_rate": 1.5155555555555558e-06, - "loss": 0.517, + "loss": 0.7647, "step": 24660 }, { - "epoch": 2.9701420659764026, - "grad_norm": 8.1875, + "epoch": 17.5962910128388, + "grad_norm": 8.4375, "learning_rate": 1.4711111111111112e-06, - "loss": 0.5602, + "loss": 0.8421, "step": 24670 }, { - "epoch": 2.971346014928967, - "grad_norm": 12.3125, + "epoch": 17.60342368045649, + "grad_norm": 7.125, "learning_rate": 1.4266666666666668e-06, - "loss": 0.5504, + "loss": 0.6554, "step": 24680 }, { - "epoch": 2.9725499638815314, - "grad_norm": 13.5, + "epoch": 17.61055634807418, + "grad_norm": 8.375, "learning_rate": 1.3822222222222223e-06, - "loss": 0.6111, + "loss": 0.6979, "step": 24690 }, { - "epoch": 2.9737539128340957, - "grad_norm": 6.09375, + "epoch": 17.61768901569187, + "grad_norm": 7.125, "learning_rate": 1.337777777777778e-06, - "loss": 0.5521, + "loss": 0.7341, "step": 24700 }, { - "epoch": 2.9737539128340957, - "eval/acc": 41.86046600341797, + "epoch": 17.61768901569187, + "eval/acc": 46.511627197265625, "step": 24700 }, { - "epoch": 2.9737539128340957, - "eval_loss": 2.0899693965911865, - "eval_runtime": 1.1431, - "eval_samples_per_second": 37.618, - "eval_steps_per_second": 0.875, + "epoch": 17.61768901569187, + "eval_loss": 1.942150592803955, + "eval_runtime": 0.2263, + "eval_samples_per_second": 190.005, + "eval_steps_per_second": 4.419, "step": 24700 }, { - "epoch": 2.97495786178666, - "grad_norm": 7.21875, + "epoch": 17.62482168330956, + "grad_norm": 6.6875, "learning_rate": 1.2933333333333334e-06, - "loss": 0.5499, + "loss": 0.7998, "step": 24710 }, { - "epoch": 2.9761618107392245, - "grad_norm": 8.0625, + "epoch": 17.63195435092725, + "grad_norm": 7.3125, "learning_rate": 1.248888888888889e-06, - "loss": 0.5875, + "loss": 0.7074, "step": 24720 }, { - "epoch": 2.9773657596917893, - "grad_norm": 4.75, + "epoch": 17.639087018544934, + "grad_norm": 8.1875, "learning_rate": 1.2044444444444445e-06, - "loss": 0.4892, + "loss": 0.7117, "step": 24730 }, { - "epoch": 2.978569708644353, - "grad_norm": 16.625, + "epoch": 17.646219686162624, + "grad_norm": 5.75, "learning_rate": 1.16e-06, - "loss": 0.5939, + "loss": 0.7245, "step": 24740 }, { - "epoch": 2.979773657596918, - "grad_norm": 8.5625, + "epoch": 17.653352353780313, + "grad_norm": 6.65625, "learning_rate": 1.1155555555555556e-06, - "loss": 0.5277, + "loss": 0.7581, "step": 24750 }, { - "epoch": 2.9809776065494824, - "grad_norm": 6.46875, + "epoch": 17.660485021398003, + "grad_norm": 12.375, "learning_rate": 1.071111111111111e-06, - "loss": 0.5603, + "loss": 0.7798, "step": 24760 }, { - "epoch": 2.9821815555020468, - "grad_norm": 6.125, + "epoch": 17.667617689015692, + "grad_norm": 7.0625, "learning_rate": 1.0266666666666666e-06, - "loss": 0.48, + "loss": 0.6628, "step": 24770 }, { - "epoch": 2.983385504454611, - "grad_norm": 7.53125, + "epoch": 17.674750356633382, + "grad_norm": 6.75, "learning_rate": 9.822222222222223e-07, - "loss": 0.5272, + "loss": 0.7847, "step": 24780 }, { - "epoch": 2.9845894534071755, - "grad_norm": 6.5, + "epoch": 17.68188302425107, + "grad_norm": 7.09375, "learning_rate": 9.377777777777778e-07, - "loss": 0.5819, + "loss": 0.7502, "step": 24790 }, { - "epoch": 2.98579340235974, - "grad_norm": 8.0625, + "epoch": 17.689015691868757, + "grad_norm": 9.375, "learning_rate": 8.933333333333334e-07, - "loss": 0.5723, + "loss": 0.7847, "step": 24800 }, { - "epoch": 2.98579340235974, - "eval/acc": 41.27906799316406, + "epoch": 17.689015691868757, + "eval/acc": 48.83720779418945, "step": 24800 }, { - "epoch": 2.98579340235974, - "eval_loss": 2.0972721576690674, - "eval_runtime": 0.2104, - "eval_samples_per_second": 204.369, - "eval_steps_per_second": 4.753, + "epoch": 17.689015691868757, + "eval_loss": 1.923488974571228, + "eval_runtime": 0.2182, + "eval_samples_per_second": 197.099, + "eval_steps_per_second": 4.584, "step": 24800 }, { - "epoch": 2.9869973513123043, - "grad_norm": 9.0625, + "epoch": 17.696148359486447, + "grad_norm": 8.875, "learning_rate": 8.48888888888889e-07, - "loss": 0.5923, + "loss": 0.7152, "step": 24810 }, { - "epoch": 2.9882013002648686, + "epoch": 17.703281027104136, "grad_norm": 6.125, "learning_rate": 8.044444444444445e-07, - "loss": 0.5543, + "loss": 0.7513, "step": 24820 }, { - "epoch": 2.989405249217433, - "grad_norm": 10.5, + "epoch": 17.710413694721826, + "grad_norm": 7.375, "learning_rate": 7.6e-07, - "loss": 0.557, + "loss": 0.808, "step": 24830 }, { - "epoch": 2.990609198169998, - "grad_norm": 7.84375, + "epoch": 17.717546362339515, + "grad_norm": 7.71875, "learning_rate": 7.155555555555556e-07, - "loss": 0.6347, + "loss": 0.7731, "step": 24840 }, { - "epoch": 2.9918131471225617, - "grad_norm": 5.0625, + "epoch": 17.724679029957205, + "grad_norm": 7.65625, "learning_rate": 6.711111111111111e-07, - "loss": 0.5088, + "loss": 0.7497, "step": 24850 }, { - "epoch": 2.9930170960751266, - "grad_norm": 8.0625, + "epoch": 17.731811697574894, + "grad_norm": 55.5, "learning_rate": 6.266666666666668e-07, - "loss": 0.5135, + "loss": 0.7953, "step": 24860 }, { - "epoch": 2.994221045027691, - "grad_norm": 6.125, + "epoch": 17.73894436519258, + "grad_norm": 9.3125, "learning_rate": 5.822222222222223e-07, - "loss": 0.5662, + "loss": 0.7664, "step": 24870 }, { - "epoch": 2.9954249939802553, - "grad_norm": 6.6875, + "epoch": 17.74607703281027, + "grad_norm": 6.28125, "learning_rate": 5.377777777777779e-07, - "loss": 0.4421, + "loss": 0.7296, "step": 24880 }, { - "epoch": 2.9966289429328197, - "grad_norm": 19.375, + "epoch": 17.75320970042796, + "grad_norm": 9.5, "learning_rate": 4.933333333333333e-07, - "loss": 0.6045, + "loss": 0.8254, "step": 24890 }, { - "epoch": 2.997832891885384, - "grad_norm": 27.125, + "epoch": 17.76034236804565, + "grad_norm": 7.125, "learning_rate": 4.488888888888889e-07, - "loss": 0.6324, + "loss": 0.7546, "step": 24900 }, { - "epoch": 2.997832891885384, - "eval/acc": 39.53488540649414, + "epoch": 17.76034236804565, + "eval/acc": 46.511627197265625, "step": 24900 }, { - "epoch": 2.997832891885384, - "eval_loss": 2.0922555923461914, - "eval_runtime": 0.2145, - "eval_samples_per_second": 200.493, - "eval_steps_per_second": 4.663, + "epoch": 17.76034236804565, + "eval_loss": 1.9365407228469849, + "eval_runtime": 0.2222, + "eval_samples_per_second": 193.53, + "eval_steps_per_second": 4.501, "step": 24900 }, { - "epoch": 2.9990368408379484, - "grad_norm": 10.5, + "epoch": 17.767475035663338, + "grad_norm": 13.3125, "learning_rate": 4.0444444444444445e-07, - "loss": 0.606, + "loss": 0.7919, "step": 24910 }, { - "epoch": 3.000240789790513, - "grad_norm": 9.0625, + "epoch": 17.774607703281028, + "grad_norm": 6.84375, "learning_rate": 3.6e-07, - "loss": 0.5629, + "loss": 0.7368, "step": 24920 }, { - "epoch": 3.001444738743077, - "grad_norm": 9.9375, + "epoch": 17.781740370898717, + "grad_norm": 7.09375, "learning_rate": 3.155555555555556e-07, - "loss": 0.5628, + "loss": 0.6357, "step": 24930 }, { - "epoch": 3.0026486876956415, - "grad_norm": 7.625, + "epoch": 17.788873038516407, + "grad_norm": 6.09375, "learning_rate": 2.7111111111111114e-07, - "loss": 0.5243, + "loss": 0.7045, "step": 24940 }, { - "epoch": 3.003852636648206, - "grad_norm": 7.03125, + "epoch": 17.796005706134093, + "grad_norm": 8.1875, "learning_rate": 2.2666666666666668e-07, - "loss": 0.5715, + "loss": 0.749, "step": 24950 }, { - "epoch": 3.0050565856007707, - "grad_norm": 7.40625, + "epoch": 17.803138373751782, + "grad_norm": 8.5, "learning_rate": 1.8222222222222223e-07, - "loss": 0.5115, + "loss": 0.7689, "step": 24960 }, { - "epoch": 3.006260534553335, - "grad_norm": 10.375, + "epoch": 17.81027104136947, + "grad_norm": 17.875, "learning_rate": 1.3777777777777778e-07, - "loss": 0.5207, + "loss": 0.7133, "step": 24970 }, { - "epoch": 3.0074644835058995, - "grad_norm": 10.5625, + "epoch": 17.81740370898716, + "grad_norm": 9.25, "learning_rate": 9.333333333333334e-08, - "loss": 0.6261, + "loss": 0.778, "step": 24980 }, { - "epoch": 3.008668432458464, - "grad_norm": 7.53125, + "epoch": 17.82453637660485, + "grad_norm": 6.6875, "learning_rate": 4.888888888888889e-08, - "loss": 0.5476, + "loss": 0.7655, "step": 24990 }, { - "epoch": 3.009872381411028, - "grad_norm": 9.3125, + "epoch": 17.83166904422254, + "grad_norm": 6.8125, "learning_rate": 4.444444444444445e-09, - "loss": 0.6186, + "loss": 0.746, "step": 25000 }, { - "epoch": 3.009872381411028, - "eval/acc": 44.1860466003418, + "epoch": 17.83166904422254, + "eval/acc": 46.511627197265625, "step": 25000 }, { - "epoch": 3.009872381411028, - "eval_loss": 2.3378472328186035, - "eval_runtime": 7.506, - "eval_samples_per_second": 5.729, - "eval_steps_per_second": 0.133, + "epoch": 17.83166904422254, + "eval_loss": 1.9321389198303223, + "eval_runtime": 0.2223, + "eval_samples_per_second": 193.429, + "eval_steps_per_second": 4.498, "step": 25000 } ], "logging_steps": 10, "max_steps": 25000, "num_input_tokens_seen": 0, - "num_train_epochs": 4, + "num_train_epochs": 18, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/training_args.bin b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a1d680c060a6648e26d420ee8365324dd3d5fee7 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-25000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24358e453225b271a5d5308309aefca4acc94bf8e5e69d279887179f7a31b1b7 +size 6161 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/config.json b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/config.json similarity index 100% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/config.json rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/config.json diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/model.safetensors b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eb8cf9c89a9294fb0423b08f6958546e38774de6 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3539d0188328679d6b168c95640b0e9bb61c47eb0294171660930064b47ef32 +size 298041696 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..504966223fa1bac815d95ad3b95b9419c00819f4 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b066839ff96a64f33d23d29e6842f0e37aa831105dd531efd8b7a2278350651e +size 596170443 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/rng_state_0.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..dcc8d11df666087597d8c1000cccb8685c64f419 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64751d1d30fc9a480b1be244c72d89dafe386df6d39c23a516d08b98b09ff2c0 +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/rng_state_1.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2d2c222c3ca0a5870b825ab2529c3eb18b1ed564 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:149750d3cca95f5471b4108e74133b194d6009aabcc2f5743105bb4ba54b7286 +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/rng_state_2.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..3b029c34638a4f77ba64990f3d092f5904a7091e --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9255605ac0cd8dc1535cca4054fc8282329b95c9bfaff1e3919675dd400a0ffa +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/rng_state_3.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..003443d567333ce6f81244222e97c2ffb9fe95db --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1f5b0c4aedc1f943d8e8dc35c31e252031f0c094d6b8b7261007582209d296f +size 15429 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/scheduler.pt b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/scheduler.pt similarity index 100% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/scheduler.pt rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/scheduler.pt diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/trainer_state.json b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/trainer_state.json similarity index 51% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/trainer_state.json rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/trainer_state.json index 321e4fe2239862d50d6061218a749b654d174ea2..1eb06f3ee01718e146da90bfbf694c23c81e9bc1 100644 --- a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/trainer_state.json +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/trainer_state.json @@ -2,7 +2,7 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.6019744762822057, + "epoch": 3.566333808844508, "eval_steps": 100, "global_step": 5000, "is_hyper_param_search": false, @@ -10,4160 +10,4160 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0012039489525644113, - "grad_norm": 29.25, + "epoch": 0.007132667617689016, + "grad_norm": 19.75, "learning_rate": 3.6e-07, - "loss": 5.6475, + "loss": 5.6319, "step": 10 }, { - "epoch": 0.0024078979051288226, - "grad_norm": 13.6875, + "epoch": 0.014265335235378032, + "grad_norm": 19.375, "learning_rate": 7.6e-07, - "loss": 5.6394, + "loss": 5.5914, "step": 20 }, { - "epoch": 0.003611846857693234, - "grad_norm": 36.0, + "epoch": 0.021398002853067047, + "grad_norm": 51.25, "learning_rate": 1.16e-06, - "loss": 5.6168, + "loss": 5.6495, "step": 30 }, { - "epoch": 0.004815795810257645, - "grad_norm": 17.0, + "epoch": 0.028530670470756064, + "grad_norm": 19.0, "learning_rate": 1.56e-06, - "loss": 5.6346, + "loss": 5.6581, "step": 40 }, { - "epoch": 0.006019744762822056, - "grad_norm": 16.5, + "epoch": 0.03566333808844508, + "grad_norm": 23.75, "learning_rate": 1.96e-06, - "loss": 5.6391, + "loss": 5.6366, "step": 50 }, { - "epoch": 0.007223693715386468, - "grad_norm": 16.5, + "epoch": 0.042796005706134094, + "grad_norm": 18.0, "learning_rate": 2.36e-06, - "loss": 5.6272, + "loss": 5.6411, "step": 60 }, { - "epoch": 0.00842764266795088, - "grad_norm": 14.8125, + "epoch": 0.04992867332382311, + "grad_norm": 14.4375, "learning_rate": 2.7600000000000003e-06, - "loss": 5.5979, + "loss": 5.5919, "step": 70 }, { - "epoch": 0.00963159162051529, - "grad_norm": 22.375, + "epoch": 0.05706134094151213, + "grad_norm": 24.125, "learning_rate": 3.1600000000000007e-06, - "loss": 5.6515, + "loss": 5.6083, "step": 80 }, { - "epoch": 0.010835540573079701, - "grad_norm": 17.125, + "epoch": 0.06419400855920114, + "grad_norm": 18.25, "learning_rate": 3.5600000000000002e-06, - "loss": 5.6018, + "loss": 5.6599, "step": 90 }, { - "epoch": 0.012039489525644112, - "grad_norm": 14.9375, + "epoch": 0.07132667617689016, + "grad_norm": 18.25, "learning_rate": 3.96e-06, - "loss": 5.6342, + "loss": 5.6652, "step": 100 }, { - "epoch": 0.012039489525644112, - "eval/acc": 3.4883720874786377, + "epoch": 0.07132667617689016, + "eval/acc": 2.3255813121795654, "step": 100 }, { - "epoch": 0.012039489525644112, - "eval_loss": 5.140806198120117, - "eval_runtime": 2.4165, - "eval_samples_per_second": 17.794, - "eval_steps_per_second": 0.414, + "epoch": 0.07132667617689016, + "eval_loss": 5.090479850769043, + "eval_runtime": 2.3284, + "eval_samples_per_second": 18.467, + "eval_steps_per_second": 0.429, "step": 100 }, { - "epoch": 0.013243438478208525, - "grad_norm": 13.0, + "epoch": 0.07845934379457917, + "grad_norm": 21.0, "learning_rate": 4.360000000000001e-06, - "loss": 5.6124, + "loss": 5.6402, "step": 110 }, { - "epoch": 0.014447387430772935, - "grad_norm": 18.625, + "epoch": 0.08559201141226819, + "grad_norm": 16.875, "learning_rate": 4.76e-06, - "loss": 5.6127, + "loss": 5.6535, "step": 120 }, { - "epoch": 0.015651336383337346, - "grad_norm": 14.375, + "epoch": 0.09272467902995721, + "grad_norm": 21.5, "learning_rate": 5.1600000000000006e-06, - "loss": 5.5663, + "loss": 5.5821, "step": 130 }, { - "epoch": 0.01685528533590176, - "grad_norm": 11.9375, + "epoch": 0.09985734664764621, + "grad_norm": 18.5, "learning_rate": 5.56e-06, - "loss": 5.55, + "loss": 5.6184, "step": 140 }, { - "epoch": 0.018059234288466168, - "grad_norm": 14.5, + "epoch": 0.10699001426533523, + "grad_norm": 14.9375, "learning_rate": 5.9600000000000005e-06, - "loss": 5.5839, + "loss": 5.5743, "step": 150 }, { - "epoch": 0.01926318324103058, - "grad_norm": 15.0625, + "epoch": 0.11412268188302425, + "grad_norm": 16.875, "learning_rate": 6.360000000000001e-06, - "loss": 5.5259, + "loss": 5.5684, "step": 160 }, { - "epoch": 0.020467132193594993, - "grad_norm": 14.8125, + "epoch": 0.12125534950071326, + "grad_norm": 22.125, "learning_rate": 6.76e-06, - "loss": 5.4812, + "loss": 5.535, "step": 170 }, { - "epoch": 0.021671081146159402, - "grad_norm": 15.375, + "epoch": 0.12838801711840228, + "grad_norm": 15.9375, "learning_rate": 7.16e-06, - "loss": 5.4964, + "loss": 5.4357, "step": 180 }, { - "epoch": 0.022875030098723815, - "grad_norm": 14.0625, + "epoch": 0.1355206847360913, + "grad_norm": 16.375, "learning_rate": 7.5600000000000005e-06, - "loss": 5.4023, + "loss": 5.3766, "step": 190 }, { - "epoch": 0.024078979051288224, - "grad_norm": 18.625, + "epoch": 0.14265335235378032, + "grad_norm": 15.3125, "learning_rate": 7.96e-06, - "loss": 5.3778, + "loss": 5.4437, "step": 200 }, { - "epoch": 0.024078979051288224, - "eval/acc": 5.232558250427246, + "epoch": 0.14265335235378032, + "eval/acc": 2.3255813121795654, "step": 200 }, { - "epoch": 0.024078979051288224, - "eval_loss": 4.991551399230957, - "eval_runtime": 0.2363, - "eval_samples_per_second": 181.988, - "eval_steps_per_second": 4.232, + "epoch": 0.14265335235378032, + "eval_loss": 4.956757545471191, + "eval_runtime": 0.2941, + "eval_samples_per_second": 146.185, + "eval_steps_per_second": 3.4, "step": 200 }, { - "epoch": 0.025282928003852637, - "grad_norm": 16.25, + "epoch": 0.14978601997146934, + "grad_norm": 16.75, "learning_rate": 8.36e-06, - "loss": 5.3983, + "loss": 5.4744, "step": 210 }, { - "epoch": 0.02648687695641705, - "grad_norm": 17.25, + "epoch": 0.15691868758915833, + "grad_norm": 43.25, "learning_rate": 8.76e-06, - "loss": 5.2953, + "loss": 5.381, "step": 220 }, { - "epoch": 0.02769082590898146, - "grad_norm": 15.9375, + "epoch": 0.16405135520684735, + "grad_norm": 21.0, "learning_rate": 9.16e-06, - "loss": 5.2266, + "loss": 5.3092, "step": 230 }, { - "epoch": 0.02889477486154587, - "grad_norm": 21.875, + "epoch": 0.17118402282453637, + "grad_norm": 26.75, "learning_rate": 9.560000000000002e-06, - "loss": 5.139, + "loss": 5.2752, "step": 240 }, { - "epoch": 0.03009872381411028, - "grad_norm": 17.875, + "epoch": 0.1783166904422254, + "grad_norm": 26.875, "learning_rate": 9.96e-06, - "loss": 5.0639, + "loss": 5.2194, "step": 250 }, { - "epoch": 0.03130267276667469, - "grad_norm": 18.875, + "epoch": 0.18544935805991442, + "grad_norm": 20.875, "learning_rate": 1.036e-05, - "loss": 5.0118, + "loss": 5.0657, "step": 260 }, { - "epoch": 0.032506621719239105, - "grad_norm": 26.0, + "epoch": 0.19258202567760344, + "grad_norm": 25.125, "learning_rate": 1.076e-05, - "loss": 4.8959, + "loss": 4.967, "step": 270 }, { - "epoch": 0.03371057067180352, - "grad_norm": 18.5, + "epoch": 0.19971469329529243, + "grad_norm": 30.125, "learning_rate": 1.1160000000000002e-05, - "loss": 4.8454, + "loss": 4.9544, "step": 280 }, { - "epoch": 0.03491451962436792, - "grad_norm": 28.0, + "epoch": 0.20684736091298145, + "grad_norm": 24.625, "learning_rate": 1.156e-05, - "loss": 4.6846, + "loss": 4.7585, "step": 290 }, { - "epoch": 0.036118468576932336, - "grad_norm": 25.5, + "epoch": 0.21398002853067047, + "grad_norm": 21.375, "learning_rate": 1.196e-05, - "loss": 4.5211, + "loss": 4.635, "step": 300 }, { - "epoch": 0.036118468576932336, - "eval/acc": 6.395349025726318, + "epoch": 0.21398002853067047, + "eval/acc": 9.302325248718262, "step": 300 }, { - "epoch": 0.036118468576932336, - "eval_loss": 4.604515075683594, - "eval_runtime": 0.2156, - "eval_samples_per_second": 199.428, - "eval_steps_per_second": 4.638, + "epoch": 0.21398002853067047, + "eval_loss": 4.364280700683594, + "eval_runtime": 0.2145, + "eval_samples_per_second": 200.42, + "eval_steps_per_second": 4.661, "step": 300 }, { - "epoch": 0.03732241752949675, - "grad_norm": 28.0, + "epoch": 0.2211126961483595, + "grad_norm": 30.125, "learning_rate": 1.236e-05, - "loss": 4.3466, + "loss": 4.5333, "step": 310 }, { - "epoch": 0.03852636648206116, - "grad_norm": 27.125, + "epoch": 0.2282453637660485, + "grad_norm": 28.125, "learning_rate": 1.276e-05, - "loss": 4.1005, + "loss": 4.2888, "step": 320 }, { - "epoch": 0.039730315434625574, - "grad_norm": 33.0, + "epoch": 0.23537803138373753, + "grad_norm": 30.5, "learning_rate": 1.316e-05, - "loss": 3.7904, + "loss": 4.1744, "step": 330 }, { - "epoch": 0.040934264387189986, - "grad_norm": 32.75, + "epoch": 0.24251069900142652, + "grad_norm": 35.0, "learning_rate": 1.356e-05, - "loss": 3.4061, + "loss": 3.8812, "step": 340 }, { - "epoch": 0.04213821333975439, - "grad_norm": 31.125, + "epoch": 0.24964336661911554, + "grad_norm": 30.75, "learning_rate": 1.396e-05, - "loss": 3.2838, + "loss": 3.6772, "step": 350 }, { - "epoch": 0.043342162292318805, - "grad_norm": 23.75, + "epoch": 0.25677603423680456, + "grad_norm": 25.875, "learning_rate": 1.4360000000000001e-05, - "loss": 2.9101, + "loss": 3.3797, "step": 360 }, { - "epoch": 0.04454611124488322, - "grad_norm": 44.75, + "epoch": 0.26390870185449355, + "grad_norm": 31.375, "learning_rate": 1.4760000000000001e-05, - "loss": 2.6306, + "loss": 3.2338, "step": 370 }, { - "epoch": 0.04575006019744763, - "grad_norm": 33.25, + "epoch": 0.2710413694721826, + "grad_norm": 72.0, "learning_rate": 1.5160000000000002e-05, - "loss": 2.5454, + "loss": 2.976, "step": 380 }, { - "epoch": 0.04695400915001204, - "grad_norm": 31.375, + "epoch": 0.2781740370898716, + "grad_norm": 22.375, "learning_rate": 1.556e-05, - "loss": 2.5867, + "loss": 2.8207, "step": 390 }, { - "epoch": 0.04815795810257645, - "grad_norm": 18.5, + "epoch": 0.28530670470756064, + "grad_norm": 21.25, "learning_rate": 1.596e-05, - "loss": 2.3251, + "loss": 2.8341, "step": 400 }, { - "epoch": 0.04815795810257645, - "eval/acc": 12.209301948547363, + "epoch": 0.28530670470756064, + "eval/acc": 9.302325248718262, "step": 400 }, { - "epoch": 0.04815795810257645, - "eval_loss": 3.941906452178955, - "eval_runtime": 0.2265, - "eval_samples_per_second": 189.814, - "eval_steps_per_second": 4.414, + "epoch": 0.28530670470756064, + "eval_loss": 3.6011083126068115, + "eval_runtime": 0.2115, + "eval_samples_per_second": 203.312, + "eval_steps_per_second": 4.728, "step": 400 }, { - "epoch": 0.04936190705514086, - "grad_norm": 18.0, + "epoch": 0.29243937232524964, + "grad_norm": 21.0, "learning_rate": 1.636e-05, - "loss": 2.394, + "loss": 2.6431, "step": 410 }, { - "epoch": 0.05056585600770527, - "grad_norm": 22.375, + "epoch": 0.2995720399429387, + "grad_norm": 20.875, "learning_rate": 1.6760000000000002e-05, - "loss": 2.2856, + "loss": 2.6506, "step": 420 }, { - "epoch": 0.051769804960269686, - "grad_norm": 17.25, + "epoch": 0.3067047075606277, + "grad_norm": 21.125, "learning_rate": 1.7160000000000002e-05, - "loss": 2.3414, + "loss": 2.491, "step": 430 }, { - "epoch": 0.0529737539128341, - "grad_norm": 15.25, + "epoch": 0.31383737517831667, + "grad_norm": 31.75, "learning_rate": 1.756e-05, - "loss": 2.156, + "loss": 2.423, "step": 440 }, { - "epoch": 0.054177702865398504, - "grad_norm": 15.75, + "epoch": 0.3209700427960057, + "grad_norm": 19.375, "learning_rate": 1.796e-05, - "loss": 2.0164, + "loss": 2.5108, "step": 450 }, { - "epoch": 0.05538165181796292, - "grad_norm": 28.5, + "epoch": 0.3281027104136947, + "grad_norm": 17.375, "learning_rate": 1.8360000000000004e-05, - "loss": 1.9555, + "loss": 2.4584, "step": 460 }, { - "epoch": 0.05658560077052733, - "grad_norm": 19.25, + "epoch": 0.33523537803138376, + "grad_norm": 22.625, "learning_rate": 1.876e-05, - "loss": 2.0277, + "loss": 2.3526, "step": 470 }, { - "epoch": 0.05778954972309174, - "grad_norm": 15.375, + "epoch": 0.34236804564907275, + "grad_norm": 30.25, "learning_rate": 1.916e-05, - "loss": 2.1719, + "loss": 2.3634, "step": 480 }, { - "epoch": 0.058993498675656154, - "grad_norm": 18.875, + "epoch": 0.34950071326676174, + "grad_norm": 15.5625, "learning_rate": 1.956e-05, - "loss": 2.013, + "loss": 2.3339, "step": 490 }, { - "epoch": 0.06019744762822056, - "grad_norm": 18.625, + "epoch": 0.3566333808844508, + "grad_norm": 19.5, "learning_rate": 1.9960000000000002e-05, - "loss": 1.8574, + "loss": 2.268, "step": 500 }, { - "epoch": 0.06019744762822056, - "eval/acc": 20.930233001708984, + "epoch": 0.3566333808844508, + "eval/acc": 16.279069900512695, "step": 500 }, { - "epoch": 0.06019744762822056, - "eval_loss": 3.6547293663024902, - "eval_runtime": 0.2139, - "eval_samples_per_second": 201.002, - "eval_steps_per_second": 4.674, + "epoch": 0.3566333808844508, + "eval_loss": 3.2489778995513916, + "eval_runtime": 0.2558, + "eval_samples_per_second": 168.115, + "eval_steps_per_second": 3.91, "step": 500 }, { - "epoch": 0.06140139658078497, - "grad_norm": 19.875, + "epoch": 0.3637660485021398, + "grad_norm": 29.375, "learning_rate": 2.036e-05, - "loss": 1.9431, + "loss": 2.2728, "step": 510 }, { - "epoch": 0.06260534553334939, - "grad_norm": 14.625, + "epoch": 0.37089871611982883, + "grad_norm": 21.25, "learning_rate": 2.076e-05, - "loss": 1.8311, + "loss": 2.1346, "step": 520 }, { - "epoch": 0.0638092944859138, - "grad_norm": 20.0, + "epoch": 0.3780313837375178, + "grad_norm": 14.8125, "learning_rate": 2.116e-05, - "loss": 2.0005, + "loss": 2.2719, "step": 530 }, { - "epoch": 0.06501324343847821, - "grad_norm": 16.0, + "epoch": 0.38516405135520687, + "grad_norm": 27.75, "learning_rate": 2.1560000000000004e-05, - "loss": 1.7374, + "loss": 2.145, "step": 540 }, { - "epoch": 0.06621719239104262, - "grad_norm": 13.0625, + "epoch": 0.39229671897289586, + "grad_norm": 16.125, "learning_rate": 2.196e-05, - "loss": 1.7838, + "loss": 2.0912, "step": 550 }, { - "epoch": 0.06742114134360704, - "grad_norm": 16.5, + "epoch": 0.39942938659058486, + "grad_norm": 20.25, "learning_rate": 2.236e-05, - "loss": 1.8264, + "loss": 2.0302, "step": 560 }, { - "epoch": 0.06862509029617145, - "grad_norm": 20.5, + "epoch": 0.4065620542082739, + "grad_norm": 17.75, "learning_rate": 2.2760000000000002e-05, - "loss": 1.658, + "loss": 2.1832, "step": 570 }, { - "epoch": 0.06982903924873585, - "grad_norm": 25.75, + "epoch": 0.4136947218259629, + "grad_norm": 14.5, "learning_rate": 2.3160000000000002e-05, - "loss": 1.7826, + "loss": 1.9652, "step": 580 }, { - "epoch": 0.07103298820130026, - "grad_norm": 19.375, + "epoch": 0.42082738944365194, + "grad_norm": 17.0, "learning_rate": 2.356e-05, - "loss": 1.6539, + "loss": 1.8911, "step": 590 }, { - "epoch": 0.07223693715386467, - "grad_norm": 19.25, + "epoch": 0.42796005706134094, + "grad_norm": 20.0, "learning_rate": 2.396e-05, - "loss": 1.6278, + "loss": 2.0266, "step": 600 }, { - "epoch": 0.07223693715386467, - "eval/acc": 20.930233001708984, + "epoch": 0.42796005706134094, + "eval/acc": 30.23255729675293, "step": 600 }, { - "epoch": 0.07223693715386467, - "eval_loss": 3.387899398803711, - "eval_runtime": 0.2536, - "eval_samples_per_second": 169.572, - "eval_steps_per_second": 3.944, + "epoch": 0.42796005706134094, + "eval_loss": 2.946028470993042, + "eval_runtime": 0.2139, + "eval_samples_per_second": 201.065, + "eval_steps_per_second": 4.676, "step": 600 }, { - "epoch": 0.07344088610642908, - "grad_norm": 12.0625, + "epoch": 0.43509272467902993, + "grad_norm": 25.5, "learning_rate": 2.4360000000000004e-05, - "loss": 1.5342, + "loss": 1.9116, "step": 610 }, { - "epoch": 0.0746448350589935, - "grad_norm": 15.625, + "epoch": 0.442225392296719, + "grad_norm": 25.375, "learning_rate": 2.476e-05, - "loss": 1.5919, + "loss": 1.7644, "step": 620 }, { - "epoch": 0.07584878401155791, - "grad_norm": 25.5, + "epoch": 0.44935805991440797, + "grad_norm": 15.5, "learning_rate": 2.516e-05, - "loss": 1.5713, + "loss": 1.9008, "step": 630 }, { - "epoch": 0.07705273296412232, - "grad_norm": 14.8125, + "epoch": 0.456490727532097, + "grad_norm": 16.875, "learning_rate": 2.556e-05, - "loss": 1.4714, + "loss": 1.619, "step": 640 }, { - "epoch": 0.07825668191668674, - "grad_norm": 21.5, + "epoch": 0.463623395149786, + "grad_norm": 37.25, "learning_rate": 2.5960000000000002e-05, - "loss": 1.5835, + "loss": 1.7725, "step": 650 }, { - "epoch": 0.07946063086925115, - "grad_norm": 58.0, + "epoch": 0.47075606276747506, + "grad_norm": 16.5, "learning_rate": 2.6360000000000002e-05, - "loss": 1.5369, + "loss": 1.7405, "step": 660 }, { - "epoch": 0.08066457982181556, - "grad_norm": 45.0, + "epoch": 0.47788873038516405, + "grad_norm": 16.25, "learning_rate": 2.676e-05, - "loss": 1.4629, + "loss": 1.5825, "step": 670 }, { - "epoch": 0.08186852877437997, - "grad_norm": 14.1875, + "epoch": 0.48502139800285304, + "grad_norm": 68.5, "learning_rate": 2.716e-05, - "loss": 1.4288, + "loss": 1.8379, "step": 680 }, { - "epoch": 0.08307247772694437, - "grad_norm": 40.25, + "epoch": 0.4921540656205421, + "grad_norm": 50.0, "learning_rate": 2.7560000000000004e-05, - "loss": 1.4729, + "loss": 1.7989, "step": 690 }, { - "epoch": 0.08427642667950878, - "grad_norm": 13.625, + "epoch": 0.4992867332382311, + "grad_norm": 16.25, "learning_rate": 2.7960000000000003e-05, - "loss": 1.4883, + "loss": 1.7058, "step": 700 }, { - "epoch": 0.08427642667950878, - "eval/acc": 23.255813598632812, + "epoch": 0.4992867332382311, + "eval/acc": 30.23255729675293, "step": 700 }, { - "epoch": 0.08427642667950878, - "eval_loss": 3.206946611404419, - "eval_runtime": 0.4188, - "eval_samples_per_second": 102.684, - "eval_steps_per_second": 2.388, + "epoch": 0.4992867332382311, + "eval_loss": 2.8809142112731934, + "eval_runtime": 0.2095, + "eval_samples_per_second": 205.208, + "eval_steps_per_second": 4.772, "step": 700 }, { - "epoch": 0.0854803756320732, - "grad_norm": 15.75, + "epoch": 0.5064194008559201, + "grad_norm": 14.625, "learning_rate": 2.8360000000000003e-05, - "loss": 1.5656, + "loss": 1.6542, "step": 710 }, { - "epoch": 0.08668432458463761, - "grad_norm": 22.25, + "epoch": 0.5135520684736091, + "grad_norm": 71.0, "learning_rate": 2.8760000000000002e-05, - "loss": 1.6742, + "loss": 1.6763, "step": 720 }, { - "epoch": 0.08788827353720202, - "grad_norm": 12.3125, + "epoch": 0.5206847360912982, + "grad_norm": 17.125, "learning_rate": 2.9160000000000005e-05, - "loss": 1.35, + "loss": 1.6858, "step": 730 }, { - "epoch": 0.08909222248976643, - "grad_norm": 13.8125, + "epoch": 0.5278174037089871, + "grad_norm": 19.75, "learning_rate": 2.9559999999999998e-05, - "loss": 1.4435, + "loss": 1.6718, "step": 740 }, { - "epoch": 0.09029617144233085, - "grad_norm": 13.1875, + "epoch": 0.5349500713266762, + "grad_norm": 13.375, "learning_rate": 2.9959999999999998e-05, - "loss": 1.3843, + "loss": 1.6164, "step": 750 }, { - "epoch": 0.09150012039489526, - "grad_norm": 13.3125, + "epoch": 0.5420827389443652, + "grad_norm": 14.1875, "learning_rate": 3.036e-05, - "loss": 1.3327, + "loss": 1.6049, "step": 760 }, { - "epoch": 0.09270406934745967, - "grad_norm": 18.875, + "epoch": 0.5492154065620543, + "grad_norm": 35.75, "learning_rate": 3.076e-05, - "loss": 1.4628, + "loss": 1.5453, "step": 770 }, { - "epoch": 0.09390801830002408, - "grad_norm": 14.5625, + "epoch": 0.5563480741797432, + "grad_norm": 28.75, "learning_rate": 3.116e-05, - "loss": 1.3306, + "loss": 1.4818, "step": 780 }, { - "epoch": 0.09511196725258848, - "grad_norm": 18.75, + "epoch": 0.5634807417974322, + "grad_norm": 17.375, "learning_rate": 3.156e-05, - "loss": 1.4936, + "loss": 1.5647, "step": 790 }, { - "epoch": 0.0963159162051529, - "grad_norm": 11.5, + "epoch": 0.5706134094151213, + "grad_norm": 13.6875, "learning_rate": 3.196e-05, - "loss": 1.3515, + "loss": 1.5206, "step": 800 }, { - "epoch": 0.0963159162051529, - "eval/acc": 22.674419403076172, + "epoch": 0.5706134094151213, + "eval/acc": 30.23255729675293, "step": 800 }, { - "epoch": 0.0963159162051529, - "eval_loss": 3.1510462760925293, - "eval_runtime": 0.2676, - "eval_samples_per_second": 160.701, - "eval_steps_per_second": 3.737, + "epoch": 0.5706134094151213, + "eval_loss": 2.8829538822174072, + "eval_runtime": 0.2122, + "eval_samples_per_second": 202.686, + "eval_steps_per_second": 4.714, "step": 800 }, { - "epoch": 0.09751986515771731, - "grad_norm": 11.6875, + "epoch": 0.5777460770328102, + "grad_norm": 17.125, "learning_rate": 3.236e-05, - "loss": 1.4593, + "loss": 1.6124, "step": 810 }, { - "epoch": 0.09872381411028172, - "grad_norm": 10.5625, + "epoch": 0.5848787446504993, + "grad_norm": 14.8125, "learning_rate": 3.2760000000000005e-05, - "loss": 1.3453, + "loss": 1.4254, "step": 820 }, { - "epoch": 0.09992776306284613, - "grad_norm": 11.625, + "epoch": 0.5920114122681883, + "grad_norm": 15.0, "learning_rate": 3.316e-05, - "loss": 1.4041, + "loss": 1.7124, "step": 830 }, { - "epoch": 0.10113171201541055, - "grad_norm": 13.0, + "epoch": 0.5991440798858774, + "grad_norm": 14.75, "learning_rate": 3.3560000000000004e-05, - "loss": 1.2766, + "loss": 1.5384, "step": 840 }, { - "epoch": 0.10233566096797496, - "grad_norm": 40.0, + "epoch": 0.6062767475035663, + "grad_norm": 31.5, "learning_rate": 3.396e-05, - "loss": 1.2678, + "loss": 1.4899, "step": 850 }, { - "epoch": 0.10353960992053937, - "grad_norm": 13.75, + "epoch": 0.6134094151212554, + "grad_norm": 13.875, "learning_rate": 3.436e-05, - "loss": 1.2514, + "loss": 1.5377, "step": 860 }, { - "epoch": 0.10474355887310378, - "grad_norm": 11.75, + "epoch": 0.6205420827389444, + "grad_norm": 14.9375, "learning_rate": 3.4760000000000006e-05, - "loss": 1.3518, + "loss": 1.4892, "step": 870 }, { - "epoch": 0.1059475078256682, - "grad_norm": 11.875, + "epoch": 0.6276747503566333, + "grad_norm": 37.25, "learning_rate": 3.516e-05, - "loss": 1.2675, + "loss": 1.4872, "step": 880 }, { - "epoch": 0.10715145677823261, - "grad_norm": 13.0, + "epoch": 0.6348074179743224, + "grad_norm": 18.875, "learning_rate": 3.5560000000000005e-05, - "loss": 1.294, + "loss": 1.536, "step": 890 }, { - "epoch": 0.10835540573079701, - "grad_norm": 13.0, + "epoch": 0.6419400855920114, + "grad_norm": 18.625, "learning_rate": 3.596e-05, - "loss": 1.1209, + "loss": 1.5208, "step": 900 }, { - "epoch": 0.10835540573079701, - "eval/acc": 25.581396102905273, + "epoch": 0.6419400855920114, + "eval/acc": 30.23255729675293, "step": 900 }, { - "epoch": 0.10835540573079701, - "eval_loss": 3.0571491718292236, - "eval_runtime": 0.3097, - "eval_samples_per_second": 138.846, - "eval_steps_per_second": 3.229, + "epoch": 0.6419400855920114, + "eval_loss": 2.8081133365631104, + "eval_runtime": 0.2319, + "eval_samples_per_second": 185.434, + "eval_steps_per_second": 4.312, "step": 900 }, { - "epoch": 0.10955935468336142, - "grad_norm": 12.75, + "epoch": 0.6490727532097005, + "grad_norm": 19.875, "learning_rate": 3.636e-05, - "loss": 1.2681, + "loss": 1.4606, "step": 910 }, { - "epoch": 0.11076330363592583, - "grad_norm": 17.0, + "epoch": 0.6562054208273894, + "grad_norm": 12.625, "learning_rate": 3.676e-05, - "loss": 1.2606, + "loss": 1.4728, "step": 920 }, { - "epoch": 0.11196725258849025, - "grad_norm": 11.375, + "epoch": 0.6633380884450785, + "grad_norm": 15.0, "learning_rate": 3.716e-05, - "loss": 1.2194, + "loss": 1.449, "step": 930 }, { - "epoch": 0.11317120154105466, - "grad_norm": 12.125, + "epoch": 0.6704707560627675, + "grad_norm": 19.0, "learning_rate": 3.756e-05, - "loss": 1.2905, + "loss": 1.5292, "step": 940 }, { - "epoch": 0.11437515049361907, - "grad_norm": 18.125, + "epoch": 0.6776034236804565, + "grad_norm": 111.5, "learning_rate": 3.796e-05, - "loss": 1.2563, + "loss": 1.4891, "step": 950 }, { - "epoch": 0.11557909944618348, - "grad_norm": 17.125, + "epoch": 0.6847360912981455, + "grad_norm": 14.75, "learning_rate": 3.836e-05, - "loss": 1.1894, + "loss": 1.4202, "step": 960 }, { - "epoch": 0.1167830483987479, - "grad_norm": 11.875, + "epoch": 0.6918687589158345, + "grad_norm": 20.25, "learning_rate": 3.876e-05, - "loss": 1.2441, + "loss": 1.5258, "step": 970 }, { - "epoch": 0.11798699735131231, - "grad_norm": 15.8125, + "epoch": 0.6990014265335235, + "grad_norm": 48.0, "learning_rate": 3.9160000000000005e-05, - "loss": 1.2627, + "loss": 1.3912, "step": 980 }, { - "epoch": 0.11919094630387672, - "grad_norm": 17.375, + "epoch": 0.7061340941512125, + "grad_norm": 13.0, "learning_rate": 3.956e-05, - "loss": 1.3929, + "loss": 1.4859, "step": 990 }, { - "epoch": 0.12039489525644112, - "grad_norm": 11.125, + "epoch": 0.7132667617689016, + "grad_norm": 15.5625, "learning_rate": 3.9960000000000004e-05, - "loss": 1.1332, + "loss": 1.4614, "step": 1000 }, { - "epoch": 0.12039489525644112, - "eval/acc": 26.162790298461914, + "epoch": 0.7132667617689016, + "eval/acc": 37.20930099487305, "step": 1000 }, { - "epoch": 0.12039489525644112, - "eval_loss": 2.9910976886749268, - "eval_runtime": 0.2826, - "eval_samples_per_second": 152.17, - "eval_steps_per_second": 3.539, + "epoch": 0.7132667617689016, + "eval_loss": 2.743621587753296, + "eval_runtime": 0.5811, + "eval_samples_per_second": 73.997, + "eval_steps_per_second": 1.721, "step": 1000 }, { - "epoch": 0.12159884420900553, - "grad_norm": 13.75, + "epoch": 0.7203994293865906, + "grad_norm": 16.625, "learning_rate": 4.0360000000000007e-05, - "loss": 1.2314, + "loss": 1.56, "step": 1010 }, { - "epoch": 0.12280279316156995, - "grad_norm": 11.875, + "epoch": 0.7275320970042796, + "grad_norm": 15.5625, "learning_rate": 4.076e-05, - "loss": 1.2654, + "loss": 1.4469, "step": 1020 }, { - "epoch": 0.12400674211413436, - "grad_norm": 12.8125, + "epoch": 0.7346647646219686, + "grad_norm": 15.0, "learning_rate": 4.1160000000000006e-05, - "loss": 1.1432, + "loss": 1.381, "step": 1030 }, { - "epoch": 0.12521069106669877, - "grad_norm": 13.9375, + "epoch": 0.7417974322396577, + "grad_norm": 13.625, "learning_rate": 4.156e-05, - "loss": 1.1669, + "loss": 1.3749, "step": 1040 }, { - "epoch": 0.1264146400192632, - "grad_norm": 19.25, + "epoch": 0.7489300998573466, + "grad_norm": 12.0625, "learning_rate": 4.196e-05, - "loss": 1.1836, + "loss": 1.3919, "step": 1050 }, { - "epoch": 0.1276185889718276, - "grad_norm": 11.375, + "epoch": 0.7560627674750356, + "grad_norm": 16.25, "learning_rate": 4.236e-05, - "loss": 1.2449, + "loss": 1.4208, "step": 1060 }, { - "epoch": 0.128822537924392, - "grad_norm": 10.6875, + "epoch": 0.7631954350927247, + "grad_norm": 27.75, "learning_rate": 4.276e-05, - "loss": 1.1361, + "loss": 1.3714, "step": 1070 }, { - "epoch": 0.13002648687695642, - "grad_norm": 11.5, + "epoch": 0.7703281027104137, + "grad_norm": 13.125, "learning_rate": 4.316e-05, - "loss": 1.1989, + "loss": 1.3344, "step": 1080 }, { - "epoch": 0.13123043582952082, - "grad_norm": 13.0, + "epoch": 0.7774607703281027, + "grad_norm": 11.9375, "learning_rate": 4.356e-05, - "loss": 1.1004, + "loss": 1.3291, "step": 1090 }, { - "epoch": 0.13243438478208525, - "grad_norm": 10.125, + "epoch": 0.7845934379457917, + "grad_norm": 17.125, "learning_rate": 4.396e-05, - "loss": 1.1308, + "loss": 1.3536, "step": 1100 }, { - "epoch": 0.13243438478208525, + "epoch": 0.7845934379457917, "eval/acc": 27.9069766998291, "step": 1100 }, { - "epoch": 0.13243438478208525, - "eval_loss": 3.0177316665649414, - "eval_runtime": 0.2801, - "eval_samples_per_second": 153.54, - "eval_steps_per_second": 3.571, + "epoch": 0.7845934379457917, + "eval_loss": 2.8128697872161865, + "eval_runtime": 0.484, + "eval_samples_per_second": 88.836, + "eval_steps_per_second": 2.066, "step": 1100 }, { - "epoch": 0.13363833373464964, - "grad_norm": 9.5, + "epoch": 0.7917261055634808, + "grad_norm": 13.3125, "learning_rate": 4.436e-05, - "loss": 1.1862, + "loss": 1.4598, "step": 1110 }, { - "epoch": 0.13484228268721407, - "grad_norm": 13.75, + "epoch": 0.7988587731811697, + "grad_norm": 15.25, "learning_rate": 4.4760000000000005e-05, - "loss": 1.1764, + "loss": 1.3795, "step": 1120 }, { - "epoch": 0.13604623163977847, - "grad_norm": 30.625, + "epoch": 0.8059914407988588, + "grad_norm": 12.0625, "learning_rate": 4.516e-05, - "loss": 1.0422, + "loss": 1.2518, "step": 1130 }, { - "epoch": 0.1372501805923429, - "grad_norm": 9.875, + "epoch": 0.8131241084165478, + "grad_norm": 16.625, "learning_rate": 4.5560000000000004e-05, - "loss": 1.1796, + "loss": 1.3104, "step": 1140 }, { - "epoch": 0.1384541295449073, - "grad_norm": 13.1875, + "epoch": 0.8202567760342369, + "grad_norm": 11.875, "learning_rate": 4.596e-05, - "loss": 1.0483, + "loss": 1.2996, "step": 1150 }, { - "epoch": 0.1396580784974717, - "grad_norm": 11.75, + "epoch": 0.8273894436519258, + "grad_norm": 24.125, "learning_rate": 4.636e-05, - "loss": 1.1647, + "loss": 1.2067, "step": 1160 }, { - "epoch": 0.14086202745003612, - "grad_norm": 13.375, + "epoch": 0.8345221112696148, + "grad_norm": 11.0, "learning_rate": 4.6760000000000006e-05, - "loss": 1.2839, + "loss": 1.3035, "step": 1170 }, { - "epoch": 0.14206597640260052, - "grad_norm": 42.0, + "epoch": 0.8416547788873039, + "grad_norm": 13.125, "learning_rate": 4.716e-05, - "loss": 1.1594, + "loss": 1.2859, "step": 1180 }, { - "epoch": 0.14326992535516495, - "grad_norm": 15.625, + "epoch": 0.8487874465049928, + "grad_norm": 11.0, "learning_rate": 4.7560000000000005e-05, - "loss": 1.1073, + "loss": 1.3982, "step": 1190 }, { - "epoch": 0.14447387430772934, - "grad_norm": 11.5, + "epoch": 0.8559201141226819, + "grad_norm": 12.875, "learning_rate": 4.796e-05, - "loss": 1.1593, + "loss": 1.299, "step": 1200 }, { - "epoch": 0.14447387430772934, - "eval/acc": 26.162790298461914, + "epoch": 0.8559201141226819, + "eval/acc": 34.88372039794922, "step": 1200 }, { - "epoch": 0.14447387430772934, - "eval_loss": 3.0329606533050537, - "eval_runtime": 0.2185, - "eval_samples_per_second": 196.829, - "eval_steps_per_second": 4.577, + "epoch": 0.8559201141226819, + "eval_loss": 2.7250428199768066, + "eval_runtime": 0.3522, + "eval_samples_per_second": 122.084, + "eval_steps_per_second": 2.839, "step": 1200 }, { - "epoch": 0.14567782326029377, - "grad_norm": 12.5625, + "epoch": 0.8630527817403709, + "grad_norm": 11.25, "learning_rate": 4.836e-05, - "loss": 1.1088, + "loss": 1.3549, "step": 1210 }, { - "epoch": 0.14688177221285817, - "grad_norm": 10.4375, + "epoch": 0.8701854493580599, + "grad_norm": 15.25, "learning_rate": 4.876e-05, - "loss": 1.1565, + "loss": 1.3649, "step": 1220 }, { - "epoch": 0.1480857211654226, - "grad_norm": 11.3125, + "epoch": 0.8773181169757489, + "grad_norm": 22.0, "learning_rate": 4.9160000000000004e-05, - "loss": 1.0596, + "loss": 1.2441, "step": 1230 }, { - "epoch": 0.149289670117987, - "grad_norm": 11.375, + "epoch": 0.884450784593438, + "grad_norm": 12.375, "learning_rate": 4.956e-05, - "loss": 1.2416, + "loss": 1.2196, "step": 1240 }, { - "epoch": 0.15049361907055142, - "grad_norm": 10.3125, + "epoch": 0.891583452211127, + "grad_norm": 14.25, "learning_rate": 4.996e-05, - "loss": 1.0492, + "loss": 1.3274, "step": 1250 }, { - "epoch": 0.15169756802311582, - "grad_norm": 10.9375, + "epoch": 0.8987161198288159, + "grad_norm": 10.0625, "learning_rate": 5.0360000000000006e-05, - "loss": 1.0263, + "loss": 1.2896, "step": 1260 }, { - "epoch": 0.15290151697568022, - "grad_norm": 11.0625, + "epoch": 0.905848787446505, + "grad_norm": 16.875, "learning_rate": 5.076000000000001e-05, - "loss": 1.1197, + "loss": 1.3019, "step": 1270 }, { - "epoch": 0.15410546592824464, - "grad_norm": 33.25, + "epoch": 0.912981455064194, + "grad_norm": 26.375, "learning_rate": 5.1160000000000005e-05, - "loss": 1.0614, + "loss": 1.3756, "step": 1280 }, { - "epoch": 0.15530941488080904, - "grad_norm": 11.3125, + "epoch": 0.920114122681883, + "grad_norm": 18.25, "learning_rate": 5.1559999999999994e-05, - "loss": 1.0948, + "loss": 1.327, "step": 1290 }, { - "epoch": 0.15651336383337347, - "grad_norm": 24.5, + "epoch": 0.927246790299572, + "grad_norm": 11.3125, "learning_rate": 5.196e-05, - "loss": 1.1113, + "loss": 1.3237, "step": 1300 }, { - "epoch": 0.15651336383337347, - "eval/acc": 25.581396102905273, + "epoch": 0.927246790299572, + "eval/acc": 39.53488540649414, "step": 1300 }, { - "epoch": 0.15651336383337347, - "eval_loss": 2.944797992706299, - "eval_runtime": 0.3019, - "eval_samples_per_second": 142.434, - "eval_steps_per_second": 3.312, + "epoch": 0.927246790299572, + "eval_loss": 2.733259916305542, + "eval_runtime": 0.51, + "eval_samples_per_second": 84.32, + "eval_steps_per_second": 1.961, "step": 1300 }, { - "epoch": 0.15771731278593787, - "grad_norm": 12.4375, + "epoch": 0.9343794579172611, + "grad_norm": 18.125, "learning_rate": 5.236e-05, - "loss": 0.9531, + "loss": 1.256, "step": 1310 }, { - "epoch": 0.1589212617385023, - "grad_norm": 12.3125, + "epoch": 0.9415121255349501, + "grad_norm": 10.25, "learning_rate": 5.2759999999999996e-05, - "loss": 1.0079, + "loss": 1.1386, "step": 1320 }, { - "epoch": 0.1601252106910667, - "grad_norm": 13.1875, + "epoch": 0.948644793152639, + "grad_norm": 11.1875, "learning_rate": 5.316e-05, - "loss": 1.0674, + "loss": 1.3115, "step": 1330 }, { - "epoch": 0.16132915964363112, - "grad_norm": 16.875, + "epoch": 0.9557774607703281, + "grad_norm": 10.875, "learning_rate": 5.356e-05, - "loss": 1.1194, + "loss": 1.2315, "step": 1340 }, { - "epoch": 0.16253310859619552, - "grad_norm": 10.625, + "epoch": 0.9629101283880172, + "grad_norm": 12.0, "learning_rate": 5.396e-05, - "loss": 1.0057, + "loss": 1.3327, "step": 1350 }, { - "epoch": 0.16373705754875995, - "grad_norm": 9.125, + "epoch": 0.9700427960057061, + "grad_norm": 11.75, "learning_rate": 5.436e-05, - "loss": 1.1257, + "loss": 1.4052, "step": 1360 }, { - "epoch": 0.16494100650132434, - "grad_norm": 8.5, + "epoch": 0.9771754636233951, + "grad_norm": 11.4375, "learning_rate": 5.476e-05, - "loss": 0.9545, + "loss": 1.1349, "step": 1370 }, { - "epoch": 0.16614495545388874, - "grad_norm": 10.25, + "epoch": 0.9843081312410842, + "grad_norm": 15.125, "learning_rate": 5.516e-05, - "loss": 1.0648, + "loss": 1.3803, "step": 1380 }, { - "epoch": 0.16734890440645317, - "grad_norm": 14.9375, + "epoch": 0.9914407988587732, + "grad_norm": 16.75, "learning_rate": 5.556e-05, - "loss": 1.0364, + "loss": 1.3536, "step": 1390 }, { - "epoch": 0.16855285335901757, - "grad_norm": 138.0, + "epoch": 0.9985734664764622, + "grad_norm": 10.625, "learning_rate": 5.596e-05, - "loss": 1.0255, + "loss": 1.2981, "step": 1400 }, { - "epoch": 0.16855285335901757, - "eval/acc": 27.9069766998291, + "epoch": 0.9985734664764622, + "eval/acc": 39.53488540649414, "step": 1400 }, { - "epoch": 0.16855285335901757, - "eval_loss": 2.763101100921631, - "eval_runtime": 0.2759, - "eval_samples_per_second": 155.826, - "eval_steps_per_second": 3.624, + "epoch": 0.9985734664764622, + "eval_loss": 2.597245693206787, + "eval_runtime": 0.2116, + "eval_samples_per_second": 203.214, + "eval_steps_per_second": 4.726, "step": 1400 }, { - "epoch": 0.169756802311582, - "grad_norm": 11.8125, + "epoch": 1.005706134094151, + "grad_norm": 15.0, "learning_rate": 5.636e-05, - "loss": 0.9813, + "loss": 1.2173, "step": 1410 }, { - "epoch": 0.1709607512641464, - "grad_norm": 9.1875, + "epoch": 1.0128388017118402, + "grad_norm": 15.4375, "learning_rate": 5.6760000000000005e-05, - "loss": 0.9929, + "loss": 1.1965, "step": 1420 }, { - "epoch": 0.17216470021671082, - "grad_norm": 10.875, + "epoch": 1.0199714693295292, + "grad_norm": 21.625, "learning_rate": 5.716e-05, - "loss": 0.9113, + "loss": 1.2494, "step": 1430 }, { - "epoch": 0.17336864916927522, - "grad_norm": 19.375, + "epoch": 1.0271041369472182, + "grad_norm": 13.0, "learning_rate": 5.7560000000000005e-05, - "loss": 1.0711, + "loss": 1.1948, "step": 1440 }, { - "epoch": 0.17457259812183964, - "grad_norm": 9.8125, + "epoch": 1.0342368045649073, + "grad_norm": 11.0, "learning_rate": 5.796e-05, - "loss": 0.9322, + "loss": 1.2641, "step": 1450 }, { - "epoch": 0.17577654707440404, - "grad_norm": 10.5, + "epoch": 1.0413694721825963, + "grad_norm": 13.1875, "learning_rate": 5.8360000000000004e-05, - "loss": 1.0316, + "loss": 1.2526, "step": 1460 }, { - "epoch": 0.17698049602696847, - "grad_norm": 10.25, + "epoch": 1.0485021398002854, + "grad_norm": 46.0, "learning_rate": 5.876000000000001e-05, - "loss": 1.0165, + "loss": 1.0786, "step": 1470 }, { - "epoch": 0.17818444497953287, - "grad_norm": 10.4375, + "epoch": 1.0556348074179742, + "grad_norm": 11.0, "learning_rate": 5.916e-05, - "loss": 1.0229, + "loss": 1.3154, "step": 1480 }, { - "epoch": 0.17938839393209727, - "grad_norm": 14.4375, + "epoch": 1.0627674750356633, + "grad_norm": 18.75, "learning_rate": 5.9560000000000006e-05, - "loss": 0.9684, + "loss": 1.257, "step": 1490 }, { - "epoch": 0.1805923428846617, - "grad_norm": 8.375, + "epoch": 1.0699001426533523, + "grad_norm": 11.5625, "learning_rate": 5.996e-05, - "loss": 0.9948, + "loss": 1.2636, "step": 1500 }, { - "epoch": 0.1805923428846617, - "eval/acc": 34.88372039794922, + "epoch": 1.0699001426533523, + "eval/acc": 32.55813980102539, "step": 1500 }, { - "epoch": 0.1805923428846617, - "eval_loss": 2.8177433013916016, - "eval_runtime": 0.208, - "eval_samples_per_second": 206.732, - "eval_steps_per_second": 4.808, + "epoch": 1.0699001426533523, + "eval_loss": 2.9036648273468018, + "eval_runtime": 0.4893, + "eval_samples_per_second": 87.889, + "eval_steps_per_second": 2.044, "step": 1500 }, { - "epoch": 0.1817962918372261, - "grad_norm": 19.25, + "epoch": 1.0770328102710414, + "grad_norm": 13.75, "learning_rate": 6.0360000000000005e-05, - "loss": 0.9897, + "loss": 1.2602, "step": 1510 }, { - "epoch": 0.18300024078979052, - "grad_norm": 32.5, + "epoch": 1.0841654778887304, + "grad_norm": 11.625, "learning_rate": 6.076000000000001e-05, - "loss": 0.9217, + "loss": 1.0823, "step": 1520 }, { - "epoch": 0.18420418974235492, - "grad_norm": 9.5, + "epoch": 1.0912981455064195, + "grad_norm": 9.0, "learning_rate": 6.116e-05, - "loss": 1.0494, + "loss": 1.3059, "step": 1530 }, { - "epoch": 0.18540813869491934, - "grad_norm": 9.25, + "epoch": 1.0984308131241085, + "grad_norm": 10.4375, "learning_rate": 6.156e-05, - "loss": 0.9359, + "loss": 1.2006, "step": 1540 }, { - "epoch": 0.18661208764748374, - "grad_norm": 11.375, + "epoch": 1.1055634807417973, + "grad_norm": 15.75, "learning_rate": 6.196000000000001e-05, - "loss": 0.9112, + "loss": 1.3731, "step": 1550 }, { - "epoch": 0.18781603660004817, - "grad_norm": 12.6875, + "epoch": 1.1126961483594864, + "grad_norm": 9.5, "learning_rate": 6.236e-05, - "loss": 1.07, + "loss": 1.1925, "step": 1560 }, { - "epoch": 0.18901998555261257, - "grad_norm": 11.1875, + "epoch": 1.1198288159771754, + "grad_norm": 9.3125, "learning_rate": 6.276e-05, - "loss": 0.9853, + "loss": 1.1554, "step": 1570 }, { - "epoch": 0.19022393450517697, - "grad_norm": 8.375, + "epoch": 1.1269614835948645, + "grad_norm": 12.0625, "learning_rate": 6.316000000000001e-05, - "loss": 0.9579, + "loss": 1.0875, "step": 1580 }, { - "epoch": 0.1914278834577414, - "grad_norm": 20.875, + "epoch": 1.1340941512125535, + "grad_norm": 10.875, "learning_rate": 6.356000000000001e-05, - "loss": 0.9401, + "loss": 1.1895, "step": 1590 }, { - "epoch": 0.1926318324103058, - "grad_norm": 8.9375, + "epoch": 1.1412268188302426, + "grad_norm": 12.0625, "learning_rate": 6.396e-05, - "loss": 1.0279, + "loss": 1.2354, "step": 1600 }, { - "epoch": 0.1926318324103058, - "eval/acc": 30.23255729675293, + "epoch": 1.1412268188302426, + "eval/acc": 34.88372039794922, "step": 1600 }, { - "epoch": 0.1926318324103058, - "eval_loss": 2.8526248931884766, - "eval_runtime": 0.3114, - "eval_samples_per_second": 138.103, - "eval_steps_per_second": 3.212, + "epoch": 1.1412268188302426, + "eval_loss": 2.9267771244049072, + "eval_runtime": 0.2334, + "eval_samples_per_second": 184.258, + "eval_steps_per_second": 4.285, "step": 1600 }, { - "epoch": 0.19383578136287022, - "grad_norm": 7.78125, + "epoch": 1.1483594864479316, + "grad_norm": 12.375, "learning_rate": 6.436e-05, - "loss": 0.8743, + "loss": 1.2167, "step": 1610 }, { - "epoch": 0.19503973031543462, - "grad_norm": 9.8125, + "epoch": 1.1554921540656204, + "grad_norm": 10.375, "learning_rate": 6.476e-05, - "loss": 0.8702, + "loss": 1.1638, "step": 1620 }, { - "epoch": 0.19624367926799904, - "grad_norm": 12.4375, + "epoch": 1.1626248216833095, + "grad_norm": 9.8125, "learning_rate": 6.515999999999999e-05, - "loss": 1.0028, + "loss": 1.1666, "step": 1630 }, { - "epoch": 0.19744762822056344, - "grad_norm": 10.125, + "epoch": 1.1697574893009985, + "grad_norm": 12.6875, "learning_rate": 6.556e-05, - "loss": 0.9377, + "loss": 1.1961, "step": 1640 }, { - "epoch": 0.19865157717312787, - "grad_norm": 8.9375, + "epoch": 1.1768901569186876, + "grad_norm": 9.875, "learning_rate": 6.596e-05, - "loss": 1.031, + "loss": 1.2558, "step": 1650 }, { - "epoch": 0.19985552612569227, - "grad_norm": 8.5625, + "epoch": 1.1840228245363766, + "grad_norm": 10.375, "learning_rate": 6.636e-05, - "loss": 1.0162, + "loss": 1.1728, "step": 1660 }, { - "epoch": 0.2010594750782567, - "grad_norm": 33.75, + "epoch": 1.1911554921540657, + "grad_norm": 10.1875, "learning_rate": 6.676e-05, - "loss": 0.9448, + "loss": 1.2947, "step": 1670 }, { - "epoch": 0.2022634240308211, - "grad_norm": 9.625, + "epoch": 1.1982881597717547, + "grad_norm": 11.3125, "learning_rate": 6.716e-05, - "loss": 1.0077, + "loss": 1.2151, "step": 1680 }, { - "epoch": 0.2034673729833855, - "grad_norm": 8.6875, + "epoch": 1.2054208273894436, + "grad_norm": 10.5, "learning_rate": 6.756e-05, - "loss": 0.9654, + "loss": 1.0612, "step": 1690 }, { - "epoch": 0.20467132193594992, - "grad_norm": 12.625, + "epoch": 1.2125534950071326, + "grad_norm": 11.9375, "learning_rate": 6.796e-05, - "loss": 0.8899, + "loss": 1.1079, "step": 1700 }, { - "epoch": 0.20467132193594992, - "eval/acc": 32.55813980102539, + "epoch": 1.2125534950071326, + "eval/acc": 37.20930099487305, "step": 1700 }, { - "epoch": 0.20467132193594992, - "eval_loss": 2.7813549041748047, - "eval_runtime": 0.2132, - "eval_samples_per_second": 201.701, - "eval_steps_per_second": 4.691, + "epoch": 1.2125534950071326, + "eval_loss": 2.9951517581939697, + "eval_runtime": 0.2214, + "eval_samples_per_second": 194.192, + "eval_steps_per_second": 4.516, "step": 1700 }, { - "epoch": 0.20587527088851432, - "grad_norm": 12.0, + "epoch": 1.2196861626248217, + "grad_norm": 11.25, "learning_rate": 6.836e-05, - "loss": 1.0412, + "loss": 1.1541, "step": 1710 }, { - "epoch": 0.20707921984107874, - "grad_norm": 11.75, + "epoch": 1.2268188302425107, + "grad_norm": 8.125, "learning_rate": 6.876e-05, - "loss": 0.9239, + "loss": 1.0772, "step": 1720 }, { - "epoch": 0.20828316879364314, - "grad_norm": 11.375, + "epoch": 1.2339514978601998, + "grad_norm": 18.125, "learning_rate": 6.916000000000001e-05, - "loss": 0.9243, + "loss": 1.1623, "step": 1730 }, { - "epoch": 0.20948711774620757, - "grad_norm": 12.0, + "epoch": 1.2410841654778888, + "grad_norm": 10.125, "learning_rate": 6.956e-05, - "loss": 1.0204, + "loss": 1.182, "step": 1740 }, { - "epoch": 0.21069106669877197, - "grad_norm": 13.0625, + "epoch": 1.2482168330955776, + "grad_norm": 9.75, "learning_rate": 6.996e-05, - "loss": 0.8811, + "loss": 1.0796, "step": 1750 }, { - "epoch": 0.2118950156513364, - "grad_norm": 17.0, + "epoch": 1.2553495007132667, + "grad_norm": 10.5, "learning_rate": 7.036e-05, - "loss": 0.8755, + "loss": 1.2374, "step": 1760 }, { - "epoch": 0.2130989646039008, - "grad_norm": 11.25, + "epoch": 1.2624821683309557, + "grad_norm": 20.875, "learning_rate": 7.076000000000001e-05, - "loss": 0.858, + "loss": 1.2718, "step": 1770 }, { - "epoch": 0.21430291355646522, - "grad_norm": 9.625, + "epoch": 1.2696148359486448, + "grad_norm": 10.3125, "learning_rate": 7.116e-05, - "loss": 0.9076, + "loss": 1.0922, "step": 1780 }, { - "epoch": 0.21550686250902962, - "grad_norm": 10.4375, + "epoch": 1.2767475035663338, + "grad_norm": 8.6875, "learning_rate": 7.156e-05, - "loss": 0.8817, + "loss": 1.0637, "step": 1790 }, { - "epoch": 0.21671081146159402, - "grad_norm": 12.8125, + "epoch": 1.2838801711840229, + "grad_norm": 9.5, "learning_rate": 7.196000000000001e-05, - "loss": 0.9121, + "loss": 1.1661, "step": 1800 }, { - "epoch": 0.21671081146159402, - "eval/acc": 30.813953399658203, + "epoch": 1.2838801711840229, + "eval/acc": 39.53488540649414, "step": 1800 }, { - "epoch": 0.21671081146159402, - "eval_loss": 2.6508796215057373, - "eval_runtime": 0.2185, - "eval_samples_per_second": 196.798, - "eval_steps_per_second": 4.577, + "epoch": 1.2838801711840229, + "eval_loss": 2.763897180557251, + "eval_runtime": 0.2111, + "eval_samples_per_second": 203.712, + "eval_steps_per_second": 4.737, "step": 1800 }, { - "epoch": 0.21791476041415844, - "grad_norm": 16.5, + "epoch": 1.291012838801712, + "grad_norm": 14.3125, "learning_rate": 7.236e-05, - "loss": 0.9044, + "loss": 1.1139, "step": 1810 }, { - "epoch": 0.21911870936672284, - "grad_norm": 15.1875, + "epoch": 1.298145506419401, + "grad_norm": 41.5, "learning_rate": 7.276e-05, - "loss": 0.9552, + "loss": 1.0817, "step": 1820 }, { - "epoch": 0.22032265831928727, - "grad_norm": 11.375, + "epoch": 1.3052781740370898, + "grad_norm": 15.125, "learning_rate": 7.316000000000001e-05, - "loss": 0.9264, + "loss": 1.2462, "step": 1830 }, { - "epoch": 0.22152660727185167, - "grad_norm": 8.8125, + "epoch": 1.3124108416547788, + "grad_norm": 33.25, "learning_rate": 7.356000000000001e-05, - "loss": 0.8928, + "loss": 1.1143, "step": 1840 }, { - "epoch": 0.2227305562244161, - "grad_norm": 9.625, + "epoch": 1.3195435092724679, + "grad_norm": 13.625, "learning_rate": 7.396e-05, - "loss": 0.9515, + "loss": 1.1783, "step": 1850 }, { - "epoch": 0.2239345051769805, - "grad_norm": 31.0, + "epoch": 1.326676176890157, + "grad_norm": 18.375, "learning_rate": 7.436000000000001e-05, - "loss": 0.8989, + "loss": 1.2101, "step": 1860 }, { - "epoch": 0.22513845412954492, - "grad_norm": 9.5, + "epoch": 1.333808844507846, + "grad_norm": 13.875, "learning_rate": 7.476000000000001e-05, - "loss": 1.0206, + "loss": 1.1348, "step": 1870 }, { - "epoch": 0.22634240308210932, - "grad_norm": 8.625, + "epoch": 1.340941512125535, + "grad_norm": 13.9375, "learning_rate": 7.516e-05, - "loss": 0.8961, + "loss": 1.0747, "step": 1880 }, { - "epoch": 0.22754635203467374, - "grad_norm": 9.0, + "epoch": 1.3480741797432239, + "grad_norm": 29.75, "learning_rate": 7.556000000000002e-05, - "loss": 0.9421, + "loss": 1.1895, "step": 1890 }, { - "epoch": 0.22875030098723814, - "grad_norm": 12.0625, + "epoch": 1.355206847360913, + "grad_norm": 17.25, "learning_rate": 7.596000000000001e-05, - "loss": 0.9049, + "loss": 1.2512, "step": 1900 }, { - "epoch": 0.22875030098723814, - "eval/acc": 36.046512603759766, + "epoch": 1.355206847360913, + "eval/acc": 39.53488540649414, "step": 1900 }, { - "epoch": 0.22875030098723814, - "eval_loss": 2.636018753051758, - "eval_runtime": 0.2084, - "eval_samples_per_second": 206.343, - "eval_steps_per_second": 4.799, + "epoch": 1.355206847360913, + "eval_loss": 2.9442026615142822, + "eval_runtime": 0.2199, + "eval_samples_per_second": 195.51, + "eval_steps_per_second": 4.547, "step": 1900 }, { - "epoch": 0.22995424993980254, - "grad_norm": 8.0625, + "epoch": 1.362339514978602, + "grad_norm": 21.125, "learning_rate": 7.636e-05, - "loss": 0.8983, + "loss": 1.1306, "step": 1910 }, { - "epoch": 0.23115819889236697, - "grad_norm": 11.875, + "epoch": 1.369472182596291, + "grad_norm": 9.0625, "learning_rate": 7.676e-05, - "loss": 0.9293, + "loss": 1.1139, "step": 1920 }, { - "epoch": 0.23236214784493137, - "grad_norm": 11.75, + "epoch": 1.37660485021398, + "grad_norm": 30.25, "learning_rate": 7.716e-05, - "loss": 0.8602, + "loss": 1.1595, "step": 1930 }, { - "epoch": 0.2335660967974958, - "grad_norm": 11.5625, + "epoch": 1.383737517831669, + "grad_norm": 13.6875, "learning_rate": 7.756e-05, - "loss": 0.8078, + "loss": 1.2437, "step": 1940 }, { - "epoch": 0.2347700457500602, - "grad_norm": 9.125, + "epoch": 1.3908701854493581, + "grad_norm": 12.3125, "learning_rate": 7.796e-05, - "loss": 0.8773, + "loss": 1.1005, "step": 1950 }, { - "epoch": 0.23597399470262462, - "grad_norm": 10.6875, + "epoch": 1.3980028530670472, + "grad_norm": 9.8125, "learning_rate": 7.836e-05, - "loss": 0.8464, + "loss": 1.0748, "step": 1960 }, { - "epoch": 0.23717794365518902, - "grad_norm": 18.25, + "epoch": 1.405135520684736, + "grad_norm": 9.125, "learning_rate": 7.876e-05, - "loss": 0.8779, + "loss": 1.1576, "step": 1970 }, { - "epoch": 0.23838189260775344, - "grad_norm": 10.875, + "epoch": 1.412268188302425, + "grad_norm": 11.375, "learning_rate": 7.916e-05, - "loss": 0.9351, + "loss": 1.0982, "step": 1980 }, { - "epoch": 0.23958584156031784, - "grad_norm": 11.0, + "epoch": 1.4194008559201141, + "grad_norm": 10.375, "learning_rate": 7.956e-05, - "loss": 0.8581, + "loss": 1.132, "step": 1990 }, { - "epoch": 0.24078979051288224, - "grad_norm": 8.875, + "epoch": 1.4265335235378032, + "grad_norm": 16.375, "learning_rate": 7.996e-05, - "loss": 0.9799, + "loss": 1.121, "step": 2000 }, { - "epoch": 0.24078979051288224, - "eval/acc": 36.046512603759766, + "epoch": 1.4265335235378032, + "eval/acc": 39.53488540649414, "step": 2000 }, { - "epoch": 0.24078979051288224, - "eval_loss": 2.716654062271118, - "eval_runtime": 0.21, - "eval_samples_per_second": 204.721, - "eval_steps_per_second": 4.761, + "epoch": 1.4265335235378032, + "eval_loss": 2.900298595428467, + "eval_runtime": 0.2112, + "eval_samples_per_second": 203.622, + "eval_steps_per_second": 4.735, "step": 2000 }, { - "epoch": 0.24199373946544667, - "grad_norm": 11.0625, + "epoch": 1.4336661911554922, + "grad_norm": 9.125, "learning_rate": 8.036e-05, - "loss": 0.8678, + "loss": 1.2079, "step": 2010 }, { - "epoch": 0.24319768841801107, + "epoch": 1.440798858773181, "grad_norm": 12.125, "learning_rate": 8.076e-05, - "loss": 0.8832, + "loss": 1.1098, "step": 2020 }, { - "epoch": 0.2444016373705755, - "grad_norm": 8.25, + "epoch": 1.44793152639087, + "grad_norm": 8.8125, "learning_rate": 8.116e-05, - "loss": 0.8689, + "loss": 0.9849, "step": 2030 }, { - "epoch": 0.2456055863231399, - "grad_norm": 6.53125, + "epoch": 1.4550641940085591, + "grad_norm": 9.0, "learning_rate": 8.156e-05, - "loss": 0.8829, + "loss": 1.0905, "step": 2040 }, { - "epoch": 0.24680953527570432, - "grad_norm": 9.5625, + "epoch": 1.4621968616262482, + "grad_norm": 15.4375, "learning_rate": 8.196000000000001e-05, - "loss": 0.9181, + "loss": 1.2211, "step": 2050 }, { - "epoch": 0.24801348422826872, - "grad_norm": 22.875, + "epoch": 1.4693295292439372, + "grad_norm": 9.4375, "learning_rate": 8.236e-05, - "loss": 0.8011, + "loss": 1.0968, "step": 2060 }, { - "epoch": 0.24921743318083314, - "grad_norm": 14.4375, + "epoch": 1.4764621968616263, + "grad_norm": 9.0, "learning_rate": 8.276e-05, - "loss": 0.9163, + "loss": 1.0973, "step": 2070 }, { - "epoch": 0.25042138213339754, - "grad_norm": 10.625, + "epoch": 1.4835948644793153, + "grad_norm": 8.0625, "learning_rate": 8.316000000000001e-05, - "loss": 0.7869, + "loss": 1.1012, "step": 2080 }, { - "epoch": 0.25162533108596197, - "grad_norm": 11.0, + "epoch": 1.4907275320970044, + "grad_norm": 31.0, "learning_rate": 8.356e-05, - "loss": 0.8779, + "loss": 1.0437, "step": 2090 }, { - "epoch": 0.2528292800385264, - "grad_norm": 12.625, + "epoch": 1.4978601997146934, + "grad_norm": 10.8125, "learning_rate": 8.396e-05, - "loss": 0.889, + "loss": 1.0934, "step": 2100 }, { - "epoch": 0.2528292800385264, - "eval/acc": 37.20930099487305, + "epoch": 1.4978601997146934, + "eval/acc": 41.86046600341797, "step": 2100 }, { - "epoch": 0.2528292800385264, - "eval_loss": 2.626293182373047, - "eval_runtime": 0.2735, - "eval_samples_per_second": 157.235, - "eval_steps_per_second": 3.657, + "epoch": 1.4978601997146934, + "eval_loss": 2.9042038917541504, + "eval_runtime": 0.2146, + "eval_samples_per_second": 200.363, + "eval_steps_per_second": 4.66, "step": 2100 }, { - "epoch": 0.25403322899109076, - "grad_norm": 8.3125, + "epoch": 1.5049928673323825, + "grad_norm": 15.3125, "learning_rate": 8.436000000000001e-05, - "loss": 0.8363, + "loss": 1.0862, "step": 2110 }, { - "epoch": 0.2552371779436552, - "grad_norm": 8.625, + "epoch": 1.5121255349500713, + "grad_norm": 10.1875, "learning_rate": 8.476000000000001e-05, - "loss": 0.8762, + "loss": 1.0786, "step": 2120 }, { - "epoch": 0.2564411268962196, - "grad_norm": 7.4375, + "epoch": 1.5192582025677603, + "grad_norm": 8.25, "learning_rate": 8.516e-05, - "loss": 0.7925, + "loss": 1.1496, "step": 2130 }, { - "epoch": 0.257645075848784, - "grad_norm": 9.1875, + "epoch": 1.5263908701854494, + "grad_norm": 12.6875, "learning_rate": 8.556e-05, - "loss": 0.9575, + "loss": 1.1132, "step": 2140 }, { - "epoch": 0.2588490248013484, - "grad_norm": 9.8125, + "epoch": 1.5335235378031382, + "grad_norm": 21.375, "learning_rate": 8.596000000000001e-05, - "loss": 0.7551, + "loss": 1.1043, "step": 2150 }, { - "epoch": 0.26005297375391284, - "grad_norm": 7.15625, + "epoch": 1.5406562054208273, + "grad_norm": 8.5625, "learning_rate": 8.636e-05, - "loss": 0.808, + "loss": 1.2549, "step": 2160 }, { - "epoch": 0.26125692270647727, - "grad_norm": 8.3125, + "epoch": 1.5477888730385163, + "grad_norm": 8.6875, "learning_rate": 8.676e-05, - "loss": 0.9449, + "loss": 1.115, "step": 2170 }, { - "epoch": 0.26246087165904164, - "grad_norm": 11.5, + "epoch": 1.5549215406562054, + "grad_norm": 8.375, "learning_rate": 8.716000000000001e-05, - "loss": 0.8712, + "loss": 1.1963, "step": 2180 }, { - "epoch": 0.26366482061160607, - "grad_norm": 8.0, + "epoch": 1.5620542082738944, + "grad_norm": 8.3125, "learning_rate": 8.756000000000001e-05, - "loss": 0.9389, + "loss": 1.1697, "step": 2190 }, { - "epoch": 0.2648687695641705, - "grad_norm": 13.5, + "epoch": 1.5691868758915835, + "grad_norm": 7.40625, "learning_rate": 8.796e-05, - "loss": 0.7875, + "loss": 0.9716, "step": 2200 }, { - "epoch": 0.2648687695641705, - "eval/acc": 35.46511459350586, + "epoch": 1.5691868758915835, + "eval/acc": 39.53488540649414, "step": 2200 }, { - "epoch": 0.2648687695641705, - "eval_loss": 2.5862526893615723, - "eval_runtime": 0.2151, - "eval_samples_per_second": 199.927, - "eval_steps_per_second": 4.649, + "epoch": 1.5691868758915835, + "eval_loss": 3.021289587020874, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.958, + "eval_steps_per_second": 4.72, "step": 2200 }, { - "epoch": 0.26607271851673486, - "grad_norm": 11.5625, + "epoch": 1.5763195435092725, + "grad_norm": 10.0, "learning_rate": 8.836000000000001e-05, - "loss": 0.9947, + "loss": 1.0254, "step": 2210 }, { - "epoch": 0.2672766674692993, - "grad_norm": 8.25, + "epoch": 1.5834522111269616, + "grad_norm": 12.625, "learning_rate": 8.876e-05, - "loss": 0.717, + "loss": 1.1672, "step": 2220 }, { - "epoch": 0.2684806164218637, - "grad_norm": 26.25, + "epoch": 1.5905848787446506, + "grad_norm": 11.5, "learning_rate": 8.916e-05, - "loss": 0.8688, + "loss": 1.0656, "step": 2230 }, { - "epoch": 0.26968456537442814, - "grad_norm": 11.5, + "epoch": 1.5977175463623396, + "grad_norm": 8.8125, "learning_rate": 8.956e-05, - "loss": 0.9134, + "loss": 1.035, "step": 2240 }, { - "epoch": 0.2708885143269925, - "grad_norm": 6.875, + "epoch": 1.6048502139800287, + "grad_norm": 9.25, "learning_rate": 8.996e-05, - "loss": 0.8592, + "loss": 1.0972, "step": 2250 }, { - "epoch": 0.27209246327955694, - "grad_norm": 7.21875, + "epoch": 1.6119828815977175, + "grad_norm": 7.71875, "learning_rate": 9.036e-05, - "loss": 0.6548, + "loss": 1.0148, "step": 2260 }, { - "epoch": 0.27329641223212137, - "grad_norm": 12.25, + "epoch": 1.6191155492154066, + "grad_norm": 13.5, "learning_rate": 9.076e-05, - "loss": 0.8613, + "loss": 1.1202, "step": 2270 }, { - "epoch": 0.2745003611846858, - "grad_norm": 8.875, + "epoch": 1.6262482168330956, + "grad_norm": 9.125, "learning_rate": 9.116e-05, - "loss": 0.7455, + "loss": 1.1134, "step": 2280 }, { - "epoch": 0.27570431013725016, - "grad_norm": 12.5625, + "epoch": 1.6333808844507844, + "grad_norm": 15.25, "learning_rate": 9.156e-05, - "loss": 0.8458, + "loss": 1.0373, "step": 2290 }, { - "epoch": 0.2769082590898146, - "grad_norm": 8.8125, + "epoch": 1.6405135520684735, + "grad_norm": 9.125, "learning_rate": 9.196000000000001e-05, - "loss": 0.8003, + "loss": 1.0654, "step": 2300 }, { - "epoch": 0.2769082590898146, - "eval/acc": 32.55813980102539, + "epoch": 1.6405135520684735, + "eval/acc": 37.20930099487305, "step": 2300 }, { - "epoch": 0.2769082590898146, - "eval_loss": 2.6594340801239014, - "eval_runtime": 0.2129, - "eval_samples_per_second": 201.965, - "eval_steps_per_second": 4.697, + "epoch": 1.6405135520684735, + "eval_loss": 2.929560422897339, + "eval_runtime": 0.2167, + "eval_samples_per_second": 198.467, + "eval_steps_per_second": 4.616, "step": 2300 }, { - "epoch": 0.278112208042379, - "grad_norm": 10.6875, + "epoch": 1.6476462196861625, + "grad_norm": 8.25, "learning_rate": 9.236e-05, - "loss": 0.812, + "loss": 1.0218, "step": 2310 }, { - "epoch": 0.2793161569949434, - "grad_norm": 12.1875, + "epoch": 1.6547788873038516, + "grad_norm": 9.5625, "learning_rate": 9.276e-05, - "loss": 0.781, + "loss": 1.106, "step": 2320 }, { - "epoch": 0.2805201059475078, - "grad_norm": 8.125, + "epoch": 1.6619115549215406, + "grad_norm": 8.25, "learning_rate": 9.316000000000001e-05, - "loss": 0.9682, + "loss": 1.0558, "step": 2330 }, { - "epoch": 0.28172405490007224, - "grad_norm": 8.8125, + "epoch": 1.6690442225392297, + "grad_norm": 8.5625, "learning_rate": 9.356e-05, - "loss": 0.7531, + "loss": 0.9931, "step": 2340 }, { - "epoch": 0.28292800385263667, - "grad_norm": 7.375, + "epoch": 1.6761768901569187, + "grad_norm": 11.5625, "learning_rate": 9.396e-05, - "loss": 0.7235, + "loss": 1.0683, "step": 2350 }, { - "epoch": 0.28413195280520104, - "grad_norm": 7.8125, + "epoch": 1.6833095577746078, + "grad_norm": 10.0625, "learning_rate": 9.436e-05, - "loss": 0.9204, + "loss": 1.0631, "step": 2360 }, { - "epoch": 0.28533590175776546, - "grad_norm": 6.65625, + "epoch": 1.6904422253922968, + "grad_norm": 9.5625, "learning_rate": 9.476000000000001e-05, - "loss": 0.7636, + "loss": 1.049, "step": 2370 }, { - "epoch": 0.2865398507103299, - "grad_norm": 9.625, + "epoch": 1.6975748930099859, + "grad_norm": 12.8125, "learning_rate": 9.516e-05, - "loss": 0.855, + "loss": 1.0259, "step": 2380 }, { - "epoch": 0.2877437996628943, - "grad_norm": 9.6875, + "epoch": 1.7047075606276747, + "grad_norm": 9.0625, "learning_rate": 9.556e-05, - "loss": 0.8643, + "loss": 1.0085, "step": 2390 }, { - "epoch": 0.2889477486154587, - "grad_norm": 7.1875, + "epoch": 1.7118402282453637, + "grad_norm": 131.0, "learning_rate": 9.596000000000001e-05, - "loss": 0.8258, + "loss": 0.944, "step": 2400 }, { - "epoch": 0.2889477486154587, - "eval/acc": 36.627906799316406, + "epoch": 1.7118402282453637, + "eval/acc": 37.20930099487305, "step": 2400 }, { - "epoch": 0.2889477486154587, - "eval_loss": 2.7174084186553955, - "eval_runtime": 0.2111, - "eval_samples_per_second": 203.672, - "eval_steps_per_second": 4.737, + "epoch": 1.7118402282453637, + "eval_loss": 3.0231707096099854, + "eval_runtime": 0.2075, + "eval_samples_per_second": 207.206, + "eval_steps_per_second": 4.819, "step": 2400 }, { - "epoch": 0.2901516975680231, - "grad_norm": 7.65625, + "epoch": 1.7189728958630528, + "grad_norm": 8.375, "learning_rate": 9.636e-05, - "loss": 0.8752, + "loss": 1.0069, "step": 2410 }, { - "epoch": 0.29135564652058754, - "grad_norm": 8.75, + "epoch": 1.7261055634807418, + "grad_norm": 8.3125, "learning_rate": 9.676e-05, - "loss": 0.8082, + "loss": 1.0648, "step": 2420 }, { - "epoch": 0.2925595954731519, - "grad_norm": 10.4375, + "epoch": 1.7332382310984307, + "grad_norm": 11.0625, "learning_rate": 9.716000000000001e-05, - "loss": 0.7538, + "loss": 1.0594, "step": 2430 }, { - "epoch": 0.29376354442571634, - "grad_norm": 6.4375, + "epoch": 1.7403708987161197, + "grad_norm": 8.75, "learning_rate": 9.756000000000001e-05, - "loss": 0.7766, + "loss": 1.2082, "step": 2440 }, { - "epoch": 0.29496749337828077, - "grad_norm": 7.96875, + "epoch": 1.7475035663338088, + "grad_norm": 9.875, "learning_rate": 9.796e-05, - "loss": 0.844, + "loss": 1.0225, "step": 2450 }, { - "epoch": 0.2961714423308452, - "grad_norm": 7.75, + "epoch": 1.7546362339514978, + "grad_norm": 9.5625, "learning_rate": 9.836000000000001e-05, - "loss": 0.7127, + "loss": 0.9975, "step": 2460 }, { - "epoch": 0.29737539128340956, - "grad_norm": 11.5, + "epoch": 1.7617689015691869, + "grad_norm": 21.0, "learning_rate": 9.876000000000001e-05, - "loss": 0.8363, + "loss": 0.9533, "step": 2470 }, { - "epoch": 0.298579340235974, - "grad_norm": 6.4375, + "epoch": 1.768901569186876, + "grad_norm": 7.65625, "learning_rate": 9.916e-05, - "loss": 0.7429, + "loss": 0.9619, "step": 2480 }, { - "epoch": 0.2997832891885384, - "grad_norm": 11.5, + "epoch": 1.776034236804565, + "grad_norm": 13.625, "learning_rate": 9.956e-05, - "loss": 0.736, + "loss": 0.9425, "step": 2490 }, { - "epoch": 0.30098723814110284, - "grad_norm": 9.25, + "epoch": 1.783166904422254, + "grad_norm": 12.375, "learning_rate": 9.996000000000001e-05, - "loss": 0.8365, + "loss": 0.9893, "step": 2500 }, { - "epoch": 0.30098723814110284, + "epoch": 1.783166904422254, "eval/acc": 39.53488540649414, "step": 2500 }, { - "epoch": 0.30098723814110284, - "eval_loss": 2.713433027267456, - "eval_runtime": 0.2088, - "eval_samples_per_second": 205.919, - "eval_steps_per_second": 4.789, + "epoch": 1.783166904422254, + "eval_loss": 2.8728344440460205, + "eval_runtime": 0.2215, + "eval_samples_per_second": 194.132, + "eval_steps_per_second": 4.515, "step": 2500 }, { - "epoch": 0.3021911870936672, - "grad_norm": 7.03125, + "epoch": 1.790299572039943, + "grad_norm": 10.0, "learning_rate": 9.996000000000001e-05, - "loss": 0.7664, + "loss": 1.0137, "step": 2510 }, { - "epoch": 0.30339513604623164, - "grad_norm": 7.75, + "epoch": 1.797432239657632, + "grad_norm": 10.125, "learning_rate": 9.991555555555556e-05, - "loss": 0.9128, + "loss": 1.059, "step": 2520 }, { - "epoch": 0.30459908499879607, - "grad_norm": 9.0, + "epoch": 1.804564907275321, + "grad_norm": 32.0, "learning_rate": 9.987111111111111e-05, - "loss": 0.8045, + "loss": 1.0498, "step": 2530 }, { - "epoch": 0.30580303395136044, - "grad_norm": 8.9375, + "epoch": 1.81169757489301, + "grad_norm": 10.125, "learning_rate": 9.982666666666667e-05, - "loss": 0.8292, + "loss": 1.1431, "step": 2540 }, { - "epoch": 0.30700698290392486, - "grad_norm": 7.40625, + "epoch": 1.818830242510699, + "grad_norm": 7.90625, "learning_rate": 9.978222222222223e-05, - "loss": 0.7557, + "loss": 1.0715, "step": 2550 }, { - "epoch": 0.3082109318564893, - "grad_norm": 7.625, + "epoch": 1.825962910128388, + "grad_norm": 10.9375, "learning_rate": 9.973777777777778e-05, - "loss": 0.683, + "loss": 1.0446, "step": 2560 }, { - "epoch": 0.3094148808090537, - "grad_norm": 8.1875, + "epoch": 1.833095577746077, + "grad_norm": 13.0, "learning_rate": 9.969333333333334e-05, - "loss": 0.8052, + "loss": 1.0291, "step": 2570 }, { - "epoch": 0.3106188297616181, - "grad_norm": 8.4375, + "epoch": 1.840228245363766, + "grad_norm": 9.75, "learning_rate": 9.964888888888889e-05, - "loss": 0.7819, + "loss": 0.9713, "step": 2580 }, { - "epoch": 0.3118227787141825, - "grad_norm": 10.8125, + "epoch": 1.847360912981455, + "grad_norm": 10.5625, "learning_rate": 9.960444444444444e-05, - "loss": 0.8452, + "loss": 1.2157, "step": 2590 }, { - "epoch": 0.31302672766674694, - "grad_norm": 6.21875, + "epoch": 1.854493580599144, + "grad_norm": 9.3125, "learning_rate": 9.956e-05, - "loss": 0.7478, + "loss": 1.0455, "step": 2600 }, { - "epoch": 0.31302672766674694, - "eval/acc": 34.88372039794922, + "epoch": 1.854493580599144, + "eval/acc": 37.20930099487305, "step": 2600 }, { - "epoch": 0.31302672766674694, - "eval_loss": 2.6625020503997803, - "eval_runtime": 0.2061, - "eval_samples_per_second": 208.644, - "eval_steps_per_second": 4.852, + "epoch": 1.854493580599144, + "eval_loss": 2.951470375061035, + "eval_runtime": 0.2899, + "eval_samples_per_second": 148.338, + "eval_steps_per_second": 3.45, "step": 2600 }, { - "epoch": 0.31423067661931137, - "grad_norm": 7.375, + "epoch": 1.861626248216833, + "grad_norm": 10.5, "learning_rate": 9.951555555555556e-05, - "loss": 0.7623, + "loss": 1.0604, "step": 2610 }, { - "epoch": 0.31543462557187574, - "grad_norm": 9.0, + "epoch": 1.8687589158345221, + "grad_norm": 9.375, "learning_rate": 9.947111111111111e-05, - "loss": 0.8223, + "loss": 0.8715, "step": 2620 }, { - "epoch": 0.31663857452444016, - "grad_norm": 6.75, + "epoch": 1.8758915834522112, + "grad_norm": 10.4375, "learning_rate": 9.942666666666667e-05, - "loss": 0.7797, + "loss": 1.0034, "step": 2630 }, { - "epoch": 0.3178425234770046, - "grad_norm": 9.125, + "epoch": 1.8830242510699002, + "grad_norm": 8.0625, "learning_rate": 9.938222222222224e-05, - "loss": 0.6746, + "loss": 1.0557, "step": 2640 }, { - "epoch": 0.31904647242956896, - "grad_norm": 8.5, + "epoch": 1.8901569186875893, + "grad_norm": 7.21875, "learning_rate": 9.933777777777779e-05, - "loss": 0.8434, + "loss": 0.974, "step": 2650 }, { - "epoch": 0.3202504213821334, - "grad_norm": 10.3125, + "epoch": 1.8972895863052783, + "grad_norm": 10.875, "learning_rate": 9.929333333333333e-05, - "loss": 0.8625, + "loss": 1.1366, "step": 2660 }, { - "epoch": 0.3214543703346978, - "grad_norm": 8.125, + "epoch": 1.9044222539229672, + "grad_norm": 28.75, "learning_rate": 9.92488888888889e-05, - "loss": 0.8003, + "loss": 1.0135, "step": 2670 }, { - "epoch": 0.32265831928726224, - "grad_norm": 8.5625, + "epoch": 1.9115549215406562, + "grad_norm": 10.5625, "learning_rate": 9.920444444444444e-05, - "loss": 0.8145, + "loss": 1.0263, "step": 2680 }, { - "epoch": 0.3238622682398266, - "grad_norm": 8.0, + "epoch": 1.9186875891583453, + "grad_norm": 6.65625, "learning_rate": 9.916e-05, - "loss": 0.6519, + "loss": 0.9952, "step": 2690 }, { - "epoch": 0.32506621719239104, - "grad_norm": 8.5625, + "epoch": 1.925820256776034, + "grad_norm": 8.8125, "learning_rate": 9.911555555555557e-05, - "loss": 0.7627, + "loss": 1.0438, "step": 2700 }, { - "epoch": 0.32506621719239104, - "eval/acc": 38.953487396240234, + "epoch": 1.925820256776034, + "eval/acc": 39.53488540649414, "step": 2700 }, { - "epoch": 0.32506621719239104, - "eval_loss": 2.629239082336426, - "eval_runtime": 0.2162, - "eval_samples_per_second": 198.931, - "eval_steps_per_second": 4.626, + "epoch": 1.925820256776034, + "eval_loss": 2.8668925762176514, + "eval_runtime": 0.2154, + "eval_samples_per_second": 199.649, + "eval_steps_per_second": 4.643, "step": 2700 }, { - "epoch": 0.32627016614495546, - "grad_norm": 7.625, + "epoch": 1.9329529243937231, + "grad_norm": 7.1875, "learning_rate": 9.907111111111112e-05, - "loss": 0.7265, + "loss": 0.9522, "step": 2710 }, { - "epoch": 0.3274741150975199, - "grad_norm": 7.15625, + "epoch": 1.9400855920114122, + "grad_norm": 8.6875, "learning_rate": 9.902666666666666e-05, - "loss": 0.7468, + "loss": 0.9729, "step": 2720 }, { - "epoch": 0.32867806405008426, - "grad_norm": 8.5, + "epoch": 1.9472182596291012, + "grad_norm": 10.0625, "learning_rate": 9.898222222222223e-05, - "loss": 0.7816, + "loss": 1.0528, "step": 2730 }, { - "epoch": 0.3298820130026487, - "grad_norm": 6.8125, + "epoch": 1.9543509272467903, + "grad_norm": 8.8125, "learning_rate": 9.893777777777779e-05, - "loss": 0.7828, + "loss": 1.1212, "step": 2740 }, { - "epoch": 0.3310859619552131, - "grad_norm": 8.5625, + "epoch": 1.9614835948644793, + "grad_norm": 9.1875, "learning_rate": 9.889333333333334e-05, - "loss": 0.8273, + "loss": 0.9866, "step": 2750 }, { - "epoch": 0.3322899109077775, - "grad_norm": 7.28125, + "epoch": 1.9686162624821684, + "grad_norm": 8.25, "learning_rate": 9.884888888888889e-05, - "loss": 0.6265, + "loss": 0.8616, "step": 2760 }, { - "epoch": 0.3334938598603419, - "grad_norm": 7.78125, + "epoch": 1.9757489300998574, + "grad_norm": 8.5625, "learning_rate": 9.880444444444445e-05, - "loss": 0.8716, + "loss": 0.9972, "step": 2770 }, { - "epoch": 0.33469780881290634, - "grad_norm": 6.0, + "epoch": 1.9828815977175465, + "grad_norm": 10.0625, "learning_rate": 9.876000000000001e-05, - "loss": 0.7587, + "loss": 0.9781, "step": 2780 }, { - "epoch": 0.33590175776547077, - "grad_norm": 11.8125, + "epoch": 1.9900142653352355, + "grad_norm": 10.75, "learning_rate": 9.871555555555556e-05, - "loss": 0.836, + "loss": 1.0579, "step": 2790 }, { - "epoch": 0.33710570671803514, - "grad_norm": 8.3125, + "epoch": 1.9971469329529246, + "grad_norm": 8.25, "learning_rate": 9.867111111111112e-05, - "loss": 0.7196, + "loss": 1.0323, "step": 2800 }, { - "epoch": 0.33710570671803514, - "eval/acc": 34.88372039794922, + "epoch": 1.9971469329529246, + "eval/acc": 37.20930099487305, "step": 2800 }, { - "epoch": 0.33710570671803514, - "eval_loss": 2.5979089736938477, - "eval_runtime": 0.212, - "eval_samples_per_second": 202.843, - "eval_steps_per_second": 4.717, + "epoch": 1.9971469329529246, + "eval_loss": 2.9081883430480957, + "eval_runtime": 0.2058, + "eval_samples_per_second": 208.905, + "eval_steps_per_second": 4.858, "step": 2800 }, { - "epoch": 0.33830965567059956, - "grad_norm": 8.125, + "epoch": 2.0042796005706136, + "grad_norm": 10.25, "learning_rate": 9.862666666666667e-05, - "loss": 0.7128, + "loss": 1.0597, "step": 2810 }, { - "epoch": 0.339513604623164, - "grad_norm": 7.0, + "epoch": 2.011412268188302, + "grad_norm": 7.0625, "learning_rate": 9.858222222222223e-05, - "loss": 0.8709, + "loss": 0.9582, "step": 2820 }, { - "epoch": 0.3407175535757284, - "grad_norm": 10.875, + "epoch": 2.0185449358059913, + "grad_norm": 7.0625, "learning_rate": 9.853777777777778e-05, - "loss": 0.6885, + "loss": 1.0058, "step": 2830 }, { - "epoch": 0.3419215025282928, - "grad_norm": 6.625, + "epoch": 2.0256776034236803, + "grad_norm": 7.09375, "learning_rate": 9.849333333333334e-05, - "loss": 0.8262, + "loss": 1.009, "step": 2840 }, { - "epoch": 0.3431254514808572, - "grad_norm": 9.0625, + "epoch": 2.0328102710413694, + "grad_norm": 8.9375, "learning_rate": 9.844888888888889e-05, - "loss": 0.6365, + "loss": 0.93, "step": 2850 }, { - "epoch": 0.34432940043342164, - "grad_norm": 7.96875, + "epoch": 2.0399429386590584, + "grad_norm": 8.1875, "learning_rate": 9.840444444444445e-05, - "loss": 0.8177, + "loss": 1.0953, "step": 2860 }, { - "epoch": 0.345533349385986, - "grad_norm": 6.71875, + "epoch": 2.0470756062767475, + "grad_norm": 7.78125, "learning_rate": 9.836000000000001e-05, - "loss": 0.7043, + "loss": 1.0437, "step": 2870 }, { - "epoch": 0.34673729833855044, - "grad_norm": 10.4375, + "epoch": 2.0542082738944365, + "grad_norm": 8.75, "learning_rate": 9.831555555555556e-05, - "loss": 0.7503, + "loss": 0.9873, "step": 2880 }, { - "epoch": 0.34794124729111486, - "grad_norm": 7.375, + "epoch": 2.0613409415121255, + "grad_norm": 8.375, "learning_rate": 9.827111111111111e-05, - "loss": 0.7532, + "loss": 0.9414, "step": 2890 }, { - "epoch": 0.3491451962436793, - "grad_norm": 7.65625, + "epoch": 2.0684736091298146, + "grad_norm": 9.0, "learning_rate": 9.822666666666667e-05, - "loss": 0.6942, + "loss": 0.9625, "step": 2900 }, { - "epoch": 0.3491451962436793, - "eval/acc": 37.79069900512695, + "epoch": 2.0684736091298146, + "eval/acc": 51.16279220581055, "step": 2900 }, { - "epoch": 0.3491451962436793, - "eval_loss": 2.698911190032959, - "eval_runtime": 1.2554, - "eval_samples_per_second": 34.253, - "eval_steps_per_second": 0.797, + "epoch": 2.0684736091298146, + "eval_loss": 1.884637713432312, + "eval_runtime": 1.4017, + "eval_samples_per_second": 30.678, + "eval_steps_per_second": 0.713, "step": 2900 }, { - "epoch": 0.35034914519624366, - "grad_norm": 7.1875, + "epoch": 2.0756062767475036, + "grad_norm": 9.5625, "learning_rate": 9.818222222222223e-05, - "loss": 0.7651, + "loss": 1.0246, "step": 2910 }, { - "epoch": 0.3515530941488081, - "grad_norm": 6.0, + "epoch": 2.0827389443651927, + "grad_norm": 8.125, "learning_rate": 9.813777777777778e-05, - "loss": 0.7786, + "loss": 0.9646, "step": 2920 }, { - "epoch": 0.3527570431013725, - "grad_norm": 9.375, + "epoch": 2.0898716119828817, + "grad_norm": 8.3125, "learning_rate": 9.809333333333333e-05, - "loss": 0.8285, + "loss": 1.0022, "step": 2930 }, { - "epoch": 0.35396099205393694, - "grad_norm": 6.4375, + "epoch": 2.097004279600571, + "grad_norm": 8.625, "learning_rate": 9.80488888888889e-05, - "loss": 0.7339, + "loss": 0.9834, "step": 2940 }, { - "epoch": 0.3551649410065013, - "grad_norm": 8.8125, + "epoch": 2.10413694721826, + "grad_norm": 45.25, "learning_rate": 9.800444444444446e-05, - "loss": 0.6948, + "loss": 0.9159, "step": 2950 }, { - "epoch": 0.35636888995906574, - "grad_norm": 11.4375, + "epoch": 2.1112696148359484, + "grad_norm": 9.375, "learning_rate": 9.796e-05, - "loss": 0.8455, + "loss": 1.0598, "step": 2960 }, { - "epoch": 0.35757283891163016, - "grad_norm": 8.5625, + "epoch": 2.1184022824536375, + "grad_norm": 6.90625, "learning_rate": 9.791555555555557e-05, - "loss": 0.791, + "loss": 0.8848, "step": 2970 }, { - "epoch": 0.35877678786419454, - "grad_norm": 7.84375, + "epoch": 2.1255349500713265, + "grad_norm": 7.5625, "learning_rate": 9.787111111111111e-05, - "loss": 0.8574, + "loss": 0.942, "step": 2980 }, { - "epoch": 0.35998073681675896, - "grad_norm": 9.4375, + "epoch": 2.1326676176890156, + "grad_norm": 8.6875, "learning_rate": 9.782666666666666e-05, - "loss": 0.7923, + "loss": 0.9583, "step": 2990 }, { - "epoch": 0.3611846857693234, - "grad_norm": 8.0625, + "epoch": 2.1398002853067046, + "grad_norm": 9.0, "learning_rate": 9.778222222222222e-05, - "loss": 0.863, + "loss": 0.9836, "step": 3000 }, { - "epoch": 0.3611846857693234, - "eval/acc": 41.86046600341797, + "epoch": 2.1398002853067046, + "eval/acc": 48.83720779418945, "step": 3000 }, { - "epoch": 0.3611846857693234, - "eval_loss": 2.5240559577941895, - "eval_runtime": 0.2105, - "eval_samples_per_second": 204.269, - "eval_steps_per_second": 4.75, + "epoch": 2.1398002853067046, + "eval_loss": 1.9625970125198364, + "eval_runtime": 0.2298, + "eval_samples_per_second": 187.09, + "eval_steps_per_second": 4.351, "step": 3000 }, { - "epoch": 0.3623886347218878, - "grad_norm": 6.71875, + "epoch": 2.1469329529243937, + "grad_norm": 8.4375, "learning_rate": 9.773777777777779e-05, - "loss": 0.7726, + "loss": 1.028, "step": 3010 }, { - "epoch": 0.3635925836744522, - "grad_norm": 8.125, + "epoch": 2.1540656205420827, + "grad_norm": 10.4375, "learning_rate": 9.769333333333334e-05, - "loss": 0.8234, + "loss": 0.9209, "step": 3020 }, { - "epoch": 0.3647965326270166, - "grad_norm": 7.90625, + "epoch": 2.1611982881597718, + "grad_norm": 9.3125, "learning_rate": 9.764888888888888e-05, - "loss": 0.8125, + "loss": 0.9999, "step": 3030 }, { - "epoch": 0.36600048157958104, - "grad_norm": 5.875, + "epoch": 2.168330955777461, + "grad_norm": 8.375, "learning_rate": 9.760444444444446e-05, - "loss": 0.739, + "loss": 0.9576, "step": 3040 }, { - "epoch": 0.3672044305321454, - "grad_norm": 32.75, + "epoch": 2.17546362339515, + "grad_norm": 7.4375, "learning_rate": 9.756000000000001e-05, - "loss": 0.8773, + "loss": 0.8832, "step": 3050 }, { - "epoch": 0.36840837948470984, - "grad_norm": 8.625, + "epoch": 2.182596291012839, + "grad_norm": 8.125, "learning_rate": 9.751555555555556e-05, - "loss": 0.6411, + "loss": 0.933, "step": 3060 }, { - "epoch": 0.36961232843727426, - "grad_norm": 10.0625, + "epoch": 2.189728958630528, + "grad_norm": 8.9375, "learning_rate": 9.747111111111112e-05, - "loss": 0.7757, + "loss": 0.9962, "step": 3070 }, { - "epoch": 0.3708162773898387, - "grad_norm": 7.78125, + "epoch": 2.196861626248217, + "grad_norm": 7.1875, "learning_rate": 9.742666666666667e-05, - "loss": 0.8144, + "loss": 1.003, "step": 3080 }, { - "epoch": 0.37202022634240306, - "grad_norm": 8.25, + "epoch": 2.2039942938659056, + "grad_norm": 8.1875, "learning_rate": 9.738222222222223e-05, - "loss": 0.7915, + "loss": 0.9441, "step": 3090 }, { - "epoch": 0.3732241752949675, - "grad_norm": 9.5, + "epoch": 2.2111269614835947, + "grad_norm": 7.21875, "learning_rate": 9.733777777777778e-05, - "loss": 0.7808, + "loss": 1.0335, "step": 3100 }, { - "epoch": 0.3732241752949675, - "eval/acc": 39.53488540649414, + "epoch": 2.2111269614835947, + "eval/acc": 51.16279220581055, "step": 3100 }, { - "epoch": 0.3732241752949675, - "eval_loss": 2.6263325214385986, - "eval_runtime": 0.2107, - "eval_samples_per_second": 204.065, - "eval_steps_per_second": 4.746, + "epoch": 2.2111269614835947, + "eval_loss": 1.8829365968704224, + "eval_runtime": 0.248, + "eval_samples_per_second": 173.381, + "eval_steps_per_second": 4.032, "step": 3100 }, { - "epoch": 0.3744281242475319, - "grad_norm": 7.34375, + "epoch": 2.2182596291012837, + "grad_norm": 9.1875, "learning_rate": 9.729333333333334e-05, - "loss": 0.6467, + "loss": 0.9694, "step": 3110 }, { - "epoch": 0.37563207320009634, - "grad_norm": 10.5625, + "epoch": 2.2253922967189728, + "grad_norm": 6.9375, "learning_rate": 9.724888888888889e-05, - "loss": 0.7271, + "loss": 1.0386, "step": 3120 }, { - "epoch": 0.3768360221526607, - "grad_norm": 19.375, + "epoch": 2.232524964336662, + "grad_norm": 8.6875, "learning_rate": 9.720444444444445e-05, - "loss": 0.8248, + "loss": 0.9614, "step": 3130 }, { - "epoch": 0.37803997110522514, - "grad_norm": 11.6875, + "epoch": 2.239657631954351, + "grad_norm": 8.3125, "learning_rate": 9.716000000000001e-05, - "loss": 0.7468, + "loss": 1.0643, "step": 3140 }, { - "epoch": 0.37924392005778956, - "grad_norm": 6.71875, + "epoch": 2.24679029957204, + "grad_norm": 8.125, "learning_rate": 9.711555555555556e-05, - "loss": 0.8189, + "loss": 0.9243, "step": 3150 }, { - "epoch": 0.38044786901035393, - "grad_norm": 7.15625, + "epoch": 2.253922967189729, + "grad_norm": 9.125, "learning_rate": 9.707111111111111e-05, - "loss": 0.7265, + "loss": 0.8419, "step": 3160 }, { - "epoch": 0.38165181796291836, - "grad_norm": 11.9375, + "epoch": 2.261055634807418, + "grad_norm": 9.125, "learning_rate": 9.702666666666667e-05, - "loss": 0.7502, + "loss": 0.9961, "step": 3170 }, { - "epoch": 0.3828557669154828, - "grad_norm": 7.78125, + "epoch": 2.268188302425107, + "grad_norm": 6.3125, "learning_rate": 9.698222222222223e-05, - "loss": 0.8412, + "loss": 0.8931, "step": 3180 }, { - "epoch": 0.3840597158680472, - "grad_norm": 6.75, + "epoch": 2.275320970042796, + "grad_norm": 7.875, "learning_rate": 9.693777777777778e-05, - "loss": 0.8689, + "loss": 1.0057, "step": 3190 }, { - "epoch": 0.3852636648206116, - "grad_norm": 7.6875, + "epoch": 2.282453637660485, + "grad_norm": 6.90625, "learning_rate": 9.689333333333333e-05, - "loss": 0.8053, + "loss": 0.9606, "step": 3200 }, { - "epoch": 0.3852636648206116, - "eval/acc": 39.53488540649414, + "epoch": 2.282453637660485, + "eval/acc": 48.83720779418945, "step": 3200 }, { - "epoch": 0.3852636648206116, - "eval_loss": 2.6145706176757812, - "eval_runtime": 0.2093, - "eval_samples_per_second": 205.398, - "eval_steps_per_second": 4.777, + "epoch": 2.282453637660485, + "eval_loss": 1.823419451713562, + "eval_runtime": 0.2149, + "eval_samples_per_second": 200.139, + "eval_steps_per_second": 4.654, "step": 3200 }, { - "epoch": 0.386467613773176, - "grad_norm": 7.65625, + "epoch": 2.289586305278174, + "grad_norm": 11.8125, "learning_rate": 9.684888888888889e-05, - "loss": 0.7601, + "loss": 0.9218, "step": 3210 }, { - "epoch": 0.38767156272574044, - "grad_norm": 19.25, + "epoch": 2.2967189728958632, + "grad_norm": 8.9375, "learning_rate": 9.680444444444445e-05, - "loss": 0.7944, + "loss": 1.0111, "step": 3220 }, { - "epoch": 0.38887551167830486, - "grad_norm": 9.375, + "epoch": 2.3038516405135523, + "grad_norm": 8.625, "learning_rate": 9.676e-05, - "loss": 0.839, + "loss": 1.0968, "step": 3230 }, { - "epoch": 0.39007946063086923, - "grad_norm": 8.5, + "epoch": 2.310984308131241, + "grad_norm": 7.1875, "learning_rate": 9.671555555555556e-05, - "loss": 0.7794, + "loss": 1.0236, "step": 3240 }, { - "epoch": 0.39128340958343366, - "grad_norm": 7.78125, + "epoch": 2.31811697574893, + "grad_norm": 6.84375, "learning_rate": 9.667111111111111e-05, - "loss": 0.753, + "loss": 0.92, "step": 3250 }, { - "epoch": 0.3924873585359981, - "grad_norm": 7.15625, + "epoch": 2.325249643366619, + "grad_norm": 8.75, "learning_rate": 9.662666666666667e-05, - "loss": 0.7326, + "loss": 0.8205, "step": 3260 }, { - "epoch": 0.39369130748856246, - "grad_norm": 13.4375, + "epoch": 2.332382310984308, + "grad_norm": 30.75, "learning_rate": 9.658222222222222e-05, - "loss": 0.6754, + "loss": 0.9676, "step": 3270 }, { - "epoch": 0.3948952564411269, - "grad_norm": 6.71875, + "epoch": 2.339514978601997, + "grad_norm": 13.0, "learning_rate": 9.653777777777778e-05, - "loss": 0.757, + "loss": 0.9086, "step": 3280 }, { - "epoch": 0.3960992053936913, - "grad_norm": 7.5625, + "epoch": 2.346647646219686, + "grad_norm": 9.375, "learning_rate": 9.649333333333333e-05, - "loss": 0.9203, + "loss": 1.0504, "step": 3290 }, { - "epoch": 0.39730315434625574, - "grad_norm": 8.375, + "epoch": 2.353780313837375, + "grad_norm": 39.0, "learning_rate": 9.64488888888889e-05, - "loss": 0.8552, + "loss": 0.9481, "step": 3300 }, { - "epoch": 0.39730315434625574, - "eval/acc": 44.1860466003418, + "epoch": 2.353780313837375, + "eval/acc": 46.511627197265625, "step": 3300 }, { - "epoch": 0.39730315434625574, - "eval_loss": 2.571866273880005, - "eval_runtime": 0.2083, - "eval_samples_per_second": 206.479, - "eval_steps_per_second": 4.802, + "epoch": 2.353780313837375, + "eval_loss": 1.9555243253707886, + "eval_runtime": 0.6637, + "eval_samples_per_second": 64.792, + "eval_steps_per_second": 1.507, "step": 3300 }, { - "epoch": 0.3985071032988201, - "grad_norm": 7.5625, + "epoch": 2.3609129814550642, + "grad_norm": 11.4375, "learning_rate": 9.640444444444446e-05, - "loss": 0.7811, + "loss": 0.9641, "step": 3310 }, { - "epoch": 0.39971105225138454, - "grad_norm": 11.75, + "epoch": 2.3680456490727533, + "grad_norm": 9.0625, "learning_rate": 9.636e-05, - "loss": 0.6717, + "loss": 0.9624, "step": 3320 }, { - "epoch": 0.40091500120394896, - "grad_norm": 8.1875, + "epoch": 2.3751783166904423, + "grad_norm": 12.625, "learning_rate": 9.631555555555555e-05, - "loss": 0.838, + "loss": 1.0082, "step": 3330 }, { - "epoch": 0.4021189501565134, - "grad_norm": 6.40625, + "epoch": 2.3823109843081314, + "grad_norm": 7.25, "learning_rate": 9.627111111111112e-05, - "loss": 0.8568, + "loss": 1.0249, "step": 3340 }, { - "epoch": 0.40332289910907776, - "grad_norm": 7.3125, + "epoch": 2.3894436519258204, + "grad_norm": 13.375, "learning_rate": 9.622666666666668e-05, - "loss": 0.6742, + "loss": 1.0153, "step": 3350 }, { - "epoch": 0.4045268480616422, - "grad_norm": 7.875, + "epoch": 2.3965763195435095, + "grad_norm": 6.6875, "learning_rate": 9.618222222222223e-05, - "loss": 0.7849, + "loss": 0.9533, "step": 3360 }, { - "epoch": 0.4057307970142066, - "grad_norm": 8.5625, + "epoch": 2.403708987161198, + "grad_norm": 9.25, "learning_rate": 9.613777777777779e-05, - "loss": 0.7537, + "loss": 1.1051, "step": 3370 }, { - "epoch": 0.406934745966771, - "grad_norm": 8.5625, + "epoch": 2.410841654778887, + "grad_norm": 9.5625, "learning_rate": 9.609333333333334e-05, - "loss": 0.6935, + "loss": 1.0551, "step": 3380 }, { - "epoch": 0.4081386949193354, - "grad_norm": 6.3125, + "epoch": 2.417974322396576, + "grad_norm": 7.21875, "learning_rate": 9.604888888888889e-05, - "loss": 0.8065, + "loss": 0.9032, "step": 3390 }, { - "epoch": 0.40934264387189984, - "grad_norm": 26.25, + "epoch": 2.425106990014265, + "grad_norm": 8.5625, "learning_rate": 9.600444444444445e-05, - "loss": 0.6558, + "loss": 1.1008, "step": 3400 }, { - "epoch": 0.40934264387189984, - "eval/acc": 37.20930099487305, + "epoch": 2.425106990014265, + "eval/acc": 51.16279220581055, "step": 3400 }, { - "epoch": 0.40934264387189984, - "eval_loss": 2.7212982177734375, - "eval_runtime": 0.2094, - "eval_samples_per_second": 205.345, - "eval_steps_per_second": 4.775, + "epoch": 2.425106990014265, + "eval_loss": 1.7766540050506592, + "eval_runtime": 0.2273, + "eval_samples_per_second": 189.16, + "eval_steps_per_second": 4.399, "step": 3400 }, { - "epoch": 0.41054659282446426, - "grad_norm": 6.84375, + "epoch": 2.4322396576319543, + "grad_norm": 10.375, "learning_rate": 9.596000000000001e-05, - "loss": 0.7642, + "loss": 0.9562, "step": 3410 }, { - "epoch": 0.41175054177702863, - "grad_norm": 7.0625, + "epoch": 2.4393723252496433, + "grad_norm": 8.9375, "learning_rate": 9.591555555555556e-05, - "loss": 0.7185, + "loss": 1.0756, "step": 3420 }, { - "epoch": 0.41295449072959306, - "grad_norm": 7.15625, + "epoch": 2.4465049928673324, + "grad_norm": 9.125, "learning_rate": 9.58711111111111e-05, - "loss": 0.6634, + "loss": 0.9554, "step": 3430 }, { - "epoch": 0.4141584396821575, - "grad_norm": 4.96875, + "epoch": 2.4536376604850214, + "grad_norm": 8.9375, "learning_rate": 9.582666666666668e-05, - "loss": 0.6383, + "loss": 0.9122, "step": 3440 }, { - "epoch": 0.4153623886347219, - "grad_norm": 7.15625, + "epoch": 2.4607703281027105, + "grad_norm": 8.625, "learning_rate": 9.578222222222223e-05, - "loss": 0.8032, + "loss": 0.9311, "step": 3450 }, { - "epoch": 0.4165663375872863, - "grad_norm": 9.0625, + "epoch": 2.4679029957203995, + "grad_norm": 6.65625, "learning_rate": 9.573777777777778e-05, - "loss": 0.7294, + "loss": 1.0023, "step": 3460 }, { - "epoch": 0.4177702865398507, - "grad_norm": 9.5, + "epoch": 2.4750356633380886, + "grad_norm": 8.125, "learning_rate": 9.569333333333334e-05, - "loss": 0.802, + "loss": 0.9172, "step": 3470 }, { - "epoch": 0.41897423549241514, - "grad_norm": 7.0, + "epoch": 2.4821683309557776, + "grad_norm": 7.375, "learning_rate": 9.56488888888889e-05, - "loss": 0.7307, + "loss": 0.9407, "step": 3480 }, { - "epoch": 0.4201781844449795, - "grad_norm": 6.34375, + "epoch": 2.4893009985734667, + "grad_norm": 10.25, "learning_rate": 9.560444444444445e-05, - "loss": 0.7239, + "loss": 0.9433, "step": 3490 }, { - "epoch": 0.42138213339754393, - "grad_norm": 6.5, + "epoch": 2.4964336661911553, + "grad_norm": 8.625, "learning_rate": 9.556e-05, - "loss": 0.6711, + "loss": 0.9934, "step": 3500 }, { - "epoch": 0.42138213339754393, - "eval/acc": 39.53488540649414, + "epoch": 2.4964336661911553, + "eval/acc": 53.488372802734375, "step": 3500 }, { - "epoch": 0.42138213339754393, - "eval_loss": 2.569326400756836, - "eval_runtime": 0.2066, - "eval_samples_per_second": 208.137, - "eval_steps_per_second": 4.84, + "epoch": 2.4964336661911553, + "eval_loss": 1.9511696100234985, + "eval_runtime": 0.2213, + "eval_samples_per_second": 194.329, + "eval_steps_per_second": 4.519, "step": 3500 }, { - "epoch": 0.42258608235010836, - "grad_norm": 8.125, + "epoch": 2.5035663338088447, + "grad_norm": 7.625, "learning_rate": 9.551555555555556e-05, - "loss": 0.695, + "loss": 0.9157, "step": 3510 }, { - "epoch": 0.4237900313026728, - "grad_norm": 8.3125, + "epoch": 2.5106990014265333, + "grad_norm": 7.4375, "learning_rate": 9.547111111111111e-05, - "loss": 0.8691, + "loss": 0.9202, "step": 3520 }, { - "epoch": 0.42499398025523716, - "grad_norm": 8.6875, + "epoch": 2.5178316690442224, + "grad_norm": 9.25, "learning_rate": 9.542666666666667e-05, - "loss": 0.7582, + "loss": 0.8526, "step": 3530 }, { - "epoch": 0.4261979292078016, - "grad_norm": 7.25, + "epoch": 2.5249643366619114, + "grad_norm": 7.71875, "learning_rate": 9.538222222222223e-05, - "loss": 0.7143, + "loss": 0.9562, "step": 3540 }, { - "epoch": 0.427401878160366, - "grad_norm": 8.6875, + "epoch": 2.5320970042796005, + "grad_norm": 9.75, "learning_rate": 9.533777777777778e-05, - "loss": 0.6754, + "loss": 0.9927, "step": 3550 }, { - "epoch": 0.42860582711293044, - "grad_norm": 7.8125, + "epoch": 2.5392296718972895, + "grad_norm": 8.1875, "learning_rate": 9.529333333333333e-05, - "loss": 0.7153, + "loss": 0.9263, "step": 3560 }, { - "epoch": 0.4298097760654948, - "grad_norm": 7.5625, + "epoch": 2.5463623395149786, + "grad_norm": 6.9375, "learning_rate": 9.52488888888889e-05, - "loss": 0.7293, + "loss": 0.9367, "step": 3570 }, { - "epoch": 0.43101372501805923, - "grad_norm": 7.5625, + "epoch": 2.5534950071326676, + "grad_norm": 9.5625, "learning_rate": 9.520444444444446e-05, - "loss": 0.7066, + "loss": 0.9284, "step": 3580 }, { - "epoch": 0.43221767397062366, - "grad_norm": 8.1875, + "epoch": 2.5606276747503567, + "grad_norm": 8.5625, "learning_rate": 9.516e-05, - "loss": 0.691, + "loss": 0.8394, "step": 3590 }, { - "epoch": 0.43342162292318803, - "grad_norm": 7.125, + "epoch": 2.5677603423680457, + "grad_norm": 10.25, "learning_rate": 9.511555555555555e-05, - "loss": 0.8239, + "loss": 0.9336, "step": 3600 }, { - "epoch": 0.43342162292318803, - "eval/acc": 44.1860466003418, + "epoch": 2.5677603423680457, + "eval/acc": 46.511627197265625, "step": 3600 }, { - "epoch": 0.43342162292318803, - "eval_loss": 2.4877374172210693, - "eval_runtime": 0.3957, - "eval_samples_per_second": 108.658, - "eval_steps_per_second": 2.527, + "epoch": 2.5677603423680457, + "eval_loss": 1.9759221076965332, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.935, + "eval_steps_per_second": 4.719, "step": 3600 }, { - "epoch": 0.43462557187575246, - "grad_norm": 6.375, + "epoch": 2.574893009985735, + "grad_norm": 10.0625, "learning_rate": 9.507111111111111e-05, - "loss": 0.6782, + "loss": 1.0005, "step": 3610 }, { - "epoch": 0.4358295208283169, - "grad_norm": 7.1875, + "epoch": 2.582025677603424, + "grad_norm": 8.375, "learning_rate": 9.502666666666668e-05, - "loss": 0.7602, + "loss": 0.9319, "step": 3620 }, { - "epoch": 0.4370334697808813, - "grad_norm": 8.125, + "epoch": 2.5891583452211124, + "grad_norm": 8.5, "learning_rate": 9.498222222222222e-05, - "loss": 0.7232, + "loss": 0.9125, "step": 3630 }, { - "epoch": 0.4382374187334457, - "grad_norm": 7.84375, + "epoch": 2.596291012838802, + "grad_norm": 7.71875, "learning_rate": 9.493777777777779e-05, - "loss": 0.729, + "loss": 0.9279, "step": 3640 }, { - "epoch": 0.4394413676860101, - "grad_norm": 8.375, + "epoch": 2.6034236804564905, + "grad_norm": 11.875, "learning_rate": 9.489333333333334e-05, - "loss": 0.8222, + "loss": 0.952, "step": 3650 }, { - "epoch": 0.44064531663857454, - "grad_norm": 8.125, + "epoch": 2.6105563480741796, + "grad_norm": 7.5625, "learning_rate": 9.48488888888889e-05, - "loss": 0.6918, + "loss": 1.0043, "step": 3660 }, { - "epoch": 0.44184926559113896, - "grad_norm": 8.1875, + "epoch": 2.6176890156918686, + "grad_norm": 11.5625, "learning_rate": 9.480444444444445e-05, - "loss": 0.6761, + "loss": 0.8932, "step": 3670 }, { - "epoch": 0.44305321454370333, - "grad_norm": 5.65625, + "epoch": 2.6248216833095577, + "grad_norm": 7.84375, "learning_rate": 9.476000000000001e-05, - "loss": 0.7532, + "loss": 0.8775, "step": 3680 }, { - "epoch": 0.44425716349626776, - "grad_norm": 8.8125, + "epoch": 2.6319543509272467, + "grad_norm": 9.0, "learning_rate": 9.471555555555556e-05, - "loss": 0.7072, + "loss": 0.9756, "step": 3690 }, { - "epoch": 0.4454611124488322, - "grad_norm": 6.5625, + "epoch": 2.6390870185449358, + "grad_norm": 7.375, "learning_rate": 9.46711111111111e-05, - "loss": 0.8405, + "loss": 0.9345, "step": 3700 }, { - "epoch": 0.4454611124488322, - "eval/acc": 39.53488540649414, + "epoch": 2.6390870185449358, + "eval/acc": 51.16279220581055, "step": 3700 }, { - "epoch": 0.4454611124488322, - "eval_loss": 2.615053176879883, - "eval_runtime": 4.8304, - "eval_samples_per_second": 8.902, - "eval_steps_per_second": 0.207, + "epoch": 2.6390870185449358, + "eval_loss": 1.9134849309921265, + "eval_runtime": 0.2322, + "eval_samples_per_second": 185.194, + "eval_steps_per_second": 4.307, "step": 3700 }, { - "epoch": 0.44666506140139656, - "grad_norm": 8.6875, + "epoch": 2.646219686162625, + "grad_norm": 8.4375, "learning_rate": 9.462666666666668e-05, - "loss": 0.7249, + "loss": 0.9851, "step": 3710 }, { - "epoch": 0.447869010353961, - "grad_norm": 8.4375, + "epoch": 2.653352353780314, + "grad_norm": 31.75, "learning_rate": 9.458222222222223e-05, - "loss": 0.8561, + "loss": 0.9712, "step": 3720 }, { - "epoch": 0.4490729593065254, - "grad_norm": 7.3125, + "epoch": 2.660485021398003, + "grad_norm": 6.75, "learning_rate": 9.453777777777778e-05, - "loss": 0.7884, + "loss": 0.8641, "step": 3730 }, { - "epoch": 0.45027690825908984, - "grad_norm": 7.34375, + "epoch": 2.667617689015692, + "grad_norm": 6.5625, "learning_rate": 9.449333333333334e-05, - "loss": 0.7169, + "loss": 0.945, "step": 3740 }, { - "epoch": 0.4514808572116542, - "grad_norm": 5.5, + "epoch": 2.674750356633381, + "grad_norm": 6.0625, "learning_rate": 9.44488888888889e-05, - "loss": 0.7542, + "loss": 0.9535, "step": 3750 }, { - "epoch": 0.45268480616421863, - "grad_norm": 6.09375, + "epoch": 2.68188302425107, + "grad_norm": 7.90625, "learning_rate": 9.440444444444445e-05, - "loss": 0.6292, + "loss": 0.8844, "step": 3760 }, { - "epoch": 0.45388875511678306, - "grad_norm": 8.9375, + "epoch": 2.689015691868759, + "grad_norm": 9.8125, "learning_rate": 9.436e-05, - "loss": 0.6682, + "loss": 0.9064, "step": 3770 }, { - "epoch": 0.4550927040693475, - "grad_norm": 5.09375, + "epoch": 2.6961483594864477, + "grad_norm": 8.4375, "learning_rate": 9.431555555555556e-05, - "loss": 0.6499, + "loss": 1.0119, "step": 3780 }, { - "epoch": 0.45629665302191186, - "grad_norm": 8.5, + "epoch": 2.703281027104137, + "grad_norm": 7.15625, "learning_rate": 9.427111111111112e-05, - "loss": 0.7859, + "loss": 0.9655, "step": 3790 }, { - "epoch": 0.4575006019744763, - "grad_norm": 14.5, + "epoch": 2.710413694721826, + "grad_norm": 9.4375, "learning_rate": 9.422666666666667e-05, - "loss": 0.7987, + "loss": 0.9187, "step": 3800 }, { - "epoch": 0.4575006019744763, - "eval/acc": 39.53488540649414, + "epoch": 2.710413694721826, + "eval/acc": 51.16279220581055, "step": 3800 }, { - "epoch": 0.4575006019744763, - "eval_loss": 2.645066022872925, - "eval_runtime": 0.6165, - "eval_samples_per_second": 69.745, - "eval_steps_per_second": 1.622, + "epoch": 2.710413694721826, + "eval_loss": 1.9277268648147583, + "eval_runtime": 0.2166, + "eval_samples_per_second": 198.481, + "eval_steps_per_second": 4.616, "step": 3800 }, { - "epoch": 0.4587045509270407, - "grad_norm": 6.25, + "epoch": 2.717546362339515, + "grad_norm": 9.25, "learning_rate": 9.418222222222223e-05, - "loss": 0.7035, + "loss": 0.8689, "step": 3810 }, { - "epoch": 0.4599084998796051, - "grad_norm": 6.46875, + "epoch": 2.724679029957204, + "grad_norm": 8.0625, "learning_rate": 9.413777777777778e-05, - "loss": 0.6329, + "loss": 0.9138, "step": 3820 }, { - "epoch": 0.4611124488321695, - "grad_norm": 8.875, + "epoch": 2.731811697574893, + "grad_norm": 14.3125, "learning_rate": 9.409333333333333e-05, - "loss": 0.7553, + "loss": 0.9129, "step": 3830 }, { - "epoch": 0.46231639778473393, - "grad_norm": 9.3125, + "epoch": 2.738944365192582, + "grad_norm": 6.78125, "learning_rate": 9.404888888888889e-05, - "loss": 0.6551, + "loss": 0.8666, "step": 3840 }, { - "epoch": 0.46352034673729836, - "grad_norm": 11.0625, + "epoch": 2.746077032810271, + "grad_norm": 7.4375, "learning_rate": 9.400444444444445e-05, - "loss": 0.6634, + "loss": 0.9474, "step": 3850 }, { - "epoch": 0.46472429568986273, - "grad_norm": 6.71875, + "epoch": 2.75320970042796, + "grad_norm": 7.46875, "learning_rate": 9.396e-05, - "loss": 0.6527, + "loss": 0.9312, "step": 3860 }, { - "epoch": 0.46592824464242716, - "grad_norm": 6.75, + "epoch": 2.760342368045649, + "grad_norm": 7.84375, "learning_rate": 9.391555555555555e-05, - "loss": 0.8268, + "loss": 0.943, "step": 3870 }, { - "epoch": 0.4671321935949916, - "grad_norm": 7.78125, + "epoch": 2.767475035663338, + "grad_norm": 8.125, "learning_rate": 9.387111111111113e-05, - "loss": 0.742, + "loss": 0.9471, "step": 3880 }, { - "epoch": 0.468336142547556, - "grad_norm": 6.53125, + "epoch": 2.7746077032810272, + "grad_norm": 10.5625, "learning_rate": 9.382666666666667e-05, - "loss": 0.7446, + "loss": 0.9785, "step": 3890 }, { - "epoch": 0.4695400915001204, - "grad_norm": 7.0625, + "epoch": 2.7817403708987163, + "grad_norm": 10.5, "learning_rate": 9.378222222222222e-05, - "loss": 0.7764, + "loss": 1.0151, "step": 3900 }, { - "epoch": 0.4695400915001204, - "eval/acc": 37.79069900512695, + "epoch": 2.7817403708987163, + "eval/acc": 44.1860466003418, "step": 3900 }, { - "epoch": 0.4695400915001204, - "eval_loss": 2.6463897228240967, - "eval_runtime": 1.4145, - "eval_samples_per_second": 30.4, - "eval_steps_per_second": 0.707, + "epoch": 2.7817403708987163, + "eval_loss": 1.970198154449463, + "eval_runtime": 0.2139, + "eval_samples_per_second": 201.028, + "eval_steps_per_second": 4.675, "step": 3900 }, { - "epoch": 0.4707440404526848, - "grad_norm": 5.625, + "epoch": 2.788873038516405, + "grad_norm": 9.75, "learning_rate": 9.373777777777778e-05, - "loss": 0.7248, + "loss": 0.9148, "step": 3910 }, { - "epoch": 0.47194798940524924, - "grad_norm": 7.09375, + "epoch": 2.7960057061340944, + "grad_norm": 9.1875, "learning_rate": 9.369333333333333e-05, - "loss": 0.6977, + "loss": 1.0314, "step": 3920 }, { - "epoch": 0.4731519383578136, - "grad_norm": 7.53125, + "epoch": 2.803138373751783, + "grad_norm": 8.375, "learning_rate": 9.36488888888889e-05, - "loss": 0.6496, + "loss": 0.9076, "step": 3930 }, { - "epoch": 0.47435588731037803, - "grad_norm": 11.0, + "epoch": 2.810271041369472, + "grad_norm": 6.46875, "learning_rate": 9.360444444444444e-05, - "loss": 0.7309, + "loss": 0.8218, "step": 3940 }, { - "epoch": 0.47555983626294246, - "grad_norm": 10.5625, + "epoch": 2.817403708987161, + "grad_norm": 7.96875, "learning_rate": 9.356e-05, - "loss": 0.7837, + "loss": 0.9415, "step": 3950 }, { - "epoch": 0.4767637852155069, - "grad_norm": 6.9375, + "epoch": 2.82453637660485, + "grad_norm": 7.53125, "learning_rate": 9.351555555555555e-05, - "loss": 0.6769, + "loss": 0.9593, "step": 3960 }, { - "epoch": 0.47796773416807126, - "grad_norm": 6.84375, + "epoch": 2.831669044222539, + "grad_norm": 5.96875, "learning_rate": 9.347111111111112e-05, - "loss": 0.642, + "loss": 0.9134, "step": 3970 }, { - "epoch": 0.4791716831206357, - "grad_norm": 9.125, + "epoch": 2.8388017118402282, + "grad_norm": 8.25, "learning_rate": 9.342666666666668e-05, - "loss": 0.6947, + "loss": 0.9339, "step": 3980 }, { - "epoch": 0.4803756320732001, - "grad_norm": 7.4375, + "epoch": 2.8459343794579173, + "grad_norm": 9.625, "learning_rate": 9.338222222222223e-05, - "loss": 0.5902, + "loss": 1.0018, "step": 3990 }, { - "epoch": 0.4815795810257645, - "grad_norm": 8.1875, + "epoch": 2.8530670470756063, + "grad_norm": 7.8125, "learning_rate": 9.333777777777777e-05, - "loss": 0.6075, + "loss": 0.9302, "step": 4000 }, { - "epoch": 0.4815795810257645, - "eval/acc": 34.88372039794922, + "epoch": 2.8530670470756063, + "eval/acc": 46.511627197265625, "step": 4000 }, { - "epoch": 0.4815795810257645, - "eval_loss": 2.6985960006713867, - "eval_runtime": 0.2767, - "eval_samples_per_second": 155.399, - "eval_steps_per_second": 3.614, + "epoch": 2.8530670470756063, + "eval_loss": 1.830549716949463, + "eval_runtime": 0.2202, + "eval_samples_per_second": 195.244, + "eval_steps_per_second": 4.541, "step": 4000 }, { - "epoch": 0.4827835299783289, - "grad_norm": 6.8125, + "epoch": 2.8601997146932954, + "grad_norm": 6.65625, "learning_rate": 9.329333333333334e-05, - "loss": 0.7166, + "loss": 0.9375, "step": 4010 }, { - "epoch": 0.48398747893089333, - "grad_norm": 6.375, + "epoch": 2.8673323823109844, + "grad_norm": 11.875, "learning_rate": 9.32488888888889e-05, - "loss": 0.6136, + "loss": 0.8406, "step": 4020 }, { - "epoch": 0.48519142788345776, - "grad_norm": 6.09375, + "epoch": 2.8744650499286735, + "grad_norm": 8.1875, "learning_rate": 9.320444444444445e-05, - "loss": 0.7948, + "loss": 0.8863, "step": 4030 }, { - "epoch": 0.48639537683602213, - "grad_norm": 7.5625, + "epoch": 2.881597717546362, + "grad_norm": 6.9375, "learning_rate": 9.316000000000001e-05, - "loss": 0.7253, + "loss": 0.9546, "step": 4040 }, { - "epoch": 0.48759932578858656, - "grad_norm": 7.1875, + "epoch": 2.8887303851640516, + "grad_norm": 8.625, "learning_rate": 9.311555555555556e-05, - "loss": 0.7386, + "loss": 1.0175, "step": 4050 }, { - "epoch": 0.488803274741151, - "grad_norm": 7.71875, + "epoch": 2.89586305278174, + "grad_norm": 45.0, "learning_rate": 9.307111111111112e-05, - "loss": 0.7222, + "loss": 0.9058, "step": 4060 }, { - "epoch": 0.4900072236937154, - "grad_norm": 10.8125, + "epoch": 2.9029957203994297, + "grad_norm": 13.625, "learning_rate": 9.302666666666667e-05, - "loss": 0.6298, + "loss": 0.9137, "step": 4070 }, { - "epoch": 0.4912111726462798, - "grad_norm": 14.25, + "epoch": 2.9101283880171183, + "grad_norm": 6.8125, "learning_rate": 9.298222222222223e-05, - "loss": 0.6551, + "loss": 0.8862, "step": 4080 }, { - "epoch": 0.4924151215988442, - "grad_norm": 7.75, + "epoch": 2.9172610556348073, + "grad_norm": 13.8125, "learning_rate": 9.293777777777778e-05, - "loss": 0.7201, + "loss": 0.9152, "step": 4090 }, { - "epoch": 0.49361907055140863, - "grad_norm": 9.0625, + "epoch": 2.9243937232524964, + "grad_norm": 13.3125, "learning_rate": 9.289333333333334e-05, - "loss": 0.708, + "loss": 0.9623, "step": 4100 }, { - "epoch": 0.49361907055140863, - "eval/acc": 34.88372039794922, + "epoch": 2.9243937232524964, + "eval/acc": 46.511627197265625, "step": 4100 }, { - "epoch": 0.49361907055140863, - "eval_loss": 2.7673676013946533, - "eval_runtime": 0.3468, - "eval_samples_per_second": 124.003, - "eval_steps_per_second": 2.884, + "epoch": 2.9243937232524964, + "eval_loss": 1.9800893068313599, + "eval_runtime": 0.2162, + "eval_samples_per_second": 198.931, + "eval_steps_per_second": 4.626, "step": 4100 }, { - "epoch": 0.494823019503973, - "grad_norm": 7.9375, + "epoch": 2.9315263908701854, + "grad_norm": 7.1875, "learning_rate": 9.28488888888889e-05, - "loss": 0.6997, + "loss": 0.9088, "step": 4110 }, { - "epoch": 0.49602696845653743, - "grad_norm": 6.84375, + "epoch": 2.9386590584878745, + "grad_norm": 8.3125, "learning_rate": 9.280444444444445e-05, - "loss": 0.6195, + "loss": 0.9927, "step": 4120 }, { - "epoch": 0.49723091740910186, - "grad_norm": 7.40625, + "epoch": 2.9457917261055635, + "grad_norm": 75.0, "learning_rate": 9.276e-05, - "loss": 0.765, + "loss": 0.912, "step": 4130 }, { - "epoch": 0.4984348663616663, - "grad_norm": 7.8125, + "epoch": 2.9529243937232525, + "grad_norm": 9.125, "learning_rate": 9.271555555555556e-05, - "loss": 0.7097, + "loss": 0.9878, "step": 4140 }, { - "epoch": 0.49963881531423066, - "grad_norm": 7.75, + "epoch": 2.9600570613409416, + "grad_norm": 7.125, "learning_rate": 9.267111111111112e-05, - "loss": 0.7067, + "loss": 0.8785, "step": 4150 }, { - "epoch": 0.5008427642667951, - "grad_norm": 27.875, + "epoch": 2.9671897289586306, + "grad_norm": 8.25, "learning_rate": 9.262666666666667e-05, - "loss": 0.7989, + "loss": 0.9296, "step": 4160 }, { - "epoch": 0.5020467132193595, - "grad_norm": 8.0, + "epoch": 2.9743223965763197, + "grad_norm": 8.75, "learning_rate": 9.258222222222222e-05, - "loss": 0.6744, + "loss": 0.9284, "step": 4170 }, { - "epoch": 0.5032506621719239, - "grad_norm": 7.96875, + "epoch": 2.9814550641940087, + "grad_norm": 8.8125, "learning_rate": 9.253777777777778e-05, - "loss": 0.738, + "loss": 0.9566, "step": 4180 }, { - "epoch": 0.5044546111244883, - "grad_norm": 7.21875, + "epoch": 2.9885877318116973, + "grad_norm": 6.90625, "learning_rate": 9.249333333333334e-05, - "loss": 0.7021, + "loss": 0.8368, "step": 4190 }, { - "epoch": 0.5056585600770528, - "grad_norm": 9.6875, + "epoch": 2.995720399429387, + "grad_norm": 9.875, "learning_rate": 9.244888888888889e-05, - "loss": 0.7133, + "loss": 1.0306, "step": 4200 }, { - "epoch": 0.5056585600770528, - "eval/acc": 32.55813980102539, + "epoch": 2.995720399429387, + "eval/acc": 51.16279220581055, "step": 4200 }, { - "epoch": 0.5056585600770528, - "eval_loss": 2.7288577556610107, - "eval_runtime": 0.2266, - "eval_samples_per_second": 189.803, - "eval_steps_per_second": 4.414, + "epoch": 2.995720399429387, + "eval_loss": 1.8740426301956177, + "eval_runtime": 0.2157, + "eval_samples_per_second": 199.368, + "eval_steps_per_second": 4.636, "step": 4200 }, { - "epoch": 0.5068625090296172, - "grad_norm": 10.5, + "epoch": 3.0028530670470754, + "grad_norm": 9.5625, "learning_rate": 9.240444444444445e-05, - "loss": 0.6886, + "loss": 0.957, "step": 4210 }, { - "epoch": 0.5080664579821815, - "grad_norm": 9.0625, + "epoch": 3.0099857346647645, + "grad_norm": 13.5625, "learning_rate": 9.236e-05, - "loss": 0.7944, + "loss": 0.884, "step": 4220 }, { - "epoch": 0.509270406934746, - "grad_norm": 7.78125, + "epoch": 3.0171184022824535, + "grad_norm": 8.0625, "learning_rate": 9.231555555555555e-05, - "loss": 0.7869, + "loss": 0.9064, "step": 4230 }, { - "epoch": 0.5104743558873104, - "grad_norm": 6.375, + "epoch": 3.0242510699001426, + "grad_norm": 8.0, "learning_rate": 9.227111111111111e-05, - "loss": 0.6245, + "loss": 0.9164, "step": 4240 }, { - "epoch": 0.5116783048398748, - "grad_norm": 9.9375, + "epoch": 3.0313837375178316, + "grad_norm": 8.4375, "learning_rate": 9.222666666666668e-05, - "loss": 0.7006, + "loss": 0.9787, "step": 4250 }, { - "epoch": 0.5128822537924392, - "grad_norm": 6.1875, + "epoch": 3.0385164051355207, + "grad_norm": 7.6875, "learning_rate": 9.218222222222222e-05, - "loss": 0.7588, + "loss": 0.8852, "step": 4260 }, { - "epoch": 0.5140862027450036, - "grad_norm": 10.6875, + "epoch": 3.0456490727532097, + "grad_norm": 7.15625, "learning_rate": 9.213777777777777e-05, - "loss": 0.737, + "loss": 1.0092, "step": 4270 }, { - "epoch": 0.515290151697568, - "grad_norm": 6.15625, + "epoch": 3.0527817403708988, + "grad_norm": 6.65625, "learning_rate": 9.209333333333335e-05, - "loss": 0.6774, + "loss": 0.9972, "step": 4280 }, { - "epoch": 0.5164941006501325, - "grad_norm": 8.8125, + "epoch": 3.059914407988588, + "grad_norm": 7.25, "learning_rate": 9.20488888888889e-05, - "loss": 0.6972, + "loss": 0.9237, "step": 4290 }, { - "epoch": 0.5176980496026968, - "grad_norm": 6.40625, + "epoch": 3.067047075606277, + "grad_norm": 6.4375, "learning_rate": 9.200444444444445e-05, - "loss": 0.6423, + "loss": 0.9096, "step": 4300 }, { - "epoch": 0.5176980496026968, - "eval/acc": 38.953487396240234, + "epoch": 3.067047075606277, + "eval/acc": 39.53488540649414, "step": 4300 }, { - "epoch": 0.5176980496026968, - "eval_loss": 2.7444300651550293, - "eval_runtime": 0.2708, - "eval_samples_per_second": 158.776, - "eval_steps_per_second": 3.692, + "epoch": 3.067047075606277, + "eval_loss": 2.0920908451080322, + "eval_runtime": 1.7384, + "eval_samples_per_second": 24.736, + "eval_steps_per_second": 0.575, "step": 4300 }, { - "epoch": 0.5189019985552613, - "grad_norm": 6.8125, + "epoch": 3.074179743223966, + "grad_norm": 8.4375, "learning_rate": 9.196000000000001e-05, - "loss": 0.7705, + "loss": 0.9697, "step": 4310 }, { - "epoch": 0.5201059475078257, - "grad_norm": 5.90625, + "epoch": 3.081312410841655, + "grad_norm": 8.4375, "learning_rate": 9.191555555555556e-05, - "loss": 0.7534, + "loss": 0.8379, "step": 4320 }, { - "epoch": 0.52130989646039, - "grad_norm": 9.25, + "epoch": 3.088445078459344, + "grad_norm": 8.125, "learning_rate": 9.187111111111112e-05, - "loss": 0.6586, + "loss": 0.8576, "step": 4330 }, { - "epoch": 0.5225138454129545, - "grad_norm": 7.53125, + "epoch": 3.0955777460770326, + "grad_norm": 10.75, "learning_rate": 9.182666666666667e-05, - "loss": 0.7459, + "loss": 0.9616, "step": 4340 }, { - "epoch": 0.5237177943655189, - "grad_norm": 6.09375, + "epoch": 3.1027104136947217, + "grad_norm": 6.84375, "learning_rate": 9.178222222222223e-05, - "loss": 0.7088, + "loss": 0.7674, "step": 4350 }, { - "epoch": 0.5249217433180833, - "grad_norm": 8.5, + "epoch": 3.1098430813124107, + "grad_norm": 8.375, "learning_rate": 9.173777777777778e-05, - "loss": 0.7313, + "loss": 0.8712, "step": 4360 }, { - "epoch": 0.5261256922706478, - "grad_norm": 8.8125, + "epoch": 3.1169757489300998, + "grad_norm": 8.375, "learning_rate": 9.169333333333334e-05, - "loss": 0.7364, + "loss": 0.8599, "step": 4370 }, { - "epoch": 0.5273296412232121, - "grad_norm": 7.09375, + "epoch": 3.124108416547789, + "grad_norm": 7.1875, "learning_rate": 9.16488888888889e-05, - "loss": 0.6962, + "loss": 0.9736, "step": 4380 }, { - "epoch": 0.5285335901757765, - "grad_norm": 6.28125, + "epoch": 3.131241084165478, + "grad_norm": 7.75, "learning_rate": 9.160444444444445e-05, - "loss": 0.6817, + "loss": 0.8663, "step": 4390 }, { - "epoch": 0.529737539128341, - "grad_norm": 8.25, + "epoch": 3.138373751783167, + "grad_norm": 7.53125, "learning_rate": 9.156e-05, - "loss": 0.6786, + "loss": 0.9221, "step": 4400 }, { - "epoch": 0.529737539128341, - "eval/acc": 34.88372039794922, + "epoch": 3.138373751783167, + "eval/acc": 39.53488540649414, "step": 4400 }, { - "epoch": 0.529737539128341, - "eval_loss": 2.728501081466675, - "eval_runtime": 0.3599, - "eval_samples_per_second": 119.474, - "eval_steps_per_second": 2.778, + "epoch": 3.138373751783167, + "eval_loss": 2.127244710922241, + "eval_runtime": 0.2239, + "eval_samples_per_second": 192.022, + "eval_steps_per_second": 4.466, "step": 4400 }, { - "epoch": 0.5309414880809054, - "grad_norm": 7.59375, + "epoch": 3.145506419400856, + "grad_norm": 8.125, "learning_rate": 9.151555555555556e-05, - "loss": 0.6744, + "loss": 0.9144, "step": 4410 }, { - "epoch": 0.5321454370334697, - "grad_norm": 8.0625, + "epoch": 3.152639087018545, + "grad_norm": 7.46875, "learning_rate": 9.147111111111112e-05, - "loss": 0.8287, + "loss": 0.9445, "step": 4420 }, { - "epoch": 0.5333493859860342, - "grad_norm": 8.1875, + "epoch": 3.159771754636234, + "grad_norm": 6.9375, "learning_rate": 9.142666666666667e-05, - "loss": 0.7069, + "loss": 0.8308, "step": 4430 }, { - "epoch": 0.5345533349385986, - "grad_norm": 8.125, + "epoch": 3.166904422253923, + "grad_norm": 7.53125, "learning_rate": 9.138222222222222e-05, - "loss": 0.662, + "loss": 0.8428, "step": 4440 }, { - "epoch": 0.5357572838911631, - "grad_norm": 7.46875, + "epoch": 3.174037089871612, + "grad_norm": 7.96875, "learning_rate": 9.133777777777778e-05, - "loss": 0.7424, + "loss": 0.9022, "step": 4450 }, { - "epoch": 0.5369612328437274, - "grad_norm": 6.96875, + "epoch": 3.181169757489301, + "grad_norm": 6.875, "learning_rate": 9.129333333333334e-05, - "loss": 0.7308, + "loss": 0.9955, "step": 4460 }, { - "epoch": 0.5381651817962918, - "grad_norm": 8.3125, + "epoch": 3.18830242510699, + "grad_norm": 9.5625, "learning_rate": 9.124888888888889e-05, - "loss": 0.7524, + "loss": 0.9493, "step": 4470 }, { - "epoch": 0.5393691307488563, - "grad_norm": 6.40625, + "epoch": 3.195435092724679, + "grad_norm": 9.0625, "learning_rate": 9.120444444444445e-05, - "loss": 0.7523, + "loss": 0.9608, "step": 4480 }, { - "epoch": 0.5405730797014207, - "grad_norm": 7.65625, + "epoch": 3.202567760342368, + "grad_norm": 8.625, "learning_rate": 9.116e-05, - "loss": 0.647, + "loss": 0.821, "step": 4490 }, { - "epoch": 0.541777028653985, - "grad_norm": 6.875, + "epoch": 3.209700427960057, + "grad_norm": 8.125, "learning_rate": 9.111555555555556e-05, - "loss": 0.6547, + "loss": 0.9175, "step": 4500 }, { - "epoch": 0.541777028653985, - "eval/acc": 37.20930099487305, + "epoch": 3.209700427960057, + "eval/acc": 39.53488540649414, "step": 4500 }, { - "epoch": 0.541777028653985, - "eval_loss": 2.8390543460845947, - "eval_runtime": 0.2096, - "eval_samples_per_second": 205.2, - "eval_steps_per_second": 4.772, + "epoch": 3.209700427960057, + "eval_loss": 2.062082529067993, + "eval_runtime": 0.2236, + "eval_samples_per_second": 192.267, + "eval_steps_per_second": 4.471, "step": 4500 }, { - "epoch": 0.5429809776065495, - "grad_norm": 9.375, + "epoch": 3.216833095577746, + "grad_norm": 8.0625, "learning_rate": 9.107111111111111e-05, - "loss": 0.6773, + "loss": 0.9169, "step": 4510 }, { - "epoch": 0.5441849265591139, - "grad_norm": 10.1875, + "epoch": 3.223965763195435, + "grad_norm": 8.3125, "learning_rate": 9.102666666666667e-05, - "loss": 0.704, + "loss": 0.8001, "step": 4520 }, { - "epoch": 0.5453888755116783, - "grad_norm": 5.0625, + "epoch": 3.231098430813124, + "grad_norm": 7.3125, "learning_rate": 9.098222222222222e-05, - "loss": 0.6303, + "loss": 0.8513, "step": 4530 }, { - "epoch": 0.5465928244642427, - "grad_norm": 8.25, + "epoch": 3.238231098430813, + "grad_norm": 7.625, "learning_rate": 9.093777777777777e-05, - "loss": 0.7469, + "loss": 0.912, "step": 4540 }, { - "epoch": 0.5477967734168071, - "grad_norm": 7.375, + "epoch": 3.245363766048502, + "grad_norm": 6.46875, "learning_rate": 9.089333333333335e-05, - "loss": 0.6995, + "loss": 0.9418, "step": 4550 }, { - "epoch": 0.5490007223693716, - "grad_norm": 7.78125, + "epoch": 3.2524964336661912, + "grad_norm": 11.9375, "learning_rate": 9.08488888888889e-05, - "loss": 0.6965, + "loss": 0.871, "step": 4560 }, { - "epoch": 0.550204671321936, - "grad_norm": 13.625, + "epoch": 3.2596291012838803, + "grad_norm": 7.03125, "learning_rate": 9.080444444444444e-05, - "loss": 0.759, + "loss": 0.8507, "step": 4570 }, { - "epoch": 0.5514086202745003, - "grad_norm": 6.875, + "epoch": 3.2667617689015693, + "grad_norm": 7.21875, "learning_rate": 9.076e-05, - "loss": 0.7284, + "loss": 0.8058, "step": 4580 }, { - "epoch": 0.5526125692270648, - "grad_norm": 5.875, + "epoch": 3.2738944365192584, + "grad_norm": 8.4375, "learning_rate": 9.071555555555557e-05, - "loss": 0.6721, + "loss": 0.7959, "step": 4590 }, { - "epoch": 0.5538165181796292, - "grad_norm": 5.46875, + "epoch": 3.281027104136947, + "grad_norm": 6.375, "learning_rate": 9.067111111111112e-05, - "loss": 0.6522, + "loss": 0.9206, "step": 4600 }, { - "epoch": 0.5538165181796292, - "eval/acc": 39.53488540649414, + "epoch": 3.281027104136947, + "eval/acc": 37.20930099487305, "step": 4600 }, { - "epoch": 0.5538165181796292, - "eval_loss": 2.801618814468384, - "eval_runtime": 0.2155, - "eval_samples_per_second": 199.501, - "eval_steps_per_second": 4.64, + "epoch": 3.281027104136947, + "eval_loss": 2.1272616386413574, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.747, + "eval_steps_per_second": 4.529, "step": 4600 }, { - "epoch": 0.5550204671321936, - "grad_norm": 8.5625, + "epoch": 3.2881597717546365, + "grad_norm": 7.65625, "learning_rate": 9.062666666666666e-05, - "loss": 0.6399, + "loss": 0.8306, "step": 4610 }, { - "epoch": 0.556224416084758, - "grad_norm": 7.40625, + "epoch": 3.295292439372325, + "grad_norm": 6.9375, "learning_rate": 9.058222222222223e-05, - "loss": 0.7303, + "loss": 0.8958, "step": 4620 }, { - "epoch": 0.5574283650373224, - "grad_norm": 6.96875, + "epoch": 3.302425106990014, + "grad_norm": 7.96875, "learning_rate": 9.053777777777777e-05, - "loss": 0.7126, + "loss": 0.8919, "step": 4630 }, { - "epoch": 0.5586323139898868, - "grad_norm": 7.15625, + "epoch": 3.309557774607703, + "grad_norm": 6.9375, "learning_rate": 9.049333333333334e-05, - "loss": 0.702, + "loss": 0.8844, "step": 4640 }, { - "epoch": 0.5598362629424513, - "grad_norm": 6.625, + "epoch": 3.316690442225392, + "grad_norm": 7.21875, "learning_rate": 9.04488888888889e-05, - "loss": 0.6957, + "loss": 0.8335, "step": 4650 }, { - "epoch": 0.5610402118950156, - "grad_norm": 7.90625, + "epoch": 3.3238231098430813, + "grad_norm": 7.6875, "learning_rate": 9.040444444444445e-05, - "loss": 0.703, + "loss": 0.9337, "step": 4660 }, { - "epoch": 0.5622441608475801, - "grad_norm": 7.75, + "epoch": 3.3309557774607703, + "grad_norm": 9.25, "learning_rate": 9.036e-05, - "loss": 0.7195, + "loss": 1.0282, "step": 4670 }, { - "epoch": 0.5634481098001445, - "grad_norm": 6.59375, + "epoch": 3.3380884450784594, + "grad_norm": 7.96875, "learning_rate": 9.031555555555557e-05, - "loss": 0.6445, + "loss": 0.9401, "step": 4680 }, { - "epoch": 0.5646520587527089, - "grad_norm": 25.125, + "epoch": 3.3452211126961484, + "grad_norm": 7.25, "learning_rate": 9.027111111111112e-05, - "loss": 0.699, + "loss": 0.908, "step": 4690 }, { - "epoch": 0.5658560077052733, - "grad_norm": 8.125, + "epoch": 3.3523537803138375, + "grad_norm": 7.8125, "learning_rate": 9.022666666666667e-05, - "loss": 0.716, + "loss": 0.9262, "step": 4700 }, { - "epoch": 0.5658560077052733, - "eval/acc": 34.88372039794922, + "epoch": 3.3523537803138375, + "eval/acc": 37.20930099487305, "step": 4700 }, { - "epoch": 0.5658560077052733, - "eval_loss": 2.777444839477539, - "eval_runtime": 0.218, - "eval_samples_per_second": 197.287, - "eval_steps_per_second": 4.588, + "epoch": 3.3523537803138375, + "eval_loss": 2.07535719871521, + "eval_runtime": 0.2246, + "eval_samples_per_second": 191.478, + "eval_steps_per_second": 4.453, "step": 4700 }, { - "epoch": 0.5670599566578377, - "grad_norm": 7.0, + "epoch": 3.3594864479315265, + "grad_norm": 13.0, "learning_rate": 9.018222222222223e-05, - "loss": 0.693, + "loss": 0.9692, "step": 4710 }, { - "epoch": 0.5682639056104021, - "grad_norm": 8.8125, + "epoch": 3.3666191155492156, + "grad_norm": 5.875, "learning_rate": 9.013777777777779e-05, - "loss": 0.7, + "loss": 0.9071, "step": 4720 }, { - "epoch": 0.5694678545629666, - "grad_norm": 7.0, + "epoch": 3.3737517831669046, + "grad_norm": 7.71875, "learning_rate": 9.009333333333334e-05, - "loss": 0.6616, + "loss": 0.8528, "step": 4730 }, { - "epoch": 0.5706718035155309, - "grad_norm": 7.75, + "epoch": 3.3808844507845937, + "grad_norm": 7.46875, "learning_rate": 9.004888888888889e-05, - "loss": 0.7987, + "loss": 0.9408, "step": 4740 }, { - "epoch": 0.5718757524680953, - "grad_norm": 6.53125, + "epoch": 3.3880171184022823, + "grad_norm": 7.8125, "learning_rate": 9.000444444444445e-05, - "loss": 0.7162, + "loss": 1.0017, "step": 4750 }, { - "epoch": 0.5730797014206598, - "grad_norm": 8.6875, + "epoch": 3.3951497860199713, + "grad_norm": 6.15625, "learning_rate": 8.996e-05, - "loss": 0.673, + "loss": 0.9107, "step": 4760 }, { - "epoch": 0.5742836503732242, - "grad_norm": 6.5625, + "epoch": 3.4022824536376604, + "grad_norm": 6.8125, "learning_rate": 8.991555555555556e-05, - "loss": 0.7389, + "loss": 0.9387, "step": 4770 }, { - "epoch": 0.5754875993257886, - "grad_norm": 7.25, + "epoch": 3.4094151212553494, + "grad_norm": 8.8125, "learning_rate": 8.987111111111112e-05, - "loss": 0.6674, + "loss": 0.9775, "step": 4780 }, { - "epoch": 0.576691548278353, - "grad_norm": 8.8125, + "epoch": 3.4165477888730384, + "grad_norm": 8.375, "learning_rate": 8.982666666666667e-05, - "loss": 0.7464, + "loss": 0.8173, "step": 4790 }, { - "epoch": 0.5778954972309174, - "grad_norm": 7.65625, + "epoch": 3.4236804564907275, + "grad_norm": 11.3125, "learning_rate": 8.978222222222222e-05, - "loss": 0.6979, + "loss": 0.9068, "step": 4800 }, { - "epoch": 0.5778954972309174, - "eval/acc": 37.20930099487305, + "epoch": 3.4236804564907275, + "eval/acc": 39.53488540649414, "step": 4800 }, { - "epoch": 0.5778954972309174, - "eval_loss": 2.7990331649780273, - "eval_runtime": 0.207, - "eval_samples_per_second": 207.72, - "eval_steps_per_second": 4.831, + "epoch": 3.4236804564907275, + "eval_loss": 2.123344898223877, + "eval_runtime": 0.2238, + "eval_samples_per_second": 192.17, + "eval_steps_per_second": 4.469, "step": 4800 }, { - "epoch": 0.5790994461834819, - "grad_norm": 6.90625, + "epoch": 3.4308131241084165, + "grad_norm": 6.65625, "learning_rate": 8.973777777777778e-05, - "loss": 0.7292, + "loss": 0.8262, "step": 4810 }, { - "epoch": 0.5803033951360462, - "grad_norm": 7.34375, + "epoch": 3.4379457917261056, + "grad_norm": 9.125, "learning_rate": 8.969333333333334e-05, - "loss": 0.6484, + "loss": 0.9207, "step": 4820 }, { - "epoch": 0.5815073440886106, - "grad_norm": 7.96875, + "epoch": 3.4450784593437946, + "grad_norm": 6.78125, "learning_rate": 8.964888888888889e-05, - "loss": 0.6246, + "loss": 1.0115, "step": 4830 }, { - "epoch": 0.5827112930411751, - "grad_norm": 5.4375, + "epoch": 3.4522111269614837, + "grad_norm": 7.5625, "learning_rate": 8.960444444444444e-05, - "loss": 0.6978, + "loss": 0.9031, "step": 4840 }, { - "epoch": 0.5839152419937395, - "grad_norm": 7.25, + "epoch": 3.4593437945791727, + "grad_norm": 7.875, "learning_rate": 8.956e-05, - "loss": 0.6848, + "loss": 0.9626, "step": 4850 }, { - "epoch": 0.5851191909463038, - "grad_norm": 8.9375, + "epoch": 3.466476462196862, + "grad_norm": 4.625, "learning_rate": 8.951555555555557e-05, - "loss": 0.7541, + "loss": 0.7793, "step": 4860 }, { - "epoch": 0.5863231398988683, - "grad_norm": 8.6875, + "epoch": 3.473609129814551, + "grad_norm": 7.40625, "learning_rate": 8.947111111111111e-05, - "loss": 0.6872, + "loss": 0.8733, "step": 4870 }, { - "epoch": 0.5875270888514327, - "grad_norm": 6.375, + "epoch": 3.4807417974322394, + "grad_norm": 9.6875, "learning_rate": 8.942666666666668e-05, - "loss": 0.7521, + "loss": 0.8448, "step": 4880 }, { - "epoch": 0.5887310378039972, - "grad_norm": 7.34375, + "epoch": 3.4878744650499285, + "grad_norm": 8.625, "learning_rate": 8.938222222222222e-05, - "loss": 0.6741, + "loss": 0.815, "step": 4890 }, { - "epoch": 0.5899349867565615, - "grad_norm": 9.25, + "epoch": 3.4950071326676175, + "grad_norm": 8.6875, "learning_rate": 8.933777777777779e-05, - "loss": 0.7085, + "loss": 0.7837, "step": 4900 }, { - "epoch": 0.5899349867565615, - "eval/acc": 32.55813980102539, + "epoch": 3.4950071326676175, + "eval/acc": 37.20930099487305, "step": 4900 }, { - "epoch": 0.5899349867565615, - "eval_loss": 2.822793483734131, - "eval_runtime": 0.2077, - "eval_samples_per_second": 206.985, - "eval_steps_per_second": 4.814, + "epoch": 3.4950071326676175, + "eval_loss": 2.080425262451172, + "eval_runtime": 0.2476, + "eval_samples_per_second": 173.654, + "eval_steps_per_second": 4.038, "step": 4900 }, { - "epoch": 0.5911389357091259, - "grad_norm": 6.75, + "epoch": 3.5021398002853066, + "grad_norm": 7.25, "learning_rate": 8.929333333333333e-05, - "loss": 0.6908, + "loss": 0.9082, "step": 4910 }, { - "epoch": 0.5923428846616904, - "grad_norm": 14.3125, + "epoch": 3.5092724679029956, + "grad_norm": 9.0, "learning_rate": 8.92488888888889e-05, - "loss": 0.6954, + "loss": 0.8041, "step": 4920 }, { - "epoch": 0.5935468336142548, - "grad_norm": 5.03125, + "epoch": 3.5164051355206847, + "grad_norm": 7.5625, "learning_rate": 8.920444444444444e-05, - "loss": 0.6255, + "loss": 0.878, "step": 4930 }, { - "epoch": 0.5947507825668191, - "grad_norm": 7.3125, + "epoch": 3.5235378031383737, + "grad_norm": 8.3125, "learning_rate": 8.916e-05, - "loss": 0.6094, + "loss": 0.8609, "step": 4940 }, { - "epoch": 0.5959547315193836, - "grad_norm": 6.875, + "epoch": 3.5306704707560628, + "grad_norm": 6.9375, "learning_rate": 8.911555555555557e-05, - "loss": 0.6488, + "loss": 0.8203, "step": 4950 }, { - "epoch": 0.597158680471948, - "grad_norm": 6.90625, + "epoch": 3.537803138373752, + "grad_norm": 6.4375, "learning_rate": 8.907111111111112e-05, - "loss": 0.6333, + "loss": 0.8976, "step": 4960 }, { - "epoch": 0.5983626294245123, - "grad_norm": 7.0, + "epoch": 3.544935805991441, + "grad_norm": 15.0, "learning_rate": 8.902666666666667e-05, - "loss": 0.6687, + "loss": 0.8585, "step": 4970 }, { - "epoch": 0.5995665783770768, - "grad_norm": 8.9375, + "epoch": 3.55206847360913, + "grad_norm": 6.21875, "learning_rate": 8.898222222222223e-05, - "loss": 0.6762, + "loss": 0.9642, "step": 4980 }, { - "epoch": 0.6007705273296412, - "grad_norm": 7.53125, + "epoch": 3.559201141226819, + "grad_norm": 9.8125, "learning_rate": 8.893777777777779e-05, - "loss": 0.6007, + "loss": 0.9241, "step": 4990 }, { - "epoch": 0.6019744762822057, - "grad_norm": 5.78125, + "epoch": 3.566333808844508, + "grad_norm": 9.25, "learning_rate": 8.889333333333334e-05, - "loss": 0.682, + "loss": 0.7841, "step": 5000 }, { - "epoch": 0.6019744762822057, - "eval/acc": 32.55813980102539, + "epoch": 3.566333808844508, + "eval/acc": 37.20930099487305, "step": 5000 }, { - "epoch": 0.6019744762822057, - "eval_loss": 2.827073097229004, - "eval_runtime": 0.2073, - "eval_samples_per_second": 207.385, - "eval_steps_per_second": 4.823, + "epoch": 3.566333808844508, + "eval_loss": 2.0360865592956543, + "eval_runtime": 0.225, + "eval_samples_per_second": 191.102, + "eval_steps_per_second": 4.444, "step": 5000 } ], "logging_steps": 10, "max_steps": 25000, "num_input_tokens_seen": 0, - "num_train_epochs": 4, + "num_train_epochs": 18, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/training_args.bin b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a1d680c060a6648e26d420ee8365324dd3d5fee7 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/checkpoint-5000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24358e453225b271a5d5308309aefca4acc94bf8e5e69d279887179f7a31b1b7 +size 6161 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/config.json b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/config.json similarity index 100% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/config.json rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/config.json diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/model.safetensors b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b98dce5180ec2d612b70665db845bd9c69293da --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bbab43b76cdbed4ee5364e337787e088ac7a5b381ebe2f680cc9ee3fbf04b17 +size 298041696 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/optimizer.pt b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0242fb53042e5a94a518245b82c050c5e6350fbd --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bff7e107bf5d1efad55fa123a28edf876fc0a79e6504a35e8436b491f3bce835 +size 596170443 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/rng_state_0.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..672cff9ce8725ede76269af2d5c1218a49590bc5 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a25b1ca1a6812b6542e9e1ab355d92720b67020d48c39e80dfa44e2613e6782 +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/rng_state_1.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..290ff36b5a81e5bbe52cf035192e44421766663b --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:062ea9ef2b29cc9252ab2bc026f7af9083d7a47fc2921720ed578a42d1a098b1 +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/rng_state_2.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..a3a4ba33f202109616def348f833a4eeb9c23f2f --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35fa19b4084f36e88e23466f3e38f8923ef64d0e637be7a81e9c16350b86e72a +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/rng_state_3.pth b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..929ceb2abab1e52e36f2ce15aab552dbf7064596 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fcddbf722dbad693b2a4a95db5330dca78f263e2b699c99b8acc90a117bd68e +size 15429 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/runs/Nov23_21-20-55_nid005048/events.out.tfevents.1763925880.nid005048.122479.0 b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/runs/Nov23_21-20-55_nid005048/events.out.tfevents.1763925880.nid005048.122479.0 new file mode 100644 index 0000000000000000000000000000000000000000..69faf0fb23aea22bf10e4b13dcc690a2f1eb1b29 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/runs/Nov23_21-20-55_nid005048/events.out.tfevents.1763925880.nid005048.122479.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2306da6243e92b3c04ec952f4bb6a83cba8df51619e01ac9d0827cb0183ac48 +size 631289 diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/scheduler.pt b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e68a9a1b37435296824fe0d74ad28b671b046263 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1717a9054eef94433d1223ce97cfacf8af74d8a39634780628341d30ddbbddd +size 1465 diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/special_tokens_map.json b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/special_tokens_map.json similarity index 100% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/special_tokens_map.json rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/special_tokens_map.json diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/tokenizer.json b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/tokenizer.json similarity index 100% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/tokenizer.json rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/tokenizer.json diff --git a/modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/tokenizer_config.json b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/tokenizer_config.json similarity index 100% rename from modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/tokenizer_config.json rename to modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/tokenizer_config.json diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/trainer_state.json b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e3c9bda054d9b8656b9bf8c3e2543818c385a3bf --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/trainer_state.json @@ -0,0 +1,20784 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 17.83166904422254, + "eval_steps": 100, + "global_step": 25000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007132667617689016, + "grad_norm": 19.75, + "learning_rate": 3.6e-07, + "loss": 5.6319, + "step": 10 + }, + { + "epoch": 0.014265335235378032, + "grad_norm": 19.375, + "learning_rate": 7.6e-07, + "loss": 5.5914, + "step": 20 + }, + { + "epoch": 0.021398002853067047, + "grad_norm": 51.25, + "learning_rate": 1.16e-06, + "loss": 5.6495, + "step": 30 + }, + { + "epoch": 0.028530670470756064, + "grad_norm": 19.0, + "learning_rate": 1.56e-06, + "loss": 5.6581, + "step": 40 + }, + { + "epoch": 0.03566333808844508, + "grad_norm": 23.75, + "learning_rate": 1.96e-06, + "loss": 5.6366, + "step": 50 + }, + { + "epoch": 0.042796005706134094, + "grad_norm": 18.0, + "learning_rate": 2.36e-06, + "loss": 5.6411, + "step": 60 + }, + { + "epoch": 0.04992867332382311, + "grad_norm": 14.4375, + "learning_rate": 2.7600000000000003e-06, + "loss": 5.5919, + "step": 70 + }, + { + "epoch": 0.05706134094151213, + "grad_norm": 24.125, + "learning_rate": 3.1600000000000007e-06, + "loss": 5.6083, + "step": 80 + }, + { + "epoch": 0.06419400855920114, + "grad_norm": 18.25, + "learning_rate": 3.5600000000000002e-06, + "loss": 5.6599, + "step": 90 + }, + { + "epoch": 0.07132667617689016, + "grad_norm": 18.25, + "learning_rate": 3.96e-06, + "loss": 5.6652, + "step": 100 + }, + { + "epoch": 0.07132667617689016, + "eval/acc": 2.3255813121795654, + "step": 100 + }, + { + "epoch": 0.07132667617689016, + "eval_loss": 5.090479850769043, + "eval_runtime": 2.3284, + "eval_samples_per_second": 18.467, + "eval_steps_per_second": 0.429, + "step": 100 + }, + { + "epoch": 0.07845934379457917, + "grad_norm": 21.0, + "learning_rate": 4.360000000000001e-06, + "loss": 5.6402, + "step": 110 + }, + { + "epoch": 0.08559201141226819, + "grad_norm": 16.875, + "learning_rate": 4.76e-06, + "loss": 5.6535, + "step": 120 + }, + { + "epoch": 0.09272467902995721, + "grad_norm": 21.5, + "learning_rate": 5.1600000000000006e-06, + "loss": 5.5821, + "step": 130 + }, + { + "epoch": 0.09985734664764621, + "grad_norm": 18.5, + "learning_rate": 5.56e-06, + "loss": 5.6184, + "step": 140 + }, + { + "epoch": 0.10699001426533523, + "grad_norm": 14.9375, + "learning_rate": 5.9600000000000005e-06, + "loss": 5.5743, + "step": 150 + }, + { + "epoch": 0.11412268188302425, + "grad_norm": 16.875, + "learning_rate": 6.360000000000001e-06, + "loss": 5.5684, + "step": 160 + }, + { + "epoch": 0.12125534950071326, + "grad_norm": 22.125, + "learning_rate": 6.76e-06, + "loss": 5.535, + "step": 170 + }, + { + "epoch": 0.12838801711840228, + "grad_norm": 15.9375, + "learning_rate": 7.16e-06, + "loss": 5.4357, + "step": 180 + }, + { + "epoch": 0.1355206847360913, + "grad_norm": 16.375, + "learning_rate": 7.5600000000000005e-06, + "loss": 5.3766, + "step": 190 + }, + { + "epoch": 0.14265335235378032, + "grad_norm": 15.3125, + "learning_rate": 7.96e-06, + "loss": 5.4437, + "step": 200 + }, + { + "epoch": 0.14265335235378032, + "eval/acc": 2.3255813121795654, + "step": 200 + }, + { + "epoch": 0.14265335235378032, + "eval_loss": 4.956757545471191, + "eval_runtime": 0.2941, + "eval_samples_per_second": 146.185, + "eval_steps_per_second": 3.4, + "step": 200 + }, + { + "epoch": 0.14978601997146934, + "grad_norm": 16.75, + "learning_rate": 8.36e-06, + "loss": 5.4744, + "step": 210 + }, + { + "epoch": 0.15691868758915833, + "grad_norm": 43.25, + "learning_rate": 8.76e-06, + "loss": 5.381, + "step": 220 + }, + { + "epoch": 0.16405135520684735, + "grad_norm": 21.0, + "learning_rate": 9.16e-06, + "loss": 5.3092, + "step": 230 + }, + { + "epoch": 0.17118402282453637, + "grad_norm": 26.75, + "learning_rate": 9.560000000000002e-06, + "loss": 5.2752, + "step": 240 + }, + { + "epoch": 0.1783166904422254, + "grad_norm": 26.875, + "learning_rate": 9.96e-06, + "loss": 5.2194, + "step": 250 + }, + { + "epoch": 0.18544935805991442, + "grad_norm": 20.875, + "learning_rate": 1.036e-05, + "loss": 5.0657, + "step": 260 + }, + { + "epoch": 0.19258202567760344, + "grad_norm": 25.125, + "learning_rate": 1.076e-05, + "loss": 4.967, + "step": 270 + }, + { + "epoch": 0.19971469329529243, + "grad_norm": 30.125, + "learning_rate": 1.1160000000000002e-05, + "loss": 4.9544, + "step": 280 + }, + { + "epoch": 0.20684736091298145, + "grad_norm": 24.625, + "learning_rate": 1.156e-05, + "loss": 4.7585, + "step": 290 + }, + { + "epoch": 0.21398002853067047, + "grad_norm": 21.375, + "learning_rate": 1.196e-05, + "loss": 4.635, + "step": 300 + }, + { + "epoch": 0.21398002853067047, + "eval/acc": 9.302325248718262, + "step": 300 + }, + { + "epoch": 0.21398002853067047, + "eval_loss": 4.364280700683594, + "eval_runtime": 0.2145, + "eval_samples_per_second": 200.42, + "eval_steps_per_second": 4.661, + "step": 300 + }, + { + "epoch": 0.2211126961483595, + "grad_norm": 30.125, + "learning_rate": 1.236e-05, + "loss": 4.5333, + "step": 310 + }, + { + "epoch": 0.2282453637660485, + "grad_norm": 28.125, + "learning_rate": 1.276e-05, + "loss": 4.2888, + "step": 320 + }, + { + "epoch": 0.23537803138373753, + "grad_norm": 30.5, + "learning_rate": 1.316e-05, + "loss": 4.1744, + "step": 330 + }, + { + "epoch": 0.24251069900142652, + "grad_norm": 35.0, + "learning_rate": 1.356e-05, + "loss": 3.8812, + "step": 340 + }, + { + "epoch": 0.24964336661911554, + "grad_norm": 30.75, + "learning_rate": 1.396e-05, + "loss": 3.6772, + "step": 350 + }, + { + "epoch": 0.25677603423680456, + "grad_norm": 25.875, + "learning_rate": 1.4360000000000001e-05, + "loss": 3.3797, + "step": 360 + }, + { + "epoch": 0.26390870185449355, + "grad_norm": 31.375, + "learning_rate": 1.4760000000000001e-05, + "loss": 3.2338, + "step": 370 + }, + { + "epoch": 0.2710413694721826, + "grad_norm": 72.0, + "learning_rate": 1.5160000000000002e-05, + "loss": 2.976, + "step": 380 + }, + { + "epoch": 0.2781740370898716, + "grad_norm": 22.375, + "learning_rate": 1.556e-05, + "loss": 2.8207, + "step": 390 + }, + { + "epoch": 0.28530670470756064, + "grad_norm": 21.25, + "learning_rate": 1.596e-05, + "loss": 2.8341, + "step": 400 + }, + { + "epoch": 0.28530670470756064, + "eval/acc": 9.302325248718262, + "step": 400 + }, + { + "epoch": 0.28530670470756064, + "eval_loss": 3.6011083126068115, + "eval_runtime": 0.2115, + "eval_samples_per_second": 203.312, + "eval_steps_per_second": 4.728, + "step": 400 + }, + { + "epoch": 0.29243937232524964, + "grad_norm": 21.0, + "learning_rate": 1.636e-05, + "loss": 2.6431, + "step": 410 + }, + { + "epoch": 0.2995720399429387, + "grad_norm": 20.875, + "learning_rate": 1.6760000000000002e-05, + "loss": 2.6506, + "step": 420 + }, + { + "epoch": 0.3067047075606277, + "grad_norm": 21.125, + "learning_rate": 1.7160000000000002e-05, + "loss": 2.491, + "step": 430 + }, + { + "epoch": 0.31383737517831667, + "grad_norm": 31.75, + "learning_rate": 1.756e-05, + "loss": 2.423, + "step": 440 + }, + { + "epoch": 0.3209700427960057, + "grad_norm": 19.375, + "learning_rate": 1.796e-05, + "loss": 2.5108, + "step": 450 + }, + { + "epoch": 0.3281027104136947, + "grad_norm": 17.375, + "learning_rate": 1.8360000000000004e-05, + "loss": 2.4584, + "step": 460 + }, + { + "epoch": 0.33523537803138376, + "grad_norm": 22.625, + "learning_rate": 1.876e-05, + "loss": 2.3526, + "step": 470 + }, + { + "epoch": 0.34236804564907275, + "grad_norm": 30.25, + "learning_rate": 1.916e-05, + "loss": 2.3634, + "step": 480 + }, + { + "epoch": 0.34950071326676174, + "grad_norm": 15.5625, + "learning_rate": 1.956e-05, + "loss": 2.3339, + "step": 490 + }, + { + "epoch": 0.3566333808844508, + "grad_norm": 19.5, + "learning_rate": 1.9960000000000002e-05, + "loss": 2.268, + "step": 500 + }, + { + "epoch": 0.3566333808844508, + "eval/acc": 16.279069900512695, + "step": 500 + }, + { + "epoch": 0.3566333808844508, + "eval_loss": 3.2489778995513916, + "eval_runtime": 0.2558, + "eval_samples_per_second": 168.115, + "eval_steps_per_second": 3.91, + "step": 500 + }, + { + "epoch": 0.3637660485021398, + "grad_norm": 29.375, + "learning_rate": 2.036e-05, + "loss": 2.2728, + "step": 510 + }, + { + "epoch": 0.37089871611982883, + "grad_norm": 21.25, + "learning_rate": 2.076e-05, + "loss": 2.1346, + "step": 520 + }, + { + "epoch": 0.3780313837375178, + "grad_norm": 14.8125, + "learning_rate": 2.116e-05, + "loss": 2.2719, + "step": 530 + }, + { + "epoch": 0.38516405135520687, + "grad_norm": 27.75, + "learning_rate": 2.1560000000000004e-05, + "loss": 2.145, + "step": 540 + }, + { + "epoch": 0.39229671897289586, + "grad_norm": 16.125, + "learning_rate": 2.196e-05, + "loss": 2.0912, + "step": 550 + }, + { + "epoch": 0.39942938659058486, + "grad_norm": 20.25, + "learning_rate": 2.236e-05, + "loss": 2.0302, + "step": 560 + }, + { + "epoch": 0.4065620542082739, + "grad_norm": 17.75, + "learning_rate": 2.2760000000000002e-05, + "loss": 2.1832, + "step": 570 + }, + { + "epoch": 0.4136947218259629, + "grad_norm": 14.5, + "learning_rate": 2.3160000000000002e-05, + "loss": 1.9652, + "step": 580 + }, + { + "epoch": 0.42082738944365194, + "grad_norm": 17.0, + "learning_rate": 2.356e-05, + "loss": 1.8911, + "step": 590 + }, + { + "epoch": 0.42796005706134094, + "grad_norm": 20.0, + "learning_rate": 2.396e-05, + "loss": 2.0266, + "step": 600 + }, + { + "epoch": 0.42796005706134094, + "eval/acc": 30.23255729675293, + "step": 600 + }, + { + "epoch": 0.42796005706134094, + "eval_loss": 2.946028470993042, + "eval_runtime": 0.2139, + "eval_samples_per_second": 201.065, + "eval_steps_per_second": 4.676, + "step": 600 + }, + { + "epoch": 0.43509272467902993, + "grad_norm": 25.5, + "learning_rate": 2.4360000000000004e-05, + "loss": 1.9116, + "step": 610 + }, + { + "epoch": 0.442225392296719, + "grad_norm": 25.375, + "learning_rate": 2.476e-05, + "loss": 1.7644, + "step": 620 + }, + { + "epoch": 0.44935805991440797, + "grad_norm": 15.5, + "learning_rate": 2.516e-05, + "loss": 1.9008, + "step": 630 + }, + { + "epoch": 0.456490727532097, + "grad_norm": 16.875, + "learning_rate": 2.556e-05, + "loss": 1.619, + "step": 640 + }, + { + "epoch": 0.463623395149786, + "grad_norm": 37.25, + "learning_rate": 2.5960000000000002e-05, + "loss": 1.7725, + "step": 650 + }, + { + "epoch": 0.47075606276747506, + "grad_norm": 16.5, + "learning_rate": 2.6360000000000002e-05, + "loss": 1.7405, + "step": 660 + }, + { + "epoch": 0.47788873038516405, + "grad_norm": 16.25, + "learning_rate": 2.676e-05, + "loss": 1.5825, + "step": 670 + }, + { + "epoch": 0.48502139800285304, + "grad_norm": 68.5, + "learning_rate": 2.716e-05, + "loss": 1.8379, + "step": 680 + }, + { + "epoch": 0.4921540656205421, + "grad_norm": 50.0, + "learning_rate": 2.7560000000000004e-05, + "loss": 1.7989, + "step": 690 + }, + { + "epoch": 0.4992867332382311, + "grad_norm": 16.25, + "learning_rate": 2.7960000000000003e-05, + "loss": 1.7058, + "step": 700 + }, + { + "epoch": 0.4992867332382311, + "eval/acc": 30.23255729675293, + "step": 700 + }, + { + "epoch": 0.4992867332382311, + "eval_loss": 2.8809142112731934, + "eval_runtime": 0.2095, + "eval_samples_per_second": 205.208, + "eval_steps_per_second": 4.772, + "step": 700 + }, + { + "epoch": 0.5064194008559201, + "grad_norm": 14.625, + "learning_rate": 2.8360000000000003e-05, + "loss": 1.6542, + "step": 710 + }, + { + "epoch": 0.5135520684736091, + "grad_norm": 71.0, + "learning_rate": 2.8760000000000002e-05, + "loss": 1.6763, + "step": 720 + }, + { + "epoch": 0.5206847360912982, + "grad_norm": 17.125, + "learning_rate": 2.9160000000000005e-05, + "loss": 1.6858, + "step": 730 + }, + { + "epoch": 0.5278174037089871, + "grad_norm": 19.75, + "learning_rate": 2.9559999999999998e-05, + "loss": 1.6718, + "step": 740 + }, + { + "epoch": 0.5349500713266762, + "grad_norm": 13.375, + "learning_rate": 2.9959999999999998e-05, + "loss": 1.6164, + "step": 750 + }, + { + "epoch": 0.5420827389443652, + "grad_norm": 14.1875, + "learning_rate": 3.036e-05, + "loss": 1.6049, + "step": 760 + }, + { + "epoch": 0.5492154065620543, + "grad_norm": 35.75, + "learning_rate": 3.076e-05, + "loss": 1.5453, + "step": 770 + }, + { + "epoch": 0.5563480741797432, + "grad_norm": 28.75, + "learning_rate": 3.116e-05, + "loss": 1.4818, + "step": 780 + }, + { + "epoch": 0.5634807417974322, + "grad_norm": 17.375, + "learning_rate": 3.156e-05, + "loss": 1.5647, + "step": 790 + }, + { + "epoch": 0.5706134094151213, + "grad_norm": 13.6875, + "learning_rate": 3.196e-05, + "loss": 1.5206, + "step": 800 + }, + { + "epoch": 0.5706134094151213, + "eval/acc": 30.23255729675293, + "step": 800 + }, + { + "epoch": 0.5706134094151213, + "eval_loss": 2.8829538822174072, + "eval_runtime": 0.2122, + "eval_samples_per_second": 202.686, + "eval_steps_per_second": 4.714, + "step": 800 + }, + { + "epoch": 0.5777460770328102, + "grad_norm": 17.125, + "learning_rate": 3.236e-05, + "loss": 1.6124, + "step": 810 + }, + { + "epoch": 0.5848787446504993, + "grad_norm": 14.8125, + "learning_rate": 3.2760000000000005e-05, + "loss": 1.4254, + "step": 820 + }, + { + "epoch": 0.5920114122681883, + "grad_norm": 15.0, + "learning_rate": 3.316e-05, + "loss": 1.7124, + "step": 830 + }, + { + "epoch": 0.5991440798858774, + "grad_norm": 14.75, + "learning_rate": 3.3560000000000004e-05, + "loss": 1.5384, + "step": 840 + }, + { + "epoch": 0.6062767475035663, + "grad_norm": 31.5, + "learning_rate": 3.396e-05, + "loss": 1.4899, + "step": 850 + }, + { + "epoch": 0.6134094151212554, + "grad_norm": 13.875, + "learning_rate": 3.436e-05, + "loss": 1.5377, + "step": 860 + }, + { + "epoch": 0.6205420827389444, + "grad_norm": 14.9375, + "learning_rate": 3.4760000000000006e-05, + "loss": 1.4892, + "step": 870 + }, + { + "epoch": 0.6276747503566333, + "grad_norm": 37.25, + "learning_rate": 3.516e-05, + "loss": 1.4872, + "step": 880 + }, + { + "epoch": 0.6348074179743224, + "grad_norm": 18.875, + "learning_rate": 3.5560000000000005e-05, + "loss": 1.536, + "step": 890 + }, + { + "epoch": 0.6419400855920114, + "grad_norm": 18.625, + "learning_rate": 3.596e-05, + "loss": 1.5208, + "step": 900 + }, + { + "epoch": 0.6419400855920114, + "eval/acc": 30.23255729675293, + "step": 900 + }, + { + "epoch": 0.6419400855920114, + "eval_loss": 2.8081133365631104, + "eval_runtime": 0.2319, + "eval_samples_per_second": 185.434, + "eval_steps_per_second": 4.312, + "step": 900 + }, + { + "epoch": 0.6490727532097005, + "grad_norm": 19.875, + "learning_rate": 3.636e-05, + "loss": 1.4606, + "step": 910 + }, + { + "epoch": 0.6562054208273894, + "grad_norm": 12.625, + "learning_rate": 3.676e-05, + "loss": 1.4728, + "step": 920 + }, + { + "epoch": 0.6633380884450785, + "grad_norm": 15.0, + "learning_rate": 3.716e-05, + "loss": 1.449, + "step": 930 + }, + { + "epoch": 0.6704707560627675, + "grad_norm": 19.0, + "learning_rate": 3.756e-05, + "loss": 1.5292, + "step": 940 + }, + { + "epoch": 0.6776034236804565, + "grad_norm": 111.5, + "learning_rate": 3.796e-05, + "loss": 1.4891, + "step": 950 + }, + { + "epoch": 0.6847360912981455, + "grad_norm": 14.75, + "learning_rate": 3.836e-05, + "loss": 1.4202, + "step": 960 + }, + { + "epoch": 0.6918687589158345, + "grad_norm": 20.25, + "learning_rate": 3.876e-05, + "loss": 1.5258, + "step": 970 + }, + { + "epoch": 0.6990014265335235, + "grad_norm": 48.0, + "learning_rate": 3.9160000000000005e-05, + "loss": 1.3912, + "step": 980 + }, + { + "epoch": 0.7061340941512125, + "grad_norm": 13.0, + "learning_rate": 3.956e-05, + "loss": 1.4859, + "step": 990 + }, + { + "epoch": 0.7132667617689016, + "grad_norm": 15.5625, + "learning_rate": 3.9960000000000004e-05, + "loss": 1.4614, + "step": 1000 + }, + { + "epoch": 0.7132667617689016, + "eval/acc": 37.20930099487305, + "step": 1000 + }, + { + "epoch": 0.7132667617689016, + "eval_loss": 2.743621587753296, + "eval_runtime": 0.5811, + "eval_samples_per_second": 73.997, + "eval_steps_per_second": 1.721, + "step": 1000 + }, + { + "epoch": 0.7203994293865906, + "grad_norm": 16.625, + "learning_rate": 4.0360000000000007e-05, + "loss": 1.56, + "step": 1010 + }, + { + "epoch": 0.7275320970042796, + "grad_norm": 15.5625, + "learning_rate": 4.076e-05, + "loss": 1.4469, + "step": 1020 + }, + { + "epoch": 0.7346647646219686, + "grad_norm": 15.0, + "learning_rate": 4.1160000000000006e-05, + "loss": 1.381, + "step": 1030 + }, + { + "epoch": 0.7417974322396577, + "grad_norm": 13.625, + "learning_rate": 4.156e-05, + "loss": 1.3749, + "step": 1040 + }, + { + "epoch": 0.7489300998573466, + "grad_norm": 12.0625, + "learning_rate": 4.196e-05, + "loss": 1.3919, + "step": 1050 + }, + { + "epoch": 0.7560627674750356, + "grad_norm": 16.25, + "learning_rate": 4.236e-05, + "loss": 1.4208, + "step": 1060 + }, + { + "epoch": 0.7631954350927247, + "grad_norm": 27.75, + "learning_rate": 4.276e-05, + "loss": 1.3714, + "step": 1070 + }, + { + "epoch": 0.7703281027104137, + "grad_norm": 13.125, + "learning_rate": 4.316e-05, + "loss": 1.3344, + "step": 1080 + }, + { + "epoch": 0.7774607703281027, + "grad_norm": 11.9375, + "learning_rate": 4.356e-05, + "loss": 1.3291, + "step": 1090 + }, + { + "epoch": 0.7845934379457917, + "grad_norm": 17.125, + "learning_rate": 4.396e-05, + "loss": 1.3536, + "step": 1100 + }, + { + "epoch": 0.7845934379457917, + "eval/acc": 27.9069766998291, + "step": 1100 + }, + { + "epoch": 0.7845934379457917, + "eval_loss": 2.8128697872161865, + "eval_runtime": 0.484, + "eval_samples_per_second": 88.836, + "eval_steps_per_second": 2.066, + "step": 1100 + }, + { + "epoch": 0.7917261055634808, + "grad_norm": 13.3125, + "learning_rate": 4.436e-05, + "loss": 1.4598, + "step": 1110 + }, + { + "epoch": 0.7988587731811697, + "grad_norm": 15.25, + "learning_rate": 4.4760000000000005e-05, + "loss": 1.3795, + "step": 1120 + }, + { + "epoch": 0.8059914407988588, + "grad_norm": 12.0625, + "learning_rate": 4.516e-05, + "loss": 1.2518, + "step": 1130 + }, + { + "epoch": 0.8131241084165478, + "grad_norm": 16.625, + "learning_rate": 4.5560000000000004e-05, + "loss": 1.3104, + "step": 1140 + }, + { + "epoch": 0.8202567760342369, + "grad_norm": 11.875, + "learning_rate": 4.596e-05, + "loss": 1.2996, + "step": 1150 + }, + { + "epoch": 0.8273894436519258, + "grad_norm": 24.125, + "learning_rate": 4.636e-05, + "loss": 1.2067, + "step": 1160 + }, + { + "epoch": 0.8345221112696148, + "grad_norm": 11.0, + "learning_rate": 4.6760000000000006e-05, + "loss": 1.3035, + "step": 1170 + }, + { + "epoch": 0.8416547788873039, + "grad_norm": 13.125, + "learning_rate": 4.716e-05, + "loss": 1.2859, + "step": 1180 + }, + { + "epoch": 0.8487874465049928, + "grad_norm": 11.0, + "learning_rate": 4.7560000000000005e-05, + "loss": 1.3982, + "step": 1190 + }, + { + "epoch": 0.8559201141226819, + "grad_norm": 12.875, + "learning_rate": 4.796e-05, + "loss": 1.299, + "step": 1200 + }, + { + "epoch": 0.8559201141226819, + "eval/acc": 34.88372039794922, + "step": 1200 + }, + { + "epoch": 0.8559201141226819, + "eval_loss": 2.7250428199768066, + "eval_runtime": 0.3522, + "eval_samples_per_second": 122.084, + "eval_steps_per_second": 2.839, + "step": 1200 + }, + { + "epoch": 0.8630527817403709, + "grad_norm": 11.25, + "learning_rate": 4.836e-05, + "loss": 1.3549, + "step": 1210 + }, + { + "epoch": 0.8701854493580599, + "grad_norm": 15.25, + "learning_rate": 4.876e-05, + "loss": 1.3649, + "step": 1220 + }, + { + "epoch": 0.8773181169757489, + "grad_norm": 22.0, + "learning_rate": 4.9160000000000004e-05, + "loss": 1.2441, + "step": 1230 + }, + { + "epoch": 0.884450784593438, + "grad_norm": 12.375, + "learning_rate": 4.956e-05, + "loss": 1.2196, + "step": 1240 + }, + { + "epoch": 0.891583452211127, + "grad_norm": 14.25, + "learning_rate": 4.996e-05, + "loss": 1.3274, + "step": 1250 + }, + { + "epoch": 0.8987161198288159, + "grad_norm": 10.0625, + "learning_rate": 5.0360000000000006e-05, + "loss": 1.2896, + "step": 1260 + }, + { + "epoch": 0.905848787446505, + "grad_norm": 16.875, + "learning_rate": 5.076000000000001e-05, + "loss": 1.3019, + "step": 1270 + }, + { + "epoch": 0.912981455064194, + "grad_norm": 26.375, + "learning_rate": 5.1160000000000005e-05, + "loss": 1.3756, + "step": 1280 + }, + { + "epoch": 0.920114122681883, + "grad_norm": 18.25, + "learning_rate": 5.1559999999999994e-05, + "loss": 1.327, + "step": 1290 + }, + { + "epoch": 0.927246790299572, + "grad_norm": 11.3125, + "learning_rate": 5.196e-05, + "loss": 1.3237, + "step": 1300 + }, + { + "epoch": 0.927246790299572, + "eval/acc": 39.53488540649414, + "step": 1300 + }, + { + "epoch": 0.927246790299572, + "eval_loss": 2.733259916305542, + "eval_runtime": 0.51, + "eval_samples_per_second": 84.32, + "eval_steps_per_second": 1.961, + "step": 1300 + }, + { + "epoch": 0.9343794579172611, + "grad_norm": 18.125, + "learning_rate": 5.236e-05, + "loss": 1.256, + "step": 1310 + }, + { + "epoch": 0.9415121255349501, + "grad_norm": 10.25, + "learning_rate": 5.2759999999999996e-05, + "loss": 1.1386, + "step": 1320 + }, + { + "epoch": 0.948644793152639, + "grad_norm": 11.1875, + "learning_rate": 5.316e-05, + "loss": 1.3115, + "step": 1330 + }, + { + "epoch": 0.9557774607703281, + "grad_norm": 10.875, + "learning_rate": 5.356e-05, + "loss": 1.2315, + "step": 1340 + }, + { + "epoch": 0.9629101283880172, + "grad_norm": 12.0, + "learning_rate": 5.396e-05, + "loss": 1.3327, + "step": 1350 + }, + { + "epoch": 0.9700427960057061, + "grad_norm": 11.75, + "learning_rate": 5.436e-05, + "loss": 1.4052, + "step": 1360 + }, + { + "epoch": 0.9771754636233951, + "grad_norm": 11.4375, + "learning_rate": 5.476e-05, + "loss": 1.1349, + "step": 1370 + }, + { + "epoch": 0.9843081312410842, + "grad_norm": 15.125, + "learning_rate": 5.516e-05, + "loss": 1.3803, + "step": 1380 + }, + { + "epoch": 0.9914407988587732, + "grad_norm": 16.75, + "learning_rate": 5.556e-05, + "loss": 1.3536, + "step": 1390 + }, + { + "epoch": 0.9985734664764622, + "grad_norm": 10.625, + "learning_rate": 5.596e-05, + "loss": 1.2981, + "step": 1400 + }, + { + "epoch": 0.9985734664764622, + "eval/acc": 39.53488540649414, + "step": 1400 + }, + { + "epoch": 0.9985734664764622, + "eval_loss": 2.597245693206787, + "eval_runtime": 0.2116, + "eval_samples_per_second": 203.214, + "eval_steps_per_second": 4.726, + "step": 1400 + }, + { + "epoch": 1.005706134094151, + "grad_norm": 15.0, + "learning_rate": 5.636e-05, + "loss": 1.2173, + "step": 1410 + }, + { + "epoch": 1.0128388017118402, + "grad_norm": 15.4375, + "learning_rate": 5.6760000000000005e-05, + "loss": 1.1965, + "step": 1420 + }, + { + "epoch": 1.0199714693295292, + "grad_norm": 21.625, + "learning_rate": 5.716e-05, + "loss": 1.2494, + "step": 1430 + }, + { + "epoch": 1.0271041369472182, + "grad_norm": 13.0, + "learning_rate": 5.7560000000000005e-05, + "loss": 1.1948, + "step": 1440 + }, + { + "epoch": 1.0342368045649073, + "grad_norm": 11.0, + "learning_rate": 5.796e-05, + "loss": 1.2641, + "step": 1450 + }, + { + "epoch": 1.0413694721825963, + "grad_norm": 13.1875, + "learning_rate": 5.8360000000000004e-05, + "loss": 1.2526, + "step": 1460 + }, + { + "epoch": 1.0485021398002854, + "grad_norm": 46.0, + "learning_rate": 5.876000000000001e-05, + "loss": 1.0786, + "step": 1470 + }, + { + "epoch": 1.0556348074179742, + "grad_norm": 11.0, + "learning_rate": 5.916e-05, + "loss": 1.3154, + "step": 1480 + }, + { + "epoch": 1.0627674750356633, + "grad_norm": 18.75, + "learning_rate": 5.9560000000000006e-05, + "loss": 1.257, + "step": 1490 + }, + { + "epoch": 1.0699001426533523, + "grad_norm": 11.5625, + "learning_rate": 5.996e-05, + "loss": 1.2636, + "step": 1500 + }, + { + "epoch": 1.0699001426533523, + "eval/acc": 32.55813980102539, + "step": 1500 + }, + { + "epoch": 1.0699001426533523, + "eval_loss": 2.9036648273468018, + "eval_runtime": 0.4893, + "eval_samples_per_second": 87.889, + "eval_steps_per_second": 2.044, + "step": 1500 + }, + { + "epoch": 1.0770328102710414, + "grad_norm": 13.75, + "learning_rate": 6.0360000000000005e-05, + "loss": 1.2602, + "step": 1510 + }, + { + "epoch": 1.0841654778887304, + "grad_norm": 11.625, + "learning_rate": 6.076000000000001e-05, + "loss": 1.0823, + "step": 1520 + }, + { + "epoch": 1.0912981455064195, + "grad_norm": 9.0, + "learning_rate": 6.116e-05, + "loss": 1.3059, + "step": 1530 + }, + { + "epoch": 1.0984308131241085, + "grad_norm": 10.4375, + "learning_rate": 6.156e-05, + "loss": 1.2006, + "step": 1540 + }, + { + "epoch": 1.1055634807417973, + "grad_norm": 15.75, + "learning_rate": 6.196000000000001e-05, + "loss": 1.3731, + "step": 1550 + }, + { + "epoch": 1.1126961483594864, + "grad_norm": 9.5, + "learning_rate": 6.236e-05, + "loss": 1.1925, + "step": 1560 + }, + { + "epoch": 1.1198288159771754, + "grad_norm": 9.3125, + "learning_rate": 6.276e-05, + "loss": 1.1554, + "step": 1570 + }, + { + "epoch": 1.1269614835948645, + "grad_norm": 12.0625, + "learning_rate": 6.316000000000001e-05, + "loss": 1.0875, + "step": 1580 + }, + { + "epoch": 1.1340941512125535, + "grad_norm": 10.875, + "learning_rate": 6.356000000000001e-05, + "loss": 1.1895, + "step": 1590 + }, + { + "epoch": 1.1412268188302426, + "grad_norm": 12.0625, + "learning_rate": 6.396e-05, + "loss": 1.2354, + "step": 1600 + }, + { + "epoch": 1.1412268188302426, + "eval/acc": 34.88372039794922, + "step": 1600 + }, + { + "epoch": 1.1412268188302426, + "eval_loss": 2.9267771244049072, + "eval_runtime": 0.2334, + "eval_samples_per_second": 184.258, + "eval_steps_per_second": 4.285, + "step": 1600 + }, + { + "epoch": 1.1483594864479316, + "grad_norm": 12.375, + "learning_rate": 6.436e-05, + "loss": 1.2167, + "step": 1610 + }, + { + "epoch": 1.1554921540656204, + "grad_norm": 10.375, + "learning_rate": 6.476e-05, + "loss": 1.1638, + "step": 1620 + }, + { + "epoch": 1.1626248216833095, + "grad_norm": 9.8125, + "learning_rate": 6.515999999999999e-05, + "loss": 1.1666, + "step": 1630 + }, + { + "epoch": 1.1697574893009985, + "grad_norm": 12.6875, + "learning_rate": 6.556e-05, + "loss": 1.1961, + "step": 1640 + }, + { + "epoch": 1.1768901569186876, + "grad_norm": 9.875, + "learning_rate": 6.596e-05, + "loss": 1.2558, + "step": 1650 + }, + { + "epoch": 1.1840228245363766, + "grad_norm": 10.375, + "learning_rate": 6.636e-05, + "loss": 1.1728, + "step": 1660 + }, + { + "epoch": 1.1911554921540657, + "grad_norm": 10.1875, + "learning_rate": 6.676e-05, + "loss": 1.2947, + "step": 1670 + }, + { + "epoch": 1.1982881597717547, + "grad_norm": 11.3125, + "learning_rate": 6.716e-05, + "loss": 1.2151, + "step": 1680 + }, + { + "epoch": 1.2054208273894436, + "grad_norm": 10.5, + "learning_rate": 6.756e-05, + "loss": 1.0612, + "step": 1690 + }, + { + "epoch": 1.2125534950071326, + "grad_norm": 11.9375, + "learning_rate": 6.796e-05, + "loss": 1.1079, + "step": 1700 + }, + { + "epoch": 1.2125534950071326, + "eval/acc": 37.20930099487305, + "step": 1700 + }, + { + "epoch": 1.2125534950071326, + "eval_loss": 2.9951517581939697, + "eval_runtime": 0.2214, + "eval_samples_per_second": 194.192, + "eval_steps_per_second": 4.516, + "step": 1700 + }, + { + "epoch": 1.2196861626248217, + "grad_norm": 11.25, + "learning_rate": 6.836e-05, + "loss": 1.1541, + "step": 1710 + }, + { + "epoch": 1.2268188302425107, + "grad_norm": 8.125, + "learning_rate": 6.876e-05, + "loss": 1.0772, + "step": 1720 + }, + { + "epoch": 1.2339514978601998, + "grad_norm": 18.125, + "learning_rate": 6.916000000000001e-05, + "loss": 1.1623, + "step": 1730 + }, + { + "epoch": 1.2410841654778888, + "grad_norm": 10.125, + "learning_rate": 6.956e-05, + "loss": 1.182, + "step": 1740 + }, + { + "epoch": 1.2482168330955776, + "grad_norm": 9.75, + "learning_rate": 6.996e-05, + "loss": 1.0796, + "step": 1750 + }, + { + "epoch": 1.2553495007132667, + "grad_norm": 10.5, + "learning_rate": 7.036e-05, + "loss": 1.2374, + "step": 1760 + }, + { + "epoch": 1.2624821683309557, + "grad_norm": 20.875, + "learning_rate": 7.076000000000001e-05, + "loss": 1.2718, + "step": 1770 + }, + { + "epoch": 1.2696148359486448, + "grad_norm": 10.3125, + "learning_rate": 7.116e-05, + "loss": 1.0922, + "step": 1780 + }, + { + "epoch": 1.2767475035663338, + "grad_norm": 8.6875, + "learning_rate": 7.156e-05, + "loss": 1.0637, + "step": 1790 + }, + { + "epoch": 1.2838801711840229, + "grad_norm": 9.5, + "learning_rate": 7.196000000000001e-05, + "loss": 1.1661, + "step": 1800 + }, + { + "epoch": 1.2838801711840229, + "eval/acc": 39.53488540649414, + "step": 1800 + }, + { + "epoch": 1.2838801711840229, + "eval_loss": 2.763897180557251, + "eval_runtime": 0.2111, + "eval_samples_per_second": 203.712, + "eval_steps_per_second": 4.737, + "step": 1800 + }, + { + "epoch": 1.291012838801712, + "grad_norm": 14.3125, + "learning_rate": 7.236e-05, + "loss": 1.1139, + "step": 1810 + }, + { + "epoch": 1.298145506419401, + "grad_norm": 41.5, + "learning_rate": 7.276e-05, + "loss": 1.0817, + "step": 1820 + }, + { + "epoch": 1.3052781740370898, + "grad_norm": 15.125, + "learning_rate": 7.316000000000001e-05, + "loss": 1.2462, + "step": 1830 + }, + { + "epoch": 1.3124108416547788, + "grad_norm": 33.25, + "learning_rate": 7.356000000000001e-05, + "loss": 1.1143, + "step": 1840 + }, + { + "epoch": 1.3195435092724679, + "grad_norm": 13.625, + "learning_rate": 7.396e-05, + "loss": 1.1783, + "step": 1850 + }, + { + "epoch": 1.326676176890157, + "grad_norm": 18.375, + "learning_rate": 7.436000000000001e-05, + "loss": 1.2101, + "step": 1860 + }, + { + "epoch": 1.333808844507846, + "grad_norm": 13.875, + "learning_rate": 7.476000000000001e-05, + "loss": 1.1348, + "step": 1870 + }, + { + "epoch": 1.340941512125535, + "grad_norm": 13.9375, + "learning_rate": 7.516e-05, + "loss": 1.0747, + "step": 1880 + }, + { + "epoch": 1.3480741797432239, + "grad_norm": 29.75, + "learning_rate": 7.556000000000002e-05, + "loss": 1.1895, + "step": 1890 + }, + { + "epoch": 1.355206847360913, + "grad_norm": 17.25, + "learning_rate": 7.596000000000001e-05, + "loss": 1.2512, + "step": 1900 + }, + { + "epoch": 1.355206847360913, + "eval/acc": 39.53488540649414, + "step": 1900 + }, + { + "epoch": 1.355206847360913, + "eval_loss": 2.9442026615142822, + "eval_runtime": 0.2199, + "eval_samples_per_second": 195.51, + "eval_steps_per_second": 4.547, + "step": 1900 + }, + { + "epoch": 1.362339514978602, + "grad_norm": 21.125, + "learning_rate": 7.636e-05, + "loss": 1.1306, + "step": 1910 + }, + { + "epoch": 1.369472182596291, + "grad_norm": 9.0625, + "learning_rate": 7.676e-05, + "loss": 1.1139, + "step": 1920 + }, + { + "epoch": 1.37660485021398, + "grad_norm": 30.25, + "learning_rate": 7.716e-05, + "loss": 1.1595, + "step": 1930 + }, + { + "epoch": 1.383737517831669, + "grad_norm": 13.6875, + "learning_rate": 7.756e-05, + "loss": 1.2437, + "step": 1940 + }, + { + "epoch": 1.3908701854493581, + "grad_norm": 12.3125, + "learning_rate": 7.796e-05, + "loss": 1.1005, + "step": 1950 + }, + { + "epoch": 1.3980028530670472, + "grad_norm": 9.8125, + "learning_rate": 7.836e-05, + "loss": 1.0748, + "step": 1960 + }, + { + "epoch": 1.405135520684736, + "grad_norm": 9.125, + "learning_rate": 7.876e-05, + "loss": 1.1576, + "step": 1970 + }, + { + "epoch": 1.412268188302425, + "grad_norm": 11.375, + "learning_rate": 7.916e-05, + "loss": 1.0982, + "step": 1980 + }, + { + "epoch": 1.4194008559201141, + "grad_norm": 10.375, + "learning_rate": 7.956e-05, + "loss": 1.132, + "step": 1990 + }, + { + "epoch": 1.4265335235378032, + "grad_norm": 16.375, + "learning_rate": 7.996e-05, + "loss": 1.121, + "step": 2000 + }, + { + "epoch": 1.4265335235378032, + "eval/acc": 39.53488540649414, + "step": 2000 + }, + { + "epoch": 1.4265335235378032, + "eval_loss": 2.900298595428467, + "eval_runtime": 0.2112, + "eval_samples_per_second": 203.622, + "eval_steps_per_second": 4.735, + "step": 2000 + }, + { + "epoch": 1.4336661911554922, + "grad_norm": 9.125, + "learning_rate": 8.036e-05, + "loss": 1.2079, + "step": 2010 + }, + { + "epoch": 1.440798858773181, + "grad_norm": 12.125, + "learning_rate": 8.076e-05, + "loss": 1.1098, + "step": 2020 + }, + { + "epoch": 1.44793152639087, + "grad_norm": 8.8125, + "learning_rate": 8.116e-05, + "loss": 0.9849, + "step": 2030 + }, + { + "epoch": 1.4550641940085591, + "grad_norm": 9.0, + "learning_rate": 8.156e-05, + "loss": 1.0905, + "step": 2040 + }, + { + "epoch": 1.4621968616262482, + "grad_norm": 15.4375, + "learning_rate": 8.196000000000001e-05, + "loss": 1.2211, + "step": 2050 + }, + { + "epoch": 1.4693295292439372, + "grad_norm": 9.4375, + "learning_rate": 8.236e-05, + "loss": 1.0968, + "step": 2060 + }, + { + "epoch": 1.4764621968616263, + "grad_norm": 9.0, + "learning_rate": 8.276e-05, + "loss": 1.0973, + "step": 2070 + }, + { + "epoch": 1.4835948644793153, + "grad_norm": 8.0625, + "learning_rate": 8.316000000000001e-05, + "loss": 1.1012, + "step": 2080 + }, + { + "epoch": 1.4907275320970044, + "grad_norm": 31.0, + "learning_rate": 8.356e-05, + "loss": 1.0437, + "step": 2090 + }, + { + "epoch": 1.4978601997146934, + "grad_norm": 10.8125, + "learning_rate": 8.396e-05, + "loss": 1.0934, + "step": 2100 + }, + { + "epoch": 1.4978601997146934, + "eval/acc": 41.86046600341797, + "step": 2100 + }, + { + "epoch": 1.4978601997146934, + "eval_loss": 2.9042038917541504, + "eval_runtime": 0.2146, + "eval_samples_per_second": 200.363, + "eval_steps_per_second": 4.66, + "step": 2100 + }, + { + "epoch": 1.5049928673323825, + "grad_norm": 15.3125, + "learning_rate": 8.436000000000001e-05, + "loss": 1.0862, + "step": 2110 + }, + { + "epoch": 1.5121255349500713, + "grad_norm": 10.1875, + "learning_rate": 8.476000000000001e-05, + "loss": 1.0786, + "step": 2120 + }, + { + "epoch": 1.5192582025677603, + "grad_norm": 8.25, + "learning_rate": 8.516e-05, + "loss": 1.1496, + "step": 2130 + }, + { + "epoch": 1.5263908701854494, + "grad_norm": 12.6875, + "learning_rate": 8.556e-05, + "loss": 1.1132, + "step": 2140 + }, + { + "epoch": 1.5335235378031382, + "grad_norm": 21.375, + "learning_rate": 8.596000000000001e-05, + "loss": 1.1043, + "step": 2150 + }, + { + "epoch": 1.5406562054208273, + "grad_norm": 8.5625, + "learning_rate": 8.636e-05, + "loss": 1.2549, + "step": 2160 + }, + { + "epoch": 1.5477888730385163, + "grad_norm": 8.6875, + "learning_rate": 8.676e-05, + "loss": 1.115, + "step": 2170 + }, + { + "epoch": 1.5549215406562054, + "grad_norm": 8.375, + "learning_rate": 8.716000000000001e-05, + "loss": 1.1963, + "step": 2180 + }, + { + "epoch": 1.5620542082738944, + "grad_norm": 8.3125, + "learning_rate": 8.756000000000001e-05, + "loss": 1.1697, + "step": 2190 + }, + { + "epoch": 1.5691868758915835, + "grad_norm": 7.40625, + "learning_rate": 8.796e-05, + "loss": 0.9716, + "step": 2200 + }, + { + "epoch": 1.5691868758915835, + "eval/acc": 39.53488540649414, + "step": 2200 + }, + { + "epoch": 1.5691868758915835, + "eval_loss": 3.021289587020874, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.958, + "eval_steps_per_second": 4.72, + "step": 2200 + }, + { + "epoch": 1.5763195435092725, + "grad_norm": 10.0, + "learning_rate": 8.836000000000001e-05, + "loss": 1.0254, + "step": 2210 + }, + { + "epoch": 1.5834522111269616, + "grad_norm": 12.625, + "learning_rate": 8.876e-05, + "loss": 1.1672, + "step": 2220 + }, + { + "epoch": 1.5905848787446506, + "grad_norm": 11.5, + "learning_rate": 8.916e-05, + "loss": 1.0656, + "step": 2230 + }, + { + "epoch": 1.5977175463623396, + "grad_norm": 8.8125, + "learning_rate": 8.956e-05, + "loss": 1.035, + "step": 2240 + }, + { + "epoch": 1.6048502139800287, + "grad_norm": 9.25, + "learning_rate": 8.996e-05, + "loss": 1.0972, + "step": 2250 + }, + { + "epoch": 1.6119828815977175, + "grad_norm": 7.71875, + "learning_rate": 9.036e-05, + "loss": 1.0148, + "step": 2260 + }, + { + "epoch": 1.6191155492154066, + "grad_norm": 13.5, + "learning_rate": 9.076e-05, + "loss": 1.1202, + "step": 2270 + }, + { + "epoch": 1.6262482168330956, + "grad_norm": 9.125, + "learning_rate": 9.116e-05, + "loss": 1.1134, + "step": 2280 + }, + { + "epoch": 1.6333808844507844, + "grad_norm": 15.25, + "learning_rate": 9.156e-05, + "loss": 1.0373, + "step": 2290 + }, + { + "epoch": 1.6405135520684735, + "grad_norm": 9.125, + "learning_rate": 9.196000000000001e-05, + "loss": 1.0654, + "step": 2300 + }, + { + "epoch": 1.6405135520684735, + "eval/acc": 37.20930099487305, + "step": 2300 + }, + { + "epoch": 1.6405135520684735, + "eval_loss": 2.929560422897339, + "eval_runtime": 0.2167, + "eval_samples_per_second": 198.467, + "eval_steps_per_second": 4.616, + "step": 2300 + }, + { + "epoch": 1.6476462196861625, + "grad_norm": 8.25, + "learning_rate": 9.236e-05, + "loss": 1.0218, + "step": 2310 + }, + { + "epoch": 1.6547788873038516, + "grad_norm": 9.5625, + "learning_rate": 9.276e-05, + "loss": 1.106, + "step": 2320 + }, + { + "epoch": 1.6619115549215406, + "grad_norm": 8.25, + "learning_rate": 9.316000000000001e-05, + "loss": 1.0558, + "step": 2330 + }, + { + "epoch": 1.6690442225392297, + "grad_norm": 8.5625, + "learning_rate": 9.356e-05, + "loss": 0.9931, + "step": 2340 + }, + { + "epoch": 1.6761768901569187, + "grad_norm": 11.5625, + "learning_rate": 9.396e-05, + "loss": 1.0683, + "step": 2350 + }, + { + "epoch": 1.6833095577746078, + "grad_norm": 10.0625, + "learning_rate": 9.436e-05, + "loss": 1.0631, + "step": 2360 + }, + { + "epoch": 1.6904422253922968, + "grad_norm": 9.5625, + "learning_rate": 9.476000000000001e-05, + "loss": 1.049, + "step": 2370 + }, + { + "epoch": 1.6975748930099859, + "grad_norm": 12.8125, + "learning_rate": 9.516e-05, + "loss": 1.0259, + "step": 2380 + }, + { + "epoch": 1.7047075606276747, + "grad_norm": 9.0625, + "learning_rate": 9.556e-05, + "loss": 1.0085, + "step": 2390 + }, + { + "epoch": 1.7118402282453637, + "grad_norm": 131.0, + "learning_rate": 9.596000000000001e-05, + "loss": 0.944, + "step": 2400 + }, + { + "epoch": 1.7118402282453637, + "eval/acc": 37.20930099487305, + "step": 2400 + }, + { + "epoch": 1.7118402282453637, + "eval_loss": 3.0231707096099854, + "eval_runtime": 0.2075, + "eval_samples_per_second": 207.206, + "eval_steps_per_second": 4.819, + "step": 2400 + }, + { + "epoch": 1.7189728958630528, + "grad_norm": 8.375, + "learning_rate": 9.636e-05, + "loss": 1.0069, + "step": 2410 + }, + { + "epoch": 1.7261055634807418, + "grad_norm": 8.3125, + "learning_rate": 9.676e-05, + "loss": 1.0648, + "step": 2420 + }, + { + "epoch": 1.7332382310984307, + "grad_norm": 11.0625, + "learning_rate": 9.716000000000001e-05, + "loss": 1.0594, + "step": 2430 + }, + { + "epoch": 1.7403708987161197, + "grad_norm": 8.75, + "learning_rate": 9.756000000000001e-05, + "loss": 1.2082, + "step": 2440 + }, + { + "epoch": 1.7475035663338088, + "grad_norm": 9.875, + "learning_rate": 9.796e-05, + "loss": 1.0225, + "step": 2450 + }, + { + "epoch": 1.7546362339514978, + "grad_norm": 9.5625, + "learning_rate": 9.836000000000001e-05, + "loss": 0.9975, + "step": 2460 + }, + { + "epoch": 1.7617689015691869, + "grad_norm": 21.0, + "learning_rate": 9.876000000000001e-05, + "loss": 0.9533, + "step": 2470 + }, + { + "epoch": 1.768901569186876, + "grad_norm": 7.65625, + "learning_rate": 9.916e-05, + "loss": 0.9619, + "step": 2480 + }, + { + "epoch": 1.776034236804565, + "grad_norm": 13.625, + "learning_rate": 9.956e-05, + "loss": 0.9425, + "step": 2490 + }, + { + "epoch": 1.783166904422254, + "grad_norm": 12.375, + "learning_rate": 9.996000000000001e-05, + "loss": 0.9893, + "step": 2500 + }, + { + "epoch": 1.783166904422254, + "eval/acc": 39.53488540649414, + "step": 2500 + }, + { + "epoch": 1.783166904422254, + "eval_loss": 2.8728344440460205, + "eval_runtime": 0.2215, + "eval_samples_per_second": 194.132, + "eval_steps_per_second": 4.515, + "step": 2500 + }, + { + "epoch": 1.790299572039943, + "grad_norm": 10.0, + "learning_rate": 9.996000000000001e-05, + "loss": 1.0137, + "step": 2510 + }, + { + "epoch": 1.797432239657632, + "grad_norm": 10.125, + "learning_rate": 9.991555555555556e-05, + "loss": 1.059, + "step": 2520 + }, + { + "epoch": 1.804564907275321, + "grad_norm": 32.0, + "learning_rate": 9.987111111111111e-05, + "loss": 1.0498, + "step": 2530 + }, + { + "epoch": 1.81169757489301, + "grad_norm": 10.125, + "learning_rate": 9.982666666666667e-05, + "loss": 1.1431, + "step": 2540 + }, + { + "epoch": 1.818830242510699, + "grad_norm": 7.90625, + "learning_rate": 9.978222222222223e-05, + "loss": 1.0715, + "step": 2550 + }, + { + "epoch": 1.825962910128388, + "grad_norm": 10.9375, + "learning_rate": 9.973777777777778e-05, + "loss": 1.0446, + "step": 2560 + }, + { + "epoch": 1.833095577746077, + "grad_norm": 13.0, + "learning_rate": 9.969333333333334e-05, + "loss": 1.0291, + "step": 2570 + }, + { + "epoch": 1.840228245363766, + "grad_norm": 9.75, + "learning_rate": 9.964888888888889e-05, + "loss": 0.9713, + "step": 2580 + }, + { + "epoch": 1.847360912981455, + "grad_norm": 10.5625, + "learning_rate": 9.960444444444444e-05, + "loss": 1.2157, + "step": 2590 + }, + { + "epoch": 1.854493580599144, + "grad_norm": 9.3125, + "learning_rate": 9.956e-05, + "loss": 1.0455, + "step": 2600 + }, + { + "epoch": 1.854493580599144, + "eval/acc": 37.20930099487305, + "step": 2600 + }, + { + "epoch": 1.854493580599144, + "eval_loss": 2.951470375061035, + "eval_runtime": 0.2899, + "eval_samples_per_second": 148.338, + "eval_steps_per_second": 3.45, + "step": 2600 + }, + { + "epoch": 1.861626248216833, + "grad_norm": 10.5, + "learning_rate": 9.951555555555556e-05, + "loss": 1.0604, + "step": 2610 + }, + { + "epoch": 1.8687589158345221, + "grad_norm": 9.375, + "learning_rate": 9.947111111111111e-05, + "loss": 0.8715, + "step": 2620 + }, + { + "epoch": 1.8758915834522112, + "grad_norm": 10.4375, + "learning_rate": 9.942666666666667e-05, + "loss": 1.0034, + "step": 2630 + }, + { + "epoch": 1.8830242510699002, + "grad_norm": 8.0625, + "learning_rate": 9.938222222222224e-05, + "loss": 1.0557, + "step": 2640 + }, + { + "epoch": 1.8901569186875893, + "grad_norm": 7.21875, + "learning_rate": 9.933777777777779e-05, + "loss": 0.974, + "step": 2650 + }, + { + "epoch": 1.8972895863052783, + "grad_norm": 10.875, + "learning_rate": 9.929333333333333e-05, + "loss": 1.1366, + "step": 2660 + }, + { + "epoch": 1.9044222539229672, + "grad_norm": 28.75, + "learning_rate": 9.92488888888889e-05, + "loss": 1.0135, + "step": 2670 + }, + { + "epoch": 1.9115549215406562, + "grad_norm": 10.5625, + "learning_rate": 9.920444444444444e-05, + "loss": 1.0263, + "step": 2680 + }, + { + "epoch": 1.9186875891583453, + "grad_norm": 6.65625, + "learning_rate": 9.916e-05, + "loss": 0.9952, + "step": 2690 + }, + { + "epoch": 1.925820256776034, + "grad_norm": 8.8125, + "learning_rate": 9.911555555555557e-05, + "loss": 1.0438, + "step": 2700 + }, + { + "epoch": 1.925820256776034, + "eval/acc": 39.53488540649414, + "step": 2700 + }, + { + "epoch": 1.925820256776034, + "eval_loss": 2.8668925762176514, + "eval_runtime": 0.2154, + "eval_samples_per_second": 199.649, + "eval_steps_per_second": 4.643, + "step": 2700 + }, + { + "epoch": 1.9329529243937231, + "grad_norm": 7.1875, + "learning_rate": 9.907111111111112e-05, + "loss": 0.9522, + "step": 2710 + }, + { + "epoch": 1.9400855920114122, + "grad_norm": 8.6875, + "learning_rate": 9.902666666666666e-05, + "loss": 0.9729, + "step": 2720 + }, + { + "epoch": 1.9472182596291012, + "grad_norm": 10.0625, + "learning_rate": 9.898222222222223e-05, + "loss": 1.0528, + "step": 2730 + }, + { + "epoch": 1.9543509272467903, + "grad_norm": 8.8125, + "learning_rate": 9.893777777777779e-05, + "loss": 1.1212, + "step": 2740 + }, + { + "epoch": 1.9614835948644793, + "grad_norm": 9.1875, + "learning_rate": 9.889333333333334e-05, + "loss": 0.9866, + "step": 2750 + }, + { + "epoch": 1.9686162624821684, + "grad_norm": 8.25, + "learning_rate": 9.884888888888889e-05, + "loss": 0.8616, + "step": 2760 + }, + { + "epoch": 1.9757489300998574, + "grad_norm": 8.5625, + "learning_rate": 9.880444444444445e-05, + "loss": 0.9972, + "step": 2770 + }, + { + "epoch": 1.9828815977175465, + "grad_norm": 10.0625, + "learning_rate": 9.876000000000001e-05, + "loss": 0.9781, + "step": 2780 + }, + { + "epoch": 1.9900142653352355, + "grad_norm": 10.75, + "learning_rate": 9.871555555555556e-05, + "loss": 1.0579, + "step": 2790 + }, + { + "epoch": 1.9971469329529246, + "grad_norm": 8.25, + "learning_rate": 9.867111111111112e-05, + "loss": 1.0323, + "step": 2800 + }, + { + "epoch": 1.9971469329529246, + "eval/acc": 37.20930099487305, + "step": 2800 + }, + { + "epoch": 1.9971469329529246, + "eval_loss": 2.9081883430480957, + "eval_runtime": 0.2058, + "eval_samples_per_second": 208.905, + "eval_steps_per_second": 4.858, + "step": 2800 + }, + { + "epoch": 2.0042796005706136, + "grad_norm": 10.25, + "learning_rate": 9.862666666666667e-05, + "loss": 1.0597, + "step": 2810 + }, + { + "epoch": 2.011412268188302, + "grad_norm": 7.0625, + "learning_rate": 9.858222222222223e-05, + "loss": 0.9582, + "step": 2820 + }, + { + "epoch": 2.0185449358059913, + "grad_norm": 7.0625, + "learning_rate": 9.853777777777778e-05, + "loss": 1.0058, + "step": 2830 + }, + { + "epoch": 2.0256776034236803, + "grad_norm": 7.09375, + "learning_rate": 9.849333333333334e-05, + "loss": 1.009, + "step": 2840 + }, + { + "epoch": 2.0328102710413694, + "grad_norm": 8.9375, + "learning_rate": 9.844888888888889e-05, + "loss": 0.93, + "step": 2850 + }, + { + "epoch": 2.0399429386590584, + "grad_norm": 8.1875, + "learning_rate": 9.840444444444445e-05, + "loss": 1.0953, + "step": 2860 + }, + { + "epoch": 2.0470756062767475, + "grad_norm": 7.78125, + "learning_rate": 9.836000000000001e-05, + "loss": 1.0437, + "step": 2870 + }, + { + "epoch": 2.0542082738944365, + "grad_norm": 8.75, + "learning_rate": 9.831555555555556e-05, + "loss": 0.9873, + "step": 2880 + }, + { + "epoch": 2.0613409415121255, + "grad_norm": 8.375, + "learning_rate": 9.827111111111111e-05, + "loss": 0.9414, + "step": 2890 + }, + { + "epoch": 2.0684736091298146, + "grad_norm": 9.0, + "learning_rate": 9.822666666666667e-05, + "loss": 0.9625, + "step": 2900 + }, + { + "epoch": 2.0684736091298146, + "eval/acc": 51.16279220581055, + "step": 2900 + }, + { + "epoch": 2.0684736091298146, + "eval_loss": 1.884637713432312, + "eval_runtime": 1.4017, + "eval_samples_per_second": 30.678, + "eval_steps_per_second": 0.713, + "step": 2900 + }, + { + "epoch": 2.0756062767475036, + "grad_norm": 9.5625, + "learning_rate": 9.818222222222223e-05, + "loss": 1.0246, + "step": 2910 + }, + { + "epoch": 2.0827389443651927, + "grad_norm": 8.125, + "learning_rate": 9.813777777777778e-05, + "loss": 0.9646, + "step": 2920 + }, + { + "epoch": 2.0898716119828817, + "grad_norm": 8.3125, + "learning_rate": 9.809333333333333e-05, + "loss": 1.0022, + "step": 2930 + }, + { + "epoch": 2.097004279600571, + "grad_norm": 8.625, + "learning_rate": 9.80488888888889e-05, + "loss": 0.9834, + "step": 2940 + }, + { + "epoch": 2.10413694721826, + "grad_norm": 45.25, + "learning_rate": 9.800444444444446e-05, + "loss": 0.9159, + "step": 2950 + }, + { + "epoch": 2.1112696148359484, + "grad_norm": 9.375, + "learning_rate": 9.796e-05, + "loss": 1.0598, + "step": 2960 + }, + { + "epoch": 2.1184022824536375, + "grad_norm": 6.90625, + "learning_rate": 9.791555555555557e-05, + "loss": 0.8848, + "step": 2970 + }, + { + "epoch": 2.1255349500713265, + "grad_norm": 7.5625, + "learning_rate": 9.787111111111111e-05, + "loss": 0.942, + "step": 2980 + }, + { + "epoch": 2.1326676176890156, + "grad_norm": 8.6875, + "learning_rate": 9.782666666666666e-05, + "loss": 0.9583, + "step": 2990 + }, + { + "epoch": 2.1398002853067046, + "grad_norm": 9.0, + "learning_rate": 9.778222222222222e-05, + "loss": 0.9836, + "step": 3000 + }, + { + "epoch": 2.1398002853067046, + "eval/acc": 48.83720779418945, + "step": 3000 + }, + { + "epoch": 2.1398002853067046, + "eval_loss": 1.9625970125198364, + "eval_runtime": 0.2298, + "eval_samples_per_second": 187.09, + "eval_steps_per_second": 4.351, + "step": 3000 + }, + { + "epoch": 2.1469329529243937, + "grad_norm": 8.4375, + "learning_rate": 9.773777777777779e-05, + "loss": 1.028, + "step": 3010 + }, + { + "epoch": 2.1540656205420827, + "grad_norm": 10.4375, + "learning_rate": 9.769333333333334e-05, + "loss": 0.9209, + "step": 3020 + }, + { + "epoch": 2.1611982881597718, + "grad_norm": 9.3125, + "learning_rate": 9.764888888888888e-05, + "loss": 0.9999, + "step": 3030 + }, + { + "epoch": 2.168330955777461, + "grad_norm": 8.375, + "learning_rate": 9.760444444444446e-05, + "loss": 0.9576, + "step": 3040 + }, + { + "epoch": 2.17546362339515, + "grad_norm": 7.4375, + "learning_rate": 9.756000000000001e-05, + "loss": 0.8832, + "step": 3050 + }, + { + "epoch": 2.182596291012839, + "grad_norm": 8.125, + "learning_rate": 9.751555555555556e-05, + "loss": 0.933, + "step": 3060 + }, + { + "epoch": 2.189728958630528, + "grad_norm": 8.9375, + "learning_rate": 9.747111111111112e-05, + "loss": 0.9962, + "step": 3070 + }, + { + "epoch": 2.196861626248217, + "grad_norm": 7.1875, + "learning_rate": 9.742666666666667e-05, + "loss": 1.003, + "step": 3080 + }, + { + "epoch": 2.2039942938659056, + "grad_norm": 8.1875, + "learning_rate": 9.738222222222223e-05, + "loss": 0.9441, + "step": 3090 + }, + { + "epoch": 2.2111269614835947, + "grad_norm": 7.21875, + "learning_rate": 9.733777777777778e-05, + "loss": 1.0335, + "step": 3100 + }, + { + "epoch": 2.2111269614835947, + "eval/acc": 51.16279220581055, + "step": 3100 + }, + { + "epoch": 2.2111269614835947, + "eval_loss": 1.8829365968704224, + "eval_runtime": 0.248, + "eval_samples_per_second": 173.381, + "eval_steps_per_second": 4.032, + "step": 3100 + }, + { + "epoch": 2.2182596291012837, + "grad_norm": 9.1875, + "learning_rate": 9.729333333333334e-05, + "loss": 0.9694, + "step": 3110 + }, + { + "epoch": 2.2253922967189728, + "grad_norm": 6.9375, + "learning_rate": 9.724888888888889e-05, + "loss": 1.0386, + "step": 3120 + }, + { + "epoch": 2.232524964336662, + "grad_norm": 8.6875, + "learning_rate": 9.720444444444445e-05, + "loss": 0.9614, + "step": 3130 + }, + { + "epoch": 2.239657631954351, + "grad_norm": 8.3125, + "learning_rate": 9.716000000000001e-05, + "loss": 1.0643, + "step": 3140 + }, + { + "epoch": 2.24679029957204, + "grad_norm": 8.125, + "learning_rate": 9.711555555555556e-05, + "loss": 0.9243, + "step": 3150 + }, + { + "epoch": 2.253922967189729, + "grad_norm": 9.125, + "learning_rate": 9.707111111111111e-05, + "loss": 0.8419, + "step": 3160 + }, + { + "epoch": 2.261055634807418, + "grad_norm": 9.125, + "learning_rate": 9.702666666666667e-05, + "loss": 0.9961, + "step": 3170 + }, + { + "epoch": 2.268188302425107, + "grad_norm": 6.3125, + "learning_rate": 9.698222222222223e-05, + "loss": 0.8931, + "step": 3180 + }, + { + "epoch": 2.275320970042796, + "grad_norm": 7.875, + "learning_rate": 9.693777777777778e-05, + "loss": 1.0057, + "step": 3190 + }, + { + "epoch": 2.282453637660485, + "grad_norm": 6.90625, + "learning_rate": 9.689333333333333e-05, + "loss": 0.9606, + "step": 3200 + }, + { + "epoch": 2.282453637660485, + "eval/acc": 48.83720779418945, + "step": 3200 + }, + { + "epoch": 2.282453637660485, + "eval_loss": 1.823419451713562, + "eval_runtime": 0.2149, + "eval_samples_per_second": 200.139, + "eval_steps_per_second": 4.654, + "step": 3200 + }, + { + "epoch": 2.289586305278174, + "grad_norm": 11.8125, + "learning_rate": 9.684888888888889e-05, + "loss": 0.9218, + "step": 3210 + }, + { + "epoch": 2.2967189728958632, + "grad_norm": 8.9375, + "learning_rate": 9.680444444444445e-05, + "loss": 1.0111, + "step": 3220 + }, + { + "epoch": 2.3038516405135523, + "grad_norm": 8.625, + "learning_rate": 9.676e-05, + "loss": 1.0968, + "step": 3230 + }, + { + "epoch": 2.310984308131241, + "grad_norm": 7.1875, + "learning_rate": 9.671555555555556e-05, + "loss": 1.0236, + "step": 3240 + }, + { + "epoch": 2.31811697574893, + "grad_norm": 6.84375, + "learning_rate": 9.667111111111111e-05, + "loss": 0.92, + "step": 3250 + }, + { + "epoch": 2.325249643366619, + "grad_norm": 8.75, + "learning_rate": 9.662666666666667e-05, + "loss": 0.8205, + "step": 3260 + }, + { + "epoch": 2.332382310984308, + "grad_norm": 30.75, + "learning_rate": 9.658222222222222e-05, + "loss": 0.9676, + "step": 3270 + }, + { + "epoch": 2.339514978601997, + "grad_norm": 13.0, + "learning_rate": 9.653777777777778e-05, + "loss": 0.9086, + "step": 3280 + }, + { + "epoch": 2.346647646219686, + "grad_norm": 9.375, + "learning_rate": 9.649333333333333e-05, + "loss": 1.0504, + "step": 3290 + }, + { + "epoch": 2.353780313837375, + "grad_norm": 39.0, + "learning_rate": 9.64488888888889e-05, + "loss": 0.9481, + "step": 3300 + }, + { + "epoch": 2.353780313837375, + "eval/acc": 46.511627197265625, + "step": 3300 + }, + { + "epoch": 2.353780313837375, + "eval_loss": 1.9555243253707886, + "eval_runtime": 0.6637, + "eval_samples_per_second": 64.792, + "eval_steps_per_second": 1.507, + "step": 3300 + }, + { + "epoch": 2.3609129814550642, + "grad_norm": 11.4375, + "learning_rate": 9.640444444444446e-05, + "loss": 0.9641, + "step": 3310 + }, + { + "epoch": 2.3680456490727533, + "grad_norm": 9.0625, + "learning_rate": 9.636e-05, + "loss": 0.9624, + "step": 3320 + }, + { + "epoch": 2.3751783166904423, + "grad_norm": 12.625, + "learning_rate": 9.631555555555555e-05, + "loss": 1.0082, + "step": 3330 + }, + { + "epoch": 2.3823109843081314, + "grad_norm": 7.25, + "learning_rate": 9.627111111111112e-05, + "loss": 1.0249, + "step": 3340 + }, + { + "epoch": 2.3894436519258204, + "grad_norm": 13.375, + "learning_rate": 9.622666666666668e-05, + "loss": 1.0153, + "step": 3350 + }, + { + "epoch": 2.3965763195435095, + "grad_norm": 6.6875, + "learning_rate": 9.618222222222223e-05, + "loss": 0.9533, + "step": 3360 + }, + { + "epoch": 2.403708987161198, + "grad_norm": 9.25, + "learning_rate": 9.613777777777779e-05, + "loss": 1.1051, + "step": 3370 + }, + { + "epoch": 2.410841654778887, + "grad_norm": 9.5625, + "learning_rate": 9.609333333333334e-05, + "loss": 1.0551, + "step": 3380 + }, + { + "epoch": 2.417974322396576, + "grad_norm": 7.21875, + "learning_rate": 9.604888888888889e-05, + "loss": 0.9032, + "step": 3390 + }, + { + "epoch": 2.425106990014265, + "grad_norm": 8.5625, + "learning_rate": 9.600444444444445e-05, + "loss": 1.1008, + "step": 3400 + }, + { + "epoch": 2.425106990014265, + "eval/acc": 51.16279220581055, + "step": 3400 + }, + { + "epoch": 2.425106990014265, + "eval_loss": 1.7766540050506592, + "eval_runtime": 0.2273, + "eval_samples_per_second": 189.16, + "eval_steps_per_second": 4.399, + "step": 3400 + }, + { + "epoch": 2.4322396576319543, + "grad_norm": 10.375, + "learning_rate": 9.596000000000001e-05, + "loss": 0.9562, + "step": 3410 + }, + { + "epoch": 2.4393723252496433, + "grad_norm": 8.9375, + "learning_rate": 9.591555555555556e-05, + "loss": 1.0756, + "step": 3420 + }, + { + "epoch": 2.4465049928673324, + "grad_norm": 9.125, + "learning_rate": 9.58711111111111e-05, + "loss": 0.9554, + "step": 3430 + }, + { + "epoch": 2.4536376604850214, + "grad_norm": 8.9375, + "learning_rate": 9.582666666666668e-05, + "loss": 0.9122, + "step": 3440 + }, + { + "epoch": 2.4607703281027105, + "grad_norm": 8.625, + "learning_rate": 9.578222222222223e-05, + "loss": 0.9311, + "step": 3450 + }, + { + "epoch": 2.4679029957203995, + "grad_norm": 6.65625, + "learning_rate": 9.573777777777778e-05, + "loss": 1.0023, + "step": 3460 + }, + { + "epoch": 2.4750356633380886, + "grad_norm": 8.125, + "learning_rate": 9.569333333333334e-05, + "loss": 0.9172, + "step": 3470 + }, + { + "epoch": 2.4821683309557776, + "grad_norm": 7.375, + "learning_rate": 9.56488888888889e-05, + "loss": 0.9407, + "step": 3480 + }, + { + "epoch": 2.4893009985734667, + "grad_norm": 10.25, + "learning_rate": 9.560444444444445e-05, + "loss": 0.9433, + "step": 3490 + }, + { + "epoch": 2.4964336661911553, + "grad_norm": 8.625, + "learning_rate": 9.556e-05, + "loss": 0.9934, + "step": 3500 + }, + { + "epoch": 2.4964336661911553, + "eval/acc": 53.488372802734375, + "step": 3500 + }, + { + "epoch": 2.4964336661911553, + "eval_loss": 1.9511696100234985, + "eval_runtime": 0.2213, + "eval_samples_per_second": 194.329, + "eval_steps_per_second": 4.519, + "step": 3500 + }, + { + "epoch": 2.5035663338088447, + "grad_norm": 7.625, + "learning_rate": 9.551555555555556e-05, + "loss": 0.9157, + "step": 3510 + }, + { + "epoch": 2.5106990014265333, + "grad_norm": 7.4375, + "learning_rate": 9.547111111111111e-05, + "loss": 0.9202, + "step": 3520 + }, + { + "epoch": 2.5178316690442224, + "grad_norm": 9.25, + "learning_rate": 9.542666666666667e-05, + "loss": 0.8526, + "step": 3530 + }, + { + "epoch": 2.5249643366619114, + "grad_norm": 7.71875, + "learning_rate": 9.538222222222223e-05, + "loss": 0.9562, + "step": 3540 + }, + { + "epoch": 2.5320970042796005, + "grad_norm": 9.75, + "learning_rate": 9.533777777777778e-05, + "loss": 0.9927, + "step": 3550 + }, + { + "epoch": 2.5392296718972895, + "grad_norm": 8.1875, + "learning_rate": 9.529333333333333e-05, + "loss": 0.9263, + "step": 3560 + }, + { + "epoch": 2.5463623395149786, + "grad_norm": 6.9375, + "learning_rate": 9.52488888888889e-05, + "loss": 0.9367, + "step": 3570 + }, + { + "epoch": 2.5534950071326676, + "grad_norm": 9.5625, + "learning_rate": 9.520444444444446e-05, + "loss": 0.9284, + "step": 3580 + }, + { + "epoch": 2.5606276747503567, + "grad_norm": 8.5625, + "learning_rate": 9.516e-05, + "loss": 0.8394, + "step": 3590 + }, + { + "epoch": 2.5677603423680457, + "grad_norm": 10.25, + "learning_rate": 9.511555555555555e-05, + "loss": 0.9336, + "step": 3600 + }, + { + "epoch": 2.5677603423680457, + "eval/acc": 46.511627197265625, + "step": 3600 + }, + { + "epoch": 2.5677603423680457, + "eval_loss": 1.9759221076965332, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.935, + "eval_steps_per_second": 4.719, + "step": 3600 + }, + { + "epoch": 2.574893009985735, + "grad_norm": 10.0625, + "learning_rate": 9.507111111111111e-05, + "loss": 1.0005, + "step": 3610 + }, + { + "epoch": 2.582025677603424, + "grad_norm": 8.375, + "learning_rate": 9.502666666666668e-05, + "loss": 0.9319, + "step": 3620 + }, + { + "epoch": 2.5891583452211124, + "grad_norm": 8.5, + "learning_rate": 9.498222222222222e-05, + "loss": 0.9125, + "step": 3630 + }, + { + "epoch": 2.596291012838802, + "grad_norm": 7.71875, + "learning_rate": 9.493777777777779e-05, + "loss": 0.9279, + "step": 3640 + }, + { + "epoch": 2.6034236804564905, + "grad_norm": 11.875, + "learning_rate": 9.489333333333334e-05, + "loss": 0.952, + "step": 3650 + }, + { + "epoch": 2.6105563480741796, + "grad_norm": 7.5625, + "learning_rate": 9.48488888888889e-05, + "loss": 1.0043, + "step": 3660 + }, + { + "epoch": 2.6176890156918686, + "grad_norm": 11.5625, + "learning_rate": 9.480444444444445e-05, + "loss": 0.8932, + "step": 3670 + }, + { + "epoch": 2.6248216833095577, + "grad_norm": 7.84375, + "learning_rate": 9.476000000000001e-05, + "loss": 0.8775, + "step": 3680 + }, + { + "epoch": 2.6319543509272467, + "grad_norm": 9.0, + "learning_rate": 9.471555555555556e-05, + "loss": 0.9756, + "step": 3690 + }, + { + "epoch": 2.6390870185449358, + "grad_norm": 7.375, + "learning_rate": 9.46711111111111e-05, + "loss": 0.9345, + "step": 3700 + }, + { + "epoch": 2.6390870185449358, + "eval/acc": 51.16279220581055, + "step": 3700 + }, + { + "epoch": 2.6390870185449358, + "eval_loss": 1.9134849309921265, + "eval_runtime": 0.2322, + "eval_samples_per_second": 185.194, + "eval_steps_per_second": 4.307, + "step": 3700 + }, + { + "epoch": 2.646219686162625, + "grad_norm": 8.4375, + "learning_rate": 9.462666666666668e-05, + "loss": 0.9851, + "step": 3710 + }, + { + "epoch": 2.653352353780314, + "grad_norm": 31.75, + "learning_rate": 9.458222222222223e-05, + "loss": 0.9712, + "step": 3720 + }, + { + "epoch": 2.660485021398003, + "grad_norm": 6.75, + "learning_rate": 9.453777777777778e-05, + "loss": 0.8641, + "step": 3730 + }, + { + "epoch": 2.667617689015692, + "grad_norm": 6.5625, + "learning_rate": 9.449333333333334e-05, + "loss": 0.945, + "step": 3740 + }, + { + "epoch": 2.674750356633381, + "grad_norm": 6.0625, + "learning_rate": 9.44488888888889e-05, + "loss": 0.9535, + "step": 3750 + }, + { + "epoch": 2.68188302425107, + "grad_norm": 7.90625, + "learning_rate": 9.440444444444445e-05, + "loss": 0.8844, + "step": 3760 + }, + { + "epoch": 2.689015691868759, + "grad_norm": 9.8125, + "learning_rate": 9.436e-05, + "loss": 0.9064, + "step": 3770 + }, + { + "epoch": 2.6961483594864477, + "grad_norm": 8.4375, + "learning_rate": 9.431555555555556e-05, + "loss": 1.0119, + "step": 3780 + }, + { + "epoch": 2.703281027104137, + "grad_norm": 7.15625, + "learning_rate": 9.427111111111112e-05, + "loss": 0.9655, + "step": 3790 + }, + { + "epoch": 2.710413694721826, + "grad_norm": 9.4375, + "learning_rate": 9.422666666666667e-05, + "loss": 0.9187, + "step": 3800 + }, + { + "epoch": 2.710413694721826, + "eval/acc": 51.16279220581055, + "step": 3800 + }, + { + "epoch": 2.710413694721826, + "eval_loss": 1.9277268648147583, + "eval_runtime": 0.2166, + "eval_samples_per_second": 198.481, + "eval_steps_per_second": 4.616, + "step": 3800 + }, + { + "epoch": 2.717546362339515, + "grad_norm": 9.25, + "learning_rate": 9.418222222222223e-05, + "loss": 0.8689, + "step": 3810 + }, + { + "epoch": 2.724679029957204, + "grad_norm": 8.0625, + "learning_rate": 9.413777777777778e-05, + "loss": 0.9138, + "step": 3820 + }, + { + "epoch": 2.731811697574893, + "grad_norm": 14.3125, + "learning_rate": 9.409333333333333e-05, + "loss": 0.9129, + "step": 3830 + }, + { + "epoch": 2.738944365192582, + "grad_norm": 6.78125, + "learning_rate": 9.404888888888889e-05, + "loss": 0.8666, + "step": 3840 + }, + { + "epoch": 2.746077032810271, + "grad_norm": 7.4375, + "learning_rate": 9.400444444444445e-05, + "loss": 0.9474, + "step": 3850 + }, + { + "epoch": 2.75320970042796, + "grad_norm": 7.46875, + "learning_rate": 9.396e-05, + "loss": 0.9312, + "step": 3860 + }, + { + "epoch": 2.760342368045649, + "grad_norm": 7.84375, + "learning_rate": 9.391555555555555e-05, + "loss": 0.943, + "step": 3870 + }, + { + "epoch": 2.767475035663338, + "grad_norm": 8.125, + "learning_rate": 9.387111111111113e-05, + "loss": 0.9471, + "step": 3880 + }, + { + "epoch": 2.7746077032810272, + "grad_norm": 10.5625, + "learning_rate": 9.382666666666667e-05, + "loss": 0.9785, + "step": 3890 + }, + { + "epoch": 2.7817403708987163, + "grad_norm": 10.5, + "learning_rate": 9.378222222222222e-05, + "loss": 1.0151, + "step": 3900 + }, + { + "epoch": 2.7817403708987163, + "eval/acc": 44.1860466003418, + "step": 3900 + }, + { + "epoch": 2.7817403708987163, + "eval_loss": 1.970198154449463, + "eval_runtime": 0.2139, + "eval_samples_per_second": 201.028, + "eval_steps_per_second": 4.675, + "step": 3900 + }, + { + "epoch": 2.788873038516405, + "grad_norm": 9.75, + "learning_rate": 9.373777777777778e-05, + "loss": 0.9148, + "step": 3910 + }, + { + "epoch": 2.7960057061340944, + "grad_norm": 9.1875, + "learning_rate": 9.369333333333333e-05, + "loss": 1.0314, + "step": 3920 + }, + { + "epoch": 2.803138373751783, + "grad_norm": 8.375, + "learning_rate": 9.36488888888889e-05, + "loss": 0.9076, + "step": 3930 + }, + { + "epoch": 2.810271041369472, + "grad_norm": 6.46875, + "learning_rate": 9.360444444444444e-05, + "loss": 0.8218, + "step": 3940 + }, + { + "epoch": 2.817403708987161, + "grad_norm": 7.96875, + "learning_rate": 9.356e-05, + "loss": 0.9415, + "step": 3950 + }, + { + "epoch": 2.82453637660485, + "grad_norm": 7.53125, + "learning_rate": 9.351555555555555e-05, + "loss": 0.9593, + "step": 3960 + }, + { + "epoch": 2.831669044222539, + "grad_norm": 5.96875, + "learning_rate": 9.347111111111112e-05, + "loss": 0.9134, + "step": 3970 + }, + { + "epoch": 2.8388017118402282, + "grad_norm": 8.25, + "learning_rate": 9.342666666666668e-05, + "loss": 0.9339, + "step": 3980 + }, + { + "epoch": 2.8459343794579173, + "grad_norm": 9.625, + "learning_rate": 9.338222222222223e-05, + "loss": 1.0018, + "step": 3990 + }, + { + "epoch": 2.8530670470756063, + "grad_norm": 7.8125, + "learning_rate": 9.333777777777777e-05, + "loss": 0.9302, + "step": 4000 + }, + { + "epoch": 2.8530670470756063, + "eval/acc": 46.511627197265625, + "step": 4000 + }, + { + "epoch": 2.8530670470756063, + "eval_loss": 1.830549716949463, + "eval_runtime": 0.2202, + "eval_samples_per_second": 195.244, + "eval_steps_per_second": 4.541, + "step": 4000 + }, + { + "epoch": 2.8601997146932954, + "grad_norm": 6.65625, + "learning_rate": 9.329333333333334e-05, + "loss": 0.9375, + "step": 4010 + }, + { + "epoch": 2.8673323823109844, + "grad_norm": 11.875, + "learning_rate": 9.32488888888889e-05, + "loss": 0.8406, + "step": 4020 + }, + { + "epoch": 2.8744650499286735, + "grad_norm": 8.1875, + "learning_rate": 9.320444444444445e-05, + "loss": 0.8863, + "step": 4030 + }, + { + "epoch": 2.881597717546362, + "grad_norm": 6.9375, + "learning_rate": 9.316000000000001e-05, + "loss": 0.9546, + "step": 4040 + }, + { + "epoch": 2.8887303851640516, + "grad_norm": 8.625, + "learning_rate": 9.311555555555556e-05, + "loss": 1.0175, + "step": 4050 + }, + { + "epoch": 2.89586305278174, + "grad_norm": 45.0, + "learning_rate": 9.307111111111112e-05, + "loss": 0.9058, + "step": 4060 + }, + { + "epoch": 2.9029957203994297, + "grad_norm": 13.625, + "learning_rate": 9.302666666666667e-05, + "loss": 0.9137, + "step": 4070 + }, + { + "epoch": 2.9101283880171183, + "grad_norm": 6.8125, + "learning_rate": 9.298222222222223e-05, + "loss": 0.8862, + "step": 4080 + }, + { + "epoch": 2.9172610556348073, + "grad_norm": 13.8125, + "learning_rate": 9.293777777777778e-05, + "loss": 0.9152, + "step": 4090 + }, + { + "epoch": 2.9243937232524964, + "grad_norm": 13.3125, + "learning_rate": 9.289333333333334e-05, + "loss": 0.9623, + "step": 4100 + }, + { + "epoch": 2.9243937232524964, + "eval/acc": 46.511627197265625, + "step": 4100 + }, + { + "epoch": 2.9243937232524964, + "eval_loss": 1.9800893068313599, + "eval_runtime": 0.2162, + "eval_samples_per_second": 198.931, + "eval_steps_per_second": 4.626, + "step": 4100 + }, + { + "epoch": 2.9315263908701854, + "grad_norm": 7.1875, + "learning_rate": 9.28488888888889e-05, + "loss": 0.9088, + "step": 4110 + }, + { + "epoch": 2.9386590584878745, + "grad_norm": 8.3125, + "learning_rate": 9.280444444444445e-05, + "loss": 0.9927, + "step": 4120 + }, + { + "epoch": 2.9457917261055635, + "grad_norm": 75.0, + "learning_rate": 9.276e-05, + "loss": 0.912, + "step": 4130 + }, + { + "epoch": 2.9529243937232525, + "grad_norm": 9.125, + "learning_rate": 9.271555555555556e-05, + "loss": 0.9878, + "step": 4140 + }, + { + "epoch": 2.9600570613409416, + "grad_norm": 7.125, + "learning_rate": 9.267111111111112e-05, + "loss": 0.8785, + "step": 4150 + }, + { + "epoch": 2.9671897289586306, + "grad_norm": 8.25, + "learning_rate": 9.262666666666667e-05, + "loss": 0.9296, + "step": 4160 + }, + { + "epoch": 2.9743223965763197, + "grad_norm": 8.75, + "learning_rate": 9.258222222222222e-05, + "loss": 0.9284, + "step": 4170 + }, + { + "epoch": 2.9814550641940087, + "grad_norm": 8.8125, + "learning_rate": 9.253777777777778e-05, + "loss": 0.9566, + "step": 4180 + }, + { + "epoch": 2.9885877318116973, + "grad_norm": 6.90625, + "learning_rate": 9.249333333333334e-05, + "loss": 0.8368, + "step": 4190 + }, + { + "epoch": 2.995720399429387, + "grad_norm": 9.875, + "learning_rate": 9.244888888888889e-05, + "loss": 1.0306, + "step": 4200 + }, + { + "epoch": 2.995720399429387, + "eval/acc": 51.16279220581055, + "step": 4200 + }, + { + "epoch": 2.995720399429387, + "eval_loss": 1.8740426301956177, + "eval_runtime": 0.2157, + "eval_samples_per_second": 199.368, + "eval_steps_per_second": 4.636, + "step": 4200 + }, + { + "epoch": 3.0028530670470754, + "grad_norm": 9.5625, + "learning_rate": 9.240444444444445e-05, + "loss": 0.957, + "step": 4210 + }, + { + "epoch": 3.0099857346647645, + "grad_norm": 13.5625, + "learning_rate": 9.236e-05, + "loss": 0.884, + "step": 4220 + }, + { + "epoch": 3.0171184022824535, + "grad_norm": 8.0625, + "learning_rate": 9.231555555555555e-05, + "loss": 0.9064, + "step": 4230 + }, + { + "epoch": 3.0242510699001426, + "grad_norm": 8.0, + "learning_rate": 9.227111111111111e-05, + "loss": 0.9164, + "step": 4240 + }, + { + "epoch": 3.0313837375178316, + "grad_norm": 8.4375, + "learning_rate": 9.222666666666668e-05, + "loss": 0.9787, + "step": 4250 + }, + { + "epoch": 3.0385164051355207, + "grad_norm": 7.6875, + "learning_rate": 9.218222222222222e-05, + "loss": 0.8852, + "step": 4260 + }, + { + "epoch": 3.0456490727532097, + "grad_norm": 7.15625, + "learning_rate": 9.213777777777777e-05, + "loss": 1.0092, + "step": 4270 + }, + { + "epoch": 3.0527817403708988, + "grad_norm": 6.65625, + "learning_rate": 9.209333333333335e-05, + "loss": 0.9972, + "step": 4280 + }, + { + "epoch": 3.059914407988588, + "grad_norm": 7.25, + "learning_rate": 9.20488888888889e-05, + "loss": 0.9237, + "step": 4290 + }, + { + "epoch": 3.067047075606277, + "grad_norm": 6.4375, + "learning_rate": 9.200444444444445e-05, + "loss": 0.9096, + "step": 4300 + }, + { + "epoch": 3.067047075606277, + "eval/acc": 39.53488540649414, + "step": 4300 + }, + { + "epoch": 3.067047075606277, + "eval_loss": 2.0920908451080322, + "eval_runtime": 1.7384, + "eval_samples_per_second": 24.736, + "eval_steps_per_second": 0.575, + "step": 4300 + }, + { + "epoch": 3.074179743223966, + "grad_norm": 8.4375, + "learning_rate": 9.196000000000001e-05, + "loss": 0.9697, + "step": 4310 + }, + { + "epoch": 3.081312410841655, + "grad_norm": 8.4375, + "learning_rate": 9.191555555555556e-05, + "loss": 0.8379, + "step": 4320 + }, + { + "epoch": 3.088445078459344, + "grad_norm": 8.125, + "learning_rate": 9.187111111111112e-05, + "loss": 0.8576, + "step": 4330 + }, + { + "epoch": 3.0955777460770326, + "grad_norm": 10.75, + "learning_rate": 9.182666666666667e-05, + "loss": 0.9616, + "step": 4340 + }, + { + "epoch": 3.1027104136947217, + "grad_norm": 6.84375, + "learning_rate": 9.178222222222223e-05, + "loss": 0.7674, + "step": 4350 + }, + { + "epoch": 3.1098430813124107, + "grad_norm": 8.375, + "learning_rate": 9.173777777777778e-05, + "loss": 0.8712, + "step": 4360 + }, + { + "epoch": 3.1169757489300998, + "grad_norm": 8.375, + "learning_rate": 9.169333333333334e-05, + "loss": 0.8599, + "step": 4370 + }, + { + "epoch": 3.124108416547789, + "grad_norm": 7.1875, + "learning_rate": 9.16488888888889e-05, + "loss": 0.9736, + "step": 4380 + }, + { + "epoch": 3.131241084165478, + "grad_norm": 7.75, + "learning_rate": 9.160444444444445e-05, + "loss": 0.8663, + "step": 4390 + }, + { + "epoch": 3.138373751783167, + "grad_norm": 7.53125, + "learning_rate": 9.156e-05, + "loss": 0.9221, + "step": 4400 + }, + { + "epoch": 3.138373751783167, + "eval/acc": 39.53488540649414, + "step": 4400 + }, + { + "epoch": 3.138373751783167, + "eval_loss": 2.127244710922241, + "eval_runtime": 0.2239, + "eval_samples_per_second": 192.022, + "eval_steps_per_second": 4.466, + "step": 4400 + }, + { + "epoch": 3.145506419400856, + "grad_norm": 8.125, + "learning_rate": 9.151555555555556e-05, + "loss": 0.9144, + "step": 4410 + }, + { + "epoch": 3.152639087018545, + "grad_norm": 7.46875, + "learning_rate": 9.147111111111112e-05, + "loss": 0.9445, + "step": 4420 + }, + { + "epoch": 3.159771754636234, + "grad_norm": 6.9375, + "learning_rate": 9.142666666666667e-05, + "loss": 0.8308, + "step": 4430 + }, + { + "epoch": 3.166904422253923, + "grad_norm": 7.53125, + "learning_rate": 9.138222222222222e-05, + "loss": 0.8428, + "step": 4440 + }, + { + "epoch": 3.174037089871612, + "grad_norm": 7.96875, + "learning_rate": 9.133777777777778e-05, + "loss": 0.9022, + "step": 4450 + }, + { + "epoch": 3.181169757489301, + "grad_norm": 6.875, + "learning_rate": 9.129333333333334e-05, + "loss": 0.9955, + "step": 4460 + }, + { + "epoch": 3.18830242510699, + "grad_norm": 9.5625, + "learning_rate": 9.124888888888889e-05, + "loss": 0.9493, + "step": 4470 + }, + { + "epoch": 3.195435092724679, + "grad_norm": 9.0625, + "learning_rate": 9.120444444444445e-05, + "loss": 0.9608, + "step": 4480 + }, + { + "epoch": 3.202567760342368, + "grad_norm": 8.625, + "learning_rate": 9.116e-05, + "loss": 0.821, + "step": 4490 + }, + { + "epoch": 3.209700427960057, + "grad_norm": 8.125, + "learning_rate": 9.111555555555556e-05, + "loss": 0.9175, + "step": 4500 + }, + { + "epoch": 3.209700427960057, + "eval/acc": 39.53488540649414, + "step": 4500 + }, + { + "epoch": 3.209700427960057, + "eval_loss": 2.062082529067993, + "eval_runtime": 0.2236, + "eval_samples_per_second": 192.267, + "eval_steps_per_second": 4.471, + "step": 4500 + }, + { + "epoch": 3.216833095577746, + "grad_norm": 8.0625, + "learning_rate": 9.107111111111111e-05, + "loss": 0.9169, + "step": 4510 + }, + { + "epoch": 3.223965763195435, + "grad_norm": 8.3125, + "learning_rate": 9.102666666666667e-05, + "loss": 0.8001, + "step": 4520 + }, + { + "epoch": 3.231098430813124, + "grad_norm": 7.3125, + "learning_rate": 9.098222222222222e-05, + "loss": 0.8513, + "step": 4530 + }, + { + "epoch": 3.238231098430813, + "grad_norm": 7.625, + "learning_rate": 9.093777777777777e-05, + "loss": 0.912, + "step": 4540 + }, + { + "epoch": 3.245363766048502, + "grad_norm": 6.46875, + "learning_rate": 9.089333333333335e-05, + "loss": 0.9418, + "step": 4550 + }, + { + "epoch": 3.2524964336661912, + "grad_norm": 11.9375, + "learning_rate": 9.08488888888889e-05, + "loss": 0.871, + "step": 4560 + }, + { + "epoch": 3.2596291012838803, + "grad_norm": 7.03125, + "learning_rate": 9.080444444444444e-05, + "loss": 0.8507, + "step": 4570 + }, + { + "epoch": 3.2667617689015693, + "grad_norm": 7.21875, + "learning_rate": 9.076e-05, + "loss": 0.8058, + "step": 4580 + }, + { + "epoch": 3.2738944365192584, + "grad_norm": 8.4375, + "learning_rate": 9.071555555555557e-05, + "loss": 0.7959, + "step": 4590 + }, + { + "epoch": 3.281027104136947, + "grad_norm": 6.375, + "learning_rate": 9.067111111111112e-05, + "loss": 0.9206, + "step": 4600 + }, + { + "epoch": 3.281027104136947, + "eval/acc": 37.20930099487305, + "step": 4600 + }, + { + "epoch": 3.281027104136947, + "eval_loss": 2.1272616386413574, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.747, + "eval_steps_per_second": 4.529, + "step": 4600 + }, + { + "epoch": 3.2881597717546365, + "grad_norm": 7.65625, + "learning_rate": 9.062666666666666e-05, + "loss": 0.8306, + "step": 4610 + }, + { + "epoch": 3.295292439372325, + "grad_norm": 6.9375, + "learning_rate": 9.058222222222223e-05, + "loss": 0.8958, + "step": 4620 + }, + { + "epoch": 3.302425106990014, + "grad_norm": 7.96875, + "learning_rate": 9.053777777777777e-05, + "loss": 0.8919, + "step": 4630 + }, + { + "epoch": 3.309557774607703, + "grad_norm": 6.9375, + "learning_rate": 9.049333333333334e-05, + "loss": 0.8844, + "step": 4640 + }, + { + "epoch": 3.316690442225392, + "grad_norm": 7.21875, + "learning_rate": 9.04488888888889e-05, + "loss": 0.8335, + "step": 4650 + }, + { + "epoch": 3.3238231098430813, + "grad_norm": 7.6875, + "learning_rate": 9.040444444444445e-05, + "loss": 0.9337, + "step": 4660 + }, + { + "epoch": 3.3309557774607703, + "grad_norm": 9.25, + "learning_rate": 9.036e-05, + "loss": 1.0282, + "step": 4670 + }, + { + "epoch": 3.3380884450784594, + "grad_norm": 7.96875, + "learning_rate": 9.031555555555557e-05, + "loss": 0.9401, + "step": 4680 + }, + { + "epoch": 3.3452211126961484, + "grad_norm": 7.25, + "learning_rate": 9.027111111111112e-05, + "loss": 0.908, + "step": 4690 + }, + { + "epoch": 3.3523537803138375, + "grad_norm": 7.8125, + "learning_rate": 9.022666666666667e-05, + "loss": 0.9262, + "step": 4700 + }, + { + "epoch": 3.3523537803138375, + "eval/acc": 37.20930099487305, + "step": 4700 + }, + { + "epoch": 3.3523537803138375, + "eval_loss": 2.07535719871521, + "eval_runtime": 0.2246, + "eval_samples_per_second": 191.478, + "eval_steps_per_second": 4.453, + "step": 4700 + }, + { + "epoch": 3.3594864479315265, + "grad_norm": 13.0, + "learning_rate": 9.018222222222223e-05, + "loss": 0.9692, + "step": 4710 + }, + { + "epoch": 3.3666191155492156, + "grad_norm": 5.875, + "learning_rate": 9.013777777777779e-05, + "loss": 0.9071, + "step": 4720 + }, + { + "epoch": 3.3737517831669046, + "grad_norm": 7.71875, + "learning_rate": 9.009333333333334e-05, + "loss": 0.8528, + "step": 4730 + }, + { + "epoch": 3.3808844507845937, + "grad_norm": 7.46875, + "learning_rate": 9.004888888888889e-05, + "loss": 0.9408, + "step": 4740 + }, + { + "epoch": 3.3880171184022823, + "grad_norm": 7.8125, + "learning_rate": 9.000444444444445e-05, + "loss": 1.0017, + "step": 4750 + }, + { + "epoch": 3.3951497860199713, + "grad_norm": 6.15625, + "learning_rate": 8.996e-05, + "loss": 0.9107, + "step": 4760 + }, + { + "epoch": 3.4022824536376604, + "grad_norm": 6.8125, + "learning_rate": 8.991555555555556e-05, + "loss": 0.9387, + "step": 4770 + }, + { + "epoch": 3.4094151212553494, + "grad_norm": 8.8125, + "learning_rate": 8.987111111111112e-05, + "loss": 0.9775, + "step": 4780 + }, + { + "epoch": 3.4165477888730384, + "grad_norm": 8.375, + "learning_rate": 8.982666666666667e-05, + "loss": 0.8173, + "step": 4790 + }, + { + "epoch": 3.4236804564907275, + "grad_norm": 11.3125, + "learning_rate": 8.978222222222222e-05, + "loss": 0.9068, + "step": 4800 + }, + { + "epoch": 3.4236804564907275, + "eval/acc": 39.53488540649414, + "step": 4800 + }, + { + "epoch": 3.4236804564907275, + "eval_loss": 2.123344898223877, + "eval_runtime": 0.2238, + "eval_samples_per_second": 192.17, + "eval_steps_per_second": 4.469, + "step": 4800 + }, + { + "epoch": 3.4308131241084165, + "grad_norm": 6.65625, + "learning_rate": 8.973777777777778e-05, + "loss": 0.8262, + "step": 4810 + }, + { + "epoch": 3.4379457917261056, + "grad_norm": 9.125, + "learning_rate": 8.969333333333334e-05, + "loss": 0.9207, + "step": 4820 + }, + { + "epoch": 3.4450784593437946, + "grad_norm": 6.78125, + "learning_rate": 8.964888888888889e-05, + "loss": 1.0115, + "step": 4830 + }, + { + "epoch": 3.4522111269614837, + "grad_norm": 7.5625, + "learning_rate": 8.960444444444444e-05, + "loss": 0.9031, + "step": 4840 + }, + { + "epoch": 3.4593437945791727, + "grad_norm": 7.875, + "learning_rate": 8.956e-05, + "loss": 0.9626, + "step": 4850 + }, + { + "epoch": 3.466476462196862, + "grad_norm": 4.625, + "learning_rate": 8.951555555555557e-05, + "loss": 0.7793, + "step": 4860 + }, + { + "epoch": 3.473609129814551, + "grad_norm": 7.40625, + "learning_rate": 8.947111111111111e-05, + "loss": 0.8733, + "step": 4870 + }, + { + "epoch": 3.4807417974322394, + "grad_norm": 9.6875, + "learning_rate": 8.942666666666668e-05, + "loss": 0.8448, + "step": 4880 + }, + { + "epoch": 3.4878744650499285, + "grad_norm": 8.625, + "learning_rate": 8.938222222222222e-05, + "loss": 0.815, + "step": 4890 + }, + { + "epoch": 3.4950071326676175, + "grad_norm": 8.6875, + "learning_rate": 8.933777777777779e-05, + "loss": 0.7837, + "step": 4900 + }, + { + "epoch": 3.4950071326676175, + "eval/acc": 37.20930099487305, + "step": 4900 + }, + { + "epoch": 3.4950071326676175, + "eval_loss": 2.080425262451172, + "eval_runtime": 0.2476, + "eval_samples_per_second": 173.654, + "eval_steps_per_second": 4.038, + "step": 4900 + }, + { + "epoch": 3.5021398002853066, + "grad_norm": 7.25, + "learning_rate": 8.929333333333333e-05, + "loss": 0.9082, + "step": 4910 + }, + { + "epoch": 3.5092724679029956, + "grad_norm": 9.0, + "learning_rate": 8.92488888888889e-05, + "loss": 0.8041, + "step": 4920 + }, + { + "epoch": 3.5164051355206847, + "grad_norm": 7.5625, + "learning_rate": 8.920444444444444e-05, + "loss": 0.878, + "step": 4930 + }, + { + "epoch": 3.5235378031383737, + "grad_norm": 8.3125, + "learning_rate": 8.916e-05, + "loss": 0.8609, + "step": 4940 + }, + { + "epoch": 3.5306704707560628, + "grad_norm": 6.9375, + "learning_rate": 8.911555555555557e-05, + "loss": 0.8203, + "step": 4950 + }, + { + "epoch": 3.537803138373752, + "grad_norm": 6.4375, + "learning_rate": 8.907111111111112e-05, + "loss": 0.8976, + "step": 4960 + }, + { + "epoch": 3.544935805991441, + "grad_norm": 15.0, + "learning_rate": 8.902666666666667e-05, + "loss": 0.8585, + "step": 4970 + }, + { + "epoch": 3.55206847360913, + "grad_norm": 6.21875, + "learning_rate": 8.898222222222223e-05, + "loss": 0.9642, + "step": 4980 + }, + { + "epoch": 3.559201141226819, + "grad_norm": 9.8125, + "learning_rate": 8.893777777777779e-05, + "loss": 0.9241, + "step": 4990 + }, + { + "epoch": 3.566333808844508, + "grad_norm": 9.25, + "learning_rate": 8.889333333333334e-05, + "loss": 0.7841, + "step": 5000 + }, + { + "epoch": 3.566333808844508, + "eval/acc": 37.20930099487305, + "step": 5000 + }, + { + "epoch": 3.566333808844508, + "eval_loss": 2.0360865592956543, + "eval_runtime": 0.225, + "eval_samples_per_second": 191.102, + "eval_steps_per_second": 4.444, + "step": 5000 + }, + { + "epoch": 3.5734664764621966, + "grad_norm": 7.53125, + "learning_rate": 8.884888888888889e-05, + "loss": 0.8513, + "step": 5010 + }, + { + "epoch": 3.580599144079886, + "grad_norm": 7.3125, + "learning_rate": 8.880444444444445e-05, + "loss": 0.9502, + "step": 5020 + }, + { + "epoch": 3.5877318116975747, + "grad_norm": 7.375, + "learning_rate": 8.876e-05, + "loss": 0.9329, + "step": 5030 + }, + { + "epoch": 3.5948644793152638, + "grad_norm": 7.3125, + "learning_rate": 8.871555555555556e-05, + "loss": 0.8648, + "step": 5040 + }, + { + "epoch": 3.601997146932953, + "grad_norm": 6.5, + "learning_rate": 8.867111111111112e-05, + "loss": 0.8019, + "step": 5050 + }, + { + "epoch": 3.609129814550642, + "grad_norm": 9.0, + "learning_rate": 8.862666666666667e-05, + "loss": 0.8829, + "step": 5060 + }, + { + "epoch": 3.616262482168331, + "grad_norm": 6.46875, + "learning_rate": 8.858222222222222e-05, + "loss": 0.8419, + "step": 5070 + }, + { + "epoch": 3.62339514978602, + "grad_norm": 8.9375, + "learning_rate": 8.853777777777778e-05, + "loss": 0.9345, + "step": 5080 + }, + { + "epoch": 3.630527817403709, + "grad_norm": 7.09375, + "learning_rate": 8.849333333333334e-05, + "loss": 0.8204, + "step": 5090 + }, + { + "epoch": 3.637660485021398, + "grad_norm": 7.71875, + "learning_rate": 8.844888888888889e-05, + "loss": 0.9305, + "step": 5100 + }, + { + "epoch": 3.637660485021398, + "eval/acc": 39.53488540649414, + "step": 5100 + }, + { + "epoch": 3.637660485021398, + "eval_loss": 2.0034291744232178, + "eval_runtime": 0.2213, + "eval_samples_per_second": 194.329, + "eval_steps_per_second": 4.519, + "step": 5100 + }, + { + "epoch": 3.644793152639087, + "grad_norm": 6.09375, + "learning_rate": 8.840444444444444e-05, + "loss": 0.9168, + "step": 5110 + }, + { + "epoch": 3.651925820256776, + "grad_norm": 8.25, + "learning_rate": 8.836000000000001e-05, + "loss": 0.8155, + "step": 5120 + }, + { + "epoch": 3.659058487874465, + "grad_norm": 7.84375, + "learning_rate": 8.831555555555556e-05, + "loss": 0.8641, + "step": 5130 + }, + { + "epoch": 3.666191155492154, + "grad_norm": 6.5, + "learning_rate": 8.827111111111111e-05, + "loss": 0.8623, + "step": 5140 + }, + { + "epoch": 3.6733238231098433, + "grad_norm": 21.125, + "learning_rate": 8.822666666666667e-05, + "loss": 0.8205, + "step": 5150 + }, + { + "epoch": 3.680456490727532, + "grad_norm": 7.28125, + "learning_rate": 8.818222222222222e-05, + "loss": 0.7993, + "step": 5160 + }, + { + "epoch": 3.6875891583452214, + "grad_norm": 36.0, + "learning_rate": 8.813777777777778e-05, + "loss": 0.9083, + "step": 5170 + }, + { + "epoch": 3.69472182596291, + "grad_norm": 8.125, + "learning_rate": 8.809333333333333e-05, + "loss": 0.9264, + "step": 5180 + }, + { + "epoch": 3.701854493580599, + "grad_norm": 10.75, + "learning_rate": 8.80488888888889e-05, + "loss": 0.8496, + "step": 5190 + }, + { + "epoch": 3.708987161198288, + "grad_norm": 7.78125, + "learning_rate": 8.800444444444444e-05, + "loss": 0.8718, + "step": 5200 + }, + { + "epoch": 3.708987161198288, + "eval/acc": 39.53488540649414, + "step": 5200 + }, + { + "epoch": 3.708987161198288, + "eval_loss": 2.0305864810943604, + "eval_runtime": 0.222, + "eval_samples_per_second": 193.683, + "eval_steps_per_second": 4.504, + "step": 5200 + }, + { + "epoch": 3.716119828815977, + "grad_norm": 9.3125, + "learning_rate": 8.796e-05, + "loss": 1.0077, + "step": 5210 + }, + { + "epoch": 3.723252496433666, + "grad_norm": 11.4375, + "learning_rate": 8.791555555555557e-05, + "loss": 0.8364, + "step": 5220 + }, + { + "epoch": 3.7303851640513552, + "grad_norm": 15.125, + "learning_rate": 8.787111111111112e-05, + "loss": 0.8557, + "step": 5230 + }, + { + "epoch": 3.7375178316690443, + "grad_norm": 7.875, + "learning_rate": 8.782666666666666e-05, + "loss": 0.8674, + "step": 5240 + }, + { + "epoch": 3.7446504992867333, + "grad_norm": 7.84375, + "learning_rate": 8.778222222222223e-05, + "loss": 0.8788, + "step": 5250 + }, + { + "epoch": 3.7517831669044224, + "grad_norm": 7.59375, + "learning_rate": 8.773777777777779e-05, + "loss": 0.8098, + "step": 5260 + }, + { + "epoch": 3.7589158345221114, + "grad_norm": 7.40625, + "learning_rate": 8.769333333333334e-05, + "loss": 0.8895, + "step": 5270 + }, + { + "epoch": 3.7660485021398005, + "grad_norm": 6.78125, + "learning_rate": 8.76488888888889e-05, + "loss": 0.823, + "step": 5280 + }, + { + "epoch": 3.773181169757489, + "grad_norm": 8.125, + "learning_rate": 8.760444444444445e-05, + "loss": 0.8418, + "step": 5290 + }, + { + "epoch": 3.7803138373751786, + "grad_norm": 8.4375, + "learning_rate": 8.756000000000001e-05, + "loss": 0.8202, + "step": 5300 + }, + { + "epoch": 3.7803138373751786, + "eval/acc": 41.86046600341797, + "step": 5300 + }, + { + "epoch": 3.7803138373751786, + "eval_loss": 2.100001811981201, + "eval_runtime": 0.2237, + "eval_samples_per_second": 192.218, + "eval_steps_per_second": 4.47, + "step": 5300 + }, + { + "epoch": 3.787446504992867, + "grad_norm": 7.78125, + "learning_rate": 8.751555555555556e-05, + "loss": 0.9786, + "step": 5310 + }, + { + "epoch": 3.794579172610556, + "grad_norm": 14.125, + "learning_rate": 8.747111111111112e-05, + "loss": 1.0893, + "step": 5320 + }, + { + "epoch": 3.8017118402282453, + "grad_norm": 6.71875, + "learning_rate": 8.742666666666667e-05, + "loss": 0.8484, + "step": 5330 + }, + { + "epoch": 3.8088445078459343, + "grad_norm": 7.53125, + "learning_rate": 8.738222222222222e-05, + "loss": 0.922, + "step": 5340 + }, + { + "epoch": 3.8159771754636234, + "grad_norm": 6.9375, + "learning_rate": 8.733777777777779e-05, + "loss": 0.87, + "step": 5350 + }, + { + "epoch": 3.8231098430813124, + "grad_norm": 6.75, + "learning_rate": 8.729333333333334e-05, + "loss": 0.9272, + "step": 5360 + }, + { + "epoch": 3.8302425106990015, + "grad_norm": 6.875, + "learning_rate": 8.724888888888889e-05, + "loss": 0.8358, + "step": 5370 + }, + { + "epoch": 3.8373751783166905, + "grad_norm": 7.53125, + "learning_rate": 8.720444444444445e-05, + "loss": 0.8764, + "step": 5380 + }, + { + "epoch": 3.8445078459343796, + "grad_norm": 7.96875, + "learning_rate": 8.716000000000001e-05, + "loss": 0.9348, + "step": 5390 + }, + { + "epoch": 3.8516405135520686, + "grad_norm": 7.5625, + "learning_rate": 8.711555555555556e-05, + "loss": 0.9033, + "step": 5400 + }, + { + "epoch": 3.8516405135520686, + "eval/acc": 39.53488540649414, + "step": 5400 + }, + { + "epoch": 3.8516405135520686, + "eval_loss": 2.0633187294006348, + "eval_runtime": 0.2247, + "eval_samples_per_second": 191.324, + "eval_steps_per_second": 4.449, + "step": 5400 + }, + { + "epoch": 3.8587731811697576, + "grad_norm": 6.90625, + "learning_rate": 8.707111111111111e-05, + "loss": 0.9344, + "step": 5410 + }, + { + "epoch": 3.8659058487874463, + "grad_norm": 7.5, + "learning_rate": 8.702666666666667e-05, + "loss": 0.9346, + "step": 5420 + }, + { + "epoch": 3.8730385164051357, + "grad_norm": 7.03125, + "learning_rate": 8.698222222222223e-05, + "loss": 0.8835, + "step": 5430 + }, + { + "epoch": 3.8801711840228243, + "grad_norm": 6.3125, + "learning_rate": 8.693777777777778e-05, + "loss": 0.8434, + "step": 5440 + }, + { + "epoch": 3.8873038516405134, + "grad_norm": 7.03125, + "learning_rate": 8.689333333333334e-05, + "loss": 0.8555, + "step": 5450 + }, + { + "epoch": 3.8944365192582024, + "grad_norm": 8.0, + "learning_rate": 8.684888888888889e-05, + "loss": 0.9287, + "step": 5460 + }, + { + "epoch": 3.9015691868758915, + "grad_norm": 8.1875, + "learning_rate": 8.680444444444444e-05, + "loss": 0.8738, + "step": 5470 + }, + { + "epoch": 3.9087018544935805, + "grad_norm": 7.96875, + "learning_rate": 8.676e-05, + "loss": 0.8189, + "step": 5480 + }, + { + "epoch": 3.9158345221112696, + "grad_norm": 10.1875, + "learning_rate": 8.671555555555556e-05, + "loss": 0.8983, + "step": 5490 + }, + { + "epoch": 3.9229671897289586, + "grad_norm": 10.375, + "learning_rate": 8.667111111111111e-05, + "loss": 0.8083, + "step": 5500 + }, + { + "epoch": 3.9229671897289586, + "eval/acc": 39.53488540649414, + "step": 5500 + }, + { + "epoch": 3.9229671897289586, + "eval_loss": 2.089243173599243, + "eval_runtime": 0.2203, + "eval_samples_per_second": 195.23, + "eval_steps_per_second": 4.54, + "step": 5500 + }, + { + "epoch": 3.9300998573466477, + "grad_norm": 13.125, + "learning_rate": 8.662666666666666e-05, + "loss": 0.8747, + "step": 5510 + }, + { + "epoch": 3.9372325249643367, + "grad_norm": 8.25, + "learning_rate": 8.658222222222224e-05, + "loss": 0.8609, + "step": 5520 + }, + { + "epoch": 3.944365192582026, + "grad_norm": 6.75, + "learning_rate": 8.653777777777779e-05, + "loss": 0.8563, + "step": 5530 + }, + { + "epoch": 3.951497860199715, + "grad_norm": 7.75, + "learning_rate": 8.649333333333333e-05, + "loss": 0.8912, + "step": 5540 + }, + { + "epoch": 3.9586305278174034, + "grad_norm": 6.40625, + "learning_rate": 8.64488888888889e-05, + "loss": 0.7477, + "step": 5550 + }, + { + "epoch": 3.965763195435093, + "grad_norm": 7.0, + "learning_rate": 8.640444444444444e-05, + "loss": 0.8185, + "step": 5560 + }, + { + "epoch": 3.9728958630527815, + "grad_norm": 5.6875, + "learning_rate": 8.636e-05, + "loss": 0.9497, + "step": 5570 + }, + { + "epoch": 3.980028530670471, + "grad_norm": 8.0, + "learning_rate": 8.631555555555556e-05, + "loss": 0.8117, + "step": 5580 + }, + { + "epoch": 3.9871611982881596, + "grad_norm": 6.625, + "learning_rate": 8.627111111111112e-05, + "loss": 0.8245, + "step": 5590 + }, + { + "epoch": 3.9942938659058487, + "grad_norm": 6.96875, + "learning_rate": 8.622666666666667e-05, + "loss": 0.902, + "step": 5600 + }, + { + "epoch": 3.9942938659058487, + "eval/acc": 39.53488540649414, + "step": 5600 + }, + { + "epoch": 3.9942938659058487, + "eval_loss": 2.186225652694702, + "eval_runtime": 0.2194, + "eval_samples_per_second": 196.001, + "eval_steps_per_second": 4.558, + "step": 5600 + }, + { + "epoch": 4.001426533523538, + "grad_norm": 6.78125, + "learning_rate": 8.618222222222223e-05, + "loss": 0.8757, + "step": 5610 + }, + { + "epoch": 4.008559201141227, + "grad_norm": 11.0625, + "learning_rate": 8.613777777777779e-05, + "loss": 0.885, + "step": 5620 + }, + { + "epoch": 4.015691868758916, + "grad_norm": 6.4375, + "learning_rate": 8.609333333333334e-05, + "loss": 0.8611, + "step": 5630 + }, + { + "epoch": 4.022824536376604, + "grad_norm": 14.8125, + "learning_rate": 8.604888888888889e-05, + "loss": 0.8262, + "step": 5640 + }, + { + "epoch": 4.029957203994294, + "grad_norm": 8.0625, + "learning_rate": 8.600444444444445e-05, + "loss": 0.7549, + "step": 5650 + }, + { + "epoch": 4.0370898716119825, + "grad_norm": 6.84375, + "learning_rate": 8.596000000000001e-05, + "loss": 0.8725, + "step": 5660 + }, + { + "epoch": 4.044222539229672, + "grad_norm": 8.0, + "learning_rate": 8.591555555555556e-05, + "loss": 0.8846, + "step": 5670 + }, + { + "epoch": 4.051355206847361, + "grad_norm": 7.84375, + "learning_rate": 8.587111111111111e-05, + "loss": 0.9373, + "step": 5680 + }, + { + "epoch": 4.05848787446505, + "grad_norm": 6.84375, + "learning_rate": 8.582666666666667e-05, + "loss": 0.7823, + "step": 5690 + }, + { + "epoch": 4.065620542082739, + "grad_norm": 11.4375, + "learning_rate": 8.578222222222223e-05, + "loss": 0.9588, + "step": 5700 + }, + { + "epoch": 4.065620542082739, + "eval/acc": 37.20930099487305, + "step": 5700 + }, + { + "epoch": 4.065620542082739, + "eval_loss": 2.841008424758911, + "eval_runtime": 1.3984, + "eval_samples_per_second": 30.749, + "eval_steps_per_second": 0.715, + "step": 5700 + }, + { + "epoch": 4.072753209700428, + "grad_norm": 5.5625, + "learning_rate": 8.573777777777778e-05, + "loss": 0.8014, + "step": 5710 + }, + { + "epoch": 4.079885877318117, + "grad_norm": 6.90625, + "learning_rate": 8.569333333333334e-05, + "loss": 0.818, + "step": 5720 + }, + { + "epoch": 4.087018544935806, + "grad_norm": 8.4375, + "learning_rate": 8.564888888888889e-05, + "loss": 0.8142, + "step": 5730 + }, + { + "epoch": 4.094151212553495, + "grad_norm": 7.75, + "learning_rate": 8.560444444444445e-05, + "loss": 0.863, + "step": 5740 + }, + { + "epoch": 4.101283880171184, + "grad_norm": 6.90625, + "learning_rate": 8.556e-05, + "loss": 0.8501, + "step": 5750 + }, + { + "epoch": 4.108416547788873, + "grad_norm": 7.15625, + "learning_rate": 8.551555555555556e-05, + "loss": 0.8293, + "step": 5760 + }, + { + "epoch": 4.1155492154065625, + "grad_norm": 8.125, + "learning_rate": 8.547111111111111e-05, + "loss": 0.8655, + "step": 5770 + }, + { + "epoch": 4.122681883024251, + "grad_norm": 7.75, + "learning_rate": 8.542666666666666e-05, + "loss": 0.7958, + "step": 5780 + }, + { + "epoch": 4.12981455064194, + "grad_norm": 8.3125, + "learning_rate": 8.538222222222224e-05, + "loss": 0.9186, + "step": 5790 + }, + { + "epoch": 4.136947218259629, + "grad_norm": 7.0625, + "learning_rate": 8.533777777777778e-05, + "loss": 0.9135, + "step": 5800 + }, + { + "epoch": 4.136947218259629, + "eval/acc": 37.20930099487305, + "step": 5800 + }, + { + "epoch": 4.136947218259629, + "eval_loss": 2.8186914920806885, + "eval_runtime": 0.2153, + "eval_samples_per_second": 199.722, + "eval_steps_per_second": 4.645, + "step": 5800 + }, + { + "epoch": 4.144079885877318, + "grad_norm": 8.125, + "learning_rate": 8.529333333333333e-05, + "loss": 0.8248, + "step": 5810 + }, + { + "epoch": 4.151212553495007, + "grad_norm": 7.65625, + "learning_rate": 8.52488888888889e-05, + "loss": 0.9186, + "step": 5820 + }, + { + "epoch": 4.158345221112696, + "grad_norm": 7.6875, + "learning_rate": 8.520444444444446e-05, + "loss": 0.8367, + "step": 5830 + }, + { + "epoch": 4.165477888730385, + "grad_norm": 9.75, + "learning_rate": 8.516e-05, + "loss": 0.8898, + "step": 5840 + }, + { + "epoch": 4.172610556348074, + "grad_norm": 8.5625, + "learning_rate": 8.511555555555555e-05, + "loss": 0.9218, + "step": 5850 + }, + { + "epoch": 4.1797432239657635, + "grad_norm": 6.0, + "learning_rate": 8.507111111111112e-05, + "loss": 0.8784, + "step": 5860 + }, + { + "epoch": 4.186875891583452, + "grad_norm": 8.5625, + "learning_rate": 8.502666666666666e-05, + "loss": 0.8361, + "step": 5870 + }, + { + "epoch": 4.194008559201142, + "grad_norm": 7.40625, + "learning_rate": 8.498222222222223e-05, + "loss": 0.816, + "step": 5880 + }, + { + "epoch": 4.20114122681883, + "grad_norm": 7.84375, + "learning_rate": 8.493777777777779e-05, + "loss": 0.897, + "step": 5890 + }, + { + "epoch": 4.20827389443652, + "grad_norm": 10.0625, + "learning_rate": 8.489333333333334e-05, + "loss": 0.7807, + "step": 5900 + }, + { + "epoch": 4.20827389443652, + "eval/acc": 37.20930099487305, + "step": 5900 + }, + { + "epoch": 4.20827389443652, + "eval_loss": 2.890333890914917, + "eval_runtime": 0.2187, + "eval_samples_per_second": 196.595, + "eval_steps_per_second": 4.572, + "step": 5900 + }, + { + "epoch": 4.215406562054208, + "grad_norm": 7.6875, + "learning_rate": 8.484888888888888e-05, + "loss": 0.8786, + "step": 5910 + }, + { + "epoch": 4.222539229671897, + "grad_norm": 7.46875, + "learning_rate": 8.480444444444445e-05, + "loss": 0.8689, + "step": 5920 + }, + { + "epoch": 4.229671897289586, + "grad_norm": 14.125, + "learning_rate": 8.476000000000001e-05, + "loss": 0.83, + "step": 5930 + }, + { + "epoch": 4.236804564907275, + "grad_norm": 6.09375, + "learning_rate": 8.471555555555556e-05, + "loss": 0.8921, + "step": 5940 + }, + { + "epoch": 4.2439372325249645, + "grad_norm": 8.875, + "learning_rate": 8.467111111111112e-05, + "loss": 0.9293, + "step": 5950 + }, + { + "epoch": 4.251069900142653, + "grad_norm": 10.5625, + "learning_rate": 8.462666666666667e-05, + "loss": 0.7955, + "step": 5960 + }, + { + "epoch": 4.258202567760343, + "grad_norm": 15.25, + "learning_rate": 8.458222222222223e-05, + "loss": 0.9267, + "step": 5970 + }, + { + "epoch": 4.265335235378031, + "grad_norm": 8.0, + "learning_rate": 8.453777777777778e-05, + "loss": 0.7665, + "step": 5980 + }, + { + "epoch": 4.272467902995721, + "grad_norm": 6.4375, + "learning_rate": 8.449333333333334e-05, + "loss": 0.8212, + "step": 5990 + }, + { + "epoch": 4.279600570613409, + "grad_norm": 8.0625, + "learning_rate": 8.444888888888889e-05, + "loss": 0.8294, + "step": 6000 + }, + { + "epoch": 4.279600570613409, + "eval/acc": 34.88372039794922, + "step": 6000 + }, + { + "epoch": 4.279600570613409, + "eval_loss": 2.8812708854675293, + "eval_runtime": 0.2262, + "eval_samples_per_second": 190.082, + "eval_steps_per_second": 4.421, + "step": 6000 + }, + { + "epoch": 4.286733238231099, + "grad_norm": 5.625, + "learning_rate": 8.440444444444445e-05, + "loss": 0.8813, + "step": 6010 + }, + { + "epoch": 4.293865905848787, + "grad_norm": 8.375, + "learning_rate": 8.436000000000001e-05, + "loss": 0.8792, + "step": 6020 + }, + { + "epoch": 4.300998573466477, + "grad_norm": 9.125, + "learning_rate": 8.431555555555556e-05, + "loss": 0.9509, + "step": 6030 + }, + { + "epoch": 4.3081312410841655, + "grad_norm": 7.34375, + "learning_rate": 8.427111111111111e-05, + "loss": 0.9452, + "step": 6040 + }, + { + "epoch": 4.315263908701855, + "grad_norm": 8.25, + "learning_rate": 8.422666666666667e-05, + "loss": 0.8801, + "step": 6050 + }, + { + "epoch": 4.3223965763195435, + "grad_norm": 6.75, + "learning_rate": 8.418222222222223e-05, + "loss": 0.805, + "step": 6060 + }, + { + "epoch": 4.329529243937232, + "grad_norm": 8.375, + "learning_rate": 8.413777777777778e-05, + "loss": 0.8176, + "step": 6070 + }, + { + "epoch": 4.336661911554922, + "grad_norm": 6.1875, + "learning_rate": 8.409333333333333e-05, + "loss": 0.8662, + "step": 6080 + }, + { + "epoch": 4.34379457917261, + "grad_norm": 6.03125, + "learning_rate": 8.404888888888889e-05, + "loss": 0.9121, + "step": 6090 + }, + { + "epoch": 4.3509272467903, + "grad_norm": 5.6875, + "learning_rate": 8.400444444444445e-05, + "loss": 0.8697, + "step": 6100 + }, + { + "epoch": 4.3509272467903, + "eval/acc": 39.53488540649414, + "step": 6100 + }, + { + "epoch": 4.3509272467903, + "eval_loss": 2.7605249881744385, + "eval_runtime": 0.2226, + "eval_samples_per_second": 193.191, + "eval_steps_per_second": 4.493, + "step": 6100 + }, + { + "epoch": 4.358059914407988, + "grad_norm": 8.125, + "learning_rate": 8.396e-05, + "loss": 0.783, + "step": 6110 + }, + { + "epoch": 4.365192582025678, + "grad_norm": 6.71875, + "learning_rate": 8.391555555555556e-05, + "loss": 0.7273, + "step": 6120 + }, + { + "epoch": 4.372325249643366, + "grad_norm": 7.625, + "learning_rate": 8.387111111111111e-05, + "loss": 0.9497, + "step": 6130 + }, + { + "epoch": 4.379457917261056, + "grad_norm": 7.625, + "learning_rate": 8.382666666666667e-05, + "loss": 0.9318, + "step": 6140 + }, + { + "epoch": 4.3865905848787445, + "grad_norm": 7.5625, + "learning_rate": 8.378222222222222e-05, + "loss": 0.7827, + "step": 6150 + }, + { + "epoch": 4.393723252496434, + "grad_norm": 7.4375, + "learning_rate": 8.373777777777779e-05, + "loss": 0.8471, + "step": 6160 + }, + { + "epoch": 4.400855920114123, + "grad_norm": 5.59375, + "learning_rate": 8.369333333333333e-05, + "loss": 0.866, + "step": 6170 + }, + { + "epoch": 4.407988587731811, + "grad_norm": 5.34375, + "learning_rate": 8.364888888888888e-05, + "loss": 0.8237, + "step": 6180 + }, + { + "epoch": 4.415121255349501, + "grad_norm": 9.375, + "learning_rate": 8.360444444444446e-05, + "loss": 0.896, + "step": 6190 + }, + { + "epoch": 4.422253922967189, + "grad_norm": 7.78125, + "learning_rate": 8.356e-05, + "loss": 0.8402, + "step": 6200 + }, + { + "epoch": 4.422253922967189, + "eval/acc": 37.20930099487305, + "step": 6200 + }, + { + "epoch": 4.422253922967189, + "eval_loss": 2.8444175720214844, + "eval_runtime": 0.2217, + "eval_samples_per_second": 193.997, + "eval_steps_per_second": 4.512, + "step": 6200 + }, + { + "epoch": 4.429386590584879, + "grad_norm": 7.625, + "learning_rate": 8.351555555555555e-05, + "loss": 0.8708, + "step": 6210 + }, + { + "epoch": 4.436519258202567, + "grad_norm": 7.28125, + "learning_rate": 8.347111111111112e-05, + "loss": 0.8505, + "step": 6220 + }, + { + "epoch": 4.443651925820257, + "grad_norm": 7.28125, + "learning_rate": 8.342666666666668e-05, + "loss": 0.878, + "step": 6230 + }, + { + "epoch": 4.4507845934379455, + "grad_norm": 8.0, + "learning_rate": 8.338222222222223e-05, + "loss": 0.7568, + "step": 6240 + }, + { + "epoch": 4.457917261055635, + "grad_norm": 7.28125, + "learning_rate": 8.333777777777778e-05, + "loss": 0.7909, + "step": 6250 + }, + { + "epoch": 4.465049928673324, + "grad_norm": 10.625, + "learning_rate": 8.329333333333334e-05, + "loss": 0.8732, + "step": 6260 + }, + { + "epoch": 4.472182596291013, + "grad_norm": 7.40625, + "learning_rate": 8.324888888888889e-05, + "loss": 0.8827, + "step": 6270 + }, + { + "epoch": 4.479315263908702, + "grad_norm": 11.25, + "learning_rate": 8.320444444444445e-05, + "loss": 0.7889, + "step": 6280 + }, + { + "epoch": 4.486447931526391, + "grad_norm": 7.59375, + "learning_rate": 8.316000000000001e-05, + "loss": 0.7808, + "step": 6290 + }, + { + "epoch": 4.49358059914408, + "grad_norm": 5.40625, + "learning_rate": 8.311555555555556e-05, + "loss": 0.8223, + "step": 6300 + }, + { + "epoch": 4.49358059914408, + "eval/acc": 37.20930099487305, + "step": 6300 + }, + { + "epoch": 4.49358059914408, + "eval_loss": 2.798743963241577, + "eval_runtime": 0.2178, + "eval_samples_per_second": 197.44, + "eval_steps_per_second": 4.592, + "step": 6300 + }, + { + "epoch": 4.500713266761769, + "grad_norm": 7.9375, + "learning_rate": 8.307111111111111e-05, + "loss": 0.8588, + "step": 6310 + }, + { + "epoch": 4.507845934379458, + "grad_norm": 8.0625, + "learning_rate": 8.302666666666667e-05, + "loss": 0.9003, + "step": 6320 + }, + { + "epoch": 4.5149786019971465, + "grad_norm": 7.21875, + "learning_rate": 8.298222222222223e-05, + "loss": 0.8942, + "step": 6330 + }, + { + "epoch": 4.522111269614836, + "grad_norm": 7.625, + "learning_rate": 8.293777777777778e-05, + "loss": 0.8622, + "step": 6340 + }, + { + "epoch": 4.529243937232525, + "grad_norm": 5.53125, + "learning_rate": 8.289333333333333e-05, + "loss": 0.8048, + "step": 6350 + }, + { + "epoch": 4.536376604850214, + "grad_norm": 9.125, + "learning_rate": 8.28488888888889e-05, + "loss": 0.8506, + "step": 6360 + }, + { + "epoch": 4.543509272467903, + "grad_norm": 6.125, + "learning_rate": 8.280444444444445e-05, + "loss": 0.7767, + "step": 6370 + }, + { + "epoch": 4.550641940085592, + "grad_norm": 6.90625, + "learning_rate": 8.276e-05, + "loss": 0.9143, + "step": 6380 + }, + { + "epoch": 4.557774607703281, + "grad_norm": 5.84375, + "learning_rate": 8.271555555555556e-05, + "loss": 0.8641, + "step": 6390 + }, + { + "epoch": 4.56490727532097, + "grad_norm": 6.3125, + "learning_rate": 8.267111111111111e-05, + "loss": 0.8297, + "step": 6400 + }, + { + "epoch": 4.56490727532097, + "eval/acc": 37.20930099487305, + "step": 6400 + }, + { + "epoch": 4.56490727532097, + "eval_loss": 2.804457426071167, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.742, + "eval_steps_per_second": 4.529, + "step": 6400 + }, + { + "epoch": 4.572039942938659, + "grad_norm": 7.15625, + "learning_rate": 8.262666666666667e-05, + "loss": 0.7398, + "step": 6410 + }, + { + "epoch": 4.579172610556348, + "grad_norm": 6.125, + "learning_rate": 8.258222222222222e-05, + "loss": 0.8443, + "step": 6420 + }, + { + "epoch": 4.586305278174037, + "grad_norm": 9.25, + "learning_rate": 8.253777777777778e-05, + "loss": 0.7983, + "step": 6430 + }, + { + "epoch": 4.5934379457917265, + "grad_norm": 7.3125, + "learning_rate": 8.249333333333333e-05, + "loss": 0.9705, + "step": 6440 + }, + { + "epoch": 4.600570613409415, + "grad_norm": 7.34375, + "learning_rate": 8.24488888888889e-05, + "loss": 1.0079, + "step": 6450 + }, + { + "epoch": 4.607703281027105, + "grad_norm": 8.875, + "learning_rate": 8.240444444444446e-05, + "loss": 0.8982, + "step": 6460 + }, + { + "epoch": 4.614835948644793, + "grad_norm": 8.375, + "learning_rate": 8.236e-05, + "loss": 0.8417, + "step": 6470 + }, + { + "epoch": 4.621968616262482, + "grad_norm": 7.78125, + "learning_rate": 8.231555555555555e-05, + "loss": 0.8566, + "step": 6480 + }, + { + "epoch": 4.629101283880171, + "grad_norm": 6.5625, + "learning_rate": 8.227111111111111e-05, + "loss": 0.8155, + "step": 6490 + }, + { + "epoch": 4.63623395149786, + "grad_norm": 5.875, + "learning_rate": 8.222666666666668e-05, + "loss": 0.9449, + "step": 6500 + }, + { + "epoch": 4.63623395149786, + "eval/acc": 41.86046600341797, + "step": 6500 + }, + { + "epoch": 4.63623395149786, + "eval_loss": 2.761596918106079, + "eval_runtime": 0.2144, + "eval_samples_per_second": 200.549, + "eval_steps_per_second": 4.664, + "step": 6500 + }, + { + "epoch": 4.643366619115549, + "grad_norm": 7.5, + "learning_rate": 8.218222222222223e-05, + "loss": 0.8549, + "step": 6510 + }, + { + "epoch": 4.650499286733238, + "grad_norm": 7.0625, + "learning_rate": 8.213777777777777e-05, + "loss": 0.8473, + "step": 6520 + }, + { + "epoch": 4.6576319543509275, + "grad_norm": 7.1875, + "learning_rate": 8.209333333333334e-05, + "loss": 0.8773, + "step": 6530 + }, + { + "epoch": 4.664764621968616, + "grad_norm": 7.25, + "learning_rate": 8.20488888888889e-05, + "loss": 0.789, + "step": 6540 + }, + { + "epoch": 4.671897289586306, + "grad_norm": 7.34375, + "learning_rate": 8.200444444444445e-05, + "loss": 0.852, + "step": 6550 + }, + { + "epoch": 4.679029957203994, + "grad_norm": 5.65625, + "learning_rate": 8.196000000000001e-05, + "loss": 0.8291, + "step": 6560 + }, + { + "epoch": 4.686162624821684, + "grad_norm": 5.5625, + "learning_rate": 8.191555555555556e-05, + "loss": 0.7943, + "step": 6570 + }, + { + "epoch": 4.693295292439372, + "grad_norm": 9.25, + "learning_rate": 8.18711111111111e-05, + "loss": 0.8418, + "step": 6580 + }, + { + "epoch": 4.700427960057061, + "grad_norm": 6.75, + "learning_rate": 8.182666666666667e-05, + "loss": 0.8661, + "step": 6590 + }, + { + "epoch": 4.70756062767475, + "grad_norm": 7.40625, + "learning_rate": 8.178222222222223e-05, + "loss": 0.768, + "step": 6600 + }, + { + "epoch": 4.70756062767475, + "eval/acc": 41.86046600341797, + "step": 6600 + }, + { + "epoch": 4.70756062767475, + "eval_loss": 2.8003947734832764, + "eval_runtime": 0.2209, + "eval_samples_per_second": 194.665, + "eval_steps_per_second": 4.527, + "step": 6600 + }, + { + "epoch": 4.71469329529244, + "grad_norm": 7.1875, + "learning_rate": 8.173777777777778e-05, + "loss": 0.9038, + "step": 6610 + }, + { + "epoch": 4.7218259629101285, + "grad_norm": 6.46875, + "learning_rate": 8.169333333333334e-05, + "loss": 0.7185, + "step": 6620 + }, + { + "epoch": 4.728958630527817, + "grad_norm": 6.3125, + "learning_rate": 8.16488888888889e-05, + "loss": 0.9515, + "step": 6630 + }, + { + "epoch": 4.736091298145507, + "grad_norm": 6.46875, + "learning_rate": 8.160444444444445e-05, + "loss": 0.8127, + "step": 6640 + }, + { + "epoch": 4.743223965763195, + "grad_norm": 6.4375, + "learning_rate": 8.156e-05, + "loss": 0.8914, + "step": 6650 + }, + { + "epoch": 4.750356633380885, + "grad_norm": 6.8125, + "learning_rate": 8.151555555555556e-05, + "loss": 0.8545, + "step": 6660 + }, + { + "epoch": 4.757489300998573, + "grad_norm": 7.21875, + "learning_rate": 8.147111111111112e-05, + "loss": 0.6783, + "step": 6670 + }, + { + "epoch": 4.764621968616263, + "grad_norm": 7.03125, + "learning_rate": 8.142666666666667e-05, + "loss": 0.9337, + "step": 6680 + }, + { + "epoch": 4.771754636233951, + "grad_norm": 10.5625, + "learning_rate": 8.138222222222223e-05, + "loss": 0.8181, + "step": 6690 + }, + { + "epoch": 4.778887303851641, + "grad_norm": 7.375, + "learning_rate": 8.133777777777778e-05, + "loss": 0.8639, + "step": 6700 + }, + { + "epoch": 4.778887303851641, + "eval/acc": 37.20930099487305, + "step": 6700 + }, + { + "epoch": 4.778887303851641, + "eval_loss": 2.8262782096862793, + "eval_runtime": 0.2194, + "eval_samples_per_second": 195.949, + "eval_steps_per_second": 4.557, + "step": 6700 + }, + { + "epoch": 4.7860199714693294, + "grad_norm": 10.8125, + "learning_rate": 8.129333333333333e-05, + "loss": 0.8742, + "step": 6710 + }, + { + "epoch": 4.793152639087019, + "grad_norm": 5.53125, + "learning_rate": 8.124888888888889e-05, + "loss": 0.7438, + "step": 6720 + }, + { + "epoch": 4.8002853067047075, + "grad_norm": 6.65625, + "learning_rate": 8.120444444444445e-05, + "loss": 0.7859, + "step": 6730 + }, + { + "epoch": 4.807417974322396, + "grad_norm": 6.78125, + "learning_rate": 8.116e-05, + "loss": 0.8942, + "step": 6740 + }, + { + "epoch": 4.814550641940086, + "grad_norm": 8.4375, + "learning_rate": 8.111555555555555e-05, + "loss": 0.8483, + "step": 6750 + }, + { + "epoch": 4.821683309557774, + "grad_norm": 6.40625, + "learning_rate": 8.107111111111113e-05, + "loss": 0.8284, + "step": 6760 + }, + { + "epoch": 4.828815977175464, + "grad_norm": 6.84375, + "learning_rate": 8.102666666666667e-05, + "loss": 0.8887, + "step": 6770 + }, + { + "epoch": 4.835948644793152, + "grad_norm": 8.875, + "learning_rate": 8.098222222222222e-05, + "loss": 0.8431, + "step": 6780 + }, + { + "epoch": 4.843081312410842, + "grad_norm": 6.90625, + "learning_rate": 8.093777777777779e-05, + "loss": 0.8325, + "step": 6790 + }, + { + "epoch": 4.85021398002853, + "grad_norm": 7.0, + "learning_rate": 8.089333333333333e-05, + "loss": 0.7742, + "step": 6800 + }, + { + "epoch": 4.85021398002853, + "eval/acc": 39.53488540649414, + "step": 6800 + }, + { + "epoch": 4.85021398002853, + "eval_loss": 2.7403292655944824, + "eval_runtime": 0.5509, + "eval_samples_per_second": 78.059, + "eval_steps_per_second": 1.815, + "step": 6800 + }, + { + "epoch": 4.85734664764622, + "grad_norm": 6.625, + "learning_rate": 8.08488888888889e-05, + "loss": 0.8418, + "step": 6810 + }, + { + "epoch": 4.8644793152639085, + "grad_norm": 7.65625, + "learning_rate": 8.080444444444444e-05, + "loss": 0.9022, + "step": 6820 + }, + { + "epoch": 4.871611982881598, + "grad_norm": 7.75, + "learning_rate": 8.076e-05, + "loss": 0.8201, + "step": 6830 + }, + { + "epoch": 4.878744650499287, + "grad_norm": 7.84375, + "learning_rate": 8.071555555555555e-05, + "loss": 0.8144, + "step": 6840 + }, + { + "epoch": 4.885877318116976, + "grad_norm": 8.3125, + "learning_rate": 8.067111111111112e-05, + "loss": 0.8821, + "step": 6850 + }, + { + "epoch": 4.893009985734665, + "grad_norm": 9.0, + "learning_rate": 8.062666666666668e-05, + "loss": 0.8572, + "step": 6860 + }, + { + "epoch": 4.900142653352354, + "grad_norm": 10.0, + "learning_rate": 8.058222222222223e-05, + "loss": 0.7498, + "step": 6870 + }, + { + "epoch": 4.907275320970043, + "grad_norm": 6.09375, + "learning_rate": 8.053777777777778e-05, + "loss": 0.8709, + "step": 6880 + }, + { + "epoch": 4.914407988587731, + "grad_norm": 7.84375, + "learning_rate": 8.049333333333334e-05, + "loss": 0.8045, + "step": 6890 + }, + { + "epoch": 4.921540656205421, + "grad_norm": 7.0625, + "learning_rate": 8.04488888888889e-05, + "loss": 0.8919, + "step": 6900 + }, + { + "epoch": 4.921540656205421, + "eval/acc": 34.88372039794922, + "step": 6900 + }, + { + "epoch": 4.921540656205421, + "eval_loss": 2.8702921867370605, + "eval_runtime": 0.2215, + "eval_samples_per_second": 194.143, + "eval_steps_per_second": 4.515, + "step": 6900 + }, + { + "epoch": 4.9286733238231095, + "grad_norm": 18.125, + "learning_rate": 8.040444444444445e-05, + "loss": 0.8407, + "step": 6910 + }, + { + "epoch": 4.935805991440799, + "grad_norm": 7.8125, + "learning_rate": 8.036e-05, + "loss": 0.9023, + "step": 6920 + }, + { + "epoch": 4.942938659058488, + "grad_norm": 6.53125, + "learning_rate": 8.031555555555556e-05, + "loss": 0.7747, + "step": 6930 + }, + { + "epoch": 4.950071326676177, + "grad_norm": 7.3125, + "learning_rate": 8.027111111111112e-05, + "loss": 0.7357, + "step": 6940 + }, + { + "epoch": 4.957203994293866, + "grad_norm": 5.71875, + "learning_rate": 8.022666666666667e-05, + "loss": 0.8914, + "step": 6950 + }, + { + "epoch": 4.964336661911555, + "grad_norm": 7.9375, + "learning_rate": 8.018222222222223e-05, + "loss": 0.8626, + "step": 6960 + }, + { + "epoch": 4.971469329529244, + "grad_norm": 6.9375, + "learning_rate": 8.013777777777778e-05, + "loss": 0.8388, + "step": 6970 + }, + { + "epoch": 4.978601997146933, + "grad_norm": 6.5, + "learning_rate": 8.009333333333334e-05, + "loss": 0.8321, + "step": 6980 + }, + { + "epoch": 4.985734664764622, + "grad_norm": 6.6875, + "learning_rate": 8.004888888888889e-05, + "loss": 0.8276, + "step": 6990 + }, + { + "epoch": 4.9928673323823105, + "grad_norm": 10.5625, + "learning_rate": 8.000444444444445e-05, + "loss": 0.8847, + "step": 7000 + }, + { + "epoch": 4.9928673323823105, + "eval/acc": 39.53488540649414, + "step": 7000 + }, + { + "epoch": 4.9928673323823105, + "eval_loss": 2.7940218448638916, + "eval_runtime": 0.2239, + "eval_samples_per_second": 192.063, + "eval_steps_per_second": 4.467, + "step": 7000 + }, + { + "epoch": 5.0, + "grad_norm": 7.1875, + "learning_rate": 7.996e-05, + "loss": 0.9472, + "step": 7010 + }, + { + "epoch": 5.007132667617689, + "grad_norm": 7.25, + "learning_rate": 7.991555555555555e-05, + "loss": 0.9009, + "step": 7020 + }, + { + "epoch": 5.014265335235378, + "grad_norm": 7.34375, + "learning_rate": 7.987111111111112e-05, + "loss": 0.8805, + "step": 7030 + }, + { + "epoch": 5.021398002853067, + "grad_norm": 5.78125, + "learning_rate": 7.982666666666667e-05, + "loss": 0.8475, + "step": 7040 + }, + { + "epoch": 5.028530670470756, + "grad_norm": 5.53125, + "learning_rate": 7.978222222222222e-05, + "loss": 0.7598, + "step": 7050 + }, + { + "epoch": 5.035663338088445, + "grad_norm": 6.25, + "learning_rate": 7.973777777777778e-05, + "loss": 0.8605, + "step": 7060 + }, + { + "epoch": 5.042796005706134, + "grad_norm": 7.46875, + "learning_rate": 7.969333333333335e-05, + "loss": 0.9293, + "step": 7070 + }, + { + "epoch": 5.049928673323823, + "grad_norm": 5.9375, + "learning_rate": 7.96488888888889e-05, + "loss": 0.7984, + "step": 7080 + }, + { + "epoch": 5.057061340941512, + "grad_norm": 8.375, + "learning_rate": 7.960444444444444e-05, + "loss": 0.8222, + "step": 7090 + }, + { + "epoch": 5.064194008559201, + "grad_norm": 6.9375, + "learning_rate": 7.956e-05, + "loss": 0.8535, + "step": 7100 + }, + { + "epoch": 5.064194008559201, + "eval/acc": 41.86046600341797, + "step": 7100 + }, + { + "epoch": 5.064194008559201, + "eval_loss": 2.631981134414673, + "eval_runtime": 2.5832, + "eval_samples_per_second": 16.646, + "eval_steps_per_second": 0.387, + "step": 7100 + }, + { + "epoch": 5.0713266761768905, + "grad_norm": 6.5625, + "learning_rate": 7.951555555555555e-05, + "loss": 0.8668, + "step": 7110 + }, + { + "epoch": 5.078459343794579, + "grad_norm": 9.0, + "learning_rate": 7.947111111111111e-05, + "loss": 0.8142, + "step": 7120 + }, + { + "epoch": 5.085592011412269, + "grad_norm": 8.3125, + "learning_rate": 7.942666666666668e-05, + "loss": 0.9271, + "step": 7130 + }, + { + "epoch": 5.092724679029957, + "grad_norm": 7.875, + "learning_rate": 7.938222222222222e-05, + "loss": 0.8213, + "step": 7140 + }, + { + "epoch": 5.099857346647646, + "grad_norm": 6.8125, + "learning_rate": 7.933777777777777e-05, + "loss": 0.8511, + "step": 7150 + }, + { + "epoch": 5.106990014265335, + "grad_norm": 7.53125, + "learning_rate": 7.929333333333334e-05, + "loss": 0.8525, + "step": 7160 + }, + { + "epoch": 5.114122681883024, + "grad_norm": 7.21875, + "learning_rate": 7.92488888888889e-05, + "loss": 0.8554, + "step": 7170 + }, + { + "epoch": 5.121255349500713, + "grad_norm": 6.84375, + "learning_rate": 7.920444444444445e-05, + "loss": 0.8128, + "step": 7180 + }, + { + "epoch": 5.128388017118402, + "grad_norm": 7.84375, + "learning_rate": 7.916e-05, + "loss": 0.7726, + "step": 7190 + }, + { + "epoch": 5.1355206847360915, + "grad_norm": 7.78125, + "learning_rate": 7.911555555555556e-05, + "loss": 0.8902, + "step": 7200 + }, + { + "epoch": 5.1355206847360915, + "eval/acc": 37.20930099487305, + "step": 7200 + }, + { + "epoch": 5.1355206847360915, + "eval_loss": 2.5633885860443115, + "eval_runtime": 0.2541, + "eval_samples_per_second": 169.248, + "eval_steps_per_second": 3.936, + "step": 7200 + }, + { + "epoch": 5.14265335235378, + "grad_norm": 6.8125, + "learning_rate": 7.907111111111112e-05, + "loss": 0.7482, + "step": 7210 + }, + { + "epoch": 5.14978601997147, + "grad_norm": 42.0, + "learning_rate": 7.902666666666667e-05, + "loss": 0.9007, + "step": 7220 + }, + { + "epoch": 5.156918687589158, + "grad_norm": 6.0625, + "learning_rate": 7.898222222222223e-05, + "loss": 0.8643, + "step": 7230 + }, + { + "epoch": 5.164051355206848, + "grad_norm": 7.03125, + "learning_rate": 7.893777777777778e-05, + "loss": 0.8899, + "step": 7240 + }, + { + "epoch": 5.171184022824536, + "grad_norm": 7.53125, + "learning_rate": 7.889333333333334e-05, + "loss": 0.7462, + "step": 7250 + }, + { + "epoch": 5.178316690442226, + "grad_norm": 7.21875, + "learning_rate": 7.884888888888889e-05, + "loss": 0.9199, + "step": 7260 + }, + { + "epoch": 5.185449358059914, + "grad_norm": 8.1875, + "learning_rate": 7.880444444444445e-05, + "loss": 0.7966, + "step": 7270 + }, + { + "epoch": 5.192582025677604, + "grad_norm": 8.0, + "learning_rate": 7.876e-05, + "loss": 0.9086, + "step": 7280 + }, + { + "epoch": 5.1997146932952925, + "grad_norm": 7.46875, + "learning_rate": 7.871555555555556e-05, + "loss": 0.9184, + "step": 7290 + }, + { + "epoch": 5.206847360912981, + "grad_norm": 7.28125, + "learning_rate": 7.867111111111112e-05, + "loss": 0.742, + "step": 7300 + }, + { + "epoch": 5.206847360912981, + "eval/acc": 39.53488540649414, + "step": 7300 + }, + { + "epoch": 5.206847360912981, + "eval_loss": 2.5178542137145996, + "eval_runtime": 0.2274, + "eval_samples_per_second": 189.112, + "eval_steps_per_second": 4.398, + "step": 7300 + }, + { + "epoch": 5.2139800285306706, + "grad_norm": 10.4375, + "learning_rate": 7.862666666666667e-05, + "loss": 0.8737, + "step": 7310 + }, + { + "epoch": 5.221112696148359, + "grad_norm": 6.8125, + "learning_rate": 7.858222222222222e-05, + "loss": 0.8197, + "step": 7320 + }, + { + "epoch": 5.228245363766049, + "grad_norm": 8.125, + "learning_rate": 7.853777777777778e-05, + "loss": 0.9561, + "step": 7330 + }, + { + "epoch": 5.235378031383737, + "grad_norm": 9.5, + "learning_rate": 7.849333333333334e-05, + "loss": 0.9066, + "step": 7340 + }, + { + "epoch": 5.242510699001427, + "grad_norm": 6.09375, + "learning_rate": 7.844888888888889e-05, + "loss": 0.839, + "step": 7350 + }, + { + "epoch": 5.249643366619115, + "grad_norm": 8.0625, + "learning_rate": 7.840444444444445e-05, + "loss": 0.8996, + "step": 7360 + }, + { + "epoch": 5.256776034236805, + "grad_norm": 6.3125, + "learning_rate": 7.836e-05, + "loss": 0.8253, + "step": 7370 + }, + { + "epoch": 5.263908701854493, + "grad_norm": 6.15625, + "learning_rate": 7.831555555555556e-05, + "loss": 0.7275, + "step": 7380 + }, + { + "epoch": 5.271041369472183, + "grad_norm": 6.375, + "learning_rate": 7.827111111111111e-05, + "loss": 0.8548, + "step": 7390 + }, + { + "epoch": 5.2781740370898715, + "grad_norm": 8.0625, + "learning_rate": 7.822666666666667e-05, + "loss": 0.8754, + "step": 7400 + }, + { + "epoch": 5.2781740370898715, + "eval/acc": 39.53488540649414, + "step": 7400 + }, + { + "epoch": 5.2781740370898715, + "eval_loss": 2.599212408065796, + "eval_runtime": 0.2355, + "eval_samples_per_second": 182.56, + "eval_steps_per_second": 4.246, + "step": 7400 + }, + { + "epoch": 5.285306704707561, + "grad_norm": 8.875, + "learning_rate": 7.818222222222222e-05, + "loss": 0.8725, + "step": 7410 + }, + { + "epoch": 5.29243937232525, + "grad_norm": 8.0625, + "learning_rate": 7.813777777777777e-05, + "loss": 0.8689, + "step": 7420 + }, + { + "epoch": 5.299572039942939, + "grad_norm": 7.59375, + "learning_rate": 7.809333333333335e-05, + "loss": 0.7615, + "step": 7430 + }, + { + "epoch": 5.306704707560628, + "grad_norm": 6.3125, + "learning_rate": 7.80488888888889e-05, + "loss": 0.8141, + "step": 7440 + }, + { + "epoch": 5.313837375178316, + "grad_norm": 6.84375, + "learning_rate": 7.800444444444444e-05, + "loss": 0.8328, + "step": 7450 + }, + { + "epoch": 5.320970042796006, + "grad_norm": 7.71875, + "learning_rate": 7.796e-05, + "loss": 0.8158, + "step": 7460 + }, + { + "epoch": 5.328102710413694, + "grad_norm": 7.0625, + "learning_rate": 7.791555555555557e-05, + "loss": 0.7663, + "step": 7470 + }, + { + "epoch": 5.335235378031384, + "grad_norm": 8.1875, + "learning_rate": 7.787111111111112e-05, + "loss": 0.7704, + "step": 7480 + }, + { + "epoch": 5.3423680456490725, + "grad_norm": 8.0, + "learning_rate": 7.782666666666666e-05, + "loss": 0.8511, + "step": 7490 + }, + { + "epoch": 5.349500713266762, + "grad_norm": 5.15625, + "learning_rate": 7.778222222222223e-05, + "loss": 0.783, + "step": 7500 + }, + { + "epoch": 5.349500713266762, + "eval/acc": 39.53488540649414, + "step": 7500 + }, + { + "epoch": 5.349500713266762, + "eval_loss": 2.6000046730041504, + "eval_runtime": 0.2277, + "eval_samples_per_second": 188.876, + "eval_steps_per_second": 4.392, + "step": 7500 + }, + { + "epoch": 5.356633380884451, + "grad_norm": 7.6875, + "learning_rate": 7.773777777777778e-05, + "loss": 0.7674, + "step": 7510 + }, + { + "epoch": 5.36376604850214, + "grad_norm": 6.53125, + "learning_rate": 7.769333333333334e-05, + "loss": 0.8338, + "step": 7520 + }, + { + "epoch": 5.370898716119829, + "grad_norm": 5.8125, + "learning_rate": 7.76488888888889e-05, + "loss": 0.8279, + "step": 7530 + }, + { + "epoch": 5.378031383737518, + "grad_norm": 7.0625, + "learning_rate": 7.760444444444445e-05, + "loss": 0.7954, + "step": 7540 + }, + { + "epoch": 5.385164051355207, + "grad_norm": 8.0, + "learning_rate": 7.756e-05, + "loss": 0.8632, + "step": 7550 + }, + { + "epoch": 5.392296718972895, + "grad_norm": 6.84375, + "learning_rate": 7.751555555555556e-05, + "loss": 0.8191, + "step": 7560 + }, + { + "epoch": 5.399429386590585, + "grad_norm": 7.375, + "learning_rate": 7.747111111111112e-05, + "loss": 0.708, + "step": 7570 + }, + { + "epoch": 5.4065620542082735, + "grad_norm": 7.15625, + "learning_rate": 7.742666666666667e-05, + "loss": 0.6851, + "step": 7580 + }, + { + "epoch": 5.413694721825963, + "grad_norm": 7.25, + "learning_rate": 7.738222222222222e-05, + "loss": 0.8769, + "step": 7590 + }, + { + "epoch": 5.420827389443652, + "grad_norm": 7.6875, + "learning_rate": 7.733777777777779e-05, + "loss": 0.8316, + "step": 7600 + }, + { + "epoch": 5.420827389443652, + "eval/acc": 39.53488540649414, + "step": 7600 + }, + { + "epoch": 5.420827389443652, + "eval_loss": 2.583944797515869, + "eval_runtime": 0.2212, + "eval_samples_per_second": 194.433, + "eval_steps_per_second": 4.522, + "step": 7600 + }, + { + "epoch": 5.427960057061341, + "grad_norm": 7.625, + "learning_rate": 7.729333333333334e-05, + "loss": 0.8444, + "step": 7610 + }, + { + "epoch": 5.43509272467903, + "grad_norm": 6.6875, + "learning_rate": 7.724888888888889e-05, + "loss": 0.8101, + "step": 7620 + }, + { + "epoch": 5.442225392296719, + "grad_norm": 6.375, + "learning_rate": 7.720444444444445e-05, + "loss": 0.8094, + "step": 7630 + }, + { + "epoch": 5.449358059914408, + "grad_norm": 7.09375, + "learning_rate": 7.716e-05, + "loss": 0.9292, + "step": 7640 + }, + { + "epoch": 5.456490727532097, + "grad_norm": 8.0, + "learning_rate": 7.711555555555556e-05, + "loss": 0.8544, + "step": 7650 + }, + { + "epoch": 5.463623395149786, + "grad_norm": 5.625, + "learning_rate": 7.707111111111111e-05, + "loss": 0.787, + "step": 7660 + }, + { + "epoch": 5.470756062767475, + "grad_norm": 8.375, + "learning_rate": 7.702666666666667e-05, + "loss": 0.8763, + "step": 7670 + }, + { + "epoch": 5.477888730385164, + "grad_norm": 12.9375, + "learning_rate": 7.698222222222222e-05, + "loss": 0.8317, + "step": 7680 + }, + { + "epoch": 5.4850213980028535, + "grad_norm": 8.125, + "learning_rate": 7.693777777777778e-05, + "loss": 0.8156, + "step": 7690 + }, + { + "epoch": 5.492154065620542, + "grad_norm": 6.96875, + "learning_rate": 7.689333333333334e-05, + "loss": 0.8998, + "step": 7700 + }, + { + "epoch": 5.492154065620542, + "eval/acc": 39.53488540649414, + "step": 7700 + }, + { + "epoch": 5.492154065620542, + "eval_loss": 2.6069791316986084, + "eval_runtime": 0.2244, + "eval_samples_per_second": 191.632, + "eval_steps_per_second": 4.457, + "step": 7700 + }, + { + "epoch": 5.499286733238231, + "grad_norm": 7.5625, + "learning_rate": 7.68488888888889e-05, + "loss": 0.7881, + "step": 7710 + }, + { + "epoch": 5.50641940085592, + "grad_norm": 6.65625, + "learning_rate": 7.680444444444444e-05, + "loss": 0.8379, + "step": 7720 + }, + { + "epoch": 5.513552068473609, + "grad_norm": 6.34375, + "learning_rate": 7.676e-05, + "loss": 0.844, + "step": 7730 + }, + { + "epoch": 5.520684736091298, + "grad_norm": 8.3125, + "learning_rate": 7.671555555555557e-05, + "loss": 0.8762, + "step": 7740 + }, + { + "epoch": 5.527817403708987, + "grad_norm": 7.09375, + "learning_rate": 7.667111111111111e-05, + "loss": 0.8621, + "step": 7750 + }, + { + "epoch": 5.534950071326676, + "grad_norm": 8.5625, + "learning_rate": 7.662666666666666e-05, + "loss": 1.0092, + "step": 7760 + }, + { + "epoch": 5.542082738944365, + "grad_norm": 6.3125, + "learning_rate": 7.658222222222222e-05, + "loss": 0.8743, + "step": 7770 + }, + { + "epoch": 5.5492154065620545, + "grad_norm": 6.0625, + "learning_rate": 7.653777777777779e-05, + "loss": 0.754, + "step": 7780 + }, + { + "epoch": 5.556348074179743, + "grad_norm": 7.6875, + "learning_rate": 7.649333333333334e-05, + "loss": 0.8504, + "step": 7790 + }, + { + "epoch": 5.563480741797433, + "grad_norm": 8.3125, + "learning_rate": 7.64488888888889e-05, + "loss": 0.7512, + "step": 7800 + }, + { + "epoch": 5.563480741797433, + "eval/acc": 37.20930099487305, + "step": 7800 + }, + { + "epoch": 5.563480741797433, + "eval_loss": 2.610304594039917, + "eval_runtime": 0.2338, + "eval_samples_per_second": 183.899, + "eval_steps_per_second": 4.277, + "step": 7800 + }, + { + "epoch": 5.570613409415121, + "grad_norm": 6.71875, + "learning_rate": 7.640444444444445e-05, + "loss": 0.8204, + "step": 7810 + }, + { + "epoch": 5.57774607703281, + "grad_norm": 6.625, + "learning_rate": 7.636e-05, + "loss": 0.734, + "step": 7820 + }, + { + "epoch": 5.584878744650499, + "grad_norm": 5.65625, + "learning_rate": 7.631555555555556e-05, + "loss": 0.8047, + "step": 7830 + }, + { + "epoch": 5.592011412268189, + "grad_norm": 6.40625, + "learning_rate": 7.627111111111112e-05, + "loss": 0.7179, + "step": 7840 + }, + { + "epoch": 5.599144079885877, + "grad_norm": 6.78125, + "learning_rate": 7.622666666666667e-05, + "loss": 0.849, + "step": 7850 + }, + { + "epoch": 5.606276747503566, + "grad_norm": 8.8125, + "learning_rate": 7.618222222222221e-05, + "loss": 0.8817, + "step": 7860 + }, + { + "epoch": 5.6134094151212555, + "grad_norm": 6.375, + "learning_rate": 7.613777777777779e-05, + "loss": 0.8812, + "step": 7870 + }, + { + "epoch": 5.620542082738944, + "grad_norm": 13.125, + "learning_rate": 7.609333333333334e-05, + "loss": 0.8522, + "step": 7880 + }, + { + "epoch": 5.627674750356634, + "grad_norm": 7.0625, + "learning_rate": 7.604888888888889e-05, + "loss": 0.731, + "step": 7890 + }, + { + "epoch": 5.634807417974322, + "grad_norm": 7.21875, + "learning_rate": 7.600444444444445e-05, + "loss": 0.8841, + "step": 7900 + }, + { + "epoch": 5.634807417974322, + "eval/acc": 39.53488540649414, + "step": 7900 + }, + { + "epoch": 5.634807417974322, + "eval_loss": 2.6105217933654785, + "eval_runtime": 0.2306, + "eval_samples_per_second": 186.447, + "eval_steps_per_second": 4.336, + "step": 7900 + }, + { + "epoch": 5.641940085592012, + "grad_norm": 7.625, + "learning_rate": 7.596000000000001e-05, + "loss": 0.8654, + "step": 7910 + }, + { + "epoch": 5.6490727532097, + "grad_norm": 26.75, + "learning_rate": 7.591555555555556e-05, + "loss": 0.8103, + "step": 7920 + }, + { + "epoch": 5.65620542082739, + "grad_norm": 7.375, + "learning_rate": 7.587111111111112e-05, + "loss": 0.7461, + "step": 7930 + }, + { + "epoch": 5.663338088445078, + "grad_norm": 6.09375, + "learning_rate": 7.582666666666667e-05, + "loss": 0.9693, + "step": 7940 + }, + { + "epoch": 5.670470756062768, + "grad_norm": 7.09375, + "learning_rate": 7.578222222222222e-05, + "loss": 0.8595, + "step": 7950 + }, + { + "epoch": 5.6776034236804565, + "grad_norm": 7.3125, + "learning_rate": 7.573777777777778e-05, + "loss": 0.8541, + "step": 7960 + }, + { + "epoch": 5.684736091298145, + "grad_norm": 7.90625, + "learning_rate": 7.569333333333334e-05, + "loss": 0.8774, + "step": 7970 + }, + { + "epoch": 5.6918687589158345, + "grad_norm": 9.0, + "learning_rate": 7.564888888888889e-05, + "loss": 0.8823, + "step": 7980 + }, + { + "epoch": 5.699001426533523, + "grad_norm": 6.09375, + "learning_rate": 7.560444444444444e-05, + "loss": 0.7302, + "step": 7990 + }, + { + "epoch": 5.706134094151213, + "grad_norm": 7.21875, + "learning_rate": 7.556000000000002e-05, + "loss": 0.8339, + "step": 8000 + }, + { + "epoch": 5.706134094151213, + "eval/acc": 37.20930099487305, + "step": 8000 + }, + { + "epoch": 5.706134094151213, + "eval_loss": 2.576781988143921, + "eval_runtime": 0.2231, + "eval_samples_per_second": 192.779, + "eval_steps_per_second": 4.483, + "step": 8000 + }, + { + "epoch": 5.713266761768901, + "grad_norm": 7.75, + "learning_rate": 7.551555555555556e-05, + "loss": 0.7642, + "step": 8010 + }, + { + "epoch": 5.720399429386591, + "grad_norm": 7.8125, + "learning_rate": 7.547111111111111e-05, + "loss": 0.9188, + "step": 8020 + }, + { + "epoch": 5.727532097004279, + "grad_norm": 7.28125, + "learning_rate": 7.542666666666667e-05, + "loss": 0.8202, + "step": 8030 + }, + { + "epoch": 5.734664764621969, + "grad_norm": 9.0, + "learning_rate": 7.538222222222222e-05, + "loss": 0.8286, + "step": 8040 + }, + { + "epoch": 5.741797432239657, + "grad_norm": 7.25, + "learning_rate": 7.533777777777778e-05, + "loss": 0.7856, + "step": 8050 + }, + { + "epoch": 5.748930099857347, + "grad_norm": 6.90625, + "learning_rate": 7.529333333333333e-05, + "loss": 0.8832, + "step": 8060 + }, + { + "epoch": 5.7560627674750355, + "grad_norm": 6.09375, + "learning_rate": 7.52488888888889e-05, + "loss": 0.7606, + "step": 8070 + }, + { + "epoch": 5.763195435092725, + "grad_norm": 6.625, + "learning_rate": 7.520444444444444e-05, + "loss": 0.8706, + "step": 8080 + }, + { + "epoch": 5.770328102710414, + "grad_norm": 7.25, + "learning_rate": 7.516e-05, + "loss": 0.8542, + "step": 8090 + }, + { + "epoch": 5.777460770328103, + "grad_norm": 6.84375, + "learning_rate": 7.511555555555557e-05, + "loss": 0.7988, + "step": 8100 + }, + { + "epoch": 5.777460770328103, + "eval/acc": 37.20930099487305, + "step": 8100 + }, + { + "epoch": 5.777460770328103, + "eval_loss": 2.598762273788452, + "eval_runtime": 0.2251, + "eval_samples_per_second": 191.04, + "eval_steps_per_second": 4.443, + "step": 8100 + }, + { + "epoch": 5.784593437945792, + "grad_norm": 8.875, + "learning_rate": 7.507111111111112e-05, + "loss": 0.8825, + "step": 8110 + }, + { + "epoch": 5.79172610556348, + "grad_norm": 7.375, + "learning_rate": 7.502666666666666e-05, + "loss": 0.8316, + "step": 8120 + }, + { + "epoch": 5.79885877318117, + "grad_norm": 8.125, + "learning_rate": 7.498222222222223e-05, + "loss": 0.8567, + "step": 8130 + }, + { + "epoch": 5.805991440798858, + "grad_norm": 6.3125, + "learning_rate": 7.493777777777779e-05, + "loss": 0.8415, + "step": 8140 + }, + { + "epoch": 5.813124108416548, + "grad_norm": 8.5, + "learning_rate": 7.489333333333334e-05, + "loss": 0.8369, + "step": 8150 + }, + { + "epoch": 5.8202567760342365, + "grad_norm": 13.25, + "learning_rate": 7.484888888888889e-05, + "loss": 0.8692, + "step": 8160 + }, + { + "epoch": 5.827389443651926, + "grad_norm": 7.71875, + "learning_rate": 7.480444444444445e-05, + "loss": 0.8535, + "step": 8170 + }, + { + "epoch": 5.834522111269615, + "grad_norm": 7.6875, + "learning_rate": 7.476000000000001e-05, + "loss": 0.8701, + "step": 8180 + }, + { + "epoch": 5.841654778887304, + "grad_norm": 5.46875, + "learning_rate": 7.471555555555556e-05, + "loss": 0.7843, + "step": 8190 + }, + { + "epoch": 5.848787446504993, + "grad_norm": 7.46875, + "learning_rate": 7.467111111111112e-05, + "loss": 0.7914, + "step": 8200 + }, + { + "epoch": 5.848787446504993, + "eval/acc": 37.20930099487305, + "step": 8200 + }, + { + "epoch": 5.848787446504993, + "eval_loss": 2.566337823867798, + "eval_runtime": 0.3566, + "eval_samples_per_second": 120.59, + "eval_steps_per_second": 2.804, + "step": 8200 + }, + { + "epoch": 5.855920114122682, + "grad_norm": 7.03125, + "learning_rate": 7.462666666666667e-05, + "loss": 0.849, + "step": 8210 + }, + { + "epoch": 5.863052781740371, + "grad_norm": 7.5625, + "learning_rate": 7.458222222222223e-05, + "loss": 0.8066, + "step": 8220 + }, + { + "epoch": 5.870185449358059, + "grad_norm": 6.875, + "learning_rate": 7.453777777777778e-05, + "loss": 0.8556, + "step": 8230 + }, + { + "epoch": 5.877318116975749, + "grad_norm": 8.0, + "learning_rate": 7.449333333333334e-05, + "loss": 0.9098, + "step": 8240 + }, + { + "epoch": 5.884450784593438, + "grad_norm": 8.375, + "learning_rate": 7.444888888888889e-05, + "loss": 0.8183, + "step": 8250 + }, + { + "epoch": 5.891583452211127, + "grad_norm": 13.9375, + "learning_rate": 7.440444444444444e-05, + "loss": 0.8316, + "step": 8260 + }, + { + "epoch": 5.898716119828816, + "grad_norm": 7.25, + "learning_rate": 7.436000000000001e-05, + "loss": 0.8563, + "step": 8270 + }, + { + "epoch": 5.905848787446505, + "grad_norm": 10.75, + "learning_rate": 7.431555555555556e-05, + "loss": 0.8473, + "step": 8280 + }, + { + "epoch": 5.912981455064194, + "grad_norm": 14.1875, + "learning_rate": 7.427111111111111e-05, + "loss": 0.774, + "step": 8290 + }, + { + "epoch": 5.920114122681883, + "grad_norm": 6.8125, + "learning_rate": 7.422666666666667e-05, + "loss": 0.8783, + "step": 8300 + }, + { + "epoch": 5.920114122681883, + "eval/acc": 34.88372039794922, + "step": 8300 + }, + { + "epoch": 5.920114122681883, + "eval_loss": 2.6135735511779785, + "eval_runtime": 0.2367, + "eval_samples_per_second": 181.665, + "eval_steps_per_second": 4.225, + "step": 8300 + }, + { + "epoch": 5.927246790299572, + "grad_norm": 7.5625, + "learning_rate": 7.418222222222223e-05, + "loss": 0.9057, + "step": 8310 + }, + { + "epoch": 5.934379457917261, + "grad_norm": 7.875, + "learning_rate": 7.413777777777778e-05, + "loss": 0.8854, + "step": 8320 + }, + { + "epoch": 5.94151212553495, + "grad_norm": 8.1875, + "learning_rate": 7.409333333333333e-05, + "loss": 0.8049, + "step": 8330 + }, + { + "epoch": 5.948644793152639, + "grad_norm": 6.90625, + "learning_rate": 7.404888888888889e-05, + "loss": 0.7738, + "step": 8340 + }, + { + "epoch": 5.955777460770328, + "grad_norm": 7.90625, + "learning_rate": 7.400444444444444e-05, + "loss": 0.8268, + "step": 8350 + }, + { + "epoch": 5.9629101283880175, + "grad_norm": 8.3125, + "learning_rate": 7.396e-05, + "loss": 0.8336, + "step": 8360 + }, + { + "epoch": 5.970042796005706, + "grad_norm": 7.375, + "learning_rate": 7.391555555555557e-05, + "loss": 0.8282, + "step": 8370 + }, + { + "epoch": 5.977175463623395, + "grad_norm": 6.8125, + "learning_rate": 7.387111111111111e-05, + "loss": 0.8234, + "step": 8380 + }, + { + "epoch": 5.984308131241084, + "grad_norm": 7.15625, + "learning_rate": 7.382666666666666e-05, + "loss": 0.8771, + "step": 8390 + }, + { + "epoch": 5.991440798858774, + "grad_norm": 8.5, + "learning_rate": 7.378222222222222e-05, + "loss": 0.8572, + "step": 8400 + }, + { + "epoch": 5.991440798858774, + "eval/acc": 34.88372039794922, + "step": 8400 + }, + { + "epoch": 5.991440798858774, + "eval_loss": 2.5367989540100098, + "eval_runtime": 0.224, + "eval_samples_per_second": 191.97, + "eval_steps_per_second": 4.464, + "step": 8400 + }, + { + "epoch": 5.998573466476462, + "grad_norm": 7.0, + "learning_rate": 7.373777777777779e-05, + "loss": 0.7468, + "step": 8410 + }, + { + "epoch": 6.005706134094151, + "grad_norm": 7.78125, + "learning_rate": 7.369333333333333e-05, + "loss": 0.7882, + "step": 8420 + }, + { + "epoch": 6.01283880171184, + "grad_norm": 9.1875, + "learning_rate": 7.364888888888888e-05, + "loss": 0.9419, + "step": 8430 + }, + { + "epoch": 6.019971469329529, + "grad_norm": 17.625, + "learning_rate": 7.360444444444445e-05, + "loss": 0.7904, + "step": 8440 + }, + { + "epoch": 6.0271041369472185, + "grad_norm": 8.0625, + "learning_rate": 7.356000000000001e-05, + "loss": 0.8125, + "step": 8450 + }, + { + "epoch": 6.034236804564907, + "grad_norm": 7.4375, + "learning_rate": 7.351555555555556e-05, + "loss": 0.8002, + "step": 8460 + }, + { + "epoch": 6.041369472182597, + "grad_norm": 5.6875, + "learning_rate": 7.347111111111112e-05, + "loss": 0.7719, + "step": 8470 + }, + { + "epoch": 6.048502139800285, + "grad_norm": 8.9375, + "learning_rate": 7.342666666666667e-05, + "loss": 0.8122, + "step": 8480 + }, + { + "epoch": 6.055634807417975, + "grad_norm": 9.875, + "learning_rate": 7.338222222222223e-05, + "loss": 0.8052, + "step": 8490 + }, + { + "epoch": 6.062767475035663, + "grad_norm": 9.125, + "learning_rate": 7.333777777777778e-05, + "loss": 0.8171, + "step": 8500 + }, + { + "epoch": 6.062767475035663, + "eval/acc": 46.511627197265625, + "step": 8500 + }, + { + "epoch": 6.062767475035663, + "eval_loss": 2.4180805683135986, + "eval_runtime": 1.182, + "eval_samples_per_second": 36.38, + "eval_steps_per_second": 0.846, + "step": 8500 + }, + { + "epoch": 6.069900142653353, + "grad_norm": 6.84375, + "learning_rate": 7.329333333333334e-05, + "loss": 0.9028, + "step": 8510 + }, + { + "epoch": 6.077032810271041, + "grad_norm": 23.625, + "learning_rate": 7.324888888888889e-05, + "loss": 0.8576, + "step": 8520 + }, + { + "epoch": 6.08416547788873, + "grad_norm": 6.96875, + "learning_rate": 7.320444444444445e-05, + "loss": 0.8407, + "step": 8530 + }, + { + "epoch": 6.0912981455064195, + "grad_norm": 8.6875, + "learning_rate": 7.316000000000001e-05, + "loss": 0.8419, + "step": 8540 + }, + { + "epoch": 6.098430813124108, + "grad_norm": 6.90625, + "learning_rate": 7.311555555555556e-05, + "loss": 0.7802, + "step": 8550 + }, + { + "epoch": 6.1055634807417976, + "grad_norm": 6.34375, + "learning_rate": 7.307111111111111e-05, + "loss": 0.7716, + "step": 8560 + }, + { + "epoch": 6.112696148359486, + "grad_norm": 13.5, + "learning_rate": 7.302666666666667e-05, + "loss": 0.8538, + "step": 8570 + }, + { + "epoch": 6.119828815977176, + "grad_norm": 6.59375, + "learning_rate": 7.298222222222223e-05, + "loss": 0.6951, + "step": 8580 + }, + { + "epoch": 6.126961483594864, + "grad_norm": 7.0625, + "learning_rate": 7.293777777777778e-05, + "loss": 0.794, + "step": 8590 + }, + { + "epoch": 6.134094151212554, + "grad_norm": 7.15625, + "learning_rate": 7.289333333333334e-05, + "loss": 0.8058, + "step": 8600 + }, + { + "epoch": 6.134094151212554, + "eval/acc": 46.511627197265625, + "step": 8600 + }, + { + "epoch": 6.134094151212554, + "eval_loss": 2.5194764137268066, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.736, + "eval_steps_per_second": 4.529, + "step": 8600 + }, + { + "epoch": 6.141226818830242, + "grad_norm": 8.0625, + "learning_rate": 7.284888888888889e-05, + "loss": 0.8754, + "step": 8610 + }, + { + "epoch": 6.148359486447932, + "grad_norm": 4.875, + "learning_rate": 7.280444444444445e-05, + "loss": 0.7852, + "step": 8620 + }, + { + "epoch": 6.1554921540656204, + "grad_norm": 8.0, + "learning_rate": 7.276e-05, + "loss": 0.8064, + "step": 8630 + }, + { + "epoch": 6.16262482168331, + "grad_norm": 6.3125, + "learning_rate": 7.271555555555556e-05, + "loss": 0.7643, + "step": 8640 + }, + { + "epoch": 6.1697574893009985, + "grad_norm": 8.875, + "learning_rate": 7.267111111111111e-05, + "loss": 0.7702, + "step": 8650 + }, + { + "epoch": 6.176890156918688, + "grad_norm": 18.5, + "learning_rate": 7.262666666666666e-05, + "loss": 0.903, + "step": 8660 + }, + { + "epoch": 6.184022824536377, + "grad_norm": 9.875, + "learning_rate": 7.258222222222224e-05, + "loss": 0.788, + "step": 8670 + }, + { + "epoch": 6.191155492154065, + "grad_norm": 7.71875, + "learning_rate": 7.253777777777778e-05, + "loss": 0.7504, + "step": 8680 + }, + { + "epoch": 6.198288159771755, + "grad_norm": 7.5, + "learning_rate": 7.249333333333333e-05, + "loss": 0.8821, + "step": 8690 + }, + { + "epoch": 6.205420827389443, + "grad_norm": 6.71875, + "learning_rate": 7.24488888888889e-05, + "loss": 0.9166, + "step": 8700 + }, + { + "epoch": 6.205420827389443, + "eval/acc": 48.83720779418945, + "step": 8700 + }, + { + "epoch": 6.205420827389443, + "eval_loss": 2.488805055618286, + "eval_runtime": 0.2195, + "eval_samples_per_second": 195.91, + "eval_steps_per_second": 4.556, + "step": 8700 + }, + { + "epoch": 6.212553495007133, + "grad_norm": 8.3125, + "learning_rate": 7.240444444444446e-05, + "loss": 0.7724, + "step": 8710 + }, + { + "epoch": 6.219686162624821, + "grad_norm": 7.84375, + "learning_rate": 7.236e-05, + "loss": 0.8881, + "step": 8720 + }, + { + "epoch": 6.226818830242511, + "grad_norm": 7.21875, + "learning_rate": 7.231555555555555e-05, + "loss": 0.8538, + "step": 8730 + }, + { + "epoch": 6.2339514978601995, + "grad_norm": 7.5, + "learning_rate": 7.227111111111112e-05, + "loss": 0.8909, + "step": 8740 + }, + { + "epoch": 6.241084165477889, + "grad_norm": 7.25, + "learning_rate": 7.222666666666666e-05, + "loss": 0.7965, + "step": 8750 + }, + { + "epoch": 6.248216833095578, + "grad_norm": 7.46875, + "learning_rate": 7.218222222222223e-05, + "loss": 0.8547, + "step": 8760 + }, + { + "epoch": 6.255349500713267, + "grad_norm": 6.1875, + "learning_rate": 7.213777777777779e-05, + "loss": 0.7528, + "step": 8770 + }, + { + "epoch": 6.262482168330956, + "grad_norm": 7.03125, + "learning_rate": 7.209333333333334e-05, + "loss": 0.8632, + "step": 8780 + }, + { + "epoch": 6.269614835948644, + "grad_norm": 8.375, + "learning_rate": 7.204888888888888e-05, + "loss": 0.7832, + "step": 8790 + }, + { + "epoch": 6.276747503566334, + "grad_norm": 8.125, + "learning_rate": 7.200444444444445e-05, + "loss": 0.7659, + "step": 8800 + }, + { + "epoch": 6.276747503566334, + "eval/acc": 48.83720779418945, + "step": 8800 + }, + { + "epoch": 6.276747503566334, + "eval_loss": 2.4990618228912354, + "eval_runtime": 0.2586, + "eval_samples_per_second": 166.3, + "eval_steps_per_second": 3.867, + "step": 8800 + }, + { + "epoch": 6.283880171184022, + "grad_norm": 7.375, + "learning_rate": 7.196000000000001e-05, + "loss": 0.7402, + "step": 8810 + }, + { + "epoch": 6.291012838801712, + "grad_norm": 7.0, + "learning_rate": 7.191555555555556e-05, + "loss": 0.8381, + "step": 8820 + }, + { + "epoch": 6.2981455064194005, + "grad_norm": 15.75, + "learning_rate": 7.18711111111111e-05, + "loss": 0.8837, + "step": 8830 + }, + { + "epoch": 6.30527817403709, + "grad_norm": 5.46875, + "learning_rate": 7.182666666666668e-05, + "loss": 0.8638, + "step": 8840 + }, + { + "epoch": 6.312410841654779, + "grad_norm": 5.46875, + "learning_rate": 7.178222222222223e-05, + "loss": 0.8348, + "step": 8850 + }, + { + "epoch": 6.319543509272468, + "grad_norm": 7.9375, + "learning_rate": 7.173777777777778e-05, + "loss": 0.8598, + "step": 8860 + }, + { + "epoch": 6.326676176890157, + "grad_norm": 7.15625, + "learning_rate": 7.169333333333334e-05, + "loss": 0.8124, + "step": 8870 + }, + { + "epoch": 6.333808844507846, + "grad_norm": 6.28125, + "learning_rate": 7.164888888888889e-05, + "loss": 0.8184, + "step": 8880 + }, + { + "epoch": 6.340941512125535, + "grad_norm": 7.25, + "learning_rate": 7.160444444444445e-05, + "loss": 0.8522, + "step": 8890 + }, + { + "epoch": 6.348074179743224, + "grad_norm": 8.4375, + "learning_rate": 7.156e-05, + "loss": 0.894, + "step": 8900 + }, + { + "epoch": 6.348074179743224, + "eval/acc": 46.511627197265625, + "step": 8900 + }, + { + "epoch": 6.348074179743224, + "eval_loss": 2.4536728858947754, + "eval_runtime": 0.2168, + "eval_samples_per_second": 198.347, + "eval_steps_per_second": 4.613, + "step": 8900 + }, + { + "epoch": 6.355206847360913, + "grad_norm": 8.3125, + "learning_rate": 7.151555555555556e-05, + "loss": 0.8331, + "step": 8910 + }, + { + "epoch": 6.362339514978602, + "grad_norm": 13.1875, + "learning_rate": 7.147111111111111e-05, + "loss": 0.8107, + "step": 8920 + }, + { + "epoch": 6.369472182596291, + "grad_norm": 7.0, + "learning_rate": 7.142666666666667e-05, + "loss": 0.9504, + "step": 8930 + }, + { + "epoch": 6.37660485021398, + "grad_norm": 9.5625, + "learning_rate": 7.138222222222223e-05, + "loss": 0.766, + "step": 8940 + }, + { + "epoch": 6.383737517831669, + "grad_norm": 13.4375, + "learning_rate": 7.133777777777778e-05, + "loss": 0.7923, + "step": 8950 + }, + { + "epoch": 6.390870185449358, + "grad_norm": 6.6875, + "learning_rate": 7.129333333333333e-05, + "loss": 0.7777, + "step": 8960 + }, + { + "epoch": 6.398002853067047, + "grad_norm": 6.09375, + "learning_rate": 7.124888888888889e-05, + "loss": 0.7729, + "step": 8970 + }, + { + "epoch": 6.405135520684736, + "grad_norm": 6.46875, + "learning_rate": 7.120444444444445e-05, + "loss": 0.8118, + "step": 8980 + }, + { + "epoch": 6.412268188302425, + "grad_norm": 6.21875, + "learning_rate": 7.116e-05, + "loss": 0.9006, + "step": 8990 + }, + { + "epoch": 6.419400855920114, + "grad_norm": 6.5625, + "learning_rate": 7.111555555555555e-05, + "loss": 0.7092, + "step": 9000 + }, + { + "epoch": 6.419400855920114, + "eval/acc": 44.1860466003418, + "step": 9000 + }, + { + "epoch": 6.419400855920114, + "eval_loss": 2.533996343612671, + "eval_runtime": 0.3418, + "eval_samples_per_second": 125.802, + "eval_steps_per_second": 2.926, + "step": 9000 + }, + { + "epoch": 6.426533523537803, + "grad_norm": 7.59375, + "learning_rate": 7.107111111111111e-05, + "loss": 0.7684, + "step": 9010 + }, + { + "epoch": 6.433666191155492, + "grad_norm": 6.8125, + "learning_rate": 7.102666666666668e-05, + "loss": 0.7654, + "step": 9020 + }, + { + "epoch": 6.4407988587731815, + "grad_norm": 7.5625, + "learning_rate": 7.098222222222222e-05, + "loss": 0.8404, + "step": 9030 + }, + { + "epoch": 6.44793152639087, + "grad_norm": 8.5, + "learning_rate": 7.093777777777779e-05, + "loss": 0.8519, + "step": 9040 + }, + { + "epoch": 6.45506419400856, + "grad_norm": 6.53125, + "learning_rate": 7.089333333333333e-05, + "loss": 0.8487, + "step": 9050 + }, + { + "epoch": 6.462196861626248, + "grad_norm": 7.59375, + "learning_rate": 7.084888888888888e-05, + "loss": 0.8695, + "step": 9060 + }, + { + "epoch": 6.469329529243938, + "grad_norm": 8.4375, + "learning_rate": 7.080444444444444e-05, + "loss": 0.7864, + "step": 9070 + }, + { + "epoch": 6.476462196861626, + "grad_norm": 66.5, + "learning_rate": 7.076000000000001e-05, + "loss": 0.7726, + "step": 9080 + }, + { + "epoch": 6.483594864479315, + "grad_norm": 6.96875, + "learning_rate": 7.071555555555556e-05, + "loss": 0.7832, + "step": 9090 + }, + { + "epoch": 6.490727532097004, + "grad_norm": 7.40625, + "learning_rate": 7.06711111111111e-05, + "loss": 0.8063, + "step": 9100 + }, + { + "epoch": 6.490727532097004, + "eval/acc": 44.1860466003418, + "step": 9100 + }, + { + "epoch": 6.490727532097004, + "eval_loss": 2.5438809394836426, + "eval_runtime": 0.2154, + "eval_samples_per_second": 199.594, + "eval_steps_per_second": 4.642, + "step": 9100 + }, + { + "epoch": 6.497860199714693, + "grad_norm": 7.21875, + "learning_rate": 7.062666666666668e-05, + "loss": 0.7605, + "step": 9110 + }, + { + "epoch": 6.5049928673323825, + "grad_norm": 7.90625, + "learning_rate": 7.058222222222223e-05, + "loss": 0.8032, + "step": 9120 + }, + { + "epoch": 6.512125534950071, + "grad_norm": 6.9375, + "learning_rate": 7.053777777777778e-05, + "loss": 0.743, + "step": 9130 + }, + { + "epoch": 6.519258202567761, + "grad_norm": 5.65625, + "learning_rate": 7.049333333333334e-05, + "loss": 0.8261, + "step": 9140 + }, + { + "epoch": 6.526390870185449, + "grad_norm": 7.03125, + "learning_rate": 7.04488888888889e-05, + "loss": 0.8099, + "step": 9150 + }, + { + "epoch": 6.533523537803139, + "grad_norm": 7.15625, + "learning_rate": 7.040444444444445e-05, + "loss": 0.817, + "step": 9160 + }, + { + "epoch": 6.540656205420827, + "grad_norm": 11.625, + "learning_rate": 7.036e-05, + "loss": 0.782, + "step": 9170 + }, + { + "epoch": 6.547788873038517, + "grad_norm": 7.5625, + "learning_rate": 7.031555555555556e-05, + "loss": 0.8145, + "step": 9180 + }, + { + "epoch": 6.554921540656205, + "grad_norm": 7.5625, + "learning_rate": 7.027111111111111e-05, + "loss": 0.8822, + "step": 9190 + }, + { + "epoch": 6.562054208273894, + "grad_norm": 6.53125, + "learning_rate": 7.022666666666667e-05, + "loss": 0.8132, + "step": 9200 + }, + { + "epoch": 6.562054208273894, + "eval/acc": 44.1860466003418, + "step": 9200 + }, + { + "epoch": 6.562054208273894, + "eval_loss": 2.528564929962158, + "eval_runtime": 0.2169, + "eval_samples_per_second": 198.28, + "eval_steps_per_second": 4.611, + "step": 9200 + }, + { + "epoch": 6.5691868758915835, + "grad_norm": 7.21875, + "learning_rate": 7.018222222222223e-05, + "loss": 0.7858, + "step": 9210 + }, + { + "epoch": 6.576319543509273, + "grad_norm": 6.4375, + "learning_rate": 7.013777777777778e-05, + "loss": 0.7098, + "step": 9220 + }, + { + "epoch": 6.5834522111269616, + "grad_norm": 7.125, + "learning_rate": 7.009333333333333e-05, + "loss": 0.8362, + "step": 9230 + }, + { + "epoch": 6.59058487874465, + "grad_norm": 5.78125, + "learning_rate": 7.004888888888889e-05, + "loss": 0.7737, + "step": 9240 + }, + { + "epoch": 6.59771754636234, + "grad_norm": 9.0625, + "learning_rate": 7.000444444444445e-05, + "loss": 0.857, + "step": 9250 + }, + { + "epoch": 6.604850213980028, + "grad_norm": 9.125, + "learning_rate": 6.996e-05, + "loss": 0.7562, + "step": 9260 + }, + { + "epoch": 6.611982881597718, + "grad_norm": 8.3125, + "learning_rate": 6.991555555555556e-05, + "loss": 0.8619, + "step": 9270 + }, + { + "epoch": 6.619115549215406, + "grad_norm": 6.78125, + "learning_rate": 6.987111111111111e-05, + "loss": 0.7212, + "step": 9280 + }, + { + "epoch": 6.626248216833096, + "grad_norm": 26.125, + "learning_rate": 6.982666666666667e-05, + "loss": 0.951, + "step": 9290 + }, + { + "epoch": 6.633380884450784, + "grad_norm": 7.03125, + "learning_rate": 6.978222222222222e-05, + "loss": 0.7791, + "step": 9300 + }, + { + "epoch": 6.633380884450784, + "eval/acc": 44.1860466003418, + "step": 9300 + }, + { + "epoch": 6.633380884450784, + "eval_loss": 2.587022304534912, + "eval_runtime": 0.2175, + "eval_samples_per_second": 197.663, + "eval_steps_per_second": 4.597, + "step": 9300 + }, + { + "epoch": 6.640513552068474, + "grad_norm": 6.6875, + "learning_rate": 6.973777777777778e-05, + "loss": 0.8082, + "step": 9310 + }, + { + "epoch": 6.6476462196861625, + "grad_norm": 7.625, + "learning_rate": 6.969333333333333e-05, + "loss": 0.6863, + "step": 9320 + }, + { + "epoch": 6.654778887303852, + "grad_norm": 8.625, + "learning_rate": 6.96488888888889e-05, + "loss": 0.7921, + "step": 9330 + }, + { + "epoch": 6.661911554921541, + "grad_norm": 6.5, + "learning_rate": 6.960444444444446e-05, + "loss": 0.7762, + "step": 9340 + }, + { + "epoch": 6.669044222539229, + "grad_norm": 12.6875, + "learning_rate": 6.956e-05, + "loss": 0.7977, + "step": 9350 + }, + { + "epoch": 6.676176890156919, + "grad_norm": 6.84375, + "learning_rate": 6.951555555555555e-05, + "loss": 0.907, + "step": 9360 + }, + { + "epoch": 6.683309557774607, + "grad_norm": 7.15625, + "learning_rate": 6.947111111111112e-05, + "loss": 0.792, + "step": 9370 + }, + { + "epoch": 6.690442225392297, + "grad_norm": 8.5, + "learning_rate": 6.942666666666668e-05, + "loss": 0.7838, + "step": 9380 + }, + { + "epoch": 6.697574893009985, + "grad_norm": 8.1875, + "learning_rate": 6.938222222222223e-05, + "loss": 0.8141, + "step": 9390 + }, + { + "epoch": 6.704707560627675, + "grad_norm": 7.875, + "learning_rate": 6.933777777777777e-05, + "loss": 0.8348, + "step": 9400 + }, + { + "epoch": 6.704707560627675, + "eval/acc": 39.53488540649414, + "step": 9400 + }, + { + "epoch": 6.704707560627675, + "eval_loss": 2.6398463249206543, + "eval_runtime": 0.2181, + "eval_samples_per_second": 197.145, + "eval_steps_per_second": 4.585, + "step": 9400 + }, + { + "epoch": 6.7118402282453635, + "grad_norm": 6.625, + "learning_rate": 6.929333333333334e-05, + "loss": 0.889, + "step": 9410 + }, + { + "epoch": 6.718972895863053, + "grad_norm": 7.3125, + "learning_rate": 6.92488888888889e-05, + "loss": 0.7913, + "step": 9420 + }, + { + "epoch": 6.726105563480742, + "grad_norm": 10.875, + "learning_rate": 6.920444444444445e-05, + "loss": 0.8099, + "step": 9430 + }, + { + "epoch": 6.733238231098431, + "grad_norm": 23.75, + "learning_rate": 6.916000000000001e-05, + "loss": 0.7098, + "step": 9440 + }, + { + "epoch": 6.74037089871612, + "grad_norm": 6.625, + "learning_rate": 6.911555555555556e-05, + "loss": 0.7859, + "step": 9450 + }, + { + "epoch": 6.747503566333809, + "grad_norm": 5.875, + "learning_rate": 6.907111111111112e-05, + "loss": 0.7947, + "step": 9460 + }, + { + "epoch": 6.754636233951498, + "grad_norm": 7.25, + "learning_rate": 6.902666666666667e-05, + "loss": 0.927, + "step": 9470 + }, + { + "epoch": 6.761768901569187, + "grad_norm": 12.875, + "learning_rate": 6.898222222222223e-05, + "loss": 0.8474, + "step": 9480 + }, + { + "epoch": 6.768901569186876, + "grad_norm": 6.8125, + "learning_rate": 6.893777777777778e-05, + "loss": 0.848, + "step": 9490 + }, + { + "epoch": 6.7760342368045645, + "grad_norm": 7.96875, + "learning_rate": 6.889333333333333e-05, + "loss": 0.8081, + "step": 9500 + }, + { + "epoch": 6.7760342368045645, + "eval/acc": 41.86046600341797, + "step": 9500 + }, + { + "epoch": 6.7760342368045645, + "eval_loss": 2.6681759357452393, + "eval_runtime": 0.2247, + "eval_samples_per_second": 191.372, + "eval_steps_per_second": 4.451, + "step": 9500 + }, + { + "epoch": 6.783166904422254, + "grad_norm": 9.0625, + "learning_rate": 6.88488888888889e-05, + "loss": 0.8836, + "step": 9510 + }, + { + "epoch": 6.790299572039943, + "grad_norm": 15.875, + "learning_rate": 6.880444444444445e-05, + "loss": 0.8696, + "step": 9520 + }, + { + "epoch": 6.797432239657632, + "grad_norm": 10.75, + "learning_rate": 6.876e-05, + "loss": 0.844, + "step": 9530 + }, + { + "epoch": 6.804564907275321, + "grad_norm": 23.875, + "learning_rate": 6.871555555555556e-05, + "loss": 0.823, + "step": 9540 + }, + { + "epoch": 6.81169757489301, + "grad_norm": 7.75, + "learning_rate": 6.867111111111112e-05, + "loss": 0.8875, + "step": 9550 + }, + { + "epoch": 6.818830242510699, + "grad_norm": 6.46875, + "learning_rate": 6.862666666666667e-05, + "loss": 0.7703, + "step": 9560 + }, + { + "epoch": 6.825962910128388, + "grad_norm": 6.375, + "learning_rate": 6.858222222222222e-05, + "loss": 0.8, + "step": 9570 + }, + { + "epoch": 6.833095577746077, + "grad_norm": 7.96875, + "learning_rate": 6.853777777777778e-05, + "loss": 0.8139, + "step": 9580 + }, + { + "epoch": 6.840228245363766, + "grad_norm": 11.625, + "learning_rate": 6.849333333333333e-05, + "loss": 0.8042, + "step": 9590 + }, + { + "epoch": 6.847360912981455, + "grad_norm": 10.8125, + "learning_rate": 6.844888888888889e-05, + "loss": 0.8403, + "step": 9600 + }, + { + "epoch": 6.847360912981455, + "eval/acc": 44.1860466003418, + "step": 9600 + }, + { + "epoch": 6.847360912981455, + "eval_loss": 2.6575427055358887, + "eval_runtime": 0.2186, + "eval_samples_per_second": 196.745, + "eval_steps_per_second": 4.575, + "step": 9600 + }, + { + "epoch": 6.854493580599144, + "grad_norm": 14.6875, + "learning_rate": 6.840444444444445e-05, + "loss": 0.8426, + "step": 9610 + }, + { + "epoch": 6.861626248216833, + "grad_norm": 7.84375, + "learning_rate": 6.836e-05, + "loss": 0.8874, + "step": 9620 + }, + { + "epoch": 6.868758915834523, + "grad_norm": 8.9375, + "learning_rate": 6.831555555555555e-05, + "loss": 0.78, + "step": 9630 + }, + { + "epoch": 6.875891583452211, + "grad_norm": 6.1875, + "learning_rate": 6.827111111111111e-05, + "loss": 0.7788, + "step": 9640 + }, + { + "epoch": 6.8830242510699, + "grad_norm": 6.34375, + "learning_rate": 6.822666666666668e-05, + "loss": 0.7385, + "step": 9650 + }, + { + "epoch": 6.890156918687589, + "grad_norm": 7.59375, + "learning_rate": 6.818222222222222e-05, + "loss": 0.8938, + "step": 9660 + }, + { + "epoch": 6.897289586305278, + "grad_norm": 10.8125, + "learning_rate": 6.813777777777777e-05, + "loss": 0.8154, + "step": 9670 + }, + { + "epoch": 6.904422253922967, + "grad_norm": 6.90625, + "learning_rate": 6.809333333333333e-05, + "loss": 0.9273, + "step": 9680 + }, + { + "epoch": 6.911554921540656, + "grad_norm": 8.3125, + "learning_rate": 6.80488888888889e-05, + "loss": 0.8595, + "step": 9690 + }, + { + "epoch": 6.9186875891583455, + "grad_norm": 10.75, + "learning_rate": 6.800444444444444e-05, + "loss": 0.8569, + "step": 9700 + }, + { + "epoch": 6.9186875891583455, + "eval/acc": 39.53488540649414, + "step": 9700 + }, + { + "epoch": 6.9186875891583455, + "eval_loss": 2.6524884700775146, + "eval_runtime": 0.215, + "eval_samples_per_second": 200.025, + "eval_steps_per_second": 4.652, + "step": 9700 + }, + { + "epoch": 6.925820256776034, + "grad_norm": 7.90625, + "learning_rate": 6.796e-05, + "loss": 0.7726, + "step": 9710 + }, + { + "epoch": 6.932952924393724, + "grad_norm": 7.71875, + "learning_rate": 6.791555555555556e-05, + "loss": 0.789, + "step": 9720 + }, + { + "epoch": 6.940085592011412, + "grad_norm": 7.4375, + "learning_rate": 6.787111111111112e-05, + "loss": 0.7525, + "step": 9730 + }, + { + "epoch": 6.947218259629102, + "grad_norm": 6.96875, + "learning_rate": 6.782666666666667e-05, + "loss": 0.8183, + "step": 9740 + }, + { + "epoch": 6.95435092724679, + "grad_norm": 6.5625, + "learning_rate": 6.778222222222223e-05, + "loss": 0.8713, + "step": 9750 + }, + { + "epoch": 6.961483594864479, + "grad_norm": 6.59375, + "learning_rate": 6.773777777777778e-05, + "loss": 0.8089, + "step": 9760 + }, + { + "epoch": 6.968616262482168, + "grad_norm": 7.46875, + "learning_rate": 6.769333333333334e-05, + "loss": 0.8173, + "step": 9770 + }, + { + "epoch": 6.975748930099857, + "grad_norm": 8.75, + "learning_rate": 6.76488888888889e-05, + "loss": 0.8359, + "step": 9780 + }, + { + "epoch": 6.9828815977175465, + "grad_norm": 6.96875, + "learning_rate": 6.760444444444445e-05, + "loss": 0.7308, + "step": 9790 + }, + { + "epoch": 6.990014265335235, + "grad_norm": 8.6875, + "learning_rate": 6.756e-05, + "loss": 0.7651, + "step": 9800 + }, + { + "epoch": 6.990014265335235, + "eval/acc": 44.1860466003418, + "step": 9800 + }, + { + "epoch": 6.990014265335235, + "eval_loss": 2.581909418106079, + "eval_runtime": 0.217, + "eval_samples_per_second": 198.162, + "eval_steps_per_second": 4.608, + "step": 9800 + }, + { + "epoch": 6.997146932952925, + "grad_norm": 7.6875, + "learning_rate": 6.751555555555556e-05, + "loss": 0.8653, + "step": 9810 + }, + { + "epoch": 7.004279600570613, + "grad_norm": 8.5, + "learning_rate": 6.747111111111112e-05, + "loss": 0.8445, + "step": 9820 + }, + { + "epoch": 7.011412268188303, + "grad_norm": 6.375, + "learning_rate": 6.742666666666667e-05, + "loss": 0.7759, + "step": 9830 + }, + { + "epoch": 7.018544935805991, + "grad_norm": 6.375, + "learning_rate": 6.738222222222222e-05, + "loss": 0.7709, + "step": 9840 + }, + { + "epoch": 7.025677603423681, + "grad_norm": 7.8125, + "learning_rate": 6.733777777777778e-05, + "loss": 0.768, + "step": 9850 + }, + { + "epoch": 7.032810271041369, + "grad_norm": 8.4375, + "learning_rate": 6.729333333333334e-05, + "loss": 0.8725, + "step": 9860 + }, + { + "epoch": 7.039942938659059, + "grad_norm": 7.8125, + "learning_rate": 6.724888888888889e-05, + "loss": 0.8146, + "step": 9870 + }, + { + "epoch": 7.0470756062767475, + "grad_norm": 70.0, + "learning_rate": 6.720444444444445e-05, + "loss": 0.8137, + "step": 9880 + }, + { + "epoch": 7.054208273894437, + "grad_norm": 7.03125, + "learning_rate": 6.716e-05, + "loss": 0.8025, + "step": 9890 + }, + { + "epoch": 7.0613409415121255, + "grad_norm": 7.15625, + "learning_rate": 6.711555555555555e-05, + "loss": 0.8237, + "step": 9900 + }, + { + "epoch": 7.0613409415121255, + "eval/acc": 62.79069900512695, + "step": 9900 + }, + { + "epoch": 7.0613409415121255, + "eval_loss": 2.023484706878662, + "eval_runtime": 1.3641, + "eval_samples_per_second": 31.524, + "eval_steps_per_second": 0.733, + "step": 9900 + }, + { + "epoch": 7.068473609129814, + "grad_norm": 10.375, + "learning_rate": 6.707111111111111e-05, + "loss": 0.7141, + "step": 9910 + }, + { + "epoch": 7.075606276747504, + "grad_norm": 9.25, + "learning_rate": 6.702666666666667e-05, + "loss": 0.7963, + "step": 9920 + }, + { + "epoch": 7.082738944365192, + "grad_norm": 7.375, + "learning_rate": 6.698222222222222e-05, + "loss": 0.7935, + "step": 9930 + }, + { + "epoch": 7.089871611982882, + "grad_norm": 6.8125, + "learning_rate": 6.693777777777778e-05, + "loss": 0.7882, + "step": 9940 + }, + { + "epoch": 7.09700427960057, + "grad_norm": 7.0625, + "learning_rate": 6.689333333333335e-05, + "loss": 0.7698, + "step": 9950 + }, + { + "epoch": 7.10413694721826, + "grad_norm": 6.9375, + "learning_rate": 6.68488888888889e-05, + "loss": 0.8595, + "step": 9960 + }, + { + "epoch": 7.111269614835948, + "grad_norm": 9.5, + "learning_rate": 6.680444444444444e-05, + "loss": 0.8158, + "step": 9970 + }, + { + "epoch": 7.118402282453638, + "grad_norm": 8.375, + "learning_rate": 6.676e-05, + "loss": 0.7916, + "step": 9980 + }, + { + "epoch": 7.1255349500713265, + "grad_norm": 6.3125, + "learning_rate": 6.671555555555555e-05, + "loss": 0.7455, + "step": 9990 + }, + { + "epoch": 7.132667617689016, + "grad_norm": 7.375, + "learning_rate": 6.667111111111112e-05, + "loss": 0.7398, + "step": 10000 + }, + { + "epoch": 7.132667617689016, + "eval/acc": 65.11627960205078, + "step": 10000 + }, + { + "epoch": 7.132667617689016, + "eval_loss": 2.0408403873443604, + "eval_runtime": 0.2184, + "eval_samples_per_second": 196.923, + "eval_steps_per_second": 4.58, + "step": 10000 + }, + { + "epoch": 7.139800285306705, + "grad_norm": 8.375, + "learning_rate": 6.662666666666668e-05, + "loss": 0.8887, + "step": 10010 + }, + { + "epoch": 7.146932952924394, + "grad_norm": 8.5, + "learning_rate": 6.658222222222223e-05, + "loss": 0.8945, + "step": 10020 + }, + { + "epoch": 7.154065620542083, + "grad_norm": 22.5, + "learning_rate": 6.653777777777777e-05, + "loss": 0.7934, + "step": 10030 + }, + { + "epoch": 7.161198288159771, + "grad_norm": 7.34375, + "learning_rate": 6.649333333333334e-05, + "loss": 0.8056, + "step": 10040 + }, + { + "epoch": 7.168330955777461, + "grad_norm": 7.59375, + "learning_rate": 6.64488888888889e-05, + "loss": 0.7893, + "step": 10050 + }, + { + "epoch": 7.175463623395149, + "grad_norm": 8.5, + "learning_rate": 6.640444444444445e-05, + "loss": 1.0099, + "step": 10060 + }, + { + "epoch": 7.182596291012839, + "grad_norm": 8.0625, + "learning_rate": 6.636e-05, + "loss": 0.8701, + "step": 10070 + }, + { + "epoch": 7.1897289586305275, + "grad_norm": 9.25, + "learning_rate": 6.631555555555557e-05, + "loss": 0.8203, + "step": 10080 + }, + { + "epoch": 7.196861626248217, + "grad_norm": 7.90625, + "learning_rate": 6.627111111111112e-05, + "loss": 0.8197, + "step": 10090 + }, + { + "epoch": 7.203994293865906, + "grad_norm": 6.03125, + "learning_rate": 6.622666666666667e-05, + "loss": 0.8087, + "step": 10100 + }, + { + "epoch": 7.203994293865906, + "eval/acc": 60.46511459350586, + "step": 10100 + }, + { + "epoch": 7.203994293865906, + "eval_loss": 1.940862774848938, + "eval_runtime": 0.2277, + "eval_samples_per_second": 188.814, + "eval_steps_per_second": 4.391, + "step": 10100 + }, + { + "epoch": 7.211126961483595, + "grad_norm": 8.1875, + "learning_rate": 6.618222222222223e-05, + "loss": 0.7932, + "step": 10110 + }, + { + "epoch": 7.218259629101284, + "grad_norm": 7.4375, + "learning_rate": 6.613777777777778e-05, + "loss": 0.7562, + "step": 10120 + }, + { + "epoch": 7.225392296718973, + "grad_norm": 8.4375, + "learning_rate": 6.609333333333334e-05, + "loss": 0.8474, + "step": 10130 + }, + { + "epoch": 7.232524964336662, + "grad_norm": 8.0, + "learning_rate": 6.604888888888889e-05, + "loss": 0.8197, + "step": 10140 + }, + { + "epoch": 7.239657631954351, + "grad_norm": 8.0625, + "learning_rate": 6.600444444444445e-05, + "loss": 0.7804, + "step": 10150 + }, + { + "epoch": 7.24679029957204, + "grad_norm": 21.25, + "learning_rate": 6.596e-05, + "loss": 0.8914, + "step": 10160 + }, + { + "epoch": 7.2539229671897285, + "grad_norm": 7.125, + "learning_rate": 6.591555555555556e-05, + "loss": 0.8185, + "step": 10170 + }, + { + "epoch": 7.261055634807418, + "grad_norm": 6.6875, + "learning_rate": 6.587111111111112e-05, + "loss": 0.7911, + "step": 10180 + }, + { + "epoch": 7.268188302425107, + "grad_norm": 7.21875, + "learning_rate": 6.582666666666667e-05, + "loss": 0.8004, + "step": 10190 + }, + { + "epoch": 7.275320970042796, + "grad_norm": 7.0, + "learning_rate": 6.578222222222222e-05, + "loss": 0.7226, + "step": 10200 + }, + { + "epoch": 7.275320970042796, + "eval/acc": 60.46511459350586, + "step": 10200 + }, + { + "epoch": 7.275320970042796, + "eval_loss": 2.004242420196533, + "eval_runtime": 0.2197, + "eval_samples_per_second": 195.738, + "eval_steps_per_second": 4.552, + "step": 10200 + }, + { + "epoch": 7.282453637660485, + "grad_norm": 16.25, + "learning_rate": 6.573777777777778e-05, + "loss": 0.8735, + "step": 10210 + }, + { + "epoch": 7.289586305278174, + "grad_norm": 6.8125, + "learning_rate": 6.569333333333334e-05, + "loss": 0.8356, + "step": 10220 + }, + { + "epoch": 7.296718972895863, + "grad_norm": 5.65625, + "learning_rate": 6.564888888888889e-05, + "loss": 0.8032, + "step": 10230 + }, + { + "epoch": 7.303851640513552, + "grad_norm": 6.125, + "learning_rate": 6.560444444444444e-05, + "loss": 0.7803, + "step": 10240 + }, + { + "epoch": 7.310984308131241, + "grad_norm": 9.375, + "learning_rate": 6.556e-05, + "loss": 0.8748, + "step": 10250 + }, + { + "epoch": 7.31811697574893, + "grad_norm": 6.625, + "learning_rate": 6.551555555555556e-05, + "loss": 0.7793, + "step": 10260 + }, + { + "epoch": 7.325249643366619, + "grad_norm": 13.625, + "learning_rate": 6.547111111111111e-05, + "loss": 0.8052, + "step": 10270 + }, + { + "epoch": 7.3323823109843085, + "grad_norm": 8.6875, + "learning_rate": 6.542666666666667e-05, + "loss": 0.8387, + "step": 10280 + }, + { + "epoch": 7.339514978601997, + "grad_norm": 6.84375, + "learning_rate": 6.538222222222222e-05, + "loss": 0.8713, + "step": 10290 + }, + { + "epoch": 7.346647646219687, + "grad_norm": 9.875, + "learning_rate": 6.533777777777777e-05, + "loss": 0.7266, + "step": 10300 + }, + { + "epoch": 7.346647646219687, + "eval/acc": 62.79069900512695, + "step": 10300 + }, + { + "epoch": 7.346647646219687, + "eval_loss": 1.9304108619689941, + "eval_runtime": 0.222, + "eval_samples_per_second": 193.684, + "eval_steps_per_second": 4.504, + "step": 10300 + }, + { + "epoch": 7.353780313837375, + "grad_norm": 9.5625, + "learning_rate": 6.529333333333333e-05, + "loss": 0.7775, + "step": 10310 + }, + { + "epoch": 7.360912981455064, + "grad_norm": 8.0625, + "learning_rate": 6.52488888888889e-05, + "loss": 0.7669, + "step": 10320 + }, + { + "epoch": 7.368045649072753, + "grad_norm": 7.0625, + "learning_rate": 6.520444444444444e-05, + "loss": 0.897, + "step": 10330 + }, + { + "epoch": 7.375178316690442, + "grad_norm": 7.4375, + "learning_rate": 6.515999999999999e-05, + "loss": 0.7859, + "step": 10340 + }, + { + "epoch": 7.382310984308131, + "grad_norm": 10.25, + "learning_rate": 6.511555555555557e-05, + "loss": 0.9447, + "step": 10350 + }, + { + "epoch": 7.38944365192582, + "grad_norm": 7.21875, + "learning_rate": 6.507111111111112e-05, + "loss": 0.78, + "step": 10360 + }, + { + "epoch": 7.3965763195435095, + "grad_norm": 8.625, + "learning_rate": 6.502666666666667e-05, + "loss": 0.9362, + "step": 10370 + }, + { + "epoch": 7.403708987161198, + "grad_norm": 8.125, + "learning_rate": 6.498222222222223e-05, + "loss": 0.7343, + "step": 10380 + }, + { + "epoch": 7.410841654778888, + "grad_norm": 8.125, + "learning_rate": 6.493777777777779e-05, + "loss": 0.8328, + "step": 10390 + }, + { + "epoch": 7.417974322396576, + "grad_norm": 7.8125, + "learning_rate": 6.489333333333334e-05, + "loss": 0.8261, + "step": 10400 + }, + { + "epoch": 7.417974322396576, + "eval/acc": 62.79069900512695, + "step": 10400 + }, + { + "epoch": 7.417974322396576, + "eval_loss": 1.9274901151657104, + "eval_runtime": 0.2217, + "eval_samples_per_second": 193.989, + "eval_steps_per_second": 4.511, + "step": 10400 + }, + { + "epoch": 7.425106990014266, + "grad_norm": 48.0, + "learning_rate": 6.484888888888889e-05, + "loss": 0.8167, + "step": 10410 + }, + { + "epoch": 7.432239657631954, + "grad_norm": 8.4375, + "learning_rate": 6.480444444444445e-05, + "loss": 0.8262, + "step": 10420 + }, + { + "epoch": 7.439372325249644, + "grad_norm": 6.90625, + "learning_rate": 6.476e-05, + "loss": 0.9254, + "step": 10430 + }, + { + "epoch": 7.446504992867332, + "grad_norm": 8.5625, + "learning_rate": 6.471555555555556e-05, + "loss": 0.7657, + "step": 10440 + }, + { + "epoch": 7.453637660485022, + "grad_norm": 6.875, + "learning_rate": 6.467111111111112e-05, + "loss": 0.8123, + "step": 10450 + }, + { + "epoch": 7.4607703281027105, + "grad_norm": 8.5625, + "learning_rate": 6.462666666666667e-05, + "loss": 0.8951, + "step": 10460 + }, + { + "epoch": 7.467902995720399, + "grad_norm": 7.46875, + "learning_rate": 6.458222222222222e-05, + "loss": 0.8287, + "step": 10470 + }, + { + "epoch": 7.4750356633380886, + "grad_norm": 6.28125, + "learning_rate": 6.453777777777778e-05, + "loss": 0.7364, + "step": 10480 + }, + { + "epoch": 7.482168330955777, + "grad_norm": 7.625, + "learning_rate": 6.449333333333334e-05, + "loss": 0.9265, + "step": 10490 + }, + { + "epoch": 7.489300998573467, + "grad_norm": 7.15625, + "learning_rate": 6.444888888888889e-05, + "loss": 0.7547, + "step": 10500 + }, + { + "epoch": 7.489300998573467, + "eval/acc": 62.79069900512695, + "step": 10500 + }, + { + "epoch": 7.489300998573467, + "eval_loss": 1.9239764213562012, + "eval_runtime": 0.2285, + "eval_samples_per_second": 188.187, + "eval_steps_per_second": 4.376, + "step": 10500 + }, + { + "epoch": 7.496433666191155, + "grad_norm": 7.875, + "learning_rate": 6.440444444444444e-05, + "loss": 0.8612, + "step": 10510 + }, + { + "epoch": 7.503566333808845, + "grad_norm": 7.46875, + "learning_rate": 6.436e-05, + "loss": 0.8751, + "step": 10520 + }, + { + "epoch": 7.510699001426533, + "grad_norm": 6.78125, + "learning_rate": 6.431555555555556e-05, + "loss": 0.7706, + "step": 10530 + }, + { + "epoch": 7.517831669044223, + "grad_norm": 6.375, + "learning_rate": 6.427111111111111e-05, + "loss": 0.7602, + "step": 10540 + }, + { + "epoch": 7.5249643366619114, + "grad_norm": 7.1875, + "learning_rate": 6.422666666666667e-05, + "loss": 0.7953, + "step": 10550 + }, + { + "epoch": 7.532097004279601, + "grad_norm": 6.5, + "learning_rate": 6.418222222222222e-05, + "loss": 0.871, + "step": 10560 + }, + { + "epoch": 7.5392296718972895, + "grad_norm": 6.65625, + "learning_rate": 6.413777777777778e-05, + "loss": 0.7343, + "step": 10570 + }, + { + "epoch": 7.546362339514978, + "grad_norm": 6.3125, + "learning_rate": 6.409333333333333e-05, + "loss": 0.8275, + "step": 10580 + }, + { + "epoch": 7.553495007132668, + "grad_norm": 6.125, + "learning_rate": 6.40488888888889e-05, + "loss": 0.8243, + "step": 10590 + }, + { + "epoch": 7.560627674750357, + "grad_norm": 7.75, + "learning_rate": 6.400444444444444e-05, + "loss": 0.8731, + "step": 10600 + }, + { + "epoch": 7.560627674750357, + "eval/acc": 58.13953399658203, + "step": 10600 + }, + { + "epoch": 7.560627674750357, + "eval_loss": 1.9751547574996948, + "eval_runtime": 0.2209, + "eval_samples_per_second": 194.634, + "eval_steps_per_second": 4.526, + "step": 10600 + }, + { + "epoch": 7.567760342368046, + "grad_norm": 7.34375, + "learning_rate": 6.396e-05, + "loss": 0.7555, + "step": 10610 + }, + { + "epoch": 7.574893009985734, + "grad_norm": 9.1875, + "learning_rate": 6.391555555555557e-05, + "loss": 0.7415, + "step": 10620 + }, + { + "epoch": 7.582025677603424, + "grad_norm": 11.875, + "learning_rate": 6.387111111111111e-05, + "loss": 0.7363, + "step": 10630 + }, + { + "epoch": 7.589158345221112, + "grad_norm": 7.90625, + "learning_rate": 6.382666666666666e-05, + "loss": 0.858, + "step": 10640 + }, + { + "epoch": 7.596291012838802, + "grad_norm": 8.25, + "learning_rate": 6.378222222222223e-05, + "loss": 0.7934, + "step": 10650 + }, + { + "epoch": 7.6034236804564905, + "grad_norm": 6.84375, + "learning_rate": 6.373777777777779e-05, + "loss": 0.7867, + "step": 10660 + }, + { + "epoch": 7.61055634807418, + "grad_norm": 8.3125, + "learning_rate": 6.369333333333334e-05, + "loss": 0.8519, + "step": 10670 + }, + { + "epoch": 7.617689015691869, + "grad_norm": 8.25, + "learning_rate": 6.36488888888889e-05, + "loss": 0.8771, + "step": 10680 + }, + { + "epoch": 7.624821683309558, + "grad_norm": 6.1875, + "learning_rate": 6.360444444444445e-05, + "loss": 0.8483, + "step": 10690 + }, + { + "epoch": 7.631954350927247, + "grad_norm": 34.25, + "learning_rate": 6.356000000000001e-05, + "loss": 0.8799, + "step": 10700 + }, + { + "epoch": 7.631954350927247, + "eval/acc": 62.79069900512695, + "step": 10700 + }, + { + "epoch": 7.631954350927247, + "eval_loss": 1.9270039796829224, + "eval_runtime": 0.2382, + "eval_samples_per_second": 180.545, + "eval_steps_per_second": 4.199, + "step": 10700 + }, + { + "epoch": 7.639087018544936, + "grad_norm": 7.875, + "learning_rate": 6.351555555555556e-05, + "loss": 0.8321, + "step": 10710 + }, + { + "epoch": 7.646219686162625, + "grad_norm": 7.0, + "learning_rate": 6.347111111111112e-05, + "loss": 0.8192, + "step": 10720 + }, + { + "epoch": 7.653352353780313, + "grad_norm": 8.25, + "learning_rate": 6.342666666666667e-05, + "loss": 0.7631, + "step": 10730 + }, + { + "epoch": 7.660485021398003, + "grad_norm": 7.1875, + "learning_rate": 6.338222222222222e-05, + "loss": 0.8088, + "step": 10740 + }, + { + "epoch": 7.6676176890156915, + "grad_norm": 6.5, + "learning_rate": 6.333777777777779e-05, + "loss": 0.7612, + "step": 10750 + }, + { + "epoch": 7.674750356633381, + "grad_norm": 7.5, + "learning_rate": 6.329333333333334e-05, + "loss": 0.8282, + "step": 10760 + }, + { + "epoch": 7.68188302425107, + "grad_norm": 6.46875, + "learning_rate": 6.324888888888889e-05, + "loss": 0.8197, + "step": 10770 + }, + { + "epoch": 7.689015691868759, + "grad_norm": 12.0, + "learning_rate": 6.320444444444445e-05, + "loss": 0.8304, + "step": 10780 + }, + { + "epoch": 7.696148359486448, + "grad_norm": 7.875, + "learning_rate": 6.316000000000001e-05, + "loss": 0.8242, + "step": 10790 + }, + { + "epoch": 7.703281027104137, + "grad_norm": 7.34375, + "learning_rate": 6.311555555555556e-05, + "loss": 0.7904, + "step": 10800 + }, + { + "epoch": 7.703281027104137, + "eval/acc": 60.46511459350586, + "step": 10800 + }, + { + "epoch": 7.703281027104137, + "eval_loss": 1.931999683380127, + "eval_runtime": 0.2226, + "eval_samples_per_second": 193.21, + "eval_steps_per_second": 4.493, + "step": 10800 + }, + { + "epoch": 7.710413694721826, + "grad_norm": 8.625, + "learning_rate": 6.307111111111111e-05, + "loss": 0.861, + "step": 10810 + }, + { + "epoch": 7.717546362339515, + "grad_norm": 12.0, + "learning_rate": 6.302666666666667e-05, + "loss": 0.7917, + "step": 10820 + }, + { + "epoch": 7.724679029957204, + "grad_norm": 6.5, + "learning_rate": 6.298222222222222e-05, + "loss": 0.709, + "step": 10830 + }, + { + "epoch": 7.731811697574893, + "grad_norm": 6.96875, + "learning_rate": 6.293777777777778e-05, + "loss": 0.8168, + "step": 10840 + }, + { + "epoch": 7.738944365192582, + "grad_norm": 7.625, + "learning_rate": 6.289333333333334e-05, + "loss": 0.7357, + "step": 10850 + }, + { + "epoch": 7.7460770328102715, + "grad_norm": 17.125, + "learning_rate": 6.284888888888889e-05, + "loss": 0.7115, + "step": 10860 + }, + { + "epoch": 7.75320970042796, + "grad_norm": 6.78125, + "learning_rate": 6.280444444444444e-05, + "loss": 0.6973, + "step": 10870 + }, + { + "epoch": 7.760342368045649, + "grad_norm": 6.75, + "learning_rate": 6.276e-05, + "loss": 0.7925, + "step": 10880 + }, + { + "epoch": 7.767475035663338, + "grad_norm": 6.78125, + "learning_rate": 6.271555555555556e-05, + "loss": 0.7927, + "step": 10890 + }, + { + "epoch": 7.774607703281027, + "grad_norm": 7.375, + "learning_rate": 6.267111111111111e-05, + "loss": 0.9383, + "step": 10900 + }, + { + "epoch": 7.774607703281027, + "eval/acc": 62.79069900512695, + "step": 10900 + }, + { + "epoch": 7.774607703281027, + "eval_loss": 1.947619915008545, + "eval_runtime": 0.2178, + "eval_samples_per_second": 197.442, + "eval_steps_per_second": 4.592, + "step": 10900 + }, + { + "epoch": 7.781740370898716, + "grad_norm": 13.75, + "learning_rate": 6.262666666666666e-05, + "loss": 0.8463, + "step": 10910 + }, + { + "epoch": 7.788873038516405, + "grad_norm": 10.875, + "learning_rate": 6.258222222222222e-05, + "loss": 0.7938, + "step": 10920 + }, + { + "epoch": 7.796005706134094, + "grad_norm": 7.75, + "learning_rate": 6.253777777777779e-05, + "loss": 0.8174, + "step": 10930 + }, + { + "epoch": 7.803138373751783, + "grad_norm": 6.3125, + "learning_rate": 6.249333333333333e-05, + "loss": 0.7583, + "step": 10940 + }, + { + "epoch": 7.8102710413694725, + "grad_norm": 6.625, + "learning_rate": 6.24488888888889e-05, + "loss": 0.7677, + "step": 10950 + }, + { + "epoch": 7.817403708987161, + "grad_norm": 7.03125, + "learning_rate": 6.240444444444444e-05, + "loss": 0.8211, + "step": 10960 + }, + { + "epoch": 7.824536376604851, + "grad_norm": 6.78125, + "learning_rate": 6.236e-05, + "loss": 0.8165, + "step": 10970 + }, + { + "epoch": 7.831669044222539, + "grad_norm": 7.25, + "learning_rate": 6.231555555555555e-05, + "loss": 0.8452, + "step": 10980 + }, + { + "epoch": 7.838801711840228, + "grad_norm": 7.78125, + "learning_rate": 6.227111111111112e-05, + "loss": 0.7316, + "step": 10990 + }, + { + "epoch": 7.845934379457917, + "grad_norm": 7.1875, + "learning_rate": 6.222666666666666e-05, + "loss": 0.7908, + "step": 11000 + }, + { + "epoch": 7.845934379457917, + "eval/acc": 60.46511459350586, + "step": 11000 + }, + { + "epoch": 7.845934379457917, + "eval_loss": 1.9402235746383667, + "eval_runtime": 0.2151, + "eval_samples_per_second": 199.927, + "eval_steps_per_second": 4.649, + "step": 11000 + }, + { + "epoch": 7.853067047075607, + "grad_norm": 7.3125, + "learning_rate": 6.218222222222223e-05, + "loss": 0.8226, + "step": 11010 + }, + { + "epoch": 7.860199714693295, + "grad_norm": 6.1875, + "learning_rate": 6.213777777777779e-05, + "loss": 0.7946, + "step": 11020 + }, + { + "epoch": 7.867332382310984, + "grad_norm": 6.21875, + "learning_rate": 6.209333333333334e-05, + "loss": 0.8494, + "step": 11030 + }, + { + "epoch": 7.8744650499286735, + "grad_norm": 6.875, + "learning_rate": 6.204888888888889e-05, + "loss": 0.7066, + "step": 11040 + }, + { + "epoch": 7.881597717546362, + "grad_norm": 6.375, + "learning_rate": 6.200444444444445e-05, + "loss": 0.8499, + "step": 11050 + }, + { + "epoch": 7.888730385164052, + "grad_norm": 8.0, + "learning_rate": 6.196000000000001e-05, + "loss": 0.8761, + "step": 11060 + }, + { + "epoch": 7.89586305278174, + "grad_norm": 5.75, + "learning_rate": 6.191555555555556e-05, + "loss": 0.8536, + "step": 11070 + }, + { + "epoch": 7.90299572039943, + "grad_norm": 7.0, + "learning_rate": 6.18711111111111e-05, + "loss": 0.9413, + "step": 11080 + }, + { + "epoch": 7.910128388017118, + "grad_norm": 8.0, + "learning_rate": 6.182666666666667e-05, + "loss": 0.7626, + "step": 11090 + }, + { + "epoch": 7.917261055634808, + "grad_norm": 6.375, + "learning_rate": 6.178222222222223e-05, + "loss": 0.8177, + "step": 11100 + }, + { + "epoch": 7.917261055634808, + "eval/acc": 65.11627960205078, + "step": 11100 + }, + { + "epoch": 7.917261055634808, + "eval_loss": 1.8976689577102661, + "eval_runtime": 0.2399, + "eval_samples_per_second": 179.237, + "eval_steps_per_second": 4.168, + "step": 11100 + }, + { + "epoch": 7.924393723252496, + "grad_norm": 7.4375, + "learning_rate": 6.173777777777778e-05, + "loss": 0.8178, + "step": 11110 + }, + { + "epoch": 7.931526390870186, + "grad_norm": 35.25, + "learning_rate": 6.169333333333334e-05, + "loss": 0.7931, + "step": 11120 + }, + { + "epoch": 7.9386590584878745, + "grad_norm": 26.75, + "learning_rate": 6.164888888888889e-05, + "loss": 0.7883, + "step": 11130 + }, + { + "epoch": 7.945791726105563, + "grad_norm": 6.375, + "learning_rate": 6.160444444444444e-05, + "loss": 0.7407, + "step": 11140 + }, + { + "epoch": 7.9529243937232525, + "grad_norm": 8.5, + "learning_rate": 6.156e-05, + "loss": 0.8509, + "step": 11150 + }, + { + "epoch": 7.960057061340941, + "grad_norm": 7.34375, + "learning_rate": 6.151555555555556e-05, + "loss": 0.7948, + "step": 11160 + }, + { + "epoch": 7.967189728958631, + "grad_norm": 5.90625, + "learning_rate": 6.147111111111111e-05, + "loss": 0.8066, + "step": 11170 + }, + { + "epoch": 7.974322396576319, + "grad_norm": 6.8125, + "learning_rate": 6.142666666666666e-05, + "loss": 0.7545, + "step": 11180 + }, + { + "epoch": 7.981455064194009, + "grad_norm": 7.40625, + "learning_rate": 6.138222222222223e-05, + "loss": 0.8842, + "step": 11190 + }, + { + "epoch": 7.988587731811697, + "grad_norm": 8.625, + "learning_rate": 6.133777777777778e-05, + "loss": 0.8874, + "step": 11200 + }, + { + "epoch": 7.988587731811697, + "eval/acc": 60.46511459350586, + "step": 11200 + }, + { + "epoch": 7.988587731811697, + "eval_loss": 1.9585436582565308, + "eval_runtime": 0.2145, + "eval_samples_per_second": 200.499, + "eval_steps_per_second": 4.663, + "step": 11200 + }, + { + "epoch": 7.995720399429387, + "grad_norm": 10.125, + "learning_rate": 6.129333333333333e-05, + "loss": 0.8723, + "step": 11210 + }, + { + "epoch": 8.002853067047075, + "grad_norm": 6.375, + "learning_rate": 6.12488888888889e-05, + "loss": 0.7986, + "step": 11220 + }, + { + "epoch": 8.009985734664765, + "grad_norm": 7.34375, + "learning_rate": 6.120444444444444e-05, + "loss": 0.8382, + "step": 11230 + }, + { + "epoch": 8.017118402282454, + "grad_norm": 6.21875, + "learning_rate": 6.116e-05, + "loss": 0.796, + "step": 11240 + }, + { + "epoch": 8.024251069900142, + "grad_norm": 30.5, + "learning_rate": 6.111555555555557e-05, + "loss": 0.8541, + "step": 11250 + }, + { + "epoch": 8.031383737517832, + "grad_norm": 7.90625, + "learning_rate": 6.107111111111111e-05, + "loss": 0.7689, + "step": 11260 + }, + { + "epoch": 8.038516405135521, + "grad_norm": 10.375, + "learning_rate": 6.102666666666666e-05, + "loss": 0.803, + "step": 11270 + }, + { + "epoch": 8.045649072753209, + "grad_norm": 8.3125, + "learning_rate": 6.098222222222223e-05, + "loss": 0.9584, + "step": 11280 + }, + { + "epoch": 8.052781740370898, + "grad_norm": 7.8125, + "learning_rate": 6.093777777777778e-05, + "loss": 0.761, + "step": 11290 + }, + { + "epoch": 8.059914407988588, + "grad_norm": 9.125, + "learning_rate": 6.0893333333333335e-05, + "loss": 0.7506, + "step": 11300 + }, + { + "epoch": 8.059914407988588, + "eval/acc": 48.83720779418945, + "step": 11300 + }, + { + "epoch": 8.059914407988588, + "eval_loss": 2.348471164703369, + "eval_runtime": 0.9666, + "eval_samples_per_second": 44.484, + "eval_steps_per_second": 1.035, + "step": 11300 + }, + { + "epoch": 8.067047075606277, + "grad_norm": 9.0625, + "learning_rate": 6.084888888888889e-05, + "loss": 0.7246, + "step": 11310 + }, + { + "epoch": 8.074179743223965, + "grad_norm": 24.5, + "learning_rate": 6.080444444444445e-05, + "loss": 0.8399, + "step": 11320 + }, + { + "epoch": 8.081312410841655, + "grad_norm": 8.0625, + "learning_rate": 6.076000000000001e-05, + "loss": 0.8196, + "step": 11330 + }, + { + "epoch": 8.088445078459344, + "grad_norm": 7.5625, + "learning_rate": 6.0715555555555556e-05, + "loss": 0.7496, + "step": 11340 + }, + { + "epoch": 8.095577746077034, + "grad_norm": 10.6875, + "learning_rate": 6.067111111111111e-05, + "loss": 0.791, + "step": 11350 + }, + { + "epoch": 8.102710413694721, + "grad_norm": 7.28125, + "learning_rate": 6.062666666666667e-05, + "loss": 0.7064, + "step": 11360 + }, + { + "epoch": 8.10984308131241, + "grad_norm": 7.28125, + "learning_rate": 6.058222222222223e-05, + "loss": 0.8306, + "step": 11370 + }, + { + "epoch": 8.1169757489301, + "grad_norm": 7.84375, + "learning_rate": 6.0537777777777784e-05, + "loss": 0.8394, + "step": 11380 + }, + { + "epoch": 8.12410841654779, + "grad_norm": 6.5625, + "learning_rate": 6.049333333333333e-05, + "loss": 0.789, + "step": 11390 + }, + { + "epoch": 8.131241084165477, + "grad_norm": 7.125, + "learning_rate": 6.044888888888889e-05, + "loss": 0.7752, + "step": 11400 + }, + { + "epoch": 8.131241084165477, + "eval/acc": 48.83720779418945, + "step": 11400 + }, + { + "epoch": 8.131241084165477, + "eval_loss": 2.3455872535705566, + "eval_runtime": 0.2144, + "eval_samples_per_second": 200.559, + "eval_steps_per_second": 4.664, + "step": 11400 + }, + { + "epoch": 8.138373751783167, + "grad_norm": 6.75, + "learning_rate": 6.040444444444445e-05, + "loss": 0.7773, + "step": 11410 + }, + { + "epoch": 8.145506419400856, + "grad_norm": 7.8125, + "learning_rate": 6.0360000000000005e-05, + "loss": 0.7369, + "step": 11420 + }, + { + "epoch": 8.152639087018544, + "grad_norm": 6.4375, + "learning_rate": 6.031555555555556e-05, + "loss": 0.8158, + "step": 11430 + }, + { + "epoch": 8.159771754636234, + "grad_norm": 7.53125, + "learning_rate": 6.027111111111111e-05, + "loss": 0.874, + "step": 11440 + }, + { + "epoch": 8.166904422253923, + "grad_norm": 8.0625, + "learning_rate": 6.0226666666666664e-05, + "loss": 0.7564, + "step": 11450 + }, + { + "epoch": 8.174037089871613, + "grad_norm": 6.65625, + "learning_rate": 6.0182222222222226e-05, + "loss": 0.8675, + "step": 11460 + }, + { + "epoch": 8.1811697574893, + "grad_norm": 7.34375, + "learning_rate": 6.013777777777778e-05, + "loss": 0.8338, + "step": 11470 + }, + { + "epoch": 8.18830242510699, + "grad_norm": 8.75, + "learning_rate": 6.0093333333333336e-05, + "loss": 0.7316, + "step": 11480 + }, + { + "epoch": 8.19543509272468, + "grad_norm": 8.625, + "learning_rate": 6.0048888888888885e-05, + "loss": 0.8842, + "step": 11490 + }, + { + "epoch": 8.202567760342369, + "grad_norm": 11.3125, + "learning_rate": 6.0004444444444453e-05, + "loss": 0.7852, + "step": 11500 + }, + { + "epoch": 8.202567760342369, + "eval/acc": 48.83720779418945, + "step": 11500 + }, + { + "epoch": 8.202567760342369, + "eval_loss": 2.352907657623291, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.909, + "eval_steps_per_second": 4.719, + "step": 11500 + }, + { + "epoch": 8.209700427960057, + "grad_norm": 7.9375, + "learning_rate": 5.996e-05, + "loss": 0.7898, + "step": 11510 + }, + { + "epoch": 8.216833095577746, + "grad_norm": 7.21875, + "learning_rate": 5.991555555555556e-05, + "loss": 0.7728, + "step": 11520 + }, + { + "epoch": 8.223965763195435, + "grad_norm": 8.1875, + "learning_rate": 5.987111111111111e-05, + "loss": 0.7456, + "step": 11530 + }, + { + "epoch": 8.231098430813125, + "grad_norm": 7.1875, + "learning_rate": 5.982666666666666e-05, + "loss": 0.8461, + "step": 11540 + }, + { + "epoch": 8.238231098430813, + "grad_norm": 7.9375, + "learning_rate": 5.978222222222223e-05, + "loss": 0.7297, + "step": 11550 + }, + { + "epoch": 8.245363766048502, + "grad_norm": 6.75, + "learning_rate": 5.973777777777778e-05, + "loss": 0.8327, + "step": 11560 + }, + { + "epoch": 8.252496433666192, + "grad_norm": 6.1875, + "learning_rate": 5.969333333333333e-05, + "loss": 0.8054, + "step": 11570 + }, + { + "epoch": 8.25962910128388, + "grad_norm": 8.375, + "learning_rate": 5.964888888888889e-05, + "loss": 0.7853, + "step": 11580 + }, + { + "epoch": 8.266761768901569, + "grad_norm": 8.9375, + "learning_rate": 5.960444444444445e-05, + "loss": 0.7891, + "step": 11590 + }, + { + "epoch": 8.273894436519258, + "grad_norm": 7.8125, + "learning_rate": 5.9560000000000006e-05, + "loss": 0.7407, + "step": 11600 + }, + { + "epoch": 8.273894436519258, + "eval/acc": 55.8139533996582, + "step": 11600 + }, + { + "epoch": 8.273894436519258, + "eval_loss": 2.3408679962158203, + "eval_runtime": 0.2119, + "eval_samples_per_second": 202.913, + "eval_steps_per_second": 4.719, + "step": 11600 + }, + { + "epoch": 8.281027104136948, + "grad_norm": 16.5, + "learning_rate": 5.951555555555556e-05, + "loss": 0.7662, + "step": 11610 + }, + { + "epoch": 8.288159771754636, + "grad_norm": 9.1875, + "learning_rate": 5.947111111111111e-05, + "loss": 0.8136, + "step": 11620 + }, + { + "epoch": 8.295292439372325, + "grad_norm": 6.5, + "learning_rate": 5.942666666666668e-05, + "loss": 0.8833, + "step": 11630 + }, + { + "epoch": 8.302425106990015, + "grad_norm": 9.8125, + "learning_rate": 5.938222222222223e-05, + "loss": 0.7388, + "step": 11640 + }, + { + "epoch": 8.309557774607704, + "grad_norm": 8.375, + "learning_rate": 5.933777777777778e-05, + "loss": 0.687, + "step": 11650 + }, + { + "epoch": 8.316690442225392, + "grad_norm": 6.75, + "learning_rate": 5.929333333333334e-05, + "loss": 0.7731, + "step": 11660 + }, + { + "epoch": 8.323823109843081, + "grad_norm": 5.875, + "learning_rate": 5.9248888888888886e-05, + "loss": 0.8294, + "step": 11670 + }, + { + "epoch": 8.33095577746077, + "grad_norm": 7.25, + "learning_rate": 5.9204444444444454e-05, + "loss": 0.8312, + "step": 11680 + }, + { + "epoch": 8.338088445078458, + "grad_norm": 6.15625, + "learning_rate": 5.916e-05, + "loss": 0.8745, + "step": 11690 + }, + { + "epoch": 8.345221112696148, + "grad_norm": 11.9375, + "learning_rate": 5.911555555555556e-05, + "loss": 0.8136, + "step": 11700 + }, + { + "epoch": 8.345221112696148, + "eval/acc": 53.488372802734375, + "step": 11700 + }, + { + "epoch": 8.345221112696148, + "eval_loss": 2.348762273788452, + "eval_runtime": 1.1232, + "eval_samples_per_second": 38.285, + "eval_steps_per_second": 0.89, + "step": 11700 + }, + { + "epoch": 8.352353780313837, + "grad_norm": 7.6875, + "learning_rate": 5.907111111111111e-05, + "loss": 0.8979, + "step": 11710 + }, + { + "epoch": 8.359486447931527, + "grad_norm": 7.75, + "learning_rate": 5.9026666666666675e-05, + "loss": 0.7527, + "step": 11720 + }, + { + "epoch": 8.366619115549215, + "grad_norm": 7.75, + "learning_rate": 5.898222222222223e-05, + "loss": 0.7397, + "step": 11730 + }, + { + "epoch": 8.373751783166904, + "grad_norm": 7.125, + "learning_rate": 5.893777777777778e-05, + "loss": 0.7371, + "step": 11740 + }, + { + "epoch": 8.380884450784594, + "grad_norm": 7.09375, + "learning_rate": 5.8893333333333334e-05, + "loss": 0.7787, + "step": 11750 + }, + { + "epoch": 8.388017118402283, + "grad_norm": 12.75, + "learning_rate": 5.884888888888889e-05, + "loss": 0.7745, + "step": 11760 + }, + { + "epoch": 8.39514978601997, + "grad_norm": 5.96875, + "learning_rate": 5.880444444444445e-05, + "loss": 0.7675, + "step": 11770 + }, + { + "epoch": 8.40228245363766, + "grad_norm": 7.28125, + "learning_rate": 5.876000000000001e-05, + "loss": 0.7369, + "step": 11780 + }, + { + "epoch": 8.40941512125535, + "grad_norm": 8.5625, + "learning_rate": 5.8715555555555555e-05, + "loss": 0.7679, + "step": 11790 + }, + { + "epoch": 8.41654778887304, + "grad_norm": 6.09375, + "learning_rate": 5.867111111111111e-05, + "loss": 0.7575, + "step": 11800 + }, + { + "epoch": 8.41654778887304, + "eval/acc": 48.83720779418945, + "step": 11800 + }, + { + "epoch": 8.41654778887304, + "eval_loss": 2.3886027336120605, + "eval_runtime": 0.2153, + "eval_samples_per_second": 199.752, + "eval_steps_per_second": 4.645, + "step": 11800 + }, + { + "epoch": 8.423680456490727, + "grad_norm": 7.75, + "learning_rate": 5.862666666666667e-05, + "loss": 0.7837, + "step": 11810 + }, + { + "epoch": 8.430813124108417, + "grad_norm": 7.03125, + "learning_rate": 5.858222222222223e-05, + "loss": 0.7153, + "step": 11820 + }, + { + "epoch": 8.437945791726106, + "grad_norm": 9.3125, + "learning_rate": 5.853777777777778e-05, + "loss": 0.7655, + "step": 11830 + }, + { + "epoch": 8.445078459343794, + "grad_norm": 7.34375, + "learning_rate": 5.849333333333333e-05, + "loss": 0.761, + "step": 11840 + }, + { + "epoch": 8.452211126961483, + "grad_norm": 8.875, + "learning_rate": 5.8448888888888886e-05, + "loss": 0.7985, + "step": 11850 + }, + { + "epoch": 8.459343794579173, + "grad_norm": 6.96875, + "learning_rate": 5.840444444444445e-05, + "loss": 0.7208, + "step": 11860 + }, + { + "epoch": 8.466476462196862, + "grad_norm": 6.25, + "learning_rate": 5.8360000000000004e-05, + "loss": 0.8474, + "step": 11870 + }, + { + "epoch": 8.47360912981455, + "grad_norm": 5.5625, + "learning_rate": 5.831555555555556e-05, + "loss": 0.773, + "step": 11880 + }, + { + "epoch": 8.48074179743224, + "grad_norm": 19.125, + "learning_rate": 5.827111111111111e-05, + "loss": 0.7026, + "step": 11890 + }, + { + "epoch": 8.487874465049929, + "grad_norm": 8.4375, + "learning_rate": 5.8226666666666676e-05, + "loss": 0.7825, + "step": 11900 + }, + { + "epoch": 8.487874465049929, + "eval/acc": 48.83720779418945, + "step": 11900 + }, + { + "epoch": 8.487874465049929, + "eval_loss": 2.395317316055298, + "eval_runtime": 0.2104, + "eval_samples_per_second": 204.361, + "eval_steps_per_second": 4.753, + "step": 11900 + }, + { + "epoch": 8.495007132667618, + "grad_norm": 8.5625, + "learning_rate": 5.8182222222222225e-05, + "loss": 0.8574, + "step": 11910 + }, + { + "epoch": 8.502139800285306, + "grad_norm": 8.0, + "learning_rate": 5.813777777777778e-05, + "loss": 0.8031, + "step": 11920 + }, + { + "epoch": 8.509272467902996, + "grad_norm": 8.125, + "learning_rate": 5.8093333333333335e-05, + "loss": 0.8578, + "step": 11930 + }, + { + "epoch": 8.516405135520685, + "grad_norm": 8.3125, + "learning_rate": 5.80488888888889e-05, + "loss": 0.854, + "step": 11940 + }, + { + "epoch": 8.523537803138375, + "grad_norm": 23.5, + "learning_rate": 5.800444444444445e-05, + "loss": 0.8375, + "step": 11950 + }, + { + "epoch": 8.530670470756062, + "grad_norm": 6.625, + "learning_rate": 5.796e-05, + "loss": 0.7793, + "step": 11960 + }, + { + "epoch": 8.537803138373752, + "grad_norm": 36.25, + "learning_rate": 5.7915555555555556e-05, + "loss": 0.7395, + "step": 11970 + }, + { + "epoch": 8.544935805991441, + "grad_norm": 7.96875, + "learning_rate": 5.787111111111111e-05, + "loss": 0.8492, + "step": 11980 + }, + { + "epoch": 8.552068473609129, + "grad_norm": 7.3125, + "learning_rate": 5.782666666666667e-05, + "loss": 0.7591, + "step": 11990 + }, + { + "epoch": 8.559201141226819, + "grad_norm": 13.75, + "learning_rate": 5.778222222222223e-05, + "loss": 0.7175, + "step": 12000 + }, + { + "epoch": 8.559201141226819, + "eval/acc": 48.83720779418945, + "step": 12000 + }, + { + "epoch": 8.559201141226819, + "eval_loss": 2.375894069671631, + "eval_runtime": 0.2121, + "eval_samples_per_second": 202.777, + "eval_steps_per_second": 4.716, + "step": 12000 + }, + { + "epoch": 8.566333808844508, + "grad_norm": 8.375, + "learning_rate": 5.773777777777778e-05, + "loss": 0.8009, + "step": 12010 + }, + { + "epoch": 8.573466476462198, + "grad_norm": 10.375, + "learning_rate": 5.769333333333333e-05, + "loss": 0.7651, + "step": 12020 + }, + { + "epoch": 8.580599144079885, + "grad_norm": 10.5, + "learning_rate": 5.7648888888888894e-05, + "loss": 0.7947, + "step": 12030 + }, + { + "epoch": 8.587731811697575, + "grad_norm": 9.375, + "learning_rate": 5.760444444444445e-05, + "loss": 0.8377, + "step": 12040 + }, + { + "epoch": 8.594864479315264, + "grad_norm": 7.0, + "learning_rate": 5.7560000000000005e-05, + "loss": 0.7803, + "step": 12050 + }, + { + "epoch": 8.601997146932954, + "grad_norm": 7.03125, + "learning_rate": 5.751555555555555e-05, + "loss": 0.7129, + "step": 12060 + }, + { + "epoch": 8.609129814550641, + "grad_norm": 9.6875, + "learning_rate": 5.747111111111111e-05, + "loss": 0.9395, + "step": 12070 + }, + { + "epoch": 8.616262482168331, + "grad_norm": 7.1875, + "learning_rate": 5.742666666666667e-05, + "loss": 0.8461, + "step": 12080 + }, + { + "epoch": 8.62339514978602, + "grad_norm": 8.3125, + "learning_rate": 5.7382222222222225e-05, + "loss": 0.8533, + "step": 12090 + }, + { + "epoch": 8.63052781740371, + "grad_norm": 7.75, + "learning_rate": 5.733777777777778e-05, + "loss": 0.7819, + "step": 12100 + }, + { + "epoch": 8.63052781740371, + "eval/acc": 46.511627197265625, + "step": 12100 + }, + { + "epoch": 8.63052781740371, + "eval_loss": 2.367159605026245, + "eval_runtime": 0.35, + "eval_samples_per_second": 122.848, + "eval_steps_per_second": 2.857, + "step": 12100 + }, + { + "epoch": 8.637660485021398, + "grad_norm": 8.1875, + "learning_rate": 5.729333333333333e-05, + "loss": 0.8752, + "step": 12110 + }, + { + "epoch": 8.644793152639087, + "grad_norm": 5.6875, + "learning_rate": 5.72488888888889e-05, + "loss": 0.8182, + "step": 12120 + }, + { + "epoch": 8.651925820256777, + "grad_norm": 7.09375, + "learning_rate": 5.7204444444444446e-05, + "loss": 0.8116, + "step": 12130 + }, + { + "epoch": 8.659058487874464, + "grad_norm": 7.65625, + "learning_rate": 5.716e-05, + "loss": 0.7563, + "step": 12140 + }, + { + "epoch": 8.666191155492154, + "grad_norm": 20.75, + "learning_rate": 5.711555555555556e-05, + "loss": 0.6896, + "step": 12150 + }, + { + "epoch": 8.673323823109843, + "grad_norm": 9.25, + "learning_rate": 5.7071111111111105e-05, + "loss": 0.8233, + "step": 12160 + }, + { + "epoch": 8.680456490727533, + "grad_norm": 11.0625, + "learning_rate": 5.7026666666666674e-05, + "loss": 0.8978, + "step": 12170 + }, + { + "epoch": 8.68758915834522, + "grad_norm": 8.1875, + "learning_rate": 5.698222222222222e-05, + "loss": 0.7671, + "step": 12180 + }, + { + "epoch": 8.69472182596291, + "grad_norm": 13.0625, + "learning_rate": 5.693777777777778e-05, + "loss": 0.7771, + "step": 12190 + }, + { + "epoch": 8.7018544935806, + "grad_norm": 8.25, + "learning_rate": 5.689333333333333e-05, + "loss": 0.758, + "step": 12200 + }, + { + "epoch": 8.7018544935806, + "eval/acc": 46.511627197265625, + "step": 12200 + }, + { + "epoch": 8.7018544935806, + "eval_loss": 2.3872835636138916, + "eval_runtime": 0.2139, + "eval_samples_per_second": 201.006, + "eval_steps_per_second": 4.675, + "step": 12200 + }, + { + "epoch": 8.708987161198289, + "grad_norm": 6.5, + "learning_rate": 5.6848888888888895e-05, + "loss": 0.8066, + "step": 12210 + }, + { + "epoch": 8.716119828815977, + "grad_norm": 7.21875, + "learning_rate": 5.680444444444445e-05, + "loss": 0.8287, + "step": 12220 + }, + { + "epoch": 8.723252496433666, + "grad_norm": 10.625, + "learning_rate": 5.6760000000000005e-05, + "loss": 0.917, + "step": 12230 + }, + { + "epoch": 8.730385164051356, + "grad_norm": 9.6875, + "learning_rate": 5.6715555555555554e-05, + "loss": 0.8417, + "step": 12240 + }, + { + "epoch": 8.737517831669045, + "grad_norm": 8.6875, + "learning_rate": 5.6671111111111116e-05, + "loss": 0.8405, + "step": 12250 + }, + { + "epoch": 8.744650499286733, + "grad_norm": 6.875, + "learning_rate": 5.662666666666667e-05, + "loss": 0.7838, + "step": 12260 + }, + { + "epoch": 8.751783166904422, + "grad_norm": 6.25, + "learning_rate": 5.6582222222222226e-05, + "loss": 0.6897, + "step": 12270 + }, + { + "epoch": 8.758915834522112, + "grad_norm": 7.375, + "learning_rate": 5.653777777777778e-05, + "loss": 0.7716, + "step": 12280 + }, + { + "epoch": 8.7660485021398, + "grad_norm": 7.96875, + "learning_rate": 5.649333333333333e-05, + "loss": 0.8497, + "step": 12290 + }, + { + "epoch": 8.773181169757489, + "grad_norm": 7.75, + "learning_rate": 5.64488888888889e-05, + "loss": 0.747, + "step": 12300 + }, + { + "epoch": 8.773181169757489, + "eval/acc": 48.83720779418945, + "step": 12300 + }, + { + "epoch": 8.773181169757489, + "eval_loss": 2.3708367347717285, + "eval_runtime": 0.2183, + "eval_samples_per_second": 197.001, + "eval_steps_per_second": 4.581, + "step": 12300 + }, + { + "epoch": 8.780313837375179, + "grad_norm": 7.28125, + "learning_rate": 5.640444444444445e-05, + "loss": 0.8225, + "step": 12310 + }, + { + "epoch": 8.787446504992868, + "grad_norm": 6.8125, + "learning_rate": 5.636e-05, + "loss": 0.684, + "step": 12320 + }, + { + "epoch": 8.794579172610556, + "grad_norm": 5.84375, + "learning_rate": 5.631555555555556e-05, + "loss": 0.8008, + "step": 12330 + }, + { + "epoch": 8.801711840228245, + "grad_norm": 6.8125, + "learning_rate": 5.627111111111112e-05, + "loss": 0.7119, + "step": 12340 + }, + { + "epoch": 8.808844507845935, + "grad_norm": 7.625, + "learning_rate": 5.6226666666666675e-05, + "loss": 0.7878, + "step": 12350 + }, + { + "epoch": 8.815977175463622, + "grad_norm": 6.5625, + "learning_rate": 5.6182222222222223e-05, + "loss": 0.8389, + "step": 12360 + }, + { + "epoch": 8.823109843081312, + "grad_norm": 7.8125, + "learning_rate": 5.613777777777778e-05, + "loss": 0.8858, + "step": 12370 + }, + { + "epoch": 8.830242510699001, + "grad_norm": 7.0, + "learning_rate": 5.6093333333333334e-05, + "loss": 0.797, + "step": 12380 + }, + { + "epoch": 8.837375178316691, + "grad_norm": 8.125, + "learning_rate": 5.6048888888888896e-05, + "loss": 0.7154, + "step": 12390 + }, + { + "epoch": 8.844507845934379, + "grad_norm": 6.59375, + "learning_rate": 5.600444444444445e-05, + "loss": 0.8543, + "step": 12400 + }, + { + "epoch": 8.844507845934379, + "eval/acc": 46.511627197265625, + "step": 12400 + }, + { + "epoch": 8.844507845934379, + "eval_loss": 2.3827686309814453, + "eval_runtime": 0.2144, + "eval_samples_per_second": 200.553, + "eval_steps_per_second": 4.664, + "step": 12400 + }, + { + "epoch": 8.851640513552068, + "grad_norm": 8.8125, + "learning_rate": 5.596e-05, + "loss": 0.8071, + "step": 12410 + }, + { + "epoch": 8.858773181169758, + "grad_norm": 6.0625, + "learning_rate": 5.5915555555555555e-05, + "loss": 0.7174, + "step": 12420 + }, + { + "epoch": 8.865905848787447, + "grad_norm": 9.8125, + "learning_rate": 5.587111111111112e-05, + "loss": 0.861, + "step": 12430 + }, + { + "epoch": 8.873038516405135, + "grad_norm": 8.0, + "learning_rate": 5.582666666666667e-05, + "loss": 0.831, + "step": 12440 + }, + { + "epoch": 8.880171184022824, + "grad_norm": 5.21875, + "learning_rate": 5.578222222222223e-05, + "loss": 0.7814, + "step": 12450 + }, + { + "epoch": 8.887303851640514, + "grad_norm": 6.78125, + "learning_rate": 5.5737777777777776e-05, + "loss": 0.6926, + "step": 12460 + }, + { + "epoch": 8.894436519258203, + "grad_norm": 8.6875, + "learning_rate": 5.569333333333333e-05, + "loss": 0.7977, + "step": 12470 + }, + { + "epoch": 8.901569186875891, + "grad_norm": 6.5625, + "learning_rate": 5.564888888888889e-05, + "loss": 0.7647, + "step": 12480 + }, + { + "epoch": 8.90870185449358, + "grad_norm": 10.875, + "learning_rate": 5.560444444444445e-05, + "loss": 0.8469, + "step": 12490 + }, + { + "epoch": 8.91583452211127, + "grad_norm": 12.0625, + "learning_rate": 5.556e-05, + "loss": 0.9152, + "step": 12500 + }, + { + "epoch": 8.91583452211127, + "eval/acc": 46.511627197265625, + "step": 12500 + }, + { + "epoch": 8.91583452211127, + "eval_loss": 2.3970413208007812, + "eval_runtime": 0.2133, + "eval_samples_per_second": 201.589, + "eval_steps_per_second": 4.688, + "step": 12500 + }, + { + "epoch": 8.922967189728958, + "grad_norm": 9.875, + "learning_rate": 5.551555555555555e-05, + "loss": 0.8202, + "step": 12510 + }, + { + "epoch": 8.930099857346647, + "grad_norm": 7.625, + "learning_rate": 5.547111111111112e-05, + "loss": 0.8159, + "step": 12520 + }, + { + "epoch": 8.937232524964337, + "grad_norm": 7.875, + "learning_rate": 5.542666666666667e-05, + "loss": 0.684, + "step": 12530 + }, + { + "epoch": 8.944365192582026, + "grad_norm": 6.59375, + "learning_rate": 5.5382222222222224e-05, + "loss": 0.7629, + "step": 12540 + }, + { + "epoch": 8.951497860199714, + "grad_norm": 6.90625, + "learning_rate": 5.533777777777778e-05, + "loss": 0.8227, + "step": 12550 + }, + { + "epoch": 8.958630527817403, + "grad_norm": 6.3125, + "learning_rate": 5.529333333333334e-05, + "loss": 0.8235, + "step": 12560 + }, + { + "epoch": 8.965763195435093, + "grad_norm": 6.5, + "learning_rate": 5.52488888888889e-05, + "loss": 0.7865, + "step": 12570 + }, + { + "epoch": 8.972895863052782, + "grad_norm": 5.875, + "learning_rate": 5.5204444444444445e-05, + "loss": 0.7331, + "step": 12580 + }, + { + "epoch": 8.98002853067047, + "grad_norm": 7.15625, + "learning_rate": 5.516e-05, + "loss": 0.8498, + "step": 12590 + }, + { + "epoch": 8.98716119828816, + "grad_norm": 7.75, + "learning_rate": 5.5115555555555556e-05, + "loss": 0.7825, + "step": 12600 + }, + { + "epoch": 8.98716119828816, + "eval/acc": 51.16279220581055, + "step": 12600 + }, + { + "epoch": 8.98716119828816, + "eval_loss": 2.3282017707824707, + "eval_runtime": 0.2152, + "eval_samples_per_second": 199.842, + "eval_steps_per_second": 4.647, + "step": 12600 + }, + { + "epoch": 8.99429386590585, + "grad_norm": 7.0, + "learning_rate": 5.507111111111112e-05, + "loss": 0.8485, + "step": 12610 + }, + { + "epoch": 9.001426533523539, + "grad_norm": 8.1875, + "learning_rate": 5.502666666666667e-05, + "loss": 0.8691, + "step": 12620 + }, + { + "epoch": 9.008559201141226, + "grad_norm": 8.875, + "learning_rate": 5.498222222222222e-05, + "loss": 0.8085, + "step": 12630 + }, + { + "epoch": 9.015691868758916, + "grad_norm": 10.875, + "learning_rate": 5.4937777777777777e-05, + "loss": 0.7221, + "step": 12640 + }, + { + "epoch": 9.022824536376605, + "grad_norm": 7.9375, + "learning_rate": 5.489333333333334e-05, + "loss": 0.8136, + "step": 12650 + }, + { + "epoch": 9.029957203994293, + "grad_norm": 6.78125, + "learning_rate": 5.4848888888888894e-05, + "loss": 0.6211, + "step": 12660 + }, + { + "epoch": 9.037089871611983, + "grad_norm": 7.09375, + "learning_rate": 5.480444444444445e-05, + "loss": 0.7893, + "step": 12670 + }, + { + "epoch": 9.044222539229672, + "grad_norm": 7.375, + "learning_rate": 5.476e-05, + "loss": 0.9348, + "step": 12680 + }, + { + "epoch": 9.051355206847362, + "grad_norm": 7.09375, + "learning_rate": 5.471555555555555e-05, + "loss": 0.9088, + "step": 12690 + }, + { + "epoch": 9.05848787446505, + "grad_norm": 7.9375, + "learning_rate": 5.4671111111111115e-05, + "loss": 0.8116, + "step": 12700 + }, + { + "epoch": 9.05848787446505, + "eval/acc": 32.55813980102539, + "step": 12700 + }, + { + "epoch": 9.05848787446505, + "eval_loss": 3.3768653869628906, + "eval_runtime": 1.089, + "eval_samples_per_second": 39.487, + "eval_steps_per_second": 0.918, + "step": 12700 + }, + { + "epoch": 9.065620542082739, + "grad_norm": 7.875, + "learning_rate": 5.462666666666667e-05, + "loss": 0.7748, + "step": 12710 + }, + { + "epoch": 9.072753209700428, + "grad_norm": 6.96875, + "learning_rate": 5.4582222222222225e-05, + "loss": 0.872, + "step": 12720 + }, + { + "epoch": 9.079885877318118, + "grad_norm": 7.59375, + "learning_rate": 5.4537777777777774e-05, + "loss": 0.8539, + "step": 12730 + }, + { + "epoch": 9.087018544935805, + "grad_norm": 6.9375, + "learning_rate": 5.449333333333334e-05, + "loss": 0.784, + "step": 12740 + }, + { + "epoch": 9.094151212553495, + "grad_norm": 6.5625, + "learning_rate": 5.444888888888889e-05, + "loss": 0.7998, + "step": 12750 + }, + { + "epoch": 9.101283880171184, + "grad_norm": 7.0625, + "learning_rate": 5.4404444444444446e-05, + "loss": 0.8213, + "step": 12760 + }, + { + "epoch": 9.108416547788874, + "grad_norm": 5.9375, + "learning_rate": 5.436e-05, + "loss": 0.8233, + "step": 12770 + }, + { + "epoch": 9.115549215406562, + "grad_norm": 6.53125, + "learning_rate": 5.431555555555555e-05, + "loss": 0.7617, + "step": 12780 + }, + { + "epoch": 9.122681883024251, + "grad_norm": 7.3125, + "learning_rate": 5.427111111111112e-05, + "loss": 0.8139, + "step": 12790 + }, + { + "epoch": 9.12981455064194, + "grad_norm": 7.625, + "learning_rate": 5.422666666666667e-05, + "loss": 0.7742, + "step": 12800 + }, + { + "epoch": 9.12981455064194, + "eval/acc": 34.88372039794922, + "step": 12800 + }, + { + "epoch": 9.12981455064194, + "eval_loss": 3.372913122177124, + "eval_runtime": 0.2597, + "eval_samples_per_second": 165.6, + "eval_steps_per_second": 3.851, + "step": 12800 + }, + { + "epoch": 9.136947218259628, + "grad_norm": 7.90625, + "learning_rate": 5.418222222222222e-05, + "loss": 0.8071, + "step": 12810 + }, + { + "epoch": 9.144079885877318, + "grad_norm": 6.5625, + "learning_rate": 5.413777777777778e-05, + "loss": 0.7691, + "step": 12820 + }, + { + "epoch": 9.151212553495007, + "grad_norm": 8.375, + "learning_rate": 5.409333333333334e-05, + "loss": 0.8105, + "step": 12830 + }, + { + "epoch": 9.158345221112697, + "grad_norm": 7.5, + "learning_rate": 5.4048888888888895e-05, + "loss": 0.83, + "step": 12840 + }, + { + "epoch": 9.165477888730384, + "grad_norm": 7.21875, + "learning_rate": 5.400444444444444e-05, + "loss": 0.8158, + "step": 12850 + }, + { + "epoch": 9.172610556348074, + "grad_norm": 8.0625, + "learning_rate": 5.396e-05, + "loss": 0.7359, + "step": 12860 + }, + { + "epoch": 9.179743223965763, + "grad_norm": 7.21875, + "learning_rate": 5.391555555555556e-05, + "loss": 0.7797, + "step": 12870 + }, + { + "epoch": 9.186875891583453, + "grad_norm": 11.8125, + "learning_rate": 5.3871111111111116e-05, + "loss": 0.8005, + "step": 12880 + }, + { + "epoch": 9.19400855920114, + "grad_norm": 14.0, + "learning_rate": 5.382666666666667e-05, + "loss": 0.8764, + "step": 12890 + }, + { + "epoch": 9.20114122681883, + "grad_norm": 6.96875, + "learning_rate": 5.3782222222222226e-05, + "loss": 0.6898, + "step": 12900 + }, + { + "epoch": 9.20114122681883, + "eval/acc": 32.55813980102539, + "step": 12900 + }, + { + "epoch": 9.20114122681883, + "eval_loss": 3.383354425430298, + "eval_runtime": 0.2453, + "eval_samples_per_second": 175.301, + "eval_steps_per_second": 4.077, + "step": 12900 + }, + { + "epoch": 9.20827389443652, + "grad_norm": 8.125, + "learning_rate": 5.3737777777777775e-05, + "loss": 0.8066, + "step": 12910 + }, + { + "epoch": 9.21540656205421, + "grad_norm": 7.96875, + "learning_rate": 5.369333333333334e-05, + "loss": 0.7809, + "step": 12920 + }, + { + "epoch": 9.222539229671897, + "grad_norm": 7.15625, + "learning_rate": 5.364888888888889e-05, + "loss": 0.7242, + "step": 12930 + }, + { + "epoch": 9.229671897289586, + "grad_norm": 7.65625, + "learning_rate": 5.360444444444445e-05, + "loss": 0.8201, + "step": 12940 + }, + { + "epoch": 9.236804564907276, + "grad_norm": 8.75, + "learning_rate": 5.356e-05, + "loss": 0.8531, + "step": 12950 + }, + { + "epoch": 9.243937232524964, + "grad_norm": 7.3125, + "learning_rate": 5.3515555555555564e-05, + "loss": 0.8004, + "step": 12960 + }, + { + "epoch": 9.251069900142653, + "grad_norm": 9.1875, + "learning_rate": 5.347111111111112e-05, + "loss": 0.8026, + "step": 12970 + }, + { + "epoch": 9.258202567760343, + "grad_norm": 8.75, + "learning_rate": 5.342666666666667e-05, + "loss": 0.9001, + "step": 12980 + }, + { + "epoch": 9.265335235378032, + "grad_norm": 6.75, + "learning_rate": 5.338222222222222e-05, + "loss": 0.8698, + "step": 12990 + }, + { + "epoch": 9.27246790299572, + "grad_norm": 5.75, + "learning_rate": 5.333777777777778e-05, + "loss": 0.7668, + "step": 13000 + }, + { + "epoch": 9.27246790299572, + "eval/acc": 34.88372039794922, + "step": 13000 + }, + { + "epoch": 9.27246790299572, + "eval_loss": 3.3350794315338135, + "eval_runtime": 0.2244, + "eval_samples_per_second": 191.644, + "eval_steps_per_second": 4.457, + "step": 13000 + }, + { + "epoch": 9.27960057061341, + "grad_norm": 7.90625, + "learning_rate": 5.329333333333334e-05, + "loss": 0.8908, + "step": 13010 + }, + { + "epoch": 9.286733238231099, + "grad_norm": 7.78125, + "learning_rate": 5.3248888888888896e-05, + "loss": 0.76, + "step": 13020 + }, + { + "epoch": 9.293865905848788, + "grad_norm": 22.125, + "learning_rate": 5.3204444444444444e-05, + "loss": 0.8205, + "step": 13030 + }, + { + "epoch": 9.300998573466476, + "grad_norm": 9.6875, + "learning_rate": 5.316e-05, + "loss": 0.7528, + "step": 13040 + }, + { + "epoch": 9.308131241084165, + "grad_norm": 6.8125, + "learning_rate": 5.311555555555556e-05, + "loss": 0.8987, + "step": 13050 + }, + { + "epoch": 9.315263908701855, + "grad_norm": 7.71875, + "learning_rate": 5.3071111111111116e-05, + "loss": 0.8056, + "step": 13060 + }, + { + "epoch": 9.322396576319543, + "grad_norm": 6.78125, + "learning_rate": 5.302666666666667e-05, + "loss": 0.7962, + "step": 13070 + }, + { + "epoch": 9.329529243937232, + "grad_norm": 6.3125, + "learning_rate": 5.298222222222222e-05, + "loss": 0.846, + "step": 13080 + }, + { + "epoch": 9.336661911554922, + "grad_norm": 8.75, + "learning_rate": 5.2937777777777775e-05, + "loss": 0.8005, + "step": 13090 + }, + { + "epoch": 9.343794579172611, + "grad_norm": 27.25, + "learning_rate": 5.289333333333334e-05, + "loss": 0.7313, + "step": 13100 + }, + { + "epoch": 9.343794579172611, + "eval/acc": 32.55813980102539, + "step": 13100 + }, + { + "epoch": 9.343794579172611, + "eval_loss": 3.3405187129974365, + "eval_runtime": 0.2462, + "eval_samples_per_second": 174.636, + "eval_steps_per_second": 4.061, + "step": 13100 + }, + { + "epoch": 9.350927246790299, + "grad_norm": 7.5625, + "learning_rate": 5.284888888888889e-05, + "loss": 0.8474, + "step": 13110 + }, + { + "epoch": 9.358059914407988, + "grad_norm": 7.25, + "learning_rate": 5.280444444444445e-05, + "loss": 0.8104, + "step": 13120 + }, + { + "epoch": 9.365192582025678, + "grad_norm": 7.71875, + "learning_rate": 5.2759999999999996e-05, + "loss": 0.8638, + "step": 13130 + }, + { + "epoch": 9.372325249643367, + "grad_norm": 8.25, + "learning_rate": 5.2715555555555565e-05, + "loss": 0.7968, + "step": 13140 + }, + { + "epoch": 9.379457917261055, + "grad_norm": 9.4375, + "learning_rate": 5.2671111111111114e-05, + "loss": 0.692, + "step": 13150 + }, + { + "epoch": 9.386590584878745, + "grad_norm": 6.1875, + "learning_rate": 5.262666666666667e-05, + "loss": 0.8222, + "step": 13160 + }, + { + "epoch": 9.393723252496434, + "grad_norm": 9.9375, + "learning_rate": 5.2582222222222224e-05, + "loss": 0.8494, + "step": 13170 + }, + { + "epoch": 9.400855920114124, + "grad_norm": 8.0, + "learning_rate": 5.2537777777777786e-05, + "loss": 0.8254, + "step": 13180 + }, + { + "epoch": 9.407988587731811, + "grad_norm": 7.375, + "learning_rate": 5.249333333333334e-05, + "loss": 0.8771, + "step": 13190 + }, + { + "epoch": 9.4151212553495, + "grad_norm": 7.34375, + "learning_rate": 5.244888888888889e-05, + "loss": 0.8563, + "step": 13200 + }, + { + "epoch": 9.4151212553495, + "eval/acc": 37.20930099487305, + "step": 13200 + }, + { + "epoch": 9.4151212553495, + "eval_loss": 3.293537139892578, + "eval_runtime": 0.219, + "eval_samples_per_second": 196.361, + "eval_steps_per_second": 4.567, + "step": 13200 + }, + { + "epoch": 9.42225392296719, + "grad_norm": 7.1875, + "learning_rate": 5.2404444444444445e-05, + "loss": 0.769, + "step": 13210 + }, + { + "epoch": 9.429386590584878, + "grad_norm": 8.5, + "learning_rate": 5.236e-05, + "loss": 0.778, + "step": 13220 + }, + { + "epoch": 9.436519258202567, + "grad_norm": 7.6875, + "learning_rate": 5.231555555555556e-05, + "loss": 0.8043, + "step": 13230 + }, + { + "epoch": 9.443651925820257, + "grad_norm": 7.59375, + "learning_rate": 5.227111111111112e-05, + "loss": 0.7962, + "step": 13240 + }, + { + "epoch": 9.450784593437946, + "grad_norm": 9.6875, + "learning_rate": 5.2226666666666666e-05, + "loss": 0.8623, + "step": 13250 + }, + { + "epoch": 9.457917261055634, + "grad_norm": 7.125, + "learning_rate": 5.218222222222222e-05, + "loss": 0.7408, + "step": 13260 + }, + { + "epoch": 9.465049928673324, + "grad_norm": 8.1875, + "learning_rate": 5.213777777777778e-05, + "loss": 0.7233, + "step": 13270 + }, + { + "epoch": 9.472182596291013, + "grad_norm": 9.375, + "learning_rate": 5.209333333333334e-05, + "loss": 0.7349, + "step": 13280 + }, + { + "epoch": 9.479315263908703, + "grad_norm": 6.75, + "learning_rate": 5.2048888888888894e-05, + "loss": 0.7311, + "step": 13290 + }, + { + "epoch": 9.48644793152639, + "grad_norm": 10.25, + "learning_rate": 5.200444444444444e-05, + "loss": 0.828, + "step": 13300 + }, + { + "epoch": 9.48644793152639, + "eval/acc": 34.88372039794922, + "step": 13300 + }, + { + "epoch": 9.48644793152639, + "eval_loss": 3.376410484313965, + "eval_runtime": 0.2205, + "eval_samples_per_second": 194.974, + "eval_steps_per_second": 4.534, + "step": 13300 + }, + { + "epoch": 9.49358059914408, + "grad_norm": 12.1875, + "learning_rate": 5.196e-05, + "loss": 0.6994, + "step": 13310 + }, + { + "epoch": 9.50071326676177, + "grad_norm": 10.375, + "learning_rate": 5.191555555555556e-05, + "loss": 0.7658, + "step": 13320 + }, + { + "epoch": 9.507845934379457, + "grad_norm": 7.625, + "learning_rate": 5.1871111111111114e-05, + "loss": 0.7453, + "step": 13330 + }, + { + "epoch": 9.514978601997147, + "grad_norm": 8.0, + "learning_rate": 5.182666666666667e-05, + "loss": 0.7407, + "step": 13340 + }, + { + "epoch": 9.522111269614836, + "grad_norm": 6.96875, + "learning_rate": 5.178222222222222e-05, + "loss": 0.8234, + "step": 13350 + }, + { + "epoch": 9.529243937232525, + "grad_norm": 6.59375, + "learning_rate": 5.173777777777779e-05, + "loss": 0.7517, + "step": 13360 + }, + { + "epoch": 9.536376604850213, + "grad_norm": 7.15625, + "learning_rate": 5.1693333333333335e-05, + "loss": 0.6939, + "step": 13370 + }, + { + "epoch": 9.543509272467903, + "grad_norm": 9.6875, + "learning_rate": 5.164888888888889e-05, + "loss": 0.7602, + "step": 13380 + }, + { + "epoch": 9.550641940085592, + "grad_norm": 7.375, + "learning_rate": 5.1604444444444446e-05, + "loss": 0.8016, + "step": 13390 + }, + { + "epoch": 9.557774607703282, + "grad_norm": 6.9375, + "learning_rate": 5.1559999999999994e-05, + "loss": 0.8258, + "step": 13400 + }, + { + "epoch": 9.557774607703282, + "eval/acc": 34.88372039794922, + "step": 13400 + }, + { + "epoch": 9.557774607703282, + "eval_loss": 3.3766846656799316, + "eval_runtime": 0.2271, + "eval_samples_per_second": 189.368, + "eval_steps_per_second": 4.404, + "step": 13400 + }, + { + "epoch": 9.56490727532097, + "grad_norm": 6.875, + "learning_rate": 5.151555555555556e-05, + "loss": 0.7926, + "step": 13410 + }, + { + "epoch": 9.572039942938659, + "grad_norm": 6.28125, + "learning_rate": 5.147111111111111e-05, + "loss": 0.6912, + "step": 13420 + }, + { + "epoch": 9.579172610556348, + "grad_norm": 60.5, + "learning_rate": 5.142666666666667e-05, + "loss": 0.8117, + "step": 13430 + }, + { + "epoch": 9.586305278174038, + "grad_norm": 10.5, + "learning_rate": 5.138222222222222e-05, + "loss": 0.7794, + "step": 13440 + }, + { + "epoch": 9.593437945791726, + "grad_norm": 5.6875, + "learning_rate": 5.1337777777777784e-05, + "loss": 0.6753, + "step": 13450 + }, + { + "epoch": 9.600570613409415, + "grad_norm": 8.4375, + "learning_rate": 5.129333333333334e-05, + "loss": 0.8676, + "step": 13460 + }, + { + "epoch": 9.607703281027105, + "grad_norm": 7.34375, + "learning_rate": 5.124888888888889e-05, + "loss": 0.7326, + "step": 13470 + }, + { + "epoch": 9.614835948644792, + "grad_norm": 13.9375, + "learning_rate": 5.120444444444444e-05, + "loss": 0.8177, + "step": 13480 + }, + { + "epoch": 9.621968616262482, + "grad_norm": 8.3125, + "learning_rate": 5.1160000000000005e-05, + "loss": 0.7928, + "step": 13490 + }, + { + "epoch": 9.629101283880171, + "grad_norm": 5.3125, + "learning_rate": 5.111555555555556e-05, + "loss": 0.7693, + "step": 13500 + }, + { + "epoch": 9.629101283880171, + "eval/acc": 37.20930099487305, + "step": 13500 + }, + { + "epoch": 9.629101283880171, + "eval_loss": 3.340432643890381, + "eval_runtime": 0.2212, + "eval_samples_per_second": 194.437, + "eval_steps_per_second": 4.522, + "step": 13500 + }, + { + "epoch": 9.63623395149786, + "grad_norm": 6.4375, + "learning_rate": 5.1071111111111115e-05, + "loss": 0.7974, + "step": 13510 + }, + { + "epoch": 9.643366619115548, + "grad_norm": 11.375, + "learning_rate": 5.1026666666666664e-05, + "loss": 0.8533, + "step": 13520 + }, + { + "epoch": 9.650499286733238, + "grad_norm": 8.4375, + "learning_rate": 5.098222222222222e-05, + "loss": 0.7578, + "step": 13530 + }, + { + "epoch": 9.657631954350927, + "grad_norm": 6.96875, + "learning_rate": 5.093777777777778e-05, + "loss": 0.8348, + "step": 13540 + }, + { + "epoch": 9.664764621968617, + "grad_norm": 6.75, + "learning_rate": 5.0893333333333336e-05, + "loss": 0.7562, + "step": 13550 + }, + { + "epoch": 9.671897289586305, + "grad_norm": 7.4375, + "learning_rate": 5.084888888888889e-05, + "loss": 0.8667, + "step": 13560 + }, + { + "epoch": 9.679029957203994, + "grad_norm": 11.4375, + "learning_rate": 5.080444444444445e-05, + "loss": 0.7158, + "step": 13570 + }, + { + "epoch": 9.686162624821684, + "grad_norm": 6.15625, + "learning_rate": 5.076000000000001e-05, + "loss": 0.7153, + "step": 13580 + }, + { + "epoch": 9.693295292439373, + "grad_norm": 10.25, + "learning_rate": 5.0715555555555564e-05, + "loss": 0.7698, + "step": 13590 + }, + { + "epoch": 9.70042796005706, + "grad_norm": 12.0, + "learning_rate": 5.067111111111111e-05, + "loss": 0.8033, + "step": 13600 + }, + { + "epoch": 9.70042796005706, + "eval/acc": 37.20930099487305, + "step": 13600 + }, + { + "epoch": 9.70042796005706, + "eval_loss": 3.325901985168457, + "eval_runtime": 0.2235, + "eval_samples_per_second": 192.369, + "eval_steps_per_second": 4.474, + "step": 13600 + }, + { + "epoch": 9.70756062767475, + "grad_norm": 11.5, + "learning_rate": 5.062666666666667e-05, + "loss": 0.7757, + "step": 13610 + }, + { + "epoch": 9.71469329529244, + "grad_norm": 7.0625, + "learning_rate": 5.058222222222222e-05, + "loss": 0.7335, + "step": 13620 + }, + { + "epoch": 9.721825962910128, + "grad_norm": 7.09375, + "learning_rate": 5.0537777777777785e-05, + "loss": 0.7219, + "step": 13630 + }, + { + "epoch": 9.728958630527817, + "grad_norm": 28.625, + "learning_rate": 5.049333333333334e-05, + "loss": 0.7445, + "step": 13640 + }, + { + "epoch": 9.736091298145507, + "grad_norm": 6.34375, + "learning_rate": 5.044888888888889e-05, + "loss": 0.7203, + "step": 13650 + }, + { + "epoch": 9.743223965763196, + "grad_norm": 12.25, + "learning_rate": 5.0404444444444444e-05, + "loss": 0.815, + "step": 13660 + }, + { + "epoch": 9.750356633380884, + "grad_norm": 8.125, + "learning_rate": 5.0360000000000006e-05, + "loss": 0.6969, + "step": 13670 + }, + { + "epoch": 9.757489300998573, + "grad_norm": 8.9375, + "learning_rate": 5.031555555555556e-05, + "loss": 0.742, + "step": 13680 + }, + { + "epoch": 9.764621968616263, + "grad_norm": 17.125, + "learning_rate": 5.0271111111111116e-05, + "loss": 0.8526, + "step": 13690 + }, + { + "epoch": 9.771754636233952, + "grad_norm": 9.3125, + "learning_rate": 5.0226666666666665e-05, + "loss": 0.795, + "step": 13700 + }, + { + "epoch": 9.771754636233952, + "eval/acc": 37.20930099487305, + "step": 13700 + }, + { + "epoch": 9.771754636233952, + "eval_loss": 3.3737363815307617, + "eval_runtime": 0.2349, + "eval_samples_per_second": 183.026, + "eval_steps_per_second": 4.256, + "step": 13700 + }, + { + "epoch": 9.77888730385164, + "grad_norm": 7.28125, + "learning_rate": 5.018222222222222e-05, + "loss": 0.7804, + "step": 13710 + }, + { + "epoch": 9.78601997146933, + "grad_norm": 8.25, + "learning_rate": 5.013777777777778e-05, + "loss": 0.8201, + "step": 13720 + }, + { + "epoch": 9.793152639087019, + "grad_norm": 7.125, + "learning_rate": 5.009333333333334e-05, + "loss": 0.7495, + "step": 13730 + }, + { + "epoch": 9.800285306704708, + "grad_norm": 7.96875, + "learning_rate": 5.004888888888889e-05, + "loss": 0.7827, + "step": 13740 + }, + { + "epoch": 9.807417974322396, + "grad_norm": 6.5625, + "learning_rate": 5.000444444444444e-05, + "loss": 0.8317, + "step": 13750 + }, + { + "epoch": 9.814550641940086, + "grad_norm": 7.8125, + "learning_rate": 4.996e-05, + "loss": 0.8547, + "step": 13760 + }, + { + "epoch": 9.821683309557775, + "grad_norm": 7.15625, + "learning_rate": 4.991555555555556e-05, + "loss": 0.8679, + "step": 13770 + }, + { + "epoch": 9.828815977175463, + "grad_norm": 7.8125, + "learning_rate": 4.987111111111111e-05, + "loss": 0.7479, + "step": 13780 + }, + { + "epoch": 9.835948644793152, + "grad_norm": 15.5, + "learning_rate": 4.982666666666667e-05, + "loss": 0.8501, + "step": 13790 + }, + { + "epoch": 9.843081312410842, + "grad_norm": 8.1875, + "learning_rate": 4.9782222222222224e-05, + "loss": 0.7662, + "step": 13800 + }, + { + "epoch": 9.843081312410842, + "eval/acc": 37.20930099487305, + "step": 13800 + }, + { + "epoch": 9.843081312410842, + "eval_loss": 3.3852930068969727, + "eval_runtime": 0.2208, + "eval_samples_per_second": 194.716, + "eval_steps_per_second": 4.528, + "step": 13800 + }, + { + "epoch": 9.850213980028531, + "grad_norm": 8.3125, + "learning_rate": 4.973777777777778e-05, + "loss": 0.8303, + "step": 13810 + }, + { + "epoch": 9.857346647646219, + "grad_norm": 8.8125, + "learning_rate": 4.9693333333333334e-05, + "loss": 0.7875, + "step": 13820 + }, + { + "epoch": 9.864479315263909, + "grad_norm": 7.625, + "learning_rate": 4.964888888888889e-05, + "loss": 0.7952, + "step": 13830 + }, + { + "epoch": 9.871611982881598, + "grad_norm": 6.96875, + "learning_rate": 4.9604444444444445e-05, + "loss": 0.8041, + "step": 13840 + }, + { + "epoch": 9.878744650499288, + "grad_norm": 6.375, + "learning_rate": 4.956e-05, + "loss": 0.6869, + "step": 13850 + }, + { + "epoch": 9.885877318116975, + "grad_norm": 7.125, + "learning_rate": 4.951555555555556e-05, + "loss": 0.7707, + "step": 13860 + }, + { + "epoch": 9.893009985734665, + "grad_norm": 8.125, + "learning_rate": 4.947111111111111e-05, + "loss": 0.7512, + "step": 13870 + }, + { + "epoch": 9.900142653352354, + "grad_norm": 8.8125, + "learning_rate": 4.942666666666667e-05, + "loss": 0.8059, + "step": 13880 + }, + { + "epoch": 9.907275320970044, + "grad_norm": 7.90625, + "learning_rate": 4.938222222222223e-05, + "loss": 0.729, + "step": 13890 + }, + { + "epoch": 9.914407988587731, + "grad_norm": 6.625, + "learning_rate": 4.933777777777778e-05, + "loss": 0.7958, + "step": 13900 + }, + { + "epoch": 9.914407988587731, + "eval/acc": 37.20930099487305, + "step": 13900 + }, + { + "epoch": 9.914407988587731, + "eval_loss": 3.400364875793457, + "eval_runtime": 0.2295, + "eval_samples_per_second": 187.329, + "eval_steps_per_second": 4.356, + "step": 13900 + }, + { + "epoch": 9.921540656205421, + "grad_norm": 7.0625, + "learning_rate": 4.929333333333334e-05, + "loss": 0.7314, + "step": 13910 + }, + { + "epoch": 9.92867332382311, + "grad_norm": 6.78125, + "learning_rate": 4.9248888888888886e-05, + "loss": 0.7581, + "step": 13920 + }, + { + "epoch": 9.935805991440798, + "grad_norm": 8.6875, + "learning_rate": 4.920444444444445e-05, + "loss": 0.7865, + "step": 13930 + }, + { + "epoch": 9.942938659058488, + "grad_norm": 7.78125, + "learning_rate": 4.9160000000000004e-05, + "loss": 0.7174, + "step": 13940 + }, + { + "epoch": 9.950071326676177, + "grad_norm": 7.6875, + "learning_rate": 4.911555555555556e-05, + "loss": 0.855, + "step": 13950 + }, + { + "epoch": 9.957203994293867, + "grad_norm": 7.46875, + "learning_rate": 4.9071111111111114e-05, + "loss": 0.7511, + "step": 13960 + }, + { + "epoch": 9.964336661911554, + "grad_norm": 6.34375, + "learning_rate": 4.902666666666667e-05, + "loss": 0.6901, + "step": 13970 + }, + { + "epoch": 9.971469329529244, + "grad_norm": 19.125, + "learning_rate": 4.8982222222222225e-05, + "loss": 0.7621, + "step": 13980 + }, + { + "epoch": 9.978601997146933, + "grad_norm": 9.375, + "learning_rate": 4.893777777777778e-05, + "loss": 0.7466, + "step": 13990 + }, + { + "epoch": 9.985734664764623, + "grad_norm": 7.1875, + "learning_rate": 4.8893333333333335e-05, + "loss": 0.749, + "step": 14000 + }, + { + "epoch": 9.985734664764623, + "eval/acc": 37.20930099487305, + "step": 14000 + }, + { + "epoch": 9.985734664764623, + "eval_loss": 3.3502047061920166, + "eval_runtime": 0.2251, + "eval_samples_per_second": 191.024, + "eval_steps_per_second": 4.442, + "step": 14000 + }, + { + "epoch": 9.99286733238231, + "grad_norm": 9.5625, + "learning_rate": 4.884888888888889e-05, + "loss": 0.6932, + "step": 14010 + }, + { + "epoch": 10.0, + "grad_norm": 5.875, + "learning_rate": 4.8804444444444445e-05, + "loss": 0.7939, + "step": 14020 + }, + { + "epoch": 10.00713266761769, + "grad_norm": 7.15625, + "learning_rate": 4.876e-05, + "loss": 0.8124, + "step": 14030 + }, + { + "epoch": 10.014265335235377, + "grad_norm": 6.0625, + "learning_rate": 4.8715555555555556e-05, + "loss": 0.6855, + "step": 14040 + }, + { + "epoch": 10.021398002853067, + "grad_norm": 50.75, + "learning_rate": 4.867111111111111e-05, + "loss": 0.8354, + "step": 14050 + }, + { + "epoch": 10.028530670470756, + "grad_norm": 7.46875, + "learning_rate": 4.862666666666667e-05, + "loss": 0.8605, + "step": 14060 + }, + { + "epoch": 10.035663338088446, + "grad_norm": 9.625, + "learning_rate": 4.858222222222222e-05, + "loss": 0.8626, + "step": 14070 + }, + { + "epoch": 10.042796005706133, + "grad_norm": 6.125, + "learning_rate": 4.8537777777777784e-05, + "loss": 0.7302, + "step": 14080 + }, + { + "epoch": 10.049928673323823, + "grad_norm": 8.0625, + "learning_rate": 4.849333333333333e-05, + "loss": 0.9058, + "step": 14090 + }, + { + "epoch": 10.057061340941512, + "grad_norm": 7.3125, + "learning_rate": 4.8448888888888894e-05, + "loss": 0.7981, + "step": 14100 + }, + { + "epoch": 10.057061340941512, + "eval/acc": 46.511627197265625, + "step": 14100 + }, + { + "epoch": 10.057061340941512, + "eval_loss": 2.7371480464935303, + "eval_runtime": 1.1832, + "eval_samples_per_second": 36.344, + "eval_steps_per_second": 0.845, + "step": 14100 + }, + { + "epoch": 10.064194008559202, + "grad_norm": 12.625, + "learning_rate": 4.840444444444445e-05, + "loss": 0.7461, + "step": 14110 + }, + { + "epoch": 10.07132667617689, + "grad_norm": 7.375, + "learning_rate": 4.836e-05, + "loss": 0.8649, + "step": 14120 + }, + { + "epoch": 10.078459343794579, + "grad_norm": 10.6875, + "learning_rate": 4.831555555555556e-05, + "loss": 0.8143, + "step": 14130 + }, + { + "epoch": 10.085592011412269, + "grad_norm": 43.5, + "learning_rate": 4.827111111111111e-05, + "loss": 0.8249, + "step": 14140 + }, + { + "epoch": 10.092724679029958, + "grad_norm": 6.6875, + "learning_rate": 4.822666666666667e-05, + "loss": 0.6324, + "step": 14150 + }, + { + "epoch": 10.099857346647646, + "grad_norm": 9.6875, + "learning_rate": 4.8182222222222225e-05, + "loss": 0.7795, + "step": 14160 + }, + { + "epoch": 10.106990014265335, + "grad_norm": 7.8125, + "learning_rate": 4.813777777777778e-05, + "loss": 0.8453, + "step": 14170 + }, + { + "epoch": 10.114122681883025, + "grad_norm": 6.21875, + "learning_rate": 4.8093333333333336e-05, + "loss": 0.735, + "step": 14180 + }, + { + "epoch": 10.121255349500712, + "grad_norm": 8.1875, + "learning_rate": 4.804888888888889e-05, + "loss": 0.6646, + "step": 14190 + }, + { + "epoch": 10.128388017118402, + "grad_norm": 38.0, + "learning_rate": 4.8004444444444446e-05, + "loss": 0.7963, + "step": 14200 + }, + { + "epoch": 10.128388017118402, + "eval/acc": 44.1860466003418, + "step": 14200 + }, + { + "epoch": 10.128388017118402, + "eval_loss": 2.7271535396575928, + "eval_runtime": 1.4583, + "eval_samples_per_second": 29.486, + "eval_steps_per_second": 0.686, + "step": 14200 + }, + { + "epoch": 10.135520684736091, + "grad_norm": 5.78125, + "learning_rate": 4.796e-05, + "loss": 0.693, + "step": 14210 + }, + { + "epoch": 10.142653352353781, + "grad_norm": 11.875, + "learning_rate": 4.791555555555556e-05, + "loss": 0.7578, + "step": 14220 + }, + { + "epoch": 10.149786019971469, + "grad_norm": 9.25, + "learning_rate": 4.787111111111111e-05, + "loss": 0.8127, + "step": 14230 + }, + { + "epoch": 10.156918687589158, + "grad_norm": 9.1875, + "learning_rate": 4.782666666666667e-05, + "loss": 0.6935, + "step": 14240 + }, + { + "epoch": 10.164051355206848, + "grad_norm": 8.25, + "learning_rate": 4.778222222222222e-05, + "loss": 0.7233, + "step": 14250 + }, + { + "epoch": 10.171184022824537, + "grad_norm": 6.71875, + "learning_rate": 4.7737777777777785e-05, + "loss": 0.8749, + "step": 14260 + }, + { + "epoch": 10.178316690442225, + "grad_norm": 7.84375, + "learning_rate": 4.769333333333333e-05, + "loss": 0.7786, + "step": 14270 + }, + { + "epoch": 10.185449358059914, + "grad_norm": 9.1875, + "learning_rate": 4.7648888888888895e-05, + "loss": 0.7024, + "step": 14280 + }, + { + "epoch": 10.192582025677604, + "grad_norm": 7.78125, + "learning_rate": 4.7604444444444443e-05, + "loss": 0.8525, + "step": 14290 + }, + { + "epoch": 10.199714693295292, + "grad_norm": 6.90625, + "learning_rate": 4.7560000000000005e-05, + "loss": 0.8181, + "step": 14300 + }, + { + "epoch": 10.199714693295292, + "eval/acc": 46.511627197265625, + "step": 14300 + }, + { + "epoch": 10.199714693295292, + "eval_loss": 2.766357898712158, + "eval_runtime": 2.3027, + "eval_samples_per_second": 18.674, + "eval_steps_per_second": 0.434, + "step": 14300 + }, + { + "epoch": 10.206847360912981, + "grad_norm": 5.90625, + "learning_rate": 4.751555555555556e-05, + "loss": 0.8026, + "step": 14310 + }, + { + "epoch": 10.21398002853067, + "grad_norm": 7.3125, + "learning_rate": 4.747111111111111e-05, + "loss": 0.8758, + "step": 14320 + }, + { + "epoch": 10.22111269614836, + "grad_norm": 9.4375, + "learning_rate": 4.742666666666667e-05, + "loss": 0.7889, + "step": 14330 + }, + { + "epoch": 10.228245363766048, + "grad_norm": 8.0, + "learning_rate": 4.738222222222222e-05, + "loss": 0.7343, + "step": 14340 + }, + { + "epoch": 10.235378031383737, + "grad_norm": 6.59375, + "learning_rate": 4.733777777777778e-05, + "loss": 0.788, + "step": 14350 + }, + { + "epoch": 10.242510699001427, + "grad_norm": 9.1875, + "learning_rate": 4.729333333333334e-05, + "loss": 0.8068, + "step": 14360 + }, + { + "epoch": 10.249643366619116, + "grad_norm": 7.53125, + "learning_rate": 4.724888888888889e-05, + "loss": 0.8188, + "step": 14370 + }, + { + "epoch": 10.256776034236804, + "grad_norm": 7.1875, + "learning_rate": 4.720444444444445e-05, + "loss": 0.7643, + "step": 14380 + }, + { + "epoch": 10.263908701854493, + "grad_norm": 9.125, + "learning_rate": 4.716e-05, + "loss": 0.7052, + "step": 14390 + }, + { + "epoch": 10.271041369472183, + "grad_norm": 10.5625, + "learning_rate": 4.711555555555556e-05, + "loss": 0.762, + "step": 14400 + }, + { + "epoch": 10.271041369472183, + "eval/acc": 46.511627197265625, + "step": 14400 + }, + { + "epoch": 10.271041369472183, + "eval_loss": 2.774780750274658, + "eval_runtime": 1.2232, + "eval_samples_per_second": 35.152, + "eval_steps_per_second": 0.817, + "step": 14400 + }, + { + "epoch": 10.278174037089872, + "grad_norm": 7.5625, + "learning_rate": 4.707111111111111e-05, + "loss": 0.8322, + "step": 14410 + }, + { + "epoch": 10.28530670470756, + "grad_norm": 35.25, + "learning_rate": 4.702666666666667e-05, + "loss": 0.8043, + "step": 14420 + }, + { + "epoch": 10.29243937232525, + "grad_norm": 7.09375, + "learning_rate": 4.6982222222222223e-05, + "loss": 0.7257, + "step": 14430 + }, + { + "epoch": 10.29957203994294, + "grad_norm": 15.375, + "learning_rate": 4.693777777777778e-05, + "loss": 0.7922, + "step": 14440 + }, + { + "epoch": 10.306704707560627, + "grad_norm": 7.09375, + "learning_rate": 4.6893333333333334e-05, + "loss": 0.694, + "step": 14450 + }, + { + "epoch": 10.313837375178316, + "grad_norm": 7.0625, + "learning_rate": 4.684888888888889e-05, + "loss": 0.7734, + "step": 14460 + }, + { + "epoch": 10.320970042796006, + "grad_norm": 6.75, + "learning_rate": 4.6804444444444444e-05, + "loss": 0.7469, + "step": 14470 + }, + { + "epoch": 10.328102710413695, + "grad_norm": 5.9375, + "learning_rate": 4.6760000000000006e-05, + "loss": 0.6948, + "step": 14480 + }, + { + "epoch": 10.335235378031383, + "grad_norm": 7.15625, + "learning_rate": 4.6715555555555555e-05, + "loss": 0.7593, + "step": 14490 + }, + { + "epoch": 10.342368045649073, + "grad_norm": 26.875, + "learning_rate": 4.667111111111112e-05, + "loss": 0.7302, + "step": 14500 + }, + { + "epoch": 10.342368045649073, + "eval/acc": 44.1860466003418, + "step": 14500 + }, + { + "epoch": 10.342368045649073, + "eval_loss": 2.7937443256378174, + "eval_runtime": 0.2689, + "eval_samples_per_second": 159.899, + "eval_steps_per_second": 3.719, + "step": 14500 + }, + { + "epoch": 10.349500713266762, + "grad_norm": 53.75, + "learning_rate": 4.6626666666666665e-05, + "loss": 0.8025, + "step": 14510 + }, + { + "epoch": 10.356633380884452, + "grad_norm": 10.4375, + "learning_rate": 4.658222222222223e-05, + "loss": 0.6807, + "step": 14520 + }, + { + "epoch": 10.36376604850214, + "grad_norm": 17.5, + "learning_rate": 4.653777777777778e-05, + "loss": 0.7773, + "step": 14530 + }, + { + "epoch": 10.370898716119829, + "grad_norm": 9.0625, + "learning_rate": 4.649333333333333e-05, + "loss": 0.7322, + "step": 14540 + }, + { + "epoch": 10.378031383737518, + "grad_norm": 7.5, + "learning_rate": 4.644888888888889e-05, + "loss": 0.801, + "step": 14550 + }, + { + "epoch": 10.385164051355208, + "grad_norm": 7.03125, + "learning_rate": 4.640444444444445e-05, + "loss": 0.7887, + "step": 14560 + }, + { + "epoch": 10.392296718972895, + "grad_norm": 5.78125, + "learning_rate": 4.636e-05, + "loss": 0.75, + "step": 14570 + }, + { + "epoch": 10.399429386590585, + "grad_norm": 11.8125, + "learning_rate": 4.631555555555556e-05, + "loss": 0.7594, + "step": 14580 + }, + { + "epoch": 10.406562054208274, + "grad_norm": 26.375, + "learning_rate": 4.6271111111111114e-05, + "loss": 0.7863, + "step": 14590 + }, + { + "epoch": 10.413694721825962, + "grad_norm": 11.875, + "learning_rate": 4.622666666666667e-05, + "loss": 0.7701, + "step": 14600 + }, + { + "epoch": 10.413694721825962, + "eval/acc": 44.1860466003418, + "step": 14600 + }, + { + "epoch": 10.413694721825962, + "eval_loss": 2.7834675312042236, + "eval_runtime": 0.4675, + "eval_samples_per_second": 91.973, + "eval_steps_per_second": 2.139, + "step": 14600 + }, + { + "epoch": 10.420827389443652, + "grad_norm": 12.5625, + "learning_rate": 4.6182222222222224e-05, + "loss": 0.7568, + "step": 14610 + }, + { + "epoch": 10.427960057061341, + "grad_norm": 7.40625, + "learning_rate": 4.613777777777778e-05, + "loss": 0.7547, + "step": 14620 + }, + { + "epoch": 10.43509272467903, + "grad_norm": 8.875, + "learning_rate": 4.6093333333333335e-05, + "loss": 0.7594, + "step": 14630 + }, + { + "epoch": 10.442225392296718, + "grad_norm": 25.0, + "learning_rate": 4.604888888888889e-05, + "loss": 0.8313, + "step": 14640 + }, + { + "epoch": 10.449358059914408, + "grad_norm": 7.9375, + "learning_rate": 4.6004444444444445e-05, + "loss": 0.8017, + "step": 14650 + }, + { + "epoch": 10.456490727532097, + "grad_norm": 7.59375, + "learning_rate": 4.596e-05, + "loss": 0.7648, + "step": 14660 + }, + { + "epoch": 10.463623395149787, + "grad_norm": 8.5625, + "learning_rate": 4.5915555555555556e-05, + "loss": 0.6931, + "step": 14670 + }, + { + "epoch": 10.470756062767475, + "grad_norm": 9.8125, + "learning_rate": 4.587111111111112e-05, + "loss": 0.7128, + "step": 14680 + }, + { + "epoch": 10.477888730385164, + "grad_norm": 8.0, + "learning_rate": 4.5826666666666666e-05, + "loss": 0.8199, + "step": 14690 + }, + { + "epoch": 10.485021398002853, + "grad_norm": 7.53125, + "learning_rate": 4.578222222222223e-05, + "loss": 0.8027, + "step": 14700 + }, + { + "epoch": 10.485021398002853, + "eval/acc": 48.83720779418945, + "step": 14700 + }, + { + "epoch": 10.485021398002853, + "eval_loss": 2.773456573486328, + "eval_runtime": 0.2477, + "eval_samples_per_second": 173.569, + "eval_steps_per_second": 4.036, + "step": 14700 + }, + { + "epoch": 10.492154065620543, + "grad_norm": 13.375, + "learning_rate": 4.5737777777777777e-05, + "loss": 0.8284, + "step": 14710 + }, + { + "epoch": 10.49928673323823, + "grad_norm": 7.875, + "learning_rate": 4.569333333333334e-05, + "loss": 0.7522, + "step": 14720 + }, + { + "epoch": 10.50641940085592, + "grad_norm": 6.375, + "learning_rate": 4.5648888888888894e-05, + "loss": 0.672, + "step": 14730 + }, + { + "epoch": 10.51355206847361, + "grad_norm": 6.40625, + "learning_rate": 4.560444444444444e-05, + "loss": 0.8234, + "step": 14740 + }, + { + "epoch": 10.520684736091297, + "grad_norm": 9.1875, + "learning_rate": 4.5560000000000004e-05, + "loss": 0.7505, + "step": 14750 + }, + { + "epoch": 10.527817403708987, + "grad_norm": 7.25, + "learning_rate": 4.551555555555555e-05, + "loss": 0.7694, + "step": 14760 + }, + { + "epoch": 10.534950071326676, + "grad_norm": 6.3125, + "learning_rate": 4.5471111111111115e-05, + "loss": 0.7743, + "step": 14770 + }, + { + "epoch": 10.542082738944366, + "grad_norm": 10.0, + "learning_rate": 4.542666666666667e-05, + "loss": 0.8179, + "step": 14780 + }, + { + "epoch": 10.549215406562054, + "grad_norm": 9.875, + "learning_rate": 4.5382222222222225e-05, + "loss": 0.9151, + "step": 14790 + }, + { + "epoch": 10.556348074179743, + "grad_norm": 8.6875, + "learning_rate": 4.533777777777778e-05, + "loss": 0.8133, + "step": 14800 + }, + { + "epoch": 10.556348074179743, + "eval/acc": 46.511627197265625, + "step": 14800 + }, + { + "epoch": 10.556348074179743, + "eval_loss": 2.8140347003936768, + "eval_runtime": 0.2939, + "eval_samples_per_second": 146.299, + "eval_steps_per_second": 3.402, + "step": 14800 + }, + { + "epoch": 10.563480741797433, + "grad_norm": 7.0, + "learning_rate": 4.5293333333333336e-05, + "loss": 0.7129, + "step": 14810 + }, + { + "epoch": 10.570613409415122, + "grad_norm": 8.5, + "learning_rate": 4.524888888888889e-05, + "loss": 0.7667, + "step": 14820 + }, + { + "epoch": 10.57774607703281, + "grad_norm": 7.4375, + "learning_rate": 4.5204444444444446e-05, + "loss": 0.7692, + "step": 14830 + }, + { + "epoch": 10.5848787446505, + "grad_norm": 8.0625, + "learning_rate": 4.516e-05, + "loss": 0.7613, + "step": 14840 + }, + { + "epoch": 10.592011412268189, + "grad_norm": 7.96875, + "learning_rate": 4.5115555555555557e-05, + "loss": 0.6925, + "step": 14850 + }, + { + "epoch": 10.599144079885878, + "grad_norm": 14.375, + "learning_rate": 4.507111111111111e-05, + "loss": 0.84, + "step": 14860 + }, + { + "epoch": 10.606276747503566, + "grad_norm": 11.4375, + "learning_rate": 4.502666666666667e-05, + "loss": 0.8508, + "step": 14870 + }, + { + "epoch": 10.613409415121255, + "grad_norm": 8.375, + "learning_rate": 4.498222222222222e-05, + "loss": 0.7863, + "step": 14880 + }, + { + "epoch": 10.620542082738945, + "grad_norm": 7.625, + "learning_rate": 4.493777777777778e-05, + "loss": 0.7177, + "step": 14890 + }, + { + "epoch": 10.627674750356633, + "grad_norm": 10.375, + "learning_rate": 4.489333333333334e-05, + "loss": 0.7795, + "step": 14900 + }, + { + "epoch": 10.627674750356633, + "eval/acc": 44.1860466003418, + "step": 14900 + }, + { + "epoch": 10.627674750356633, + "eval_loss": 2.830230951309204, + "eval_runtime": 0.2428, + "eval_samples_per_second": 177.067, + "eval_steps_per_second": 4.118, + "step": 14900 + }, + { + "epoch": 10.634807417974322, + "grad_norm": 10.875, + "learning_rate": 4.484888888888889e-05, + "loss": 0.7878, + "step": 14910 + }, + { + "epoch": 10.641940085592012, + "grad_norm": 9.0, + "learning_rate": 4.480444444444445e-05, + "loss": 0.8517, + "step": 14920 + }, + { + "epoch": 10.649072753209701, + "grad_norm": 6.9375, + "learning_rate": 4.4760000000000005e-05, + "loss": 0.8469, + "step": 14930 + }, + { + "epoch": 10.656205420827389, + "grad_norm": 7.28125, + "learning_rate": 4.4715555555555554e-05, + "loss": 0.7262, + "step": 14940 + }, + { + "epoch": 10.663338088445078, + "grad_norm": 6.15625, + "learning_rate": 4.4671111111111116e-05, + "loss": 0.739, + "step": 14950 + }, + { + "epoch": 10.670470756062768, + "grad_norm": 7.84375, + "learning_rate": 4.4626666666666664e-05, + "loss": 0.7671, + "step": 14960 + }, + { + "epoch": 10.677603423680456, + "grad_norm": 7.53125, + "learning_rate": 4.4582222222222226e-05, + "loss": 0.8059, + "step": 14970 + }, + { + "epoch": 10.684736091298145, + "grad_norm": 8.0, + "learning_rate": 4.453777777777778e-05, + "loss": 0.8167, + "step": 14980 + }, + { + "epoch": 10.691868758915835, + "grad_norm": 7.4375, + "learning_rate": 4.4493333333333337e-05, + "loss": 0.7768, + "step": 14990 + }, + { + "epoch": 10.699001426533524, + "grad_norm": 9.0625, + "learning_rate": 4.444888888888889e-05, + "loss": 0.7805, + "step": 15000 + }, + { + "epoch": 10.699001426533524, + "eval/acc": 46.511627197265625, + "step": 15000 + }, + { + "epoch": 10.699001426533524, + "eval_loss": 2.8128726482391357, + "eval_runtime": 0.239, + "eval_samples_per_second": 179.883, + "eval_steps_per_second": 4.183, + "step": 15000 + }, + { + "epoch": 10.706134094151212, + "grad_norm": 6.59375, + "learning_rate": 4.440444444444445e-05, + "loss": 0.7292, + "step": 15010 + }, + { + "epoch": 10.713266761768901, + "grad_norm": 7.625, + "learning_rate": 4.436e-05, + "loss": 0.7543, + "step": 15020 + }, + { + "epoch": 10.72039942938659, + "grad_norm": 7.6875, + "learning_rate": 4.431555555555556e-05, + "loss": 0.812, + "step": 15030 + }, + { + "epoch": 10.72753209700428, + "grad_norm": 8.375, + "learning_rate": 4.427111111111111e-05, + "loss": 0.8101, + "step": 15040 + }, + { + "epoch": 10.734664764621968, + "grad_norm": 9.3125, + "learning_rate": 4.422666666666667e-05, + "loss": 0.7984, + "step": 15050 + }, + { + "epoch": 10.741797432239657, + "grad_norm": 8.0, + "learning_rate": 4.418222222222222e-05, + "loss": 0.7325, + "step": 15060 + }, + { + "epoch": 10.748930099857347, + "grad_norm": 7.125, + "learning_rate": 4.413777777777778e-05, + "loss": 0.823, + "step": 15070 + }, + { + "epoch": 10.756062767475036, + "grad_norm": 7.40625, + "learning_rate": 4.4093333333333334e-05, + "loss": 0.8095, + "step": 15080 + }, + { + "epoch": 10.763195435092724, + "grad_norm": 7.875, + "learning_rate": 4.404888888888889e-05, + "loss": 0.7326, + "step": 15090 + }, + { + "epoch": 10.770328102710414, + "grad_norm": 31.0, + "learning_rate": 4.400444444444445e-05, + "loss": 0.8735, + "step": 15100 + }, + { + "epoch": 10.770328102710414, + "eval/acc": 46.511627197265625, + "step": 15100 + }, + { + "epoch": 10.770328102710414, + "eval_loss": 2.815082311630249, + "eval_runtime": 0.2824, + "eval_samples_per_second": 152.248, + "eval_steps_per_second": 3.541, + "step": 15100 + }, + { + "epoch": 10.777460770328103, + "grad_norm": 9.625, + "learning_rate": 4.396e-05, + "loss": 0.8593, + "step": 15110 + }, + { + "epoch": 10.78459343794579, + "grad_norm": 8.0, + "learning_rate": 4.391555555555556e-05, + "loss": 0.8309, + "step": 15120 + }, + { + "epoch": 10.79172610556348, + "grad_norm": 7.53125, + "learning_rate": 4.387111111111111e-05, + "loss": 0.6465, + "step": 15130 + }, + { + "epoch": 10.79885877318117, + "grad_norm": 9.25, + "learning_rate": 4.382666666666667e-05, + "loss": 0.8751, + "step": 15140 + }, + { + "epoch": 10.80599144079886, + "grad_norm": 7.6875, + "learning_rate": 4.378222222222223e-05, + "loss": 0.7533, + "step": 15150 + }, + { + "epoch": 10.813124108416547, + "grad_norm": 8.5625, + "learning_rate": 4.3737777777777775e-05, + "loss": 0.7803, + "step": 15160 + }, + { + "epoch": 10.820256776034237, + "grad_norm": 6.09375, + "learning_rate": 4.369333333333334e-05, + "loss": 0.6925, + "step": 15170 + }, + { + "epoch": 10.827389443651926, + "grad_norm": 8.1875, + "learning_rate": 4.3648888888888886e-05, + "loss": 0.8491, + "step": 15180 + }, + { + "epoch": 10.834522111269616, + "grad_norm": 13.125, + "learning_rate": 4.360444444444445e-05, + "loss": 0.7565, + "step": 15190 + }, + { + "epoch": 10.841654778887303, + "grad_norm": 9.25, + "learning_rate": 4.356e-05, + "loss": 0.9506, + "step": 15200 + }, + { + "epoch": 10.841654778887303, + "eval/acc": 41.86046600341797, + "step": 15200 + }, + { + "epoch": 10.841654778887303, + "eval_loss": 2.834817409515381, + "eval_runtime": 0.2334, + "eval_samples_per_second": 184.267, + "eval_steps_per_second": 4.285, + "step": 15200 + }, + { + "epoch": 10.848787446504993, + "grad_norm": 6.71875, + "learning_rate": 4.351555555555556e-05, + "loss": 0.8631, + "step": 15210 + }, + { + "epoch": 10.855920114122682, + "grad_norm": 7.25, + "learning_rate": 4.3471111111111114e-05, + "loss": 0.8258, + "step": 15220 + }, + { + "epoch": 10.863052781740372, + "grad_norm": 5.59375, + "learning_rate": 4.342666666666667e-05, + "loss": 0.7895, + "step": 15230 + }, + { + "epoch": 10.87018544935806, + "grad_norm": 6.65625, + "learning_rate": 4.3382222222222224e-05, + "loss": 0.8473, + "step": 15240 + }, + { + "epoch": 10.877318116975749, + "grad_norm": 6.59375, + "learning_rate": 4.333777777777778e-05, + "loss": 0.8323, + "step": 15250 + }, + { + "epoch": 10.884450784593438, + "grad_norm": 9.3125, + "learning_rate": 4.3293333333333334e-05, + "loss": 0.7446, + "step": 15260 + }, + { + "epoch": 10.891583452211126, + "grad_norm": 15.125, + "learning_rate": 4.324888888888889e-05, + "loss": 0.8885, + "step": 15270 + }, + { + "epoch": 10.898716119828816, + "grad_norm": 21.25, + "learning_rate": 4.3204444444444445e-05, + "loss": 0.7624, + "step": 15280 + }, + { + "epoch": 10.905848787446505, + "grad_norm": 7.125, + "learning_rate": 4.316e-05, + "loss": 0.6841, + "step": 15290 + }, + { + "epoch": 10.912981455064195, + "grad_norm": 8.5625, + "learning_rate": 4.311555555555556e-05, + "loss": 0.8645, + "step": 15300 + }, + { + "epoch": 10.912981455064195, + "eval/acc": 46.511627197265625, + "step": 15300 + }, + { + "epoch": 10.912981455064195, + "eval_loss": 2.790827512741089, + "eval_runtime": 0.2527, + "eval_samples_per_second": 170.168, + "eval_steps_per_second": 3.957, + "step": 15300 + }, + { + "epoch": 10.920114122681882, + "grad_norm": 9.0625, + "learning_rate": 4.307111111111111e-05, + "loss": 0.779, + "step": 15310 + }, + { + "epoch": 10.927246790299572, + "grad_norm": 5.96875, + "learning_rate": 4.302666666666667e-05, + "loss": 0.7987, + "step": 15320 + }, + { + "epoch": 10.934379457917261, + "grad_norm": 7.25, + "learning_rate": 4.298222222222222e-05, + "loss": 0.8278, + "step": 15330 + }, + { + "epoch": 10.94151212553495, + "grad_norm": 7.625, + "learning_rate": 4.293777777777778e-05, + "loss": 0.8022, + "step": 15340 + }, + { + "epoch": 10.948644793152638, + "grad_norm": 6.6875, + "learning_rate": 4.289333333333334e-05, + "loss": 0.7333, + "step": 15350 + }, + { + "epoch": 10.955777460770328, + "grad_norm": 7.4375, + "learning_rate": 4.284888888888889e-05, + "loss": 0.7398, + "step": 15360 + }, + { + "epoch": 10.962910128388017, + "grad_norm": 10.1875, + "learning_rate": 4.280444444444445e-05, + "loss": 0.7397, + "step": 15370 + }, + { + "epoch": 10.970042796005707, + "grad_norm": 7.375, + "learning_rate": 4.276e-05, + "loss": 0.7993, + "step": 15380 + }, + { + "epoch": 10.977175463623395, + "grad_norm": 7.59375, + "learning_rate": 4.271555555555556e-05, + "loss": 0.7811, + "step": 15390 + }, + { + "epoch": 10.984308131241084, + "grad_norm": 7.46875, + "learning_rate": 4.2671111111111114e-05, + "loss": 0.7611, + "step": 15400 + }, + { + "epoch": 10.984308131241084, + "eval/acc": 46.511627197265625, + "step": 15400 + }, + { + "epoch": 10.984308131241084, + "eval_loss": 2.8109776973724365, + "eval_runtime": 0.2365, + "eval_samples_per_second": 181.849, + "eval_steps_per_second": 4.229, + "step": 15400 + }, + { + "epoch": 10.991440798858774, + "grad_norm": 7.46875, + "learning_rate": 4.262666666666667e-05, + "loss": 0.8165, + "step": 15410 + }, + { + "epoch": 10.998573466476461, + "grad_norm": 8.625, + "learning_rate": 4.2582222222222225e-05, + "loss": 0.8449, + "step": 15420 + }, + { + "epoch": 11.00570613409415, + "grad_norm": 7.28125, + "learning_rate": 4.253777777777778e-05, + "loss": 0.7556, + "step": 15430 + }, + { + "epoch": 11.01283880171184, + "grad_norm": 30.375, + "learning_rate": 4.2493333333333335e-05, + "loss": 0.8541, + "step": 15440 + }, + { + "epoch": 11.01997146932953, + "grad_norm": 8.375, + "learning_rate": 4.244888888888889e-05, + "loss": 0.8616, + "step": 15450 + }, + { + "epoch": 11.027104136947218, + "grad_norm": 8.5, + "learning_rate": 4.2404444444444446e-05, + "loss": 0.7837, + "step": 15460 + }, + { + "epoch": 11.034236804564907, + "grad_norm": 9.125, + "learning_rate": 4.236e-05, + "loss": 0.6939, + "step": 15470 + }, + { + "epoch": 11.041369472182597, + "grad_norm": 8.1875, + "learning_rate": 4.2315555555555556e-05, + "loss": 0.6788, + "step": 15480 + }, + { + "epoch": 11.048502139800286, + "grad_norm": 8.1875, + "learning_rate": 4.227111111111111e-05, + "loss": 0.7654, + "step": 15490 + }, + { + "epoch": 11.055634807417974, + "grad_norm": 8.375, + "learning_rate": 4.222666666666667e-05, + "loss": 0.7765, + "step": 15500 + }, + { + "epoch": 11.055634807417974, + "eval/acc": 41.86046600341797, + "step": 15500 + }, + { + "epoch": 11.055634807417974, + "eval_loss": 2.01023268699646, + "eval_runtime": 5.0492, + "eval_samples_per_second": 8.516, + "eval_steps_per_second": 0.198, + "step": 15500 + }, + { + "epoch": 11.062767475035663, + "grad_norm": 9.125, + "learning_rate": 4.218222222222222e-05, + "loss": 0.8758, + "step": 15510 + }, + { + "epoch": 11.069900142653353, + "grad_norm": 8.375, + "learning_rate": 4.2137777777777784e-05, + "loss": 0.8337, + "step": 15520 + }, + { + "epoch": 11.077032810271042, + "grad_norm": 7.78125, + "learning_rate": 4.209333333333333e-05, + "loss": 0.8168, + "step": 15530 + }, + { + "epoch": 11.08416547788873, + "grad_norm": 7.03125, + "learning_rate": 4.2048888888888894e-05, + "loss": 0.8345, + "step": 15540 + }, + { + "epoch": 11.09129814550642, + "grad_norm": 8.8125, + "learning_rate": 4.200444444444445e-05, + "loss": 0.7392, + "step": 15550 + }, + { + "epoch": 11.098430813124109, + "grad_norm": 8.9375, + "learning_rate": 4.196e-05, + "loss": 0.7623, + "step": 15560 + }, + { + "epoch": 11.105563480741797, + "grad_norm": 9.875, + "learning_rate": 4.191555555555556e-05, + "loss": 0.6797, + "step": 15570 + }, + { + "epoch": 11.112696148359486, + "grad_norm": 7.96875, + "learning_rate": 4.187111111111111e-05, + "loss": 0.7957, + "step": 15580 + }, + { + "epoch": 11.119828815977176, + "grad_norm": 7.0625, + "learning_rate": 4.182666666666667e-05, + "loss": 0.7701, + "step": 15590 + }, + { + "epoch": 11.126961483594865, + "grad_norm": 6.90625, + "learning_rate": 4.1782222222222226e-05, + "loss": 0.8514, + "step": 15600 + }, + { + "epoch": 11.126961483594865, + "eval/acc": 39.53488540649414, + "step": 15600 + }, + { + "epoch": 11.126961483594865, + "eval_loss": 2.0217747688293457, + "eval_runtime": 0.2343, + "eval_samples_per_second": 183.512, + "eval_steps_per_second": 4.268, + "step": 15600 + }, + { + "epoch": 11.134094151212553, + "grad_norm": 7.3125, + "learning_rate": 4.173777777777778e-05, + "loss": 0.7757, + "step": 15610 + }, + { + "epoch": 11.141226818830242, + "grad_norm": 6.6875, + "learning_rate": 4.1693333333333336e-05, + "loss": 0.6947, + "step": 15620 + }, + { + "epoch": 11.148359486447932, + "grad_norm": 7.3125, + "learning_rate": 4.164888888888889e-05, + "loss": 0.8118, + "step": 15630 + }, + { + "epoch": 11.155492154065621, + "grad_norm": 8.0625, + "learning_rate": 4.160444444444445e-05, + "loss": 0.848, + "step": 15640 + }, + { + "epoch": 11.162624821683309, + "grad_norm": 6.6875, + "learning_rate": 4.156e-05, + "loss": 0.7256, + "step": 15650 + }, + { + "epoch": 11.169757489300999, + "grad_norm": 6.3125, + "learning_rate": 4.151555555555556e-05, + "loss": 0.7898, + "step": 15660 + }, + { + "epoch": 11.176890156918688, + "grad_norm": 5.53125, + "learning_rate": 4.147111111111111e-05, + "loss": 0.7057, + "step": 15670 + }, + { + "epoch": 11.184022824536376, + "grad_norm": 8.0, + "learning_rate": 4.142666666666667e-05, + "loss": 0.7065, + "step": 15680 + }, + { + "epoch": 11.191155492154065, + "grad_norm": 7.0, + "learning_rate": 4.138222222222222e-05, + "loss": 0.812, + "step": 15690 + }, + { + "epoch": 11.198288159771755, + "grad_norm": 35.0, + "learning_rate": 4.133777777777778e-05, + "loss": 0.7953, + "step": 15700 + }, + { + "epoch": 11.198288159771755, + "eval/acc": 39.53488540649414, + "step": 15700 + }, + { + "epoch": 11.198288159771755, + "eval_loss": 2.0371451377868652, + "eval_runtime": 0.2311, + "eval_samples_per_second": 186.077, + "eval_steps_per_second": 4.327, + "step": 15700 + }, + { + "epoch": 11.205420827389444, + "grad_norm": 6.8125, + "learning_rate": 4.129333333333333e-05, + "loss": 0.8272, + "step": 15710 + }, + { + "epoch": 11.212553495007132, + "grad_norm": 9.625, + "learning_rate": 4.1248888888888895e-05, + "loss": 0.782, + "step": 15720 + }, + { + "epoch": 11.219686162624821, + "grad_norm": 7.59375, + "learning_rate": 4.1204444444444444e-05, + "loss": 0.8176, + "step": 15730 + }, + { + "epoch": 11.226818830242511, + "grad_norm": 7.40625, + "learning_rate": 4.1160000000000006e-05, + "loss": 0.7592, + "step": 15740 + }, + { + "epoch": 11.2339514978602, + "grad_norm": 9.1875, + "learning_rate": 4.1115555555555554e-05, + "loss": 0.7587, + "step": 15750 + }, + { + "epoch": 11.241084165477888, + "grad_norm": 17.375, + "learning_rate": 4.1071111111111116e-05, + "loss": 0.7165, + "step": 15760 + }, + { + "epoch": 11.248216833095578, + "grad_norm": 8.5625, + "learning_rate": 4.102666666666667e-05, + "loss": 0.7391, + "step": 15770 + }, + { + "epoch": 11.255349500713267, + "grad_norm": 9.75, + "learning_rate": 4.098222222222222e-05, + "loss": 0.745, + "step": 15780 + }, + { + "epoch": 11.262482168330957, + "grad_norm": 8.75, + "learning_rate": 4.093777777777778e-05, + "loss": 0.7635, + "step": 15790 + }, + { + "epoch": 11.269614835948644, + "grad_norm": 6.5, + "learning_rate": 4.089333333333333e-05, + "loss": 0.8273, + "step": 15800 + }, + { + "epoch": 11.269614835948644, + "eval/acc": 41.86046600341797, + "step": 15800 + }, + { + "epoch": 11.269614835948644, + "eval_loss": 2.0061967372894287, + "eval_runtime": 0.2356, + "eval_samples_per_second": 182.538, + "eval_steps_per_second": 4.245, + "step": 15800 + }, + { + "epoch": 11.276747503566334, + "grad_norm": 7.0, + "learning_rate": 4.084888888888889e-05, + "loss": 0.7411, + "step": 15810 + }, + { + "epoch": 11.283880171184023, + "grad_norm": 11.375, + "learning_rate": 4.080444444444445e-05, + "loss": 0.7564, + "step": 15820 + }, + { + "epoch": 11.291012838801711, + "grad_norm": 8.5, + "learning_rate": 4.076e-05, + "loss": 0.8688, + "step": 15830 + }, + { + "epoch": 11.2981455064194, + "grad_norm": 7.03125, + "learning_rate": 4.071555555555556e-05, + "loss": 0.7351, + "step": 15840 + }, + { + "epoch": 11.30527817403709, + "grad_norm": 9.0, + "learning_rate": 4.067111111111111e-05, + "loss": 0.7432, + "step": 15850 + }, + { + "epoch": 11.31241084165478, + "grad_norm": 9.875, + "learning_rate": 4.062666666666667e-05, + "loss": 0.7984, + "step": 15860 + }, + { + "epoch": 11.319543509272467, + "grad_norm": 7.1875, + "learning_rate": 4.0582222222222224e-05, + "loss": 0.8125, + "step": 15870 + }, + { + "epoch": 11.326676176890157, + "grad_norm": 7.5, + "learning_rate": 4.053777777777778e-05, + "loss": 0.7995, + "step": 15880 + }, + { + "epoch": 11.333808844507846, + "grad_norm": 9.0, + "learning_rate": 4.0493333333333334e-05, + "loss": 0.7915, + "step": 15890 + }, + { + "epoch": 11.340941512125536, + "grad_norm": 9.5625, + "learning_rate": 4.044888888888889e-05, + "loss": 0.8598, + "step": 15900 + }, + { + "epoch": 11.340941512125536, + "eval/acc": 37.20930099487305, + "step": 15900 + }, + { + "epoch": 11.340941512125536, + "eval_loss": 2.0159339904785156, + "eval_runtime": 0.7376, + "eval_samples_per_second": 58.299, + "eval_steps_per_second": 1.356, + "step": 15900 + }, + { + "epoch": 11.348074179743223, + "grad_norm": 8.25, + "learning_rate": 4.0404444444444445e-05, + "loss": 0.8052, + "step": 15910 + }, + { + "epoch": 11.355206847360913, + "grad_norm": 10.75, + "learning_rate": 4.0360000000000007e-05, + "loss": 0.8008, + "step": 15920 + }, + { + "epoch": 11.362339514978602, + "grad_norm": 6.8125, + "learning_rate": 4.0315555555555555e-05, + "loss": 0.7922, + "step": 15930 + }, + { + "epoch": 11.36947218259629, + "grad_norm": 11.8125, + "learning_rate": 4.027111111111112e-05, + "loss": 0.7762, + "step": 15940 + }, + { + "epoch": 11.37660485021398, + "grad_norm": 8.4375, + "learning_rate": 4.0226666666666666e-05, + "loss": 0.7765, + "step": 15950 + }, + { + "epoch": 11.383737517831669, + "grad_norm": 8.25, + "learning_rate": 4.018222222222223e-05, + "loss": 0.7922, + "step": 15960 + }, + { + "epoch": 11.390870185449359, + "grad_norm": 9.5, + "learning_rate": 4.013777777777778e-05, + "loss": 0.7839, + "step": 15970 + }, + { + "epoch": 11.398002853067046, + "grad_norm": 8.125, + "learning_rate": 4.009333333333333e-05, + "loss": 0.6813, + "step": 15980 + }, + { + "epoch": 11.405135520684736, + "grad_norm": 6.28125, + "learning_rate": 4.004888888888889e-05, + "loss": 0.8077, + "step": 15990 + }, + { + "epoch": 11.412268188302425, + "grad_norm": 8.3125, + "learning_rate": 4.000444444444444e-05, + "loss": 0.7049, + "step": 16000 + }, + { + "epoch": 11.412268188302425, + "eval/acc": 41.86046600341797, + "step": 16000 + }, + { + "epoch": 11.412268188302425, + "eval_loss": 2.0035645961761475, + "eval_runtime": 0.3154, + "eval_samples_per_second": 136.335, + "eval_steps_per_second": 3.171, + "step": 16000 + }, + { + "epoch": 11.419400855920115, + "grad_norm": 6.75, + "learning_rate": 3.9960000000000004e-05, + "loss": 0.7572, + "step": 16010 + }, + { + "epoch": 11.426533523537802, + "grad_norm": 7.9375, + "learning_rate": 3.991555555555556e-05, + "loss": 0.7396, + "step": 16020 + }, + { + "epoch": 11.433666191155492, + "grad_norm": 8.5, + "learning_rate": 3.9871111111111114e-05, + "loss": 0.7213, + "step": 16030 + }, + { + "epoch": 11.440798858773181, + "grad_norm": 8.75, + "learning_rate": 3.982666666666667e-05, + "loss": 0.7403, + "step": 16040 + }, + { + "epoch": 11.447931526390871, + "grad_norm": 6.625, + "learning_rate": 3.9782222222222225e-05, + "loss": 0.7124, + "step": 16050 + }, + { + "epoch": 11.455064194008559, + "grad_norm": 8.75, + "learning_rate": 3.973777777777778e-05, + "loss": 0.7377, + "step": 16060 + }, + { + "epoch": 11.462196861626248, + "grad_norm": 9.1875, + "learning_rate": 3.9693333333333335e-05, + "loss": 0.8129, + "step": 16070 + }, + { + "epoch": 11.469329529243938, + "grad_norm": 8.5, + "learning_rate": 3.964888888888889e-05, + "loss": 0.8099, + "step": 16080 + }, + { + "epoch": 11.476462196861625, + "grad_norm": 8.75, + "learning_rate": 3.9604444444444445e-05, + "loss": 0.8657, + "step": 16090 + }, + { + "epoch": 11.483594864479315, + "grad_norm": 6.9375, + "learning_rate": 3.956e-05, + "loss": 0.8028, + "step": 16100 + }, + { + "epoch": 11.483594864479315, + "eval/acc": 39.53488540649414, + "step": 16100 + }, + { + "epoch": 11.483594864479315, + "eval_loss": 2.01182222366333, + "eval_runtime": 0.2322, + "eval_samples_per_second": 185.159, + "eval_steps_per_second": 4.306, + "step": 16100 + }, + { + "epoch": 11.490727532097004, + "grad_norm": 22.0, + "learning_rate": 3.9515555555555556e-05, + "loss": 0.9008, + "step": 16110 + }, + { + "epoch": 11.497860199714694, + "grad_norm": 7.96875, + "learning_rate": 3.947111111111111e-05, + "loss": 0.749, + "step": 16120 + }, + { + "epoch": 11.504992867332382, + "grad_norm": 9.625, + "learning_rate": 3.9426666666666666e-05, + "loss": 0.7008, + "step": 16130 + }, + { + "epoch": 11.512125534950071, + "grad_norm": 7.90625, + "learning_rate": 3.938222222222223e-05, + "loss": 0.6725, + "step": 16140 + }, + { + "epoch": 11.51925820256776, + "grad_norm": 6.9375, + "learning_rate": 3.933777777777778e-05, + "loss": 0.7104, + "step": 16150 + }, + { + "epoch": 11.52639087018545, + "grad_norm": 9.3125, + "learning_rate": 3.929333333333334e-05, + "loss": 0.7202, + "step": 16160 + }, + { + "epoch": 11.533523537803138, + "grad_norm": 8.5625, + "learning_rate": 3.924888888888889e-05, + "loss": 0.8841, + "step": 16170 + }, + { + "epoch": 11.540656205420827, + "grad_norm": 8.625, + "learning_rate": 3.920444444444444e-05, + "loss": 0.8151, + "step": 16180 + }, + { + "epoch": 11.547788873038517, + "grad_norm": 8.6875, + "learning_rate": 3.9160000000000005e-05, + "loss": 0.6946, + "step": 16190 + }, + { + "epoch": 11.554921540656206, + "grad_norm": 5.46875, + "learning_rate": 3.911555555555555e-05, + "loss": 0.8014, + "step": 16200 + }, + { + "epoch": 11.554921540656206, + "eval/acc": 37.20930099487305, + "step": 16200 + }, + { + "epoch": 11.554921540656206, + "eval_loss": 2.008047580718994, + "eval_runtime": 0.2377, + "eval_samples_per_second": 180.881, + "eval_steps_per_second": 4.207, + "step": 16200 + }, + { + "epoch": 11.562054208273894, + "grad_norm": 6.5, + "learning_rate": 3.9071111111111115e-05, + "loss": 0.8634, + "step": 16210 + }, + { + "epoch": 11.569186875891583, + "grad_norm": 10.0625, + "learning_rate": 3.902666666666667e-05, + "loss": 0.8836, + "step": 16220 + }, + { + "epoch": 11.576319543509273, + "grad_norm": 6.34375, + "learning_rate": 3.8982222222222225e-05, + "loss": 0.6787, + "step": 16230 + }, + { + "epoch": 11.58345221112696, + "grad_norm": 7.8125, + "learning_rate": 3.893777777777778e-05, + "loss": 0.7925, + "step": 16240 + }, + { + "epoch": 11.59058487874465, + "grad_norm": 10.6875, + "learning_rate": 3.8893333333333336e-05, + "loss": 0.7393, + "step": 16250 + }, + { + "epoch": 11.59771754636234, + "grad_norm": 6.65625, + "learning_rate": 3.884888888888889e-05, + "loss": 0.7407, + "step": 16260 + }, + { + "epoch": 11.60485021398003, + "grad_norm": 6.5625, + "learning_rate": 3.8804444444444446e-05, + "loss": 0.8039, + "step": 16270 + }, + { + "epoch": 11.611982881597717, + "grad_norm": 7.3125, + "learning_rate": 3.876e-05, + "loss": 0.8564, + "step": 16280 + }, + { + "epoch": 11.619115549215406, + "grad_norm": 6.6875, + "learning_rate": 3.871555555555556e-05, + "loss": 0.7674, + "step": 16290 + }, + { + "epoch": 11.626248216833096, + "grad_norm": 7.8125, + "learning_rate": 3.867111111111111e-05, + "loss": 0.8431, + "step": 16300 + }, + { + "epoch": 11.626248216833096, + "eval/acc": 41.86046600341797, + "step": 16300 + }, + { + "epoch": 11.626248216833096, + "eval_loss": 1.9884032011032104, + "eval_runtime": 0.2237, + "eval_samples_per_second": 192.225, + "eval_steps_per_second": 4.47, + "step": 16300 + }, + { + "epoch": 11.633380884450785, + "grad_norm": 6.84375, + "learning_rate": 3.862666666666667e-05, + "loss": 0.7772, + "step": 16310 + }, + { + "epoch": 11.640513552068473, + "grad_norm": 6.40625, + "learning_rate": 3.858222222222222e-05, + "loss": 0.8256, + "step": 16320 + }, + { + "epoch": 11.647646219686163, + "grad_norm": 7.96875, + "learning_rate": 3.853777777777778e-05, + "loss": 0.7004, + "step": 16330 + }, + { + "epoch": 11.654778887303852, + "grad_norm": 8.125, + "learning_rate": 3.849333333333334e-05, + "loss": 0.8883, + "step": 16340 + }, + { + "epoch": 11.661911554921542, + "grad_norm": 17.5, + "learning_rate": 3.844888888888889e-05, + "loss": 0.7894, + "step": 16350 + }, + { + "epoch": 11.66904422253923, + "grad_norm": 8.0, + "learning_rate": 3.840444444444445e-05, + "loss": 0.8491, + "step": 16360 + }, + { + "epoch": 11.676176890156919, + "grad_norm": 6.78125, + "learning_rate": 3.836e-05, + "loss": 0.8265, + "step": 16370 + }, + { + "epoch": 11.683309557774608, + "grad_norm": 8.25, + "learning_rate": 3.831555555555556e-05, + "loss": 0.7288, + "step": 16380 + }, + { + "epoch": 11.690442225392296, + "grad_norm": 7.96875, + "learning_rate": 3.8271111111111116e-05, + "loss": 0.7468, + "step": 16390 + }, + { + "epoch": 11.697574893009985, + "grad_norm": 7.28125, + "learning_rate": 3.8226666666666664e-05, + "loss": 0.7557, + "step": 16400 + }, + { + "epoch": 11.697574893009985, + "eval/acc": 39.53488540649414, + "step": 16400 + }, + { + "epoch": 11.697574893009985, + "eval_loss": 1.9937853813171387, + "eval_runtime": 0.2332, + "eval_samples_per_second": 184.37, + "eval_steps_per_second": 4.288, + "step": 16400 + }, + { + "epoch": 11.704707560627675, + "grad_norm": 9.5625, + "learning_rate": 3.8182222222222226e-05, + "loss": 0.8767, + "step": 16410 + }, + { + "epoch": 11.711840228245364, + "grad_norm": 7.96875, + "learning_rate": 3.8137777777777775e-05, + "loss": 0.6997, + "step": 16420 + }, + { + "epoch": 11.718972895863052, + "grad_norm": 6.75, + "learning_rate": 3.809333333333334e-05, + "loss": 0.8482, + "step": 16430 + }, + { + "epoch": 11.726105563480742, + "grad_norm": 9.25, + "learning_rate": 3.804888888888889e-05, + "loss": 0.8189, + "step": 16440 + }, + { + "epoch": 11.733238231098431, + "grad_norm": 7.21875, + "learning_rate": 3.800444444444445e-05, + "loss": 0.7069, + "step": 16450 + }, + { + "epoch": 11.74037089871612, + "grad_norm": 8.125, + "learning_rate": 3.796e-05, + "loss": 0.8172, + "step": 16460 + }, + { + "epoch": 11.747503566333808, + "grad_norm": 8.0625, + "learning_rate": 3.791555555555556e-05, + "loss": 0.7332, + "step": 16470 + }, + { + "epoch": 11.754636233951498, + "grad_norm": 7.75, + "learning_rate": 3.787111111111111e-05, + "loss": 0.7746, + "step": 16480 + }, + { + "epoch": 11.761768901569187, + "grad_norm": 8.8125, + "learning_rate": 3.782666666666667e-05, + "loss": 0.7679, + "step": 16490 + }, + { + "epoch": 11.768901569186877, + "grad_norm": 7.25, + "learning_rate": 3.778222222222222e-05, + "loss": 0.9753, + "step": 16500 + }, + { + "epoch": 11.768901569186877, + "eval/acc": 39.53488540649414, + "step": 16500 + }, + { + "epoch": 11.768901569186877, + "eval_loss": 1.9911694526672363, + "eval_runtime": 0.2278, + "eval_samples_per_second": 188.761, + "eval_steps_per_second": 4.39, + "step": 16500 + }, + { + "epoch": 11.776034236804565, + "grad_norm": 7.375, + "learning_rate": 3.773777777777778e-05, + "loss": 0.7303, + "step": 16510 + }, + { + "epoch": 11.783166904422254, + "grad_norm": 6.65625, + "learning_rate": 3.7693333333333334e-05, + "loss": 0.7036, + "step": 16520 + }, + { + "epoch": 11.790299572039943, + "grad_norm": 7.8125, + "learning_rate": 3.764888888888889e-05, + "loss": 0.6873, + "step": 16530 + }, + { + "epoch": 11.797432239657631, + "grad_norm": 13.3125, + "learning_rate": 3.760444444444445e-05, + "loss": 0.8784, + "step": 16540 + }, + { + "epoch": 11.80456490727532, + "grad_norm": 10.5625, + "learning_rate": 3.756e-05, + "loss": 0.8149, + "step": 16550 + }, + { + "epoch": 11.81169757489301, + "grad_norm": 9.75, + "learning_rate": 3.751555555555556e-05, + "loss": 0.7988, + "step": 16560 + }, + { + "epoch": 11.8188302425107, + "grad_norm": 8.5625, + "learning_rate": 3.747111111111111e-05, + "loss": 0.8117, + "step": 16570 + }, + { + "epoch": 11.825962910128387, + "grad_norm": 9.8125, + "learning_rate": 3.742666666666667e-05, + "loss": 0.7908, + "step": 16580 + }, + { + "epoch": 11.833095577746077, + "grad_norm": 8.0, + "learning_rate": 3.738222222222223e-05, + "loss": 0.8379, + "step": 16590 + }, + { + "epoch": 11.840228245363766, + "grad_norm": 7.21875, + "learning_rate": 3.7337777777777776e-05, + "loss": 0.7278, + "step": 16600 + }, + { + "epoch": 11.840228245363766, + "eval/acc": 37.20930099487305, + "step": 16600 + }, + { + "epoch": 11.840228245363766, + "eval_loss": 1.9659804105758667, + "eval_runtime": 0.2325, + "eval_samples_per_second": 184.932, + "eval_steps_per_second": 4.301, + "step": 16600 + }, + { + "epoch": 11.847360912981456, + "grad_norm": 8.0, + "learning_rate": 3.729333333333334e-05, + "loss": 0.773, + "step": 16610 + }, + { + "epoch": 11.854493580599144, + "grad_norm": 5.78125, + "learning_rate": 3.7248888888888886e-05, + "loss": 0.7377, + "step": 16620 + }, + { + "epoch": 11.861626248216833, + "grad_norm": 8.875, + "learning_rate": 3.720444444444445e-05, + "loss": 0.6644, + "step": 16630 + }, + { + "epoch": 11.868758915834523, + "grad_norm": 7.125, + "learning_rate": 3.716e-05, + "loss": 0.8759, + "step": 16640 + }, + { + "epoch": 11.87589158345221, + "grad_norm": 7.0625, + "learning_rate": 3.711555555555556e-05, + "loss": 0.8503, + "step": 16650 + }, + { + "epoch": 11.8830242510699, + "grad_norm": 5.75, + "learning_rate": 3.7071111111111114e-05, + "loss": 0.7204, + "step": 16660 + }, + { + "epoch": 11.89015691868759, + "grad_norm": 7.4375, + "learning_rate": 3.702666666666667e-05, + "loss": 0.8646, + "step": 16670 + }, + { + "epoch": 11.897289586305279, + "grad_norm": 7.875, + "learning_rate": 3.6982222222222224e-05, + "loss": 0.7951, + "step": 16680 + }, + { + "epoch": 11.904422253922966, + "grad_norm": 7.75, + "learning_rate": 3.693777777777778e-05, + "loss": 0.7474, + "step": 16690 + }, + { + "epoch": 11.911554921540656, + "grad_norm": 8.0625, + "learning_rate": 3.6893333333333335e-05, + "loss": 0.8184, + "step": 16700 + }, + { + "epoch": 11.911554921540656, + "eval/acc": 41.86046600341797, + "step": 16700 + }, + { + "epoch": 11.911554921540656, + "eval_loss": 1.9796830415725708, + "eval_runtime": 0.2288, + "eval_samples_per_second": 187.97, + "eval_steps_per_second": 4.371, + "step": 16700 + }, + { + "epoch": 11.918687589158345, + "grad_norm": 8.5625, + "learning_rate": 3.684888888888889e-05, + "loss": 0.7233, + "step": 16710 + }, + { + "epoch": 11.925820256776035, + "grad_norm": 6.84375, + "learning_rate": 3.6804444444444445e-05, + "loss": 0.8783, + "step": 16720 + }, + { + "epoch": 11.932952924393723, + "grad_norm": 8.0625, + "learning_rate": 3.676e-05, + "loss": 0.7848, + "step": 16730 + }, + { + "epoch": 11.940085592011412, + "grad_norm": 7.0, + "learning_rate": 3.6715555555555556e-05, + "loss": 0.7663, + "step": 16740 + }, + { + "epoch": 11.947218259629102, + "grad_norm": 8.25, + "learning_rate": 3.667111111111111e-05, + "loss": 0.7711, + "step": 16750 + }, + { + "epoch": 11.95435092724679, + "grad_norm": 8.5625, + "learning_rate": 3.662666666666667e-05, + "loss": 0.7848, + "step": 16760 + }, + { + "epoch": 11.961483594864479, + "grad_norm": 13.375, + "learning_rate": 3.658222222222222e-05, + "loss": 0.8355, + "step": 16770 + }, + { + "epoch": 11.968616262482168, + "grad_norm": 8.1875, + "learning_rate": 3.653777777777778e-05, + "loss": 0.8452, + "step": 16780 + }, + { + "epoch": 11.975748930099858, + "grad_norm": 7.53125, + "learning_rate": 3.649333333333333e-05, + "loss": 0.8508, + "step": 16790 + }, + { + "epoch": 11.982881597717546, + "grad_norm": 19.5, + "learning_rate": 3.644888888888889e-05, + "loss": 0.8187, + "step": 16800 + }, + { + "epoch": 11.982881597717546, + "eval/acc": 39.53488540649414, + "step": 16800 + }, + { + "epoch": 11.982881597717546, + "eval_loss": 1.9583516120910645, + "eval_runtime": 0.2249, + "eval_samples_per_second": 191.207, + "eval_steps_per_second": 4.447, + "step": 16800 + }, + { + "epoch": 11.990014265335235, + "grad_norm": 10.125, + "learning_rate": 3.640444444444445e-05, + "loss": 0.8547, + "step": 16810 + }, + { + "epoch": 11.997146932952925, + "grad_norm": 15.125, + "learning_rate": 3.636e-05, + "loss": 0.7173, + "step": 16820 + }, + { + "epoch": 12.004279600570614, + "grad_norm": 6.53125, + "learning_rate": 3.631555555555556e-05, + "loss": 0.7688, + "step": 16830 + }, + { + "epoch": 12.011412268188302, + "grad_norm": 78.5, + "learning_rate": 3.627111111111111e-05, + "loss": 0.8184, + "step": 16840 + }, + { + "epoch": 12.018544935805991, + "grad_norm": 8.5625, + "learning_rate": 3.622666666666667e-05, + "loss": 0.8062, + "step": 16850 + }, + { + "epoch": 12.02567760342368, + "grad_norm": 9.25, + "learning_rate": 3.6182222222222225e-05, + "loss": 0.839, + "step": 16860 + }, + { + "epoch": 12.03281027104137, + "grad_norm": 9.375, + "learning_rate": 3.613777777777778e-05, + "loss": 0.84, + "step": 16870 + }, + { + "epoch": 12.039942938659058, + "grad_norm": 7.46875, + "learning_rate": 3.6093333333333336e-05, + "loss": 0.7653, + "step": 16880 + }, + { + "epoch": 12.047075606276747, + "grad_norm": 14.875, + "learning_rate": 3.604888888888889e-05, + "loss": 0.7917, + "step": 16890 + }, + { + "epoch": 12.054208273894437, + "grad_norm": 11.0, + "learning_rate": 3.6004444444444446e-05, + "loss": 0.7125, + "step": 16900 + }, + { + "epoch": 12.054208273894437, + "eval/acc": 37.20930099487305, + "step": 16900 + }, + { + "epoch": 12.054208273894437, + "eval_loss": 3.0164332389831543, + "eval_runtime": 5.2863, + "eval_samples_per_second": 8.134, + "eval_steps_per_second": 0.189, + "step": 16900 + }, + { + "epoch": 12.061340941512125, + "grad_norm": 7.34375, + "learning_rate": 3.596e-05, + "loss": 0.8146, + "step": 16910 + }, + { + "epoch": 12.068473609129814, + "grad_norm": 6.96875, + "learning_rate": 3.5915555555555557e-05, + "loss": 0.7891, + "step": 16920 + }, + { + "epoch": 12.075606276747504, + "grad_norm": 6.78125, + "learning_rate": 3.587111111111111e-05, + "loss": 0.7922, + "step": 16930 + }, + { + "epoch": 12.082738944365193, + "grad_norm": 11.8125, + "learning_rate": 3.582666666666667e-05, + "loss": 0.7896, + "step": 16940 + }, + { + "epoch": 12.08987161198288, + "grad_norm": 6.25, + "learning_rate": 3.578222222222222e-05, + "loss": 0.7579, + "step": 16950 + }, + { + "epoch": 12.09700427960057, + "grad_norm": 7.9375, + "learning_rate": 3.5737777777777784e-05, + "loss": 0.8181, + "step": 16960 + }, + { + "epoch": 12.10413694721826, + "grad_norm": 12.75, + "learning_rate": 3.569333333333333e-05, + "loss": 0.7577, + "step": 16970 + }, + { + "epoch": 12.11126961483595, + "grad_norm": 50.75, + "learning_rate": 3.5648888888888895e-05, + "loss": 0.7314, + "step": 16980 + }, + { + "epoch": 12.118402282453637, + "grad_norm": 8.625, + "learning_rate": 3.560444444444444e-05, + "loss": 0.7212, + "step": 16990 + }, + { + "epoch": 12.125534950071327, + "grad_norm": 7.90625, + "learning_rate": 3.5560000000000005e-05, + "loss": 0.8013, + "step": 17000 + }, + { + "epoch": 12.125534950071327, + "eval/acc": 37.20930099487305, + "step": 17000 + }, + { + "epoch": 12.125534950071327, + "eval_loss": 3.037986993789673, + "eval_runtime": 0.2607, + "eval_samples_per_second": 164.968, + "eval_steps_per_second": 3.836, + "step": 17000 + }, + { + "epoch": 12.132667617689016, + "grad_norm": 8.0, + "learning_rate": 3.551555555555556e-05, + "loss": 0.7848, + "step": 17010 + }, + { + "epoch": 12.139800285306706, + "grad_norm": 8.9375, + "learning_rate": 3.547111111111111e-05, + "loss": 0.7902, + "step": 17020 + }, + { + "epoch": 12.146932952924393, + "grad_norm": 8.625, + "learning_rate": 3.542666666666667e-05, + "loss": 0.8201, + "step": 17030 + }, + { + "epoch": 12.154065620542083, + "grad_norm": 8.0625, + "learning_rate": 3.538222222222222e-05, + "loss": 0.7456, + "step": 17040 + }, + { + "epoch": 12.161198288159772, + "grad_norm": 6.375, + "learning_rate": 3.533777777777778e-05, + "loss": 0.7404, + "step": 17050 + }, + { + "epoch": 12.16833095577746, + "grad_norm": 10.5, + "learning_rate": 3.5293333333333336e-05, + "loss": 0.6491, + "step": 17060 + }, + { + "epoch": 12.17546362339515, + "grad_norm": 6.03125, + "learning_rate": 3.524888888888889e-05, + "loss": 0.8083, + "step": 17070 + }, + { + "epoch": 12.182596291012839, + "grad_norm": 7.15625, + "learning_rate": 3.520444444444445e-05, + "loss": 0.7394, + "step": 17080 + }, + { + "epoch": 12.189728958630528, + "grad_norm": 8.0, + "learning_rate": 3.516e-05, + "loss": 0.7967, + "step": 17090 + }, + { + "epoch": 12.196861626248216, + "grad_norm": 7.40625, + "learning_rate": 3.511555555555556e-05, + "loss": 0.7497, + "step": 17100 + }, + { + "epoch": 12.196861626248216, + "eval/acc": 37.20930099487305, + "step": 17100 + }, + { + "epoch": 12.196861626248216, + "eval_loss": 3.029151678085327, + "eval_runtime": 0.2337, + "eval_samples_per_second": 183.992, + "eval_steps_per_second": 4.279, + "step": 17100 + }, + { + "epoch": 12.203994293865906, + "grad_norm": 7.75, + "learning_rate": 3.507111111111111e-05, + "loss": 0.697, + "step": 17110 + }, + { + "epoch": 12.211126961483595, + "grad_norm": 12.0625, + "learning_rate": 3.502666666666667e-05, + "loss": 0.7795, + "step": 17120 + }, + { + "epoch": 12.218259629101285, + "grad_norm": 7.1875, + "learning_rate": 3.498222222222222e-05, + "loss": 0.7974, + "step": 17130 + }, + { + "epoch": 12.225392296718972, + "grad_norm": 8.875, + "learning_rate": 3.493777777777778e-05, + "loss": 0.8272, + "step": 17140 + }, + { + "epoch": 12.232524964336662, + "grad_norm": 15.4375, + "learning_rate": 3.4893333333333334e-05, + "loss": 0.8373, + "step": 17150 + }, + { + "epoch": 12.239657631954351, + "grad_norm": 6.9375, + "learning_rate": 3.484888888888889e-05, + "loss": 0.7922, + "step": 17160 + }, + { + "epoch": 12.24679029957204, + "grad_norm": 7.53125, + "learning_rate": 3.4804444444444444e-05, + "loss": 0.8099, + "step": 17170 + }, + { + "epoch": 12.253922967189729, + "grad_norm": 6.28125, + "learning_rate": 3.4760000000000006e-05, + "loss": 0.7522, + "step": 17180 + }, + { + "epoch": 12.261055634807418, + "grad_norm": 8.8125, + "learning_rate": 3.4715555555555554e-05, + "loss": 0.7338, + "step": 17190 + }, + { + "epoch": 12.268188302425107, + "grad_norm": 6.34375, + "learning_rate": 3.4671111111111116e-05, + "loss": 0.7782, + "step": 17200 + }, + { + "epoch": 12.268188302425107, + "eval/acc": 37.20930099487305, + "step": 17200 + }, + { + "epoch": 12.268188302425107, + "eval_loss": 3.063300848007202, + "eval_runtime": 0.2346, + "eval_samples_per_second": 183.302, + "eval_steps_per_second": 4.263, + "step": 17200 + }, + { + "epoch": 12.275320970042795, + "grad_norm": 7.1875, + "learning_rate": 3.462666666666667e-05, + "loss": 0.7274, + "step": 17210 + }, + { + "epoch": 12.282453637660485, + "grad_norm": 6.28125, + "learning_rate": 3.458222222222222e-05, + "loss": 0.7543, + "step": 17220 + }, + { + "epoch": 12.289586305278174, + "grad_norm": 12.75, + "learning_rate": 3.453777777777778e-05, + "loss": 0.8159, + "step": 17230 + }, + { + "epoch": 12.296718972895864, + "grad_norm": 7.03125, + "learning_rate": 3.449333333333333e-05, + "loss": 0.7927, + "step": 17240 + }, + { + "epoch": 12.303851640513551, + "grad_norm": 7.09375, + "learning_rate": 3.444888888888889e-05, + "loss": 0.7862, + "step": 17250 + }, + { + "epoch": 12.310984308131241, + "grad_norm": 11.375, + "learning_rate": 3.440444444444445e-05, + "loss": 0.8505, + "step": 17260 + }, + { + "epoch": 12.31811697574893, + "grad_norm": 6.40625, + "learning_rate": 3.436e-05, + "loss": 0.7432, + "step": 17270 + }, + { + "epoch": 12.32524964336662, + "grad_norm": 7.5, + "learning_rate": 3.431555555555556e-05, + "loss": 0.8234, + "step": 17280 + }, + { + "epoch": 12.332382310984308, + "grad_norm": 8.0, + "learning_rate": 3.4271111111111114e-05, + "loss": 0.8929, + "step": 17290 + }, + { + "epoch": 12.339514978601997, + "grad_norm": 8.8125, + "learning_rate": 3.422666666666667e-05, + "loss": 0.7827, + "step": 17300 + }, + { + "epoch": 12.339514978601997, + "eval/acc": 37.20930099487305, + "step": 17300 + }, + { + "epoch": 12.339514978601997, + "eval_loss": 3.0306241512298584, + "eval_runtime": 0.2382, + "eval_samples_per_second": 180.496, + "eval_steps_per_second": 4.198, + "step": 17300 + }, + { + "epoch": 12.346647646219687, + "grad_norm": 7.125, + "learning_rate": 3.4182222222222224e-05, + "loss": 0.8023, + "step": 17310 + }, + { + "epoch": 12.353780313837376, + "grad_norm": 6.375, + "learning_rate": 3.413777777777778e-05, + "loss": 0.7275, + "step": 17320 + }, + { + "epoch": 12.360912981455064, + "grad_norm": 9.75, + "learning_rate": 3.4093333333333334e-05, + "loss": 0.8026, + "step": 17330 + }, + { + "epoch": 12.368045649072753, + "grad_norm": 6.75, + "learning_rate": 3.404888888888889e-05, + "loss": 0.7214, + "step": 17340 + }, + { + "epoch": 12.375178316690443, + "grad_norm": 9.0, + "learning_rate": 3.4004444444444445e-05, + "loss": 0.808, + "step": 17350 + }, + { + "epoch": 12.38231098430813, + "grad_norm": 6.53125, + "learning_rate": 3.396e-05, + "loss": 0.7779, + "step": 17360 + }, + { + "epoch": 12.38944365192582, + "grad_norm": 9.625, + "learning_rate": 3.3915555555555555e-05, + "loss": 0.6953, + "step": 17370 + }, + { + "epoch": 12.39657631954351, + "grad_norm": 7.1875, + "learning_rate": 3.387111111111112e-05, + "loss": 0.7577, + "step": 17380 + }, + { + "epoch": 12.403708987161199, + "grad_norm": 10.375, + "learning_rate": 3.3826666666666666e-05, + "loss": 0.7793, + "step": 17390 + }, + { + "epoch": 12.410841654778887, + "grad_norm": 8.0, + "learning_rate": 3.378222222222223e-05, + "loss": 0.8273, + "step": 17400 + }, + { + "epoch": 12.410841654778887, + "eval/acc": 37.20930099487305, + "step": 17400 + }, + { + "epoch": 12.410841654778887, + "eval_loss": 3.02496075630188, + "eval_runtime": 0.2305, + "eval_samples_per_second": 186.586, + "eval_steps_per_second": 4.339, + "step": 17400 + }, + { + "epoch": 12.417974322396576, + "grad_norm": 6.0625, + "learning_rate": 3.3737777777777776e-05, + "loss": 0.7008, + "step": 17410 + }, + { + "epoch": 12.425106990014266, + "grad_norm": 8.0, + "learning_rate": 3.369333333333333e-05, + "loss": 0.7961, + "step": 17420 + }, + { + "epoch": 12.432239657631955, + "grad_norm": 8.9375, + "learning_rate": 3.3648888888888893e-05, + "loss": 0.7806, + "step": 17430 + }, + { + "epoch": 12.439372325249643, + "grad_norm": 8.75, + "learning_rate": 3.360444444444444e-05, + "loss": 0.6974, + "step": 17440 + }, + { + "epoch": 12.446504992867332, + "grad_norm": 5.59375, + "learning_rate": 3.3560000000000004e-05, + "loss": 0.6685, + "step": 17450 + }, + { + "epoch": 12.453637660485022, + "grad_norm": 5.8125, + "learning_rate": 3.351555555555555e-05, + "loss": 0.7812, + "step": 17460 + }, + { + "epoch": 12.46077032810271, + "grad_norm": 8.8125, + "learning_rate": 3.3471111111111114e-05, + "loss": 0.8677, + "step": 17470 + }, + { + "epoch": 12.467902995720399, + "grad_norm": 8.3125, + "learning_rate": 3.342666666666667e-05, + "loss": 0.7837, + "step": 17480 + }, + { + "epoch": 12.475035663338089, + "grad_norm": 11.0625, + "learning_rate": 3.3382222222222225e-05, + "loss": 0.7833, + "step": 17490 + }, + { + "epoch": 12.482168330955778, + "grad_norm": 7.75, + "learning_rate": 3.333777777777778e-05, + "loss": 0.8063, + "step": 17500 + }, + { + "epoch": 12.482168330955778, + "eval/acc": 37.20930099487305, + "step": 17500 + }, + { + "epoch": 12.482168330955778, + "eval_loss": 3.0436007976531982, + "eval_runtime": 0.2352, + "eval_samples_per_second": 182.855, + "eval_steps_per_second": 4.252, + "step": 17500 + }, + { + "epoch": 12.489300998573466, + "grad_norm": 5.46875, + "learning_rate": 3.3293333333333335e-05, + "loss": 0.8146, + "step": 17510 + }, + { + "epoch": 12.496433666191155, + "grad_norm": 7.6875, + "learning_rate": 3.324888888888889e-05, + "loss": 0.7964, + "step": 17520 + }, + { + "epoch": 12.503566333808845, + "grad_norm": 6.03125, + "learning_rate": 3.3204444444444446e-05, + "loss": 0.7377, + "step": 17530 + }, + { + "epoch": 12.510699001426534, + "grad_norm": 7.90625, + "learning_rate": 3.316e-05, + "loss": 0.7658, + "step": 17540 + }, + { + "epoch": 12.517831669044222, + "grad_norm": 8.0625, + "learning_rate": 3.3115555555555556e-05, + "loss": 0.7035, + "step": 17550 + }, + { + "epoch": 12.524964336661911, + "grad_norm": 7.75, + "learning_rate": 3.307111111111111e-05, + "loss": 0.794, + "step": 17560 + }, + { + "epoch": 12.532097004279601, + "grad_norm": 9.125, + "learning_rate": 3.302666666666667e-05, + "loss": 0.8046, + "step": 17570 + }, + { + "epoch": 12.539229671897289, + "grad_norm": 8.8125, + "learning_rate": 3.298222222222223e-05, + "loss": 0.7632, + "step": 17580 + }, + { + "epoch": 12.546362339514978, + "grad_norm": 9.375, + "learning_rate": 3.293777777777778e-05, + "loss": 0.8554, + "step": 17590 + }, + { + "epoch": 12.553495007132668, + "grad_norm": 7.0625, + "learning_rate": 3.289333333333334e-05, + "loss": 0.647, + "step": 17600 + }, + { + "epoch": 12.553495007132668, + "eval/acc": 37.20930099487305, + "step": 17600 + }, + { + "epoch": 12.553495007132668, + "eval_loss": 3.04180908203125, + "eval_runtime": 0.2372, + "eval_samples_per_second": 181.25, + "eval_steps_per_second": 4.215, + "step": 17600 + }, + { + "epoch": 12.560627674750357, + "grad_norm": 7.4375, + "learning_rate": 3.284888888888889e-05, + "loss": 0.7701, + "step": 17610 + }, + { + "epoch": 12.567760342368045, + "grad_norm": 81.0, + "learning_rate": 3.280444444444445e-05, + "loss": 0.7707, + "step": 17620 + }, + { + "epoch": 12.574893009985734, + "grad_norm": 6.75, + "learning_rate": 3.2760000000000005e-05, + "loss": 0.7871, + "step": 17630 + }, + { + "epoch": 12.582025677603424, + "grad_norm": 6.28125, + "learning_rate": 3.271555555555555e-05, + "loss": 0.8382, + "step": 17640 + }, + { + "epoch": 12.589158345221113, + "grad_norm": 8.875, + "learning_rate": 3.2671111111111115e-05, + "loss": 0.76, + "step": 17650 + }, + { + "epoch": 12.596291012838801, + "grad_norm": 7.75, + "learning_rate": 3.2626666666666664e-05, + "loss": 0.794, + "step": 17660 + }, + { + "epoch": 12.60342368045649, + "grad_norm": 7.15625, + "learning_rate": 3.2582222222222226e-05, + "loss": 0.7451, + "step": 17670 + }, + { + "epoch": 12.61055634807418, + "grad_norm": 6.75, + "learning_rate": 3.253777777777778e-05, + "loss": 0.7608, + "step": 17680 + }, + { + "epoch": 12.61768901569187, + "grad_norm": 8.8125, + "learning_rate": 3.2493333333333336e-05, + "loss": 0.8062, + "step": 17690 + }, + { + "epoch": 12.624821683309557, + "grad_norm": 9.5625, + "learning_rate": 3.244888888888889e-05, + "loss": 0.6761, + "step": 17700 + }, + { + "epoch": 12.624821683309557, + "eval/acc": 37.20930099487305, + "step": 17700 + }, + { + "epoch": 12.624821683309557, + "eval_loss": 3.029242515563965, + "eval_runtime": 0.2291, + "eval_samples_per_second": 187.698, + "eval_steps_per_second": 4.365, + "step": 17700 + }, + { + "epoch": 12.631954350927247, + "grad_norm": 24.125, + "learning_rate": 3.240444444444445e-05, + "loss": 0.6484, + "step": 17710 + }, + { + "epoch": 12.639087018544936, + "grad_norm": 5.125, + "learning_rate": 3.236e-05, + "loss": 0.716, + "step": 17720 + }, + { + "epoch": 12.646219686162624, + "grad_norm": 10.0625, + "learning_rate": 3.231555555555556e-05, + "loss": 0.7955, + "step": 17730 + }, + { + "epoch": 12.653352353780313, + "grad_norm": 10.6875, + "learning_rate": 3.227111111111111e-05, + "loss": 0.8146, + "step": 17740 + }, + { + "epoch": 12.660485021398003, + "grad_norm": 8.0, + "learning_rate": 3.222666666666667e-05, + "loss": 0.7711, + "step": 17750 + }, + { + "epoch": 12.667617689015692, + "grad_norm": 8.0, + "learning_rate": 3.218222222222222e-05, + "loss": 0.8463, + "step": 17760 + }, + { + "epoch": 12.67475035663338, + "grad_norm": 9.0, + "learning_rate": 3.213777777777778e-05, + "loss": 0.8483, + "step": 17770 + }, + { + "epoch": 12.68188302425107, + "grad_norm": 7.78125, + "learning_rate": 3.209333333333333e-05, + "loss": 0.8471, + "step": 17780 + }, + { + "epoch": 12.689015691868759, + "grad_norm": 7.9375, + "learning_rate": 3.204888888888889e-05, + "loss": 0.7851, + "step": 17790 + }, + { + "epoch": 12.696148359486449, + "grad_norm": 7.59375, + "learning_rate": 3.200444444444445e-05, + "loss": 0.7442, + "step": 17800 + }, + { + "epoch": 12.696148359486449, + "eval/acc": 37.20930099487305, + "step": 17800 + }, + { + "epoch": 12.696148359486449, + "eval_loss": 3.0417492389678955, + "eval_runtime": 0.2273, + "eval_samples_per_second": 189.151, + "eval_steps_per_second": 4.399, + "step": 17800 + }, + { + "epoch": 12.703281027104136, + "grad_norm": 7.5, + "learning_rate": 3.196e-05, + "loss": 0.8628, + "step": 17810 + }, + { + "epoch": 12.710413694721826, + "grad_norm": 9.5, + "learning_rate": 3.191555555555556e-05, + "loss": 0.7934, + "step": 17820 + }, + { + "epoch": 12.717546362339515, + "grad_norm": 7.15625, + "learning_rate": 3.187111111111111e-05, + "loss": 0.7312, + "step": 17830 + }, + { + "epoch": 12.724679029957205, + "grad_norm": 8.375, + "learning_rate": 3.1826666666666665e-05, + "loss": 0.7718, + "step": 17840 + }, + { + "epoch": 12.731811697574893, + "grad_norm": 13.75, + "learning_rate": 3.178222222222223e-05, + "loss": 0.7138, + "step": 17850 + }, + { + "epoch": 12.738944365192582, + "grad_norm": 7.5, + "learning_rate": 3.1737777777777775e-05, + "loss": 0.8093, + "step": 17860 + }, + { + "epoch": 12.746077032810271, + "grad_norm": 30.5, + "learning_rate": 3.169333333333334e-05, + "loss": 0.8289, + "step": 17870 + }, + { + "epoch": 12.75320970042796, + "grad_norm": 5.625, + "learning_rate": 3.164888888888889e-05, + "loss": 0.7022, + "step": 17880 + }, + { + "epoch": 12.760342368045649, + "grad_norm": 6.40625, + "learning_rate": 3.160444444444445e-05, + "loss": 0.6812, + "step": 17890 + }, + { + "epoch": 12.767475035663338, + "grad_norm": 9.0, + "learning_rate": 3.156e-05, + "loss": 0.717, + "step": 17900 + }, + { + "epoch": 12.767475035663338, + "eval/acc": 37.20930099487305, + "step": 17900 + }, + { + "epoch": 12.767475035663338, + "eval_loss": 3.0246851444244385, + "eval_runtime": 0.2524, + "eval_samples_per_second": 170.366, + "eval_steps_per_second": 3.962, + "step": 17900 + }, + { + "epoch": 12.774607703281028, + "grad_norm": 8.25, + "learning_rate": 3.151555555555556e-05, + "loss": 0.7882, + "step": 17910 + }, + { + "epoch": 12.781740370898715, + "grad_norm": 7.125, + "learning_rate": 3.147111111111111e-05, + "loss": 0.7156, + "step": 17920 + }, + { + "epoch": 12.788873038516405, + "grad_norm": 7.46875, + "learning_rate": 3.142666666666667e-05, + "loss": 0.7049, + "step": 17930 + }, + { + "epoch": 12.796005706134094, + "grad_norm": 8.6875, + "learning_rate": 3.1382222222222224e-05, + "loss": 0.9339, + "step": 17940 + }, + { + "epoch": 12.803138373751784, + "grad_norm": 7.3125, + "learning_rate": 3.133777777777778e-05, + "loss": 0.6531, + "step": 17950 + }, + { + "epoch": 12.810271041369472, + "grad_norm": 6.84375, + "learning_rate": 3.1293333333333334e-05, + "loss": 0.7576, + "step": 17960 + }, + { + "epoch": 12.817403708987161, + "grad_norm": 7.25, + "learning_rate": 3.124888888888889e-05, + "loss": 0.7153, + "step": 17970 + }, + { + "epoch": 12.82453637660485, + "grad_norm": 7.1875, + "learning_rate": 3.1204444444444445e-05, + "loss": 0.7138, + "step": 17980 + }, + { + "epoch": 12.83166904422254, + "grad_norm": 8.25, + "learning_rate": 3.116e-05, + "loss": 0.738, + "step": 17990 + }, + { + "epoch": 12.838801711840228, + "grad_norm": 9.5, + "learning_rate": 3.111555555555556e-05, + "loss": 0.7525, + "step": 18000 + }, + { + "epoch": 12.838801711840228, + "eval/acc": 37.20930099487305, + "step": 18000 + }, + { + "epoch": 12.838801711840228, + "eval_loss": 3.0545501708984375, + "eval_runtime": 0.2299, + "eval_samples_per_second": 187.055, + "eval_steps_per_second": 4.35, + "step": 18000 + }, + { + "epoch": 12.845934379457917, + "grad_norm": 7.4375, + "learning_rate": 3.107111111111111e-05, + "loss": 0.8798, + "step": 18010 + }, + { + "epoch": 12.853067047075607, + "grad_norm": 8.625, + "learning_rate": 3.102666666666667e-05, + "loss": 0.9301, + "step": 18020 + }, + { + "epoch": 12.860199714693294, + "grad_norm": 8.0625, + "learning_rate": 3.098222222222222e-05, + "loss": 0.808, + "step": 18030 + }, + { + "epoch": 12.867332382310984, + "grad_norm": 6.75, + "learning_rate": 3.0937777777777776e-05, + "loss": 0.848, + "step": 18040 + }, + { + "epoch": 12.874465049928673, + "grad_norm": 6.1875, + "learning_rate": 3.089333333333334e-05, + "loss": 0.667, + "step": 18050 + }, + { + "epoch": 12.881597717546363, + "grad_norm": 17.125, + "learning_rate": 3.0848888888888886e-05, + "loss": 0.778, + "step": 18060 + }, + { + "epoch": 12.88873038516405, + "grad_norm": 7.28125, + "learning_rate": 3.080444444444445e-05, + "loss": 0.7401, + "step": 18070 + }, + { + "epoch": 12.89586305278174, + "grad_norm": 10.25, + "learning_rate": 3.076e-05, + "loss": 0.8231, + "step": 18080 + }, + { + "epoch": 12.90299572039943, + "grad_norm": 6.9375, + "learning_rate": 3.071555555555556e-05, + "loss": 0.8827, + "step": 18090 + }, + { + "epoch": 12.91012838801712, + "grad_norm": 7.4375, + "learning_rate": 3.0671111111111114e-05, + "loss": 0.7568, + "step": 18100 + }, + { + "epoch": 12.91012838801712, + "eval/acc": 37.20930099487305, + "step": 18100 + }, + { + "epoch": 12.91012838801712, + "eval_loss": 3.0568392276763916, + "eval_runtime": 0.2321, + "eval_samples_per_second": 185.239, + "eval_steps_per_second": 4.308, + "step": 18100 + }, + { + "epoch": 12.917261055634807, + "grad_norm": 8.5, + "learning_rate": 3.062666666666667e-05, + "loss": 0.7822, + "step": 18110 + }, + { + "epoch": 12.924393723252496, + "grad_norm": 7.375, + "learning_rate": 3.0582222222222225e-05, + "loss": 0.8247, + "step": 18120 + }, + { + "epoch": 12.931526390870186, + "grad_norm": 8.8125, + "learning_rate": 3.053777777777778e-05, + "loss": 0.8443, + "step": 18130 + }, + { + "epoch": 12.938659058487875, + "grad_norm": 6.28125, + "learning_rate": 3.0493333333333335e-05, + "loss": 0.803, + "step": 18140 + }, + { + "epoch": 12.945791726105563, + "grad_norm": 38.75, + "learning_rate": 3.0448888888888887e-05, + "loss": 0.7168, + "step": 18150 + }, + { + "epoch": 12.952924393723253, + "grad_norm": 8.0, + "learning_rate": 3.0404444444444445e-05, + "loss": 0.8827, + "step": 18160 + }, + { + "epoch": 12.960057061340942, + "grad_norm": 17.75, + "learning_rate": 3.036e-05, + "loss": 0.822, + "step": 18170 + }, + { + "epoch": 12.96718972895863, + "grad_norm": 7.1875, + "learning_rate": 3.031555555555556e-05, + "loss": 0.7958, + "step": 18180 + }, + { + "epoch": 12.97432239657632, + "grad_norm": 6.125, + "learning_rate": 3.027111111111111e-05, + "loss": 0.676, + "step": 18190 + }, + { + "epoch": 12.981455064194009, + "grad_norm": 7.8125, + "learning_rate": 3.022666666666667e-05, + "loss": 0.8696, + "step": 18200 + }, + { + "epoch": 12.981455064194009, + "eval/acc": 37.20930099487305, + "step": 18200 + }, + { + "epoch": 12.981455064194009, + "eval_loss": 3.040698528289795, + "eval_runtime": 0.2626, + "eval_samples_per_second": 163.753, + "eval_steps_per_second": 3.808, + "step": 18200 + }, + { + "epoch": 12.988587731811698, + "grad_norm": 11.1875, + "learning_rate": 3.018222222222222e-05, + "loss": 0.7538, + "step": 18210 + }, + { + "epoch": 12.995720399429386, + "grad_norm": 12.0, + "learning_rate": 3.013777777777778e-05, + "loss": 0.7873, + "step": 18220 + }, + { + "epoch": 13.002853067047075, + "grad_norm": 8.4375, + "learning_rate": 3.0093333333333335e-05, + "loss": 0.7741, + "step": 18230 + }, + { + "epoch": 13.009985734664765, + "grad_norm": 7.5, + "learning_rate": 3.0048888888888894e-05, + "loss": 0.7644, + "step": 18240 + }, + { + "epoch": 13.017118402282454, + "grad_norm": 6.84375, + "learning_rate": 3.0004444444444446e-05, + "loss": 0.769, + "step": 18250 + }, + { + "epoch": 13.024251069900142, + "grad_norm": 6.375, + "learning_rate": 2.9959999999999998e-05, + "loss": 0.7497, + "step": 18260 + }, + { + "epoch": 13.031383737517832, + "grad_norm": 7.84375, + "learning_rate": 2.9915555555555556e-05, + "loss": 0.7643, + "step": 18270 + }, + { + "epoch": 13.038516405135521, + "grad_norm": 7.40625, + "learning_rate": 2.987111111111111e-05, + "loss": 0.7955, + "step": 18280 + }, + { + "epoch": 13.045649072753209, + "grad_norm": 14.125, + "learning_rate": 2.982666666666667e-05, + "loss": 0.7396, + "step": 18290 + }, + { + "epoch": 13.052781740370898, + "grad_norm": 6.6875, + "learning_rate": 2.9782222222222222e-05, + "loss": 0.7616, + "step": 18300 + }, + { + "epoch": 13.052781740370898, + "eval/acc": 44.1860466003418, + "step": 18300 + }, + { + "epoch": 13.052781740370898, + "eval_loss": 2.3592934608459473, + "eval_runtime": 4.7062, + "eval_samples_per_second": 9.137, + "eval_steps_per_second": 0.212, + "step": 18300 + }, + { + "epoch": 13.059914407988588, + "grad_norm": 6.875, + "learning_rate": 2.973777777777778e-05, + "loss": 0.7691, + "step": 18310 + }, + { + "epoch": 13.067047075606277, + "grad_norm": 12.4375, + "learning_rate": 2.9693333333333333e-05, + "loss": 0.8092, + "step": 18320 + }, + { + "epoch": 13.074179743223965, + "grad_norm": 7.65625, + "learning_rate": 2.964888888888889e-05, + "loss": 0.7653, + "step": 18330 + }, + { + "epoch": 13.081312410841655, + "grad_norm": 9.9375, + "learning_rate": 2.9604444444444446e-05, + "loss": 0.7936, + "step": 18340 + }, + { + "epoch": 13.088445078459344, + "grad_norm": 11.0625, + "learning_rate": 2.9559999999999998e-05, + "loss": 0.7666, + "step": 18350 + }, + { + "epoch": 13.095577746077034, + "grad_norm": 12.75, + "learning_rate": 2.9515555555555557e-05, + "loss": 0.874, + "step": 18360 + }, + { + "epoch": 13.102710413694721, + "grad_norm": 6.6875, + "learning_rate": 2.9471111111111112e-05, + "loss": 0.6979, + "step": 18370 + }, + { + "epoch": 13.10984308131241, + "grad_norm": 7.6875, + "learning_rate": 2.942666666666667e-05, + "loss": 0.8215, + "step": 18380 + }, + { + "epoch": 13.1169757489301, + "grad_norm": 6.75, + "learning_rate": 2.9382222222222222e-05, + "loss": 0.6941, + "step": 18390 + }, + { + "epoch": 13.12410841654779, + "grad_norm": 8.5625, + "learning_rate": 2.933777777777778e-05, + "loss": 0.7365, + "step": 18400 + }, + { + "epoch": 13.12410841654779, + "eval/acc": 44.1860466003418, + "step": 18400 + }, + { + "epoch": 13.12410841654779, + "eval_loss": 2.368854284286499, + "eval_runtime": 7.2366, + "eval_samples_per_second": 5.942, + "eval_steps_per_second": 0.138, + "step": 18400 + }, + { + "epoch": 13.131241084165477, + "grad_norm": 9.0625, + "learning_rate": 2.9293333333333333e-05, + "loss": 0.7201, + "step": 18410 + }, + { + "epoch": 13.138373751783167, + "grad_norm": 8.4375, + "learning_rate": 2.924888888888889e-05, + "loss": 0.9061, + "step": 18420 + }, + { + "epoch": 13.145506419400856, + "grad_norm": 5.875, + "learning_rate": 2.9204444444444447e-05, + "loss": 0.7767, + "step": 18430 + }, + { + "epoch": 13.152639087018544, + "grad_norm": 6.53125, + "learning_rate": 2.9160000000000005e-05, + "loss": 0.8086, + "step": 18440 + }, + { + "epoch": 13.159771754636234, + "grad_norm": 8.1875, + "learning_rate": 2.9115555555555557e-05, + "loss": 0.7938, + "step": 18450 + }, + { + "epoch": 13.166904422253923, + "grad_norm": 7.84375, + "learning_rate": 2.907111111111111e-05, + "loss": 0.8435, + "step": 18460 + }, + { + "epoch": 13.174037089871613, + "grad_norm": 8.3125, + "learning_rate": 2.9026666666666668e-05, + "loss": 0.7333, + "step": 18470 + }, + { + "epoch": 13.1811697574893, + "grad_norm": 8.1875, + "learning_rate": 2.8982222222222223e-05, + "loss": 0.7546, + "step": 18480 + }, + { + "epoch": 13.18830242510699, + "grad_norm": 7.09375, + "learning_rate": 2.893777777777778e-05, + "loss": 0.7321, + "step": 18490 + }, + { + "epoch": 13.19543509272468, + "grad_norm": 9.375, + "learning_rate": 2.8893333333333333e-05, + "loss": 0.8419, + "step": 18500 + }, + { + "epoch": 13.19543509272468, + "eval/acc": 44.1860466003418, + "step": 18500 + }, + { + "epoch": 13.19543509272468, + "eval_loss": 2.343879461288452, + "eval_runtime": 0.2875, + "eval_samples_per_second": 149.591, + "eval_steps_per_second": 3.479, + "step": 18500 + }, + { + "epoch": 13.202567760342369, + "grad_norm": 9.8125, + "learning_rate": 2.8848888888888892e-05, + "loss": 0.7606, + "step": 18510 + }, + { + "epoch": 13.209700427960057, + "grad_norm": 6.125, + "learning_rate": 2.8804444444444444e-05, + "loss": 0.7617, + "step": 18520 + }, + { + "epoch": 13.216833095577746, + "grad_norm": 6.4375, + "learning_rate": 2.8760000000000002e-05, + "loss": 0.6793, + "step": 18530 + }, + { + "epoch": 13.223965763195435, + "grad_norm": 8.5625, + "learning_rate": 2.8715555555555558e-05, + "loss": 0.7353, + "step": 18540 + }, + { + "epoch": 13.231098430813125, + "grad_norm": 6.53125, + "learning_rate": 2.8671111111111116e-05, + "loss": 0.7745, + "step": 18550 + }, + { + "epoch": 13.238231098430813, + "grad_norm": 10.3125, + "learning_rate": 2.8626666666666668e-05, + "loss": 0.7891, + "step": 18560 + }, + { + "epoch": 13.245363766048502, + "grad_norm": 6.03125, + "learning_rate": 2.858222222222222e-05, + "loss": 0.7913, + "step": 18570 + }, + { + "epoch": 13.252496433666192, + "grad_norm": 6.5, + "learning_rate": 2.853777777777778e-05, + "loss": 0.7704, + "step": 18580 + }, + { + "epoch": 13.25962910128388, + "grad_norm": 11.5, + "learning_rate": 2.8493333333333334e-05, + "loss": 0.7956, + "step": 18590 + }, + { + "epoch": 13.266761768901569, + "grad_norm": 7.34375, + "learning_rate": 2.8448888888888892e-05, + "loss": 0.7904, + "step": 18600 + }, + { + "epoch": 13.266761768901569, + "eval/acc": 39.53488540649414, + "step": 18600 + }, + { + "epoch": 13.266761768901569, + "eval_loss": 2.3613929748535156, + "eval_runtime": 0.2243, + "eval_samples_per_second": 191.727, + "eval_steps_per_second": 4.459, + "step": 18600 + }, + { + "epoch": 13.273894436519258, + "grad_norm": 7.46875, + "learning_rate": 2.8404444444444444e-05, + "loss": 0.7528, + "step": 18610 + }, + { + "epoch": 13.281027104136948, + "grad_norm": 8.0625, + "learning_rate": 2.8360000000000003e-05, + "loss": 0.7475, + "step": 18620 + }, + { + "epoch": 13.288159771754636, + "grad_norm": 9.9375, + "learning_rate": 2.8315555555555555e-05, + "loss": 0.7382, + "step": 18630 + }, + { + "epoch": 13.295292439372325, + "grad_norm": 7.28125, + "learning_rate": 2.8271111111111113e-05, + "loss": 0.8196, + "step": 18640 + }, + { + "epoch": 13.302425106990015, + "grad_norm": 8.5, + "learning_rate": 2.822666666666667e-05, + "loss": 0.9212, + "step": 18650 + }, + { + "epoch": 13.309557774607704, + "grad_norm": 7.71875, + "learning_rate": 2.818222222222222e-05, + "loss": 0.7357, + "step": 18660 + }, + { + "epoch": 13.316690442225392, + "grad_norm": 6.5, + "learning_rate": 2.813777777777778e-05, + "loss": 0.7228, + "step": 18670 + }, + { + "epoch": 13.323823109843081, + "grad_norm": 6.5625, + "learning_rate": 2.8093333333333334e-05, + "loss": 0.8229, + "step": 18680 + }, + { + "epoch": 13.33095577746077, + "grad_norm": 18.25, + "learning_rate": 2.8048888888888893e-05, + "loss": 0.8386, + "step": 18690 + }, + { + "epoch": 13.338088445078458, + "grad_norm": 11.0625, + "learning_rate": 2.8004444444444445e-05, + "loss": 0.7779, + "step": 18700 + }, + { + "epoch": 13.338088445078458, + "eval/acc": 39.53488540649414, + "step": 18700 + }, + { + "epoch": 13.338088445078458, + "eval_loss": 2.363105535507202, + "eval_runtime": 0.2271, + "eval_samples_per_second": 189.351, + "eval_steps_per_second": 4.404, + "step": 18700 + }, + { + "epoch": 13.345221112696148, + "grad_norm": 6.75, + "learning_rate": 2.7960000000000003e-05, + "loss": 0.7851, + "step": 18710 + }, + { + "epoch": 13.352353780313837, + "grad_norm": 7.28125, + "learning_rate": 2.7915555555555555e-05, + "loss": 0.7634, + "step": 18720 + }, + { + "epoch": 13.359486447931527, + "grad_norm": 7.0, + "learning_rate": 2.7871111111111114e-05, + "loss": 0.8063, + "step": 18730 + }, + { + "epoch": 13.366619115549215, + "grad_norm": 7.15625, + "learning_rate": 2.782666666666667e-05, + "loss": 0.7746, + "step": 18740 + }, + { + "epoch": 13.373751783166904, + "grad_norm": 7.75, + "learning_rate": 2.7782222222222228e-05, + "loss": 0.7927, + "step": 18750 + }, + { + "epoch": 13.380884450784594, + "grad_norm": 10.4375, + "learning_rate": 2.773777777777778e-05, + "loss": 0.7496, + "step": 18760 + }, + { + "epoch": 13.388017118402283, + "grad_norm": 13.0625, + "learning_rate": 2.769333333333333e-05, + "loss": 0.7797, + "step": 18770 + }, + { + "epoch": 13.39514978601997, + "grad_norm": 7.15625, + "learning_rate": 2.764888888888889e-05, + "loss": 0.7104, + "step": 18780 + }, + { + "epoch": 13.40228245363766, + "grad_norm": 123.0, + "learning_rate": 2.7604444444444445e-05, + "loss": 0.7854, + "step": 18790 + }, + { + "epoch": 13.40941512125535, + "grad_norm": 8.9375, + "learning_rate": 2.7560000000000004e-05, + "loss": 0.7275, + "step": 18800 + }, + { + "epoch": 13.40941512125535, + "eval/acc": 46.511627197265625, + "step": 18800 + }, + { + "epoch": 13.40941512125535, + "eval_loss": 2.3632330894470215, + "eval_runtime": 0.223, + "eval_samples_per_second": 192.833, + "eval_steps_per_second": 4.484, + "step": 18800 + }, + { + "epoch": 13.41654778887304, + "grad_norm": 8.0625, + "learning_rate": 2.7515555555555556e-05, + "loss": 0.7429, + "step": 18810 + }, + { + "epoch": 13.423680456490727, + "grad_norm": 11.0, + "learning_rate": 2.7471111111111114e-05, + "loss": 0.7845, + "step": 18820 + }, + { + "epoch": 13.430813124108417, + "grad_norm": 6.6875, + "learning_rate": 2.7426666666666666e-05, + "loss": 0.6946, + "step": 18830 + }, + { + "epoch": 13.437945791726106, + "grad_norm": 6.875, + "learning_rate": 2.7382222222222225e-05, + "loss": 0.8386, + "step": 18840 + }, + { + "epoch": 13.445078459343794, + "grad_norm": 10.375, + "learning_rate": 2.733777777777778e-05, + "loss": 0.7235, + "step": 18850 + }, + { + "epoch": 13.452211126961483, + "grad_norm": 7.90625, + "learning_rate": 2.7293333333333332e-05, + "loss": 0.7586, + "step": 18860 + }, + { + "epoch": 13.459343794579173, + "grad_norm": 7.34375, + "learning_rate": 2.724888888888889e-05, + "loss": 0.7546, + "step": 18870 + }, + { + "epoch": 13.466476462196862, + "grad_norm": 5.9375, + "learning_rate": 2.7204444444444442e-05, + "loss": 0.7515, + "step": 18880 + }, + { + "epoch": 13.47360912981455, + "grad_norm": 8.1875, + "learning_rate": 2.716e-05, + "loss": 0.7242, + "step": 18890 + }, + { + "epoch": 13.48074179743224, + "grad_norm": 6.53125, + "learning_rate": 2.7115555555555556e-05, + "loss": 0.7571, + "step": 18900 + }, + { + "epoch": 13.48074179743224, + "eval/acc": 44.1860466003418, + "step": 18900 + }, + { + "epoch": 13.48074179743224, + "eval_loss": 2.3733906745910645, + "eval_runtime": 0.2277, + "eval_samples_per_second": 188.866, + "eval_steps_per_second": 4.392, + "step": 18900 + }, + { + "epoch": 13.487874465049929, + "grad_norm": 8.125, + "learning_rate": 2.7071111111111115e-05, + "loss": 0.813, + "step": 18910 + }, + { + "epoch": 13.495007132667618, + "grad_norm": 9.4375, + "learning_rate": 2.7026666666666667e-05, + "loss": 0.7889, + "step": 18920 + }, + { + "epoch": 13.502139800285306, + "grad_norm": 6.875, + "learning_rate": 2.6982222222222225e-05, + "loss": 0.6809, + "step": 18930 + }, + { + "epoch": 13.509272467902996, + "grad_norm": 17.875, + "learning_rate": 2.6937777777777777e-05, + "loss": 0.7368, + "step": 18940 + }, + { + "epoch": 13.516405135520685, + "grad_norm": 7.3125, + "learning_rate": 2.6893333333333336e-05, + "loss": 0.7115, + "step": 18950 + }, + { + "epoch": 13.523537803138375, + "grad_norm": 7.84375, + "learning_rate": 2.684888888888889e-05, + "loss": 0.6828, + "step": 18960 + }, + { + "epoch": 13.530670470756062, + "grad_norm": 8.4375, + "learning_rate": 2.6804444444444443e-05, + "loss": 0.7184, + "step": 18970 + }, + { + "epoch": 13.537803138373752, + "grad_norm": 65.0, + "learning_rate": 2.676e-05, + "loss": 0.7808, + "step": 18980 + }, + { + "epoch": 13.544935805991441, + "grad_norm": 8.75, + "learning_rate": 2.6715555555555553e-05, + "loss": 0.9181, + "step": 18990 + }, + { + "epoch": 13.552068473609129, + "grad_norm": 9.3125, + "learning_rate": 2.6671111111111112e-05, + "loss": 0.7868, + "step": 19000 + }, + { + "epoch": 13.552068473609129, + "eval/acc": 39.53488540649414, + "step": 19000 + }, + { + "epoch": 13.552068473609129, + "eval_loss": 2.3711330890655518, + "eval_runtime": 0.2397, + "eval_samples_per_second": 179.384, + "eval_steps_per_second": 4.172, + "step": 19000 + }, + { + "epoch": 13.559201141226819, + "grad_norm": 23.625, + "learning_rate": 2.6626666666666667e-05, + "loss": 0.8008, + "step": 19010 + }, + { + "epoch": 13.566333808844508, + "grad_norm": 7.75, + "learning_rate": 2.6582222222222226e-05, + "loss": 0.7326, + "step": 19020 + }, + { + "epoch": 13.573466476462198, + "grad_norm": 9.375, + "learning_rate": 2.6537777777777777e-05, + "loss": 0.7914, + "step": 19030 + }, + { + "epoch": 13.580599144079885, + "grad_norm": 5.875, + "learning_rate": 2.6493333333333336e-05, + "loss": 0.7849, + "step": 19040 + }, + { + "epoch": 13.587731811697575, + "grad_norm": 7.96875, + "learning_rate": 2.644888888888889e-05, + "loss": 0.7314, + "step": 19050 + }, + { + "epoch": 13.594864479315264, + "grad_norm": 7.90625, + "learning_rate": 2.640444444444445e-05, + "loss": 0.8637, + "step": 19060 + }, + { + "epoch": 13.601997146932954, + "grad_norm": 8.5625, + "learning_rate": 2.6360000000000002e-05, + "loss": 0.8337, + "step": 19070 + }, + { + "epoch": 13.609129814550641, + "grad_norm": 8.75, + "learning_rate": 2.6315555555555554e-05, + "loss": 0.7362, + "step": 19080 + }, + { + "epoch": 13.616262482168331, + "grad_norm": 7.3125, + "learning_rate": 2.6271111111111112e-05, + "loss": 0.7999, + "step": 19090 + }, + { + "epoch": 13.62339514978602, + "grad_norm": 7.21875, + "learning_rate": 2.6226666666666667e-05, + "loss": 0.8105, + "step": 19100 + }, + { + "epoch": 13.62339514978602, + "eval/acc": 41.86046600341797, + "step": 19100 + }, + { + "epoch": 13.62339514978602, + "eval_loss": 2.3755757808685303, + "eval_runtime": 0.2319, + "eval_samples_per_second": 185.454, + "eval_steps_per_second": 4.313, + "step": 19100 + }, + { + "epoch": 13.63052781740371, + "grad_norm": 30.5, + "learning_rate": 2.6182222222222226e-05, + "loss": 0.7981, + "step": 19110 + }, + { + "epoch": 13.637660485021398, + "grad_norm": 7.0, + "learning_rate": 2.6137777777777778e-05, + "loss": 0.7317, + "step": 19120 + }, + { + "epoch": 13.644793152639087, + "grad_norm": 7.40625, + "learning_rate": 2.6093333333333336e-05, + "loss": 0.808, + "step": 19130 + }, + { + "epoch": 13.651925820256777, + "grad_norm": 8.5, + "learning_rate": 2.604888888888889e-05, + "loss": 0.8222, + "step": 19140 + }, + { + "epoch": 13.659058487874464, + "grad_norm": 8.625, + "learning_rate": 2.6004444444444447e-05, + "loss": 0.7333, + "step": 19150 + }, + { + "epoch": 13.666191155492154, + "grad_norm": 8.1875, + "learning_rate": 2.5960000000000002e-05, + "loss": 0.9, + "step": 19160 + }, + { + "epoch": 13.673323823109843, + "grad_norm": 10.25, + "learning_rate": 2.5915555555555554e-05, + "loss": 0.7297, + "step": 19170 + }, + { + "epoch": 13.680456490727533, + "grad_norm": 8.875, + "learning_rate": 2.5871111111111113e-05, + "loss": 0.8401, + "step": 19180 + }, + { + "epoch": 13.68758915834522, + "grad_norm": 8.375, + "learning_rate": 2.5826666666666664e-05, + "loss": 0.803, + "step": 19190 + }, + { + "epoch": 13.69472182596291, + "grad_norm": 7.09375, + "learning_rate": 2.5782222222222223e-05, + "loss": 0.703, + "step": 19200 + }, + { + "epoch": 13.69472182596291, + "eval/acc": 41.86046600341797, + "step": 19200 + }, + { + "epoch": 13.69472182596291, + "eval_loss": 2.3849244117736816, + "eval_runtime": 0.2258, + "eval_samples_per_second": 190.453, + "eval_steps_per_second": 4.429, + "step": 19200 + }, + { + "epoch": 13.7018544935806, + "grad_norm": 7.71875, + "learning_rate": 2.573777777777778e-05, + "loss": 0.7636, + "step": 19210 + }, + { + "epoch": 13.708987161198289, + "grad_norm": 9.8125, + "learning_rate": 2.5693333333333337e-05, + "loss": 0.8003, + "step": 19220 + }, + { + "epoch": 13.716119828815977, + "grad_norm": 5.90625, + "learning_rate": 2.564888888888889e-05, + "loss": 0.7509, + "step": 19230 + }, + { + "epoch": 13.723252496433666, + "grad_norm": 8.8125, + "learning_rate": 2.5604444444444447e-05, + "loss": 0.8054, + "step": 19240 + }, + { + "epoch": 13.730385164051356, + "grad_norm": 8.0, + "learning_rate": 2.556e-05, + "loss": 0.8303, + "step": 19250 + }, + { + "epoch": 13.737517831669045, + "grad_norm": 15.6875, + "learning_rate": 2.5515555555555558e-05, + "loss": 0.8224, + "step": 19260 + }, + { + "epoch": 13.744650499286733, + "grad_norm": 17.25, + "learning_rate": 2.5471111111111113e-05, + "loss": 0.7978, + "step": 19270 + }, + { + "epoch": 13.751783166904422, + "grad_norm": 9.625, + "learning_rate": 2.5426666666666665e-05, + "loss": 0.7964, + "step": 19280 + }, + { + "epoch": 13.758915834522112, + "grad_norm": 6.875, + "learning_rate": 2.5382222222222224e-05, + "loss": 0.8014, + "step": 19290 + }, + { + "epoch": 13.7660485021398, + "grad_norm": 7.09375, + "learning_rate": 2.5337777777777775e-05, + "loss": 0.7804, + "step": 19300 + }, + { + "epoch": 13.7660485021398, + "eval/acc": 44.1860466003418, + "step": 19300 + }, + { + "epoch": 13.7660485021398, + "eval_loss": 2.3624305725097656, + "eval_runtime": 0.262, + "eval_samples_per_second": 164.141, + "eval_steps_per_second": 3.817, + "step": 19300 + }, + { + "epoch": 13.773181169757489, + "grad_norm": 6.46875, + "learning_rate": 2.5293333333333334e-05, + "loss": 0.7614, + "step": 19310 + }, + { + "epoch": 13.780313837375179, + "grad_norm": 7.8125, + "learning_rate": 2.524888888888889e-05, + "loss": 0.7323, + "step": 19320 + }, + { + "epoch": 13.787446504992868, + "grad_norm": 6.375, + "learning_rate": 2.5204444444444448e-05, + "loss": 0.6908, + "step": 19330 + }, + { + "epoch": 13.794579172610556, + "grad_norm": 10.4375, + "learning_rate": 2.516e-05, + "loss": 0.7742, + "step": 19340 + }, + { + "epoch": 13.801711840228245, + "grad_norm": 7.4375, + "learning_rate": 2.5115555555555558e-05, + "loss": 0.7686, + "step": 19350 + }, + { + "epoch": 13.808844507845935, + "grad_norm": 8.875, + "learning_rate": 2.5071111111111114e-05, + "loss": 0.8334, + "step": 19360 + }, + { + "epoch": 13.815977175463622, + "grad_norm": 7.5, + "learning_rate": 2.5026666666666672e-05, + "loss": 0.7758, + "step": 19370 + }, + { + "epoch": 13.823109843081312, + "grad_norm": 7.53125, + "learning_rate": 2.4982222222222224e-05, + "loss": 0.7814, + "step": 19380 + }, + { + "epoch": 13.830242510699001, + "grad_norm": 14.5625, + "learning_rate": 2.493777777777778e-05, + "loss": 0.8628, + "step": 19390 + }, + { + "epoch": 13.837375178316691, + "grad_norm": 6.5625, + "learning_rate": 2.4893333333333334e-05, + "loss": 0.7898, + "step": 19400 + }, + { + "epoch": 13.837375178316691, + "eval/acc": 41.86046600341797, + "step": 19400 + }, + { + "epoch": 13.837375178316691, + "eval_loss": 2.359168767929077, + "eval_runtime": 0.2248, + "eval_samples_per_second": 191.308, + "eval_steps_per_second": 4.449, + "step": 19400 + }, + { + "epoch": 13.844507845934379, + "grad_norm": 15.375, + "learning_rate": 2.484888888888889e-05, + "loss": 0.786, + "step": 19410 + }, + { + "epoch": 13.851640513552068, + "grad_norm": 5.59375, + "learning_rate": 2.4804444444444448e-05, + "loss": 0.8072, + "step": 19420 + }, + { + "epoch": 13.858773181169758, + "grad_norm": 8.1875, + "learning_rate": 2.476e-05, + "loss": 0.7777, + "step": 19430 + }, + { + "epoch": 13.865905848787447, + "grad_norm": 8.8125, + "learning_rate": 2.4715555555555555e-05, + "loss": 0.8148, + "step": 19440 + }, + { + "epoch": 13.873038516405135, + "grad_norm": 10.9375, + "learning_rate": 2.467111111111111e-05, + "loss": 0.7659, + "step": 19450 + }, + { + "epoch": 13.880171184022824, + "grad_norm": 5.59375, + "learning_rate": 2.4626666666666666e-05, + "loss": 0.7819, + "step": 19460 + }, + { + "epoch": 13.887303851640514, + "grad_norm": 8.125, + "learning_rate": 2.4582222222222224e-05, + "loss": 0.8459, + "step": 19470 + }, + { + "epoch": 13.894436519258203, + "grad_norm": 7.1875, + "learning_rate": 2.453777777777778e-05, + "loss": 0.7448, + "step": 19480 + }, + { + "epoch": 13.901569186875891, + "grad_norm": 7.6875, + "learning_rate": 2.4493333333333335e-05, + "loss": 0.8096, + "step": 19490 + }, + { + "epoch": 13.90870185449358, + "grad_norm": 12.0, + "learning_rate": 2.444888888888889e-05, + "loss": 0.7402, + "step": 19500 + }, + { + "epoch": 13.90870185449358, + "eval/acc": 39.53488540649414, + "step": 19500 + }, + { + "epoch": 13.90870185449358, + "eval_loss": 2.3664777278900146, + "eval_runtime": 0.2287, + "eval_samples_per_second": 188.004, + "eval_steps_per_second": 4.372, + "step": 19500 + }, + { + "epoch": 13.91583452211127, + "grad_norm": 8.5, + "learning_rate": 2.4404444444444445e-05, + "loss": 0.796, + "step": 19510 + }, + { + "epoch": 13.922967189728958, + "grad_norm": 10.3125, + "learning_rate": 2.4360000000000004e-05, + "loss": 0.7331, + "step": 19520 + }, + { + "epoch": 13.930099857346647, + "grad_norm": 9.0, + "learning_rate": 2.431555555555556e-05, + "loss": 0.6982, + "step": 19530 + }, + { + "epoch": 13.937232524964337, + "grad_norm": 8.0625, + "learning_rate": 2.427111111111111e-05, + "loss": 0.7831, + "step": 19540 + }, + { + "epoch": 13.944365192582026, + "grad_norm": 8.1875, + "learning_rate": 2.4226666666666666e-05, + "loss": 0.7795, + "step": 19550 + }, + { + "epoch": 13.951497860199714, + "grad_norm": 6.25, + "learning_rate": 2.418222222222222e-05, + "loss": 0.7416, + "step": 19560 + }, + { + "epoch": 13.958630527817403, + "grad_norm": 7.03125, + "learning_rate": 2.413777777777778e-05, + "loss": 0.7553, + "step": 19570 + }, + { + "epoch": 13.965763195435093, + "grad_norm": 7.4375, + "learning_rate": 2.4093333333333335e-05, + "loss": 0.7573, + "step": 19580 + }, + { + "epoch": 13.972895863052782, + "grad_norm": 7.75, + "learning_rate": 2.404888888888889e-05, + "loss": 0.7041, + "step": 19590 + }, + { + "epoch": 13.98002853067047, + "grad_norm": 8.75, + "learning_rate": 2.4004444444444446e-05, + "loss": 0.8235, + "step": 19600 + }, + { + "epoch": 13.98002853067047, + "eval/acc": 44.1860466003418, + "step": 19600 + }, + { + "epoch": 13.98002853067047, + "eval_loss": 2.3475139141082764, + "eval_runtime": 0.2238, + "eval_samples_per_second": 192.095, + "eval_steps_per_second": 4.467, + "step": 19600 + }, + { + "epoch": 13.98716119828816, + "grad_norm": 8.9375, + "learning_rate": 2.396e-05, + "loss": 0.8462, + "step": 19610 + }, + { + "epoch": 13.99429386590585, + "grad_norm": 7.9375, + "learning_rate": 2.3915555555555556e-05, + "loss": 0.7937, + "step": 19620 + }, + { + "epoch": 14.001426533523539, + "grad_norm": 13.1875, + "learning_rate": 2.3871111111111115e-05, + "loss": 0.7584, + "step": 19630 + }, + { + "epoch": 14.008559201141226, + "grad_norm": 6.25, + "learning_rate": 2.3826666666666667e-05, + "loss": 0.7309, + "step": 19640 + }, + { + "epoch": 14.015691868758916, + "grad_norm": 6.9375, + "learning_rate": 2.3782222222222222e-05, + "loss": 0.7802, + "step": 19650 + }, + { + "epoch": 14.022824536376605, + "grad_norm": 7.4375, + "learning_rate": 2.3737777777777777e-05, + "loss": 0.6789, + "step": 19660 + }, + { + "epoch": 14.029957203994293, + "grad_norm": 8.25, + "learning_rate": 2.3693333333333332e-05, + "loss": 0.6285, + "step": 19670 + }, + { + "epoch": 14.037089871611983, + "grad_norm": 15.9375, + "learning_rate": 2.364888888888889e-05, + "loss": 0.8654, + "step": 19680 + }, + { + "epoch": 14.044222539229672, + "grad_norm": 8.625, + "learning_rate": 2.3604444444444446e-05, + "loss": 0.7778, + "step": 19690 + }, + { + "epoch": 14.051355206847362, + "grad_norm": 8.0625, + "learning_rate": 2.356e-05, + "loss": 0.8099, + "step": 19700 + }, + { + "epoch": 14.051355206847362, + "eval/acc": 51.16279220581055, + "step": 19700 + }, + { + "epoch": 14.051355206847362, + "eval_loss": 2.568962574005127, + "eval_runtime": 7.2029, + "eval_samples_per_second": 5.97, + "eval_steps_per_second": 0.139, + "step": 19700 + }, + { + "epoch": 14.05848787446505, + "grad_norm": 10.5, + "learning_rate": 2.3515555555555557e-05, + "loss": 0.7666, + "step": 19710 + }, + { + "epoch": 14.065620542082739, + "grad_norm": 10.9375, + "learning_rate": 2.3471111111111112e-05, + "loss": 0.7339, + "step": 19720 + }, + { + "epoch": 14.072753209700428, + "grad_norm": 7.21875, + "learning_rate": 2.342666666666667e-05, + "loss": 0.7559, + "step": 19730 + }, + { + "epoch": 14.079885877318118, + "grad_norm": 15.0625, + "learning_rate": 2.3382222222222222e-05, + "loss": 0.7539, + "step": 19740 + }, + { + "epoch": 14.087018544935805, + "grad_norm": 6.90625, + "learning_rate": 2.3337777777777778e-05, + "loss": 0.744, + "step": 19750 + }, + { + "epoch": 14.094151212553495, + "grad_norm": 7.28125, + "learning_rate": 2.3293333333333333e-05, + "loss": 0.8038, + "step": 19760 + }, + { + "epoch": 14.101283880171184, + "grad_norm": 8.8125, + "learning_rate": 2.3248888888888888e-05, + "loss": 0.819, + "step": 19770 + }, + { + "epoch": 14.108416547788874, + "grad_norm": 7.1875, + "learning_rate": 2.3204444444444447e-05, + "loss": 0.7756, + "step": 19780 + }, + { + "epoch": 14.115549215406562, + "grad_norm": 7.4375, + "learning_rate": 2.3160000000000002e-05, + "loss": 0.8243, + "step": 19790 + }, + { + "epoch": 14.122681883024251, + "grad_norm": 6.21875, + "learning_rate": 2.3115555555555557e-05, + "loss": 0.7094, + "step": 19800 + }, + { + "epoch": 14.122681883024251, + "eval/acc": 51.16279220581055, + "step": 19800 + }, + { + "epoch": 14.122681883024251, + "eval_loss": 2.565817356109619, + "eval_runtime": 0.2181, + "eval_samples_per_second": 197.132, + "eval_steps_per_second": 4.584, + "step": 19800 + }, + { + "epoch": 14.12981455064194, + "grad_norm": 6.9375, + "learning_rate": 2.3071111111111112e-05, + "loss": 0.7453, + "step": 19810 + }, + { + "epoch": 14.136947218259628, + "grad_norm": 7.21875, + "learning_rate": 2.3026666666666668e-05, + "loss": 0.8192, + "step": 19820 + }, + { + "epoch": 14.144079885877318, + "grad_norm": 7.9375, + "learning_rate": 2.2982222222222223e-05, + "loss": 0.7633, + "step": 19830 + }, + { + "epoch": 14.151212553495007, + "grad_norm": 6.875, + "learning_rate": 2.293777777777778e-05, + "loss": 0.7686, + "step": 19840 + }, + { + "epoch": 14.158345221112697, + "grad_norm": 9.5, + "learning_rate": 2.2893333333333333e-05, + "loss": 0.7597, + "step": 19850 + }, + { + "epoch": 14.165477888730384, + "grad_norm": 7.03125, + "learning_rate": 2.284888888888889e-05, + "loss": 0.7675, + "step": 19860 + }, + { + "epoch": 14.172610556348074, + "grad_norm": 9.625, + "learning_rate": 2.2804444444444444e-05, + "loss": 0.7373, + "step": 19870 + }, + { + "epoch": 14.179743223965763, + "grad_norm": 7.0, + "learning_rate": 2.2760000000000002e-05, + "loss": 0.801, + "step": 19880 + }, + { + "epoch": 14.186875891583453, + "grad_norm": 11.875, + "learning_rate": 2.2715555555555558e-05, + "loss": 0.7575, + "step": 19890 + }, + { + "epoch": 14.19400855920114, + "grad_norm": 8.375, + "learning_rate": 2.2671111111111113e-05, + "loss": 0.8073, + "step": 19900 + }, + { + "epoch": 14.19400855920114, + "eval/acc": 53.488372802734375, + "step": 19900 + }, + { + "epoch": 14.19400855920114, + "eval_loss": 2.5913121700286865, + "eval_runtime": 0.2824, + "eval_samples_per_second": 152.262, + "eval_steps_per_second": 3.541, + "step": 19900 + }, + { + "epoch": 14.20114122681883, + "grad_norm": 7.71875, + "learning_rate": 2.2626666666666668e-05, + "loss": 0.7899, + "step": 19910 + }, + { + "epoch": 14.20827389443652, + "grad_norm": 11.625, + "learning_rate": 2.2582222222222223e-05, + "loss": 0.8058, + "step": 19920 + }, + { + "epoch": 14.21540656205421, + "grad_norm": 8.4375, + "learning_rate": 2.253777777777778e-05, + "loss": 0.7907, + "step": 19930 + }, + { + "epoch": 14.222539229671897, + "grad_norm": 7.3125, + "learning_rate": 2.2493333333333337e-05, + "loss": 0.7557, + "step": 19940 + }, + { + "epoch": 14.229671897289586, + "grad_norm": 6.25, + "learning_rate": 2.244888888888889e-05, + "loss": 0.7859, + "step": 19950 + }, + { + "epoch": 14.236804564907276, + "grad_norm": 9.25, + "learning_rate": 2.2404444444444444e-05, + "loss": 0.734, + "step": 19960 + }, + { + "epoch": 14.243937232524964, + "grad_norm": 7.78125, + "learning_rate": 2.236e-05, + "loss": 0.7623, + "step": 19970 + }, + { + "epoch": 14.251069900142653, + "grad_norm": 7.15625, + "learning_rate": 2.2315555555555555e-05, + "loss": 0.6912, + "step": 19980 + }, + { + "epoch": 14.258202567760343, + "grad_norm": 6.5, + "learning_rate": 2.2271111111111113e-05, + "loss": 0.7367, + "step": 19990 + }, + { + "epoch": 14.265335235378032, + "grad_norm": 7.375, + "learning_rate": 2.222666666666667e-05, + "loss": 0.7353, + "step": 20000 + }, + { + "epoch": 14.265335235378032, + "eval/acc": 51.16279220581055, + "step": 20000 + }, + { + "epoch": 14.265335235378032, + "eval_loss": 2.5749382972717285, + "eval_runtime": 0.2259, + "eval_samples_per_second": 190.358, + "eval_steps_per_second": 4.427, + "step": 20000 + }, + { + "epoch": 14.27246790299572, + "grad_norm": 8.8125, + "learning_rate": 2.2182222222222224e-05, + "loss": 0.7647, + "step": 20010 + }, + { + "epoch": 14.27960057061341, + "grad_norm": 8.25, + "learning_rate": 2.213777777777778e-05, + "loss": 0.7528, + "step": 20020 + }, + { + "epoch": 14.286733238231099, + "grad_norm": 8.125, + "learning_rate": 2.2093333333333334e-05, + "loss": 0.6316, + "step": 20030 + }, + { + "epoch": 14.293865905848788, + "grad_norm": 40.75, + "learning_rate": 2.2048888888888893e-05, + "loss": 0.6989, + "step": 20040 + }, + { + "epoch": 14.300998573466476, + "grad_norm": 7.1875, + "learning_rate": 2.2004444444444445e-05, + "loss": 0.7497, + "step": 20050 + }, + { + "epoch": 14.308131241084165, + "grad_norm": 7.46875, + "learning_rate": 2.196e-05, + "loss": 0.7738, + "step": 20060 + }, + { + "epoch": 14.315263908701855, + "grad_norm": 10.25, + "learning_rate": 2.1915555555555555e-05, + "loss": 0.7183, + "step": 20070 + }, + { + "epoch": 14.322396576319543, + "grad_norm": 13.125, + "learning_rate": 2.187111111111111e-05, + "loss": 0.7628, + "step": 20080 + }, + { + "epoch": 14.329529243937232, + "grad_norm": 6.96875, + "learning_rate": 2.182666666666667e-05, + "loss": 0.7416, + "step": 20090 + }, + { + "epoch": 14.336661911554922, + "grad_norm": 6.59375, + "learning_rate": 2.1782222222222224e-05, + "loss": 0.7937, + "step": 20100 + }, + { + "epoch": 14.336661911554922, + "eval/acc": 51.16279220581055, + "step": 20100 + }, + { + "epoch": 14.336661911554922, + "eval_loss": 2.5889506340026855, + "eval_runtime": 0.2201, + "eval_samples_per_second": 195.39, + "eval_steps_per_second": 4.544, + "step": 20100 + }, + { + "epoch": 14.343794579172611, + "grad_norm": 7.1875, + "learning_rate": 2.173777777777778e-05, + "loss": 0.8359, + "step": 20110 + }, + { + "epoch": 14.350927246790299, + "grad_norm": 6.84375, + "learning_rate": 2.1693333333333335e-05, + "loss": 0.782, + "step": 20120 + }, + { + "epoch": 14.358059914407988, + "grad_norm": 6.78125, + "learning_rate": 2.164888888888889e-05, + "loss": 0.7893, + "step": 20130 + }, + { + "epoch": 14.365192582025678, + "grad_norm": 9.125, + "learning_rate": 2.1604444444444445e-05, + "loss": 0.7187, + "step": 20140 + }, + { + "epoch": 14.372325249643367, + "grad_norm": 8.125, + "learning_rate": 2.1560000000000004e-05, + "loss": 0.7507, + "step": 20150 + }, + { + "epoch": 14.379457917261055, + "grad_norm": 9.1875, + "learning_rate": 2.1515555555555555e-05, + "loss": 0.7716, + "step": 20160 + }, + { + "epoch": 14.386590584878745, + "grad_norm": 7.53125, + "learning_rate": 2.147111111111111e-05, + "loss": 0.7996, + "step": 20170 + }, + { + "epoch": 14.393723252496434, + "grad_norm": 7.84375, + "learning_rate": 2.1426666666666666e-05, + "loss": 0.7752, + "step": 20180 + }, + { + "epoch": 14.400855920114124, + "grad_norm": 8.0, + "learning_rate": 2.1382222222222225e-05, + "loss": 0.8248, + "step": 20190 + }, + { + "epoch": 14.407988587731811, + "grad_norm": 7.1875, + "learning_rate": 2.133777777777778e-05, + "loss": 0.7153, + "step": 20200 + }, + { + "epoch": 14.407988587731811, + "eval/acc": 53.488372802734375, + "step": 20200 + }, + { + "epoch": 14.407988587731811, + "eval_loss": 2.5689048767089844, + "eval_runtime": 0.7977, + "eval_samples_per_second": 53.904, + "eval_steps_per_second": 1.254, + "step": 20200 + }, + { + "epoch": 14.4151212553495, + "grad_norm": 8.375, + "learning_rate": 2.1293333333333335e-05, + "loss": 0.809, + "step": 20210 + }, + { + "epoch": 14.42225392296719, + "grad_norm": 20.875, + "learning_rate": 2.124888888888889e-05, + "loss": 0.7955, + "step": 20220 + }, + { + "epoch": 14.429386590584878, + "grad_norm": 7.9375, + "learning_rate": 2.1204444444444445e-05, + "loss": 0.7775, + "step": 20230 + }, + { + "epoch": 14.436519258202567, + "grad_norm": 8.6875, + "learning_rate": 2.116e-05, + "loss": 0.8455, + "step": 20240 + }, + { + "epoch": 14.443651925820257, + "grad_norm": 7.125, + "learning_rate": 2.111555555555556e-05, + "loss": 0.815, + "step": 20250 + }, + { + "epoch": 14.450784593437946, + "grad_norm": 8.125, + "learning_rate": 2.107111111111111e-05, + "loss": 0.8153, + "step": 20260 + }, + { + "epoch": 14.457917261055634, + "grad_norm": 6.09375, + "learning_rate": 2.1026666666666666e-05, + "loss": 0.7566, + "step": 20270 + }, + { + "epoch": 14.465049928673324, + "grad_norm": 8.9375, + "learning_rate": 2.098222222222222e-05, + "loss": 0.8227, + "step": 20280 + }, + { + "epoch": 14.472182596291013, + "grad_norm": 6.28125, + "learning_rate": 2.0937777777777777e-05, + "loss": 0.7402, + "step": 20290 + }, + { + "epoch": 14.479315263908703, + "grad_norm": 6.84375, + "learning_rate": 2.0893333333333335e-05, + "loss": 0.826, + "step": 20300 + }, + { + "epoch": 14.479315263908703, + "eval/acc": 51.16279220581055, + "step": 20300 + }, + { + "epoch": 14.479315263908703, + "eval_loss": 2.570920467376709, + "eval_runtime": 0.2167, + "eval_samples_per_second": 198.423, + "eval_steps_per_second": 4.614, + "step": 20300 + }, + { + "epoch": 14.48644793152639, + "grad_norm": 9.5625, + "learning_rate": 2.084888888888889e-05, + "loss": 0.7123, + "step": 20310 + }, + { + "epoch": 14.49358059914408, + "grad_norm": 7.625, + "learning_rate": 2.0804444444444446e-05, + "loss": 0.7538, + "step": 20320 + }, + { + "epoch": 14.50071326676177, + "grad_norm": 6.34375, + "learning_rate": 2.076e-05, + "loss": 0.7735, + "step": 20330 + }, + { + "epoch": 14.507845934379457, + "grad_norm": 6.5625, + "learning_rate": 2.0715555555555556e-05, + "loss": 0.6973, + "step": 20340 + }, + { + "epoch": 14.514978601997147, + "grad_norm": 8.75, + "learning_rate": 2.0671111111111115e-05, + "loss": 0.8565, + "step": 20350 + }, + { + "epoch": 14.522111269614836, + "grad_norm": 7.6875, + "learning_rate": 2.0626666666666667e-05, + "loss": 0.8176, + "step": 20360 + }, + { + "epoch": 14.529243937232525, + "grad_norm": 6.875, + "learning_rate": 2.0582222222222222e-05, + "loss": 0.8526, + "step": 20370 + }, + { + "epoch": 14.536376604850213, + "grad_norm": 12.75, + "learning_rate": 2.0537777777777777e-05, + "loss": 0.7997, + "step": 20380 + }, + { + "epoch": 14.543509272467903, + "grad_norm": 8.25, + "learning_rate": 2.0493333333333333e-05, + "loss": 0.7924, + "step": 20390 + }, + { + "epoch": 14.550641940085592, + "grad_norm": 7.84375, + "learning_rate": 2.044888888888889e-05, + "loss": 0.7639, + "step": 20400 + }, + { + "epoch": 14.550641940085592, + "eval/acc": 51.16279220581055, + "step": 20400 + }, + { + "epoch": 14.550641940085592, + "eval_loss": 2.5919346809387207, + "eval_runtime": 0.2218, + "eval_samples_per_second": 193.859, + "eval_steps_per_second": 4.508, + "step": 20400 + }, + { + "epoch": 14.557774607703282, + "grad_norm": 8.1875, + "learning_rate": 2.0404444444444446e-05, + "loss": 0.7856, + "step": 20410 + }, + { + "epoch": 14.56490727532097, + "grad_norm": 9.0, + "learning_rate": 2.036e-05, + "loss": 0.7842, + "step": 20420 + }, + { + "epoch": 14.572039942938659, + "grad_norm": 6.75, + "learning_rate": 2.0315555555555557e-05, + "loss": 0.7689, + "step": 20430 + }, + { + "epoch": 14.579172610556348, + "grad_norm": 7.59375, + "learning_rate": 2.0271111111111112e-05, + "loss": 0.7918, + "step": 20440 + }, + { + "epoch": 14.586305278174038, + "grad_norm": 7.1875, + "learning_rate": 2.0226666666666667e-05, + "loss": 0.7329, + "step": 20450 + }, + { + "epoch": 14.593437945791726, + "grad_norm": 7.59375, + "learning_rate": 2.0182222222222222e-05, + "loss": 0.7926, + "step": 20460 + }, + { + "epoch": 14.600570613409415, + "grad_norm": 7.21875, + "learning_rate": 2.0137777777777778e-05, + "loss": 0.786, + "step": 20470 + }, + { + "epoch": 14.607703281027105, + "grad_norm": 10.6875, + "learning_rate": 2.0093333333333333e-05, + "loss": 0.7662, + "step": 20480 + }, + { + "epoch": 14.614835948644792, + "grad_norm": 7.375, + "learning_rate": 2.0048888888888888e-05, + "loss": 0.7225, + "step": 20490 + }, + { + "epoch": 14.621968616262482, + "grad_norm": 9.0625, + "learning_rate": 2.0004444444444447e-05, + "loss": 0.7034, + "step": 20500 + }, + { + "epoch": 14.621968616262482, + "eval/acc": 51.16279220581055, + "step": 20500 + }, + { + "epoch": 14.621968616262482, + "eval_loss": 2.564755916595459, + "eval_runtime": 0.2647, + "eval_samples_per_second": 162.462, + "eval_steps_per_second": 3.778, + "step": 20500 + }, + { + "epoch": 14.629101283880171, + "grad_norm": 34.25, + "learning_rate": 1.9960000000000002e-05, + "loss": 0.7003, + "step": 20510 + }, + { + "epoch": 14.63623395149786, + "grad_norm": 7.53125, + "learning_rate": 1.9915555555555557e-05, + "loss": 0.6718, + "step": 20520 + }, + { + "epoch": 14.643366619115548, + "grad_norm": 8.375, + "learning_rate": 1.9871111111111112e-05, + "loss": 0.7539, + "step": 20530 + }, + { + "epoch": 14.650499286733238, + "grad_norm": 6.90625, + "learning_rate": 1.9826666666666668e-05, + "loss": 0.7997, + "step": 20540 + }, + { + "epoch": 14.657631954350927, + "grad_norm": 7.53125, + "learning_rate": 1.9782222222222223e-05, + "loss": 0.7655, + "step": 20550 + }, + { + "epoch": 14.664764621968617, + "grad_norm": 8.25, + "learning_rate": 1.973777777777778e-05, + "loss": 0.7282, + "step": 20560 + }, + { + "epoch": 14.671897289586305, + "grad_norm": 6.875, + "learning_rate": 1.9693333333333333e-05, + "loss": 0.8961, + "step": 20570 + }, + { + "epoch": 14.679029957203994, + "grad_norm": 7.71875, + "learning_rate": 1.964888888888889e-05, + "loss": 0.8167, + "step": 20580 + }, + { + "epoch": 14.686162624821684, + "grad_norm": 7.4375, + "learning_rate": 1.9604444444444444e-05, + "loss": 0.7412, + "step": 20590 + }, + { + "epoch": 14.693295292439373, + "grad_norm": 7.625, + "learning_rate": 1.956e-05, + "loss": 0.7148, + "step": 20600 + }, + { + "epoch": 14.693295292439373, + "eval/acc": 53.488372802734375, + "step": 20600 + }, + { + "epoch": 14.693295292439373, + "eval_loss": 2.580390691757202, + "eval_runtime": 0.2183, + "eval_samples_per_second": 196.951, + "eval_steps_per_second": 4.58, + "step": 20600 + }, + { + "epoch": 14.70042796005706, + "grad_norm": 6.5, + "learning_rate": 1.9515555555555558e-05, + "loss": 0.7357, + "step": 20610 + }, + { + "epoch": 14.70756062767475, + "grad_norm": 5.625, + "learning_rate": 1.9471111111111113e-05, + "loss": 0.771, + "step": 20620 + }, + { + "epoch": 14.71469329529244, + "grad_norm": 8.625, + "learning_rate": 1.9426666666666668e-05, + "loss": 0.8076, + "step": 20630 + }, + { + "epoch": 14.721825962910128, + "grad_norm": 7.625, + "learning_rate": 1.9382222222222223e-05, + "loss": 0.7406, + "step": 20640 + }, + { + "epoch": 14.728958630527817, + "grad_norm": 7.0, + "learning_rate": 1.933777777777778e-05, + "loss": 0.703, + "step": 20650 + }, + { + "epoch": 14.736091298145507, + "grad_norm": 7.5, + "learning_rate": 1.9293333333333334e-05, + "loss": 0.8349, + "step": 20660 + }, + { + "epoch": 14.743223965763196, + "grad_norm": 7.34375, + "learning_rate": 1.924888888888889e-05, + "loss": 0.7207, + "step": 20670 + }, + { + "epoch": 14.750356633380884, + "grad_norm": 6.78125, + "learning_rate": 1.9204444444444444e-05, + "loss": 0.7787, + "step": 20680 + }, + { + "epoch": 14.757489300998573, + "grad_norm": 7.875, + "learning_rate": 1.916e-05, + "loss": 0.7411, + "step": 20690 + }, + { + "epoch": 14.764621968616263, + "grad_norm": 7.5, + "learning_rate": 1.9115555555555555e-05, + "loss": 0.7974, + "step": 20700 + }, + { + "epoch": 14.764621968616263, + "eval/acc": 53.488372802734375, + "step": 20700 + }, + { + "epoch": 14.764621968616263, + "eval_loss": 2.5736501216888428, + "eval_runtime": 0.2193, + "eval_samples_per_second": 196.101, + "eval_steps_per_second": 4.56, + "step": 20700 + }, + { + "epoch": 14.771754636233952, + "grad_norm": 8.1875, + "learning_rate": 1.9071111111111113e-05, + "loss": 0.7605, + "step": 20710 + }, + { + "epoch": 14.77888730385164, + "grad_norm": 7.90625, + "learning_rate": 1.902666666666667e-05, + "loss": 0.9048, + "step": 20720 + }, + { + "epoch": 14.78601997146933, + "grad_norm": 7.25, + "learning_rate": 1.8982222222222224e-05, + "loss": 0.7619, + "step": 20730 + }, + { + "epoch": 14.793152639087019, + "grad_norm": 6.65625, + "learning_rate": 1.893777777777778e-05, + "loss": 0.7385, + "step": 20740 + }, + { + "epoch": 14.800285306704708, + "grad_norm": 7.96875, + "learning_rate": 1.8893333333333334e-05, + "loss": 0.7685, + "step": 20750 + }, + { + "epoch": 14.807417974322396, + "grad_norm": 8.375, + "learning_rate": 1.884888888888889e-05, + "loss": 0.7639, + "step": 20760 + }, + { + "epoch": 14.814550641940086, + "grad_norm": 9.5625, + "learning_rate": 1.8804444444444445e-05, + "loss": 0.9127, + "step": 20770 + }, + { + "epoch": 14.821683309557775, + "grad_norm": 8.0625, + "learning_rate": 1.876e-05, + "loss": 0.6761, + "step": 20780 + }, + { + "epoch": 14.828815977175463, + "grad_norm": 7.0, + "learning_rate": 1.8715555555555555e-05, + "loss": 0.7239, + "step": 20790 + }, + { + "epoch": 14.835948644793152, + "grad_norm": 7.15625, + "learning_rate": 1.867111111111111e-05, + "loss": 0.721, + "step": 20800 + }, + { + "epoch": 14.835948644793152, + "eval/acc": 51.16279220581055, + "step": 20800 + }, + { + "epoch": 14.835948644793152, + "eval_loss": 2.592449426651001, + "eval_runtime": 0.2341, + "eval_samples_per_second": 183.686, + "eval_steps_per_second": 4.272, + "step": 20800 + }, + { + "epoch": 14.843081312410842, + "grad_norm": 7.84375, + "learning_rate": 1.8626666666666666e-05, + "loss": 0.7113, + "step": 20810 + }, + { + "epoch": 14.850213980028531, + "grad_norm": 7.09375, + "learning_rate": 1.8582222222222224e-05, + "loss": 0.7946, + "step": 20820 + }, + { + "epoch": 14.857346647646219, + "grad_norm": 8.8125, + "learning_rate": 1.853777777777778e-05, + "loss": 0.8066, + "step": 20830 + }, + { + "epoch": 14.864479315263909, + "grad_norm": 5.375, + "learning_rate": 1.8493333333333335e-05, + "loss": 0.7468, + "step": 20840 + }, + { + "epoch": 14.871611982881598, + "grad_norm": 8.125, + "learning_rate": 1.844888888888889e-05, + "loss": 0.772, + "step": 20850 + }, + { + "epoch": 14.878744650499288, + "grad_norm": 11.0625, + "learning_rate": 1.8404444444444445e-05, + "loss": 0.8239, + "step": 20860 + }, + { + "epoch": 14.885877318116975, + "grad_norm": 9.0, + "learning_rate": 1.8360000000000004e-05, + "loss": 0.8001, + "step": 20870 + }, + { + "epoch": 14.893009985734665, + "grad_norm": 6.71875, + "learning_rate": 1.8315555555555556e-05, + "loss": 0.7234, + "step": 20880 + }, + { + "epoch": 14.900142653352354, + "grad_norm": 7.125, + "learning_rate": 1.827111111111111e-05, + "loss": 0.7657, + "step": 20890 + }, + { + "epoch": 14.907275320970044, + "grad_norm": 7.0625, + "learning_rate": 1.8226666666666666e-05, + "loss": 0.8357, + "step": 20900 + }, + { + "epoch": 14.907275320970044, + "eval/acc": 51.16279220581055, + "step": 20900 + }, + { + "epoch": 14.907275320970044, + "eval_loss": 2.567725658416748, + "eval_runtime": 0.2255, + "eval_samples_per_second": 190.652, + "eval_steps_per_second": 4.434, + "step": 20900 + }, + { + "epoch": 14.914407988587731, + "grad_norm": 6.28125, + "learning_rate": 1.818222222222222e-05, + "loss": 0.7335, + "step": 20910 + }, + { + "epoch": 14.921540656205421, + "grad_norm": 9.25, + "learning_rate": 1.813777777777778e-05, + "loss": 0.7241, + "step": 20920 + }, + { + "epoch": 14.92867332382311, + "grad_norm": 8.1875, + "learning_rate": 1.8093333333333335e-05, + "loss": 0.8071, + "step": 20930 + }, + { + "epoch": 14.935805991440798, + "grad_norm": 7.6875, + "learning_rate": 1.804888888888889e-05, + "loss": 0.7734, + "step": 20940 + }, + { + "epoch": 14.942938659058488, + "grad_norm": 11.125, + "learning_rate": 1.8004444444444446e-05, + "loss": 0.8113, + "step": 20950 + }, + { + "epoch": 14.950071326676177, + "grad_norm": 15.875, + "learning_rate": 1.796e-05, + "loss": 0.747, + "step": 20960 + }, + { + "epoch": 14.957203994293867, + "grad_norm": 8.0625, + "learning_rate": 1.7915555555555556e-05, + "loss": 0.7377, + "step": 20970 + }, + { + "epoch": 14.964336661911554, + "grad_norm": 7.0, + "learning_rate": 1.787111111111111e-05, + "loss": 0.7357, + "step": 20980 + }, + { + "epoch": 14.971469329529244, + "grad_norm": 7.6875, + "learning_rate": 1.7826666666666667e-05, + "loss": 0.7443, + "step": 20990 + }, + { + "epoch": 14.978601997146933, + "grad_norm": 7.3125, + "learning_rate": 1.7782222222222222e-05, + "loss": 0.7437, + "step": 21000 + }, + { + "epoch": 14.978601997146933, + "eval/acc": 51.16279220581055, + "step": 21000 + }, + { + "epoch": 14.978601997146933, + "eval_loss": 2.595672369003296, + "eval_runtime": 0.2171, + "eval_samples_per_second": 198.079, + "eval_steps_per_second": 4.606, + "step": 21000 + }, + { + "epoch": 14.985734664764623, + "grad_norm": 14.0625, + "learning_rate": 1.7737777777777777e-05, + "loss": 0.7814, + "step": 21010 + }, + { + "epoch": 14.99286733238231, + "grad_norm": 6.75, + "learning_rate": 1.7693333333333336e-05, + "loss": 0.7794, + "step": 21020 + }, + { + "epoch": 15.0, + "grad_norm": 6.96875, + "learning_rate": 1.764888888888889e-05, + "loss": 0.7931, + "step": 21030 + }, + { + "epoch": 15.00713266761769, + "grad_norm": 7.8125, + "learning_rate": 1.7604444444444446e-05, + "loss": 0.8088, + "step": 21040 + }, + { + "epoch": 15.014265335235377, + "grad_norm": 7.0625, + "learning_rate": 1.756e-05, + "loss": 0.7975, + "step": 21050 + }, + { + "epoch": 15.021398002853067, + "grad_norm": 7.125, + "learning_rate": 1.7515555555555557e-05, + "loss": 0.819, + "step": 21060 + }, + { + "epoch": 15.028530670470756, + "grad_norm": 6.5625, + "learning_rate": 1.7471111111111112e-05, + "loss": 0.6872, + "step": 21070 + }, + { + "epoch": 15.035663338088446, + "grad_norm": 6.90625, + "learning_rate": 1.7426666666666667e-05, + "loss": 0.7137, + "step": 21080 + }, + { + "epoch": 15.042796005706133, + "grad_norm": 7.25, + "learning_rate": 1.7382222222222222e-05, + "loss": 0.7424, + "step": 21090 + }, + { + "epoch": 15.049928673323823, + "grad_norm": 8.6875, + "learning_rate": 1.7337777777777777e-05, + "loss": 0.7837, + "step": 21100 + }, + { + "epoch": 15.049928673323823, + "eval/acc": 51.16279220581055, + "step": 21100 + }, + { + "epoch": 15.049928673323823, + "eval_loss": 2.105516195297241, + "eval_runtime": 6.7681, + "eval_samples_per_second": 6.353, + "eval_steps_per_second": 0.148, + "step": 21100 + }, + { + "epoch": 15.057061340941512, + "grad_norm": 7.96875, + "learning_rate": 1.7293333333333333e-05, + "loss": 0.7133, + "step": 21110 + }, + { + "epoch": 15.064194008559202, + "grad_norm": 5.75, + "learning_rate": 1.7248888888888888e-05, + "loss": 0.6913, + "step": 21120 + }, + { + "epoch": 15.07132667617689, + "grad_norm": 18.5, + "learning_rate": 1.7204444444444446e-05, + "loss": 0.7564, + "step": 21130 + }, + { + "epoch": 15.078459343794579, + "grad_norm": 8.625, + "learning_rate": 1.7160000000000002e-05, + "loss": 0.7526, + "step": 21140 + }, + { + "epoch": 15.085592011412269, + "grad_norm": 10.6875, + "learning_rate": 1.7115555555555557e-05, + "loss": 0.7741, + "step": 21150 + }, + { + "epoch": 15.092724679029958, + "grad_norm": 7.09375, + "learning_rate": 1.7071111111111112e-05, + "loss": 0.7387, + "step": 21160 + }, + { + "epoch": 15.099857346647646, + "grad_norm": 7.1875, + "learning_rate": 1.7026666666666667e-05, + "loss": 0.7139, + "step": 21170 + }, + { + "epoch": 15.106990014265335, + "grad_norm": 9.75, + "learning_rate": 1.6982222222222226e-05, + "loss": 0.7564, + "step": 21180 + }, + { + "epoch": 15.114122681883025, + "grad_norm": 7.1875, + "learning_rate": 1.6937777777777778e-05, + "loss": 0.7592, + "step": 21190 + }, + { + "epoch": 15.121255349500712, + "grad_norm": 11.4375, + "learning_rate": 1.6893333333333333e-05, + "loss": 0.7444, + "step": 21200 + }, + { + "epoch": 15.121255349500712, + "eval/acc": 51.16279220581055, + "step": 21200 + }, + { + "epoch": 15.121255349500712, + "eval_loss": 2.099931478500366, + "eval_runtime": 0.2305, + "eval_samples_per_second": 186.541, + "eval_steps_per_second": 4.338, + "step": 21200 + }, + { + "epoch": 15.128388017118402, + "grad_norm": 8.5625, + "learning_rate": 1.684888888888889e-05, + "loss": 0.7643, + "step": 21210 + }, + { + "epoch": 15.135520684736091, + "grad_norm": 7.40625, + "learning_rate": 1.6804444444444444e-05, + "loss": 0.7471, + "step": 21220 + }, + { + "epoch": 15.142653352353781, + "grad_norm": 6.0, + "learning_rate": 1.6760000000000002e-05, + "loss": 0.8479, + "step": 21230 + }, + { + "epoch": 15.149786019971469, + "grad_norm": 8.3125, + "learning_rate": 1.6715555555555557e-05, + "loss": 0.8082, + "step": 21240 + }, + { + "epoch": 15.156918687589158, + "grad_norm": 7.375, + "learning_rate": 1.6671111111111113e-05, + "loss": 0.7453, + "step": 21250 + }, + { + "epoch": 15.164051355206848, + "grad_norm": 6.0, + "learning_rate": 1.6626666666666668e-05, + "loss": 0.784, + "step": 21260 + }, + { + "epoch": 15.171184022824537, + "grad_norm": 9.5, + "learning_rate": 1.6582222222222223e-05, + "loss": 0.7564, + "step": 21270 + }, + { + "epoch": 15.178316690442225, + "grad_norm": 7.90625, + "learning_rate": 1.6537777777777778e-05, + "loss": 0.7713, + "step": 21280 + }, + { + "epoch": 15.185449358059914, + "grad_norm": 7.0, + "learning_rate": 1.6493333333333334e-05, + "loss": 0.818, + "step": 21290 + }, + { + "epoch": 15.192582025677604, + "grad_norm": 7.15625, + "learning_rate": 1.644888888888889e-05, + "loss": 0.7564, + "step": 21300 + }, + { + "epoch": 15.192582025677604, + "eval/acc": 48.83720779418945, + "step": 21300 + }, + { + "epoch": 15.192582025677604, + "eval_loss": 2.1149563789367676, + "eval_runtime": 0.2268, + "eval_samples_per_second": 189.56, + "eval_steps_per_second": 4.408, + "step": 21300 + }, + { + "epoch": 15.199714693295292, + "grad_norm": 8.0, + "learning_rate": 1.6404444444444444e-05, + "loss": 0.7391, + "step": 21310 + }, + { + "epoch": 15.206847360912981, + "grad_norm": 9.5625, + "learning_rate": 1.636e-05, + "loss": 0.7907, + "step": 21320 + }, + { + "epoch": 15.21398002853067, + "grad_norm": 6.25, + "learning_rate": 1.6315555555555558e-05, + "loss": 0.7595, + "step": 21330 + }, + { + "epoch": 15.22111269614836, + "grad_norm": 10.25, + "learning_rate": 1.6271111111111113e-05, + "loss": 0.7629, + "step": 21340 + }, + { + "epoch": 15.228245363766048, + "grad_norm": 8.6875, + "learning_rate": 1.6226666666666668e-05, + "loss": 0.7305, + "step": 21350 + }, + { + "epoch": 15.235378031383737, + "grad_norm": 137.0, + "learning_rate": 1.6182222222222224e-05, + "loss": 0.7605, + "step": 21360 + }, + { + "epoch": 15.242510699001427, + "grad_norm": 6.75, + "learning_rate": 1.613777777777778e-05, + "loss": 0.7498, + "step": 21370 + }, + { + "epoch": 15.249643366619116, + "grad_norm": 6.46875, + "learning_rate": 1.6093333333333334e-05, + "loss": 0.773, + "step": 21380 + }, + { + "epoch": 15.256776034236804, + "grad_norm": 9.75, + "learning_rate": 1.604888888888889e-05, + "loss": 0.7547, + "step": 21390 + }, + { + "epoch": 15.263908701854493, + "grad_norm": 6.71875, + "learning_rate": 1.6004444444444444e-05, + "loss": 0.7922, + "step": 21400 + }, + { + "epoch": 15.263908701854493, + "eval/acc": 51.16279220581055, + "step": 21400 + }, + { + "epoch": 15.263908701854493, + "eval_loss": 2.0905489921569824, + "eval_runtime": 0.2297, + "eval_samples_per_second": 187.229, + "eval_steps_per_second": 4.354, + "step": 21400 + }, + { + "epoch": 15.271041369472183, + "grad_norm": 9.125, + "learning_rate": 1.596e-05, + "loss": 0.7484, + "step": 21410 + }, + { + "epoch": 15.278174037089872, + "grad_norm": 7.4375, + "learning_rate": 1.5915555555555555e-05, + "loss": 0.7577, + "step": 21420 + }, + { + "epoch": 15.28530670470756, + "grad_norm": 6.5, + "learning_rate": 1.587111111111111e-05, + "loss": 0.7756, + "step": 21430 + }, + { + "epoch": 15.29243937232525, + "grad_norm": 7.65625, + "learning_rate": 1.582666666666667e-05, + "loss": 0.7254, + "step": 21440 + }, + { + "epoch": 15.29957203994294, + "grad_norm": 6.96875, + "learning_rate": 1.5782222222222224e-05, + "loss": 0.7657, + "step": 21450 + }, + { + "epoch": 15.306704707560627, + "grad_norm": 7.5, + "learning_rate": 1.573777777777778e-05, + "loss": 0.8066, + "step": 21460 + }, + { + "epoch": 15.313837375178316, + "grad_norm": 8.5625, + "learning_rate": 1.5693333333333334e-05, + "loss": 0.7305, + "step": 21470 + }, + { + "epoch": 15.320970042796006, + "grad_norm": 16.5, + "learning_rate": 1.564888888888889e-05, + "loss": 0.769, + "step": 21480 + }, + { + "epoch": 15.328102710413695, + "grad_norm": 8.4375, + "learning_rate": 1.5604444444444445e-05, + "loss": 0.8408, + "step": 21490 + }, + { + "epoch": 15.335235378031383, + "grad_norm": 8.375, + "learning_rate": 1.556e-05, + "loss": 0.7811, + "step": 21500 + }, + { + "epoch": 15.335235378031383, + "eval/acc": 51.16279220581055, + "step": 21500 + }, + { + "epoch": 15.335235378031383, + "eval_loss": 2.1078341007232666, + "eval_runtime": 0.2375, + "eval_samples_per_second": 181.063, + "eval_steps_per_second": 4.211, + "step": 21500 + }, + { + "epoch": 15.342368045649073, + "grad_norm": 8.3125, + "learning_rate": 1.5515555555555555e-05, + "loss": 0.7697, + "step": 21510 + }, + { + "epoch": 15.349500713266762, + "grad_norm": 8.3125, + "learning_rate": 1.547111111111111e-05, + "loss": 0.7481, + "step": 21520 + }, + { + "epoch": 15.356633380884452, + "grad_norm": 6.3125, + "learning_rate": 1.5426666666666666e-05, + "loss": 0.8076, + "step": 21530 + }, + { + "epoch": 15.36376604850214, + "grad_norm": 6.40625, + "learning_rate": 1.5382222222222224e-05, + "loss": 0.803, + "step": 21540 + }, + { + "epoch": 15.370898716119829, + "grad_norm": 10.0625, + "learning_rate": 1.533777777777778e-05, + "loss": 0.7999, + "step": 21550 + }, + { + "epoch": 15.378031383737518, + "grad_norm": 7.5, + "learning_rate": 1.5293333333333335e-05, + "loss": 0.7886, + "step": 21560 + }, + { + "epoch": 15.385164051355208, + "grad_norm": 9.375, + "learning_rate": 1.524888888888889e-05, + "loss": 0.7446, + "step": 21570 + }, + { + "epoch": 15.392296718972895, + "grad_norm": 6.40625, + "learning_rate": 1.5204444444444445e-05, + "loss": 0.7721, + "step": 21580 + }, + { + "epoch": 15.399429386590585, + "grad_norm": 6.0, + "learning_rate": 1.5160000000000002e-05, + "loss": 0.8235, + "step": 21590 + }, + { + "epoch": 15.406562054208274, + "grad_norm": 9.625, + "learning_rate": 1.5115555555555556e-05, + "loss": 0.7322, + "step": 21600 + }, + { + "epoch": 15.406562054208274, + "eval/acc": 48.83720779418945, + "step": 21600 + }, + { + "epoch": 15.406562054208274, + "eval_loss": 2.0801358222961426, + "eval_runtime": 0.2459, + "eval_samples_per_second": 174.833, + "eval_steps_per_second": 4.066, + "step": 21600 + }, + { + "epoch": 15.413694721825962, + "grad_norm": 6.46875, + "learning_rate": 1.5071111111111111e-05, + "loss": 0.7196, + "step": 21610 + }, + { + "epoch": 15.420827389443652, + "grad_norm": 7.03125, + "learning_rate": 1.5026666666666666e-05, + "loss": 0.7608, + "step": 21620 + }, + { + "epoch": 15.427960057061341, + "grad_norm": 8.1875, + "learning_rate": 1.4982222222222223e-05, + "loss": 0.7963, + "step": 21630 + }, + { + "epoch": 15.43509272467903, + "grad_norm": 9.8125, + "learning_rate": 1.4937777777777778e-05, + "loss": 0.8916, + "step": 21640 + }, + { + "epoch": 15.442225392296718, + "grad_norm": 8.0, + "learning_rate": 1.4893333333333334e-05, + "loss": 0.7113, + "step": 21650 + }, + { + "epoch": 15.449358059914408, + "grad_norm": 7.71875, + "learning_rate": 1.484888888888889e-05, + "loss": 0.784, + "step": 21660 + }, + { + "epoch": 15.456490727532097, + "grad_norm": 13.0, + "learning_rate": 1.4804444444444446e-05, + "loss": 0.6864, + "step": 21670 + }, + { + "epoch": 15.463623395149787, + "grad_norm": 6.65625, + "learning_rate": 1.4760000000000001e-05, + "loss": 0.7334, + "step": 21680 + }, + { + "epoch": 15.470756062767475, + "grad_norm": 8.25, + "learning_rate": 1.4715555555555558e-05, + "loss": 0.6251, + "step": 21690 + }, + { + "epoch": 15.477888730385164, + "grad_norm": 7.21875, + "learning_rate": 1.467111111111111e-05, + "loss": 0.6724, + "step": 21700 + }, + { + "epoch": 15.477888730385164, + "eval/acc": 48.83720779418945, + "step": 21700 + }, + { + "epoch": 15.477888730385164, + "eval_loss": 2.120603322982788, + "eval_runtime": 0.2216, + "eval_samples_per_second": 194.055, + "eval_steps_per_second": 4.513, + "step": 21700 + }, + { + "epoch": 15.485021398002853, + "grad_norm": 7.1875, + "learning_rate": 1.4626666666666667e-05, + "loss": 0.6897, + "step": 21710 + }, + { + "epoch": 15.492154065620543, + "grad_norm": 7.59375, + "learning_rate": 1.4582222222222222e-05, + "loss": 0.758, + "step": 21720 + }, + { + "epoch": 15.49928673323823, + "grad_norm": 16.5, + "learning_rate": 1.4537777777777777e-05, + "loss": 0.6712, + "step": 21730 + }, + { + "epoch": 15.50641940085592, + "grad_norm": 6.09375, + "learning_rate": 1.4493333333333334e-05, + "loss": 0.8117, + "step": 21740 + }, + { + "epoch": 15.51355206847361, + "grad_norm": 8.8125, + "learning_rate": 1.444888888888889e-05, + "loss": 0.7811, + "step": 21750 + }, + { + "epoch": 15.520684736091297, + "grad_norm": 6.78125, + "learning_rate": 1.4404444444444446e-05, + "loss": 0.8605, + "step": 21760 + }, + { + "epoch": 15.527817403708987, + "grad_norm": 17.25, + "learning_rate": 1.4360000000000001e-05, + "loss": 0.7169, + "step": 21770 + }, + { + "epoch": 15.534950071326676, + "grad_norm": 9.625, + "learning_rate": 1.4315555555555557e-05, + "loss": 0.7355, + "step": 21780 + }, + { + "epoch": 15.542082738944366, + "grad_norm": 7.3125, + "learning_rate": 1.4271111111111114e-05, + "loss": 0.7565, + "step": 21790 + }, + { + "epoch": 15.549215406562054, + "grad_norm": 17.375, + "learning_rate": 1.4226666666666669e-05, + "loss": 0.7719, + "step": 21800 + }, + { + "epoch": 15.549215406562054, + "eval/acc": 51.16279220581055, + "step": 21800 + }, + { + "epoch": 15.549215406562054, + "eval_loss": 2.1045022010803223, + "eval_runtime": 0.2298, + "eval_samples_per_second": 187.156, + "eval_steps_per_second": 4.352, + "step": 21800 + }, + { + "epoch": 15.556348074179743, + "grad_norm": 10.1875, + "learning_rate": 1.4182222222222222e-05, + "loss": 0.8614, + "step": 21810 + }, + { + "epoch": 15.563480741797433, + "grad_norm": 7.15625, + "learning_rate": 1.4137777777777778e-05, + "loss": 0.6956, + "step": 21820 + }, + { + "epoch": 15.570613409415122, + "grad_norm": 7.625, + "learning_rate": 1.4093333333333333e-05, + "loss": 0.7024, + "step": 21830 + }, + { + "epoch": 15.57774607703281, + "grad_norm": 15.875, + "learning_rate": 1.404888888888889e-05, + "loss": 0.7581, + "step": 21840 + }, + { + "epoch": 15.5848787446505, + "grad_norm": 6.625, + "learning_rate": 1.4004444444444445e-05, + "loss": 0.6875, + "step": 21850 + }, + { + "epoch": 15.592011412268189, + "grad_norm": 8.8125, + "learning_rate": 1.396e-05, + "loss": 0.706, + "step": 21860 + }, + { + "epoch": 15.599144079885878, + "grad_norm": 8.25, + "learning_rate": 1.3915555555555557e-05, + "loss": 0.7664, + "step": 21870 + }, + { + "epoch": 15.606276747503566, + "grad_norm": 8.125, + "learning_rate": 1.3871111111111112e-05, + "loss": 0.8298, + "step": 21880 + }, + { + "epoch": 15.613409415121255, + "grad_norm": 7.5, + "learning_rate": 1.3826666666666668e-05, + "loss": 0.8146, + "step": 21890 + }, + { + "epoch": 15.620542082738945, + "grad_norm": 17.75, + "learning_rate": 1.3782222222222224e-05, + "loss": 0.7299, + "step": 21900 + }, + { + "epoch": 15.620542082738945, + "eval/acc": 51.16279220581055, + "step": 21900 + }, + { + "epoch": 15.620542082738945, + "eval_loss": 2.1079678535461426, + "eval_runtime": 0.2212, + "eval_samples_per_second": 194.362, + "eval_steps_per_second": 4.52, + "step": 21900 + }, + { + "epoch": 15.627674750356633, + "grad_norm": 7.21875, + "learning_rate": 1.3737777777777778e-05, + "loss": 0.7489, + "step": 21910 + }, + { + "epoch": 15.634807417974322, + "grad_norm": 7.5, + "learning_rate": 1.3693333333333333e-05, + "loss": 0.7586, + "step": 21920 + }, + { + "epoch": 15.641940085592012, + "grad_norm": 7.375, + "learning_rate": 1.3648888888888888e-05, + "loss": 0.6981, + "step": 21930 + }, + { + "epoch": 15.649072753209701, + "grad_norm": 8.25, + "learning_rate": 1.3604444444444445e-05, + "loss": 0.7119, + "step": 21940 + }, + { + "epoch": 15.656205420827389, + "grad_norm": 11.6875, + "learning_rate": 1.356e-05, + "loss": 0.8477, + "step": 21950 + }, + { + "epoch": 15.663338088445078, + "grad_norm": 5.78125, + "learning_rate": 1.3515555555555556e-05, + "loss": 0.7083, + "step": 21960 + }, + { + "epoch": 15.670470756062768, + "grad_norm": 6.40625, + "learning_rate": 1.3471111111111113e-05, + "loss": 0.7326, + "step": 21970 + }, + { + "epoch": 15.677603423680456, + "grad_norm": 13.75, + "learning_rate": 1.3426666666666668e-05, + "loss": 0.8506, + "step": 21980 + }, + { + "epoch": 15.684736091298145, + "grad_norm": 10.1875, + "learning_rate": 1.3382222222222223e-05, + "loss": 0.8144, + "step": 21990 + }, + { + "epoch": 15.691868758915835, + "grad_norm": 6.78125, + "learning_rate": 1.333777777777778e-05, + "loss": 0.735, + "step": 22000 + }, + { + "epoch": 15.691868758915835, + "eval/acc": 51.16279220581055, + "step": 22000 + }, + { + "epoch": 15.691868758915835, + "eval_loss": 2.1133952140808105, + "eval_runtime": 0.2219, + "eval_samples_per_second": 193.737, + "eval_steps_per_second": 4.506, + "step": 22000 + }, + { + "epoch": 15.699001426533524, + "grad_norm": 21.625, + "learning_rate": 1.3293333333333332e-05, + "loss": 0.7367, + "step": 22010 + }, + { + "epoch": 15.706134094151212, + "grad_norm": 6.25, + "learning_rate": 1.3248888888888889e-05, + "loss": 0.8047, + "step": 22020 + }, + { + "epoch": 15.713266761768901, + "grad_norm": 15.8125, + "learning_rate": 1.3204444444444444e-05, + "loss": 0.7082, + "step": 22030 + }, + { + "epoch": 15.72039942938659, + "grad_norm": 7.25, + "learning_rate": 1.316e-05, + "loss": 0.7995, + "step": 22040 + }, + { + "epoch": 15.72753209700428, + "grad_norm": 7.65625, + "learning_rate": 1.3115555555555556e-05, + "loss": 0.7402, + "step": 22050 + }, + { + "epoch": 15.734664764621968, + "grad_norm": 7.5, + "learning_rate": 1.3071111111111112e-05, + "loss": 0.7339, + "step": 22060 + }, + { + "epoch": 15.741797432239657, + "grad_norm": 9.0, + "learning_rate": 1.3026666666666667e-05, + "loss": 0.7367, + "step": 22070 + }, + { + "epoch": 15.748930099857347, + "grad_norm": 16.875, + "learning_rate": 1.2982222222222224e-05, + "loss": 0.7468, + "step": 22080 + }, + { + "epoch": 15.756062767475036, + "grad_norm": 6.53125, + "learning_rate": 1.2937777777777779e-05, + "loss": 0.6916, + "step": 22090 + }, + { + "epoch": 15.763195435092724, + "grad_norm": 6.625, + "learning_rate": 1.2893333333333336e-05, + "loss": 0.765, + "step": 22100 + }, + { + "epoch": 15.763195435092724, + "eval/acc": 48.83720779418945, + "step": 22100 + }, + { + "epoch": 15.763195435092724, + "eval_loss": 2.126471996307373, + "eval_runtime": 0.2307, + "eval_samples_per_second": 186.402, + "eval_steps_per_second": 4.335, + "step": 22100 + }, + { + "epoch": 15.770328102710414, + "grad_norm": 22.75, + "learning_rate": 1.2848888888888891e-05, + "loss": 0.8085, + "step": 22110 + }, + { + "epoch": 15.777460770328103, + "grad_norm": 9.4375, + "learning_rate": 1.2804444444444445e-05, + "loss": 0.7648, + "step": 22120 + }, + { + "epoch": 15.78459343794579, + "grad_norm": 6.625, + "learning_rate": 1.276e-05, + "loss": 0.7852, + "step": 22130 + }, + { + "epoch": 15.79172610556348, + "grad_norm": 10.0, + "learning_rate": 1.2715555555555555e-05, + "loss": 0.8195, + "step": 22140 + }, + { + "epoch": 15.79885877318117, + "grad_norm": 6.59375, + "learning_rate": 1.2671111111111112e-05, + "loss": 0.7586, + "step": 22150 + }, + { + "epoch": 15.80599144079886, + "grad_norm": 6.84375, + "learning_rate": 1.2626666666666667e-05, + "loss": 0.6953, + "step": 22160 + }, + { + "epoch": 15.813124108416547, + "grad_norm": 7.90625, + "learning_rate": 1.2582222222222222e-05, + "loss": 0.7339, + "step": 22170 + }, + { + "epoch": 15.820256776034237, + "grad_norm": 9.25, + "learning_rate": 1.253777777777778e-05, + "loss": 0.7394, + "step": 22180 + }, + { + "epoch": 15.827389443651926, + "grad_norm": 26.0, + "learning_rate": 1.2493333333333333e-05, + "loss": 0.6945, + "step": 22190 + }, + { + "epoch": 15.834522111269616, + "grad_norm": 7.96875, + "learning_rate": 1.244888888888889e-05, + "loss": 0.7714, + "step": 22200 + }, + { + "epoch": 15.834522111269616, + "eval/acc": 51.16279220581055, + "step": 22200 + }, + { + "epoch": 15.834522111269616, + "eval_loss": 2.120994806289673, + "eval_runtime": 0.2302, + "eval_samples_per_second": 186.806, + "eval_steps_per_second": 4.344, + "step": 22200 + }, + { + "epoch": 15.841654778887303, + "grad_norm": 5.5, + "learning_rate": 1.2404444444444445e-05, + "loss": 0.7303, + "step": 22210 + }, + { + "epoch": 15.848787446504993, + "grad_norm": 4.625, + "learning_rate": 1.236e-05, + "loss": 0.6918, + "step": 22220 + }, + { + "epoch": 15.855920114122682, + "grad_norm": 10.25, + "learning_rate": 1.2315555555555557e-05, + "loss": 0.676, + "step": 22230 + }, + { + "epoch": 15.863052781740372, + "grad_norm": 9.0, + "learning_rate": 1.2271111111111112e-05, + "loss": 0.7605, + "step": 22240 + }, + { + "epoch": 15.87018544935806, + "grad_norm": 5.78125, + "learning_rate": 1.2226666666666668e-05, + "loss": 0.7128, + "step": 22250 + }, + { + "epoch": 15.877318116975749, + "grad_norm": 6.8125, + "learning_rate": 1.2182222222222223e-05, + "loss": 0.7533, + "step": 22260 + }, + { + "epoch": 15.884450784593438, + "grad_norm": 34.0, + "learning_rate": 1.2137777777777778e-05, + "loss": 0.8246, + "step": 22270 + }, + { + "epoch": 15.891583452211126, + "grad_norm": 8.4375, + "learning_rate": 1.2093333333333335e-05, + "loss": 0.7453, + "step": 22280 + }, + { + "epoch": 15.898716119828816, + "grad_norm": 9.5, + "learning_rate": 1.204888888888889e-05, + "loss": 0.7955, + "step": 22290 + }, + { + "epoch": 15.905848787446505, + "grad_norm": 9.5, + "learning_rate": 1.2004444444444444e-05, + "loss": 0.794, + "step": 22300 + }, + { + "epoch": 15.905848787446505, + "eval/acc": 51.16279220581055, + "step": 22300 + }, + { + "epoch": 15.905848787446505, + "eval_loss": 2.1340696811676025, + "eval_runtime": 7.8966, + "eval_samples_per_second": 5.445, + "eval_steps_per_second": 0.127, + "step": 22300 + }, + { + "epoch": 15.912981455064195, + "grad_norm": 6.8125, + "learning_rate": 1.196e-05, + "loss": 0.7258, + "step": 22310 + }, + { + "epoch": 15.920114122681882, + "grad_norm": 8.1875, + "learning_rate": 1.1915555555555556e-05, + "loss": 0.7714, + "step": 22320 + }, + { + "epoch": 15.927246790299572, + "grad_norm": 24.0, + "learning_rate": 1.1871111111111113e-05, + "loss": 0.8007, + "step": 22330 + }, + { + "epoch": 15.934379457917261, + "grad_norm": 8.125, + "learning_rate": 1.1826666666666668e-05, + "loss": 0.7889, + "step": 22340 + }, + { + "epoch": 15.94151212553495, + "grad_norm": 7.625, + "learning_rate": 1.1782222222222222e-05, + "loss": 0.8811, + "step": 22350 + }, + { + "epoch": 15.948644793152638, + "grad_norm": 7.78125, + "learning_rate": 1.1737777777777779e-05, + "loss": 0.7475, + "step": 22360 + }, + { + "epoch": 15.955777460770328, + "grad_norm": 7.59375, + "learning_rate": 1.1693333333333334e-05, + "loss": 0.8789, + "step": 22370 + }, + { + "epoch": 15.962910128388017, + "grad_norm": 10.125, + "learning_rate": 1.1648888888888889e-05, + "loss": 0.695, + "step": 22380 + }, + { + "epoch": 15.970042796005707, + "grad_norm": 8.125, + "learning_rate": 1.1604444444444446e-05, + "loss": 0.8267, + "step": 22390 + }, + { + "epoch": 15.977175463623395, + "grad_norm": 17.125, + "learning_rate": 1.156e-05, + "loss": 0.786, + "step": 22400 + }, + { + "epoch": 15.977175463623395, + "eval/acc": 51.16279220581055, + "step": 22400 + }, + { + "epoch": 15.977175463623395, + "eval_loss": 2.109499216079712, + "eval_runtime": 0.2295, + "eval_samples_per_second": 187.377, + "eval_steps_per_second": 4.358, + "step": 22400 + }, + { + "epoch": 15.984308131241084, + "grad_norm": 9.8125, + "learning_rate": 1.1515555555555556e-05, + "loss": 0.8732, + "step": 22410 + }, + { + "epoch": 15.991440798858774, + "grad_norm": 59.25, + "learning_rate": 1.1471111111111112e-05, + "loss": 0.7541, + "step": 22420 + }, + { + "epoch": 15.998573466476461, + "grad_norm": 9.9375, + "learning_rate": 1.1426666666666667e-05, + "loss": 0.7847, + "step": 22430 + }, + { + "epoch": 16.00570613409415, + "grad_norm": 7.5625, + "learning_rate": 1.1382222222222224e-05, + "loss": 0.7616, + "step": 22440 + }, + { + "epoch": 16.01283880171184, + "grad_norm": 7.90625, + "learning_rate": 1.1337777777777777e-05, + "loss": 0.7538, + "step": 22450 + }, + { + "epoch": 16.01997146932953, + "grad_norm": 5.1875, + "learning_rate": 1.1293333333333334e-05, + "loss": 0.6573, + "step": 22460 + }, + { + "epoch": 16.02710413694722, + "grad_norm": 6.4375, + "learning_rate": 1.124888888888889e-05, + "loss": 0.6845, + "step": 22470 + }, + { + "epoch": 16.03423680456491, + "grad_norm": 6.875, + "learning_rate": 1.1204444444444445e-05, + "loss": 0.7176, + "step": 22480 + }, + { + "epoch": 16.041369472182595, + "grad_norm": 7.8125, + "learning_rate": 1.1160000000000002e-05, + "loss": 0.6968, + "step": 22490 + }, + { + "epoch": 16.048502139800284, + "grad_norm": 7.3125, + "learning_rate": 1.1115555555555555e-05, + "loss": 0.7667, + "step": 22500 + }, + { + "epoch": 16.048502139800284, + "eval/acc": 46.511627197265625, + "step": 22500 + }, + { + "epoch": 16.048502139800284, + "eval_loss": 2.3885626792907715, + "eval_runtime": 7.6399, + "eval_samples_per_second": 5.628, + "eval_steps_per_second": 0.131, + "step": 22500 + }, + { + "epoch": 16.055634807417974, + "grad_norm": 8.125, + "learning_rate": 1.1071111111111112e-05, + "loss": 0.6981, + "step": 22510 + }, + { + "epoch": 16.062767475035663, + "grad_norm": 8.0, + "learning_rate": 1.1026666666666667e-05, + "loss": 0.8857, + "step": 22520 + }, + { + "epoch": 16.069900142653353, + "grad_norm": 7.21875, + "learning_rate": 1.0982222222222222e-05, + "loss": 0.7247, + "step": 22530 + }, + { + "epoch": 16.077032810271042, + "grad_norm": 8.8125, + "learning_rate": 1.093777777777778e-05, + "loss": 0.7655, + "step": 22540 + }, + { + "epoch": 16.08416547788873, + "grad_norm": 8.3125, + "learning_rate": 1.0893333333333333e-05, + "loss": 0.7946, + "step": 22550 + }, + { + "epoch": 16.091298145506418, + "grad_norm": 7.09375, + "learning_rate": 1.0848888888888888e-05, + "loss": 0.7194, + "step": 22560 + }, + { + "epoch": 16.098430813124107, + "grad_norm": 6.9375, + "learning_rate": 1.0804444444444445e-05, + "loss": 0.719, + "step": 22570 + }, + { + "epoch": 16.105563480741797, + "grad_norm": 8.0, + "learning_rate": 1.076e-05, + "loss": 0.7125, + "step": 22580 + }, + { + "epoch": 16.112696148359486, + "grad_norm": 8.125, + "learning_rate": 1.0715555555555557e-05, + "loss": 0.8118, + "step": 22590 + }, + { + "epoch": 16.119828815977176, + "grad_norm": 8.375, + "learning_rate": 1.0671111111111112e-05, + "loss": 0.7747, + "step": 22600 + }, + { + "epoch": 16.119828815977176, + "eval/acc": 46.511627197265625, + "step": 22600 + }, + { + "epoch": 16.119828815977176, + "eval_loss": 2.3585238456726074, + "eval_runtime": 0.2224, + "eval_samples_per_second": 193.314, + "eval_steps_per_second": 4.496, + "step": 22600 + }, + { + "epoch": 16.126961483594865, + "grad_norm": 8.3125, + "learning_rate": 1.0626666666666666e-05, + "loss": 0.7209, + "step": 22610 + }, + { + "epoch": 16.134094151212555, + "grad_norm": 6.9375, + "learning_rate": 1.0582222222222223e-05, + "loss": 0.7399, + "step": 22620 + }, + { + "epoch": 16.141226818830244, + "grad_norm": 8.25, + "learning_rate": 1.0537777777777778e-05, + "loss": 0.7628, + "step": 22630 + }, + { + "epoch": 16.14835948644793, + "grad_norm": 7.03125, + "learning_rate": 1.0493333333333333e-05, + "loss": 0.7365, + "step": 22640 + }, + { + "epoch": 16.15549215406562, + "grad_norm": 7.625, + "learning_rate": 1.044888888888889e-05, + "loss": 0.7585, + "step": 22650 + }, + { + "epoch": 16.16262482168331, + "grad_norm": 7.40625, + "learning_rate": 1.0404444444444444e-05, + "loss": 0.7341, + "step": 22660 + }, + { + "epoch": 16.169757489301, + "grad_norm": 6.875, + "learning_rate": 1.036e-05, + "loss": 0.8297, + "step": 22670 + }, + { + "epoch": 16.176890156918688, + "grad_norm": 8.9375, + "learning_rate": 1.0315555555555556e-05, + "loss": 0.8216, + "step": 22680 + }, + { + "epoch": 16.184022824536378, + "grad_norm": 12.625, + "learning_rate": 1.0271111111111111e-05, + "loss": 0.6936, + "step": 22690 + }, + { + "epoch": 16.191155492154067, + "grad_norm": 5.53125, + "learning_rate": 1.0226666666666668e-05, + "loss": 0.8264, + "step": 22700 + }, + { + "epoch": 16.191155492154067, + "eval/acc": 46.511627197265625, + "step": 22700 + }, + { + "epoch": 16.191155492154067, + "eval_loss": 2.3709049224853516, + "eval_runtime": 0.2215, + "eval_samples_per_second": 194.132, + "eval_steps_per_second": 4.515, + "step": 22700 + }, + { + "epoch": 16.198288159771753, + "grad_norm": 7.3125, + "learning_rate": 1.0182222222222222e-05, + "loss": 0.7458, + "step": 22710 + }, + { + "epoch": 16.205420827389442, + "grad_norm": 8.625, + "learning_rate": 1.0137777777777779e-05, + "loss": 0.8079, + "step": 22720 + }, + { + "epoch": 16.212553495007132, + "grad_norm": 7.53125, + "learning_rate": 1.0093333333333334e-05, + "loss": 0.7028, + "step": 22730 + }, + { + "epoch": 16.21968616262482, + "grad_norm": 8.1875, + "learning_rate": 1.0048888888888889e-05, + "loss": 0.76, + "step": 22740 + }, + { + "epoch": 16.22681883024251, + "grad_norm": 9.0625, + "learning_rate": 1.0004444444444446e-05, + "loss": 0.8226, + "step": 22750 + }, + { + "epoch": 16.2339514978602, + "grad_norm": 7.53125, + "learning_rate": 9.96e-06, + "loss": 0.7254, + "step": 22760 + }, + { + "epoch": 16.24108416547789, + "grad_norm": 42.5, + "learning_rate": 9.915555555555556e-06, + "loss": 0.7925, + "step": 22770 + }, + { + "epoch": 16.24821683309558, + "grad_norm": 13.0, + "learning_rate": 9.871111111111112e-06, + "loss": 0.8108, + "step": 22780 + }, + { + "epoch": 16.255349500713265, + "grad_norm": 6.875, + "learning_rate": 9.826666666666667e-06, + "loss": 0.8118, + "step": 22790 + }, + { + "epoch": 16.262482168330955, + "grad_norm": 8.25, + "learning_rate": 9.782222222222224e-06, + "loss": 0.7531, + "step": 22800 + }, + { + "epoch": 16.262482168330955, + "eval/acc": 44.1860466003418, + "step": 22800 + }, + { + "epoch": 16.262482168330955, + "eval_loss": 2.376155376434326, + "eval_runtime": 0.238, + "eval_samples_per_second": 180.665, + "eval_steps_per_second": 4.202, + "step": 22800 + }, + { + "epoch": 16.269614835948644, + "grad_norm": 8.5, + "learning_rate": 9.737777777777777e-06, + "loss": 0.829, + "step": 22810 + }, + { + "epoch": 16.276747503566334, + "grad_norm": 7.21875, + "learning_rate": 9.693333333333334e-06, + "loss": 0.6652, + "step": 22820 + }, + { + "epoch": 16.283880171184023, + "grad_norm": 8.125, + "learning_rate": 9.64888888888889e-06, + "loss": 0.7132, + "step": 22830 + }, + { + "epoch": 16.291012838801713, + "grad_norm": 9.3125, + "learning_rate": 9.604444444444445e-06, + "loss": 0.6487, + "step": 22840 + }, + { + "epoch": 16.298145506419402, + "grad_norm": 10.0625, + "learning_rate": 9.560000000000002e-06, + "loss": 0.7387, + "step": 22850 + }, + { + "epoch": 16.30527817403709, + "grad_norm": 48.5, + "learning_rate": 9.515555555555555e-06, + "loss": 0.7641, + "step": 22860 + }, + { + "epoch": 16.312410841654778, + "grad_norm": 9.4375, + "learning_rate": 9.47111111111111e-06, + "loss": 0.7797, + "step": 22870 + }, + { + "epoch": 16.319543509272467, + "grad_norm": 6.84375, + "learning_rate": 9.426666666666667e-06, + "loss": 0.7955, + "step": 22880 + }, + { + "epoch": 16.326676176890157, + "grad_norm": 6.6875, + "learning_rate": 9.382222222222223e-06, + "loss": 0.6976, + "step": 22890 + }, + { + "epoch": 16.333808844507846, + "grad_norm": 7.90625, + "learning_rate": 9.337777777777778e-06, + "loss": 0.7808, + "step": 22900 + }, + { + "epoch": 16.333808844507846, + "eval/acc": 46.511627197265625, + "step": 22900 + }, + { + "epoch": 16.333808844507846, + "eval_loss": 2.3756563663482666, + "eval_runtime": 0.2253, + "eval_samples_per_second": 190.817, + "eval_steps_per_second": 4.438, + "step": 22900 + }, + { + "epoch": 16.340941512125536, + "grad_norm": 8.25, + "learning_rate": 9.293333333333335e-06, + "loss": 0.743, + "step": 22910 + }, + { + "epoch": 16.348074179743225, + "grad_norm": 7.4375, + "learning_rate": 9.248888888888888e-06, + "loss": 0.7825, + "step": 22920 + }, + { + "epoch": 16.355206847360915, + "grad_norm": 32.25, + "learning_rate": 9.204444444444445e-06, + "loss": 0.7654, + "step": 22930 + }, + { + "epoch": 16.3623395149786, + "grad_norm": 7.6875, + "learning_rate": 9.16e-06, + "loss": 0.7186, + "step": 22940 + }, + { + "epoch": 16.36947218259629, + "grad_norm": 8.125, + "learning_rate": 9.115555555555556e-06, + "loss": 0.7137, + "step": 22950 + }, + { + "epoch": 16.37660485021398, + "grad_norm": 7.375, + "learning_rate": 9.071111111111113e-06, + "loss": 0.7046, + "step": 22960 + }, + { + "epoch": 16.38373751783167, + "grad_norm": 7.59375, + "learning_rate": 9.026666666666666e-06, + "loss": 0.8367, + "step": 22970 + }, + { + "epoch": 16.39087018544936, + "grad_norm": 14.4375, + "learning_rate": 8.982222222222223e-06, + "loss": 0.8183, + "step": 22980 + }, + { + "epoch": 16.398002853067048, + "grad_norm": 6.8125, + "learning_rate": 8.937777777777778e-06, + "loss": 0.6925, + "step": 22990 + }, + { + "epoch": 16.405135520684738, + "grad_norm": 8.6875, + "learning_rate": 8.893333333333333e-06, + "loss": 0.7486, + "step": 23000 + }, + { + "epoch": 16.405135520684738, + "eval/acc": 46.511627197265625, + "step": 23000 + }, + { + "epoch": 16.405135520684738, + "eval_loss": 2.3669910430908203, + "eval_runtime": 0.2182, + "eval_samples_per_second": 197.042, + "eval_steps_per_second": 4.582, + "step": 23000 + }, + { + "epoch": 16.412268188302424, + "grad_norm": 7.15625, + "learning_rate": 8.84888888888889e-06, + "loss": 0.7187, + "step": 23010 + }, + { + "epoch": 16.419400855920113, + "grad_norm": 7.6875, + "learning_rate": 8.804444444444444e-06, + "loss": 0.8066, + "step": 23020 + }, + { + "epoch": 16.426533523537802, + "grad_norm": 6.6875, + "learning_rate": 8.76e-06, + "loss": 0.7569, + "step": 23030 + }, + { + "epoch": 16.433666191155492, + "grad_norm": 7.28125, + "learning_rate": 8.715555555555556e-06, + "loss": 0.8201, + "step": 23040 + }, + { + "epoch": 16.44079885877318, + "grad_norm": 7.03125, + "learning_rate": 8.671111111111111e-06, + "loss": 0.7451, + "step": 23050 + }, + { + "epoch": 16.44793152639087, + "grad_norm": 7.3125, + "learning_rate": 8.626666666666668e-06, + "loss": 0.8503, + "step": 23060 + }, + { + "epoch": 16.45506419400856, + "grad_norm": 6.90625, + "learning_rate": 8.582222222222222e-06, + "loss": 0.7842, + "step": 23070 + }, + { + "epoch": 16.46219686162625, + "grad_norm": 6.15625, + "learning_rate": 8.537777777777779e-06, + "loss": 0.7893, + "step": 23080 + }, + { + "epoch": 16.469329529243936, + "grad_norm": 8.1875, + "learning_rate": 8.493333333333334e-06, + "loss": 0.7195, + "step": 23090 + }, + { + "epoch": 16.476462196861625, + "grad_norm": 10.3125, + "learning_rate": 8.448888888888889e-06, + "loss": 0.7708, + "step": 23100 + }, + { + "epoch": 16.476462196861625, + "eval/acc": 46.511627197265625, + "step": 23100 + }, + { + "epoch": 16.476462196861625, + "eval_loss": 2.35764741897583, + "eval_runtime": 0.2204, + "eval_samples_per_second": 195.143, + "eval_steps_per_second": 4.538, + "step": 23100 + }, + { + "epoch": 16.483594864479315, + "grad_norm": 7.40625, + "learning_rate": 8.404444444444446e-06, + "loss": 0.6914, + "step": 23110 + }, + { + "epoch": 16.490727532097004, + "grad_norm": 9.75, + "learning_rate": 8.36e-06, + "loss": 0.7454, + "step": 23120 + }, + { + "epoch": 16.497860199714694, + "grad_norm": 10.4375, + "learning_rate": 8.315555555555555e-06, + "loss": 0.793, + "step": 23130 + }, + { + "epoch": 16.504992867332383, + "grad_norm": 7.3125, + "learning_rate": 8.271111111111112e-06, + "loss": 0.7618, + "step": 23140 + }, + { + "epoch": 16.512125534950073, + "grad_norm": 8.3125, + "learning_rate": 8.226666666666667e-06, + "loss": 0.8735, + "step": 23150 + }, + { + "epoch": 16.51925820256776, + "grad_norm": 6.5625, + "learning_rate": 8.182222222222224e-06, + "loss": 0.7696, + "step": 23160 + }, + { + "epoch": 16.52639087018545, + "grad_norm": 7.0, + "learning_rate": 8.137777777777777e-06, + "loss": 0.7254, + "step": 23170 + }, + { + "epoch": 16.533523537803138, + "grad_norm": 6.09375, + "learning_rate": 8.093333333333333e-06, + "loss": 0.8109, + "step": 23180 + }, + { + "epoch": 16.540656205420827, + "grad_norm": 6.84375, + "learning_rate": 8.04888888888889e-06, + "loss": 0.7784, + "step": 23190 + }, + { + "epoch": 16.547788873038517, + "grad_norm": 8.0, + "learning_rate": 8.004444444444445e-06, + "loss": 0.7301, + "step": 23200 + }, + { + "epoch": 16.547788873038517, + "eval/acc": 46.511627197265625, + "step": 23200 + }, + { + "epoch": 16.547788873038517, + "eval_loss": 2.3700337409973145, + "eval_runtime": 0.2463, + "eval_samples_per_second": 174.56, + "eval_steps_per_second": 4.06, + "step": 23200 + }, + { + "epoch": 16.554921540656206, + "grad_norm": 7.96875, + "learning_rate": 7.96e-06, + "loss": 0.7905, + "step": 23210 + }, + { + "epoch": 16.562054208273896, + "grad_norm": 7.9375, + "learning_rate": 7.915555555555557e-06, + "loss": 0.7493, + "step": 23220 + }, + { + "epoch": 16.56918687589158, + "grad_norm": 7.03125, + "learning_rate": 7.87111111111111e-06, + "loss": 0.7384, + "step": 23230 + }, + { + "epoch": 16.57631954350927, + "grad_norm": 9.1875, + "learning_rate": 7.826666666666667e-06, + "loss": 0.8211, + "step": 23240 + }, + { + "epoch": 16.58345221112696, + "grad_norm": 53.75, + "learning_rate": 7.782222222222223e-06, + "loss": 0.8433, + "step": 23250 + }, + { + "epoch": 16.59058487874465, + "grad_norm": 8.125, + "learning_rate": 7.737777777777778e-06, + "loss": 0.7178, + "step": 23260 + }, + { + "epoch": 16.59771754636234, + "grad_norm": 6.34375, + "learning_rate": 7.693333333333335e-06, + "loss": 0.7261, + "step": 23270 + }, + { + "epoch": 16.60485021398003, + "grad_norm": 6.59375, + "learning_rate": 7.648888888888888e-06, + "loss": 0.7469, + "step": 23280 + }, + { + "epoch": 16.61198288159772, + "grad_norm": 5.15625, + "learning_rate": 7.604444444444444e-06, + "loss": 0.6811, + "step": 23290 + }, + { + "epoch": 16.619115549215408, + "grad_norm": 19.875, + "learning_rate": 7.5600000000000005e-06, + "loss": 0.7753, + "step": 23300 + }, + { + "epoch": 16.619115549215408, + "eval/acc": 46.511627197265625, + "step": 23300 + }, + { + "epoch": 16.619115549215408, + "eval_loss": 2.3815970420837402, + "eval_runtime": 0.2316, + "eval_samples_per_second": 185.668, + "eval_steps_per_second": 4.318, + "step": 23300 + }, + { + "epoch": 16.626248216833094, + "grad_norm": 18.875, + "learning_rate": 7.5155555555555565e-06, + "loss": 0.7564, + "step": 23310 + }, + { + "epoch": 16.633380884450784, + "grad_norm": 4.875, + "learning_rate": 7.471111111111112e-06, + "loss": 0.6863, + "step": 23320 + }, + { + "epoch": 16.640513552068473, + "grad_norm": 7.25, + "learning_rate": 7.426666666666666e-06, + "loss": 0.8129, + "step": 23330 + }, + { + "epoch": 16.647646219686163, + "grad_norm": 8.0, + "learning_rate": 7.382222222222222e-06, + "loss": 0.7198, + "step": 23340 + }, + { + "epoch": 16.654778887303852, + "grad_norm": 5.78125, + "learning_rate": 7.337777777777778e-06, + "loss": 0.7266, + "step": 23350 + }, + { + "epoch": 16.66191155492154, + "grad_norm": 7.53125, + "learning_rate": 7.293333333333334e-06, + "loss": 0.7642, + "step": 23360 + }, + { + "epoch": 16.66904422253923, + "grad_norm": 7.40625, + "learning_rate": 7.24888888888889e-06, + "loss": 0.7696, + "step": 23370 + }, + { + "epoch": 16.676176890156917, + "grad_norm": 10.75, + "learning_rate": 7.204444444444444e-06, + "loss": 0.7654, + "step": 23380 + }, + { + "epoch": 16.683309557774606, + "grad_norm": 9.625, + "learning_rate": 7.16e-06, + "loss": 0.7238, + "step": 23390 + }, + { + "epoch": 16.690442225392296, + "grad_norm": 8.625, + "learning_rate": 7.115555555555556e-06, + "loss": 0.7249, + "step": 23400 + }, + { + "epoch": 16.690442225392296, + "eval/acc": 46.511627197265625, + "step": 23400 + }, + { + "epoch": 16.690442225392296, + "eval_loss": 2.3753066062927246, + "eval_runtime": 0.2235, + "eval_samples_per_second": 192.376, + "eval_steps_per_second": 4.474, + "step": 23400 + }, + { + "epoch": 16.697574893009985, + "grad_norm": 6.6875, + "learning_rate": 7.071111111111111e-06, + "loss": 0.6913, + "step": 23410 + }, + { + "epoch": 16.704707560627675, + "grad_norm": 7.96875, + "learning_rate": 7.0266666666666674e-06, + "loss": 0.7034, + "step": 23420 + }, + { + "epoch": 16.711840228245364, + "grad_norm": 16.625, + "learning_rate": 6.982222222222222e-06, + "loss": 0.7675, + "step": 23430 + }, + { + "epoch": 16.718972895863054, + "grad_norm": 8.5625, + "learning_rate": 6.937777777777778e-06, + "loss": 0.7343, + "step": 23440 + }, + { + "epoch": 16.726105563480743, + "grad_norm": 7.75, + "learning_rate": 6.893333333333334e-06, + "loss": 0.8078, + "step": 23450 + }, + { + "epoch": 16.73323823109843, + "grad_norm": 7.5, + "learning_rate": 6.848888888888889e-06, + "loss": 0.8483, + "step": 23460 + }, + { + "epoch": 16.74037089871612, + "grad_norm": 9.0625, + "learning_rate": 6.804444444444445e-06, + "loss": 0.7703, + "step": 23470 + }, + { + "epoch": 16.74750356633381, + "grad_norm": 8.3125, + "learning_rate": 6.76e-06, + "loss": 0.8339, + "step": 23480 + }, + { + "epoch": 16.754636233951498, + "grad_norm": 7.53125, + "learning_rate": 6.715555555555556e-06, + "loss": 0.7384, + "step": 23490 + }, + { + "epoch": 16.761768901569187, + "grad_norm": 6.96875, + "learning_rate": 6.671111111111111e-06, + "loss": 0.7953, + "step": 23500 + }, + { + "epoch": 16.761768901569187, + "eval/acc": 44.1860466003418, + "step": 23500 + }, + { + "epoch": 16.761768901569187, + "eval_loss": 2.3848800659179688, + "eval_runtime": 0.2328, + "eval_samples_per_second": 184.731, + "eval_steps_per_second": 4.296, + "step": 23500 + }, + { + "epoch": 16.768901569186877, + "grad_norm": 11.625, + "learning_rate": 6.626666666666667e-06, + "loss": 0.712, + "step": 23510 + }, + { + "epoch": 16.776034236804566, + "grad_norm": 8.5625, + "learning_rate": 6.582222222222223e-06, + "loss": 0.7337, + "step": 23520 + }, + { + "epoch": 16.783166904422252, + "grad_norm": 6.34375, + "learning_rate": 6.537777777777779e-06, + "loss": 0.6829, + "step": 23530 + }, + { + "epoch": 16.79029957203994, + "grad_norm": 6.40625, + "learning_rate": 6.4933333333333336e-06, + "loss": 0.7174, + "step": 23540 + }, + { + "epoch": 16.79743223965763, + "grad_norm": 8.1875, + "learning_rate": 6.448888888888889e-06, + "loss": 0.7812, + "step": 23550 + }, + { + "epoch": 16.80456490727532, + "grad_norm": 7.21875, + "learning_rate": 6.404444444444445e-06, + "loss": 0.6941, + "step": 23560 + }, + { + "epoch": 16.81169757489301, + "grad_norm": 7.75, + "learning_rate": 6.360000000000001e-06, + "loss": 0.8053, + "step": 23570 + }, + { + "epoch": 16.8188302425107, + "grad_norm": 7.46875, + "learning_rate": 6.315555555555556e-06, + "loss": 0.8218, + "step": 23580 + }, + { + "epoch": 16.82596291012839, + "grad_norm": 5.84375, + "learning_rate": 6.2711111111111105e-06, + "loss": 0.7829, + "step": 23590 + }, + { + "epoch": 16.83309557774608, + "grad_norm": 7.0, + "learning_rate": 6.226666666666667e-06, + "loss": 0.7756, + "step": 23600 + }, + { + "epoch": 16.83309557774608, + "eval/acc": 46.511627197265625, + "step": 23600 + }, + { + "epoch": 16.83309557774608, + "eval_loss": 2.3751766681671143, + "eval_runtime": 0.2292, + "eval_samples_per_second": 187.613, + "eval_steps_per_second": 4.363, + "step": 23600 + }, + { + "epoch": 16.840228245363765, + "grad_norm": 8.9375, + "learning_rate": 6.182222222222223e-06, + "loss": 0.859, + "step": 23610 + }, + { + "epoch": 16.847360912981454, + "grad_norm": 5.96875, + "learning_rate": 6.137777777777779e-06, + "loss": 0.7727, + "step": 23620 + }, + { + "epoch": 16.854493580599144, + "grad_norm": 10.6875, + "learning_rate": 6.093333333333333e-06, + "loss": 0.7321, + "step": 23630 + }, + { + "epoch": 16.861626248216833, + "grad_norm": 8.125, + "learning_rate": 6.048888888888889e-06, + "loss": 0.7517, + "step": 23640 + }, + { + "epoch": 16.868758915834523, + "grad_norm": 8.3125, + "learning_rate": 6.0044444444444445e-06, + "loss": 0.7328, + "step": 23650 + }, + { + "epoch": 16.875891583452212, + "grad_norm": 8.75, + "learning_rate": 5.9600000000000005e-06, + "loss": 0.8173, + "step": 23660 + }, + { + "epoch": 16.8830242510699, + "grad_norm": 9.0, + "learning_rate": 5.915555555555556e-06, + "loss": 0.7312, + "step": 23670 + }, + { + "epoch": 16.890156918687588, + "grad_norm": 17.25, + "learning_rate": 5.871111111111111e-06, + "loss": 0.7407, + "step": 23680 + }, + { + "epoch": 16.897289586305277, + "grad_norm": 6.875, + "learning_rate": 5.826666666666667e-06, + "loss": 0.8061, + "step": 23690 + }, + { + "epoch": 16.904422253922966, + "grad_norm": 9.875, + "learning_rate": 5.782222222222222e-06, + "loss": 0.6627, + "step": 23700 + }, + { + "epoch": 16.904422253922966, + "eval/acc": 46.511627197265625, + "step": 23700 + }, + { + "epoch": 16.904422253922966, + "eval_loss": 2.367297887802124, + "eval_runtime": 0.2662, + "eval_samples_per_second": 161.545, + "eval_steps_per_second": 3.757, + "step": 23700 + }, + { + "epoch": 16.911554921540656, + "grad_norm": 11.75, + "learning_rate": 5.737777777777778e-06, + "loss": 0.7706, + "step": 23710 + }, + { + "epoch": 16.918687589158345, + "grad_norm": 8.5625, + "learning_rate": 5.693333333333334e-06, + "loss": 0.8145, + "step": 23720 + }, + { + "epoch": 16.925820256776035, + "grad_norm": 14.25, + "learning_rate": 5.648888888888889e-06, + "loss": 0.7464, + "step": 23730 + }, + { + "epoch": 16.932952924393724, + "grad_norm": 8.3125, + "learning_rate": 5.604444444444445e-06, + "loss": 0.8126, + "step": 23740 + }, + { + "epoch": 16.940085592011414, + "grad_norm": 5.78125, + "learning_rate": 5.56e-06, + "loss": 0.7277, + "step": 23750 + }, + { + "epoch": 16.9472182596291, + "grad_norm": 5.71875, + "learning_rate": 5.515555555555555e-06, + "loss": 0.795, + "step": 23760 + }, + { + "epoch": 16.95435092724679, + "grad_norm": 7.71875, + "learning_rate": 5.4711111111111114e-06, + "loss": 0.846, + "step": 23770 + }, + { + "epoch": 16.96148359486448, + "grad_norm": 5.84375, + "learning_rate": 5.426666666666667e-06, + "loss": 0.7186, + "step": 23780 + }, + { + "epoch": 16.96861626248217, + "grad_norm": 7.21875, + "learning_rate": 5.382222222222223e-06, + "loss": 0.698, + "step": 23790 + }, + { + "epoch": 16.975748930099858, + "grad_norm": 7.46875, + "learning_rate": 5.337777777777778e-06, + "loss": 0.7792, + "step": 23800 + }, + { + "epoch": 16.975748930099858, + "eval/acc": 46.511627197265625, + "step": 23800 + }, + { + "epoch": 16.975748930099858, + "eval_loss": 2.380526065826416, + "eval_runtime": 0.218, + "eval_samples_per_second": 197.26, + "eval_steps_per_second": 4.587, + "step": 23800 + }, + { + "epoch": 16.982881597717547, + "grad_norm": 7.21875, + "learning_rate": 5.293333333333333e-06, + "loss": 0.7599, + "step": 23810 + }, + { + "epoch": 16.990014265335237, + "grad_norm": 6.34375, + "learning_rate": 5.248888888888889e-06, + "loss": 0.7043, + "step": 23820 + }, + { + "epoch": 16.997146932952923, + "grad_norm": 8.25, + "learning_rate": 5.2044444444444445e-06, + "loss": 0.7336, + "step": 23830 + }, + { + "epoch": 17.004279600570612, + "grad_norm": 7.1875, + "learning_rate": 5.1600000000000006e-06, + "loss": 0.7175, + "step": 23840 + }, + { + "epoch": 17.0114122681883, + "grad_norm": 7.75, + "learning_rate": 5.115555555555556e-06, + "loss": 0.7258, + "step": 23850 + }, + { + "epoch": 17.01854493580599, + "grad_norm": 6.90625, + "learning_rate": 5.071111111111111e-06, + "loss": 0.7136, + "step": 23860 + }, + { + "epoch": 17.02567760342368, + "grad_norm": 9.625, + "learning_rate": 5.026666666666667e-06, + "loss": 0.7191, + "step": 23870 + }, + { + "epoch": 17.03281027104137, + "grad_norm": 6.15625, + "learning_rate": 4.982222222222222e-06, + "loss": 0.7656, + "step": 23880 + }, + { + "epoch": 17.03994293865906, + "grad_norm": 7.65625, + "learning_rate": 4.9377777777777776e-06, + "loss": 0.6739, + "step": 23890 + }, + { + "epoch": 17.04707560627675, + "grad_norm": 8.25, + "learning_rate": 4.893333333333334e-06, + "loss": 0.7209, + "step": 23900 + }, + { + "epoch": 17.04707560627675, + "eval/acc": 51.16279220581055, + "step": 23900 + }, + { + "epoch": 17.04707560627675, + "eval_loss": 1.9211745262145996, + "eval_runtime": 7.2436, + "eval_samples_per_second": 5.936, + "eval_steps_per_second": 0.138, + "step": 23900 + }, + { + "epoch": 17.054208273894435, + "grad_norm": 7.40625, + "learning_rate": 4.848888888888889e-06, + "loss": 0.7848, + "step": 23910 + }, + { + "epoch": 17.061340941512125, + "grad_norm": 8.75, + "learning_rate": 4.804444444444445e-06, + "loss": 0.7272, + "step": 23920 + }, + { + "epoch": 17.068473609129814, + "grad_norm": 6.78125, + "learning_rate": 4.76e-06, + "loss": 0.6871, + "step": 23930 + }, + { + "epoch": 17.075606276747504, + "grad_norm": 8.375, + "learning_rate": 4.715555555555555e-06, + "loss": 0.7285, + "step": 23940 + }, + { + "epoch": 17.082738944365193, + "grad_norm": 7.15625, + "learning_rate": 4.6711111111111115e-06, + "loss": 0.7626, + "step": 23950 + }, + { + "epoch": 17.089871611982883, + "grad_norm": 8.5, + "learning_rate": 4.626666666666667e-06, + "loss": 0.8326, + "step": 23960 + }, + { + "epoch": 17.097004279600572, + "grad_norm": 6.78125, + "learning_rate": 4.582222222222223e-06, + "loss": 0.7334, + "step": 23970 + }, + { + "epoch": 17.104136947218258, + "grad_norm": 10.75, + "learning_rate": 4.537777777777778e-06, + "loss": 0.7101, + "step": 23980 + }, + { + "epoch": 17.111269614835948, + "grad_norm": 7.40625, + "learning_rate": 4.493333333333333e-06, + "loss": 0.7165, + "step": 23990 + }, + { + "epoch": 17.118402282453637, + "grad_norm": 7.0, + "learning_rate": 4.448888888888889e-06, + "loss": 0.7806, + "step": 24000 + }, + { + "epoch": 17.118402282453637, + "eval/acc": 48.83720779418945, + "step": 24000 + }, + { + "epoch": 17.118402282453637, + "eval_loss": 1.911737322807312, + "eval_runtime": 0.2294, + "eval_samples_per_second": 187.449, + "eval_steps_per_second": 4.359, + "step": 24000 + }, + { + "epoch": 17.125534950071327, + "grad_norm": 6.75, + "learning_rate": 4.4044444444444445e-06, + "loss": 0.6447, + "step": 24010 + }, + { + "epoch": 17.132667617689016, + "grad_norm": 30.75, + "learning_rate": 4.360000000000001e-06, + "loss": 0.7323, + "step": 24020 + }, + { + "epoch": 17.139800285306706, + "grad_norm": 8.0625, + "learning_rate": 4.315555555555556e-06, + "loss": 0.8296, + "step": 24030 + }, + { + "epoch": 17.146932952924395, + "grad_norm": 7.9375, + "learning_rate": 4.271111111111111e-06, + "loss": 0.7555, + "step": 24040 + }, + { + "epoch": 17.154065620542085, + "grad_norm": 5.875, + "learning_rate": 4.226666666666667e-06, + "loss": 0.7072, + "step": 24050 + }, + { + "epoch": 17.16119828815977, + "grad_norm": 7.03125, + "learning_rate": 4.182222222222222e-06, + "loss": 0.633, + "step": 24060 + }, + { + "epoch": 17.16833095577746, + "grad_norm": 7.65625, + "learning_rate": 4.137777777777778e-06, + "loss": 0.7191, + "step": 24070 + }, + { + "epoch": 17.17546362339515, + "grad_norm": 6.46875, + "learning_rate": 4.093333333333334e-06, + "loss": 0.7804, + "step": 24080 + }, + { + "epoch": 17.18259629101284, + "grad_norm": 7.53125, + "learning_rate": 4.048888888888889e-06, + "loss": 0.8225, + "step": 24090 + }, + { + "epoch": 17.18972895863053, + "grad_norm": 21.25, + "learning_rate": 4.004444444444445e-06, + "loss": 0.7856, + "step": 24100 + }, + { + "epoch": 17.18972895863053, + "eval/acc": 48.83720779418945, + "step": 24100 + }, + { + "epoch": 17.18972895863053, + "eval_loss": 1.914453387260437, + "eval_runtime": 0.2272, + "eval_samples_per_second": 189.227, + "eval_steps_per_second": 4.401, + "step": 24100 + }, + { + "epoch": 17.196861626248218, + "grad_norm": 16.0, + "learning_rate": 3.96e-06, + "loss": 0.6726, + "step": 24110 + }, + { + "epoch": 17.203994293865907, + "grad_norm": 12.1875, + "learning_rate": 3.9155555555555554e-06, + "loss": 0.8519, + "step": 24120 + }, + { + "epoch": 17.211126961483593, + "grad_norm": 7.53125, + "learning_rate": 3.8711111111111115e-06, + "loss": 0.6615, + "step": 24130 + }, + { + "epoch": 17.218259629101283, + "grad_norm": 7.53125, + "learning_rate": 3.826666666666667e-06, + "loss": 0.7664, + "step": 24140 + }, + { + "epoch": 17.225392296718972, + "grad_norm": 16.125, + "learning_rate": 3.7822222222222224e-06, + "loss": 0.7134, + "step": 24150 + }, + { + "epoch": 17.232524964336662, + "grad_norm": 7.84375, + "learning_rate": 3.737777777777778e-06, + "loss": 0.8229, + "step": 24160 + }, + { + "epoch": 17.23965763195435, + "grad_norm": 6.25, + "learning_rate": 3.6933333333333333e-06, + "loss": 0.7486, + "step": 24170 + }, + { + "epoch": 17.24679029957204, + "grad_norm": 7.875, + "learning_rate": 3.6488888888888893e-06, + "loss": 0.7403, + "step": 24180 + }, + { + "epoch": 17.25392296718973, + "grad_norm": 8.5, + "learning_rate": 3.604444444444444e-06, + "loss": 0.7794, + "step": 24190 + }, + { + "epoch": 17.261055634807416, + "grad_norm": 8.0625, + "learning_rate": 3.5600000000000002e-06, + "loss": 0.8832, + "step": 24200 + }, + { + "epoch": 17.261055634807416, + "eval/acc": 46.511627197265625, + "step": 24200 + }, + { + "epoch": 17.261055634807416, + "eval_loss": 1.9235628843307495, + "eval_runtime": 0.2238, + "eval_samples_per_second": 192.12, + "eval_steps_per_second": 4.468, + "step": 24200 + }, + { + "epoch": 17.268188302425106, + "grad_norm": 7.75, + "learning_rate": 3.515555555555556e-06, + "loss": 0.8659, + "step": 24210 + }, + { + "epoch": 17.275320970042795, + "grad_norm": 7.65625, + "learning_rate": 3.471111111111111e-06, + "loss": 0.7829, + "step": 24220 + }, + { + "epoch": 17.282453637660485, + "grad_norm": 10.375, + "learning_rate": 3.4266666666666668e-06, + "loss": 0.7556, + "step": 24230 + }, + { + "epoch": 17.289586305278174, + "grad_norm": 8.25, + "learning_rate": 3.382222222222222e-06, + "loss": 0.7601, + "step": 24240 + }, + { + "epoch": 17.296718972895864, + "grad_norm": 7.53125, + "learning_rate": 3.337777777777778e-06, + "loss": 0.7801, + "step": 24250 + }, + { + "epoch": 17.303851640513553, + "grad_norm": 5.65625, + "learning_rate": 3.2933333333333337e-06, + "loss": 0.6873, + "step": 24260 + }, + { + "epoch": 17.310984308131243, + "grad_norm": 7.03125, + "learning_rate": 3.248888888888889e-06, + "loss": 0.701, + "step": 24270 + }, + { + "epoch": 17.31811697574893, + "grad_norm": 6.25, + "learning_rate": 3.2044444444444446e-06, + "loss": 0.7061, + "step": 24280 + }, + { + "epoch": 17.325249643366618, + "grad_norm": 5.78125, + "learning_rate": 3.1600000000000007e-06, + "loss": 0.7596, + "step": 24290 + }, + { + "epoch": 17.332382310984308, + "grad_norm": 7.5625, + "learning_rate": 3.1155555555555555e-06, + "loss": 0.813, + "step": 24300 + }, + { + "epoch": 17.332382310984308, + "eval/acc": 48.83720779418945, + "step": 24300 + }, + { + "epoch": 17.332382310984308, + "eval_loss": 1.9356474876403809, + "eval_runtime": 0.2214, + "eval_samples_per_second": 194.237, + "eval_steps_per_second": 4.517, + "step": 24300 + }, + { + "epoch": 17.339514978601997, + "grad_norm": 6.96875, + "learning_rate": 3.0711111111111115e-06, + "loss": 0.6874, + "step": 24310 + }, + { + "epoch": 17.346647646219687, + "grad_norm": 12.375, + "learning_rate": 3.0266666666666668e-06, + "loss": 0.7283, + "step": 24320 + }, + { + "epoch": 17.353780313837376, + "grad_norm": 25.875, + "learning_rate": 2.9822222222222224e-06, + "loss": 0.7118, + "step": 24330 + }, + { + "epoch": 17.360912981455066, + "grad_norm": 8.4375, + "learning_rate": 2.9377777777777776e-06, + "loss": 0.8043, + "step": 24340 + }, + { + "epoch": 17.36804564907275, + "grad_norm": 7.5, + "learning_rate": 2.8933333333333333e-06, + "loss": 0.7448, + "step": 24350 + }, + { + "epoch": 17.37517831669044, + "grad_norm": 7.9375, + "learning_rate": 2.848888888888889e-06, + "loss": 0.7841, + "step": 24360 + }, + { + "epoch": 17.38231098430813, + "grad_norm": 8.0625, + "learning_rate": 2.8044444444444446e-06, + "loss": 0.751, + "step": 24370 + }, + { + "epoch": 17.38944365192582, + "grad_norm": 8.5625, + "learning_rate": 2.7600000000000003e-06, + "loss": 0.7078, + "step": 24380 + }, + { + "epoch": 17.39657631954351, + "grad_norm": 9.5625, + "learning_rate": 2.7155555555555555e-06, + "loss": 0.7847, + "step": 24390 + }, + { + "epoch": 17.4037089871612, + "grad_norm": 6.03125, + "learning_rate": 2.6711111111111116e-06, + "loss": 0.7537, + "step": 24400 + }, + { + "epoch": 17.4037089871612, + "eval/acc": 48.83720779418945, + "step": 24400 + }, + { + "epoch": 17.4037089871612, + "eval_loss": 1.9303936958312988, + "eval_runtime": 1.066, + "eval_samples_per_second": 40.339, + "eval_steps_per_second": 0.938, + "step": 24400 + }, + { + "epoch": 17.41084165477889, + "grad_norm": 9.375, + "learning_rate": 2.6266666666666668e-06, + "loss": 0.8343, + "step": 24410 + }, + { + "epoch": 17.417974322396578, + "grad_norm": 7.3125, + "learning_rate": 2.5822222222222224e-06, + "loss": 0.7967, + "step": 24420 + }, + { + "epoch": 17.425106990014264, + "grad_norm": 8.1875, + "learning_rate": 2.5377777777777777e-06, + "loss": 0.7392, + "step": 24430 + }, + { + "epoch": 17.432239657631953, + "grad_norm": 9.5625, + "learning_rate": 2.4933333333333333e-06, + "loss": 0.7619, + "step": 24440 + }, + { + "epoch": 17.439372325249643, + "grad_norm": 6.5, + "learning_rate": 2.448888888888889e-06, + "loss": 0.7465, + "step": 24450 + }, + { + "epoch": 17.446504992867332, + "grad_norm": 8.625, + "learning_rate": 2.4044444444444446e-06, + "loss": 0.718, + "step": 24460 + }, + { + "epoch": 17.453637660485022, + "grad_norm": 7.375, + "learning_rate": 2.36e-06, + "loss": 0.7457, + "step": 24470 + }, + { + "epoch": 17.46077032810271, + "grad_norm": 7.53125, + "learning_rate": 2.3155555555555555e-06, + "loss": 0.6436, + "step": 24480 + }, + { + "epoch": 17.4679029957204, + "grad_norm": 6.5625, + "learning_rate": 2.271111111111111e-06, + "loss": 0.7747, + "step": 24490 + }, + { + "epoch": 17.475035663338087, + "grad_norm": 8.125, + "learning_rate": 2.226666666666667e-06, + "loss": 0.7697, + "step": 24500 + }, + { + "epoch": 17.475035663338087, + "eval/acc": 48.83720779418945, + "step": 24500 + }, + { + "epoch": 17.475035663338087, + "eval_loss": 1.9065417051315308, + "eval_runtime": 0.2191, + "eval_samples_per_second": 196.239, + "eval_steps_per_second": 4.564, + "step": 24500 + }, + { + "epoch": 17.482168330955776, + "grad_norm": 6.90625, + "learning_rate": 2.1822222222222225e-06, + "loss": 0.7558, + "step": 24510 + }, + { + "epoch": 17.489300998573466, + "grad_norm": 19.375, + "learning_rate": 2.1377777777777777e-06, + "loss": 0.8242, + "step": 24520 + }, + { + "epoch": 17.496433666191155, + "grad_norm": 6.8125, + "learning_rate": 2.0933333333333338e-06, + "loss": 0.8656, + "step": 24530 + }, + { + "epoch": 17.503566333808845, + "grad_norm": 6.0, + "learning_rate": 2.048888888888889e-06, + "loss": 0.6613, + "step": 24540 + }, + { + "epoch": 17.510699001426534, + "grad_norm": 6.90625, + "learning_rate": 2.0044444444444446e-06, + "loss": 0.7427, + "step": 24550 + }, + { + "epoch": 17.517831669044224, + "grad_norm": 8.875, + "learning_rate": 1.96e-06, + "loss": 0.719, + "step": 24560 + }, + { + "epoch": 17.524964336661913, + "grad_norm": 6.59375, + "learning_rate": 1.9155555555555555e-06, + "loss": 0.7669, + "step": 24570 + }, + { + "epoch": 17.5320970042796, + "grad_norm": 7.78125, + "learning_rate": 1.8711111111111114e-06, + "loss": 0.7332, + "step": 24580 + }, + { + "epoch": 17.53922967189729, + "grad_norm": 7.8125, + "learning_rate": 1.8266666666666668e-06, + "loss": 0.712, + "step": 24590 + }, + { + "epoch": 17.546362339514978, + "grad_norm": 8.375, + "learning_rate": 1.7822222222222223e-06, + "loss": 0.876, + "step": 24600 + }, + { + "epoch": 17.546362339514978, + "eval/acc": 48.83720779418945, + "step": 24600 + }, + { + "epoch": 17.546362339514978, + "eval_loss": 1.9253615140914917, + "eval_runtime": 0.2214, + "eval_samples_per_second": 194.229, + "eval_steps_per_second": 4.517, + "step": 24600 + }, + { + "epoch": 17.553495007132668, + "grad_norm": 11.9375, + "learning_rate": 1.7377777777777777e-06, + "loss": 0.6975, + "step": 24610 + }, + { + "epoch": 17.560627674750357, + "grad_norm": 7.96875, + "learning_rate": 1.6933333333333336e-06, + "loss": 0.7823, + "step": 24620 + }, + { + "epoch": 17.567760342368047, + "grad_norm": 8.125, + "learning_rate": 1.648888888888889e-06, + "loss": 0.7674, + "step": 24630 + }, + { + "epoch": 17.574893009985736, + "grad_norm": 7.0, + "learning_rate": 1.6044444444444444e-06, + "loss": 0.7347, + "step": 24640 + }, + { + "epoch": 17.582025677603422, + "grad_norm": 7.625, + "learning_rate": 1.56e-06, + "loss": 0.6577, + "step": 24650 + }, + { + "epoch": 17.58915834522111, + "grad_norm": 7.5, + "learning_rate": 1.5155555555555558e-06, + "loss": 0.7647, + "step": 24660 + }, + { + "epoch": 17.5962910128388, + "grad_norm": 8.4375, + "learning_rate": 1.4711111111111112e-06, + "loss": 0.8421, + "step": 24670 + }, + { + "epoch": 17.60342368045649, + "grad_norm": 7.125, + "learning_rate": 1.4266666666666668e-06, + "loss": 0.6554, + "step": 24680 + }, + { + "epoch": 17.61055634807418, + "grad_norm": 8.375, + "learning_rate": 1.3822222222222223e-06, + "loss": 0.6979, + "step": 24690 + }, + { + "epoch": 17.61768901569187, + "grad_norm": 7.125, + "learning_rate": 1.337777777777778e-06, + "loss": 0.7341, + "step": 24700 + }, + { + "epoch": 17.61768901569187, + "eval/acc": 46.511627197265625, + "step": 24700 + }, + { + "epoch": 17.61768901569187, + "eval_loss": 1.942150592803955, + "eval_runtime": 0.2263, + "eval_samples_per_second": 190.005, + "eval_steps_per_second": 4.419, + "step": 24700 + }, + { + "epoch": 17.62482168330956, + "grad_norm": 6.6875, + "learning_rate": 1.2933333333333334e-06, + "loss": 0.7998, + "step": 24710 + }, + { + "epoch": 17.63195435092725, + "grad_norm": 7.3125, + "learning_rate": 1.248888888888889e-06, + "loss": 0.7074, + "step": 24720 + }, + { + "epoch": 17.639087018544934, + "grad_norm": 8.1875, + "learning_rate": 1.2044444444444445e-06, + "loss": 0.7117, + "step": 24730 + }, + { + "epoch": 17.646219686162624, + "grad_norm": 5.75, + "learning_rate": 1.16e-06, + "loss": 0.7245, + "step": 24740 + }, + { + "epoch": 17.653352353780313, + "grad_norm": 6.65625, + "learning_rate": 1.1155555555555556e-06, + "loss": 0.7581, + "step": 24750 + }, + { + "epoch": 17.660485021398003, + "grad_norm": 12.375, + "learning_rate": 1.071111111111111e-06, + "loss": 0.7798, + "step": 24760 + }, + { + "epoch": 17.667617689015692, + "grad_norm": 7.0625, + "learning_rate": 1.0266666666666666e-06, + "loss": 0.6628, + "step": 24770 + }, + { + "epoch": 17.674750356633382, + "grad_norm": 6.75, + "learning_rate": 9.822222222222223e-07, + "loss": 0.7847, + "step": 24780 + }, + { + "epoch": 17.68188302425107, + "grad_norm": 7.09375, + "learning_rate": 9.377777777777778e-07, + "loss": 0.7502, + "step": 24790 + }, + { + "epoch": 17.689015691868757, + "grad_norm": 9.375, + "learning_rate": 8.933333333333334e-07, + "loss": 0.7847, + "step": 24800 + }, + { + "epoch": 17.689015691868757, + "eval/acc": 48.83720779418945, + "step": 24800 + }, + { + "epoch": 17.689015691868757, + "eval_loss": 1.923488974571228, + "eval_runtime": 0.2182, + "eval_samples_per_second": 197.099, + "eval_steps_per_second": 4.584, + "step": 24800 + }, + { + "epoch": 17.696148359486447, + "grad_norm": 8.875, + "learning_rate": 8.48888888888889e-07, + "loss": 0.7152, + "step": 24810 + }, + { + "epoch": 17.703281027104136, + "grad_norm": 6.125, + "learning_rate": 8.044444444444445e-07, + "loss": 0.7513, + "step": 24820 + }, + { + "epoch": 17.710413694721826, + "grad_norm": 7.375, + "learning_rate": 7.6e-07, + "loss": 0.808, + "step": 24830 + }, + { + "epoch": 17.717546362339515, + "grad_norm": 7.71875, + "learning_rate": 7.155555555555556e-07, + "loss": 0.7731, + "step": 24840 + }, + { + "epoch": 17.724679029957205, + "grad_norm": 7.65625, + "learning_rate": 6.711111111111111e-07, + "loss": 0.7497, + "step": 24850 + }, + { + "epoch": 17.731811697574894, + "grad_norm": 55.5, + "learning_rate": 6.266666666666668e-07, + "loss": 0.7953, + "step": 24860 + }, + { + "epoch": 17.73894436519258, + "grad_norm": 9.3125, + "learning_rate": 5.822222222222223e-07, + "loss": 0.7664, + "step": 24870 + }, + { + "epoch": 17.74607703281027, + "grad_norm": 6.28125, + "learning_rate": 5.377777777777779e-07, + "loss": 0.7296, + "step": 24880 + }, + { + "epoch": 17.75320970042796, + "grad_norm": 9.5, + "learning_rate": 4.933333333333333e-07, + "loss": 0.8254, + "step": 24890 + }, + { + "epoch": 17.76034236804565, + "grad_norm": 7.125, + "learning_rate": 4.488888888888889e-07, + "loss": 0.7546, + "step": 24900 + }, + { + "epoch": 17.76034236804565, + "eval/acc": 46.511627197265625, + "step": 24900 + }, + { + "epoch": 17.76034236804565, + "eval_loss": 1.9365407228469849, + "eval_runtime": 0.2222, + "eval_samples_per_second": 193.53, + "eval_steps_per_second": 4.501, + "step": 24900 + }, + { + "epoch": 17.767475035663338, + "grad_norm": 13.3125, + "learning_rate": 4.0444444444444445e-07, + "loss": 0.7919, + "step": 24910 + }, + { + "epoch": 17.774607703281028, + "grad_norm": 6.84375, + "learning_rate": 3.6e-07, + "loss": 0.7368, + "step": 24920 + }, + { + "epoch": 17.781740370898717, + "grad_norm": 7.09375, + "learning_rate": 3.155555555555556e-07, + "loss": 0.6357, + "step": 24930 + }, + { + "epoch": 17.788873038516407, + "grad_norm": 6.09375, + "learning_rate": 2.7111111111111114e-07, + "loss": 0.7045, + "step": 24940 + }, + { + "epoch": 17.796005706134093, + "grad_norm": 8.1875, + "learning_rate": 2.2666666666666668e-07, + "loss": 0.749, + "step": 24950 + }, + { + "epoch": 17.803138373751782, + "grad_norm": 8.5, + "learning_rate": 1.8222222222222223e-07, + "loss": 0.7689, + "step": 24960 + }, + { + "epoch": 17.81027104136947, + "grad_norm": 17.875, + "learning_rate": 1.3777777777777778e-07, + "loss": 0.7133, + "step": 24970 + }, + { + "epoch": 17.81740370898716, + "grad_norm": 9.25, + "learning_rate": 9.333333333333334e-08, + "loss": 0.778, + "step": 24980 + }, + { + "epoch": 17.82453637660485, + "grad_norm": 6.6875, + "learning_rate": 4.888888888888889e-08, + "loss": 0.7655, + "step": 24990 + }, + { + "epoch": 17.83166904422254, + "grad_norm": 6.8125, + "learning_rate": 4.444444444444445e-09, + "loss": 0.746, + "step": 25000 + }, + { + "epoch": 17.83166904422254, + "eval/acc": 46.511627197265625, + "step": 25000 + }, + { + "epoch": 17.83166904422254, + "eval_loss": 1.9321389198303223, + "eval_runtime": 0.2223, + "eval_samples_per_second": 193.429, + "eval_steps_per_second": 4.498, + "step": 25000 + } + ], + "logging_steps": 10, + "max_steps": 25000, + "num_input_tokens_seen": 0, + "num_train_epochs": 18, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/training_args.bin b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a1d680c060a6648e26d420ee8365324dd3d5fee7 --- /dev/null +++ b/modernbert-crux-researchy-pos_20.neg_51.filtered.b64_n512.1e-4.512/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24358e453225b271a5d5308309aefca4acc94bf8e5e69d279887179f7a31b1b7 +size 6161