DylanJHJ committed 12 days ago

Commit

e218fa7

1 Parent(s): 4201aa8

update model ablation for sampling

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +44 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/config.json +45 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/model.safetensors +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/optimizer.pt +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/rng_state_0.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/rng_state_1.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/rng_state_2.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/rng_state_3.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/scheduler.pt +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/trainer_state.json +0 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/training_args.bin +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/config.json +45 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/model.safetensors +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/optimizer.pt +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/rng_state_0.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/rng_state_1.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/rng_state_2.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/rng_state_3.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/scheduler.pt +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/trainer_state.json +2109 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/training_args.bin +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/config.json +45 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/model.safetensors +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/rng_state_0.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/rng_state_1.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/rng_state_2.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/rng_state_3.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/scheduler.pt +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/trainer_state.json +4184 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/training_args.bin +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/config.json +45 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/model.safetensors +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/optimizer.pt +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/rng_state_0.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/rng_state_1.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/rng_state_2.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/rng_state_3.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/scheduler.pt +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/trainer_state.json +0 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/training_args.bin +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/config.json +45 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/model.safetensors +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/optimizer.pt +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/rng_state_0.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/rng_state_1.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/rng_state_2.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/rng_state_3.pth +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/runs/Nov25_13-58-36_nid005118/events.out.tfevents.1764072154.nid005118.9241.0 +3 -0
modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/scheduler.pt +3 -0

.gitattributes CHANGED Viewed

@@ -63,3 +63,47 @@ modernbert-crux-researchy-pos_high.neg_zero.b64_n512.1e-4.512/checkpoint-5000/mo
 modernbert-crux-researchy-pos_half.neg_zero.b64_n512.1e-4.512/optimizer.pt filter=lfs diff=lfs merge=lfs -text
 modernbert-crux-researchy-pos_low.neg_zero.b64_n512.1e-4.512/optimizer.pt filter=lfs diff=lfs merge=lfs -text
 modernbert-crux-researchy-pos_high.neg_zero.b64_n512.1e-4.512/optimizer.pt filter=lfs diff=lfs merge=lfs -text

 modernbert-crux-researchy-pos_half.neg_zero.b64_n512.1e-4.512/optimizer.pt filter=lfs diff=lfs merge=lfs -text
 modernbert-crux-researchy-pos_low.neg_zero.b64_n512.1e-4.512/optimizer.pt filter=lfs diff=lfs merge=lfs -text
 modernbert-crux-researchy-pos_high.neg_zero.b64_n512.1e-4.512/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/model.safetensors filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/model.safetensors filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/model.safetensors filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/model.safetensors filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/model.safetensors filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/checkpoint-7500/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/checkpoint-7500/model.safetensors filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/model.safetensors filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/checkpoint-2500/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/checkpoint-2500/model.safetensors filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/checkpoint-10000/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/checkpoint-10000/model.safetensors filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+modernbert-crux-researchy-pos_high.neg_quarter.b64_n512.1e-4.512/checkpoint-5000/model.safetensors filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/ce/70/ce70baabd51a179ecf81d62946475b863fa7b485ef3208ae2c04b60c275a2a96 filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/a4/b6/a4b6b75bc77beffc95b5d6c546e916043aee3af38d55035c9f51ebbba73c6b33 filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/c7/f2/c7f26a5b35e56ecb6bb72212ce3a7da9d14d66de5114f67202346c60edd18426 filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/26/fa/26faaeca5a59a77444ef3f16961df2c4b1b3c298a93cb53f21213fbab0ade4be filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/26/dd/26ddea10d1b592b763ee59b05f1558c3cb2f0493c1b214f3c80feb714b66c3b1 filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/83/6c/836c84924653789c9c3b731d22335a04af06d4df529543f7e65b0db31889ea74 filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/88/92/88926b3be5727f9d0cc432ab2c00dfc3cab66da501336e8c8a7e08188cc0de25 filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/1b/d3/1bd3c6439961fc549637f6cc43348eb9fbd2080ab2266164dcd783f7c3699e7a filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/22/3b/223b60387aa7ede0f506e51bf9e672bf2b2acae07c62b17cbf746a0246ac1da8 filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/1a/ff/1aff66ee140ada517137e3deb00c2322cd5f3c5c26e947f38a24227e394683ac filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/8b/3f/8b3f482e56a92f0760e27f700de46d09e3a07b0a89d3ef90808d0d6b22512827 filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/78/03/780374ad1143b7bbe5b73f25a7260aa5063a103a1169b688b4285124dcb04dcc filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/6d/76/6d761d2f71db6d9137a94a442e6e58d9d7a63ac279a004e02afd5fe416791c0f filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/5b/7d/5b7de0c3bda08d9fbc23706080095b6e9f2e2204ff73583266d980bde7676f45 filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/c9/ce/c9cef30b887e6bf1661270424c3865351158cce8f23f20d0314c9463455010f0 filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/d4/c8/d4c81558929d61f7e09725cdd0299c7c9a60354e8e937058fa212bd0fed0d5dc filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/2d/ca/2dca16734197d014f2cfd5f2b53a60d6c915ae69dcfc3813fbe9819c346e5316 filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/06/8a/068adc19df70de3d48c90e1a4bfee35d3260dcecdfab93d036098987af28c762 filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/5e/7c/5e7c5f3f7ec54eb7a6f288a9c3b9a239b5f1a88679a0d4ced6160c6bcd8b5c1f filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/56/00/560001d7760823f163c427fee75d58d3d89e41c166b9cbba91ea27edc32f94e1 filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/f0/4c/f04c2e8ee9c6967f1709716e47b4464861ad1aeb50641b3b1dfbe87cb8894d58 filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/f0/69/f069e96d36df3a41005ecc6796d5ed8429bdffa25d9e9d133e6191cbf90b3de7 filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/65/f2/65f21ce22f521a5d6f2e5456473256b7984e5c46122adc14ec92997d9ca30764 filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/fd/d7/fdd7dd2e666a3126c4ed005d62b97fc63ca51fad19f7d480fb83d6eac7bfbbd0 filter=lfs diff=lfs merge=lfs -text

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "architectures": [
+    "ModernBertModel"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "dtype": "bfloat16",
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "repad_logits_with_grad": false,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "transformers_version": "4.57.1",
+  "vocab_size": 50368
+}

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aa648019534a54da16e5df11fb28257398ac4ee886de2d2ef90e587b14a698f7
+size 298041696

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1a47b56f6d386053d17cfa1c908b16dcebcec2fa8dbf6ea679e0add277be30b3
+size 596170443

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c235c10397ca3fb3b82475883c48d3bb786206feaee53c2199c913179faf1fb
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:937bfac24cd2fe886a72cb180e9d726f8629acaf1e31b2beab1f7a03381ca0ca
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ee0687693332dd9f28a675c2a9f27590ae650095d80dac61354fce4437e7f9de
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ffb4dab4ba8c60d5f5c48a1048c1ecc4e949aff462fd8340d7ad1a380fc12fdd
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d02be6d8bda4ea9c67040ed89f878acdc986bd4df3fbc60440a9d3eacca02d63
+size 1465

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-10000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61b297d5e64c898db22dbb0b2c7feb17b604dfbcc3bfebf55e9e7ecbf9c3794c
+size 6161

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "architectures": [
+    "ModernBertModel"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "dtype": "bfloat16",
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "repad_logits_with_grad": false,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "transformers_version": "4.57.1",
+  "vocab_size": 50368
+}

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e030bc5d9a25e135113d1a5b5746b18f93841dc8941845cf34057ac91120ef2f
+size 298041696

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:560363724cc297d305f7f508a44b62eb6e5b3c38cc93a253e113fc093e5591d2
+size 596170443

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:edd5aaa88c9c1fc8abf11cf4397d5571cd01f3d7b0f19ae2e2d129014be1fa8a
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4bf8a2a24c0dde4941747f29f745d706701a3d3b8edb14d342e599b750fa7e64
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9bbd2a817215a6f90c3632eb4c1cf3c7a57ed52c41b60ddeaaeaa878bfb142a5
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:019accfa5df5627ce0019fe3ba9da9a9010bf5c682c7b0defe46d07366d9649b
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a752d3f24f72817376cd37ffda577fd802575961fb476ede3db67c3cc89113bf
+size 1465

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2109 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.1133250311332503,
+  "eval_steps": 100,
+  "global_step": 2500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.012453300124533,
+      "grad_norm": 15.75,
+      "learning_rate": 9e-07,
+      "loss": 5.5753,
+      "step": 10
+    },
+    {
+      "epoch": 0.024906600249066,
+      "grad_norm": 25.25,
+      "learning_rate": 1.9e-06,
+      "loss": 5.5776,
+      "step": 20
+    },
+    {
+      "epoch": 0.037359900373599,
+      "grad_norm": 16.25,
+      "learning_rate": 2.9e-06,
+      "loss": 5.5572,
+      "step": 30
+    },
+    {
+      "epoch": 0.049813200498132,
+      "grad_norm": 23.5,
+      "learning_rate": 3.9e-06,
+      "loss": 5.5201,
+      "step": 40
+    },
+    {
+      "epoch": 0.062266500622665005,
+      "grad_norm": 18.0,
+      "learning_rate": 4.9000000000000005e-06,
+      "loss": 5.6297,
+      "step": 50
+    },
+    {
+      "epoch": 0.074719800747198,
+      "grad_norm": 19.125,
+      "learning_rate": 5.9e-06,
+      "loss": 5.5889,
+      "step": 60
+    },
+    {
+      "epoch": 0.08717310087173101,
+      "grad_norm": 23.125,
+      "learning_rate": 6.900000000000001e-06,
+      "loss": 5.4949,
+      "step": 70
+    },
+    {
+      "epoch": 0.099626400996264,
+      "grad_norm": 20.25,
+      "learning_rate": 7.9e-06,
+      "loss": 5.552,
+      "step": 80
+    },
+    {
+      "epoch": 0.11207970112079702,
+      "grad_norm": 17.0,
+      "learning_rate": 8.9e-06,
+      "loss": 5.4765,
+      "step": 90
+    },
+    {
+      "epoch": 0.12453300124533001,
+      "grad_norm": 20.125,
+      "learning_rate": 9.900000000000002e-06,
+      "loss": 5.4519,
+      "step": 100
+    },
+    {
+      "epoch": 0.12453300124533001,
+      "eval/acc": 2.3255813121795654,
+      "step": 100
+    },
+    {
+      "epoch": 0.12453300124533001,
+      "eval_loss": 4.970202445983887,
+      "eval_runtime": 2.822,
+      "eval_samples_per_second": 15.237,
+      "eval_steps_per_second": 0.354,
+      "step": 100
+    },
+    {
+      "epoch": 0.136986301369863,
+      "grad_norm": 22.125,
+      "learning_rate": 1.09e-05,
+      "loss": 5.3401,
+      "step": 110
+    },
+    {
+      "epoch": 0.149439601494396,
+      "grad_norm": 16.25,
+      "learning_rate": 1.19e-05,
+      "loss": 5.3088,
+      "step": 120
+    },
+    {
+      "epoch": 0.16189290161892902,
+      "grad_norm": 18.75,
+      "learning_rate": 1.29e-05,
+      "loss": 5.1442,
+      "step": 130
+    },
+    {
+      "epoch": 0.17434620174346202,
+      "grad_norm": 19.5,
+      "learning_rate": 1.3900000000000002e-05,
+      "loss": 5.0218,
+      "step": 140
+    },
+    {
+      "epoch": 0.18679950186799502,
+      "grad_norm": 25.75,
+      "learning_rate": 1.49e-05,
+      "loss": 4.8711,
+      "step": 150
+    },
+    {
+      "epoch": 0.199252801992528,
+      "grad_norm": 25.625,
+      "learning_rate": 1.59e-05,
+      "loss": 4.6046,
+      "step": 160
+    },
+    {
+      "epoch": 0.21170610211706103,
+      "grad_norm": 28.25,
+      "learning_rate": 1.69e-05,
+      "loss": 4.2891,
+      "step": 170
+    },
+    {
+      "epoch": 0.22415940224159403,
+      "grad_norm": 25.25,
+      "learning_rate": 1.79e-05,
+      "loss": 3.8055,
+      "step": 180
+    },
+    {
+      "epoch": 0.23661270236612703,
+      "grad_norm": 28.0,
+      "learning_rate": 1.8900000000000002e-05,
+      "loss": 3.4139,
+      "step": 190
+    },
+    {
+      "epoch": 0.24906600249066002,
+      "grad_norm": 29.5,
+      "learning_rate": 1.9900000000000003e-05,
+      "loss": 2.974,
+      "step": 200
+    },
+    {
+      "epoch": 0.24906600249066002,
+      "eval/acc": 11.627906799316406,
+      "step": 200
+    },
+    {
+      "epoch": 0.24906600249066002,
+      "eval_loss": 3.7072134017944336,
+      "eval_runtime": 0.2742,
+      "eval_samples_per_second": 156.807,
+      "eval_steps_per_second": 3.647,
+      "step": 200
+    },
+    {
+      "epoch": 0.261519302615193,
+      "grad_norm": 30.5,
+      "learning_rate": 2.09e-05,
+      "loss": 2.8723,
+      "step": 210
+    },
+    {
+      "epoch": 0.273972602739726,
+      "grad_norm": 19.625,
+      "learning_rate": 2.19e-05,
+      "loss": 2.6908,
+      "step": 220
+    },
+    {
+      "epoch": 0.286425902864259,
+      "grad_norm": 18.25,
+      "learning_rate": 2.29e-05,
+      "loss": 2.4715,
+      "step": 230
+    },
+    {
+      "epoch": 0.298879202988792,
+      "grad_norm": 16.75,
+      "learning_rate": 2.39e-05,
+      "loss": 2.4336,
+      "step": 240
+    },
+    {
+      "epoch": 0.31133250311332505,
+      "grad_norm": 16.875,
+      "learning_rate": 2.4900000000000002e-05,
+      "loss": 2.3797,
+      "step": 250
+    },
+    {
+      "epoch": 0.32378580323785805,
+      "grad_norm": 18.375,
+      "learning_rate": 2.5900000000000003e-05,
+      "loss": 2.2765,
+      "step": 260
+    },
+    {
+      "epoch": 0.33623910336239105,
+      "grad_norm": 13.625,
+      "learning_rate": 2.6900000000000003e-05,
+      "loss": 2.1124,
+      "step": 270
+    },
+    {
+      "epoch": 0.34869240348692404,
+      "grad_norm": 19.5,
+      "learning_rate": 2.7900000000000004e-05,
+      "loss": 2.0748,
+      "step": 280
+    },
+    {
+      "epoch": 0.36114570361145704,
+      "grad_norm": 18.0,
+      "learning_rate": 2.8899999999999998e-05,
+      "loss": 2.1575,
+      "step": 290
+    },
+    {
+      "epoch": 0.37359900373599003,
+      "grad_norm": 34.0,
+      "learning_rate": 2.9900000000000002e-05,
+      "loss": 2.1195,
+      "step": 300
+    },
+    {
+      "epoch": 0.37359900373599003,
+      "eval/acc": 23.255813598632812,
+      "step": 300
+    },
+    {
+      "epoch": 0.37359900373599003,
+      "eval_loss": 3.1418063640594482,
+      "eval_runtime": 1.1652,
+      "eval_samples_per_second": 36.903,
+      "eval_steps_per_second": 0.858,
+      "step": 300
+    },
+    {
+      "epoch": 0.386052303860523,
+      "grad_norm": 19.125,
+      "learning_rate": 3.09e-05,
+      "loss": 2.1052,
+      "step": 310
+    },
+    {
+      "epoch": 0.398505603985056,
+      "grad_norm": 20.375,
+      "learning_rate": 3.19e-05,
+      "loss": 1.8924,
+      "step": 320
+    },
+    {
+      "epoch": 0.410958904109589,
+      "grad_norm": 17.125,
+      "learning_rate": 3.29e-05,
+      "loss": 2.025,
+      "step": 330
+    },
+    {
+      "epoch": 0.42341220423412207,
+      "grad_norm": 28.0,
+      "learning_rate": 3.3900000000000004e-05,
+      "loss": 1.8914,
+      "step": 340
+    },
+    {
+      "epoch": 0.43586550435865506,
+      "grad_norm": 22.125,
+      "learning_rate": 3.49e-05,
+      "loss": 1.8864,
+      "step": 350
+    },
+    {
+      "epoch": 0.44831880448318806,
+      "grad_norm": 34.0,
+      "learning_rate": 3.59e-05,
+      "loss": 1.8447,
+      "step": 360
+    },
+    {
+      "epoch": 0.46077210460772106,
+      "grad_norm": 15.4375,
+      "learning_rate": 3.69e-05,
+      "loss": 1.7981,
+      "step": 370
+    },
+    {
+      "epoch": 0.47322540473225405,
+      "grad_norm": 39.25,
+      "learning_rate": 3.79e-05,
+      "loss": 1.6967,
+      "step": 380
+    },
+    {
+      "epoch": 0.48567870485678705,
+      "grad_norm": 35.25,
+      "learning_rate": 3.8900000000000004e-05,
+      "loss": 1.7919,
+      "step": 390
+    },
+    {
+      "epoch": 0.49813200498132004,
+      "grad_norm": 19.875,
+      "learning_rate": 3.99e-05,
+      "loss": 1.6083,
+      "step": 400
+    },
+    {
+      "epoch": 0.49813200498132004,
+      "eval/acc": 27.9069766998291,
+      "step": 400
+    },
+    {
+      "epoch": 0.49813200498132004,
+      "eval_loss": 2.988025665283203,
+      "eval_runtime": 0.2197,
+      "eval_samples_per_second": 195.684,
+      "eval_steps_per_second": 4.551,
+      "step": 400
+    },
+    {
+      "epoch": 0.5105853051058531,
+      "grad_norm": 14.1875,
+      "learning_rate": 4.09e-05,
+      "loss": 1.7039,
+      "step": 410
+    },
+    {
+      "epoch": 0.523038605230386,
+      "grad_norm": 33.25,
+      "learning_rate": 4.19e-05,
+      "loss": 1.7057,
+      "step": 420
+    },
+    {
+      "epoch": 0.5354919053549191,
+      "grad_norm": 15.5,
+      "learning_rate": 4.29e-05,
+      "loss": 1.6425,
+      "step": 430
+    },
+    {
+      "epoch": 0.547945205479452,
+      "grad_norm": 29.625,
+      "learning_rate": 4.39e-05,
+      "loss": 1.4995,
+      "step": 440
+    },
+    {
+      "epoch": 0.5603985056039851,
+      "grad_norm": 15.0625,
+      "learning_rate": 4.49e-05,
+      "loss": 1.6621,
+      "step": 450
+    },
+    {
+      "epoch": 0.572851805728518,
+      "grad_norm": 22.25,
+      "learning_rate": 4.5900000000000004e-05,
+      "loss": 1.5684,
+      "step": 460
+    },
+    {
+      "epoch": 0.5853051058530511,
+      "grad_norm": 17.25,
+      "learning_rate": 4.69e-05,
+      "loss": 1.5414,
+      "step": 470
+    },
+    {
+      "epoch": 0.597758405977584,
+      "grad_norm": 19.25,
+      "learning_rate": 4.79e-05,
+      "loss": 1.5445,
+      "step": 480
+    },
+    {
+      "epoch": 0.6102117061021171,
+      "grad_norm": 205.0,
+      "learning_rate": 4.89e-05,
+      "loss": 1.4726,
+      "step": 490
+    },
+    {
+      "epoch": 0.6226650062266501,
+      "grad_norm": 13.375,
+      "learning_rate": 4.99e-05,
+      "loss": 1.3783,
+      "step": 500
+    },
+    {
+      "epoch": 0.6226650062266501,
+      "eval/acc": 30.23255729675293,
+      "step": 500
+    },
+    {
+      "epoch": 0.6226650062266501,
+      "eval_loss": 2.777118444442749,
+      "eval_runtime": 0.2153,
+      "eval_samples_per_second": 199.749,
+      "eval_steps_per_second": 4.645,
+      "step": 500
+    },
+    {
+      "epoch": 0.635118306351183,
+      "grad_norm": 20.875,
+      "learning_rate": 5.0900000000000004e-05,
+      "loss": 1.4983,
+      "step": 510
+    },
+    {
+      "epoch": 0.6475716064757161,
+      "grad_norm": 16.625,
+      "learning_rate": 5.19e-05,
+      "loss": 1.5093,
+      "step": 520
+    },
+    {
+      "epoch": 0.660024906600249,
+      "grad_norm": 14.125,
+      "learning_rate": 5.2900000000000005e-05,
+      "loss": 1.4588,
+      "step": 530
+    },
+    {
+      "epoch": 0.6724782067247821,
+      "grad_norm": 37.0,
+      "learning_rate": 5.390000000000001e-05,
+      "loss": 1.4346,
+      "step": 540
+    },
+    {
+      "epoch": 0.684931506849315,
+      "grad_norm": 16.75,
+      "learning_rate": 5.4900000000000006e-05,
+      "loss": 1.5363,
+      "step": 550
+    },
+    {
+      "epoch": 0.6973848069738481,
+      "grad_norm": 28.375,
+      "learning_rate": 5.590000000000001e-05,
+      "loss": 1.4497,
+      "step": 560
+    },
+    {
+      "epoch": 0.709838107098381,
+      "grad_norm": 15.5625,
+      "learning_rate": 5.69e-05,
+      "loss": 1.4005,
+      "step": 570
+    },
+    {
+      "epoch": 0.7222914072229141,
+      "grad_norm": 14.75,
+      "learning_rate": 5.79e-05,
+      "loss": 1.4588,
+      "step": 580
+    },
+    {
+      "epoch": 0.7347447073474471,
+      "grad_norm": 18.5,
+      "learning_rate": 5.89e-05,
+      "loss": 1.3489,
+      "step": 590
+    },
+    {
+      "epoch": 0.7471980074719801,
+      "grad_norm": 12.125,
+      "learning_rate": 5.99e-05,
+      "loss": 1.3295,
+      "step": 600
+    },
+    {
+      "epoch": 0.7471980074719801,
+      "eval/acc": 39.53488540649414,
+      "step": 600
+    },
+    {
+      "epoch": 0.7471980074719801,
+      "eval_loss": 2.6652462482452393,
+      "eval_runtime": 0.2211,
+      "eval_samples_per_second": 194.477,
+      "eval_steps_per_second": 4.523,
+      "step": 600
+    },
+    {
+      "epoch": 0.7596513075965131,
+      "grad_norm": 12.9375,
+      "learning_rate": 6.09e-05,
+      "loss": 1.3717,
+      "step": 610
+    },
+    {
+      "epoch": 0.772104607721046,
+      "grad_norm": 21.5,
+      "learning_rate": 6.19e-05,
+      "loss": 1.425,
+      "step": 620
+    },
+    {
+      "epoch": 0.7845579078455791,
+      "grad_norm": 13.6875,
+      "learning_rate": 6.29e-05,
+      "loss": 1.3017,
+      "step": 630
+    },
+    {
+      "epoch": 0.797011207970112,
+      "grad_norm": 12.8125,
+      "learning_rate": 6.390000000000001e-05,
+      "loss": 1.3533,
+      "step": 640
+    },
+    {
+      "epoch": 0.8094645080946451,
+      "grad_norm": 13.1875,
+      "learning_rate": 6.49e-05,
+      "loss": 1.271,
+      "step": 650
+    },
+    {
+      "epoch": 0.821917808219178,
+      "grad_norm": 15.125,
+      "learning_rate": 6.59e-05,
+      "loss": 1.3734,
+      "step": 660
+    },
+    {
+      "epoch": 0.8343711083437111,
+      "grad_norm": 16.125,
+      "learning_rate": 6.690000000000001e-05,
+      "loss": 1.3092,
+      "step": 670
+    },
+    {
+      "epoch": 0.8468244084682441,
+      "grad_norm": 17.75,
+      "learning_rate": 6.790000000000001e-05,
+      "loss": 1.1803,
+      "step": 680
+    },
+    {
+      "epoch": 0.8592777085927771,
+      "grad_norm": 13.875,
+      "learning_rate": 6.89e-05,
+      "loss": 1.3383,
+      "step": 690
+    },
+    {
+      "epoch": 0.8717310087173101,
+      "grad_norm": 11.25,
+      "learning_rate": 6.99e-05,
+      "loss": 1.3024,
+      "step": 700
+    },
+    {
+      "epoch": 0.8717310087173101,
+      "eval/acc": 34.88372039794922,
+      "step": 700
+    },
+    {
+      "epoch": 0.8717310087173101,
+      "eval_loss": 2.7215068340301514,
+      "eval_runtime": 0.3836,
+      "eval_samples_per_second": 112.097,
+      "eval_steps_per_second": 2.607,
+      "step": 700
+    },
+    {
+      "epoch": 0.8841843088418431,
+      "grad_norm": 67.0,
+      "learning_rate": 7.09e-05,
+      "loss": 1.2095,
+      "step": 710
+    },
+    {
+      "epoch": 0.8966376089663761,
+      "grad_norm": 9.875,
+      "learning_rate": 7.19e-05,
+      "loss": 1.2948,
+      "step": 720
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 14.1875,
+      "learning_rate": 7.29e-05,
+      "loss": 1.3225,
+      "step": 730
+    },
+    {
+      "epoch": 0.9215442092154421,
+      "grad_norm": 13.125,
+      "learning_rate": 7.390000000000001e-05,
+      "loss": 1.1936,
+      "step": 740
+    },
+    {
+      "epoch": 0.933997509339975,
+      "grad_norm": 12.875,
+      "learning_rate": 7.49e-05,
+      "loss": 1.2211,
+      "step": 750
+    },
+    {
+      "epoch": 0.9464508094645081,
+      "grad_norm": 13.5625,
+      "learning_rate": 7.59e-05,
+      "loss": 1.2435,
+      "step": 760
+    },
+    {
+      "epoch": 0.958904109589041,
+      "grad_norm": 19.25,
+      "learning_rate": 7.69e-05,
+      "loss": 1.1786,
+      "step": 770
+    },
+    {
+      "epoch": 0.9713574097135741,
+      "grad_norm": 14.375,
+      "learning_rate": 7.790000000000001e-05,
+      "loss": 1.2784,
+      "step": 780
+    },
+    {
+      "epoch": 0.9838107098381071,
+      "grad_norm": 10.625,
+      "learning_rate": 7.890000000000001e-05,
+      "loss": 1.2618,
+      "step": 790
+    },
+    {
+      "epoch": 0.9962640099626401,
+      "grad_norm": 22.75,
+      "learning_rate": 7.99e-05,
+      "loss": 1.1601,
+      "step": 800
+    },
+    {
+      "epoch": 0.9962640099626401,
+      "eval/acc": 41.86046600341797,
+      "step": 800
+    },
+    {
+      "epoch": 0.9962640099626401,
+      "eval_loss": 2.622220516204834,
+      "eval_runtime": 0.2076,
+      "eval_samples_per_second": 207.126,
+      "eval_steps_per_second": 4.817,
+      "step": 800
+    },
+    {
+      "epoch": 1.0087173100871731,
+      "grad_norm": 12.75,
+      "learning_rate": 8.090000000000001e-05,
+      "loss": 1.0972,
+      "step": 810
+    },
+    {
+      "epoch": 1.0211706102117062,
+      "grad_norm": 9.6875,
+      "learning_rate": 8.19e-05,
+      "loss": 1.1999,
+      "step": 820
+    },
+    {
+      "epoch": 1.033623910336239,
+      "grad_norm": 11.625,
+      "learning_rate": 8.29e-05,
+      "loss": 1.1677,
+      "step": 830
+    },
+    {
+      "epoch": 1.046077210460772,
+      "grad_norm": 12.8125,
+      "learning_rate": 8.39e-05,
+      "loss": 1.1505,
+      "step": 840
+    },
+    {
+      "epoch": 1.0585305105853051,
+      "grad_norm": 11.6875,
+      "learning_rate": 8.49e-05,
+      "loss": 1.1599,
+      "step": 850
+    },
+    {
+      "epoch": 1.0709838107098382,
+      "grad_norm": 9.8125,
+      "learning_rate": 8.59e-05,
+      "loss": 1.1746,
+      "step": 860
+    },
+    {
+      "epoch": 1.083437110834371,
+      "grad_norm": 11.625,
+      "learning_rate": 8.69e-05,
+      "loss": 1.047,
+      "step": 870
+    },
+    {
+      "epoch": 1.095890410958904,
+      "grad_norm": 10.125,
+      "learning_rate": 8.790000000000001e-05,
+      "loss": 1.107,
+      "step": 880
+    },
+    {
+      "epoch": 1.108343711083437,
+      "grad_norm": 9.0,
+      "learning_rate": 8.89e-05,
+      "loss": 1.1105,
+      "step": 890
+    },
+    {
+      "epoch": 1.1207970112079702,
+      "grad_norm": 13.125,
+      "learning_rate": 8.99e-05,
+      "loss": 1.1848,
+      "step": 900
+    },
+    {
+      "epoch": 1.1207970112079702,
+      "eval/acc": 34.88372039794922,
+      "step": 900
+    },
+    {
+      "epoch": 1.1207970112079702,
+      "eval_loss": 2.8814988136291504,
+      "eval_runtime": 1.0687,
+      "eval_samples_per_second": 40.237,
+      "eval_steps_per_second": 0.936,
+      "step": 900
+    },
+    {
+      "epoch": 1.1332503113325032,
+      "grad_norm": 13.25,
+      "learning_rate": 9.090000000000001e-05,
+      "loss": 1.1235,
+      "step": 910
+    },
+    {
+      "epoch": 1.145703611457036,
+      "grad_norm": 17.625,
+      "learning_rate": 9.190000000000001e-05,
+      "loss": 1.0304,
+      "step": 920
+    },
+    {
+      "epoch": 1.158156911581569,
+      "grad_norm": 11.5625,
+      "learning_rate": 9.290000000000001e-05,
+      "loss": 1.0373,
+      "step": 930
+    },
+    {
+      "epoch": 1.1706102117061021,
+      "grad_norm": 13.25,
+      "learning_rate": 9.39e-05,
+      "loss": 1.12,
+      "step": 940
+    },
+    {
+      "epoch": 1.1830635118306352,
+      "grad_norm": 10.4375,
+      "learning_rate": 9.49e-05,
+      "loss": 1.0623,
+      "step": 950
+    },
+    {
+      "epoch": 1.195516811955168,
+      "grad_norm": 14.625,
+      "learning_rate": 9.59e-05,
+      "loss": 1.0692,
+      "step": 960
+    },
+    {
+      "epoch": 1.207970112079701,
+      "grad_norm": 9.6875,
+      "learning_rate": 9.69e-05,
+      "loss": 1.1914,
+      "step": 970
+    },
+    {
+      "epoch": 1.2204234122042341,
+      "grad_norm": 10.4375,
+      "learning_rate": 9.790000000000001e-05,
+      "loss": 1.1094,
+      "step": 980
+    },
+    {
+      "epoch": 1.2328767123287672,
+      "grad_norm": 9.625,
+      "learning_rate": 9.89e-05,
+      "loss": 1.0557,
+      "step": 990
+    },
+    {
+      "epoch": 1.2453300124533002,
+      "grad_norm": 15.75,
+      "learning_rate": 9.99e-05,
+      "loss": 0.9635,
+      "step": 1000
+    },
+    {
+      "epoch": 1.2453300124533002,
+      "eval/acc": 34.88372039794922,
+      "step": 1000
+    },
+    {
+      "epoch": 1.2453300124533002,
+      "eval_loss": 2.967315435409546,
+      "eval_runtime": 0.2242,
+      "eval_samples_per_second": 191.798,
+      "eval_steps_per_second": 4.46,
+      "step": 1000
+    },
+    {
+      "epoch": 1.257783312577833,
+      "grad_norm": 11.8125,
+      "learning_rate": 9.99e-05,
+      "loss": 1.0067,
+      "step": 1010
+    },
+    {
+      "epoch": 1.270236612702366,
+      "grad_norm": 11.4375,
+      "learning_rate": 9.97888888888889e-05,
+      "loss": 1.0609,
+      "step": 1020
+    },
+    {
+      "epoch": 1.2826899128268991,
+      "grad_norm": 12.875,
+      "learning_rate": 9.967777777777779e-05,
+      "loss": 1.1566,
+      "step": 1030
+    },
+    {
+      "epoch": 1.2951432129514322,
+      "grad_norm": 10.625,
+      "learning_rate": 9.956666666666667e-05,
+      "loss": 1.1045,
+      "step": 1040
+    },
+    {
+      "epoch": 1.307596513075965,
+      "grad_norm": 10.0625,
+      "learning_rate": 9.945555555555555e-05,
+      "loss": 1.1421,
+      "step": 1050
+    },
+    {
+      "epoch": 1.320049813200498,
+      "grad_norm": 11.5625,
+      "learning_rate": 9.934444444444445e-05,
+      "loss": 1.0453,
+      "step": 1060
+    },
+    {
+      "epoch": 1.3325031133250311,
+      "grad_norm": 11.3125,
+      "learning_rate": 9.923333333333334e-05,
+      "loss": 1.0531,
+      "step": 1070
+    },
+    {
+      "epoch": 1.3449564134495642,
+      "grad_norm": 11.75,
+      "learning_rate": 9.912222222222222e-05,
+      "loss": 1.0286,
+      "step": 1080
+    },
+    {
+      "epoch": 1.3574097135740972,
+      "grad_norm": 11.3125,
+      "learning_rate": 9.901111111111112e-05,
+      "loss": 0.9549,
+      "step": 1090
+    },
+    {
+      "epoch": 1.36986301369863,
+      "grad_norm": 10.5625,
+      "learning_rate": 9.89e-05,
+      "loss": 1.006,
+      "step": 1100
+    },
+    {
+      "epoch": 1.36986301369863,
+      "eval/acc": 34.88372039794922,
+      "step": 1100
+    },
+    {
+      "epoch": 1.36986301369863,
+      "eval_loss": 2.9681856632232666,
+      "eval_runtime": 0.2343,
+      "eval_samples_per_second": 183.518,
+      "eval_steps_per_second": 4.268,
+      "step": 1100
+    },
+    {
+      "epoch": 1.3823163138231631,
+      "grad_norm": 13.4375,
+      "learning_rate": 9.87888888888889e-05,
+      "loss": 1.049,
+      "step": 1110
+    },
+    {
+      "epoch": 1.3947696139476962,
+      "grad_norm": 13.125,
+      "learning_rate": 9.867777777777777e-05,
+      "loss": 0.951,
+      "step": 1120
+    },
+    {
+      "epoch": 1.4072229140722292,
+      "grad_norm": 8.6875,
+      "learning_rate": 9.856666666666667e-05,
+      "loss": 1.0806,
+      "step": 1130
+    },
+    {
+      "epoch": 1.419676214196762,
+      "grad_norm": 11.8125,
+      "learning_rate": 9.845555555555556e-05,
+      "loss": 0.9683,
+      "step": 1140
+    },
+    {
+      "epoch": 1.432129514321295,
+      "grad_norm": 14.875,
+      "learning_rate": 9.834444444444446e-05,
+      "loss": 0.977,
+      "step": 1150
+    },
+    {
+      "epoch": 1.4445828144458281,
+      "grad_norm": 20.125,
+      "learning_rate": 9.823333333333333e-05,
+      "loss": 0.994,
+      "step": 1160
+    },
+    {
+      "epoch": 1.4570361145703612,
+      "grad_norm": 11.0,
+      "learning_rate": 9.812222222222223e-05,
+      "loss": 1.037,
+      "step": 1170
+    },
+    {
+      "epoch": 1.4694894146948942,
+      "grad_norm": 15.5,
+      "learning_rate": 9.801111111111112e-05,
+      "loss": 1.1605,
+      "step": 1180
+    },
+    {
+      "epoch": 1.481942714819427,
+      "grad_norm": 10.9375,
+      "learning_rate": 9.790000000000001e-05,
+      "loss": 1.0113,
+      "step": 1190
+    },
+    {
+      "epoch": 1.4943960149439601,
+      "grad_norm": 14.3125,
+      "learning_rate": 9.778888888888889e-05,
+      "loss": 0.9511,
+      "step": 1200
+    },
+    {
+      "epoch": 1.4943960149439601,
+      "eval/acc": 37.20930099487305,
+      "step": 1200
+    },
+    {
+      "epoch": 1.4943960149439601,
+      "eval_loss": 2.701927423477173,
+      "eval_runtime": 0.2099,
+      "eval_samples_per_second": 204.857,
+      "eval_steps_per_second": 4.764,
+      "step": 1200
+    },
+    {
+      "epoch": 1.5068493150684932,
+      "grad_norm": 11.9375,
+      "learning_rate": 9.767777777777778e-05,
+      "loss": 1.0408,
+      "step": 1210
+    },
+    {
+      "epoch": 1.519302615193026,
+      "grad_norm": 7.71875,
+      "learning_rate": 9.756666666666668e-05,
+      "loss": 0.9782,
+      "step": 1220
+    },
+    {
+      "epoch": 1.531755915317559,
+      "grad_norm": 7.5,
+      "learning_rate": 9.745555555555556e-05,
+      "loss": 1.0293,
+      "step": 1230
+    },
+    {
+      "epoch": 1.544209215442092,
+      "grad_norm": 9.6875,
+      "learning_rate": 9.734444444444444e-05,
+      "loss": 0.9718,
+      "step": 1240
+    },
+    {
+      "epoch": 1.5566625155666252,
+      "grad_norm": 11.0,
+      "learning_rate": 9.723333333333334e-05,
+      "loss": 1.0542,
+      "step": 1250
+    },
+    {
+      "epoch": 1.5691158156911582,
+      "grad_norm": 10.5,
+      "learning_rate": 9.712222222222223e-05,
+      "loss": 0.9537,
+      "step": 1260
+    },
+    {
+      "epoch": 1.5815691158156913,
+      "grad_norm": 13.1875,
+      "learning_rate": 9.701111111111111e-05,
+      "loss": 0.9756,
+      "step": 1270
+    },
+    {
+      "epoch": 1.5940224159402243,
+      "grad_norm": 9.9375,
+      "learning_rate": 9.69e-05,
+      "loss": 0.8843,
+      "step": 1280
+    },
+    {
+      "epoch": 1.6064757160647571,
+      "grad_norm": 10.0,
+      "learning_rate": 9.67888888888889e-05,
+      "loss": 0.8808,
+      "step": 1290
+    },
+    {
+      "epoch": 1.6189290161892902,
+      "grad_norm": 14.0,
+      "learning_rate": 9.667777777777778e-05,
+      "loss": 0.9589,
+      "step": 1300
+    },
+    {
+      "epoch": 1.6189290161892902,
+      "eval/acc": 39.53488540649414,
+      "step": 1300
+    },
+    {
+      "epoch": 1.6189290161892902,
+      "eval_loss": 2.7926037311553955,
+      "eval_runtime": 0.2238,
+      "eval_samples_per_second": 192.128,
+      "eval_steps_per_second": 4.468,
+      "step": 1300
+    },
+    {
+      "epoch": 1.631382316313823,
+      "grad_norm": 8.9375,
+      "learning_rate": 9.656666666666668e-05,
+      "loss": 0.9315,
+      "step": 1310
+    },
+    {
+      "epoch": 1.643835616438356,
+      "grad_norm": 10.0625,
+      "learning_rate": 9.645555555555556e-05,
+      "loss": 0.9295,
+      "step": 1320
+    },
+    {
+      "epoch": 1.6562889165628891,
+      "grad_norm": 8.75,
+      "learning_rate": 9.634444444444445e-05,
+      "loss": 0.9255,
+      "step": 1330
+    },
+    {
+      "epoch": 1.6687422166874222,
+      "grad_norm": 11.0625,
+      "learning_rate": 9.623333333333335e-05,
+      "loss": 0.9121,
+      "step": 1340
+    },
+    {
+      "epoch": 1.6811955168119552,
+      "grad_norm": 11.375,
+      "learning_rate": 9.612222222222223e-05,
+      "loss": 0.9232,
+      "step": 1350
+    },
+    {
+      "epoch": 1.6936488169364883,
+      "grad_norm": 11.875,
+      "learning_rate": 9.601111111111112e-05,
+      "loss": 0.8991,
+      "step": 1360
+    },
+    {
+      "epoch": 1.7061021170610213,
+      "grad_norm": 9.0,
+      "learning_rate": 9.59e-05,
+      "loss": 0.9405,
+      "step": 1370
+    },
+    {
+      "epoch": 1.7185554171855542,
+      "grad_norm": 11.875,
+      "learning_rate": 9.57888888888889e-05,
+      "loss": 1.0191,
+      "step": 1380
+    },
+    {
+      "epoch": 1.7310087173100872,
+      "grad_norm": 9.8125,
+      "learning_rate": 9.567777777777778e-05,
+      "loss": 0.9002,
+      "step": 1390
+    },
+    {
+      "epoch": 1.74346201743462,
+      "grad_norm": 12.375,
+      "learning_rate": 9.556666666666667e-05,
+      "loss": 0.9681,
+      "step": 1400
+    },
+    {
+      "epoch": 1.74346201743462,
+      "eval/acc": 39.53488540649414,
+      "step": 1400
+    },
+    {
+      "epoch": 1.74346201743462,
+      "eval_loss": 2.795476198196411,
+      "eval_runtime": 0.2152,
+      "eval_samples_per_second": 199.833,
+      "eval_steps_per_second": 4.647,
+      "step": 1400
+    },
+    {
+      "epoch": 1.755915317559153,
+      "grad_norm": 9.375,
+      "learning_rate": 9.545555555555557e-05,
+      "loss": 1.0222,
+      "step": 1410
+    },
+    {
+      "epoch": 1.7683686176836861,
+      "grad_norm": 9.5625,
+      "learning_rate": 9.534444444444445e-05,
+      "loss": 0.9005,
+      "step": 1420
+    },
+    {
+      "epoch": 1.7808219178082192,
+      "grad_norm": 9.875,
+      "learning_rate": 9.523333333333334e-05,
+      "loss": 0.9616,
+      "step": 1430
+    },
+    {
+      "epoch": 1.7932752179327522,
+      "grad_norm": 28.0,
+      "learning_rate": 9.512222222222222e-05,
+      "loss": 1.0197,
+      "step": 1440
+    },
+    {
+      "epoch": 1.8057285180572853,
+      "grad_norm": 10.75,
+      "learning_rate": 9.501111111111112e-05,
+      "loss": 0.9947,
+      "step": 1450
+    },
+    {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 10.125,
+      "learning_rate": 9.49e-05,
+      "loss": 0.9064,
+      "step": 1460
+    },
+    {
+      "epoch": 1.8306351183063512,
+      "grad_norm": 11.75,
+      "learning_rate": 9.478888888888889e-05,
+      "loss": 0.9425,
+      "step": 1470
+    },
+    {
+      "epoch": 1.8430884184308842,
+      "grad_norm": 10.625,
+      "learning_rate": 9.467777777777779e-05,
+      "loss": 1.0284,
+      "step": 1480
+    },
+    {
+      "epoch": 1.855541718555417,
+      "grad_norm": 10.125,
+      "learning_rate": 9.456666666666667e-05,
+      "loss": 0.9175,
+      "step": 1490
+    },
+    {
+      "epoch": 1.86799501867995,
+      "grad_norm": 8.375,
+      "learning_rate": 9.445555555555557e-05,
+      "loss": 0.8608,
+      "step": 1500
+    },
+    {
+      "epoch": 1.86799501867995,
+      "eval/acc": 39.53488540649414,
+      "step": 1500
+    },
+    {
+      "epoch": 1.86799501867995,
+      "eval_loss": 2.8291714191436768,
+      "eval_runtime": 0.216,
+      "eval_samples_per_second": 199.031,
+      "eval_steps_per_second": 4.629,
+      "step": 1500
+    },
+    {
+      "epoch": 1.8804483188044832,
+      "grad_norm": 9.625,
+      "learning_rate": 9.434444444444444e-05,
+      "loss": 0.9695,
+      "step": 1510
+    },
+    {
+      "epoch": 1.8929016189290162,
+      "grad_norm": 9.8125,
+      "learning_rate": 9.423333333333334e-05,
+      "loss": 0.9924,
+      "step": 1520
+    },
+    {
+      "epoch": 1.9053549190535493,
+      "grad_norm": 10.0,
+      "learning_rate": 9.412222222222222e-05,
+      "loss": 1.0733,
+      "step": 1530
+    },
+    {
+      "epoch": 1.9178082191780823,
+      "grad_norm": 10.25,
+      "learning_rate": 9.401111111111112e-05,
+      "loss": 0.8818,
+      "step": 1540
+    },
+    {
+      "epoch": 1.9302615193026154,
+      "grad_norm": 15.3125,
+      "learning_rate": 9.39e-05,
+      "loss": 0.9053,
+      "step": 1550
+    },
+    {
+      "epoch": 1.9427148194271482,
+      "grad_norm": 8.25,
+      "learning_rate": 9.378888888888889e-05,
+      "loss": 0.8586,
+      "step": 1560
+    },
+    {
+      "epoch": 1.9551681195516812,
+      "grad_norm": 17.5,
+      "learning_rate": 9.367777777777779e-05,
+      "loss": 0.9316,
+      "step": 1570
+    },
+    {
+      "epoch": 1.967621419676214,
+      "grad_norm": 10.875,
+      "learning_rate": 9.356666666666667e-05,
+      "loss": 1.0195,
+      "step": 1580
+    },
+    {
+      "epoch": 1.9800747198007471,
+      "grad_norm": 9.1875,
+      "learning_rate": 9.345555555555556e-05,
+      "loss": 0.8878,
+      "step": 1590
+    },
+    {
+      "epoch": 1.9925280199252802,
+      "grad_norm": 10.3125,
+      "learning_rate": 9.334444444444444e-05,
+      "loss": 0.9765,
+      "step": 1600
+    },
+    {
+      "epoch": 1.9925280199252802,
+      "eval/acc": 37.20930099487305,
+      "step": 1600
+    },
+    {
+      "epoch": 1.9925280199252802,
+      "eval_loss": 2.9084553718566895,
+      "eval_runtime": 0.2099,
+      "eval_samples_per_second": 204.856,
+      "eval_steps_per_second": 4.764,
+      "step": 1600
+    },
+    {
+      "epoch": 2.004981320049813,
+      "grad_norm": 8.6875,
+      "learning_rate": 9.323333333333334e-05,
+      "loss": 0.8596,
+      "step": 1610
+    },
+    {
+      "epoch": 2.0174346201743463,
+      "grad_norm": 12.0625,
+      "learning_rate": 9.312222222222223e-05,
+      "loss": 0.9156,
+      "step": 1620
+    },
+    {
+      "epoch": 2.0298879202988793,
+      "grad_norm": 10.1875,
+      "learning_rate": 9.301111111111111e-05,
+      "loss": 0.8404,
+      "step": 1630
+    },
+    {
+      "epoch": 2.0423412204234124,
+      "grad_norm": 10.125,
+      "learning_rate": 9.290000000000001e-05,
+      "loss": 0.8111,
+      "step": 1640
+    },
+    {
+      "epoch": 2.0547945205479454,
+      "grad_norm": 10.625,
+      "learning_rate": 9.278888888888889e-05,
+      "loss": 0.8124,
+      "step": 1650
+    },
+    {
+      "epoch": 2.067247820672478,
+      "grad_norm": 10.0,
+      "learning_rate": 9.267777777777779e-05,
+      "loss": 0.8124,
+      "step": 1660
+    },
+    {
+      "epoch": 2.079701120797011,
+      "grad_norm": 10.75,
+      "learning_rate": 9.256666666666666e-05,
+      "loss": 0.8384,
+      "step": 1670
+    },
+    {
+      "epoch": 2.092154420921544,
+      "grad_norm": 8.3125,
+      "learning_rate": 9.245555555555556e-05,
+      "loss": 0.8734,
+      "step": 1680
+    },
+    {
+      "epoch": 2.104607721046077,
+      "grad_norm": 8.6875,
+      "learning_rate": 9.234444444444445e-05,
+      "loss": 0.7674,
+      "step": 1690
+    },
+    {
+      "epoch": 2.1170610211706102,
+      "grad_norm": 6.6875,
+      "learning_rate": 9.223333333333334e-05,
+      "loss": 0.8514,
+      "step": 1700
+    },
+    {
+      "epoch": 2.1170610211706102,
+      "eval/acc": 48.83720779418945,
+      "step": 1700
+    },
+    {
+      "epoch": 2.1170610211706102,
+      "eval_loss": 1.9776915311813354,
+      "eval_runtime": 1.2116,
+      "eval_samples_per_second": 35.491,
+      "eval_steps_per_second": 0.825,
+      "step": 1700
+    },
+    {
+      "epoch": 2.1295143212951433,
+      "grad_norm": 11.3125,
+      "learning_rate": 9.212222222222223e-05,
+      "loss": 0.8502,
+      "step": 1710
+    },
+    {
+      "epoch": 2.1419676214196763,
+      "grad_norm": 12.0625,
+      "learning_rate": 9.201111111111111e-05,
+      "loss": 0.9026,
+      "step": 1720
+    },
+    {
+      "epoch": 2.1544209215442094,
+      "grad_norm": 8.3125,
+      "learning_rate": 9.190000000000001e-05,
+      "loss": 0.7893,
+      "step": 1730
+    },
+    {
+      "epoch": 2.166874221668742,
+      "grad_norm": 14.5625,
+      "learning_rate": 9.17888888888889e-05,
+      "loss": 0.7671,
+      "step": 1740
+    },
+    {
+      "epoch": 2.179327521793275,
+      "grad_norm": 11.25,
+      "learning_rate": 9.167777777777778e-05,
+      "loss": 0.7869,
+      "step": 1750
+    },
+    {
+      "epoch": 2.191780821917808,
+      "grad_norm": 8.625,
+      "learning_rate": 9.156666666666667e-05,
+      "loss": 0.8251,
+      "step": 1760
+    },
+    {
+      "epoch": 2.204234122042341,
+      "grad_norm": 7.8125,
+      "learning_rate": 9.145555555555556e-05,
+      "loss": 0.7838,
+      "step": 1770
+    },
+    {
+      "epoch": 2.216687422166874,
+      "grad_norm": 11.6875,
+      "learning_rate": 9.134444444444445e-05,
+      "loss": 0.8348,
+      "step": 1780
+    },
+    {
+      "epoch": 2.2291407222914073,
+      "grad_norm": 9.75,
+      "learning_rate": 9.123333333333333e-05,
+      "loss": 0.8322,
+      "step": 1790
+    },
+    {
+      "epoch": 2.2415940224159403,
+      "grad_norm": 9.25,
+      "learning_rate": 9.112222222222223e-05,
+      "loss": 0.8514,
+      "step": 1800
+    },
+    {
+      "epoch": 2.2415940224159403,
+      "eval/acc": 48.83720779418945,
+      "step": 1800
+    },
+    {
+      "epoch": 2.2415940224159403,
+      "eval_loss": 1.968414306640625,
+      "eval_runtime": 0.2153,
+      "eval_samples_per_second": 199.733,
+      "eval_steps_per_second": 4.645,
+      "step": 1800
+    },
+    {
+      "epoch": 2.2540473225404734,
+      "grad_norm": 12.6875,
+      "learning_rate": 9.101111111111112e-05,
+      "loss": 0.7841,
+      "step": 1810
+    },
+    {
+      "epoch": 2.2665006226650064,
+      "grad_norm": 7.09375,
+      "learning_rate": 9.090000000000001e-05,
+      "loss": 0.7889,
+      "step": 1820
+    },
+    {
+      "epoch": 2.2789539227895395,
+      "grad_norm": 8.1875,
+      "learning_rate": 9.078888888888889e-05,
+      "loss": 0.8088,
+      "step": 1830
+    },
+    {
+      "epoch": 2.291407222914072,
+      "grad_norm": 12.3125,
+      "learning_rate": 9.067777777777778e-05,
+      "loss": 0.8247,
+      "step": 1840
+    },
+    {
+      "epoch": 2.303860523038605,
+      "grad_norm": 7.40625,
+      "learning_rate": 9.056666666666667e-05,
+      "loss": 0.7383,
+      "step": 1850
+    },
+    {
+      "epoch": 2.316313823163138,
+      "grad_norm": 8.5,
+      "learning_rate": 9.045555555555557e-05,
+      "loss": 0.8074,
+      "step": 1860
+    },
+    {
+      "epoch": 2.328767123287671,
+      "grad_norm": 8.625,
+      "learning_rate": 9.034444444444445e-05,
+      "loss": 0.7866,
+      "step": 1870
+    },
+    {
+      "epoch": 2.3412204234122043,
+      "grad_norm": 10.1875,
+      "learning_rate": 9.023333333333334e-05,
+      "loss": 0.8159,
+      "step": 1880
+    },
+    {
+      "epoch": 2.3536737235367373,
+      "grad_norm": 9.875,
+      "learning_rate": 9.012222222222223e-05,
+      "loss": 0.831,
+      "step": 1890
+    },
+    {
+      "epoch": 2.3661270236612704,
+      "grad_norm": 11.0,
+      "learning_rate": 9.001111111111112e-05,
+      "loss": 0.7215,
+      "step": 1900
+    },
+    {
+      "epoch": 2.3661270236612704,
+      "eval/acc": 48.83720779418945,
+      "step": 1900
+    },
+    {
+      "epoch": 2.3661270236612704,
+      "eval_loss": 1.9242758750915527,
+      "eval_runtime": 0.2274,
+      "eval_samples_per_second": 189.058,
+      "eval_steps_per_second": 4.397,
+      "step": 1900
+    },
+    {
+      "epoch": 2.3785803237858034,
+      "grad_norm": 9.8125,
+      "learning_rate": 8.99e-05,
+      "loss": 0.8346,
+      "step": 1910
+    },
+    {
+      "epoch": 2.391033623910336,
+      "grad_norm": 9.9375,
+      "learning_rate": 8.978888888888889e-05,
+      "loss": 0.8141,
+      "step": 1920
+    },
+    {
+      "epoch": 2.403486924034869,
+      "grad_norm": 10.3125,
+      "learning_rate": 8.967777777777779e-05,
+      "loss": 0.911,
+      "step": 1930
+    },
+    {
+      "epoch": 2.415940224159402,
+      "grad_norm": 9.75,
+      "learning_rate": 8.956666666666667e-05,
+      "loss": 0.9486,
+      "step": 1940
+    },
+    {
+      "epoch": 2.428393524283935,
+      "grad_norm": 9.25,
+      "learning_rate": 8.945555555555556e-05,
+      "loss": 0.8775,
+      "step": 1950
+    },
+    {
+      "epoch": 2.4408468244084682,
+      "grad_norm": 8.0,
+      "learning_rate": 8.934444444444445e-05,
+      "loss": 0.8373,
+      "step": 1960
+    },
+    {
+      "epoch": 2.4533001245330013,
+      "grad_norm": 7.625,
+      "learning_rate": 8.923333333333334e-05,
+      "loss": 0.7469,
+      "step": 1970
+    },
+    {
+      "epoch": 2.4657534246575343,
+      "grad_norm": 37.75,
+      "learning_rate": 8.912222222222222e-05,
+      "loss": 0.7934,
+      "step": 1980
+    },
+    {
+      "epoch": 2.4782067247820674,
+      "grad_norm": 9.125,
+      "learning_rate": 8.901111111111111e-05,
+      "loss": 0.7733,
+      "step": 1990
+    },
+    {
+      "epoch": 2.4906600249066004,
+      "grad_norm": 8.8125,
+      "learning_rate": 8.89e-05,
+      "loss": 0.7488,
+      "step": 2000
+    },
+    {
+      "epoch": 2.4906600249066004,
+      "eval/acc": 48.83720779418945,
+      "step": 2000
+    },
+    {
+      "epoch": 2.4906600249066004,
+      "eval_loss": 1.8490980863571167,
+      "eval_runtime": 0.2184,
+      "eval_samples_per_second": 196.883,
+      "eval_steps_per_second": 4.579,
+      "step": 2000
+    },
+    {
+      "epoch": 2.5031133250311335,
+      "grad_norm": 8.0,
+      "learning_rate": 8.878888888888889e-05,
+      "loss": 0.8461,
+      "step": 2010
+    },
+    {
+      "epoch": 2.515566625155666,
+      "grad_norm": 8.75,
+      "learning_rate": 8.867777777777778e-05,
+      "loss": 0.7647,
+      "step": 2020
+    },
+    {
+      "epoch": 2.528019925280199,
+      "grad_norm": 8.8125,
+      "learning_rate": 8.856666666666667e-05,
+      "loss": 0.796,
+      "step": 2030
+    },
+    {
+      "epoch": 2.540473225404732,
+      "grad_norm": 7.78125,
+      "learning_rate": 8.845555555555556e-05,
+      "loss": 0.7758,
+      "step": 2040
+    },
+    {
+      "epoch": 2.5529265255292652,
+      "grad_norm": 7.75,
+      "learning_rate": 8.834444444444446e-05,
+      "loss": 0.7753,
+      "step": 2050
+    },
+    {
+      "epoch": 2.5653798256537983,
+      "grad_norm": 8.9375,
+      "learning_rate": 8.823333333333334e-05,
+      "loss": 0.6914,
+      "step": 2060
+    },
+    {
+      "epoch": 2.5778331257783313,
+      "grad_norm": 9.4375,
+      "learning_rate": 8.812222222222223e-05,
+      "loss": 0.787,
+      "step": 2070
+    },
+    {
+      "epoch": 2.5902864259028644,
+      "grad_norm": 8.125,
+      "learning_rate": 8.801111111111111e-05,
+      "loss": 0.7742,
+      "step": 2080
+    },
+    {
+      "epoch": 2.602739726027397,
+      "grad_norm": 10.6875,
+      "learning_rate": 8.790000000000001e-05,
+      "loss": 0.7528,
+      "step": 2090
+    },
+    {
+      "epoch": 2.61519302615193,
+      "grad_norm": 9.1875,
+      "learning_rate": 8.77888888888889e-05,
+      "loss": 0.7392,
+      "step": 2100
+    },
+    {
+      "epoch": 2.61519302615193,
+      "eval/acc": 46.511627197265625,
+      "step": 2100
+    },
+    {
+      "epoch": 2.61519302615193,
+      "eval_loss": 1.9725399017333984,
+      "eval_runtime": 0.214,
+      "eval_samples_per_second": 200.925,
+      "eval_steps_per_second": 4.673,
+      "step": 2100
+    },
+    {
+      "epoch": 2.627646326276463,
+      "grad_norm": 9.375,
+      "learning_rate": 8.767777777777778e-05,
+      "loss": 0.7993,
+      "step": 2110
+    },
+    {
+      "epoch": 2.640099626400996,
+      "grad_norm": 9.0625,
+      "learning_rate": 8.756666666666668e-05,
+      "loss": 0.854,
+      "step": 2120
+    },
+    {
+      "epoch": 2.652552926525529,
+      "grad_norm": 10.625,
+      "learning_rate": 8.745555555555556e-05,
+      "loss": 0.8887,
+      "step": 2130
+    },
+    {
+      "epoch": 2.6650062266500623,
+      "grad_norm": 7.75,
+      "learning_rate": 8.734444444444445e-05,
+      "loss": 0.7407,
+      "step": 2140
+    },
+    {
+      "epoch": 2.6774595267745953,
+      "grad_norm": 10.75,
+      "learning_rate": 8.723333333333333e-05,
+      "loss": 0.9187,
+      "step": 2150
+    },
+    {
+      "epoch": 2.6899128268991284,
+      "grad_norm": 7.71875,
+      "learning_rate": 8.712222222222223e-05,
+      "loss": 0.7804,
+      "step": 2160
+    },
+    {
+      "epoch": 2.7023661270236614,
+      "grad_norm": 7.34375,
+      "learning_rate": 8.701111111111111e-05,
+      "loss": 0.7368,
+      "step": 2170
+    },
+    {
+      "epoch": 2.7148194271481945,
+      "grad_norm": 10.0625,
+      "learning_rate": 8.69e-05,
+      "loss": 0.7027,
+      "step": 2180
+    },
+    {
+      "epoch": 2.7272727272727275,
+      "grad_norm": 12.875,
+      "learning_rate": 8.67888888888889e-05,
+      "loss": 0.8305,
+      "step": 2190
+    },
+    {
+      "epoch": 2.73972602739726,
+      "grad_norm": 9.125,
+      "learning_rate": 8.667777777777778e-05,
+      "loss": 0.7767,
+      "step": 2200
+    },
+    {
+      "epoch": 2.73972602739726,
+      "eval/acc": 48.83720779418945,
+      "step": 2200
+    },
+    {
+      "epoch": 2.73972602739726,
+      "eval_loss": 1.8356798887252808,
+      "eval_runtime": 0.2116,
+      "eval_samples_per_second": 203.247,
+      "eval_steps_per_second": 4.727,
+      "step": 2200
+    },
+    {
+      "epoch": 2.752179327521793,
+      "grad_norm": 8.4375,
+      "learning_rate": 8.656666666666668e-05,
+      "loss": 0.7547,
+      "step": 2210
+    },
+    {
+      "epoch": 2.7646326276463262,
+      "grad_norm": 7.5,
+      "learning_rate": 8.645555555555555e-05,
+      "loss": 0.8497,
+      "step": 2220
+    },
+    {
+      "epoch": 2.7770859277708593,
+      "grad_norm": 10.0625,
+      "learning_rate": 8.634444444444445e-05,
+      "loss": 0.8024,
+      "step": 2230
+    },
+    {
+      "epoch": 2.7895392278953923,
+      "grad_norm": 13.5,
+      "learning_rate": 8.623333333333333e-05,
+      "loss": 0.7806,
+      "step": 2240
+    },
+    {
+      "epoch": 2.8019925280199254,
+      "grad_norm": 10.8125,
+      "learning_rate": 8.612222222222223e-05,
+      "loss": 0.7021,
+      "step": 2250
+    },
+    {
+      "epoch": 2.8144458281444584,
+      "grad_norm": 9.3125,
+      "learning_rate": 8.601111111111112e-05,
+      "loss": 0.72,
+      "step": 2260
+    },
+    {
+      "epoch": 2.826899128268991,
+      "grad_norm": 8.875,
+      "learning_rate": 8.59e-05,
+      "loss": 0.8063,
+      "step": 2270
+    },
+    {
+      "epoch": 2.839352428393524,
+      "grad_norm": 8.75,
+      "learning_rate": 8.57888888888889e-05,
+      "loss": 0.8264,
+      "step": 2280
+    },
+    {
+      "epoch": 2.851805728518057,
+      "grad_norm": 8.75,
+      "learning_rate": 8.567777777777778e-05,
+      "loss": 0.814,
+      "step": 2290
+    },
+    {
+      "epoch": 2.86425902864259,
+      "grad_norm": 10.25,
+      "learning_rate": 8.556666666666667e-05,
+      "loss": 0.7985,
+      "step": 2300
+    },
+    {
+      "epoch": 2.86425902864259,
+      "eval/acc": 51.16279220581055,
+      "step": 2300
+    },
+    {
+      "epoch": 2.86425902864259,
+      "eval_loss": 1.9056586027145386,
+      "eval_runtime": 0.221,
+      "eval_samples_per_second": 194.606,
+      "eval_steps_per_second": 4.526,
+      "step": 2300
+    },
+    {
+      "epoch": 2.8767123287671232,
+      "grad_norm": 8.6875,
+      "learning_rate": 8.545555555555555e-05,
+      "loss": 0.7489,
+      "step": 2310
+    },
+    {
+      "epoch": 2.8891656288916563,
+      "grad_norm": 9.25,
+      "learning_rate": 8.534444444444445e-05,
+      "loss": 0.8398,
+      "step": 2320
+    },
+    {
+      "epoch": 2.9016189290161893,
+      "grad_norm": 8.8125,
+      "learning_rate": 8.523333333333334e-05,
+      "loss": 0.7808,
+      "step": 2330
+    },
+    {
+      "epoch": 2.9140722291407224,
+      "grad_norm": 8.625,
+      "learning_rate": 8.512222222222222e-05,
+      "loss": 0.8163,
+      "step": 2340
+    },
+    {
+      "epoch": 2.9265255292652554,
+      "grad_norm": 13.9375,
+      "learning_rate": 8.501111111111112e-05,
+      "loss": 0.8038,
+      "step": 2350
+    },
+    {
+      "epoch": 2.9389788293897885,
+      "grad_norm": 11.8125,
+      "learning_rate": 8.49e-05,
+      "loss": 0.7362,
+      "step": 2360
+    },
+    {
+      "epoch": 2.9514321295143215,
+      "grad_norm": 12.0625,
+      "learning_rate": 8.47888888888889e-05,
+      "loss": 0.8096,
+      "step": 2370
+    },
+    {
+      "epoch": 2.963885429638854,
+      "grad_norm": 10.4375,
+      "learning_rate": 8.467777777777777e-05,
+      "loss": 0.7728,
+      "step": 2380
+    },
+    {
+      "epoch": 2.976338729763387,
+      "grad_norm": 11.875,
+      "learning_rate": 8.456666666666667e-05,
+      "loss": 0.8224,
+      "step": 2390
+    },
+    {
+      "epoch": 2.9887920298879203,
+      "grad_norm": 8.375,
+      "learning_rate": 8.445555555555556e-05,
+      "loss": 0.8418,
+      "step": 2400
+    },
+    {
+      "epoch": 2.9887920298879203,
+      "eval/acc": 46.511627197265625,
+      "step": 2400
+    },
+    {
+      "epoch": 2.9887920298879203,
+      "eval_loss": 1.9594019651412964,
+      "eval_runtime": 0.2169,
+      "eval_samples_per_second": 198.209,
+      "eval_steps_per_second": 4.61,
+      "step": 2400
+    },
+    {
+      "epoch": 3.0012453300124533,
+      "grad_norm": 6.78125,
+      "learning_rate": 8.434444444444445e-05,
+      "loss": 0.7483,
+      "step": 2410
+    },
+    {
+      "epoch": 3.0136986301369864,
+      "grad_norm": 8.75,
+      "learning_rate": 8.423333333333334e-05,
+      "loss": 0.7396,
+      "step": 2420
+    },
+    {
+      "epoch": 3.0261519302615194,
+      "grad_norm": 15.6875,
+      "learning_rate": 8.412222222222222e-05,
+      "loss": 0.7436,
+      "step": 2430
+    },
+    {
+      "epoch": 3.0386052303860525,
+      "grad_norm": 8.5625,
+      "learning_rate": 8.401111111111112e-05,
+      "loss": 0.6092,
+      "step": 2440
+    },
+    {
+      "epoch": 3.0510585305105855,
+      "grad_norm": 11.0,
+      "learning_rate": 8.39e-05,
+      "loss": 0.7142,
+      "step": 2450
+    },
+    {
+      "epoch": 3.063511830635118,
+      "grad_norm": 11.8125,
+      "learning_rate": 8.378888888888889e-05,
+      "loss": 0.692,
+      "step": 2460
+    },
+    {
+      "epoch": 3.075965130759651,
+      "grad_norm": 10.8125,
+      "learning_rate": 8.367777777777778e-05,
+      "loss": 0.672,
+      "step": 2470
+    },
+    {
+      "epoch": 3.088418430884184,
+      "grad_norm": 9.0625,
+      "learning_rate": 8.356666666666667e-05,
+      "loss": 0.6947,
+      "step": 2480
+    },
+    {
+      "epoch": 3.1008717310087173,
+      "grad_norm": 9.0625,
+      "learning_rate": 8.345555555555556e-05,
+      "loss": 0.7188,
+      "step": 2490
+    },
+    {
+      "epoch": 3.1133250311332503,
+      "grad_norm": 7.125,
+      "learning_rate": 8.334444444444444e-05,
+      "loss": 0.6621,
+      "step": 2500
+    },
+    {
+      "epoch": 3.1133250311332503,
+      "eval/acc": 41.86046600341797,
+      "step": 2500
+    },
+    {
+      "epoch": 3.1133250311332503,
+      "eval_loss": 2.2108497619628906,
+      "eval_runtime": 2.0608,
+      "eval_samples_per_second": 20.866,
+      "eval_steps_per_second": 0.485,
+      "step": 2500
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 10000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 13,
+  "save_steps": 2500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-2500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61b297d5e64c898db22dbb0b2c7feb17b604dfbcc3bfebf55e9e7ecbf9c3794c
+size 6161

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "architectures": [
+    "ModernBertModel"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "dtype": "bfloat16",
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "repad_logits_with_grad": false,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "transformers_version": "4.57.1",
+  "vocab_size": 50368
+}

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4c8ced7bda17e694a323b59668f51b10dbaf3dd3577d631459b9cb69ef78adb7
+size 298041696

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:edb64b8cef0a3dfc25477198f91d7185d0407eac4e6e1ff0b31a40675c252898
+size 596170443

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5e02ca9ae115ea16fec032391633efd7a900f47635f27c79cf4d01a0dec960d3
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:98fc86822b150867dd155dd03d026ce3dd7af59775e2a5feacb7751718cd127c
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15e20421b6d20eab5ea415631e2b4770e15ae33eba8329cddc9b7141c145aee0
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c654a7dfacad68a4b4444888a91f6df0461c57090ffb3da9c95f3de4477f1988
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:506f6e39bd983d811639cf9d5aea75be4643e6c5adeffc1e40a2ab6e23817ea8
+size 1465

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,4184 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 6.226650062266501,
+  "eval_steps": 100,
+  "global_step": 5000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.012453300124533,
+      "grad_norm": 15.75,
+      "learning_rate": 9e-07,
+      "loss": 5.5753,
+      "step": 10
+    },
+    {
+      "epoch": 0.024906600249066,
+      "grad_norm": 25.25,
+      "learning_rate": 1.9e-06,
+      "loss": 5.5776,
+      "step": 20
+    },
+    {
+      "epoch": 0.037359900373599,
+      "grad_norm": 16.25,
+      "learning_rate": 2.9e-06,
+      "loss": 5.5572,
+      "step": 30
+    },
+    {
+      "epoch": 0.049813200498132,
+      "grad_norm": 23.5,
+      "learning_rate": 3.9e-06,
+      "loss": 5.5201,
+      "step": 40
+    },
+    {
+      "epoch": 0.062266500622665005,
+      "grad_norm": 18.0,
+      "learning_rate": 4.9000000000000005e-06,
+      "loss": 5.6297,
+      "step": 50
+    },
+    {
+      "epoch": 0.074719800747198,
+      "grad_norm": 19.125,
+      "learning_rate": 5.9e-06,
+      "loss": 5.5889,
+      "step": 60
+    },
+    {
+      "epoch": 0.08717310087173101,
+      "grad_norm": 23.125,
+      "learning_rate": 6.900000000000001e-06,
+      "loss": 5.4949,
+      "step": 70
+    },
+    {
+      "epoch": 0.099626400996264,
+      "grad_norm": 20.25,
+      "learning_rate": 7.9e-06,
+      "loss": 5.552,
+      "step": 80
+    },
+    {
+      "epoch": 0.11207970112079702,
+      "grad_norm": 17.0,
+      "learning_rate": 8.9e-06,
+      "loss": 5.4765,
+      "step": 90
+    },
+    {
+      "epoch": 0.12453300124533001,
+      "grad_norm": 20.125,
+      "learning_rate": 9.900000000000002e-06,
+      "loss": 5.4519,
+      "step": 100
+    },
+    {
+      "epoch": 0.12453300124533001,
+      "eval/acc": 2.3255813121795654,
+      "step": 100
+    },
+    {
+      "epoch": 0.12453300124533001,
+      "eval_loss": 4.970202445983887,
+      "eval_runtime": 2.822,
+      "eval_samples_per_second": 15.237,
+      "eval_steps_per_second": 0.354,
+      "step": 100
+    },
+    {
+      "epoch": 0.136986301369863,
+      "grad_norm": 22.125,
+      "learning_rate": 1.09e-05,
+      "loss": 5.3401,
+      "step": 110
+    },
+    {
+      "epoch": 0.149439601494396,
+      "grad_norm": 16.25,
+      "learning_rate": 1.19e-05,
+      "loss": 5.3088,
+      "step": 120
+    },
+    {
+      "epoch": 0.16189290161892902,
+      "grad_norm": 18.75,
+      "learning_rate": 1.29e-05,
+      "loss": 5.1442,
+      "step": 130
+    },
+    {
+      "epoch": 0.17434620174346202,
+      "grad_norm": 19.5,
+      "learning_rate": 1.3900000000000002e-05,
+      "loss": 5.0218,
+      "step": 140
+    },
+    {
+      "epoch": 0.18679950186799502,
+      "grad_norm": 25.75,
+      "learning_rate": 1.49e-05,
+      "loss": 4.8711,
+      "step": 150
+    },
+    {
+      "epoch": 0.199252801992528,
+      "grad_norm": 25.625,
+      "learning_rate": 1.59e-05,
+      "loss": 4.6046,
+      "step": 160
+    },
+    {
+      "epoch": 0.21170610211706103,
+      "grad_norm": 28.25,
+      "learning_rate": 1.69e-05,
+      "loss": 4.2891,
+      "step": 170
+    },
+    {
+      "epoch": 0.22415940224159403,
+      "grad_norm": 25.25,
+      "learning_rate": 1.79e-05,
+      "loss": 3.8055,
+      "step": 180
+    },
+    {
+      "epoch": 0.23661270236612703,
+      "grad_norm": 28.0,
+      "learning_rate": 1.8900000000000002e-05,
+      "loss": 3.4139,
+      "step": 190
+    },
+    {
+      "epoch": 0.24906600249066002,
+      "grad_norm": 29.5,
+      "learning_rate": 1.9900000000000003e-05,
+      "loss": 2.974,
+      "step": 200
+    },
+    {
+      "epoch": 0.24906600249066002,
+      "eval/acc": 11.627906799316406,
+      "step": 200
+    },
+    {
+      "epoch": 0.24906600249066002,
+      "eval_loss": 3.7072134017944336,
+      "eval_runtime": 0.2742,
+      "eval_samples_per_second": 156.807,
+      "eval_steps_per_second": 3.647,
+      "step": 200
+    },
+    {
+      "epoch": 0.261519302615193,
+      "grad_norm": 30.5,
+      "learning_rate": 2.09e-05,
+      "loss": 2.8723,
+      "step": 210
+    },
+    {
+      "epoch": 0.273972602739726,
+      "grad_norm": 19.625,
+      "learning_rate": 2.19e-05,
+      "loss": 2.6908,
+      "step": 220
+    },
+    {
+      "epoch": 0.286425902864259,
+      "grad_norm": 18.25,
+      "learning_rate": 2.29e-05,
+      "loss": 2.4715,
+      "step": 230
+    },
+    {
+      "epoch": 0.298879202988792,
+      "grad_norm": 16.75,
+      "learning_rate": 2.39e-05,
+      "loss": 2.4336,
+      "step": 240
+    },
+    {
+      "epoch": 0.31133250311332505,
+      "grad_norm": 16.875,
+      "learning_rate": 2.4900000000000002e-05,
+      "loss": 2.3797,
+      "step": 250
+    },
+    {
+      "epoch": 0.32378580323785805,
+      "grad_norm": 18.375,
+      "learning_rate": 2.5900000000000003e-05,
+      "loss": 2.2765,
+      "step": 260
+    },
+    {
+      "epoch": 0.33623910336239105,
+      "grad_norm": 13.625,
+      "learning_rate": 2.6900000000000003e-05,
+      "loss": 2.1124,
+      "step": 270
+    },
+    {
+      "epoch": 0.34869240348692404,
+      "grad_norm": 19.5,
+      "learning_rate": 2.7900000000000004e-05,
+      "loss": 2.0748,
+      "step": 280
+    },
+    {
+      "epoch": 0.36114570361145704,
+      "grad_norm": 18.0,
+      "learning_rate": 2.8899999999999998e-05,
+      "loss": 2.1575,
+      "step": 290
+    },
+    {
+      "epoch": 0.37359900373599003,
+      "grad_norm": 34.0,
+      "learning_rate": 2.9900000000000002e-05,
+      "loss": 2.1195,
+      "step": 300
+    },
+    {
+      "epoch": 0.37359900373599003,
+      "eval/acc": 23.255813598632812,
+      "step": 300
+    },
+    {
+      "epoch": 0.37359900373599003,
+      "eval_loss": 3.1418063640594482,
+      "eval_runtime": 1.1652,
+      "eval_samples_per_second": 36.903,
+      "eval_steps_per_second": 0.858,
+      "step": 300
+    },
+    {
+      "epoch": 0.386052303860523,
+      "grad_norm": 19.125,
+      "learning_rate": 3.09e-05,
+      "loss": 2.1052,
+      "step": 310
+    },
+    {
+      "epoch": 0.398505603985056,
+      "grad_norm": 20.375,
+      "learning_rate": 3.19e-05,
+      "loss": 1.8924,
+      "step": 320
+    },
+    {
+      "epoch": 0.410958904109589,
+      "grad_norm": 17.125,
+      "learning_rate": 3.29e-05,
+      "loss": 2.025,
+      "step": 330
+    },
+    {
+      "epoch": 0.42341220423412207,
+      "grad_norm": 28.0,
+      "learning_rate": 3.3900000000000004e-05,
+      "loss": 1.8914,
+      "step": 340
+    },
+    {
+      "epoch": 0.43586550435865506,
+      "grad_norm": 22.125,
+      "learning_rate": 3.49e-05,
+      "loss": 1.8864,
+      "step": 350
+    },
+    {
+      "epoch": 0.44831880448318806,
+      "grad_norm": 34.0,
+      "learning_rate": 3.59e-05,
+      "loss": 1.8447,
+      "step": 360
+    },
+    {
+      "epoch": 0.46077210460772106,
+      "grad_norm": 15.4375,
+      "learning_rate": 3.69e-05,
+      "loss": 1.7981,
+      "step": 370
+    },
+    {
+      "epoch": 0.47322540473225405,
+      "grad_norm": 39.25,
+      "learning_rate": 3.79e-05,
+      "loss": 1.6967,
+      "step": 380
+    },
+    {
+      "epoch": 0.48567870485678705,
+      "grad_norm": 35.25,
+      "learning_rate": 3.8900000000000004e-05,
+      "loss": 1.7919,
+      "step": 390
+    },
+    {
+      "epoch": 0.49813200498132004,
+      "grad_norm": 19.875,
+      "learning_rate": 3.99e-05,
+      "loss": 1.6083,
+      "step": 400
+    },
+    {
+      "epoch": 0.49813200498132004,
+      "eval/acc": 27.9069766998291,
+      "step": 400
+    },
+    {
+      "epoch": 0.49813200498132004,
+      "eval_loss": 2.988025665283203,
+      "eval_runtime": 0.2197,
+      "eval_samples_per_second": 195.684,
+      "eval_steps_per_second": 4.551,
+      "step": 400
+    },
+    {
+      "epoch": 0.5105853051058531,
+      "grad_norm": 14.1875,
+      "learning_rate": 4.09e-05,
+      "loss": 1.7039,
+      "step": 410
+    },
+    {
+      "epoch": 0.523038605230386,
+      "grad_norm": 33.25,
+      "learning_rate": 4.19e-05,
+      "loss": 1.7057,
+      "step": 420
+    },
+    {
+      "epoch": 0.5354919053549191,
+      "grad_norm": 15.5,
+      "learning_rate": 4.29e-05,
+      "loss": 1.6425,
+      "step": 430
+    },
+    {
+      "epoch": 0.547945205479452,
+      "grad_norm": 29.625,
+      "learning_rate": 4.39e-05,
+      "loss": 1.4995,
+      "step": 440
+    },
+    {
+      "epoch": 0.5603985056039851,
+      "grad_norm": 15.0625,
+      "learning_rate": 4.49e-05,
+      "loss": 1.6621,
+      "step": 450
+    },
+    {
+      "epoch": 0.572851805728518,
+      "grad_norm": 22.25,
+      "learning_rate": 4.5900000000000004e-05,
+      "loss": 1.5684,
+      "step": 460
+    },
+    {
+      "epoch": 0.5853051058530511,
+      "grad_norm": 17.25,
+      "learning_rate": 4.69e-05,
+      "loss": 1.5414,
+      "step": 470
+    },
+    {
+      "epoch": 0.597758405977584,
+      "grad_norm": 19.25,
+      "learning_rate": 4.79e-05,
+      "loss": 1.5445,
+      "step": 480
+    },
+    {
+      "epoch": 0.6102117061021171,
+      "grad_norm": 205.0,
+      "learning_rate": 4.89e-05,
+      "loss": 1.4726,
+      "step": 490
+    },
+    {
+      "epoch": 0.6226650062266501,
+      "grad_norm": 13.375,
+      "learning_rate": 4.99e-05,
+      "loss": 1.3783,
+      "step": 500
+    },
+    {
+      "epoch": 0.6226650062266501,
+      "eval/acc": 30.23255729675293,
+      "step": 500
+    },
+    {
+      "epoch": 0.6226650062266501,
+      "eval_loss": 2.777118444442749,
+      "eval_runtime": 0.2153,
+      "eval_samples_per_second": 199.749,
+      "eval_steps_per_second": 4.645,
+      "step": 500
+    },
+    {
+      "epoch": 0.635118306351183,
+      "grad_norm": 20.875,
+      "learning_rate": 5.0900000000000004e-05,
+      "loss": 1.4983,
+      "step": 510
+    },
+    {
+      "epoch": 0.6475716064757161,
+      "grad_norm": 16.625,
+      "learning_rate": 5.19e-05,
+      "loss": 1.5093,
+      "step": 520
+    },
+    {
+      "epoch": 0.660024906600249,
+      "grad_norm": 14.125,
+      "learning_rate": 5.2900000000000005e-05,
+      "loss": 1.4588,
+      "step": 530
+    },
+    {
+      "epoch": 0.6724782067247821,
+      "grad_norm": 37.0,
+      "learning_rate": 5.390000000000001e-05,
+      "loss": 1.4346,
+      "step": 540
+    },
+    {
+      "epoch": 0.684931506849315,
+      "grad_norm": 16.75,
+      "learning_rate": 5.4900000000000006e-05,
+      "loss": 1.5363,
+      "step": 550
+    },
+    {
+      "epoch": 0.6973848069738481,
+      "grad_norm": 28.375,
+      "learning_rate": 5.590000000000001e-05,
+      "loss": 1.4497,
+      "step": 560
+    },
+    {
+      "epoch": 0.709838107098381,
+      "grad_norm": 15.5625,
+      "learning_rate": 5.69e-05,
+      "loss": 1.4005,
+      "step": 570
+    },
+    {
+      "epoch": 0.7222914072229141,
+      "grad_norm": 14.75,
+      "learning_rate": 5.79e-05,
+      "loss": 1.4588,
+      "step": 580
+    },
+    {
+      "epoch": 0.7347447073474471,
+      "grad_norm": 18.5,
+      "learning_rate": 5.89e-05,
+      "loss": 1.3489,
+      "step": 590
+    },
+    {
+      "epoch": 0.7471980074719801,
+      "grad_norm": 12.125,
+      "learning_rate": 5.99e-05,
+      "loss": 1.3295,
+      "step": 600
+    },
+    {
+      "epoch": 0.7471980074719801,
+      "eval/acc": 39.53488540649414,
+      "step": 600
+    },
+    {
+      "epoch": 0.7471980074719801,
+      "eval_loss": 2.6652462482452393,
+      "eval_runtime": 0.2211,
+      "eval_samples_per_second": 194.477,
+      "eval_steps_per_second": 4.523,
+      "step": 600
+    },
+    {
+      "epoch": 0.7596513075965131,
+      "grad_norm": 12.9375,
+      "learning_rate": 6.09e-05,
+      "loss": 1.3717,
+      "step": 610
+    },
+    {
+      "epoch": 0.772104607721046,
+      "grad_norm": 21.5,
+      "learning_rate": 6.19e-05,
+      "loss": 1.425,
+      "step": 620
+    },
+    {
+      "epoch": 0.7845579078455791,
+      "grad_norm": 13.6875,
+      "learning_rate": 6.29e-05,
+      "loss": 1.3017,
+      "step": 630
+    },
+    {
+      "epoch": 0.797011207970112,
+      "grad_norm": 12.8125,
+      "learning_rate": 6.390000000000001e-05,
+      "loss": 1.3533,
+      "step": 640
+    },
+    {
+      "epoch": 0.8094645080946451,
+      "grad_norm": 13.1875,
+      "learning_rate": 6.49e-05,
+      "loss": 1.271,
+      "step": 650
+    },
+    {
+      "epoch": 0.821917808219178,
+      "grad_norm": 15.125,
+      "learning_rate": 6.59e-05,
+      "loss": 1.3734,
+      "step": 660
+    },
+    {
+      "epoch": 0.8343711083437111,
+      "grad_norm": 16.125,
+      "learning_rate": 6.690000000000001e-05,
+      "loss": 1.3092,
+      "step": 670
+    },
+    {
+      "epoch": 0.8468244084682441,
+      "grad_norm": 17.75,
+      "learning_rate": 6.790000000000001e-05,
+      "loss": 1.1803,
+      "step": 680
+    },
+    {
+      "epoch": 0.8592777085927771,
+      "grad_norm": 13.875,
+      "learning_rate": 6.89e-05,
+      "loss": 1.3383,
+      "step": 690
+    },
+    {
+      "epoch": 0.8717310087173101,
+      "grad_norm": 11.25,
+      "learning_rate": 6.99e-05,
+      "loss": 1.3024,
+      "step": 700
+    },
+    {
+      "epoch": 0.8717310087173101,
+      "eval/acc": 34.88372039794922,
+      "step": 700
+    },
+    {
+      "epoch": 0.8717310087173101,
+      "eval_loss": 2.7215068340301514,
+      "eval_runtime": 0.3836,
+      "eval_samples_per_second": 112.097,
+      "eval_steps_per_second": 2.607,
+      "step": 700
+    },
+    {
+      "epoch": 0.8841843088418431,
+      "grad_norm": 67.0,
+      "learning_rate": 7.09e-05,
+      "loss": 1.2095,
+      "step": 710
+    },
+    {
+      "epoch": 0.8966376089663761,
+      "grad_norm": 9.875,
+      "learning_rate": 7.19e-05,
+      "loss": 1.2948,
+      "step": 720
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 14.1875,
+      "learning_rate": 7.29e-05,
+      "loss": 1.3225,
+      "step": 730
+    },
+    {
+      "epoch": 0.9215442092154421,
+      "grad_norm": 13.125,
+      "learning_rate": 7.390000000000001e-05,
+      "loss": 1.1936,
+      "step": 740
+    },
+    {
+      "epoch": 0.933997509339975,
+      "grad_norm": 12.875,
+      "learning_rate": 7.49e-05,
+      "loss": 1.2211,
+      "step": 750
+    },
+    {
+      "epoch": 0.9464508094645081,
+      "grad_norm": 13.5625,
+      "learning_rate": 7.59e-05,
+      "loss": 1.2435,
+      "step": 760
+    },
+    {
+      "epoch": 0.958904109589041,
+      "grad_norm": 19.25,
+      "learning_rate": 7.69e-05,
+      "loss": 1.1786,
+      "step": 770
+    },
+    {
+      "epoch": 0.9713574097135741,
+      "grad_norm": 14.375,
+      "learning_rate": 7.790000000000001e-05,
+      "loss": 1.2784,
+      "step": 780
+    },
+    {
+      "epoch": 0.9838107098381071,
+      "grad_norm": 10.625,
+      "learning_rate": 7.890000000000001e-05,
+      "loss": 1.2618,
+      "step": 790
+    },
+    {
+      "epoch": 0.9962640099626401,
+      "grad_norm": 22.75,
+      "learning_rate": 7.99e-05,
+      "loss": 1.1601,
+      "step": 800
+    },
+    {
+      "epoch": 0.9962640099626401,
+      "eval/acc": 41.86046600341797,
+      "step": 800
+    },
+    {
+      "epoch": 0.9962640099626401,
+      "eval_loss": 2.622220516204834,
+      "eval_runtime": 0.2076,
+      "eval_samples_per_second": 207.126,
+      "eval_steps_per_second": 4.817,
+      "step": 800
+    },
+    {
+      "epoch": 1.0087173100871731,
+      "grad_norm": 12.75,
+      "learning_rate": 8.090000000000001e-05,
+      "loss": 1.0972,
+      "step": 810
+    },
+    {
+      "epoch": 1.0211706102117062,
+      "grad_norm": 9.6875,
+      "learning_rate": 8.19e-05,
+      "loss": 1.1999,
+      "step": 820
+    },
+    {
+      "epoch": 1.033623910336239,
+      "grad_norm": 11.625,
+      "learning_rate": 8.29e-05,
+      "loss": 1.1677,
+      "step": 830
+    },
+    {
+      "epoch": 1.046077210460772,
+      "grad_norm": 12.8125,
+      "learning_rate": 8.39e-05,
+      "loss": 1.1505,
+      "step": 840
+    },
+    {
+      "epoch": 1.0585305105853051,
+      "grad_norm": 11.6875,
+      "learning_rate": 8.49e-05,
+      "loss": 1.1599,
+      "step": 850
+    },
+    {
+      "epoch": 1.0709838107098382,
+      "grad_norm": 9.8125,
+      "learning_rate": 8.59e-05,
+      "loss": 1.1746,
+      "step": 860
+    },
+    {
+      "epoch": 1.083437110834371,
+      "grad_norm": 11.625,
+      "learning_rate": 8.69e-05,
+      "loss": 1.047,
+      "step": 870
+    },
+    {
+      "epoch": 1.095890410958904,
+      "grad_norm": 10.125,
+      "learning_rate": 8.790000000000001e-05,
+      "loss": 1.107,
+      "step": 880
+    },
+    {
+      "epoch": 1.108343711083437,
+      "grad_norm": 9.0,
+      "learning_rate": 8.89e-05,
+      "loss": 1.1105,
+      "step": 890
+    },
+    {
+      "epoch": 1.1207970112079702,
+      "grad_norm": 13.125,
+      "learning_rate": 8.99e-05,
+      "loss": 1.1848,
+      "step": 900
+    },
+    {
+      "epoch": 1.1207970112079702,
+      "eval/acc": 34.88372039794922,
+      "step": 900
+    },
+    {
+      "epoch": 1.1207970112079702,
+      "eval_loss": 2.8814988136291504,
+      "eval_runtime": 1.0687,
+      "eval_samples_per_second": 40.237,
+      "eval_steps_per_second": 0.936,
+      "step": 900
+    },
+    {
+      "epoch": 1.1332503113325032,
+      "grad_norm": 13.25,
+      "learning_rate": 9.090000000000001e-05,
+      "loss": 1.1235,
+      "step": 910
+    },
+    {
+      "epoch": 1.145703611457036,
+      "grad_norm": 17.625,
+      "learning_rate": 9.190000000000001e-05,
+      "loss": 1.0304,
+      "step": 920
+    },
+    {
+      "epoch": 1.158156911581569,
+      "grad_norm": 11.5625,
+      "learning_rate": 9.290000000000001e-05,
+      "loss": 1.0373,
+      "step": 930
+    },
+    {
+      "epoch": 1.1706102117061021,
+      "grad_norm": 13.25,
+      "learning_rate": 9.39e-05,
+      "loss": 1.12,
+      "step": 940
+    },
+    {
+      "epoch": 1.1830635118306352,
+      "grad_norm": 10.4375,
+      "learning_rate": 9.49e-05,
+      "loss": 1.0623,
+      "step": 950
+    },
+    {
+      "epoch": 1.195516811955168,
+      "grad_norm": 14.625,
+      "learning_rate": 9.59e-05,
+      "loss": 1.0692,
+      "step": 960
+    },
+    {
+      "epoch": 1.207970112079701,
+      "grad_norm": 9.6875,
+      "learning_rate": 9.69e-05,
+      "loss": 1.1914,
+      "step": 970
+    },
+    {
+      "epoch": 1.2204234122042341,
+      "grad_norm": 10.4375,
+      "learning_rate": 9.790000000000001e-05,
+      "loss": 1.1094,
+      "step": 980
+    },
+    {
+      "epoch": 1.2328767123287672,
+      "grad_norm": 9.625,
+      "learning_rate": 9.89e-05,
+      "loss": 1.0557,
+      "step": 990
+    },
+    {
+      "epoch": 1.2453300124533002,
+      "grad_norm": 15.75,
+      "learning_rate": 9.99e-05,
+      "loss": 0.9635,
+      "step": 1000
+    },
+    {
+      "epoch": 1.2453300124533002,
+      "eval/acc": 34.88372039794922,
+      "step": 1000
+    },
+    {
+      "epoch": 1.2453300124533002,
+      "eval_loss": 2.967315435409546,
+      "eval_runtime": 0.2242,
+      "eval_samples_per_second": 191.798,
+      "eval_steps_per_second": 4.46,
+      "step": 1000
+    },
+    {
+      "epoch": 1.257783312577833,
+      "grad_norm": 11.8125,
+      "learning_rate": 9.99e-05,
+      "loss": 1.0067,
+      "step": 1010
+    },
+    {
+      "epoch": 1.270236612702366,
+      "grad_norm": 11.4375,
+      "learning_rate": 9.97888888888889e-05,
+      "loss": 1.0609,
+      "step": 1020
+    },
+    {
+      "epoch": 1.2826899128268991,
+      "grad_norm": 12.875,
+      "learning_rate": 9.967777777777779e-05,
+      "loss": 1.1566,
+      "step": 1030
+    },
+    {
+      "epoch": 1.2951432129514322,
+      "grad_norm": 10.625,
+      "learning_rate": 9.956666666666667e-05,
+      "loss": 1.1045,
+      "step": 1040
+    },
+    {
+      "epoch": 1.307596513075965,
+      "grad_norm": 10.0625,
+      "learning_rate": 9.945555555555555e-05,
+      "loss": 1.1421,
+      "step": 1050
+    },
+    {
+      "epoch": 1.320049813200498,
+      "grad_norm": 11.5625,
+      "learning_rate": 9.934444444444445e-05,
+      "loss": 1.0453,
+      "step": 1060
+    },
+    {
+      "epoch": 1.3325031133250311,
+      "grad_norm": 11.3125,
+      "learning_rate": 9.923333333333334e-05,
+      "loss": 1.0531,
+      "step": 1070
+    },
+    {
+      "epoch": 1.3449564134495642,
+      "grad_norm": 11.75,
+      "learning_rate": 9.912222222222222e-05,
+      "loss": 1.0286,
+      "step": 1080
+    },
+    {
+      "epoch": 1.3574097135740972,
+      "grad_norm": 11.3125,
+      "learning_rate": 9.901111111111112e-05,
+      "loss": 0.9549,
+      "step": 1090
+    },
+    {
+      "epoch": 1.36986301369863,
+      "grad_norm": 10.5625,
+      "learning_rate": 9.89e-05,
+      "loss": 1.006,
+      "step": 1100
+    },
+    {
+      "epoch": 1.36986301369863,
+      "eval/acc": 34.88372039794922,
+      "step": 1100
+    },
+    {
+      "epoch": 1.36986301369863,
+      "eval_loss": 2.9681856632232666,
+      "eval_runtime": 0.2343,
+      "eval_samples_per_second": 183.518,
+      "eval_steps_per_second": 4.268,
+      "step": 1100
+    },
+    {
+      "epoch": 1.3823163138231631,
+      "grad_norm": 13.4375,
+      "learning_rate": 9.87888888888889e-05,
+      "loss": 1.049,
+      "step": 1110
+    },
+    {
+      "epoch": 1.3947696139476962,
+      "grad_norm": 13.125,
+      "learning_rate": 9.867777777777777e-05,
+      "loss": 0.951,
+      "step": 1120
+    },
+    {
+      "epoch": 1.4072229140722292,
+      "grad_norm": 8.6875,
+      "learning_rate": 9.856666666666667e-05,
+      "loss": 1.0806,
+      "step": 1130
+    },
+    {
+      "epoch": 1.419676214196762,
+      "grad_norm": 11.8125,
+      "learning_rate": 9.845555555555556e-05,
+      "loss": 0.9683,
+      "step": 1140
+    },
+    {
+      "epoch": 1.432129514321295,
+      "grad_norm": 14.875,
+      "learning_rate": 9.834444444444446e-05,
+      "loss": 0.977,
+      "step": 1150
+    },
+    {
+      "epoch": 1.4445828144458281,
+      "grad_norm": 20.125,
+      "learning_rate": 9.823333333333333e-05,
+      "loss": 0.994,
+      "step": 1160
+    },
+    {
+      "epoch": 1.4570361145703612,
+      "grad_norm": 11.0,
+      "learning_rate": 9.812222222222223e-05,
+      "loss": 1.037,
+      "step": 1170
+    },
+    {
+      "epoch": 1.4694894146948942,
+      "grad_norm": 15.5,
+      "learning_rate": 9.801111111111112e-05,
+      "loss": 1.1605,
+      "step": 1180
+    },
+    {
+      "epoch": 1.481942714819427,
+      "grad_norm": 10.9375,
+      "learning_rate": 9.790000000000001e-05,
+      "loss": 1.0113,
+      "step": 1190
+    },
+    {
+      "epoch": 1.4943960149439601,
+      "grad_norm": 14.3125,
+      "learning_rate": 9.778888888888889e-05,
+      "loss": 0.9511,
+      "step": 1200
+    },
+    {
+      "epoch": 1.4943960149439601,
+      "eval/acc": 37.20930099487305,
+      "step": 1200
+    },
+    {
+      "epoch": 1.4943960149439601,
+      "eval_loss": 2.701927423477173,
+      "eval_runtime": 0.2099,
+      "eval_samples_per_second": 204.857,
+      "eval_steps_per_second": 4.764,
+      "step": 1200
+    },
+    {
+      "epoch": 1.5068493150684932,
+      "grad_norm": 11.9375,
+      "learning_rate": 9.767777777777778e-05,
+      "loss": 1.0408,
+      "step": 1210
+    },
+    {
+      "epoch": 1.519302615193026,
+      "grad_norm": 7.71875,
+      "learning_rate": 9.756666666666668e-05,
+      "loss": 0.9782,
+      "step": 1220
+    },
+    {
+      "epoch": 1.531755915317559,
+      "grad_norm": 7.5,
+      "learning_rate": 9.745555555555556e-05,
+      "loss": 1.0293,
+      "step": 1230
+    },
+    {
+      "epoch": 1.544209215442092,
+      "grad_norm": 9.6875,
+      "learning_rate": 9.734444444444444e-05,
+      "loss": 0.9718,
+      "step": 1240
+    },
+    {
+      "epoch": 1.5566625155666252,
+      "grad_norm": 11.0,
+      "learning_rate": 9.723333333333334e-05,
+      "loss": 1.0542,
+      "step": 1250
+    },
+    {
+      "epoch": 1.5691158156911582,
+      "grad_norm": 10.5,
+      "learning_rate": 9.712222222222223e-05,
+      "loss": 0.9537,
+      "step": 1260
+    },
+    {
+      "epoch": 1.5815691158156913,
+      "grad_norm": 13.1875,
+      "learning_rate": 9.701111111111111e-05,
+      "loss": 0.9756,
+      "step": 1270
+    },
+    {
+      "epoch": 1.5940224159402243,
+      "grad_norm": 9.9375,
+      "learning_rate": 9.69e-05,
+      "loss": 0.8843,
+      "step": 1280
+    },
+    {
+      "epoch": 1.6064757160647571,
+      "grad_norm": 10.0,
+      "learning_rate": 9.67888888888889e-05,
+      "loss": 0.8808,
+      "step": 1290
+    },
+    {
+      "epoch": 1.6189290161892902,
+      "grad_norm": 14.0,
+      "learning_rate": 9.667777777777778e-05,
+      "loss": 0.9589,
+      "step": 1300
+    },
+    {
+      "epoch": 1.6189290161892902,
+      "eval/acc": 39.53488540649414,
+      "step": 1300
+    },
+    {
+      "epoch": 1.6189290161892902,
+      "eval_loss": 2.7926037311553955,
+      "eval_runtime": 0.2238,
+      "eval_samples_per_second": 192.128,
+      "eval_steps_per_second": 4.468,
+      "step": 1300
+    },
+    {
+      "epoch": 1.631382316313823,
+      "grad_norm": 8.9375,
+      "learning_rate": 9.656666666666668e-05,
+      "loss": 0.9315,
+      "step": 1310
+    },
+    {
+      "epoch": 1.643835616438356,
+      "grad_norm": 10.0625,
+      "learning_rate": 9.645555555555556e-05,
+      "loss": 0.9295,
+      "step": 1320
+    },
+    {
+      "epoch": 1.6562889165628891,
+      "grad_norm": 8.75,
+      "learning_rate": 9.634444444444445e-05,
+      "loss": 0.9255,
+      "step": 1330
+    },
+    {
+      "epoch": 1.6687422166874222,
+      "grad_norm": 11.0625,
+      "learning_rate": 9.623333333333335e-05,
+      "loss": 0.9121,
+      "step": 1340
+    },
+    {
+      "epoch": 1.6811955168119552,
+      "grad_norm": 11.375,
+      "learning_rate": 9.612222222222223e-05,
+      "loss": 0.9232,
+      "step": 1350
+    },
+    {
+      "epoch": 1.6936488169364883,
+      "grad_norm": 11.875,
+      "learning_rate": 9.601111111111112e-05,
+      "loss": 0.8991,
+      "step": 1360
+    },
+    {
+      "epoch": 1.7061021170610213,
+      "grad_norm": 9.0,
+      "learning_rate": 9.59e-05,
+      "loss": 0.9405,
+      "step": 1370
+    },
+    {
+      "epoch": 1.7185554171855542,
+      "grad_norm": 11.875,
+      "learning_rate": 9.57888888888889e-05,
+      "loss": 1.0191,
+      "step": 1380
+    },
+    {
+      "epoch": 1.7310087173100872,
+      "grad_norm": 9.8125,
+      "learning_rate": 9.567777777777778e-05,
+      "loss": 0.9002,
+      "step": 1390
+    },
+    {
+      "epoch": 1.74346201743462,
+      "grad_norm": 12.375,
+      "learning_rate": 9.556666666666667e-05,
+      "loss": 0.9681,
+      "step": 1400
+    },
+    {
+      "epoch": 1.74346201743462,
+      "eval/acc": 39.53488540649414,
+      "step": 1400
+    },
+    {
+      "epoch": 1.74346201743462,
+      "eval_loss": 2.795476198196411,
+      "eval_runtime": 0.2152,
+      "eval_samples_per_second": 199.833,
+      "eval_steps_per_second": 4.647,
+      "step": 1400
+    },
+    {
+      "epoch": 1.755915317559153,
+      "grad_norm": 9.375,
+      "learning_rate": 9.545555555555557e-05,
+      "loss": 1.0222,
+      "step": 1410
+    },
+    {
+      "epoch": 1.7683686176836861,
+      "grad_norm": 9.5625,
+      "learning_rate": 9.534444444444445e-05,
+      "loss": 0.9005,
+      "step": 1420
+    },
+    {
+      "epoch": 1.7808219178082192,
+      "grad_norm": 9.875,
+      "learning_rate": 9.523333333333334e-05,
+      "loss": 0.9616,
+      "step": 1430
+    },
+    {
+      "epoch": 1.7932752179327522,
+      "grad_norm": 28.0,
+      "learning_rate": 9.512222222222222e-05,
+      "loss": 1.0197,
+      "step": 1440
+    },
+    {
+      "epoch": 1.8057285180572853,
+      "grad_norm": 10.75,
+      "learning_rate": 9.501111111111112e-05,
+      "loss": 0.9947,
+      "step": 1450
+    },
+    {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 10.125,
+      "learning_rate": 9.49e-05,
+      "loss": 0.9064,
+      "step": 1460
+    },
+    {
+      "epoch": 1.8306351183063512,
+      "grad_norm": 11.75,
+      "learning_rate": 9.478888888888889e-05,
+      "loss": 0.9425,
+      "step": 1470
+    },
+    {
+      "epoch": 1.8430884184308842,
+      "grad_norm": 10.625,
+      "learning_rate": 9.467777777777779e-05,
+      "loss": 1.0284,
+      "step": 1480
+    },
+    {
+      "epoch": 1.855541718555417,
+      "grad_norm": 10.125,
+      "learning_rate": 9.456666666666667e-05,
+      "loss": 0.9175,
+      "step": 1490
+    },
+    {
+      "epoch": 1.86799501867995,
+      "grad_norm": 8.375,
+      "learning_rate": 9.445555555555557e-05,
+      "loss": 0.8608,
+      "step": 1500
+    },
+    {
+      "epoch": 1.86799501867995,
+      "eval/acc": 39.53488540649414,
+      "step": 1500
+    },
+    {
+      "epoch": 1.86799501867995,
+      "eval_loss": 2.8291714191436768,
+      "eval_runtime": 0.216,
+      "eval_samples_per_second": 199.031,
+      "eval_steps_per_second": 4.629,
+      "step": 1500
+    },
+    {
+      "epoch": 1.8804483188044832,
+      "grad_norm": 9.625,
+      "learning_rate": 9.434444444444444e-05,
+      "loss": 0.9695,
+      "step": 1510
+    },
+    {
+      "epoch": 1.8929016189290162,
+      "grad_norm": 9.8125,
+      "learning_rate": 9.423333333333334e-05,
+      "loss": 0.9924,
+      "step": 1520
+    },
+    {
+      "epoch": 1.9053549190535493,
+      "grad_norm": 10.0,
+      "learning_rate": 9.412222222222222e-05,
+      "loss": 1.0733,
+      "step": 1530
+    },
+    {
+      "epoch": 1.9178082191780823,
+      "grad_norm": 10.25,
+      "learning_rate": 9.401111111111112e-05,
+      "loss": 0.8818,
+      "step": 1540
+    },
+    {
+      "epoch": 1.9302615193026154,
+      "grad_norm": 15.3125,
+      "learning_rate": 9.39e-05,
+      "loss": 0.9053,
+      "step": 1550
+    },
+    {
+      "epoch": 1.9427148194271482,
+      "grad_norm": 8.25,
+      "learning_rate": 9.378888888888889e-05,
+      "loss": 0.8586,
+      "step": 1560
+    },
+    {
+      "epoch": 1.9551681195516812,
+      "grad_norm": 17.5,
+      "learning_rate": 9.367777777777779e-05,
+      "loss": 0.9316,
+      "step": 1570
+    },
+    {
+      "epoch": 1.967621419676214,
+      "grad_norm": 10.875,
+      "learning_rate": 9.356666666666667e-05,
+      "loss": 1.0195,
+      "step": 1580
+    },
+    {
+      "epoch": 1.9800747198007471,
+      "grad_norm": 9.1875,
+      "learning_rate": 9.345555555555556e-05,
+      "loss": 0.8878,
+      "step": 1590
+    },
+    {
+      "epoch": 1.9925280199252802,
+      "grad_norm": 10.3125,
+      "learning_rate": 9.334444444444444e-05,
+      "loss": 0.9765,
+      "step": 1600
+    },
+    {
+      "epoch": 1.9925280199252802,
+      "eval/acc": 37.20930099487305,
+      "step": 1600
+    },
+    {
+      "epoch": 1.9925280199252802,
+      "eval_loss": 2.9084553718566895,
+      "eval_runtime": 0.2099,
+      "eval_samples_per_second": 204.856,
+      "eval_steps_per_second": 4.764,
+      "step": 1600
+    },
+    {
+      "epoch": 2.004981320049813,
+      "grad_norm": 8.6875,
+      "learning_rate": 9.323333333333334e-05,
+      "loss": 0.8596,
+      "step": 1610
+    },
+    {
+      "epoch": 2.0174346201743463,
+      "grad_norm": 12.0625,
+      "learning_rate": 9.312222222222223e-05,
+      "loss": 0.9156,
+      "step": 1620
+    },
+    {
+      "epoch": 2.0298879202988793,
+      "grad_norm": 10.1875,
+      "learning_rate": 9.301111111111111e-05,
+      "loss": 0.8404,
+      "step": 1630
+    },
+    {
+      "epoch": 2.0423412204234124,
+      "grad_norm": 10.125,
+      "learning_rate": 9.290000000000001e-05,
+      "loss": 0.8111,
+      "step": 1640
+    },
+    {
+      "epoch": 2.0547945205479454,
+      "grad_norm": 10.625,
+      "learning_rate": 9.278888888888889e-05,
+      "loss": 0.8124,
+      "step": 1650
+    },
+    {
+      "epoch": 2.067247820672478,
+      "grad_norm": 10.0,
+      "learning_rate": 9.267777777777779e-05,
+      "loss": 0.8124,
+      "step": 1660
+    },
+    {
+      "epoch": 2.079701120797011,
+      "grad_norm": 10.75,
+      "learning_rate": 9.256666666666666e-05,
+      "loss": 0.8384,
+      "step": 1670
+    },
+    {
+      "epoch": 2.092154420921544,
+      "grad_norm": 8.3125,
+      "learning_rate": 9.245555555555556e-05,
+      "loss": 0.8734,
+      "step": 1680
+    },
+    {
+      "epoch": 2.104607721046077,
+      "grad_norm": 8.6875,
+      "learning_rate": 9.234444444444445e-05,
+      "loss": 0.7674,
+      "step": 1690
+    },
+    {
+      "epoch": 2.1170610211706102,
+      "grad_norm": 6.6875,
+      "learning_rate": 9.223333333333334e-05,
+      "loss": 0.8514,
+      "step": 1700
+    },
+    {
+      "epoch": 2.1170610211706102,
+      "eval/acc": 48.83720779418945,
+      "step": 1700
+    },
+    {
+      "epoch": 2.1170610211706102,
+      "eval_loss": 1.9776915311813354,
+      "eval_runtime": 1.2116,
+      "eval_samples_per_second": 35.491,
+      "eval_steps_per_second": 0.825,
+      "step": 1700
+    },
+    {
+      "epoch": 2.1295143212951433,
+      "grad_norm": 11.3125,
+      "learning_rate": 9.212222222222223e-05,
+      "loss": 0.8502,
+      "step": 1710
+    },
+    {
+      "epoch": 2.1419676214196763,
+      "grad_norm": 12.0625,
+      "learning_rate": 9.201111111111111e-05,
+      "loss": 0.9026,
+      "step": 1720
+    },
+    {
+      "epoch": 2.1544209215442094,
+      "grad_norm": 8.3125,
+      "learning_rate": 9.190000000000001e-05,
+      "loss": 0.7893,
+      "step": 1730
+    },
+    {
+      "epoch": 2.166874221668742,
+      "grad_norm": 14.5625,
+      "learning_rate": 9.17888888888889e-05,
+      "loss": 0.7671,
+      "step": 1740
+    },
+    {
+      "epoch": 2.179327521793275,
+      "grad_norm": 11.25,
+      "learning_rate": 9.167777777777778e-05,
+      "loss": 0.7869,
+      "step": 1750
+    },
+    {
+      "epoch": 2.191780821917808,
+      "grad_norm": 8.625,
+      "learning_rate": 9.156666666666667e-05,
+      "loss": 0.8251,
+      "step": 1760
+    },
+    {
+      "epoch": 2.204234122042341,
+      "grad_norm": 7.8125,
+      "learning_rate": 9.145555555555556e-05,
+      "loss": 0.7838,
+      "step": 1770
+    },
+    {
+      "epoch": 2.216687422166874,
+      "grad_norm": 11.6875,
+      "learning_rate": 9.134444444444445e-05,
+      "loss": 0.8348,
+      "step": 1780
+    },
+    {
+      "epoch": 2.2291407222914073,
+      "grad_norm": 9.75,
+      "learning_rate": 9.123333333333333e-05,
+      "loss": 0.8322,
+      "step": 1790
+    },
+    {
+      "epoch": 2.2415940224159403,
+      "grad_norm": 9.25,
+      "learning_rate": 9.112222222222223e-05,
+      "loss": 0.8514,
+      "step": 1800
+    },
+    {
+      "epoch": 2.2415940224159403,
+      "eval/acc": 48.83720779418945,
+      "step": 1800
+    },
+    {
+      "epoch": 2.2415940224159403,
+      "eval_loss": 1.968414306640625,
+      "eval_runtime": 0.2153,
+      "eval_samples_per_second": 199.733,
+      "eval_steps_per_second": 4.645,
+      "step": 1800
+    },
+    {
+      "epoch": 2.2540473225404734,
+      "grad_norm": 12.6875,
+      "learning_rate": 9.101111111111112e-05,
+      "loss": 0.7841,
+      "step": 1810
+    },
+    {
+      "epoch": 2.2665006226650064,
+      "grad_norm": 7.09375,
+      "learning_rate": 9.090000000000001e-05,
+      "loss": 0.7889,
+      "step": 1820
+    },
+    {
+      "epoch": 2.2789539227895395,
+      "grad_norm": 8.1875,
+      "learning_rate": 9.078888888888889e-05,
+      "loss": 0.8088,
+      "step": 1830
+    },
+    {
+      "epoch": 2.291407222914072,
+      "grad_norm": 12.3125,
+      "learning_rate": 9.067777777777778e-05,
+      "loss": 0.8247,
+      "step": 1840
+    },
+    {
+      "epoch": 2.303860523038605,
+      "grad_norm": 7.40625,
+      "learning_rate": 9.056666666666667e-05,
+      "loss": 0.7383,
+      "step": 1850
+    },
+    {
+      "epoch": 2.316313823163138,
+      "grad_norm": 8.5,
+      "learning_rate": 9.045555555555557e-05,
+      "loss": 0.8074,
+      "step": 1860
+    },
+    {
+      "epoch": 2.328767123287671,
+      "grad_norm": 8.625,
+      "learning_rate": 9.034444444444445e-05,
+      "loss": 0.7866,
+      "step": 1870
+    },
+    {
+      "epoch": 2.3412204234122043,
+      "grad_norm": 10.1875,
+      "learning_rate": 9.023333333333334e-05,
+      "loss": 0.8159,
+      "step": 1880
+    },
+    {
+      "epoch": 2.3536737235367373,
+      "grad_norm": 9.875,
+      "learning_rate": 9.012222222222223e-05,
+      "loss": 0.831,
+      "step": 1890
+    },
+    {
+      "epoch": 2.3661270236612704,
+      "grad_norm": 11.0,
+      "learning_rate": 9.001111111111112e-05,
+      "loss": 0.7215,
+      "step": 1900
+    },
+    {
+      "epoch": 2.3661270236612704,
+      "eval/acc": 48.83720779418945,
+      "step": 1900
+    },
+    {
+      "epoch": 2.3661270236612704,
+      "eval_loss": 1.9242758750915527,
+      "eval_runtime": 0.2274,
+      "eval_samples_per_second": 189.058,
+      "eval_steps_per_second": 4.397,
+      "step": 1900
+    },
+    {
+      "epoch": 2.3785803237858034,
+      "grad_norm": 9.8125,
+      "learning_rate": 8.99e-05,
+      "loss": 0.8346,
+      "step": 1910
+    },
+    {
+      "epoch": 2.391033623910336,
+      "grad_norm": 9.9375,
+      "learning_rate": 8.978888888888889e-05,
+      "loss": 0.8141,
+      "step": 1920
+    },
+    {
+      "epoch": 2.403486924034869,
+      "grad_norm": 10.3125,
+      "learning_rate": 8.967777777777779e-05,
+      "loss": 0.911,
+      "step": 1930
+    },
+    {
+      "epoch": 2.415940224159402,
+      "grad_norm": 9.75,
+      "learning_rate": 8.956666666666667e-05,
+      "loss": 0.9486,
+      "step": 1940
+    },
+    {
+      "epoch": 2.428393524283935,
+      "grad_norm": 9.25,
+      "learning_rate": 8.945555555555556e-05,
+      "loss": 0.8775,
+      "step": 1950
+    },
+    {
+      "epoch": 2.4408468244084682,
+      "grad_norm": 8.0,
+      "learning_rate": 8.934444444444445e-05,
+      "loss": 0.8373,
+      "step": 1960
+    },
+    {
+      "epoch": 2.4533001245330013,
+      "grad_norm": 7.625,
+      "learning_rate": 8.923333333333334e-05,
+      "loss": 0.7469,
+      "step": 1970
+    },
+    {
+      "epoch": 2.4657534246575343,
+      "grad_norm": 37.75,
+      "learning_rate": 8.912222222222222e-05,
+      "loss": 0.7934,
+      "step": 1980
+    },
+    {
+      "epoch": 2.4782067247820674,
+      "grad_norm": 9.125,
+      "learning_rate": 8.901111111111111e-05,
+      "loss": 0.7733,
+      "step": 1990
+    },
+    {
+      "epoch": 2.4906600249066004,
+      "grad_norm": 8.8125,
+      "learning_rate": 8.89e-05,
+      "loss": 0.7488,
+      "step": 2000
+    },
+    {
+      "epoch": 2.4906600249066004,
+      "eval/acc": 48.83720779418945,
+      "step": 2000
+    },
+    {
+      "epoch": 2.4906600249066004,
+      "eval_loss": 1.8490980863571167,
+      "eval_runtime": 0.2184,
+      "eval_samples_per_second": 196.883,
+      "eval_steps_per_second": 4.579,
+      "step": 2000
+    },
+    {
+      "epoch": 2.5031133250311335,
+      "grad_norm": 8.0,
+      "learning_rate": 8.878888888888889e-05,
+      "loss": 0.8461,
+      "step": 2010
+    },
+    {
+      "epoch": 2.515566625155666,
+      "grad_norm": 8.75,
+      "learning_rate": 8.867777777777778e-05,
+      "loss": 0.7647,
+      "step": 2020
+    },
+    {
+      "epoch": 2.528019925280199,
+      "grad_norm": 8.8125,
+      "learning_rate": 8.856666666666667e-05,
+      "loss": 0.796,
+      "step": 2030
+    },
+    {
+      "epoch": 2.540473225404732,
+      "grad_norm": 7.78125,
+      "learning_rate": 8.845555555555556e-05,
+      "loss": 0.7758,
+      "step": 2040
+    },
+    {
+      "epoch": 2.5529265255292652,
+      "grad_norm": 7.75,
+      "learning_rate": 8.834444444444446e-05,
+      "loss": 0.7753,
+      "step": 2050
+    },
+    {
+      "epoch": 2.5653798256537983,
+      "grad_norm": 8.9375,
+      "learning_rate": 8.823333333333334e-05,
+      "loss": 0.6914,
+      "step": 2060
+    },
+    {
+      "epoch": 2.5778331257783313,
+      "grad_norm": 9.4375,
+      "learning_rate": 8.812222222222223e-05,
+      "loss": 0.787,
+      "step": 2070
+    },
+    {
+      "epoch": 2.5902864259028644,
+      "grad_norm": 8.125,
+      "learning_rate": 8.801111111111111e-05,
+      "loss": 0.7742,
+      "step": 2080
+    },
+    {
+      "epoch": 2.602739726027397,
+      "grad_norm": 10.6875,
+      "learning_rate": 8.790000000000001e-05,
+      "loss": 0.7528,
+      "step": 2090
+    },
+    {
+      "epoch": 2.61519302615193,
+      "grad_norm": 9.1875,
+      "learning_rate": 8.77888888888889e-05,
+      "loss": 0.7392,
+      "step": 2100
+    },
+    {
+      "epoch": 2.61519302615193,
+      "eval/acc": 46.511627197265625,
+      "step": 2100
+    },
+    {
+      "epoch": 2.61519302615193,
+      "eval_loss": 1.9725399017333984,
+      "eval_runtime": 0.214,
+      "eval_samples_per_second": 200.925,
+      "eval_steps_per_second": 4.673,
+      "step": 2100
+    },
+    {
+      "epoch": 2.627646326276463,
+      "grad_norm": 9.375,
+      "learning_rate": 8.767777777777778e-05,
+      "loss": 0.7993,
+      "step": 2110
+    },
+    {
+      "epoch": 2.640099626400996,
+      "grad_norm": 9.0625,
+      "learning_rate": 8.756666666666668e-05,
+      "loss": 0.854,
+      "step": 2120
+    },
+    {
+      "epoch": 2.652552926525529,
+      "grad_norm": 10.625,
+      "learning_rate": 8.745555555555556e-05,
+      "loss": 0.8887,
+      "step": 2130
+    },
+    {
+      "epoch": 2.6650062266500623,
+      "grad_norm": 7.75,
+      "learning_rate": 8.734444444444445e-05,
+      "loss": 0.7407,
+      "step": 2140
+    },
+    {
+      "epoch": 2.6774595267745953,
+      "grad_norm": 10.75,
+      "learning_rate": 8.723333333333333e-05,
+      "loss": 0.9187,
+      "step": 2150
+    },
+    {
+      "epoch": 2.6899128268991284,
+      "grad_norm": 7.71875,
+      "learning_rate": 8.712222222222223e-05,
+      "loss": 0.7804,
+      "step": 2160
+    },
+    {
+      "epoch": 2.7023661270236614,
+      "grad_norm": 7.34375,
+      "learning_rate": 8.701111111111111e-05,
+      "loss": 0.7368,
+      "step": 2170
+    },
+    {
+      "epoch": 2.7148194271481945,
+      "grad_norm": 10.0625,
+      "learning_rate": 8.69e-05,
+      "loss": 0.7027,
+      "step": 2180
+    },
+    {
+      "epoch": 2.7272727272727275,
+      "grad_norm": 12.875,
+      "learning_rate": 8.67888888888889e-05,
+      "loss": 0.8305,
+      "step": 2190
+    },
+    {
+      "epoch": 2.73972602739726,
+      "grad_norm": 9.125,
+      "learning_rate": 8.667777777777778e-05,
+      "loss": 0.7767,
+      "step": 2200
+    },
+    {
+      "epoch": 2.73972602739726,
+      "eval/acc": 48.83720779418945,
+      "step": 2200
+    },
+    {
+      "epoch": 2.73972602739726,
+      "eval_loss": 1.8356798887252808,
+      "eval_runtime": 0.2116,
+      "eval_samples_per_second": 203.247,
+      "eval_steps_per_second": 4.727,
+      "step": 2200
+    },
+    {
+      "epoch": 2.752179327521793,
+      "grad_norm": 8.4375,
+      "learning_rate": 8.656666666666668e-05,
+      "loss": 0.7547,
+      "step": 2210
+    },
+    {
+      "epoch": 2.7646326276463262,
+      "grad_norm": 7.5,
+      "learning_rate": 8.645555555555555e-05,
+      "loss": 0.8497,
+      "step": 2220
+    },
+    {
+      "epoch": 2.7770859277708593,
+      "grad_norm": 10.0625,
+      "learning_rate": 8.634444444444445e-05,
+      "loss": 0.8024,
+      "step": 2230
+    },
+    {
+      "epoch": 2.7895392278953923,
+      "grad_norm": 13.5,
+      "learning_rate": 8.623333333333333e-05,
+      "loss": 0.7806,
+      "step": 2240
+    },
+    {
+      "epoch": 2.8019925280199254,
+      "grad_norm": 10.8125,
+      "learning_rate": 8.612222222222223e-05,
+      "loss": 0.7021,
+      "step": 2250
+    },
+    {
+      "epoch": 2.8144458281444584,
+      "grad_norm": 9.3125,
+      "learning_rate": 8.601111111111112e-05,
+      "loss": 0.72,
+      "step": 2260
+    },
+    {
+      "epoch": 2.826899128268991,
+      "grad_norm": 8.875,
+      "learning_rate": 8.59e-05,
+      "loss": 0.8063,
+      "step": 2270
+    },
+    {
+      "epoch": 2.839352428393524,
+      "grad_norm": 8.75,
+      "learning_rate": 8.57888888888889e-05,
+      "loss": 0.8264,
+      "step": 2280
+    },
+    {
+      "epoch": 2.851805728518057,
+      "grad_norm": 8.75,
+      "learning_rate": 8.567777777777778e-05,
+      "loss": 0.814,
+      "step": 2290
+    },
+    {
+      "epoch": 2.86425902864259,
+      "grad_norm": 10.25,
+      "learning_rate": 8.556666666666667e-05,
+      "loss": 0.7985,
+      "step": 2300
+    },
+    {
+      "epoch": 2.86425902864259,
+      "eval/acc": 51.16279220581055,
+      "step": 2300
+    },
+    {
+      "epoch": 2.86425902864259,
+      "eval_loss": 1.9056586027145386,
+      "eval_runtime": 0.221,
+      "eval_samples_per_second": 194.606,
+      "eval_steps_per_second": 4.526,
+      "step": 2300
+    },
+    {
+      "epoch": 2.8767123287671232,
+      "grad_norm": 8.6875,
+      "learning_rate": 8.545555555555555e-05,
+      "loss": 0.7489,
+      "step": 2310
+    },
+    {
+      "epoch": 2.8891656288916563,
+      "grad_norm": 9.25,
+      "learning_rate": 8.534444444444445e-05,
+      "loss": 0.8398,
+      "step": 2320
+    },
+    {
+      "epoch": 2.9016189290161893,
+      "grad_norm": 8.8125,
+      "learning_rate": 8.523333333333334e-05,
+      "loss": 0.7808,
+      "step": 2330
+    },
+    {
+      "epoch": 2.9140722291407224,
+      "grad_norm": 8.625,
+      "learning_rate": 8.512222222222222e-05,
+      "loss": 0.8163,
+      "step": 2340
+    },
+    {
+      "epoch": 2.9265255292652554,
+      "grad_norm": 13.9375,
+      "learning_rate": 8.501111111111112e-05,
+      "loss": 0.8038,
+      "step": 2350
+    },
+    {
+      "epoch": 2.9389788293897885,
+      "grad_norm": 11.8125,
+      "learning_rate": 8.49e-05,
+      "loss": 0.7362,
+      "step": 2360
+    },
+    {
+      "epoch": 2.9514321295143215,
+      "grad_norm": 12.0625,
+      "learning_rate": 8.47888888888889e-05,
+      "loss": 0.8096,
+      "step": 2370
+    },
+    {
+      "epoch": 2.963885429638854,
+      "grad_norm": 10.4375,
+      "learning_rate": 8.467777777777777e-05,
+      "loss": 0.7728,
+      "step": 2380
+    },
+    {
+      "epoch": 2.976338729763387,
+      "grad_norm": 11.875,
+      "learning_rate": 8.456666666666667e-05,
+      "loss": 0.8224,
+      "step": 2390
+    },
+    {
+      "epoch": 2.9887920298879203,
+      "grad_norm": 8.375,
+      "learning_rate": 8.445555555555556e-05,
+      "loss": 0.8418,
+      "step": 2400
+    },
+    {
+      "epoch": 2.9887920298879203,
+      "eval/acc": 46.511627197265625,
+      "step": 2400
+    },
+    {
+      "epoch": 2.9887920298879203,
+      "eval_loss": 1.9594019651412964,
+      "eval_runtime": 0.2169,
+      "eval_samples_per_second": 198.209,
+      "eval_steps_per_second": 4.61,
+      "step": 2400
+    },
+    {
+      "epoch": 3.0012453300124533,
+      "grad_norm": 6.78125,
+      "learning_rate": 8.434444444444445e-05,
+      "loss": 0.7483,
+      "step": 2410
+    },
+    {
+      "epoch": 3.0136986301369864,
+      "grad_norm": 8.75,
+      "learning_rate": 8.423333333333334e-05,
+      "loss": 0.7396,
+      "step": 2420
+    },
+    {
+      "epoch": 3.0261519302615194,
+      "grad_norm": 15.6875,
+      "learning_rate": 8.412222222222222e-05,
+      "loss": 0.7436,
+      "step": 2430
+    },
+    {
+      "epoch": 3.0386052303860525,
+      "grad_norm": 8.5625,
+      "learning_rate": 8.401111111111112e-05,
+      "loss": 0.6092,
+      "step": 2440
+    },
+    {
+      "epoch": 3.0510585305105855,
+      "grad_norm": 11.0,
+      "learning_rate": 8.39e-05,
+      "loss": 0.7142,
+      "step": 2450
+    },
+    {
+      "epoch": 3.063511830635118,
+      "grad_norm": 11.8125,
+      "learning_rate": 8.378888888888889e-05,
+      "loss": 0.692,
+      "step": 2460
+    },
+    {
+      "epoch": 3.075965130759651,
+      "grad_norm": 10.8125,
+      "learning_rate": 8.367777777777778e-05,
+      "loss": 0.672,
+      "step": 2470
+    },
+    {
+      "epoch": 3.088418430884184,
+      "grad_norm": 9.0625,
+      "learning_rate": 8.356666666666667e-05,
+      "loss": 0.6947,
+      "step": 2480
+    },
+    {
+      "epoch": 3.1008717310087173,
+      "grad_norm": 9.0625,
+      "learning_rate": 8.345555555555556e-05,
+      "loss": 0.7188,
+      "step": 2490
+    },
+    {
+      "epoch": 3.1133250311332503,
+      "grad_norm": 7.125,
+      "learning_rate": 8.334444444444444e-05,
+      "loss": 0.6621,
+      "step": 2500
+    },
+    {
+      "epoch": 3.1133250311332503,
+      "eval/acc": 41.86046600341797,
+      "step": 2500
+    },
+    {
+      "epoch": 3.1133250311332503,
+      "eval_loss": 2.2108497619628906,
+      "eval_runtime": 2.0608,
+      "eval_samples_per_second": 20.866,
+      "eval_steps_per_second": 0.485,
+      "step": 2500
+    },
+    {
+      "epoch": 3.1257783312577834,
+      "grad_norm": 9.6875,
+      "learning_rate": 8.323333333333334e-05,
+      "loss": 0.7583,
+      "step": 2510
+    },
+    {
+      "epoch": 3.1382316313823164,
+      "grad_norm": 8.4375,
+      "learning_rate": 8.312222222222223e-05,
+      "loss": 0.6909,
+      "step": 2520
+    },
+    {
+      "epoch": 3.1506849315068495,
+      "grad_norm": 10.5625,
+      "learning_rate": 8.301111111111111e-05,
+      "loss": 0.7588,
+      "step": 2530
+    },
+    {
+      "epoch": 3.1631382316313825,
+      "grad_norm": 14.1875,
+      "learning_rate": 8.29e-05,
+      "loss": 0.6369,
+      "step": 2540
+    },
+    {
+      "epoch": 3.175591531755915,
+      "grad_norm": 8.3125,
+      "learning_rate": 8.27888888888889e-05,
+      "loss": 0.689,
+      "step": 2550
+    },
+    {
+      "epoch": 3.188044831880448,
+      "grad_norm": 6.6875,
+      "learning_rate": 8.267777777777778e-05,
+      "loss": 0.6762,
+      "step": 2560
+    },
+    {
+      "epoch": 3.2004981320049812,
+      "grad_norm": 12.3125,
+      "learning_rate": 8.256666666666666e-05,
+      "loss": 0.7338,
+      "step": 2570
+    },
+    {
+      "epoch": 3.2129514321295143,
+      "grad_norm": 7.53125,
+      "learning_rate": 8.245555555555556e-05,
+      "loss": 0.6732,
+      "step": 2580
+    },
+    {
+      "epoch": 3.2254047322540473,
+      "grad_norm": 11.1875,
+      "learning_rate": 8.234444444444445e-05,
+      "loss": 0.7448,
+      "step": 2590
+    },
+    {
+      "epoch": 3.2378580323785804,
+      "grad_norm": 10.25,
+      "learning_rate": 8.223333333333334e-05,
+      "loss": 0.6227,
+      "step": 2600
+    },
+    {
+      "epoch": 3.2378580323785804,
+      "eval/acc": 41.86046600341797,
+      "step": 2600
+    },
+    {
+      "epoch": 3.2378580323785804,
+      "eval_loss": 2.2032454013824463,
+      "eval_runtime": 0.2227,
+      "eval_samples_per_second": 193.082,
+      "eval_steps_per_second": 4.49,
+      "step": 2600
+    },
+    {
+      "epoch": 3.2503113325031134,
+      "grad_norm": 9.6875,
+      "learning_rate": 8.212222222222223e-05,
+      "loss": 0.6085,
+      "step": 2610
+    },
+    {
+      "epoch": 3.2627646326276465,
+      "grad_norm": 7.03125,
+      "learning_rate": 8.201111111111111e-05,
+      "loss": 0.6685,
+      "step": 2620
+    },
+    {
+      "epoch": 3.275217932752179,
+      "grad_norm": 9.5625,
+      "learning_rate": 8.19e-05,
+      "loss": 0.6985,
+      "step": 2630
+    },
+    {
+      "epoch": 3.287671232876712,
+      "grad_norm": 12.625,
+      "learning_rate": 8.17888888888889e-05,
+      "loss": 0.6639,
+      "step": 2640
+    },
+    {
+      "epoch": 3.300124533001245,
+      "grad_norm": 8.6875,
+      "learning_rate": 8.167777777777778e-05,
+      "loss": 0.6424,
+      "step": 2650
+    },
+    {
+      "epoch": 3.3125778331257782,
+      "grad_norm": 9.25,
+      "learning_rate": 8.156666666666667e-05,
+      "loss": 0.6259,
+      "step": 2660
+    },
+    {
+      "epoch": 3.3250311332503113,
+      "grad_norm": 9.5,
+      "learning_rate": 8.145555555555556e-05,
+      "loss": 0.6859,
+      "step": 2670
+    },
+    {
+      "epoch": 3.3374844333748444,
+      "grad_norm": 11.8125,
+      "learning_rate": 8.134444444444445e-05,
+      "loss": 0.7418,
+      "step": 2680
+    },
+    {
+      "epoch": 3.3499377334993774,
+      "grad_norm": 9.875,
+      "learning_rate": 8.123333333333333e-05,
+      "loss": 0.7075,
+      "step": 2690
+    },
+    {
+      "epoch": 3.3623910336239105,
+      "grad_norm": 8.375,
+      "learning_rate": 8.112222222222222e-05,
+      "loss": 0.6807,
+      "step": 2700
+    },
+    {
+      "epoch": 3.3623910336239105,
+      "eval/acc": 41.86046600341797,
+      "step": 2700
+    },
+    {
+      "epoch": 3.3623910336239105,
+      "eval_loss": 2.1954193115234375,
+      "eval_runtime": 0.2168,
+      "eval_samples_per_second": 198.341,
+      "eval_steps_per_second": 4.613,
+      "step": 2700
+    },
+    {
+      "epoch": 3.3748443337484435,
+      "grad_norm": 8.375,
+      "learning_rate": 8.101111111111112e-05,
+      "loss": 0.6836,
+      "step": 2710
+    },
+    {
+      "epoch": 3.3872976338729766,
+      "grad_norm": 7.9375,
+      "learning_rate": 8.090000000000001e-05,
+      "loss": 0.664,
+      "step": 2720
+    },
+    {
+      "epoch": 3.399750933997509,
+      "grad_norm": 12.25,
+      "learning_rate": 8.078888888888889e-05,
+      "loss": 0.7048,
+      "step": 2730
+    },
+    {
+      "epoch": 3.412204234122042,
+      "grad_norm": 7.90625,
+      "learning_rate": 8.067777777777778e-05,
+      "loss": 0.6156,
+      "step": 2740
+    },
+    {
+      "epoch": 3.4246575342465753,
+      "grad_norm": 10.8125,
+      "learning_rate": 8.056666666666667e-05,
+      "loss": 0.6308,
+      "step": 2750
+    },
+    {
+      "epoch": 3.4371108343711083,
+      "grad_norm": 10.5,
+      "learning_rate": 8.045555555555557e-05,
+      "loss": 0.7397,
+      "step": 2760
+    },
+    {
+      "epoch": 3.4495641344956414,
+      "grad_norm": 10.1875,
+      "learning_rate": 8.034444444444444e-05,
+      "loss": 0.5898,
+      "step": 2770
+    },
+    {
+      "epoch": 3.4620174346201744,
+      "grad_norm": 10.375,
+      "learning_rate": 8.023333333333334e-05,
+      "loss": 0.663,
+      "step": 2780
+    },
+    {
+      "epoch": 3.4744707347447075,
+      "grad_norm": 9.3125,
+      "learning_rate": 8.012222222222222e-05,
+      "loss": 0.7272,
+      "step": 2790
+    },
+    {
+      "epoch": 3.4869240348692405,
+      "grad_norm": 9.375,
+      "learning_rate": 8.001111111111112e-05,
+      "loss": 0.6591,
+      "step": 2800
+    },
+    {
+      "epoch": 3.4869240348692405,
+      "eval/acc": 37.20930099487305,
+      "step": 2800
+    },
+    {
+      "epoch": 3.4869240348692405,
+      "eval_loss": 2.223583698272705,
+      "eval_runtime": 0.2188,
+      "eval_samples_per_second": 196.518,
+      "eval_steps_per_second": 4.57,
+      "step": 2800
+    },
+    {
+      "epoch": 3.499377334993773,
+      "grad_norm": 10.4375,
+      "learning_rate": 7.99e-05,
+      "loss": 0.6579,
+      "step": 2810
+    },
+    {
+      "epoch": 3.511830635118306,
+      "grad_norm": 8.0625,
+      "learning_rate": 7.978888888888889e-05,
+      "loss": 0.5828,
+      "step": 2820
+    },
+    {
+      "epoch": 3.5242839352428392,
+      "grad_norm": 10.8125,
+      "learning_rate": 7.967777777777779e-05,
+      "loss": 0.6624,
+      "step": 2830
+    },
+    {
+      "epoch": 3.5367372353673723,
+      "grad_norm": 8.6875,
+      "learning_rate": 7.956666666666667e-05,
+      "loss": 0.6394,
+      "step": 2840
+    },
+    {
+      "epoch": 3.5491905354919053,
+      "grad_norm": 8.4375,
+      "learning_rate": 7.945555555555556e-05,
+      "loss": 0.6937,
+      "step": 2850
+    },
+    {
+      "epoch": 3.5616438356164384,
+      "grad_norm": 9.1875,
+      "learning_rate": 7.934444444444444e-05,
+      "loss": 0.7377,
+      "step": 2860
+    },
+    {
+      "epoch": 3.5740971357409714,
+      "grad_norm": 8.9375,
+      "learning_rate": 7.923333333333334e-05,
+      "loss": 0.6374,
+      "step": 2870
+    },
+    {
+      "epoch": 3.5865504358655045,
+      "grad_norm": 11.25,
+      "learning_rate": 7.912222222222224e-05,
+      "loss": 0.6606,
+      "step": 2880
+    },
+    {
+      "epoch": 3.5990037359900375,
+      "grad_norm": 11.0,
+      "learning_rate": 7.901111111111111e-05,
+      "loss": 0.692,
+      "step": 2890
+    },
+    {
+      "epoch": 3.6114570361145706,
+      "grad_norm": 8.375,
+      "learning_rate": 7.890000000000001e-05,
+      "loss": 0.6431,
+      "step": 2900
+    },
+    {
+      "epoch": 3.6114570361145706,
+      "eval/acc": 41.86046600341797,
+      "step": 2900
+    },
+    {
+      "epoch": 3.6114570361145706,
+      "eval_loss": 2.137084722518921,
+      "eval_runtime": 0.2198,
+      "eval_samples_per_second": 195.618,
+      "eval_steps_per_second": 4.549,
+      "step": 2900
+    },
+    {
+      "epoch": 3.6239103362391036,
+      "grad_norm": 32.25,
+      "learning_rate": 7.878888888888889e-05,
+      "loss": 0.6968,
+      "step": 2910
+    },
+    {
+      "epoch": 3.6363636363636362,
+      "grad_norm": 9.25,
+      "learning_rate": 7.867777777777779e-05,
+      "loss": 0.7186,
+      "step": 2920
+    },
+    {
+      "epoch": 3.6488169364881693,
+      "grad_norm": 10.0625,
+      "learning_rate": 7.856666666666666e-05,
+      "loss": 0.5565,
+      "step": 2930
+    },
+    {
+      "epoch": 3.6612702366127023,
+      "grad_norm": 8.4375,
+      "learning_rate": 7.845555555555556e-05,
+      "loss": 0.6036,
+      "step": 2940
+    },
+    {
+      "epoch": 3.6737235367372354,
+      "grad_norm": 10.6875,
+      "learning_rate": 7.834444444444446e-05,
+      "loss": 0.6465,
+      "step": 2950
+    },
+    {
+      "epoch": 3.6861768368617684,
+      "grad_norm": 11.1875,
+      "learning_rate": 7.823333333333334e-05,
+      "loss": 0.803,
+      "step": 2960
+    },
+    {
+      "epoch": 3.6986301369863015,
+      "grad_norm": 9.9375,
+      "learning_rate": 7.812222222222223e-05,
+      "loss": 0.5897,
+      "step": 2970
+    },
+    {
+      "epoch": 3.711083437110834,
+      "grad_norm": 9.1875,
+      "learning_rate": 7.801111111111111e-05,
+      "loss": 0.6373,
+      "step": 2980
+    },
+    {
+      "epoch": 3.723536737235367,
+      "grad_norm": 7.9375,
+      "learning_rate": 7.790000000000001e-05,
+      "loss": 0.696,
+      "step": 2990
+    },
+    {
+      "epoch": 3.7359900373599,
+      "grad_norm": 8.0625,
+      "learning_rate": 7.77888888888889e-05,
+      "loss": 0.612,
+      "step": 3000
+    },
+    {
+      "epoch": 3.7359900373599,
+      "eval/acc": 37.20930099487305,
+      "step": 3000
+    },
+    {
+      "epoch": 3.7359900373599,
+      "eval_loss": 2.2000465393066406,
+      "eval_runtime": 0.2179,
+      "eval_samples_per_second": 197.339,
+      "eval_steps_per_second": 4.589,
+      "step": 3000
+    },
+    {
+      "epoch": 3.7484433374844333,
+      "grad_norm": 8.8125,
+      "learning_rate": 7.767777777777778e-05,
+      "loss": 0.687,
+      "step": 3010
+    },
+    {
+      "epoch": 3.7608966376089663,
+      "grad_norm": 11.875,
+      "learning_rate": 7.756666666666666e-05,
+      "loss": 0.706,
+      "step": 3020
+    },
+    {
+      "epoch": 3.7733499377334994,
+      "grad_norm": 9.9375,
+      "learning_rate": 7.745555555555556e-05,
+      "loss": 0.7025,
+      "step": 3030
+    },
+    {
+      "epoch": 3.7858032378580324,
+      "grad_norm": 8.25,
+      "learning_rate": 7.734444444444445e-05,
+      "loss": 0.6412,
+      "step": 3040
+    },
+    {
+      "epoch": 3.7982565379825655,
+      "grad_norm": 9.5625,
+      "learning_rate": 7.723333333333333e-05,
+      "loss": 0.6873,
+      "step": 3050
+    },
+    {
+      "epoch": 3.8107098381070985,
+      "grad_norm": 9.9375,
+      "learning_rate": 7.712222222222223e-05,
+      "loss": 0.6956,
+      "step": 3060
+    },
+    {
+      "epoch": 3.8231631382316316,
+      "grad_norm": 7.46875,
+      "learning_rate": 7.701111111111111e-05,
+      "loss": 0.6178,
+      "step": 3070
+    },
+    {
+      "epoch": 3.8356164383561646,
+      "grad_norm": 8.75,
+      "learning_rate": 7.69e-05,
+      "loss": 0.6322,
+      "step": 3080
+    },
+    {
+      "epoch": 3.848069738480697,
+      "grad_norm": 7.375,
+      "learning_rate": 7.678888888888888e-05,
+      "loss": 0.6457,
+      "step": 3090
+    },
+    {
+      "epoch": 3.8605230386052303,
+      "grad_norm": 7.84375,
+      "learning_rate": 7.667777777777778e-05,
+      "loss": 0.6358,
+      "step": 3100
+    },
+    {
+      "epoch": 3.8605230386052303,
+      "eval/acc": 46.511627197265625,
+      "step": 3100
+    },
+    {
+      "epoch": 3.8605230386052303,
+      "eval_loss": 2.1861517429351807,
+      "eval_runtime": 0.2234,
+      "eval_samples_per_second": 192.489,
+      "eval_steps_per_second": 4.476,
+      "step": 3100
+    },
+    {
+      "epoch": 3.8729763387297633,
+      "grad_norm": 8.875,
+      "learning_rate": 7.656666666666668e-05,
+      "loss": 0.6032,
+      "step": 3110
+    },
+    {
+      "epoch": 3.8854296388542964,
+      "grad_norm": 9.75,
+      "learning_rate": 7.645555555555556e-05,
+      "loss": 0.6119,
+      "step": 3120
+    },
+    {
+      "epoch": 3.8978829389788294,
+      "grad_norm": 9.8125,
+      "learning_rate": 7.634444444444445e-05,
+      "loss": 0.6792,
+      "step": 3130
+    },
+    {
+      "epoch": 3.9103362391033625,
+      "grad_norm": 9.0,
+      "learning_rate": 7.623333333333333e-05,
+      "loss": 0.6191,
+      "step": 3140
+    },
+    {
+      "epoch": 3.9227895392278955,
+      "grad_norm": 8.8125,
+      "learning_rate": 7.612222222222223e-05,
+      "loss": 0.6312,
+      "step": 3150
+    },
+    {
+      "epoch": 3.935242839352428,
+      "grad_norm": 8.0,
+      "learning_rate": 7.601111111111112e-05,
+      "loss": 0.6148,
+      "step": 3160
+    },
+    {
+      "epoch": 3.947696139476961,
+      "grad_norm": 8.5,
+      "learning_rate": 7.59e-05,
+      "loss": 0.6006,
+      "step": 3170
+    },
+    {
+      "epoch": 3.9601494396014942,
+      "grad_norm": 8.375,
+      "learning_rate": 7.578888888888889e-05,
+      "loss": 0.6129,
+      "step": 3180
+    },
+    {
+      "epoch": 3.9726027397260273,
+      "grad_norm": 9.3125,
+      "learning_rate": 7.567777777777778e-05,
+      "loss": 0.6801,
+      "step": 3190
+    },
+    {
+      "epoch": 3.9850560398505603,
+      "grad_norm": 10.0,
+      "learning_rate": 7.556666666666667e-05,
+      "loss": 0.7283,
+      "step": 3200
+    },
+    {
+      "epoch": 3.9850560398505603,
+      "eval/acc": 44.1860466003418,
+      "step": 3200
+    },
+    {
+      "epoch": 3.9850560398505603,
+      "eval_loss": 2.154561758041382,
+      "eval_runtime": 0.2194,
+      "eval_samples_per_second": 196.004,
+      "eval_steps_per_second": 4.558,
+      "step": 3200
+    },
+    {
+      "epoch": 3.9975093399750934,
+      "grad_norm": 6.78125,
+      "learning_rate": 7.545555555555555e-05,
+      "loss": 0.6551,
+      "step": 3210
+    },
+    {
+      "epoch": 4.009962640099626,
+      "grad_norm": 7.15625,
+      "learning_rate": 7.534444444444445e-05,
+      "loss": 0.6154,
+      "step": 3220
+    },
+    {
+      "epoch": 4.0224159402241595,
+      "grad_norm": 8.3125,
+      "learning_rate": 7.523333333333334e-05,
+      "loss": 0.5325,
+      "step": 3230
+    },
+    {
+      "epoch": 4.0348692403486925,
+      "grad_norm": 12.125,
+      "learning_rate": 7.512222222222222e-05,
+      "loss": 0.6045,
+      "step": 3240
+    },
+    {
+      "epoch": 4.047322540473226,
+      "grad_norm": 8.875,
+      "learning_rate": 7.50111111111111e-05,
+      "loss": 0.6101,
+      "step": 3250
+    },
+    {
+      "epoch": 4.059775840597759,
+      "grad_norm": 8.9375,
+      "learning_rate": 7.49e-05,
+      "loss": 0.6041,
+      "step": 3260
+    },
+    {
+      "epoch": 4.072229140722292,
+      "grad_norm": 13.9375,
+      "learning_rate": 7.47888888888889e-05,
+      "loss": 0.5396,
+      "step": 3270
+    },
+    {
+      "epoch": 4.084682440846825,
+      "grad_norm": 10.4375,
+      "learning_rate": 7.467777777777777e-05,
+      "loss": 0.5914,
+      "step": 3280
+    },
+    {
+      "epoch": 4.097135740971358,
+      "grad_norm": 8.4375,
+      "learning_rate": 7.456666666666667e-05,
+      "loss": 0.5281,
+      "step": 3290
+    },
+    {
+      "epoch": 4.109589041095891,
+      "grad_norm": 10.0,
+      "learning_rate": 7.445555555555556e-05,
+      "loss": 0.6078,
+      "step": 3300
+    },
+    {
+      "epoch": 4.109589041095891,
+      "eval/acc": 32.55813980102539,
+      "step": 3300
+    },
+    {
+      "epoch": 4.109589041095891,
+      "eval_loss": 2.71657133102417,
+      "eval_runtime": 2.577,
+      "eval_samples_per_second": 16.686,
+      "eval_steps_per_second": 0.388,
+      "step": 3300
+    },
+    {
+      "epoch": 4.122042341220423,
+      "grad_norm": 9.5,
+      "learning_rate": 7.434444444444446e-05,
+      "loss": 0.6107,
+      "step": 3310
+    },
+    {
+      "epoch": 4.134495641344956,
+      "grad_norm": 8.375,
+      "learning_rate": 7.423333333333333e-05,
+      "loss": 0.5635,
+      "step": 3320
+    },
+    {
+      "epoch": 4.146948941469489,
+      "grad_norm": 9.4375,
+      "learning_rate": 7.412222222222222e-05,
+      "loss": 0.6277,
+      "step": 3330
+    },
+    {
+      "epoch": 4.159402241594022,
+      "grad_norm": 10.0,
+      "learning_rate": 7.401111111111112e-05,
+      "loss": 0.71,
+      "step": 3340
+    },
+    {
+      "epoch": 4.171855541718555,
+      "grad_norm": 9.625,
+      "learning_rate": 7.390000000000001e-05,
+      "loss": 0.5073,
+      "step": 3350
+    },
+    {
+      "epoch": 4.184308841843088,
+      "grad_norm": 11.125,
+      "learning_rate": 7.378888888888889e-05,
+      "loss": 0.613,
+      "step": 3360
+    },
+    {
+      "epoch": 4.196762141967621,
+      "grad_norm": 9.0625,
+      "learning_rate": 7.367777777777778e-05,
+      "loss": 0.6214,
+      "step": 3370
+    },
+    {
+      "epoch": 4.209215442092154,
+      "grad_norm": 7.75,
+      "learning_rate": 7.356666666666667e-05,
+      "loss": 0.5957,
+      "step": 3380
+    },
+    {
+      "epoch": 4.221668742216687,
+      "grad_norm": 7.71875,
+      "learning_rate": 7.345555555555556e-05,
+      "loss": 0.5365,
+      "step": 3390
+    },
+    {
+      "epoch": 4.2341220423412205,
+      "grad_norm": 7.5,
+      "learning_rate": 7.334444444444444e-05,
+      "loss": 0.4691,
+      "step": 3400
+    },
+    {
+      "epoch": 4.2341220423412205,
+      "eval/acc": 37.20930099487305,
+      "step": 3400
+    },
+    {
+      "epoch": 4.2341220423412205,
+      "eval_loss": 2.6280057430267334,
+      "eval_runtime": 0.2305,
+      "eval_samples_per_second": 186.512,
+      "eval_steps_per_second": 4.337,
+      "step": 3400
+    },
+    {
+      "epoch": 4.2465753424657535,
+      "grad_norm": 8.5,
+      "learning_rate": 7.323333333333333e-05,
+      "loss": 0.6429,
+      "step": 3410
+    },
+    {
+      "epoch": 4.259028642590287,
+      "grad_norm": 8.625,
+      "learning_rate": 7.312222222222223e-05,
+      "loss": 0.5898,
+      "step": 3420
+    },
+    {
+      "epoch": 4.27148194271482,
+      "grad_norm": 8.375,
+      "learning_rate": 7.301111111111113e-05,
+      "loss": 0.6207,
+      "step": 3430
+    },
+    {
+      "epoch": 4.283935242839353,
+      "grad_norm": 7.9375,
+      "learning_rate": 7.29e-05,
+      "loss": 0.5395,
+      "step": 3440
+    },
+    {
+      "epoch": 4.296388542963886,
+      "grad_norm": 7.25,
+      "learning_rate": 7.27888888888889e-05,
+      "loss": 0.5836,
+      "step": 3450
+    },
+    {
+      "epoch": 4.308841843088419,
+      "grad_norm": 8.5,
+      "learning_rate": 7.267777777777778e-05,
+      "loss": 0.661,
+      "step": 3460
+    },
+    {
+      "epoch": 4.321295143212952,
+      "grad_norm": 11.1875,
+      "learning_rate": 7.256666666666668e-05,
+      "loss": 0.5491,
+      "step": 3470
+    },
+    {
+      "epoch": 4.333748443337484,
+      "grad_norm": 9.25,
+      "learning_rate": 7.245555555555555e-05,
+      "loss": 0.5489,
+      "step": 3480
+    },
+    {
+      "epoch": 4.346201743462017,
+      "grad_norm": 10.0,
+      "learning_rate": 7.234444444444445e-05,
+      "loss": 0.6114,
+      "step": 3490
+    },
+    {
+      "epoch": 4.35865504358655,
+      "grad_norm": 10.9375,
+      "learning_rate": 7.223333333333335e-05,
+      "loss": 0.5994,
+      "step": 3500
+    },
+    {
+      "epoch": 4.35865504358655,
+      "eval/acc": 37.20930099487305,
+      "step": 3500
+    },
+    {
+      "epoch": 4.35865504358655,
+      "eval_loss": 2.7129628658294678,
+      "eval_runtime": 0.2205,
+      "eval_samples_per_second": 194.97,
+      "eval_steps_per_second": 4.534,
+      "step": 3500
+    },
+    {
+      "epoch": 4.371108343711083,
+      "grad_norm": 11.0625,
+      "learning_rate": 7.212222222222223e-05,
+      "loss": 0.6538,
+      "step": 3510
+    },
+    {
+      "epoch": 4.383561643835616,
+      "grad_norm": 8.875,
+      "learning_rate": 7.201111111111111e-05,
+      "loss": 0.6445,
+      "step": 3520
+    },
+    {
+      "epoch": 4.396014943960149,
+      "grad_norm": 8.0625,
+      "learning_rate": 7.19e-05,
+      "loss": 0.5865,
+      "step": 3530
+    },
+    {
+      "epoch": 4.408468244084682,
+      "grad_norm": 7.84375,
+      "learning_rate": 7.17888888888889e-05,
+      "loss": 0.5756,
+      "step": 3540
+    },
+    {
+      "epoch": 4.420921544209215,
+      "grad_norm": 9.6875,
+      "learning_rate": 7.167777777777778e-05,
+      "loss": 0.599,
+      "step": 3550
+    },
+    {
+      "epoch": 4.433374844333748,
+      "grad_norm": 7.71875,
+      "learning_rate": 7.156666666666667e-05,
+      "loss": 0.5953,
+      "step": 3560
+    },
+    {
+      "epoch": 4.4458281444582815,
+      "grad_norm": 8.375,
+      "learning_rate": 7.145555555555557e-05,
+      "loss": 0.5322,
+      "step": 3570
+    },
+    {
+      "epoch": 4.4582814445828145,
+      "grad_norm": 12.0625,
+      "learning_rate": 7.134444444444445e-05,
+      "loss": 0.6048,
+      "step": 3580
+    },
+    {
+      "epoch": 4.4707347447073476,
+      "grad_norm": 91.5,
+      "learning_rate": 7.123333333333333e-05,
+      "loss": 0.601,
+      "step": 3590
+    },
+    {
+      "epoch": 4.483188044831881,
+      "grad_norm": 9.5625,
+      "learning_rate": 7.112222222222222e-05,
+      "loss": 0.565,
+      "step": 3600
+    },
+    {
+      "epoch": 4.483188044831881,
+      "eval/acc": 37.20930099487305,
+      "step": 3600
+    },
+    {
+      "epoch": 4.483188044831881,
+      "eval_loss": 2.709101438522339,
+      "eval_runtime": 5.5657,
+      "eval_samples_per_second": 7.726,
+      "eval_steps_per_second": 0.18,
+      "step": 3600
+    },
+    {
+      "epoch": 4.495641344956414,
+      "grad_norm": 9.875,
+      "learning_rate": 7.101111111111112e-05,
+      "loss": 0.6184,
+      "step": 3610
+    },
+    {
+      "epoch": 4.508094645080947,
+      "grad_norm": 11.3125,
+      "learning_rate": 7.09e-05,
+      "loss": 0.5582,
+      "step": 3620
+    },
+    {
+      "epoch": 4.52054794520548,
+      "grad_norm": 10.3125,
+      "learning_rate": 7.078888888888889e-05,
+      "loss": 0.5685,
+      "step": 3630
+    },
+    {
+      "epoch": 4.533001245330013,
+      "grad_norm": 9.8125,
+      "learning_rate": 7.067777777777777e-05,
+      "loss": 0.6142,
+      "step": 3640
+    },
+    {
+      "epoch": 4.545454545454545,
+      "grad_norm": 36.0,
+      "learning_rate": 7.056666666666667e-05,
+      "loss": 0.5638,
+      "step": 3650
+    },
+    {
+      "epoch": 4.557907845579079,
+      "grad_norm": 9.75,
+      "learning_rate": 7.045555555555557e-05,
+      "loss": 0.5066,
+      "step": 3660
+    },
+    {
+      "epoch": 4.570361145703611,
+      "grad_norm": 12.3125,
+      "learning_rate": 7.034444444444445e-05,
+      "loss": 0.5949,
+      "step": 3670
+    },
+    {
+      "epoch": 4.582814445828144,
+      "grad_norm": 9.1875,
+      "learning_rate": 7.023333333333334e-05,
+      "loss": 0.5913,
+      "step": 3680
+    },
+    {
+      "epoch": 4.595267745952677,
+      "grad_norm": 9.25,
+      "learning_rate": 7.012222222222222e-05,
+      "loss": 0.6607,
+      "step": 3690
+    },
+    {
+      "epoch": 4.60772104607721,
+      "grad_norm": 10.25,
+      "learning_rate": 7.001111111111112e-05,
+      "loss": 0.5912,
+      "step": 3700
+    },
+    {
+      "epoch": 4.60772104607721,
+      "eval/acc": 34.88372039794922,
+      "step": 3700
+    },
+    {
+      "epoch": 4.60772104607721,
+      "eval_loss": 2.7723538875579834,
+      "eval_runtime": 0.2246,
+      "eval_samples_per_second": 191.465,
+      "eval_steps_per_second": 4.453,
+      "step": 3700
+    },
+    {
+      "epoch": 4.620174346201743,
+      "grad_norm": 10.875,
+      "learning_rate": 6.99e-05,
+      "loss": 0.6132,
+      "step": 3710
+    },
+    {
+      "epoch": 4.632627646326276,
+      "grad_norm": 11.125,
+      "learning_rate": 6.978888888888889e-05,
+      "loss": 0.5569,
+      "step": 3720
+    },
+    {
+      "epoch": 4.645080946450809,
+      "grad_norm": 13.75,
+      "learning_rate": 6.967777777777779e-05,
+      "loss": 0.5752,
+      "step": 3730
+    },
+    {
+      "epoch": 4.657534246575342,
+      "grad_norm": 10.6875,
+      "learning_rate": 6.956666666666667e-05,
+      "loss": 0.584,
+      "step": 3740
+    },
+    {
+      "epoch": 4.6699875466998755,
+      "grad_norm": 8.5,
+      "learning_rate": 6.945555555555556e-05,
+      "loss": 0.5779,
+      "step": 3750
+    },
+    {
+      "epoch": 4.6824408468244085,
+      "grad_norm": 8.375,
+      "learning_rate": 6.934444444444444e-05,
+      "loss": 0.5435,
+      "step": 3760
+    },
+    {
+      "epoch": 4.694894146948942,
+      "grad_norm": 10.1875,
+      "learning_rate": 6.923333333333334e-05,
+      "loss": 0.6124,
+      "step": 3770
+    },
+    {
+      "epoch": 4.707347447073475,
+      "grad_norm": 8.5,
+      "learning_rate": 6.912222222222222e-05,
+      "loss": 0.6087,
+      "step": 3780
+    },
+    {
+      "epoch": 4.719800747198008,
+      "grad_norm": 10.625,
+      "learning_rate": 6.901111111111111e-05,
+      "loss": 0.5831,
+      "step": 3790
+    },
+    {
+      "epoch": 4.732254047322541,
+      "grad_norm": 7.84375,
+      "learning_rate": 6.89e-05,
+      "loss": 0.5711,
+      "step": 3800
+    },
+    {
+      "epoch": 4.732254047322541,
+      "eval/acc": 34.88372039794922,
+      "step": 3800
+    },
+    {
+      "epoch": 4.732254047322541,
+      "eval_loss": 2.7747089862823486,
+      "eval_runtime": 0.3882,
+      "eval_samples_per_second": 110.773,
+      "eval_steps_per_second": 2.576,
+      "step": 3800
+    },
+    {
+      "epoch": 4.744707347447074,
+      "grad_norm": 8.25,
+      "learning_rate": 6.878888888888889e-05,
+      "loss": 0.617,
+      "step": 3810
+    },
+    {
+      "epoch": 4.757160647571607,
+      "grad_norm": 10.125,
+      "learning_rate": 6.867777777777779e-05,
+      "loss": 0.5471,
+      "step": 3820
+    },
+    {
+      "epoch": 4.76961394769614,
+      "grad_norm": 10.8125,
+      "learning_rate": 6.856666666666666e-05,
+      "loss": 0.6168,
+      "step": 3830
+    },
+    {
+      "epoch": 4.782067247820672,
+      "grad_norm": 10.5,
+      "learning_rate": 6.845555555555556e-05,
+      "loss": 0.6136,
+      "step": 3840
+    },
+    {
+      "epoch": 4.794520547945205,
+      "grad_norm": 12.1875,
+      "learning_rate": 6.834444444444444e-05,
+      "loss": 0.6066,
+      "step": 3850
+    },
+    {
+      "epoch": 4.806973848069738,
+      "grad_norm": 11.125,
+      "learning_rate": 6.823333333333334e-05,
+      "loss": 0.6457,
+      "step": 3860
+    },
+    {
+      "epoch": 4.819427148194271,
+      "grad_norm": 12.375,
+      "learning_rate": 6.812222222222221e-05,
+      "loss": 0.6109,
+      "step": 3870
+    },
+    {
+      "epoch": 4.831880448318804,
+      "grad_norm": 10.0625,
+      "learning_rate": 6.801111111111111e-05,
+      "loss": 0.6063,
+      "step": 3880
+    },
+    {
+      "epoch": 4.844333748443337,
+      "grad_norm": 8.1875,
+      "learning_rate": 6.790000000000001e-05,
+      "loss": 0.5681,
+      "step": 3890
+    },
+    {
+      "epoch": 4.85678704856787,
+      "grad_norm": 24.125,
+      "learning_rate": 6.77888888888889e-05,
+      "loss": 0.5041,
+      "step": 3900
+    },
+    {
+      "epoch": 4.85678704856787,
+      "eval/acc": 32.55813980102539,
+      "step": 3900
+    },
+    {
+      "epoch": 4.85678704856787,
+      "eval_loss": 2.7735087871551514,
+      "eval_runtime": 0.2177,
+      "eval_samples_per_second": 197.481,
+      "eval_steps_per_second": 4.593,
+      "step": 3900
+    },
+    {
+      "epoch": 4.869240348692403,
+      "grad_norm": 10.875,
+      "learning_rate": 6.767777777777778e-05,
+      "loss": 0.5809,
+      "step": 3910
+    },
+    {
+      "epoch": 4.8816936488169365,
+      "grad_norm": 9.0625,
+      "learning_rate": 6.756666666666666e-05,
+      "loss": 0.5673,
+      "step": 3920
+    },
+    {
+      "epoch": 4.8941469489414695,
+      "grad_norm": 8.5625,
+      "learning_rate": 6.745555555555556e-05,
+      "loss": 0.625,
+      "step": 3930
+    },
+    {
+      "epoch": 4.906600249066003,
+      "grad_norm": 7.625,
+      "learning_rate": 6.734444444444445e-05,
+      "loss": 0.541,
+      "step": 3940
+    },
+    {
+      "epoch": 4.919053549190536,
+      "grad_norm": 8.75,
+      "learning_rate": 6.723333333333333e-05,
+      "loss": 0.5742,
+      "step": 3950
+    },
+    {
+      "epoch": 4.931506849315069,
+      "grad_norm": 12.1875,
+      "learning_rate": 6.712222222222223e-05,
+      "loss": 0.5545,
+      "step": 3960
+    },
+    {
+      "epoch": 4.943960149439602,
+      "grad_norm": 13.1875,
+      "learning_rate": 6.701111111111112e-05,
+      "loss": 0.6346,
+      "step": 3970
+    },
+    {
+      "epoch": 4.956413449564135,
+      "grad_norm": 11.6875,
+      "learning_rate": 6.690000000000001e-05,
+      "loss": 0.5706,
+      "step": 3980
+    },
+    {
+      "epoch": 4.968866749688668,
+      "grad_norm": 8.375,
+      "learning_rate": 6.678888888888888e-05,
+      "loss": 0.5641,
+      "step": 3990
+    },
+    {
+      "epoch": 4.981320049813201,
+      "grad_norm": 9.9375,
+      "learning_rate": 6.667777777777778e-05,
+      "loss": 0.5201,
+      "step": 4000
+    },
+    {
+      "epoch": 4.981320049813201,
+      "eval/acc": 23.255813598632812,
+      "step": 4000
+    },
+    {
+      "epoch": 4.981320049813201,
+      "eval_loss": 2.779590606689453,
+      "eval_runtime": 0.2285,
+      "eval_samples_per_second": 188.191,
+      "eval_steps_per_second": 4.377,
+      "step": 4000
+    },
+    {
+      "epoch": 4.993773349937733,
+      "grad_norm": 11.1875,
+      "learning_rate": 6.656666666666667e-05,
+      "loss": 0.6364,
+      "step": 4010
+    },
+    {
+      "epoch": 5.006226650062266,
+      "grad_norm": 8.3125,
+      "learning_rate": 6.645555555555557e-05,
+      "loss": 0.5594,
+      "step": 4020
+    },
+    {
+      "epoch": 5.018679950186799,
+      "grad_norm": 8.25,
+      "learning_rate": 6.634444444444444e-05,
+      "loss": 0.4926,
+      "step": 4030
+    },
+    {
+      "epoch": 5.031133250311332,
+      "grad_norm": 9.25,
+      "learning_rate": 6.623333333333334e-05,
+      "loss": 0.552,
+      "step": 4040
+    },
+    {
+      "epoch": 5.043586550435865,
+      "grad_norm": 10.3125,
+      "learning_rate": 6.612222222222223e-05,
+      "loss": 0.5068,
+      "step": 4050
+    },
+    {
+      "epoch": 5.056039850560398,
+      "grad_norm": 9.4375,
+      "learning_rate": 6.601111111111112e-05,
+      "loss": 0.526,
+      "step": 4060
+    },
+    {
+      "epoch": 5.068493150684931,
+      "grad_norm": 10.875,
+      "learning_rate": 6.59e-05,
+      "loss": 0.4818,
+      "step": 4070
+    },
+    {
+      "epoch": 5.080946450809464,
+      "grad_norm": 7.75,
+      "learning_rate": 6.578888888888889e-05,
+      "loss": 0.491,
+      "step": 4080
+    },
+    {
+      "epoch": 5.093399750933997,
+      "grad_norm": 8.6875,
+      "learning_rate": 6.567777777777779e-05,
+      "loss": 0.4351,
+      "step": 4090
+    },
+    {
+      "epoch": 5.1058530510585305,
+      "grad_norm": 9.3125,
+      "learning_rate": 6.556666666666667e-05,
+      "loss": 0.5479,
+      "step": 4100
+    },
+    {
+      "epoch": 5.1058530510585305,
+      "eval/acc": 34.88372039794922,
+      "step": 4100
+    },
+    {
+      "epoch": 5.1058530510585305,
+      "eval_loss": 3.02447772026062,
+      "eval_runtime": 6.1633,
+      "eval_samples_per_second": 6.977,
+      "eval_steps_per_second": 0.162,
+      "step": 4100
+    },
+    {
+      "epoch": 5.1183063511830635,
+      "grad_norm": 30.0,
+      "learning_rate": 6.545555555555555e-05,
+      "loss": 0.556,
+      "step": 4110
+    },
+    {
+      "epoch": 5.130759651307597,
+      "grad_norm": 9.5,
+      "learning_rate": 6.534444444444445e-05,
+      "loss": 0.5635,
+      "step": 4120
+    },
+    {
+      "epoch": 5.14321295143213,
+      "grad_norm": 8.9375,
+      "learning_rate": 6.523333333333334e-05,
+      "loss": 0.4989,
+      "step": 4130
+    },
+    {
+      "epoch": 5.155666251556663,
+      "grad_norm": 10.25,
+      "learning_rate": 6.512222222222222e-05,
+      "loss": 0.5918,
+      "step": 4140
+    },
+    {
+      "epoch": 5.168119551681196,
+      "grad_norm": 22.875,
+      "learning_rate": 6.501111111111111e-05,
+      "loss": 0.521,
+      "step": 4150
+    },
+    {
+      "epoch": 5.180572851805729,
+      "grad_norm": 10.75,
+      "learning_rate": 6.49e-05,
+      "loss": 0.452,
+      "step": 4160
+    },
+    {
+      "epoch": 5.193026151930262,
+      "grad_norm": 8.0,
+      "learning_rate": 6.478888888888889e-05,
+      "loss": 0.4939,
+      "step": 4170
+    },
+    {
+      "epoch": 5.205479452054795,
+      "grad_norm": 12.4375,
+      "learning_rate": 6.467777777777779e-05,
+      "loss": 0.5435,
+      "step": 4180
+    },
+    {
+      "epoch": 5.217932752179328,
+      "grad_norm": 12.75,
+      "learning_rate": 6.456666666666667e-05,
+      "loss": 0.5814,
+      "step": 4190
+    },
+    {
+      "epoch": 5.23038605230386,
+      "grad_norm": 8.75,
+      "learning_rate": 6.445555555555556e-05,
+      "loss": 0.5099,
+      "step": 4200
+    },
+    {
+      "epoch": 5.23038605230386,
+      "eval/acc": 34.88372039794922,
+      "step": 4200
+    },
+    {
+      "epoch": 5.23038605230386,
+      "eval_loss": 3.014472723007202,
+      "eval_runtime": 0.8915,
+      "eval_samples_per_second": 48.231,
+      "eval_steps_per_second": 1.122,
+      "step": 4200
+    },
+    {
+      "epoch": 5.242839352428393,
+      "grad_norm": 7.65625,
+      "learning_rate": 6.434444444444446e-05,
+      "loss": 0.5053,
+      "step": 4210
+    },
+    {
+      "epoch": 5.255292652552926,
+      "grad_norm": 11.625,
+      "learning_rate": 6.423333333333334e-05,
+      "loss": 0.4819,
+      "step": 4220
+    },
+    {
+      "epoch": 5.267745952677459,
+      "grad_norm": 7.3125,
+      "learning_rate": 6.412222222222223e-05,
+      "loss": 0.5072,
+      "step": 4230
+    },
+    {
+      "epoch": 5.280199252801992,
+      "grad_norm": 10.5625,
+      "learning_rate": 6.401111111111111e-05,
+      "loss": 0.5305,
+      "step": 4240
+    },
+    {
+      "epoch": 5.292652552926525,
+      "grad_norm": 10.25,
+      "learning_rate": 6.390000000000001e-05,
+      "loss": 0.4878,
+      "step": 4250
+    },
+    {
+      "epoch": 5.305105853051058,
+      "grad_norm": 10.9375,
+      "learning_rate": 6.378888888888889e-05,
+      "loss": 0.4998,
+      "step": 4260
+    },
+    {
+      "epoch": 5.3175591531755915,
+      "grad_norm": 10.0,
+      "learning_rate": 6.367777777777778e-05,
+      "loss": 0.5084,
+      "step": 4270
+    },
+    {
+      "epoch": 5.3300124533001245,
+      "grad_norm": 12.375,
+      "learning_rate": 6.356666666666668e-05,
+      "loss": 0.5479,
+      "step": 4280
+    },
+    {
+      "epoch": 5.342465753424658,
+      "grad_norm": 10.125,
+      "learning_rate": 6.345555555555556e-05,
+      "loss": 0.5765,
+      "step": 4290
+    },
+    {
+      "epoch": 5.354919053549191,
+      "grad_norm": 8.375,
+      "learning_rate": 6.334444444444445e-05,
+      "loss": 0.5085,
+      "step": 4300
+    },
+    {
+      "epoch": 5.354919053549191,
+      "eval/acc": 34.88372039794922,
+      "step": 4300
+    },
+    {
+      "epoch": 5.354919053549191,
+      "eval_loss": 2.994861602783203,
+      "eval_runtime": 0.2348,
+      "eval_samples_per_second": 183.171,
+      "eval_steps_per_second": 4.26,
+      "step": 4300
+    },
+    {
+      "epoch": 5.367372353673724,
+      "grad_norm": 11.4375,
+      "learning_rate": 6.323333333333333e-05,
+      "loss": 0.5267,
+      "step": 4310
+    },
+    {
+      "epoch": 5.379825653798257,
+      "grad_norm": 8.0625,
+      "learning_rate": 6.312222222222223e-05,
+      "loss": 0.5324,
+      "step": 4320
+    },
+    {
+      "epoch": 5.39227895392279,
+      "grad_norm": 6.6875,
+      "learning_rate": 6.301111111111111e-05,
+      "loss": 0.4314,
+      "step": 4330
+    },
+    {
+      "epoch": 5.404732254047323,
+      "grad_norm": 9.25,
+      "learning_rate": 6.29e-05,
+      "loss": 0.4408,
+      "step": 4340
+    },
+    {
+      "epoch": 5.417185554171856,
+      "grad_norm": 9.875,
+      "learning_rate": 6.27888888888889e-05,
+      "loss": 0.4803,
+      "step": 4350
+    },
+    {
+      "epoch": 5.429638854296389,
+      "grad_norm": 12.6875,
+      "learning_rate": 6.267777777777778e-05,
+      "loss": 0.4837,
+      "step": 4360
+    },
+    {
+      "epoch": 5.442092154420921,
+      "grad_norm": 12.0625,
+      "learning_rate": 6.256666666666668e-05,
+      "loss": 0.5353,
+      "step": 4370
+    },
+    {
+      "epoch": 5.454545454545454,
+      "grad_norm": 10.375,
+      "learning_rate": 6.245555555555555e-05,
+      "loss": 0.5362,
+      "step": 4380
+    },
+    {
+      "epoch": 5.466998754669987,
+      "grad_norm": 8.375,
+      "learning_rate": 6.234444444444445e-05,
+      "loss": 0.5287,
+      "step": 4390
+    },
+    {
+      "epoch": 5.47945205479452,
+      "grad_norm": 19.875,
+      "learning_rate": 6.223333333333333e-05,
+      "loss": 0.5403,
+      "step": 4400
+    },
+    {
+      "epoch": 5.47945205479452,
+      "eval/acc": 34.88372039794922,
+      "step": 4400
+    },
+    {
+      "epoch": 5.47945205479452,
+      "eval_loss": 2.997584342956543,
+      "eval_runtime": 0.2299,
+      "eval_samples_per_second": 187.065,
+      "eval_steps_per_second": 4.35,
+      "step": 4400
+    },
+    {
+      "epoch": 5.491905354919053,
+      "grad_norm": 7.8125,
+      "learning_rate": 6.212222222222223e-05,
+      "loss": 0.4824,
+      "step": 4410
+    },
+    {
+      "epoch": 5.504358655043586,
+      "grad_norm": 8.375,
+      "learning_rate": 6.20111111111111e-05,
+      "loss": 0.5007,
+      "step": 4420
+    },
+    {
+      "epoch": 5.516811955168119,
+      "grad_norm": 9.9375,
+      "learning_rate": 6.19e-05,
+      "loss": 0.5269,
+      "step": 4430
+    },
+    {
+      "epoch": 5.5292652552926524,
+      "grad_norm": 8.0,
+      "learning_rate": 6.17888888888889e-05,
+      "loss": 0.5173,
+      "step": 4440
+    },
+    {
+      "epoch": 5.5417185554171855,
+      "grad_norm": 11.1875,
+      "learning_rate": 6.167777777777778e-05,
+      "loss": 0.5705,
+      "step": 4450
+    },
+    {
+      "epoch": 5.5541718555417185,
+      "grad_norm": 9.75,
+      "learning_rate": 6.156666666666667e-05,
+      "loss": 0.5471,
+      "step": 4460
+    },
+    {
+      "epoch": 5.566625155666252,
+      "grad_norm": 7.65625,
+      "learning_rate": 6.145555555555555e-05,
+      "loss": 0.5193,
+      "step": 4470
+    },
+    {
+      "epoch": 5.579078455790785,
+      "grad_norm": 9.0,
+      "learning_rate": 6.134444444444445e-05,
+      "loss": 0.5472,
+      "step": 4480
+    },
+    {
+      "epoch": 5.591531755915318,
+      "grad_norm": 9.25,
+      "learning_rate": 6.123333333333334e-05,
+      "loss": 0.5345,
+      "step": 4490
+    },
+    {
+      "epoch": 5.603985056039851,
+      "grad_norm": 10.5,
+      "learning_rate": 6.112222222222222e-05,
+      "loss": 0.5522,
+      "step": 4500
+    },
+    {
+      "epoch": 5.603985056039851,
+      "eval/acc": 34.88372039794922,
+      "step": 4500
+    },
+    {
+      "epoch": 5.603985056039851,
+      "eval_loss": 3.013643980026245,
+      "eval_runtime": 0.2357,
+      "eval_samples_per_second": 182.402,
+      "eval_steps_per_second": 4.242,
+      "step": 4500
+    },
+    {
+      "epoch": 5.616438356164384,
+      "grad_norm": 7.84375,
+      "learning_rate": 6.101111111111112e-05,
+      "loss": 0.571,
+      "step": 4510
+    },
+    {
+      "epoch": 5.628891656288917,
+      "grad_norm": 8.375,
+      "learning_rate": 6.09e-05,
+      "loss": 0.4792,
+      "step": 4520
+    },
+    {
+      "epoch": 5.64134495641345,
+      "grad_norm": 7.84375,
+      "learning_rate": 6.0788888888888895e-05,
+      "loss": 0.5154,
+      "step": 4530
+    },
+    {
+      "epoch": 5.653798256537982,
+      "grad_norm": 8.75,
+      "learning_rate": 6.067777777777778e-05,
+      "loss": 0.5508,
+      "step": 4540
+    },
+    {
+      "epoch": 5.666251556662516,
+      "grad_norm": 11.8125,
+      "learning_rate": 6.056666666666667e-05,
+      "loss": 0.5183,
+      "step": 4550
+    },
+    {
+      "epoch": 5.678704856787048,
+      "grad_norm": 10.75,
+      "learning_rate": 6.0455555555555555e-05,
+      "loss": 0.596,
+      "step": 4560
+    },
+    {
+      "epoch": 5.691158156911581,
+      "grad_norm": 9.8125,
+      "learning_rate": 6.034444444444445e-05,
+      "loss": 0.4746,
+      "step": 4570
+    },
+    {
+      "epoch": 5.703611457036114,
+      "grad_norm": 8.9375,
+      "learning_rate": 6.023333333333334e-05,
+      "loss": 0.4419,
+      "step": 4580
+    },
+    {
+      "epoch": 5.716064757160647,
+      "grad_norm": 9.5,
+      "learning_rate": 6.012222222222222e-05,
+      "loss": 0.4842,
+      "step": 4590
+    },
+    {
+      "epoch": 5.72851805728518,
+      "grad_norm": 9.1875,
+      "learning_rate": 6.001111111111112e-05,
+      "loss": 0.5282,
+      "step": 4600
+    },
+    {
+      "epoch": 5.72851805728518,
+      "eval/acc": 34.88372039794922,
+      "step": 4600
+    },
+    {
+      "epoch": 5.72851805728518,
+      "eval_loss": 3.12896466255188,
+      "eval_runtime": 0.224,
+      "eval_samples_per_second": 191.949,
+      "eval_steps_per_second": 4.464,
+      "step": 4600
+    },
+    {
+      "epoch": 5.740971357409713,
+      "grad_norm": 10.0625,
+      "learning_rate": 5.99e-05,
+      "loss": 0.5623,
+      "step": 4610
+    },
+    {
+      "epoch": 5.7534246575342465,
+      "grad_norm": 10.5625,
+      "learning_rate": 5.97888888888889e-05,
+      "loss": 0.5931,
+      "step": 4620
+    },
+    {
+      "epoch": 5.7658779576587795,
+      "grad_norm": 11.5,
+      "learning_rate": 5.9677777777777775e-05,
+      "loss": 0.5964,
+      "step": 4630
+    },
+    {
+      "epoch": 5.778331257783313,
+      "grad_norm": 9.5625,
+      "learning_rate": 5.9566666666666673e-05,
+      "loss": 0.5159,
+      "step": 4640
+    },
+    {
+      "epoch": 5.790784557907846,
+      "grad_norm": 9.75,
+      "learning_rate": 5.945555555555555e-05,
+      "loss": 0.4576,
+      "step": 4650
+    },
+    {
+      "epoch": 5.803237858032379,
+      "grad_norm": 11.9375,
+      "learning_rate": 5.934444444444445e-05,
+      "loss": 0.4963,
+      "step": 4660
+    },
+    {
+      "epoch": 5.815691158156912,
+      "grad_norm": 8.8125,
+      "learning_rate": 5.923333333333334e-05,
+      "loss": 0.4869,
+      "step": 4670
+    },
+    {
+      "epoch": 5.828144458281445,
+      "grad_norm": 9.3125,
+      "learning_rate": 5.9122222222222226e-05,
+      "loss": 0.4578,
+      "step": 4680
+    },
+    {
+      "epoch": 5.840597758405978,
+      "grad_norm": 8.5,
+      "learning_rate": 5.901111111111112e-05,
+      "loss": 0.491,
+      "step": 4690
+    },
+    {
+      "epoch": 5.853051058530511,
+      "grad_norm": 8.0,
+      "learning_rate": 5.89e-05,
+      "loss": 0.5223,
+      "step": 4700
+    },
+    {
+      "epoch": 5.853051058530511,
+      "eval/acc": 34.88372039794922,
+      "step": 4700
+    },
+    {
+      "epoch": 5.853051058530511,
+      "eval_loss": 3.08896541595459,
+      "eval_runtime": 0.2255,
+      "eval_samples_per_second": 190.673,
+      "eval_steps_per_second": 4.434,
+      "step": 4700
+    },
+    {
+      "epoch": 5.865504358655044,
+      "grad_norm": 9.125,
+      "learning_rate": 5.878888888888889e-05,
+      "loss": 0.4916,
+      "step": 4710
+    },
+    {
+      "epoch": 5.877957658779577,
+      "grad_norm": 11.625,
+      "learning_rate": 5.867777777777778e-05,
+      "loss": 0.5485,
+      "step": 4720
+    },
+    {
+      "epoch": 5.890410958904109,
+      "grad_norm": 8.9375,
+      "learning_rate": 5.856666666666667e-05,
+      "loss": 0.4782,
+      "step": 4730
+    },
+    {
+      "epoch": 5.902864259028642,
+      "grad_norm": 8.5,
+      "learning_rate": 5.845555555555556e-05,
+      "loss": 0.4756,
+      "step": 4740
+    },
+    {
+      "epoch": 5.915317559153175,
+      "grad_norm": 8.5,
+      "learning_rate": 5.8344444444444446e-05,
+      "loss": 0.4848,
+      "step": 4750
+    },
+    {
+      "epoch": 5.927770859277708,
+      "grad_norm": 10.3125,
+      "learning_rate": 5.823333333333334e-05,
+      "loss": 0.5772,
+      "step": 4760
+    },
+    {
+      "epoch": 5.940224159402241,
+      "grad_norm": 10.3125,
+      "learning_rate": 5.812222222222222e-05,
+      "loss": 0.4972,
+      "step": 4770
+    },
+    {
+      "epoch": 5.952677459526774,
+      "grad_norm": 13.125,
+      "learning_rate": 5.801111111111111e-05,
+      "loss": 0.5185,
+      "step": 4780
+    },
+    {
+      "epoch": 5.9651307596513075,
+      "grad_norm": 10.25,
+      "learning_rate": 5.79e-05,
+      "loss": 0.4956,
+      "step": 4790
+    },
+    {
+      "epoch": 5.9775840597758405,
+      "grad_norm": 7.59375,
+      "learning_rate": 5.778888888888889e-05,
+      "loss": 0.5193,
+      "step": 4800
+    },
+    {
+      "epoch": 5.9775840597758405,
+      "eval/acc": 34.88372039794922,
+      "step": 4800
+    },
+    {
+      "epoch": 5.9775840597758405,
+      "eval_loss": 3.0848581790924072,
+      "eval_runtime": 0.2247,
+      "eval_samples_per_second": 191.396,
+      "eval_steps_per_second": 4.451,
+      "step": 4800
+    },
+    {
+      "epoch": 5.990037359900374,
+      "grad_norm": 10.1875,
+      "learning_rate": 5.7677777777777774e-05,
+      "loss": 0.4816,
+      "step": 4810
+    },
+    {
+      "epoch": 6.002490660024907,
+      "grad_norm": 8.5,
+      "learning_rate": 5.7566666666666666e-05,
+      "loss": 0.512,
+      "step": 4820
+    },
+    {
+      "epoch": 6.01494396014944,
+      "grad_norm": 10.75,
+      "learning_rate": 5.7455555555555564e-05,
+      "loss": 0.536,
+      "step": 4830
+    },
+    {
+      "epoch": 6.027397260273973,
+      "grad_norm": 10.8125,
+      "learning_rate": 5.734444444444445e-05,
+      "loss": 0.4573,
+      "step": 4840
+    },
+    {
+      "epoch": 6.039850560398506,
+      "grad_norm": 10.1875,
+      "learning_rate": 5.723333333333334e-05,
+      "loss": 0.5556,
+      "step": 4850
+    },
+    {
+      "epoch": 6.052303860523039,
+      "grad_norm": 13.6875,
+      "learning_rate": 5.7122222222222225e-05,
+      "loss": 0.4513,
+      "step": 4860
+    },
+    {
+      "epoch": 6.064757160647572,
+      "grad_norm": 11.0,
+      "learning_rate": 5.7011111111111116e-05,
+      "loss": 0.4453,
+      "step": 4870
+    },
+    {
+      "epoch": 6.077210460772105,
+      "grad_norm": 11.3125,
+      "learning_rate": 5.69e-05,
+      "loss": 0.5147,
+      "step": 4880
+    },
+    {
+      "epoch": 6.089663760896638,
+      "grad_norm": 13.125,
+      "learning_rate": 5.678888888888889e-05,
+      "loss": 0.4948,
+      "step": 4890
+    },
+    {
+      "epoch": 6.102117061021171,
+      "grad_norm": 11.4375,
+      "learning_rate": 5.6677777777777784e-05,
+      "loss": 0.5324,
+      "step": 4900
+    },
+    {
+      "epoch": 6.102117061021171,
+      "eval/acc": 34.88372039794922,
+      "step": 4900
+    },
+    {
+      "epoch": 6.102117061021171,
+      "eval_loss": 2.459988832473755,
+      "eval_runtime": 6.3869,
+      "eval_samples_per_second": 6.733,
+      "eval_steps_per_second": 0.157,
+      "step": 4900
+    },
+    {
+      "epoch": 6.114570361145703,
+      "grad_norm": 10.6875,
+      "learning_rate": 5.656666666666667e-05,
+      "loss": 0.4818,
+      "step": 4910
+    },
+    {
+      "epoch": 6.127023661270236,
+      "grad_norm": 10.3125,
+      "learning_rate": 5.645555555555556e-05,
+      "loss": 0.5784,
+      "step": 4920
+    },
+    {
+      "epoch": 6.139476961394769,
+      "grad_norm": 13.4375,
+      "learning_rate": 5.6344444444444444e-05,
+      "loss": 0.3966,
+      "step": 4930
+    },
+    {
+      "epoch": 6.151930261519302,
+      "grad_norm": 10.375,
+      "learning_rate": 5.6233333333333336e-05,
+      "loss": 0.4861,
+      "step": 4940
+    },
+    {
+      "epoch": 6.164383561643835,
+      "grad_norm": 10.375,
+      "learning_rate": 5.612222222222222e-05,
+      "loss": 0.4646,
+      "step": 4950
+    },
+    {
+      "epoch": 6.176836861768368,
+      "grad_norm": 10.1875,
+      "learning_rate": 5.601111111111111e-05,
+      "loss": 0.5416,
+      "step": 4960
+    },
+    {
+      "epoch": 6.1892901618929015,
+      "grad_norm": 10.0625,
+      "learning_rate": 5.590000000000001e-05,
+      "loss": 0.4985,
+      "step": 4970
+    },
+    {
+      "epoch": 6.2017434620174345,
+      "grad_norm": 10.125,
+      "learning_rate": 5.578888888888889e-05,
+      "loss": 0.4865,
+      "step": 4980
+    },
+    {
+      "epoch": 6.214196762141968,
+      "grad_norm": 10.625,
+      "learning_rate": 5.5677777777777786e-05,
+      "loss": 0.4818,
+      "step": 4990
+    },
+    {
+      "epoch": 6.226650062266501,
+      "grad_norm": 10.75,
+      "learning_rate": 5.5566666666666664e-05,
+      "loss": 0.4909,
+      "step": 5000
+    },
+    {
+      "epoch": 6.226650062266501,
+      "eval/acc": 34.88372039794922,
+      "step": 5000
+    },
+    {
+      "epoch": 6.226650062266501,
+      "eval_loss": 2.4926505088806152,
+      "eval_runtime": 0.2216,
+      "eval_samples_per_second": 194.068,
+      "eval_steps_per_second": 4.513,
+      "step": 5000
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 10000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 13,
+  "save_steps": 2500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-5000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61b297d5e64c898db22dbb0b2c7feb17b604dfbcc3bfebf55e9e7ecbf9c3794c
+size 6161

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "architectures": [
+    "ModernBertModel"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "dtype": "bfloat16",
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "repad_logits_with_grad": false,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "transformers_version": "4.57.1",
+  "vocab_size": 50368
+}

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6aecd4553a1d58ca11b17030014708270f38be0c51df4dcd752bb2e3ddc8dd81
+size 298041696

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d48bd01953728d4eac963af505f6d4253af613156cb9e6895810b87c6d7b7524
+size 596170443

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7ef8cbbdda5cb37ab7cf20f5c005619f2776e3f4face3617c4680d91d8a07ece
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0be06a52a1be8ec61cf963589563ad06d498c8986f95cc6e2cf6dd2628b95f36
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8de3a32ca6dbe3f13cbc720107ecdd6dbaaca93202f28d4cd937551ab5665d8b
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cadd49afaa5e5d92514cc4b811a3968236ab79a1466a7061413f550c41026201
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:773947218d8d7737c8670043d737e80a30fe17375af8e46749692f7803f2df3b
+size 1465

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/checkpoint-7500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61b297d5e64c898db22dbb0b2c7feb17b604dfbcc3bfebf55e9e7ecbf9c3794c
+size 6161

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "architectures": [
+    "ModernBertModel"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "mean",
+  "cls_token_id": 50281,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "dtype": "bfloat16",
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 160000.0,
+  "gradient_checkpointing": false,
+  "hidden_activation": "gelu",
+  "hidden_size": 768,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 1152,
+  "layer_norm_eps": 1e-05,
+  "local_attention": 128,
+  "local_rope_theta": 10000.0,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "modernbert",
+  "norm_bias": false,
+  "norm_eps": 1e-05,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 22,
+  "pad_token_id": 50283,
+  "position_embedding_type": "absolute",
+  "repad_logits_with_grad": false,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "transformers_version": "4.57.1",
+  "vocab_size": 50368
+}

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aa648019534a54da16e5df11fb28257398ac4ee886de2d2ef90e587b14a698f7
+size 298041696

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1a47b56f6d386053d17cfa1c908b16dcebcec2fa8dbf6ea679e0add277be30b3
+size 596170443

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c235c10397ca3fb3b82475883c48d3bb786206feaee53c2199c913179faf1fb
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:937bfac24cd2fe886a72cb180e9d726f8629acaf1e31b2beab1f7a03381ca0ca
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ee0687693332dd9f28a675c2a9f27590ae650095d80dac61354fce4437e7f9de
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ffb4dab4ba8c60d5f5c48a1048c1ecc4e949aff462fd8340d7ad1a380fc12fdd
+size 15429

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/runs/Nov25_13-58-36_nid005118/events.out.tfevents.1764072154.nid005118.9241.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7a00311704ec9f450f3bbd2a82bb88a4f98ffb9756b557103426b1bbd5b1571c
+size 254610

modernbert-crux-researchy-pos_high.neg_low.b64_n512.1e-4.512/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d02be6d8bda4ea9c67040ed89f878acdc986bd4df3fbc60440a9d3eacca02d63
+size 1465