DylanJHJ committed
Commit ef0f531 · 1 Parent(s): a1b683a

update the best crux-researchy

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +11 -0
  2. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/config.json +45 -0
  3. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/model.safetensors +3 -0
  4. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/optimizer.pt +3 -0
  5. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_0.pth +3 -0
  6. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_1.pth +3 -0
  7. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_2.pth +3 -0
  8. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_3.pth +3 -0
  9. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/scheduler.pt +3 -0
  10. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/trainer_state.json +0 -0
  11. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/training_args.bin +3 -0
  12. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/config.json +45 -0
  13. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/model.safetensors +3 -0
  14. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/optimizer.pt +3 -0
  15. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_0.pth +3 -0
  16. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_1.pth +3 -0
  17. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_2.pth +3 -0
  18. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_3.pth +3 -0
  19. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/scheduler.pt +3 -0
  20. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/trainer_state.json +0 -0
  21. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/training_args.bin +3 -0
  22. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/config.json +45 -0
  23. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/model.safetensors +3 -0
  24. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/optimizer.pt +3 -0
  25. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_0.pth +3 -0
  26. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_1.pth +3 -0
  27. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_2.pth +3 -0
  28. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_3.pth +3 -0
  29. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/scheduler.pt +3 -0
  30. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/trainer_state.json +0 -0
  31. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/training_args.bin +3 -0
  32. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/config.json +45 -0
  33. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/model.safetensors +3 -0
  34. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/optimizer.pt +3 -0
  35. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_0.pth +3 -0
  36. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_1.pth +3 -0
  37. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_2.pth +3 -0
  38. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_3.pth +3 -0
  39. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/scheduler.pt +3 -0
  40. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/trainer_state.json +0 -0
  41. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/training_args.bin +3 -0
  42. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/config.json +45 -0
  43. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/model.safetensors +3 -0
  44. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt +3 -0
  45. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_0.pth +3 -0
  46. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_1.pth +3 -0
  47. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_2.pth +3 -0
  48. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_3.pth +3 -0
  49. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/scheduler.pt +3 -0
  50. modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/trainer_state.json +4184 -0
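
This commit adds five intermediate checkpoints (5000 to 25000 steps) for the modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512 run. As a minimal sketch for fetching just one of these checkpoint directories with huggingface_hub — the repo id below is a placeholder, since the diff does not name the repository:

```python
# Sketch only: pull a single checkpoint folder instead of the whole repo.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="DylanJHJ/your-repo-name",  # hypothetical repo id, not taken from this diff
    allow_patterns=[
        "modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/*"
    ],
)
print("downloaded to:", local_dir)
```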
.gitattributes CHANGED
@@ -143,3 +143,14 @@ modernbert-crux-researchy-pos_zero.neg_high.b64_n512.1e-4.512/optimizer.pt filte
 .git/lfs/objects/49/7c/497c0250f011068b2187e2dbf2b9faa58acb723801636dc0ec52e7d5cd643a2a filter=lfs diff=lfs merge=lfs -text
 .git/lfs/tmp/832543618 filter=lfs diff=lfs merge=lfs -text
 .git/lfs/tmp/1322077385 filter=lfs diff=lfs merge=lfs -text
+ modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/model.safetensors filter=lfs diff=lfs merge=lfs -text
+ modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+ modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/model.safetensors filter=lfs diff=lfs merge=lfs -text
+ modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+ modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/model.safetensors filter=lfs diff=lfs merge=lfs -text
+ modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+ modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/model.safetensors filter=lfs diff=lfs merge=lfs -text
+ modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+ modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/model.safetensors filter=lfs diff=lfs merge=lfs -text
+ modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+ modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/model.safetensors filter=lfs diff=lfs merge=lfs -text
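
The eleven new .gitattributes lines route the large checkpoint binaries through Git LFS, so only three-line pointer stubs live in Git history. A small illustrative sketch in plain Python (no Hub-specific assumptions) that lists which patterns in a local clone's .gitattributes are LFS-tracked:

```python
# Sketch: list the path patterns that .gitattributes routes through Git LFS.
from pathlib import Path

def lfs_tracked_patterns(path: str = ".gitattributes") -> list[str]:
    patterns = []
    for line in Path(path).read_text().splitlines():
        parts = line.split()
        # LFS entries carry the "filter=lfs" attribute, as in the diff above.
        if parts and "filter=lfs" in parts[1:]:
            patterns.append(parts[0])
    return patterns

for pattern in lfs_tracked_patterns():
    print(pattern)
```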
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/config.json ADDED
@@ -0,0 +1,45 @@
+ {
+ "architectures": [
+ "ModernBertModel"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 50281,
+ "classifier_activation": "gelu",
+ "classifier_bias": false,
+ "classifier_dropout": 0.0,
+ "classifier_pooling": "mean",
+ "cls_token_id": 50281,
+ "decoder_bias": true,
+ "deterministic_flash_attn": false,
+ "dtype": "bfloat16",
+ "embedding_dropout": 0.0,
+ "eos_token_id": 50282,
+ "global_attn_every_n_layers": 3,
+ "global_rope_theta": 160000.0,
+ "gradient_checkpointing": false,
+ "hidden_activation": "gelu",
+ "hidden_size": 768,
+ "initializer_cutoff_factor": 2.0,
+ "initializer_range": 0.02,
+ "intermediate_size": 1152,
+ "layer_norm_eps": 1e-05,
+ "local_attention": 128,
+ "local_rope_theta": 10000.0,
+ "max_position_embeddings": 8192,
+ "mlp_bias": false,
+ "mlp_dropout": 0.0,
+ "model_type": "modernbert",
+ "norm_bias": false,
+ "norm_eps": 1e-05,
+ "num_attention_heads": 12,
+ "num_hidden_layers": 22,
+ "pad_token_id": 50283,
+ "position_embedding_type": "absolute",
+ "repad_logits_with_grad": false,
+ "sep_token_id": 50282,
+ "sparse_pred_ignore_index": -100,
+ "sparse_prediction": false,
+ "transformers_version": "4.57.1",
+ "vocab_size": 50368
+ }
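
The checkpoint config above describes a standard ModernBERT encoder (22 layers, hidden size 768, 8192-token context, bfloat16). As a minimal sketch for loading one of these checkpoint directories with transformers — the local path is illustrative, and tokenizer files are not part of this commit, so the tokenizer would need to come from the base ModernBERT model:

```python
# Sketch: load a saved checkpoint directory (config.json + model.safetensors).
import torch
from transformers import AutoModel

ckpt_dir = "modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000"
model = AutoModel.from_pretrained(ckpt_dir, torch_dtype=torch.bfloat16)  # config declares bfloat16
model.eval()
print(model.config.num_hidden_layers, model.config.hidden_size)  # expect 22, 768
```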
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b04a7b7e9b10265cc9c1a6bd5bfb4bb4239a5da7a393588fc17c0f703daf9db7
+ size 298041696
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a50785f196369c06420598b40db7fe0178a90b0c3f804948751f4ad50381d84f
+ size 596170443
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:74e7e89cbe7a70edf66e8968948906fb1a820f09a6a8809481256cb4f59eaf10
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5d3e5bce55f160ed0c87b1cbef42754767ac243615cafe6fe597c6c56abe221
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f6004cc10346f251377bb583f9d9cb6fb19ba248f20a8ca5df932990f0b69313
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:69bd5f01bd9ce43daeab69ea4b44d0bce391a11f8b9d1d80a742012fb4f66a87
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:87226ad224173d93b5065f16bc500c8526a0b69039465aa0987938ac6db15343
+ size 1465
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/trainer_state.json ADDED
The diff for this file is too large to render.
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-10000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c73ab29bf2f573eaec3dd814ee64e6931ca0f3da63f7433c8289d0b0a84a938b
+ size 6161
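
Every binary above is stored as a Git LFS pointer: a three-line stub recording the spec version, the SHA-256 of the real payload, and its size in bytes. A minimal verification sketch in plain Python, with illustrative paths, for checking that a downloaded artifact matches its pointer:

```python
# Sketch: check a downloaded file against its Git LFS pointer stub.
import hashlib
from pathlib import Path

def parse_lfs_pointer(text: str) -> dict:
    # Pointer lines look like "oid sha256:<hex>" and "size <bytes>", as shown above.
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {"oid": fields["oid"].removeprefix("sha256:"), "size": int(fields["size"])}

def matches_pointer(pointer_path: str, payload_path: str) -> bool:
    pointer = parse_lfs_pointer(Path(pointer_path).read_text())
    payload = Path(payload_path).read_bytes()
    return hashlib.sha256(payload).hexdigest() == pointer["oid"] and len(payload) == pointer["size"]

# Illustrative usage:
# matches_pointer("model.safetensors.pointer", "checkpoint-10000/model.safetensors")
```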
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/config.json ADDED
@@ -0,0 +1,45 @@
+ {
+ "architectures": [
+ "ModernBertModel"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 50281,
+ "classifier_activation": "gelu",
+ "classifier_bias": false,
+ "classifier_dropout": 0.0,
+ "classifier_pooling": "mean",
+ "cls_token_id": 50281,
+ "decoder_bias": true,
+ "deterministic_flash_attn": false,
+ "dtype": "bfloat16",
+ "embedding_dropout": 0.0,
+ "eos_token_id": 50282,
+ "global_attn_every_n_layers": 3,
+ "global_rope_theta": 160000.0,
+ "gradient_checkpointing": false,
+ "hidden_activation": "gelu",
+ "hidden_size": 768,
+ "initializer_cutoff_factor": 2.0,
+ "initializer_range": 0.02,
+ "intermediate_size": 1152,
+ "layer_norm_eps": 1e-05,
+ "local_attention": 128,
+ "local_rope_theta": 10000.0,
+ "max_position_embeddings": 8192,
+ "mlp_bias": false,
+ "mlp_dropout": 0.0,
+ "model_type": "modernbert",
+ "norm_bias": false,
+ "norm_eps": 1e-05,
+ "num_attention_heads": 12,
+ "num_hidden_layers": 22,
+ "pad_token_id": 50283,
+ "position_embedding_type": "absolute",
+ "repad_logits_with_grad": false,
+ "sep_token_id": 50282,
+ "sparse_pred_ignore_index": -100,
+ "sparse_prediction": false,
+ "transformers_version": "4.57.1",
+ "vocab_size": 50368
+ }
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7a46354b82536b5fd3b84c627a781ab5dd486257d6ccaec684ef58965d14c8ba
+ size 298041696
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:29d7a8844c0ec5618ca98b651b06c5d47a350267ffbc46cd92ee978330ed7107
+ size 596170443
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:38d7ef359718523590df9e682a92cb56fe7401ac013eeb40af3d6ce9eb52db3f
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e4e02846c018997fb2a63437b79039e6ffd03d4a1b2388956f198df7d435db23
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5345a9c5559eca496b295ecccec4ba0d714c05b192234ffe0bb22d9fb9f9fa65
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f87ecc6acc3b0104c6b332cf9236660b108318e9bcba0d080d4a3915cd3eca90
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:64ee335a8687784a6fa7f307f4a9f1ef29b4e8f2f20541a4f32818983db603c9
+ size 1465
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/trainer_state.json ADDED
The diff for this file is too large to render.
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-15000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c73ab29bf2f573eaec3dd814ee64e6931ca0f3da63f7433c8289d0b0a84a938b
+ size 6161
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/config.json ADDED
@@ -0,0 +1,45 @@
+ {
+ "architectures": [
+ "ModernBertModel"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 50281,
+ "classifier_activation": "gelu",
+ "classifier_bias": false,
+ "classifier_dropout": 0.0,
+ "classifier_pooling": "mean",
+ "cls_token_id": 50281,
+ "decoder_bias": true,
+ "deterministic_flash_attn": false,
+ "dtype": "bfloat16",
+ "embedding_dropout": 0.0,
+ "eos_token_id": 50282,
+ "global_attn_every_n_layers": 3,
+ "global_rope_theta": 160000.0,
+ "gradient_checkpointing": false,
+ "hidden_activation": "gelu",
+ "hidden_size": 768,
+ "initializer_cutoff_factor": 2.0,
+ "initializer_range": 0.02,
+ "intermediate_size": 1152,
+ "layer_norm_eps": 1e-05,
+ "local_attention": 128,
+ "local_rope_theta": 10000.0,
+ "max_position_embeddings": 8192,
+ "mlp_bias": false,
+ "mlp_dropout": 0.0,
+ "model_type": "modernbert",
+ "norm_bias": false,
+ "norm_eps": 1e-05,
+ "num_attention_heads": 12,
+ "num_hidden_layers": 22,
+ "pad_token_id": 50283,
+ "position_embedding_type": "absolute",
+ "repad_logits_with_grad": false,
+ "sep_token_id": 50282,
+ "sparse_pred_ignore_index": -100,
+ "sparse_prediction": false,
+ "transformers_version": "4.57.1",
+ "vocab_size": 50368
+ }
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e5081c229000fd604fd5b2c8852a910dc1525539d70ab47b202247f261ace45
+ size 298041696
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3ab12d2fdb52b6310449cb6653cf879090689802313feaa7917477e798206d38
+ size 596170443
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8833c00aebc0e619e587f9b710f631c27b0f144c194509f4b71fbc2b817fe73b
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:10e93038fca3c85b4bd66cf943246af72046fa052f77329dadcf03b484882631
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:893995e08f7db4d1bd2fa7b61362d2ca2a6c5936eb6e9af8051c007c0afcd24a
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6cbed03005b56a2cba542949acbe7f890ada7074d8c41dbf04128640c3459be0
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fb76502a50a5dad9408ce3a4bed787cae974a61981e4235981c0d72cc1227c37
+ size 1465
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/trainer_state.json ADDED
The diff for this file is too large to render.
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-20000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c73ab29bf2f573eaec3dd814ee64e6931ca0f3da63f7433c8289d0b0a84a938b
+ size 6161
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/config.json ADDED
@@ -0,0 +1,45 @@
+ {
+ "architectures": [
+ "ModernBertModel"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 50281,
+ "classifier_activation": "gelu",
+ "classifier_bias": false,
+ "classifier_dropout": 0.0,
+ "classifier_pooling": "mean",
+ "cls_token_id": 50281,
+ "decoder_bias": true,
+ "deterministic_flash_attn": false,
+ "dtype": "bfloat16",
+ "embedding_dropout": 0.0,
+ "eos_token_id": 50282,
+ "global_attn_every_n_layers": 3,
+ "global_rope_theta": 160000.0,
+ "gradient_checkpointing": false,
+ "hidden_activation": "gelu",
+ "hidden_size": 768,
+ "initializer_cutoff_factor": 2.0,
+ "initializer_range": 0.02,
+ "intermediate_size": 1152,
+ "layer_norm_eps": 1e-05,
+ "local_attention": 128,
+ "local_rope_theta": 10000.0,
+ "max_position_embeddings": 8192,
+ "mlp_bias": false,
+ "mlp_dropout": 0.0,
+ "model_type": "modernbert",
+ "norm_bias": false,
+ "norm_eps": 1e-05,
+ "num_attention_heads": 12,
+ "num_hidden_layers": 22,
+ "pad_token_id": 50283,
+ "position_embedding_type": "absolute",
+ "repad_logits_with_grad": false,
+ "sep_token_id": 50282,
+ "sparse_pred_ignore_index": -100,
+ "sparse_prediction": false,
+ "transformers_version": "4.57.1",
+ "vocab_size": 50368
+ }
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5a8268831a8f08bfce731da9b4d1a69338adf279352c89832f2a6b9ec3400203
+ size 298041696
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:51c392d991a187c7fb9c677c9f7f3dc089284791c082912cbcad51d1534a37b0
+ size 596170443
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b7c3317de65b30d603f92fe6e96f6799b60ade22ae5df6aac7a9339d5943f7f1
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:970a742489b7a9284135996af739d3ba3335d58e54026a7f786d5bfb4f0dff69
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fc9ee43db00b894b802bd59ba7b8b86295da75ac768fa84976d017cfadd8c106
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:98094ade0b37e212a3a763c172efa6d586516480838a3f8fafb9403a87fb9492
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a1717a9054eef94433d1223ce97cfacf8af74d8a39634780628341d30ddbbddd
+ size 1465
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/trainer_state.json ADDED
The diff for this file is too large to render.
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-25000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c73ab29bf2f573eaec3dd814ee64e6931ca0f3da63f7433c8289d0b0a84a938b
+ size 6161
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/config.json ADDED
@@ -0,0 +1,45 @@
+ {
+ "architectures": [
+ "ModernBertModel"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 50281,
+ "classifier_activation": "gelu",
+ "classifier_bias": false,
+ "classifier_dropout": 0.0,
+ "classifier_pooling": "mean",
+ "cls_token_id": 50281,
+ "decoder_bias": true,
+ "deterministic_flash_attn": false,
+ "dtype": "bfloat16",
+ "embedding_dropout": 0.0,
+ "eos_token_id": 50282,
+ "global_attn_every_n_layers": 3,
+ "global_rope_theta": 160000.0,
+ "gradient_checkpointing": false,
+ "hidden_activation": "gelu",
+ "hidden_size": 768,
+ "initializer_cutoff_factor": 2.0,
+ "initializer_range": 0.02,
+ "intermediate_size": 1152,
+ "layer_norm_eps": 1e-05,
+ "local_attention": 128,
+ "local_rope_theta": 10000.0,
+ "max_position_embeddings": 8192,
+ "mlp_bias": false,
+ "mlp_dropout": 0.0,
+ "model_type": "modernbert",
+ "norm_bias": false,
+ "norm_eps": 1e-05,
+ "num_attention_heads": 12,
+ "num_hidden_layers": 22,
+ "pad_token_id": 50283,
+ "position_embedding_type": "absolute",
+ "repad_logits_with_grad": false,
+ "sep_token_id": 50282,
+ "sparse_pred_ignore_index": -100,
+ "sparse_prediction": false,
+ "transformers_version": "4.57.1",
+ "vocab_size": 50368
+ }
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d246c14f659e3528a5e9698dbc9236bc6bb9eb18c55b627e0292e5e33a6f0d46
+ size 298041696
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:73caeb604d1153a7db8dd73b2fc37b9e826fec045c5ee26810e16966b749c507
+ size 596170443
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:676e3c962f8f33d9a16826c7fe7dd98a8b3bfb774ddb934acc5fb734b106b59d
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d374a9c193c317a1dab9c9052e0ce5250f98dbb111c85aa423a009c121e1fc49
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe84239abd2d7794d2c95c6c196b6450efef11198f055cb008f1ad56b35e4dbc
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:358b89bc71f93b52249a9889b255e6ef55fedc24db1a6a3e29b8f70d82acf972
+ size 15429
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2fb542ee6beb5b26a9831445bc783ee70028bb4847cbcf43cb9d249b3a02f8a9
+ size 1465
modernbert-crux-researchy-flatten.pos_5.neg_1.b64_n512.1e-4.512/checkpoint-5000/trainer_state.json ADDED
@@ -0,0 +1,4184 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.6019744762822057,
6
+ "eval_steps": 100,
7
+ "global_step": 5000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0012039489525644113,
14
+ "grad_norm": 29.25,
15
+ "learning_rate": 3.6e-07,
16
+ "loss": 5.6475,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.0024078979051288226,
21
+ "grad_norm": 13.6875,
22
+ "learning_rate": 7.6e-07,
23
+ "loss": 5.6394,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.003611846857693234,
28
+ "grad_norm": 36.0,
29
+ "learning_rate": 1.16e-06,
30
+ "loss": 5.6168,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.004815795810257645,
35
+ "grad_norm": 17.0,
36
+ "learning_rate": 1.56e-06,
37
+ "loss": 5.6346,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.006019744762822056,
42
+ "grad_norm": 16.5,
43
+ "learning_rate": 1.96e-06,
44
+ "loss": 5.6391,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.007223693715386468,
49
+ "grad_norm": 16.5,
50
+ "learning_rate": 2.36e-06,
51
+ "loss": 5.6272,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.00842764266795088,
56
+ "grad_norm": 14.8125,
57
+ "learning_rate": 2.7600000000000003e-06,
58
+ "loss": 5.5979,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.00963159162051529,
63
+ "grad_norm": 22.375,
64
+ "learning_rate": 3.1600000000000007e-06,
65
+ "loss": 5.6515,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.010835540573079701,
70
+ "grad_norm": 17.125,
71
+ "learning_rate": 3.5600000000000002e-06,
72
+ "loss": 5.6018,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.012039489525644112,
77
+ "grad_norm": 14.9375,
78
+ "learning_rate": 3.96e-06,
79
+ "loss": 5.6342,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.012039489525644112,
84
+ "eval/acc": 3.4883720874786377,
85
+ "step": 100
86
+ },
87
+ {
88
+ "epoch": 0.012039489525644112,
89
+ "eval_loss": 5.140806198120117,
90
+ "eval_runtime": 2.4165,
91
+ "eval_samples_per_second": 17.794,
92
+ "eval_steps_per_second": 0.414,
93
+ "step": 100
94
+ },
95
+ {
96
+ "epoch": 0.013243438478208525,
97
+ "grad_norm": 13.0,
98
+ "learning_rate": 4.360000000000001e-06,
99
+ "loss": 5.6124,
100
+ "step": 110
101
+ },
102
+ {
103
+ "epoch": 0.014447387430772935,
104
+ "grad_norm": 18.625,
105
+ "learning_rate": 4.76e-06,
106
+ "loss": 5.6127,
107
+ "step": 120
108
+ },
109
+ {
110
+ "epoch": 0.015651336383337346,
111
+ "grad_norm": 14.375,
112
+ "learning_rate": 5.1600000000000006e-06,
113
+ "loss": 5.5663,
114
+ "step": 130
115
+ },
116
+ {
117
+ "epoch": 0.01685528533590176,
118
+ "grad_norm": 11.9375,
119
+ "learning_rate": 5.56e-06,
120
+ "loss": 5.55,
121
+ "step": 140
122
+ },
123
+ {
124
+ "epoch": 0.018059234288466168,
125
+ "grad_norm": 14.5,
126
+ "learning_rate": 5.9600000000000005e-06,
127
+ "loss": 5.5839,
128
+ "step": 150
129
+ },
130
+ {
131
+ "epoch": 0.01926318324103058,
132
+ "grad_norm": 15.0625,
133
+ "learning_rate": 6.360000000000001e-06,
134
+ "loss": 5.5259,
135
+ "step": 160
136
+ },
137
+ {
138
+ "epoch": 0.020467132193594993,
139
+ "grad_norm": 14.8125,
140
+ "learning_rate": 6.76e-06,
141
+ "loss": 5.4812,
142
+ "step": 170
143
+ },
144
+ {
145
+ "epoch": 0.021671081146159402,
146
+ "grad_norm": 15.375,
147
+ "learning_rate": 7.16e-06,
148
+ "loss": 5.4964,
149
+ "step": 180
150
+ },
151
+ {
152
+ "epoch": 0.022875030098723815,
153
+ "grad_norm": 14.0625,
154
+ "learning_rate": 7.5600000000000005e-06,
155
+ "loss": 5.4023,
156
+ "step": 190
157
+ },
158
+ {
159
+ "epoch": 0.024078979051288224,
160
+ "grad_norm": 18.625,
161
+ "learning_rate": 7.96e-06,
162
+ "loss": 5.3778,
163
+ "step": 200
164
+ },
165
+ {
166
+ "epoch": 0.024078979051288224,
167
+ "eval/acc": 5.232558250427246,
168
+ "step": 200
169
+ },
170
+ {
171
+ "epoch": 0.024078979051288224,
172
+ "eval_loss": 4.991551399230957,
173
+ "eval_runtime": 0.2363,
174
+ "eval_samples_per_second": 181.988,
175
+ "eval_steps_per_second": 4.232,
176
+ "step": 200
177
+ },
178
+ {
179
+ "epoch": 0.025282928003852637,
180
+ "grad_norm": 16.25,
181
+ "learning_rate": 8.36e-06,
182
+ "loss": 5.3983,
183
+ "step": 210
184
+ },
185
+ {
186
+ "epoch": 0.02648687695641705,
187
+ "grad_norm": 17.25,
188
+ "learning_rate": 8.76e-06,
189
+ "loss": 5.2953,
190
+ "step": 220
191
+ },
192
+ {
193
+ "epoch": 0.02769082590898146,
194
+ "grad_norm": 15.9375,
195
+ "learning_rate": 9.16e-06,
196
+ "loss": 5.2266,
197
+ "step": 230
198
+ },
199
+ {
200
+ "epoch": 0.02889477486154587,
201
+ "grad_norm": 21.875,
202
+ "learning_rate": 9.560000000000002e-06,
203
+ "loss": 5.139,
204
+ "step": 240
205
+ },
206
+ {
207
+ "epoch": 0.03009872381411028,
208
+ "grad_norm": 17.875,
209
+ "learning_rate": 9.96e-06,
210
+ "loss": 5.0639,
211
+ "step": 250
212
+ },
213
+ {
214
+ "epoch": 0.03130267276667469,
215
+ "grad_norm": 18.875,
216
+ "learning_rate": 1.036e-05,
217
+ "loss": 5.0118,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 0.032506621719239105,
222
+ "grad_norm": 26.0,
223
+ "learning_rate": 1.076e-05,
224
+ "loss": 4.8959,
225
+ "step": 270
226
+ },
227
+ {
228
+ "epoch": 0.03371057067180352,
229
+ "grad_norm": 18.5,
230
+ "learning_rate": 1.1160000000000002e-05,
231
+ "loss": 4.8454,
232
+ "step": 280
233
+ },
234
+ {
235
+ "epoch": 0.03491451962436792,
236
+ "grad_norm": 28.0,
237
+ "learning_rate": 1.156e-05,
238
+ "loss": 4.6846,
239
+ "step": 290
240
+ },
241
+ {
242
+ "epoch": 0.036118468576932336,
243
+ "grad_norm": 25.5,
244
+ "learning_rate": 1.196e-05,
245
+ "loss": 4.5211,
246
+ "step": 300
247
+ },
248
+ {
249
+ "epoch": 0.036118468576932336,
250
+ "eval/acc": 6.395349025726318,
251
+ "step": 300
252
+ },
253
+ {
254
+ "epoch": 0.036118468576932336,
255
+ "eval_loss": 4.604515075683594,
256
+ "eval_runtime": 0.2156,
257
+ "eval_samples_per_second": 199.428,
258
+ "eval_steps_per_second": 4.638,
259
+ "step": 300
260
+ },
261
+ {
262
+ "epoch": 0.03732241752949675,
263
+ "grad_norm": 28.0,
264
+ "learning_rate": 1.236e-05,
265
+ "loss": 4.3466,
266
+ "step": 310
267
+ },
268
+ {
269
+ "epoch": 0.03852636648206116,
270
+ "grad_norm": 27.125,
271
+ "learning_rate": 1.276e-05,
272
+ "loss": 4.1005,
273
+ "step": 320
274
+ },
275
+ {
276
+ "epoch": 0.039730315434625574,
277
+ "grad_norm": 33.0,
278
+ "learning_rate": 1.316e-05,
279
+ "loss": 3.7904,
280
+ "step": 330
281
+ },
282
+ {
283
+ "epoch": 0.040934264387189986,
284
+ "grad_norm": 32.75,
285
+ "learning_rate": 1.356e-05,
286
+ "loss": 3.4061,
287
+ "step": 340
288
+ },
289
+ {
290
+ "epoch": 0.04213821333975439,
291
+ "grad_norm": 31.125,
292
+ "learning_rate": 1.396e-05,
293
+ "loss": 3.2838,
294
+ "step": 350
295
+ },
296
+ {
297
+ "epoch": 0.043342162292318805,
298
+ "grad_norm": 23.75,
299
+ "learning_rate": 1.4360000000000001e-05,
300
+ "loss": 2.9101,
301
+ "step": 360
302
+ },
303
+ {
304
+ "epoch": 0.04454611124488322,
305
+ "grad_norm": 44.75,
306
+ "learning_rate": 1.4760000000000001e-05,
307
+ "loss": 2.6306,
308
+ "step": 370
309
+ },
310
+ {
311
+ "epoch": 0.04575006019744763,
312
+ "grad_norm": 33.25,
313
+ "learning_rate": 1.5160000000000002e-05,
314
+ "loss": 2.5454,
315
+ "step": 380
316
+ },
317
+ {
318
+ "epoch": 0.04695400915001204,
319
+ "grad_norm": 31.375,
320
+ "learning_rate": 1.556e-05,
321
+ "loss": 2.5867,
322
+ "step": 390
323
+ },
324
+ {
325
+ "epoch": 0.04815795810257645,
326
+ "grad_norm": 18.5,
327
+ "learning_rate": 1.596e-05,
328
+ "loss": 2.3251,
329
+ "step": 400
330
+ },
331
+ {
332
+ "epoch": 0.04815795810257645,
333
+ "eval/acc": 12.209301948547363,
334
+ "step": 400
335
+ },
336
+ {
337
+ "epoch": 0.04815795810257645,
338
+ "eval_loss": 3.941906452178955,
339
+ "eval_runtime": 0.2265,
340
+ "eval_samples_per_second": 189.814,
341
+ "eval_steps_per_second": 4.414,
342
+ "step": 400
343
+ },
344
+ {
345
+ "epoch": 0.04936190705514086,
346
+ "grad_norm": 18.0,
347
+ "learning_rate": 1.636e-05,
348
+ "loss": 2.394,
349
+ "step": 410
350
+ },
351
+ {
352
+ "epoch": 0.05056585600770527,
353
+ "grad_norm": 22.375,
354
+ "learning_rate": 1.6760000000000002e-05,
355
+ "loss": 2.2856,
356
+ "step": 420
357
+ },
358
+ {
359
+ "epoch": 0.051769804960269686,
360
+ "grad_norm": 17.25,
361
+ "learning_rate": 1.7160000000000002e-05,
362
+ "loss": 2.3414,
363
+ "step": 430
364
+ },
365
+ {
366
+ "epoch": 0.0529737539128341,
367
+ "grad_norm": 15.25,
368
+ "learning_rate": 1.756e-05,
369
+ "loss": 2.156,
370
+ "step": 440
371
+ },
372
+ {
373
+ "epoch": 0.054177702865398504,
374
+ "grad_norm": 15.75,
375
+ "learning_rate": 1.796e-05,
376
+ "loss": 2.0164,
377
+ "step": 450
378
+ },
379
+ {
380
+ "epoch": 0.05538165181796292,
381
+ "grad_norm": 28.5,
382
+ "learning_rate": 1.8360000000000004e-05,
383
+ "loss": 1.9555,
384
+ "step": 460
385
+ },
386
+ {
387
+ "epoch": 0.05658560077052733,
388
+ "grad_norm": 19.25,
389
+ "learning_rate": 1.876e-05,
390
+ "loss": 2.0277,
391
+ "step": 470
392
+ },
393
+ {
394
+ "epoch": 0.05778954972309174,
395
+ "grad_norm": 15.375,
396
+ "learning_rate": 1.916e-05,
397
+ "loss": 2.1719,
398
+ "step": 480
399
+ },
400
+ {
401
+ "epoch": 0.058993498675656154,
402
+ "grad_norm": 18.875,
403
+ "learning_rate": 1.956e-05,
404
+ "loss": 2.013,
405
+ "step": 490
406
+ },
407
+ {
408
+ "epoch": 0.06019744762822056,
409
+ "grad_norm": 18.625,
410
+ "learning_rate": 1.9960000000000002e-05,
411
+ "loss": 1.8574,
412
+ "step": 500
413
+ },
414
+ {
415
+ "epoch": 0.06019744762822056,
416
+ "eval/acc": 20.930233001708984,
417
+ "step": 500
418
+ },
419
+ {
420
+ "epoch": 0.06019744762822056,
421
+ "eval_loss": 3.6547293663024902,
422
+ "eval_runtime": 0.2139,
423
+ "eval_samples_per_second": 201.002,
424
+ "eval_steps_per_second": 4.674,
425
+ "step": 500
426
+ },
427
+ {
428
+ "epoch": 0.06140139658078497,
429
+ "grad_norm": 19.875,
430
+ "learning_rate": 2.036e-05,
431
+ "loss": 1.9431,
432
+ "step": 510
433
+ },
434
+ {
435
+ "epoch": 0.06260534553334939,
436
+ "grad_norm": 14.625,
437
+ "learning_rate": 2.076e-05,
438
+ "loss": 1.8311,
439
+ "step": 520
440
+ },
441
+ {
442
+ "epoch": 0.0638092944859138,
443
+ "grad_norm": 20.0,
444
+ "learning_rate": 2.116e-05,
445
+ "loss": 2.0005,
446
+ "step": 530
447
+ },
448
+ {
449
+ "epoch": 0.06501324343847821,
450
+ "grad_norm": 16.0,
451
+ "learning_rate": 2.1560000000000004e-05,
452
+ "loss": 1.7374,
453
+ "step": 540
454
+ },
455
+ {
456
+ "epoch": 0.06621719239104262,
457
+ "grad_norm": 13.0625,
458
+ "learning_rate": 2.196e-05,
459
+ "loss": 1.7838,
460
+ "step": 550
461
+ },
462
+ {
463
+ "epoch": 0.06742114134360704,
464
+ "grad_norm": 16.5,
465
+ "learning_rate": 2.236e-05,
466
+ "loss": 1.8264,
467
+ "step": 560
468
+ },
469
+ {
470
+ "epoch": 0.06862509029617145,
471
+ "grad_norm": 20.5,
472
+ "learning_rate": 2.2760000000000002e-05,
473
+ "loss": 1.658,
474
+ "step": 570
475
+ },
476
+ {
477
+ "epoch": 0.06982903924873585,
478
+ "grad_norm": 25.75,
479
+ "learning_rate": 2.3160000000000002e-05,
480
+ "loss": 1.7826,
481
+ "step": 580
482
+ },
483
+ {
484
+ "epoch": 0.07103298820130026,
485
+ "grad_norm": 19.375,
486
+ "learning_rate": 2.356e-05,
487
+ "loss": 1.6539,
488
+ "step": 590
489
+ },
490
+ {
491
+ "epoch": 0.07223693715386467,
492
+ "grad_norm": 19.25,
493
+ "learning_rate": 2.396e-05,
494
+ "loss": 1.6278,
495
+ "step": 600
496
+ },
497
+ {
498
+ "epoch": 0.07223693715386467,
499
+ "eval/acc": 20.930233001708984,
500
+ "step": 600
501
+ },
502
+ {
503
+ "epoch": 0.07223693715386467,
504
+ "eval_loss": 3.387899398803711,
505
+ "eval_runtime": 0.2536,
506
+ "eval_samples_per_second": 169.572,
507
+ "eval_steps_per_second": 3.944,
508
+ "step": 600
509
+ },
510
+ {
511
+ "epoch": 0.07344088610642908,
512
+ "grad_norm": 12.0625,
513
+ "learning_rate": 2.4360000000000004e-05,
514
+ "loss": 1.5342,
515
+ "step": 610
516
+ },
517
+ {
518
+ "epoch": 0.0746448350589935,
519
+ "grad_norm": 15.625,
520
+ "learning_rate": 2.476e-05,
521
+ "loss": 1.5919,
522
+ "step": 620
523
+ },
524
+ {
525
+ "epoch": 0.07584878401155791,
526
+ "grad_norm": 25.5,
527
+ "learning_rate": 2.516e-05,
528
+ "loss": 1.5713,
529
+ "step": 630
530
+ },
531
+ {
532
+ "epoch": 0.07705273296412232,
533
+ "grad_norm": 14.8125,
534
+ "learning_rate": 2.556e-05,
535
+ "loss": 1.4714,
536
+ "step": 640
537
+ },
538
+ {
539
+ "epoch": 0.07825668191668674,
540
+ "grad_norm": 21.5,
541
+ "learning_rate": 2.5960000000000002e-05,
542
+ "loss": 1.5835,
543
+ "step": 650
544
+ },
545
+ {
546
+ "epoch": 0.07946063086925115,
547
+ "grad_norm": 58.0,
548
+ "learning_rate": 2.6360000000000002e-05,
549
+ "loss": 1.5369,
550
+ "step": 660
551
+ },
552
+ {
553
+ "epoch": 0.08066457982181556,
554
+ "grad_norm": 45.0,
555
+ "learning_rate": 2.676e-05,
556
+ "loss": 1.4629,
557
+ "step": 670
558
+ },
559
+ {
560
+ "epoch": 0.08186852877437997,
561
+ "grad_norm": 14.1875,
562
+ "learning_rate": 2.716e-05,
563
+ "loss": 1.4288,
564
+ "step": 680
565
+ },
566
+ {
567
+ "epoch": 0.08307247772694437,
568
+ "grad_norm": 40.25,
569
+ "learning_rate": 2.7560000000000004e-05,
570
+ "loss": 1.4729,
571
+ "step": 690
572
+ },
573
+ {
574
+ "epoch": 0.08427642667950878,
575
+ "grad_norm": 13.625,
576
+ "learning_rate": 2.7960000000000003e-05,
577
+ "loss": 1.4883,
578
+ "step": 700
579
+ },
580
+ {
581
+ "epoch": 0.08427642667950878,
582
+ "eval/acc": 23.255813598632812,
583
+ "step": 700
584
+ },
585
+ {
586
+ "epoch": 0.08427642667950878,
587
+ "eval_loss": 3.206946611404419,
588
+ "eval_runtime": 0.4188,
589
+ "eval_samples_per_second": 102.684,
590
+ "eval_steps_per_second": 2.388,
591
+ "step": 700
592
+ },
593
+ {
594
+ "epoch": 0.0854803756320732,
595
+ "grad_norm": 15.75,
596
+ "learning_rate": 2.8360000000000003e-05,
597
+ "loss": 1.5656,
598
+ "step": 710
599
+ },
600
+ {
601
+ "epoch": 0.08668432458463761,
602
+ "grad_norm": 22.25,
603
+ "learning_rate": 2.8760000000000002e-05,
604
+ "loss": 1.6742,
605
+ "step": 720
606
+ },
607
+ {
608
+ "epoch": 0.08788827353720202,
609
+ "grad_norm": 12.3125,
610
+ "learning_rate": 2.9160000000000005e-05,
611
+ "loss": 1.35,
612
+ "step": 730
613
+ },
614
+ {
615
+ "epoch": 0.08909222248976643,
616
+ "grad_norm": 13.8125,
617
+ "learning_rate": 2.9559999999999998e-05,
618
+ "loss": 1.4435,
619
+ "step": 740
620
+ },
621
+ {
622
+ "epoch": 0.09029617144233085,
623
+ "grad_norm": 13.1875,
624
+ "learning_rate": 2.9959999999999998e-05,
625
+ "loss": 1.3843,
626
+ "step": 750
627
+ },
628
+ {
629
+ "epoch": 0.09150012039489526,
630
+ "grad_norm": 13.3125,
631
+ "learning_rate": 3.036e-05,
632
+ "loss": 1.3327,
633
+ "step": 760
634
+ },
635
+ {
636
+ "epoch": 0.09270406934745967,
637
+ "grad_norm": 18.875,
638
+ "learning_rate": 3.076e-05,
639
+ "loss": 1.4628,
640
+ "step": 770
641
+ },
642
+ {
643
+ "epoch": 0.09390801830002408,
644
+ "grad_norm": 14.5625,
645
+ "learning_rate": 3.116e-05,
646
+ "loss": 1.3306,
647
+ "step": 780
648
+ },
649
+ {
650
+ "epoch": 0.09511196725258848,
651
+ "grad_norm": 18.75,
652
+ "learning_rate": 3.156e-05,
653
+ "loss": 1.4936,
654
+ "step": 790
655
+ },
656
+ {
657
+ "epoch": 0.0963159162051529,
658
+ "grad_norm": 11.5,
659
+ "learning_rate": 3.196e-05,
660
+ "loss": 1.3515,
661
+ "step": 800
662
+ },
663
+ {
664
+ "epoch": 0.0963159162051529,
665
+ "eval/acc": 22.674419403076172,
666
+ "step": 800
667
+ },
668
+ {
669
+ "epoch": 0.0963159162051529,
670
+ "eval_loss": 3.1510462760925293,
671
+ "eval_runtime": 0.2676,
672
+ "eval_samples_per_second": 160.701,
673
+ "eval_steps_per_second": 3.737,
674
+ "step": 800
675
+ },
676
+ {
677
+ "epoch": 0.09751986515771731,
678
+ "grad_norm": 11.6875,
679
+ "learning_rate": 3.236e-05,
680
+ "loss": 1.4593,
681
+ "step": 810
682
+ },
683
+ {
684
+ "epoch": 0.09872381411028172,
685
+ "grad_norm": 10.5625,
686
+ "learning_rate": 3.2760000000000005e-05,
687
+ "loss": 1.3453,
688
+ "step": 820
689
+ },
690
+ {
691
+ "epoch": 0.09992776306284613,
692
+ "grad_norm": 11.625,
693
+ "learning_rate": 3.316e-05,
694
+ "loss": 1.4041,
695
+ "step": 830
696
+ },
697
+ {
698
+ "epoch": 0.10113171201541055,
699
+ "grad_norm": 13.0,
700
+ "learning_rate": 3.3560000000000004e-05,
701
+ "loss": 1.2766,
702
+ "step": 840
703
+ },
704
+ {
705
+ "epoch": 0.10233566096797496,
706
+ "grad_norm": 40.0,
707
+ "learning_rate": 3.396e-05,
708
+ "loss": 1.2678,
709
+ "step": 850
710
+ },
711
+ {
712
+ "epoch": 0.10353960992053937,
713
+ "grad_norm": 13.75,
714
+ "learning_rate": 3.436e-05,
715
+ "loss": 1.2514,
716
+ "step": 860
717
+ },
718
+ {
719
+ "epoch": 0.10474355887310378,
720
+ "grad_norm": 11.75,
721
+ "learning_rate": 3.4760000000000006e-05,
722
+ "loss": 1.3518,
723
+ "step": 870
724
+ },
725
+ {
726
+ "epoch": 0.1059475078256682,
727
+ "grad_norm": 11.875,
728
+ "learning_rate": 3.516e-05,
729
+ "loss": 1.2675,
730
+ "step": 880
731
+ },
732
+ {
733
+ "epoch": 0.10715145677823261,
734
+ "grad_norm": 13.0,
735
+ "learning_rate": 3.5560000000000005e-05,
736
+ "loss": 1.294,
737
+ "step": 890
738
+ },
739
+ {
740
+ "epoch": 0.10835540573079701,
741
+ "grad_norm": 13.0,
742
+ "learning_rate": 3.596e-05,
743
+ "loss": 1.1209,
744
+ "step": 900
745
+ },
746
+ {
747
+ "epoch": 0.10835540573079701,
748
+ "eval/acc": 25.581396102905273,
749
+ "step": 900
750
+ },
751
+ {
752
+ "epoch": 0.10835540573079701,
753
+ "eval_loss": 3.0571491718292236,
754
+ "eval_runtime": 0.3097,
755
+ "eval_samples_per_second": 138.846,
756
+ "eval_steps_per_second": 3.229,
757
+ "step": 900
758
+ },
759
+ {
760
+ "epoch": 0.10955935468336142,
761
+ "grad_norm": 12.75,
762
+ "learning_rate": 3.636e-05,
763
+ "loss": 1.2681,
764
+ "step": 910
765
+ },
766
+ {
767
+ "epoch": 0.11076330363592583,
768
+ "grad_norm": 17.0,
769
+ "learning_rate": 3.676e-05,
770
+ "loss": 1.2606,
771
+ "step": 920
772
+ },
773
+ {
774
+ "epoch": 0.11196725258849025,
775
+ "grad_norm": 11.375,
776
+ "learning_rate": 3.716e-05,
777
+ "loss": 1.2194,
778
+ "step": 930
779
+ },
780
+ {
781
+ "epoch": 0.11317120154105466,
782
+ "grad_norm": 12.125,
783
+ "learning_rate": 3.756e-05,
784
+ "loss": 1.2905,
785
+ "step": 940
786
+ },
787
+ {
788
+ "epoch": 0.11437515049361907,
789
+ "grad_norm": 18.125,
790
+ "learning_rate": 3.796e-05,
791
+ "loss": 1.2563,
792
+ "step": 950
793
+ },
794
+ {
795
+ "epoch": 0.11557909944618348,
796
+ "grad_norm": 17.125,
797
+ "learning_rate": 3.836e-05,
798
+ "loss": 1.1894,
799
+ "step": 960
800
+ },
801
+ {
802
+ "epoch": 0.1167830483987479,
803
+ "grad_norm": 11.875,
804
+ "learning_rate": 3.876e-05,
805
+ "loss": 1.2441,
806
+ "step": 970
807
+ },
808
+ {
809
+ "epoch": 0.11798699735131231,
810
+ "grad_norm": 15.8125,
811
+ "learning_rate": 3.9160000000000005e-05,
812
+ "loss": 1.2627,
813
+ "step": 980
814
+ },
815
+ {
816
+ "epoch": 0.11919094630387672,
817
+ "grad_norm": 17.375,
818
+ "learning_rate": 3.956e-05,
819
+ "loss": 1.3929,
820
+ "step": 990
821
+ },
822
+ {
823
+ "epoch": 0.12039489525644112,
824
+ "grad_norm": 11.125,
825
+ "learning_rate": 3.9960000000000004e-05,
826
+ "loss": 1.1332,
827
+ "step": 1000
828
+ },
829
+ {
830
+ "epoch": 0.12039489525644112,
831
+ "eval/acc": 26.162790298461914,
832
+ "step": 1000
833
+ },
834
+ {
835
+ "epoch": 0.12039489525644112,
836
+ "eval_loss": 2.9910976886749268,
837
+ "eval_runtime": 0.2826,
838
+ "eval_samples_per_second": 152.17,
839
+ "eval_steps_per_second": 3.539,
840
+ "step": 1000
841
+ },
842
+ {
843
+ "epoch": 0.12159884420900553,
844
+ "grad_norm": 13.75,
845
+ "learning_rate": 4.0360000000000007e-05,
846
+ "loss": 1.2314,
847
+ "step": 1010
848
+ },
849
+ {
850
+ "epoch": 0.12280279316156995,
851
+ "grad_norm": 11.875,
852
+ "learning_rate": 4.076e-05,
853
+ "loss": 1.2654,
854
+ "step": 1020
855
+ },
856
+ {
857
+ "epoch": 0.12400674211413436,
858
+ "grad_norm": 12.8125,
859
+ "learning_rate": 4.1160000000000006e-05,
860
+ "loss": 1.1432,
861
+ "step": 1030
862
+ },
863
+ {
864
+ "epoch": 0.12521069106669877,
865
+ "grad_norm": 13.9375,
866
+ "learning_rate": 4.156e-05,
867
+ "loss": 1.1669,
868
+ "step": 1040
869
+ },
870
+ {
871
+ "epoch": 0.1264146400192632,
872
+ "grad_norm": 19.25,
873
+ "learning_rate": 4.196e-05,
874
+ "loss": 1.1836,
875
+ "step": 1050
876
+ },
877
+ {
878
+ "epoch": 0.1276185889718276,
879
+ "grad_norm": 11.375,
880
+ "learning_rate": 4.236e-05,
881
+ "loss": 1.2449,
882
+ "step": 1060
883
+ },
884
+ {
885
+ "epoch": 0.128822537924392,
886
+ "grad_norm": 10.6875,
887
+ "learning_rate": 4.276e-05,
888
+ "loss": 1.1361,
889
+ "step": 1070
890
+ },
891
+ {
892
+ "epoch": 0.13002648687695642,
893
+ "grad_norm": 11.5,
894
+ "learning_rate": 4.316e-05,
895
+ "loss": 1.1989,
896
+ "step": 1080
897
+ },
898
+ {
899
+ "epoch": 0.13123043582952082,
900
+ "grad_norm": 13.0,
901
+ "learning_rate": 4.356e-05,
902
+ "loss": 1.1004,
903
+ "step": 1090
904
+ },
905
+ {
906
+ "epoch": 0.13243438478208525,
907
+ "grad_norm": 10.125,
908
+ "learning_rate": 4.396e-05,
909
+ "loss": 1.1308,
910
+ "step": 1100
911
+ },
912
+ {
913
+ "epoch": 0.13243438478208525,
914
+ "eval/acc": 27.9069766998291,
915
+ "step": 1100
916
+ },
917
+ {
918
+ "epoch": 0.13243438478208525,
919
+ "eval_loss": 3.0177316665649414,
920
+ "eval_runtime": 0.2801,
921
+ "eval_samples_per_second": 153.54,
922
+ "eval_steps_per_second": 3.571,
923
+ "step": 1100
924
+ },
925
+ {
926
+ "epoch": 0.13363833373464964,
927
+ "grad_norm": 9.5,
928
+ "learning_rate": 4.436e-05,
929
+ "loss": 1.1862,
930
+ "step": 1110
931
+ },
932
+ {
933
+ "epoch": 0.13484228268721407,
934
+ "grad_norm": 13.75,
935
+ "learning_rate": 4.4760000000000005e-05,
936
+ "loss": 1.1764,
937
+ "step": 1120
938
+ },
939
+ {
940
+ "epoch": 0.13604623163977847,
941
+ "grad_norm": 30.625,
942
+ "learning_rate": 4.516e-05,
943
+ "loss": 1.0422,
944
+ "step": 1130
945
+ },
946
+ {
947
+ "epoch": 0.1372501805923429,
948
+ "grad_norm": 9.875,
949
+ "learning_rate": 4.5560000000000004e-05,
950
+ "loss": 1.1796,
951
+ "step": 1140
952
+ },
953
+ {
954
+ "epoch": 0.1384541295449073,
955
+ "grad_norm": 13.1875,
956
+ "learning_rate": 4.596e-05,
957
+ "loss": 1.0483,
958
+ "step": 1150
959
+ },
960
+ {
961
+ "epoch": 0.1396580784974717,
962
+ "grad_norm": 11.75,
963
+ "learning_rate": 4.636e-05,
964
+ "loss": 1.1647,
965
+ "step": 1160
966
+ },
967
+ {
968
+ "epoch": 0.14086202745003612,
969
+ "grad_norm": 13.375,
970
+ "learning_rate": 4.6760000000000006e-05,
971
+ "loss": 1.2839,
972
+ "step": 1170
973
+ },
974
+ {
975
+ "epoch": 0.14206597640260052,
976
+ "grad_norm": 42.0,
977
+ "learning_rate": 4.716e-05,
978
+ "loss": 1.1594,
979
+ "step": 1180
980
+ },
981
+ {
982
+ "epoch": 0.14326992535516495,
983
+ "grad_norm": 15.625,
984
+ "learning_rate": 4.7560000000000005e-05,
985
+ "loss": 1.1073,
986
+ "step": 1190
987
+ },
988
+ {
989
+ "epoch": 0.14447387430772934,
990
+ "grad_norm": 11.5,
991
+ "learning_rate": 4.796e-05,
992
+ "loss": 1.1593,
993
+ "step": 1200
994
+ },
995
+ {
996
+ "epoch": 0.14447387430772934,
997
+ "eval/acc": 26.162790298461914,
998
+ "step": 1200
999
+ },
1000
+ {
1001
+ "epoch": 0.14447387430772934,
1002
+ "eval_loss": 3.0329606533050537,
1003
+ "eval_runtime": 0.2185,
1004
+ "eval_samples_per_second": 196.829,
1005
+ "eval_steps_per_second": 4.577,
1006
+ "step": 1200
1007
+ },
1008
+ {
1009
+ "epoch": 0.14567782326029377,
1010
+ "grad_norm": 12.5625,
1011
+ "learning_rate": 4.836e-05,
1012
+ "loss": 1.1088,
1013
+ "step": 1210
1014
+ },
1015
+ {
1016
+ "epoch": 0.14688177221285817,
1017
+ "grad_norm": 10.4375,
1018
+ "learning_rate": 4.876e-05,
1019
+ "loss": 1.1565,
1020
+ "step": 1220
1021
+ },
1022
+ {
1023
+ "epoch": 0.1480857211654226,
1024
+ "grad_norm": 11.3125,
1025
+ "learning_rate": 4.9160000000000004e-05,
1026
+ "loss": 1.0596,
1027
+ "step": 1230
1028
+ },
1029
+ {
1030
+ "epoch": 0.149289670117987,
1031
+ "grad_norm": 11.375,
1032
+ "learning_rate": 4.956e-05,
1033
+ "loss": 1.2416,
1034
+ "step": 1240
1035
+ },
1036
+ {
1037
+ "epoch": 0.15049361907055142,
1038
+ "grad_norm": 10.3125,
1039
+ "learning_rate": 4.996e-05,
1040
+ "loss": 1.0492,
1041
+ "step": 1250
1042
+ },
1043
+ {
1044
+ "epoch": 0.15169756802311582,
1045
+ "grad_norm": 10.9375,
1046
+ "learning_rate": 5.0360000000000006e-05,
1047
+ "loss": 1.0263,
1048
+ "step": 1260
1049
+ },
1050
+ {
1051
+ "epoch": 0.15290151697568022,
1052
+ "grad_norm": 11.0625,
1053
+ "learning_rate": 5.076000000000001e-05,
1054
+ "loss": 1.1197,
1055
+ "step": 1270
1056
+ },
1057
+ {
1058
+ "epoch": 0.15410546592824464,
1059
+ "grad_norm": 33.25,
1060
+ "learning_rate": 5.1160000000000005e-05,
1061
+ "loss": 1.0614,
1062
+ "step": 1280
1063
+ },
1064
+ {
1065
+ "epoch": 0.15530941488080904,
1066
+ "grad_norm": 11.3125,
1067
+ "learning_rate": 5.1559999999999994e-05,
1068
+ "loss": 1.0948,
1069
+ "step": 1290
1070
+ },
1071
+ {
1072
+ "epoch": 0.15651336383337347,
1073
+ "grad_norm": 24.5,
1074
+ "learning_rate": 5.196e-05,
1075
+ "loss": 1.1113,
1076
+ "step": 1300
1077
+ },
1078
+ {
1079
+ "epoch": 0.15651336383337347,
1080
+ "eval/acc": 25.581396102905273,
1081
+ "step": 1300
1082
+ },
1083
+ {
1084
+ "epoch": 0.15651336383337347,
1085
+ "eval_loss": 2.944797992706299,
1086
+ "eval_runtime": 0.3019,
1087
+ "eval_samples_per_second": 142.434,
1088
+ "eval_steps_per_second": 3.312,
1089
+ "step": 1300
1090
+ },
1091
+ {
1092
+ "epoch": 0.15771731278593787,
1093
+ "grad_norm": 12.4375,
1094
+ "learning_rate": 5.236e-05,
1095
+ "loss": 0.9531,
1096
+ "step": 1310
1097
+ },
1098
+ {
1099
+ "epoch": 0.1589212617385023,
1100
+ "grad_norm": 12.3125,
1101
+ "learning_rate": 5.2759999999999996e-05,
1102
+ "loss": 1.0079,
1103
+ "step": 1320
1104
+ },
1105
+ {
1106
+ "epoch": 0.1601252106910667,
1107
+ "grad_norm": 13.1875,
1108
+ "learning_rate": 5.316e-05,
1109
+ "loss": 1.0674,
1110
+ "step": 1330
1111
+ },
1112
+ {
1113
+ "epoch": 0.16132915964363112,
1114
+ "grad_norm": 16.875,
1115
+ "learning_rate": 5.356e-05,
1116
+ "loss": 1.1194,
1117
+ "step": 1340
1118
+ },
1119
+ {
1120
+ "epoch": 0.16253310859619552,
1121
+ "grad_norm": 10.625,
1122
+ "learning_rate": 5.396e-05,
1123
+ "loss": 1.0057,
1124
+ "step": 1350
1125
+ },
1126
+ {
1127
+ "epoch": 0.16373705754875995,
1128
+ "grad_norm": 9.125,
1129
+ "learning_rate": 5.436e-05,
1130
+ "loss": 1.1257,
1131
+ "step": 1360
1132
+ },
1133
+ {
1134
+ "epoch": 0.16494100650132434,
1135
+ "grad_norm": 8.5,
1136
+ "learning_rate": 5.476e-05,
1137
+ "loss": 0.9545,
1138
+ "step": 1370
1139
+ },
1140
+ {
1141
+ "epoch": 0.16614495545388874,
1142
+ "grad_norm": 10.25,
1143
+ "learning_rate": 5.516e-05,
1144
+ "loss": 1.0648,
1145
+ "step": 1380
1146
+ },
1147
+ {
1148
+ "epoch": 0.16734890440645317,
1149
+ "grad_norm": 14.9375,
1150
+ "learning_rate": 5.556e-05,
1151
+ "loss": 1.0364,
1152
+ "step": 1390
1153
+ },
1154
+ {
1155
+ "epoch": 0.16855285335901757,
1156
+ "grad_norm": 138.0,
1157
+ "learning_rate": 5.596e-05,
1158
+ "loss": 1.0255,
1159
+ "step": 1400
1160
+ },
1161
+ {
1162
+ "epoch": 0.16855285335901757,
1163
+ "eval/acc": 27.9069766998291,
1164
+ "step": 1400
1165
+ },
1166
+ {
1167
+ "epoch": 0.16855285335901757,
1168
+ "eval_loss": 2.763101100921631,
1169
+ "eval_runtime": 0.2759,
1170
+ "eval_samples_per_second": 155.826,
1171
+ "eval_steps_per_second": 3.624,
1172
+ "step": 1400
1173
+ },
1174
+ {
1175
+ "epoch": 0.169756802311582,
1176
+ "grad_norm": 11.8125,
1177
+ "learning_rate": 5.636e-05,
1178
+ "loss": 0.9813,
1179
+ "step": 1410
1180
+ },
1181
+ {
1182
+ "epoch": 0.1709607512641464,
1183
+ "grad_norm": 9.1875,
1184
+ "learning_rate": 5.6760000000000005e-05,
1185
+ "loss": 0.9929,
1186
+ "step": 1420
1187
+ },
1188
+ {
1189
+ "epoch": 0.17216470021671082,
1190
+ "grad_norm": 10.875,
1191
+ "learning_rate": 5.716e-05,
1192
+ "loss": 0.9113,
1193
+ "step": 1430
1194
+ },
1195
+ {
1196
+ "epoch": 0.17336864916927522,
1197
+ "grad_norm": 19.375,
1198
+ "learning_rate": 5.7560000000000005e-05,
1199
+ "loss": 1.0711,
1200
+ "step": 1440
1201
+ },
1202
+ {
1203
+ "epoch": 0.17457259812183964,
1204
+ "grad_norm": 9.8125,
1205
+ "learning_rate": 5.796e-05,
1206
+ "loss": 0.9322,
1207
+ "step": 1450
1208
+ },
1209
+ {
1210
+ "epoch": 0.17577654707440404,
1211
+ "grad_norm": 10.5,
1212
+ "learning_rate": 5.8360000000000004e-05,
1213
+ "loss": 1.0316,
1214
+ "step": 1460
1215
+ },
1216
+ {
1217
+ "epoch": 0.17698049602696847,
1218
+ "grad_norm": 10.25,
1219
+ "learning_rate": 5.876000000000001e-05,
1220
+ "loss": 1.0165,
1221
+ "step": 1470
1222
+ },
1223
+ {
1224
+ "epoch": 0.17818444497953287,
1225
+ "grad_norm": 10.4375,
1226
+ "learning_rate": 5.916e-05,
1227
+ "loss": 1.0229,
1228
+ "step": 1480
1229
+ },
1230
+ {
1231
+ "epoch": 0.17938839393209727,
1232
+ "grad_norm": 14.4375,
1233
+ "learning_rate": 5.9560000000000006e-05,
1234
+ "loss": 0.9684,
1235
+ "step": 1490
1236
+ },
1237
+ {
1238
+ "epoch": 0.1805923428846617,
1239
+ "grad_norm": 8.375,
1240
+ "learning_rate": 5.996e-05,
1241
+ "loss": 0.9948,
1242
+ "step": 1500
1243
+ },
1244
+ {
1245
+ "epoch": 0.1805923428846617,
1246
+ "eval/acc": 34.88372039794922,
1247
+ "step": 1500
1248
+ },
1249
+ {
1250
+ "epoch": 0.1805923428846617,
1251
+ "eval_loss": 2.8177433013916016,
1252
+ "eval_runtime": 0.208,
1253
+ "eval_samples_per_second": 206.732,
1254
+ "eval_steps_per_second": 4.808,
1255
+ "step": 1500
1256
+ },
1257
+ {
1258
+ "epoch": 0.1817962918372261,
1259
+ "grad_norm": 19.25,
1260
+ "learning_rate": 6.0360000000000005e-05,
1261
+ "loss": 0.9897,
1262
+ "step": 1510
1263
+ },
1264
+ {
1265
+ "epoch": 0.18300024078979052,
1266
+ "grad_norm": 32.5,
1267
+ "learning_rate": 6.076000000000001e-05,
1268
+ "loss": 0.9217,
1269
+ "step": 1520
1270
+ },
1271
+ {
1272
+ "epoch": 0.18420418974235492,
1273
+ "grad_norm": 9.5,
1274
+ "learning_rate": 6.116e-05,
1275
+ "loss": 1.0494,
1276
+ "step": 1530
1277
+ },
1278
+ {
1279
+ "epoch": 0.18540813869491934,
1280
+ "grad_norm": 9.25,
1281
+ "learning_rate": 6.156e-05,
1282
+ "loss": 0.9359,
1283
+ "step": 1540
1284
+ },
1285
+ {
1286
+ "epoch": 0.18661208764748374,
1287
+ "grad_norm": 11.375,
1288
+ "learning_rate": 6.196000000000001e-05,
1289
+ "loss": 0.9112,
1290
+ "step": 1550
1291
+ },
1292
+ {
1293
+ "epoch": 0.18781603660004817,
1294
+ "grad_norm": 12.6875,
1295
+ "learning_rate": 6.236e-05,
1296
+ "loss": 1.07,
1297
+ "step": 1560
1298
+ },
1299
+ {
1300
+ "epoch": 0.18901998555261257,
1301
+ "grad_norm": 11.1875,
1302
+ "learning_rate": 6.276e-05,
1303
+ "loss": 0.9853,
1304
+ "step": 1570
1305
+ },
1306
+ {
1307
+ "epoch": 0.19022393450517697,
1308
+ "grad_norm": 8.375,
1309
+ "learning_rate": 6.316000000000001e-05,
1310
+ "loss": 0.9579,
1311
+ "step": 1580
1312
+ },
1313
+ {
1314
+ "epoch": 0.1914278834577414,
1315
+ "grad_norm": 20.875,
1316
+ "learning_rate": 6.356000000000001e-05,
1317
+ "loss": 0.9401,
1318
+ "step": 1590
1319
+ },
1320
+ {
1321
+ "epoch": 0.1926318324103058,
1322
+ "grad_norm": 8.9375,
1323
+ "learning_rate": 6.396e-05,
1324
+ "loss": 1.0279,
1325
+ "step": 1600
1326
+ },
1327
+ {
1328
+ "epoch": 0.1926318324103058,
1329
+ "eval/acc": 30.23255729675293,
1330
+ "step": 1600
1331
+ },
1332
+ {
1333
+ "epoch": 0.1926318324103058,
1334
+ "eval_loss": 2.8526248931884766,
1335
+ "eval_runtime": 0.3114,
1336
+ "eval_samples_per_second": 138.103,
1337
+ "eval_steps_per_second": 3.212,
1338
+ "step": 1600
1339
+ },
1340
+ {
1341
+ "epoch": 0.19383578136287022,
1342
+ "grad_norm": 7.78125,
1343
+ "learning_rate": 6.436e-05,
1344
+ "loss": 0.8743,
1345
+ "step": 1610
1346
+ },
1347
+ {
1348
+ "epoch": 0.19503973031543462,
1349
+ "grad_norm": 9.8125,
1350
+ "learning_rate": 6.476e-05,
1351
+ "loss": 0.8702,
1352
+ "step": 1620
1353
+ },
1354
+ {
1355
+ "epoch": 0.19624367926799904,
1356
+ "grad_norm": 12.4375,
1357
+ "learning_rate": 6.515999999999999e-05,
1358
+ "loss": 1.0028,
1359
+ "step": 1630
1360
+ },
1361
+ {
1362
+ "epoch": 0.19744762822056344,
1363
+ "grad_norm": 10.125,
1364
+ "learning_rate": 6.556e-05,
1365
+ "loss": 0.9377,
1366
+ "step": 1640
1367
+ },
1368
+ {
1369
+ "epoch": 0.19865157717312787,
1370
+ "grad_norm": 8.9375,
1371
+ "learning_rate": 6.596e-05,
1372
+ "loss": 1.031,
1373
+ "step": 1650
1374
+ },
1375
+ {
1376
+ "epoch": 0.19985552612569227,
1377
+ "grad_norm": 8.5625,
1378
+ "learning_rate": 6.636e-05,
1379
+ "loss": 1.0162,
1380
+ "step": 1660
1381
+ },
1382
+ {
1383
+ "epoch": 0.2010594750782567,
1384
+ "grad_norm": 33.75,
1385
+ "learning_rate": 6.676e-05,
1386
+ "loss": 0.9448,
1387
+ "step": 1670
1388
+ },
1389
+ {
1390
+ "epoch": 0.2022634240308211,
1391
+ "grad_norm": 9.625,
1392
+ "learning_rate": 6.716e-05,
1393
+ "loss": 1.0077,
1394
+ "step": 1680
1395
+ },
1396
+ {
1397
+ "epoch": 0.2034673729833855,
1398
+ "grad_norm": 8.6875,
1399
+ "learning_rate": 6.756e-05,
1400
+ "loss": 0.9654,
1401
+ "step": 1690
1402
+ },
1403
+ {
1404
+ "epoch": 0.20467132193594992,
1405
+ "grad_norm": 12.625,
1406
+ "learning_rate": 6.796e-05,
1407
+ "loss": 0.8899,
1408
+ "step": 1700
1409
+ },
1410
+ {
1411
+ "epoch": 0.20467132193594992,
1412
+ "eval/acc": 32.55813980102539,
1413
+ "step": 1700
1414
+ },
1415
+ {
1416
+ "epoch": 0.20467132193594992,
1417
+ "eval_loss": 2.7813549041748047,
1418
+ "eval_runtime": 0.2132,
1419
+ "eval_samples_per_second": 201.701,
1420
+ "eval_steps_per_second": 4.691,
1421
+ "step": 1700
1422
+ },
1423
+ {
1424
+ "epoch": 0.20587527088851432,
1425
+ "grad_norm": 12.0,
1426
+ "learning_rate": 6.836e-05,
1427
+ "loss": 1.0412,
1428
+ "step": 1710
1429
+ },
1430
+ {
1431
+ "epoch": 0.20707921984107874,
1432
+ "grad_norm": 11.75,
1433
+ "learning_rate": 6.876e-05,
1434
+ "loss": 0.9239,
1435
+ "step": 1720
1436
+ },
1437
+ {
1438
+ "epoch": 0.20828316879364314,
1439
+ "grad_norm": 11.375,
1440
+ "learning_rate": 6.916000000000001e-05,
1441
+ "loss": 0.9243,
1442
+ "step": 1730
1443
+ },
1444
+ {
1445
+ "epoch": 0.20948711774620757,
1446
+ "grad_norm": 12.0,
1447
+ "learning_rate": 6.956e-05,
1448
+ "loss": 1.0204,
1449
+ "step": 1740
1450
+ },
1451
+ {
1452
+ "epoch": 0.21069106669877197,
1453
+ "grad_norm": 13.0625,
1454
+ "learning_rate": 6.996e-05,
1455
+ "loss": 0.8811,
1456
+ "step": 1750
1457
+ },
1458
+ {
1459
+ "epoch": 0.2118950156513364,
1460
+ "grad_norm": 17.0,
1461
+ "learning_rate": 7.036e-05,
1462
+ "loss": 0.8755,
1463
+ "step": 1760
1464
+ },
1465
+ {
1466
+ "epoch": 0.2130989646039008,
1467
+ "grad_norm": 11.25,
1468
+ "learning_rate": 7.076000000000001e-05,
1469
+ "loss": 0.858,
1470
+ "step": 1770
1471
+ },
1472
+ {
1473
+ "epoch": 0.21430291355646522,
1474
+ "grad_norm": 9.625,
1475
+ "learning_rate": 7.116e-05,
1476
+ "loss": 0.9076,
1477
+ "step": 1780
1478
+ },
1479
+ {
1480
+ "epoch": 0.21550686250902962,
1481
+ "grad_norm": 10.4375,
1482
+ "learning_rate": 7.156e-05,
1483
+ "loss": 0.8817,
1484
+ "step": 1790
1485
+ },
1486
+ {
1487
+ "epoch": 0.21671081146159402,
1488
+ "grad_norm": 12.8125,
1489
+ "learning_rate": 7.196000000000001e-05,
1490
+ "loss": 0.9121,
1491
+ "step": 1800
1492
+ },
1493
+ {
1494
+ "epoch": 0.21671081146159402,
1495
+ "eval/acc": 30.813953399658203,
1496
+ "step": 1800
1497
+ },
1498
+ {
1499
+ "epoch": 0.21671081146159402,
1500
+ "eval_loss": 2.6508796215057373,
1501
+ "eval_runtime": 0.2185,
1502
+ "eval_samples_per_second": 196.798,
1503
+ "eval_steps_per_second": 4.577,
1504
+ "step": 1800
1505
+ },
1506
+ {
1507
+ "epoch": 0.21791476041415844,
1508
+ "grad_norm": 16.5,
1509
+ "learning_rate": 7.236e-05,
1510
+ "loss": 0.9044,
1511
+ "step": 1810
1512
+ },
1513
+ {
1514
+ "epoch": 0.21911870936672284,
1515
+ "grad_norm": 15.1875,
1516
+ "learning_rate": 7.276e-05,
1517
+ "loss": 0.9552,
1518
+ "step": 1820
1519
+ },
1520
+ {
1521
+ "epoch": 0.22032265831928727,
1522
+ "grad_norm": 11.375,
1523
+ "learning_rate": 7.316000000000001e-05,
1524
+ "loss": 0.9264,
1525
+ "step": 1830
1526
+ },
1527
+ {
1528
+ "epoch": 0.22152660727185167,
1529
+ "grad_norm": 8.8125,
1530
+ "learning_rate": 7.356000000000001e-05,
1531
+ "loss": 0.8928,
1532
+ "step": 1840
1533
+ },
1534
+ {
1535
+ "epoch": 0.2227305562244161,
1536
+ "grad_norm": 9.625,
1537
+ "learning_rate": 7.396e-05,
1538
+ "loss": 0.9515,
1539
+ "step": 1850
1540
+ },
1541
+ {
1542
+ "epoch": 0.2239345051769805,
1543
+ "grad_norm": 31.0,
1544
+ "learning_rate": 7.436000000000001e-05,
1545
+ "loss": 0.8989,
1546
+ "step": 1860
1547
+ },
1548
+ {
1549
+ "epoch": 0.22513845412954492,
1550
+ "grad_norm": 9.5,
1551
+ "learning_rate": 7.476000000000001e-05,
1552
+ "loss": 1.0206,
1553
+ "step": 1870
1554
+ },
1555
+ {
1556
+ "epoch": 0.22634240308210932,
1557
+ "grad_norm": 8.625,
1558
+ "learning_rate": 7.516e-05,
1559
+ "loss": 0.8961,
1560
+ "step": 1880
1561
+ },
1562
+ {
1563
+ "epoch": 0.22754635203467374,
1564
+ "grad_norm": 9.0,
1565
+ "learning_rate": 7.556000000000002e-05,
1566
+ "loss": 0.9421,
1567
+ "step": 1890
1568
+ },
1569
+ {
1570
+ "epoch": 0.22875030098723814,
1571
+ "grad_norm": 12.0625,
1572
+ "learning_rate": 7.596000000000001e-05,
1573
+ "loss": 0.9049,
1574
+ "step": 1900
1575
+ },
1576
+ {
1577
+ "epoch": 0.22875030098723814,
1578
+ "eval/acc": 36.046512603759766,
1579
+ "step": 1900
1580
+ },
1581
+ {
1582
+ "epoch": 0.22875030098723814,
1583
+ "eval_loss": 2.636018753051758,
1584
+ "eval_runtime": 0.2084,
1585
+ "eval_samples_per_second": 206.343,
1586
+ "eval_steps_per_second": 4.799,
1587
+ "step": 1900
1588
+ },
1589
+ {
1590
+ "epoch": 0.22995424993980254,
1591
+ "grad_norm": 8.0625,
1592
+ "learning_rate": 7.636e-05,
1593
+ "loss": 0.8983,
1594
+ "step": 1910
1595
+ },
1596
+ {
1597
+ "epoch": 0.23115819889236697,
1598
+ "grad_norm": 11.875,
1599
+ "learning_rate": 7.676e-05,
1600
+ "loss": 0.9293,
1601
+ "step": 1920
1602
+ },
1603
+ {
1604
+ "epoch": 0.23236214784493137,
1605
+ "grad_norm": 11.75,
1606
+ "learning_rate": 7.716e-05,
1607
+ "loss": 0.8602,
1608
+ "step": 1930
1609
+ },
1610
+ {
1611
+ "epoch": 0.2335660967974958,
1612
+ "grad_norm": 11.5625,
1613
+ "learning_rate": 7.756e-05,
1614
+ "loss": 0.8078,
1615
+ "step": 1940
1616
+ },
1617
+ {
1618
+ "epoch": 0.2347700457500602,
1619
+ "grad_norm": 9.125,
1620
+ "learning_rate": 7.796e-05,
1621
+ "loss": 0.8773,
1622
+ "step": 1950
1623
+ },
1624
+ {
1625
+ "epoch": 0.23597399470262462,
1626
+ "grad_norm": 10.6875,
1627
+ "learning_rate": 7.836e-05,
1628
+ "loss": 0.8464,
1629
+ "step": 1960
1630
+ },
1631
+ {
1632
+ "epoch": 0.23717794365518902,
1633
+ "grad_norm": 18.25,
1634
+ "learning_rate": 7.876e-05,
1635
+ "loss": 0.8779,
1636
+ "step": 1970
1637
+ },
1638
+ {
1639
+ "epoch": 0.23838189260775344,
1640
+ "grad_norm": 10.875,
1641
+ "learning_rate": 7.916e-05,
1642
+ "loss": 0.9351,
1643
+ "step": 1980
1644
+ },
1645
+ {
1646
+ "epoch": 0.23958584156031784,
1647
+ "grad_norm": 11.0,
1648
+ "learning_rate": 7.956e-05,
1649
+ "loss": 0.8581,
1650
+ "step": 1990
1651
+ },
1652
+ {
1653
+ "epoch": 0.24078979051288224,
1654
+ "grad_norm": 8.875,
1655
+ "learning_rate": 7.996e-05,
1656
+ "loss": 0.9799,
1657
+ "step": 2000
1658
+ },
1659
+ {
1660
+ "epoch": 0.24078979051288224,
1661
+ "eval/acc": 36.046512603759766,
1662
+ "step": 2000
1663
+ },
1664
+ {
1665
+ "epoch": 0.24078979051288224,
1666
+ "eval_loss": 2.716654062271118,
1667
+ "eval_runtime": 0.21,
1668
+ "eval_samples_per_second": 204.721,
1669
+ "eval_steps_per_second": 4.761,
1670
+ "step": 2000
1671
+ },
1672
+ {
1673
+ "epoch": 0.24199373946544667,
1674
+ "grad_norm": 11.0625,
1675
+ "learning_rate": 8.036e-05,
1676
+ "loss": 0.8678,
1677
+ "step": 2010
1678
+ },
1679
+ {
1680
+ "epoch": 0.24319768841801107,
1681
+ "grad_norm": 12.125,
1682
+ "learning_rate": 8.076e-05,
1683
+ "loss": 0.8832,
1684
+ "step": 2020
1685
+ },
1686
+ {
1687
+ "epoch": 0.2444016373705755,
1688
+ "grad_norm": 8.25,
1689
+ "learning_rate": 8.116e-05,
1690
+ "loss": 0.8689,
1691
+ "step": 2030
1692
+ },
1693
+ {
1694
+ "epoch": 0.2456055863231399,
1695
+ "grad_norm": 6.53125,
1696
+ "learning_rate": 8.156e-05,
1697
+ "loss": 0.8829,
1698
+ "step": 2040
1699
+ },
1700
+ {
1701
+ "epoch": 0.24680953527570432,
1702
+ "grad_norm": 9.5625,
1703
+ "learning_rate": 8.196000000000001e-05,
1704
+ "loss": 0.9181,
1705
+ "step": 2050
1706
+ },
1707
+ {
1708
+ "epoch": 0.24801348422826872,
1709
+ "grad_norm": 22.875,
1710
+ "learning_rate": 8.236e-05,
1711
+ "loss": 0.8011,
1712
+ "step": 2060
1713
+ },
1714
+ {
1715
+ "epoch": 0.24921743318083314,
1716
+ "grad_norm": 14.4375,
1717
+ "learning_rate": 8.276e-05,
1718
+ "loss": 0.9163,
1719
+ "step": 2070
1720
+ },
1721
+ {
1722
+ "epoch": 0.25042138213339754,
1723
+ "grad_norm": 10.625,
1724
+ "learning_rate": 8.316000000000001e-05,
1725
+ "loss": 0.7869,
1726
+ "step": 2080
1727
+ },
1728
+ {
1729
+ "epoch": 0.25162533108596197,
1730
+ "grad_norm": 11.0,
1731
+ "learning_rate": 8.356e-05,
1732
+ "loss": 0.8779,
1733
+ "step": 2090
1734
+ },
1735
+ {
1736
+ "epoch": 0.2528292800385264,
1737
+ "grad_norm": 12.625,
1738
+ "learning_rate": 8.396e-05,
1739
+ "loss": 0.889,
1740
+ "step": 2100
1741
+ },
1742
+ {
1743
+ "epoch": 0.2528292800385264,
1744
+ "eval/acc": 37.20930099487305,
1745
+ "step": 2100
1746
+ },
1747
+ {
1748
+ "epoch": 0.2528292800385264,
1749
+ "eval_loss": 2.626293182373047,
1750
+ "eval_runtime": 0.2735,
1751
+ "eval_samples_per_second": 157.235,
1752
+ "eval_steps_per_second": 3.657,
1753
+ "step": 2100
1754
+ },
1755
+ {
1756
+ "epoch": 0.25403322899109076,
1757
+ "grad_norm": 8.3125,
1758
+ "learning_rate": 8.436000000000001e-05,
1759
+ "loss": 0.8363,
1760
+ "step": 2110
1761
+ },
1762
+ {
1763
+ "epoch": 0.2552371779436552,
1764
+ "grad_norm": 8.625,
1765
+ "learning_rate": 8.476000000000001e-05,
1766
+ "loss": 0.8762,
1767
+ "step": 2120
1768
+ },
1769
+ {
1770
+ "epoch": 0.2564411268962196,
1771
+ "grad_norm": 7.4375,
1772
+ "learning_rate": 8.516e-05,
1773
+ "loss": 0.7925,
1774
+ "step": 2130
1775
+ },
1776
+ {
1777
+ "epoch": 0.257645075848784,
1778
+ "grad_norm": 9.1875,
1779
+ "learning_rate": 8.556e-05,
1780
+ "loss": 0.9575,
1781
+ "step": 2140
1782
+ },
1783
+ {
1784
+ "epoch": 0.2588490248013484,
1785
+ "grad_norm": 9.8125,
1786
+ "learning_rate": 8.596000000000001e-05,
1787
+ "loss": 0.7551,
1788
+ "step": 2150
1789
+ },
1790
+ {
1791
+ "epoch": 0.26005297375391284,
1792
+ "grad_norm": 7.15625,
1793
+ "learning_rate": 8.636e-05,
1794
+ "loss": 0.808,
1795
+ "step": 2160
1796
+ },
1797
+ {
1798
+ "epoch": 0.26125692270647727,
1799
+ "grad_norm": 8.3125,
1800
+ "learning_rate": 8.676e-05,
1801
+ "loss": 0.9449,
1802
+ "step": 2170
1803
+ },
1804
+ {
1805
+ "epoch": 0.26246087165904164,
1806
+ "grad_norm": 11.5,
1807
+ "learning_rate": 8.716000000000001e-05,
1808
+ "loss": 0.8712,
1809
+ "step": 2180
1810
+ },
1811
+ {
1812
+ "epoch": 0.26366482061160607,
1813
+ "grad_norm": 8.0,
1814
+ "learning_rate": 8.756000000000001e-05,
1815
+ "loss": 0.9389,
1816
+ "step": 2190
1817
+ },
1818
+ {
1819
+ "epoch": 0.2648687695641705,
1820
+ "grad_norm": 13.5,
1821
+ "learning_rate": 8.796e-05,
1822
+ "loss": 0.7875,
1823
+ "step": 2200
1824
+ },
1825
+ {
1826
+ "epoch": 0.2648687695641705,
1827
+ "eval/acc": 35.46511459350586,
1828
+ "step": 2200
1829
+ },
1830
+ {
1831
+ "epoch": 0.2648687695641705,
1832
+ "eval_loss": 2.5862526893615723,
1833
+ "eval_runtime": 0.2151,
1834
+ "eval_samples_per_second": 199.927,
1835
+ "eval_steps_per_second": 4.649,
1836
+ "step": 2200
1837
+ },
1838
+ {
1839
+ "epoch": 0.26607271851673486,
1840
+ "grad_norm": 11.5625,
1841
+ "learning_rate": 8.836000000000001e-05,
1842
+ "loss": 0.9947,
1843
+ "step": 2210
1844
+ },
1845
+ {
1846
+ "epoch": 0.2672766674692993,
1847
+ "grad_norm": 8.25,
1848
+ "learning_rate": 8.876e-05,
1849
+ "loss": 0.717,
1850
+ "step": 2220
1851
+ },
1852
+ {
1853
+ "epoch": 0.2684806164218637,
1854
+ "grad_norm": 26.25,
1855
+ "learning_rate": 8.916e-05,
1856
+ "loss": 0.8688,
1857
+ "step": 2230
1858
+ },
1859
+ {
1860
+ "epoch": 0.26968456537442814,
1861
+ "grad_norm": 11.5,
1862
+ "learning_rate": 8.956e-05,
1863
+ "loss": 0.9134,
1864
+ "step": 2240
1865
+ },
1866
+ {
1867
+ "epoch": 0.2708885143269925,
1868
+ "grad_norm": 6.875,
1869
+ "learning_rate": 8.996e-05,
1870
+ "loss": 0.8592,
1871
+ "step": 2250
1872
+ },
1873
+ {
1874
+ "epoch": 0.27209246327955694,
1875
+ "grad_norm": 7.21875,
1876
+ "learning_rate": 9.036e-05,
1877
+ "loss": 0.6548,
1878
+ "step": 2260
1879
+ },
1880
+ {
1881
+ "epoch": 0.27329641223212137,
1882
+ "grad_norm": 12.25,
1883
+ "learning_rate": 9.076e-05,
1884
+ "loss": 0.8613,
1885
+ "step": 2270
1886
+ },
1887
+ {
1888
+ "epoch": 0.2745003611846858,
1889
+ "grad_norm": 8.875,
1890
+ "learning_rate": 9.116e-05,
1891
+ "loss": 0.7455,
1892
+ "step": 2280
1893
+ },
1894
+ {
1895
+ "epoch": 0.27570431013725016,
1896
+ "grad_norm": 12.5625,
1897
+ "learning_rate": 9.156e-05,
1898
+ "loss": 0.8458,
1899
+ "step": 2290
1900
+ },
1901
+ {
1902
+ "epoch": 0.2769082590898146,
1903
+ "grad_norm": 8.8125,
1904
+ "learning_rate": 9.196000000000001e-05,
1905
+ "loss": 0.8003,
1906
+ "step": 2300
1907
+ },
1908
+ {
1909
+ "epoch": 0.2769082590898146,
1910
+ "eval/acc": 32.55813980102539,
1911
+ "step": 2300
1912
+ },
1913
+ {
1914
+ "epoch": 0.2769082590898146,
1915
+ "eval_loss": 2.6594340801239014,
1916
+ "eval_runtime": 0.2129,
1917
+ "eval_samples_per_second": 201.965,
1918
+ "eval_steps_per_second": 4.697,
1919
+ "step": 2300
1920
+ },
1921
+ {
1922
+ "epoch": 0.278112208042379,
1923
+ "grad_norm": 10.6875,
1924
+ "learning_rate": 9.236e-05,
1925
+ "loss": 0.812,
1926
+ "step": 2310
1927
+ },
1928
+ {
1929
+ "epoch": 0.2793161569949434,
1930
+ "grad_norm": 12.1875,
1931
+ "learning_rate": 9.276e-05,
1932
+ "loss": 0.781,
1933
+ "step": 2320
1934
+ },
1935
+ {
1936
+ "epoch": 0.2805201059475078,
1937
+ "grad_norm": 8.125,
1938
+ "learning_rate": 9.316000000000001e-05,
1939
+ "loss": 0.9682,
1940
+ "step": 2330
1941
+ },
1942
+ {
1943
+ "epoch": 0.28172405490007224,
1944
+ "grad_norm": 8.8125,
1945
+ "learning_rate": 9.356e-05,
1946
+ "loss": 0.7531,
1947
+ "step": 2340
1948
+ },
1949
+ {
1950
+ "epoch": 0.28292800385263667,
1951
+ "grad_norm": 7.375,
1952
+ "learning_rate": 9.396e-05,
1953
+ "loss": 0.7235,
1954
+ "step": 2350
1955
+ },
1956
+ {
1957
+ "epoch": 0.28413195280520104,
1958
+ "grad_norm": 7.8125,
1959
+ "learning_rate": 9.436e-05,
1960
+ "loss": 0.9204,
1961
+ "step": 2360
1962
+ },
1963
+ {
1964
+ "epoch": 0.28533590175776546,
1965
+ "grad_norm": 6.65625,
1966
+ "learning_rate": 9.476000000000001e-05,
1967
+ "loss": 0.7636,
1968
+ "step": 2370
1969
+ },
1970
+ {
1971
+ "epoch": 0.2865398507103299,
1972
+ "grad_norm": 9.625,
1973
+ "learning_rate": 9.516e-05,
1974
+ "loss": 0.855,
1975
+ "step": 2380
1976
+ },
1977
+ {
1978
+ "epoch": 0.2877437996628943,
1979
+ "grad_norm": 9.6875,
1980
+ "learning_rate": 9.556e-05,
1981
+ "loss": 0.8643,
1982
+ "step": 2390
1983
+ },
1984
+ {
1985
+ "epoch": 0.2889477486154587,
1986
+ "grad_norm": 7.1875,
1987
+ "learning_rate": 9.596000000000001e-05,
1988
+ "loss": 0.8258,
1989
+ "step": 2400
1990
+ },
1991
+ {
1992
+ "epoch": 0.2889477486154587,
1993
+ "eval/acc": 36.627906799316406,
1994
+ "step": 2400
1995
+ },
1996
+ {
1997
+ "epoch": 0.2889477486154587,
1998
+ "eval_loss": 2.7174084186553955,
1999
+ "eval_runtime": 0.2111,
2000
+ "eval_samples_per_second": 203.672,
2001
+ "eval_steps_per_second": 4.737,
2002
+ "step": 2400
2003
+ },
2004
+ {
2005
+ "epoch": 0.2901516975680231,
2006
+ "grad_norm": 7.65625,
2007
+ "learning_rate": 9.636e-05,
2008
+ "loss": 0.8752,
2009
+ "step": 2410
2010
+ },
2011
+ {
2012
+ "epoch": 0.29135564652058754,
2013
+ "grad_norm": 8.75,
2014
+ "learning_rate": 9.676e-05,
2015
+ "loss": 0.8082,
2016
+ "step": 2420
2017
+ },
2018
+ {
2019
+ "epoch": 0.2925595954731519,
2020
+ "grad_norm": 10.4375,
2021
+ "learning_rate": 9.716000000000001e-05,
2022
+ "loss": 0.7538,
2023
+ "step": 2430
2024
+ },
2025
+ {
2026
+ "epoch": 0.29376354442571634,
2027
+ "grad_norm": 6.4375,
2028
+ "learning_rate": 9.756000000000001e-05,
2029
+ "loss": 0.7766,
2030
+ "step": 2440
2031
+ },
2032
+ {
2033
+ "epoch": 0.29496749337828077,
2034
+ "grad_norm": 7.96875,
2035
+ "learning_rate": 9.796e-05,
2036
+ "loss": 0.844,
2037
+ "step": 2450
2038
+ },
2039
+ {
2040
+ "epoch": 0.2961714423308452,
2041
+ "grad_norm": 7.75,
2042
+ "learning_rate": 9.836000000000001e-05,
2043
+ "loss": 0.7127,
2044
+ "step": 2460
2045
+ },
2046
+ {
2047
+ "epoch": 0.29737539128340956,
2048
+ "grad_norm": 11.5,
2049
+ "learning_rate": 9.876000000000001e-05,
2050
+ "loss": 0.8363,
2051
+ "step": 2470
2052
+ },
2053
+ {
2054
+ "epoch": 0.298579340235974,
2055
+ "grad_norm": 6.4375,
2056
+ "learning_rate": 9.916e-05,
2057
+ "loss": 0.7429,
2058
+ "step": 2480
2059
+ },
2060
+ {
2061
+ "epoch": 0.2997832891885384,
2062
+ "grad_norm": 11.5,
2063
+ "learning_rate": 9.956e-05,
2064
+ "loss": 0.736,
2065
+ "step": 2490
2066
+ },
2067
+ {
2068
+ "epoch": 0.30098723814110284,
2069
+ "grad_norm": 9.25,
2070
+ "learning_rate": 9.996000000000001e-05,
2071
+ "loss": 0.8365,
2072
+ "step": 2500
2073
+ },
2074
+ {
2075
+ "epoch": 0.30098723814110284,
2076
+ "eval/acc": 39.53488540649414,
2077
+ "step": 2500
2078
+ },
2079
+ {
2080
+ "epoch": 0.30098723814110284,
2081
+ "eval_loss": 2.713433027267456,
2082
+ "eval_runtime": 0.2088,
2083
+ "eval_samples_per_second": 205.919,
2084
+ "eval_steps_per_second": 4.789,
2085
+ "step": 2500
2086
+ },
2087
+ {
2088
+ "epoch": 0.3021911870936672,
2089
+ "grad_norm": 7.03125,
2090
+ "learning_rate": 9.996000000000001e-05,
2091
+ "loss": 0.7664,
2092
+ "step": 2510
2093
+ },
2094
+ {
2095
+ "epoch": 0.30339513604623164,
2096
+ "grad_norm": 7.75,
2097
+ "learning_rate": 9.991555555555556e-05,
2098
+ "loss": 0.9128,
2099
+ "step": 2520
2100
+ },
2101
+ {
2102
+ "epoch": 0.30459908499879607,
2103
+ "grad_norm": 9.0,
2104
+ "learning_rate": 9.987111111111111e-05,
2105
+ "loss": 0.8045,
2106
+ "step": 2530
2107
+ },
2108
+ {
2109
+ "epoch": 0.30580303395136044,
2110
+ "grad_norm": 8.9375,
2111
+ "learning_rate": 9.982666666666667e-05,
2112
+ "loss": 0.8292,
2113
+ "step": 2540
2114
+ },
2115
+ {
2116
+ "epoch": 0.30700698290392486,
2117
+ "grad_norm": 7.40625,
2118
+ "learning_rate": 9.978222222222223e-05,
2119
+ "loss": 0.7557,
2120
+ "step": 2550
2121
+ },
2122
+ {
2123
+ "epoch": 0.3082109318564893,
2124
+ "grad_norm": 7.625,
2125
+ "learning_rate": 9.973777777777778e-05,
2126
+ "loss": 0.683,
2127
+ "step": 2560
2128
+ },
2129
+ {
2130
+ "epoch": 0.3094148808090537,
2131
+ "grad_norm": 8.1875,
2132
+ "learning_rate": 9.969333333333334e-05,
2133
+ "loss": 0.8052,
2134
+ "step": 2570
2135
+ },
2136
+ {
2137
+ "epoch": 0.3106188297616181,
2138
+ "grad_norm": 8.4375,
2139
+ "learning_rate": 9.964888888888889e-05,
2140
+ "loss": 0.7819,
2141
+ "step": 2580
2142
+ },
2143
+ {
2144
+ "epoch": 0.3118227787141825,
2145
+ "grad_norm": 10.8125,
2146
+ "learning_rate": 9.960444444444444e-05,
2147
+ "loss": 0.8452,
2148
+ "step": 2590
2149
+ },
2150
+ {
2151
+ "epoch": 0.31302672766674694,
2152
+ "grad_norm": 6.21875,
2153
+ "learning_rate": 9.956e-05,
2154
+ "loss": 0.7478,
2155
+ "step": 2600
2156
+ },
2157
+ {
2158
+ "epoch": 0.31302672766674694,
2159
+ "eval/acc": 34.88372039794922,
2160
+ "step": 2600
2161
+ },
2162
+ {
2163
+ "epoch": 0.31302672766674694,
2164
+ "eval_loss": 2.6625020503997803,
2165
+ "eval_runtime": 0.2061,
2166
+ "eval_samples_per_second": 208.644,
2167
+ "eval_steps_per_second": 4.852,
2168
+ "step": 2600
2169
+ },
2170
+ {
2171
+ "epoch": 0.31423067661931137,
2172
+ "grad_norm": 7.375,
2173
+ "learning_rate": 9.951555555555556e-05,
2174
+ "loss": 0.7623,
2175
+ "step": 2610
2176
+ },
2177
+ {
2178
+ "epoch": 0.31543462557187574,
2179
+ "grad_norm": 9.0,
2180
+ "learning_rate": 9.947111111111111e-05,
2181
+ "loss": 0.8223,
2182
+ "step": 2620
2183
+ },
2184
+ {
2185
+ "epoch": 0.31663857452444016,
2186
+ "grad_norm": 6.75,
2187
+ "learning_rate": 9.942666666666667e-05,
2188
+ "loss": 0.7797,
2189
+ "step": 2630
2190
+ },
2191
+ {
2192
+ "epoch": 0.3178425234770046,
2193
+ "grad_norm": 9.125,
2194
+ "learning_rate": 9.938222222222224e-05,
2195
+ "loss": 0.6746,
2196
+ "step": 2640
2197
+ },
2198
+ {
2199
+ "epoch": 0.31904647242956896,
2200
+ "grad_norm": 8.5,
2201
+ "learning_rate": 9.933777777777779e-05,
2202
+ "loss": 0.8434,
2203
+ "step": 2650
2204
+ },
2205
+ {
2206
+ "epoch": 0.3202504213821334,
2207
+ "grad_norm": 10.3125,
2208
+ "learning_rate": 9.929333333333333e-05,
2209
+ "loss": 0.8625,
2210
+ "step": 2660
2211
+ },
2212
+ {
2213
+ "epoch": 0.3214543703346978,
2214
+ "grad_norm": 8.125,
2215
+ "learning_rate": 9.92488888888889e-05,
2216
+ "loss": 0.8003,
2217
+ "step": 2670
2218
+ },
2219
+ {
2220
+ "epoch": 0.32265831928726224,
2221
+ "grad_norm": 8.5625,
2222
+ "learning_rate": 9.920444444444444e-05,
2223
+ "loss": 0.8145,
2224
+ "step": 2680
2225
+ },
2226
+ {
2227
+ "epoch": 0.3238622682398266,
2228
+ "grad_norm": 8.0,
2229
+ "learning_rate": 9.916e-05,
2230
+ "loss": 0.6519,
2231
+ "step": 2690
2232
+ },
2233
+ {
2234
+ "epoch": 0.32506621719239104,
2235
+ "grad_norm": 8.5625,
2236
+ "learning_rate": 9.911555555555557e-05,
2237
+ "loss": 0.7627,
2238
+ "step": 2700
2239
+ },
2240
+ {
2241
+ "epoch": 0.32506621719239104,
2242
+ "eval/acc": 38.953487396240234,
2243
+ "step": 2700
2244
+ },
2245
+ {
2246
+ "epoch": 0.32506621719239104,
2247
+ "eval_loss": 2.629239082336426,
2248
+ "eval_runtime": 0.2162,
2249
+ "eval_samples_per_second": 198.931,
2250
+ "eval_steps_per_second": 4.626,
2251
+ "step": 2700
2252
+ },
2253
+ {
2254
+ "epoch": 0.32627016614495546,
2255
+ "grad_norm": 7.625,
2256
+ "learning_rate": 9.907111111111112e-05,
2257
+ "loss": 0.7265,
2258
+ "step": 2710
2259
+ },
2260
+ {
2261
+ "epoch": 0.3274741150975199,
2262
+ "grad_norm": 7.15625,
2263
+ "learning_rate": 9.902666666666666e-05,
2264
+ "loss": 0.7468,
2265
+ "step": 2720
2266
+ },
2267
+ {
2268
+ "epoch": 0.32867806405008426,
2269
+ "grad_norm": 8.5,
2270
+ "learning_rate": 9.898222222222223e-05,
2271
+ "loss": 0.7816,
2272
+ "step": 2730
2273
+ },
2274
+ {
2275
+ "epoch": 0.3298820130026487,
2276
+ "grad_norm": 6.8125,
2277
+ "learning_rate": 9.893777777777779e-05,
2278
+ "loss": 0.7828,
2279
+ "step": 2740
2280
+ },
2281
+ {
2282
+ "epoch": 0.3310859619552131,
2283
+ "grad_norm": 8.5625,
2284
+ "learning_rate": 9.889333333333334e-05,
2285
+ "loss": 0.8273,
2286
+ "step": 2750
2287
+ },
2288
+ {
2289
+ "epoch": 0.3322899109077775,
2290
+ "grad_norm": 7.28125,
2291
+ "learning_rate": 9.884888888888889e-05,
2292
+ "loss": 0.6265,
2293
+ "step": 2760
2294
+ },
2295
+ {
2296
+ "epoch": 0.3334938598603419,
2297
+ "grad_norm": 7.78125,
2298
+ "learning_rate": 9.880444444444445e-05,
2299
+ "loss": 0.8716,
2300
+ "step": 2770
2301
+ },
2302
+ {
2303
+ "epoch": 0.33469780881290634,
2304
+ "grad_norm": 6.0,
2305
+ "learning_rate": 9.876000000000001e-05,
2306
+ "loss": 0.7587,
2307
+ "step": 2780
2308
+ },
2309
+ {
2310
+ "epoch": 0.33590175776547077,
2311
+ "grad_norm": 11.8125,
2312
+ "learning_rate": 9.871555555555556e-05,
2313
+ "loss": 0.836,
2314
+ "step": 2790
2315
+ },
2316
+ {
2317
+ "epoch": 0.33710570671803514,
2318
+ "grad_norm": 8.3125,
2319
+ "learning_rate": 9.867111111111112e-05,
2320
+ "loss": 0.7196,
2321
+ "step": 2800
2322
+ },
2323
+ {
2324
+ "epoch": 0.33710570671803514,
2325
+ "eval/acc": 34.88372039794922,
2326
+ "step": 2800
2327
+ },
2328
+ {
2329
+ "epoch": 0.33710570671803514,
2330
+ "eval_loss": 2.5979089736938477,
2331
+ "eval_runtime": 0.212,
2332
+ "eval_samples_per_second": 202.843,
2333
+ "eval_steps_per_second": 4.717,
2334
+ "step": 2800
2335
+ },
2336
+ {
2337
+ "epoch": 0.33830965567059956,
2338
+ "grad_norm": 8.125,
2339
+ "learning_rate": 9.862666666666667e-05,
2340
+ "loss": 0.7128,
2341
+ "step": 2810
2342
+ },
2343
+ {
2344
+ "epoch": 0.339513604623164,
2345
+ "grad_norm": 7.0,
2346
+ "learning_rate": 9.858222222222223e-05,
2347
+ "loss": 0.8709,
2348
+ "step": 2820
2349
+ },
2350
+ {
2351
+ "epoch": 0.3407175535757284,
2352
+ "grad_norm": 10.875,
2353
+ "learning_rate": 9.853777777777778e-05,
2354
+ "loss": 0.6885,
2355
+ "step": 2830
2356
+ },
2357
+ {
2358
+ "epoch": 0.3419215025282928,
2359
+ "grad_norm": 6.625,
2360
+ "learning_rate": 9.849333333333334e-05,
2361
+ "loss": 0.8262,
2362
+ "step": 2840
2363
+ },
2364
+ {
2365
+ "epoch": 0.3431254514808572,
2366
+ "grad_norm": 9.0625,
2367
+ "learning_rate": 9.844888888888889e-05,
2368
+ "loss": 0.6365,
2369
+ "step": 2850
2370
+ },
2371
+ {
2372
+ "epoch": 0.34432940043342164,
2373
+ "grad_norm": 7.96875,
2374
+ "learning_rate": 9.840444444444445e-05,
2375
+ "loss": 0.8177,
2376
+ "step": 2860
2377
+ },
2378
+ {
2379
+ "epoch": 0.345533349385986,
2380
+ "grad_norm": 6.71875,
2381
+ "learning_rate": 9.836000000000001e-05,
2382
+ "loss": 0.7043,
2383
+ "step": 2870
2384
+ },
2385
+ {
2386
+ "epoch": 0.34673729833855044,
2387
+ "grad_norm": 10.4375,
2388
+ "learning_rate": 9.831555555555556e-05,
2389
+ "loss": 0.7503,
2390
+ "step": 2880
2391
+ },
2392
+ {
2393
+ "epoch": 0.34794124729111486,
2394
+ "grad_norm": 7.375,
2395
+ "learning_rate": 9.827111111111111e-05,
2396
+ "loss": 0.7532,
2397
+ "step": 2890
2398
+ },
2399
+ {
2400
+ "epoch": 0.3491451962436793,
2401
+ "grad_norm": 7.65625,
2402
+ "learning_rate": 9.822666666666667e-05,
2403
+ "loss": 0.6942,
2404
+ "step": 2900
2405
+ },
2406
+ {
2407
+ "epoch": 0.3491451962436793,
2408
+ "eval/acc": 37.79069900512695,
2409
+ "step": 2900
2410
+ },
2411
+ {
2412
+ "epoch": 0.3491451962436793,
2413
+ "eval_loss": 2.698911190032959,
2414
+ "eval_runtime": 1.2554,
2415
+ "eval_samples_per_second": 34.253,
2416
+ "eval_steps_per_second": 0.797,
2417
+ "step": 2900
2418
+ },
2419
+ {
2420
+ "epoch": 0.35034914519624366,
2421
+ "grad_norm": 7.1875,
2422
+ "learning_rate": 9.818222222222223e-05,
2423
+ "loss": 0.7651,
2424
+ "step": 2910
2425
+ },
2426
+ {
2427
+ "epoch": 0.3515530941488081,
2428
+ "grad_norm": 6.0,
2429
+ "learning_rate": 9.813777777777778e-05,
2430
+ "loss": 0.7786,
2431
+ "step": 2920
2432
+ },
2433
+ {
2434
+ "epoch": 0.3527570431013725,
2435
+ "grad_norm": 9.375,
2436
+ "learning_rate": 9.809333333333333e-05,
2437
+ "loss": 0.8285,
2438
+ "step": 2930
2439
+ },
2440
+ {
2441
+ "epoch": 0.35396099205393694,
2442
+ "grad_norm": 6.4375,
2443
+ "learning_rate": 9.80488888888889e-05,
2444
+ "loss": 0.7339,
2445
+ "step": 2940
2446
+ },
2447
+ {
2448
+ "epoch": 0.3551649410065013,
2449
+ "grad_norm": 8.8125,
2450
+ "learning_rate": 9.800444444444446e-05,
2451
+ "loss": 0.6948,
2452
+ "step": 2950
2453
+ },
2454
+ {
2455
+ "epoch": 0.35636888995906574,
2456
+ "grad_norm": 11.4375,
2457
+ "learning_rate": 9.796e-05,
2458
+ "loss": 0.8455,
2459
+ "step": 2960
2460
+ },
2461
+ {
2462
+ "epoch": 0.35757283891163016,
2463
+ "grad_norm": 8.5625,
2464
+ "learning_rate": 9.791555555555557e-05,
2465
+ "loss": 0.791,
2466
+ "step": 2970
2467
+ },
2468
+ {
2469
+ "epoch": 0.35877678786419454,
2470
+ "grad_norm": 7.84375,
2471
+ "learning_rate": 9.787111111111111e-05,
2472
+ "loss": 0.8574,
2473
+ "step": 2980
2474
+ },
2475
+ {
2476
+ "epoch": 0.35998073681675896,
2477
+ "grad_norm": 9.4375,
2478
+ "learning_rate": 9.782666666666666e-05,
2479
+ "loss": 0.7923,
2480
+ "step": 2990
2481
+ },
2482
+ {
2483
+ "epoch": 0.3611846857693234,
2484
+ "grad_norm": 8.0625,
2485
+ "learning_rate": 9.778222222222222e-05,
2486
+ "loss": 0.863,
2487
+ "step": 3000
2488
+ },
2489
+ {
2490
+ "epoch": 0.3611846857693234,
2491
+ "eval/acc": 41.86046600341797,
2492
+ "step": 3000
2493
+ },
2494
+ {
2495
+ "epoch": 0.3611846857693234,
2496
+ "eval_loss": 2.5240559577941895,
2497
+ "eval_runtime": 0.2105,
2498
+ "eval_samples_per_second": 204.269,
2499
+ "eval_steps_per_second": 4.75,
2500
+ "step": 3000
2501
+ },
2502
+ {
2503
+ "epoch": 0.3623886347218878,
2504
+ "grad_norm": 6.71875,
2505
+ "learning_rate": 9.773777777777779e-05,
2506
+ "loss": 0.7726,
2507
+ "step": 3010
2508
+ },
2509
+ {
2510
+ "epoch": 0.3635925836744522,
2511
+ "grad_norm": 8.125,
2512
+ "learning_rate": 9.769333333333334e-05,
2513
+ "loss": 0.8234,
2514
+ "step": 3020
2515
+ },
2516
+ {
2517
+ "epoch": 0.3647965326270166,
2518
+ "grad_norm": 7.90625,
2519
+ "learning_rate": 9.764888888888888e-05,
2520
+ "loss": 0.8125,
2521
+ "step": 3030
2522
+ },
2523
+ {
2524
+ "epoch": 0.36600048157958104,
2525
+ "grad_norm": 5.875,
2526
+ "learning_rate": 9.760444444444446e-05,
2527
+ "loss": 0.739,
2528
+ "step": 3040
2529
+ },
2530
+ {
2531
+ "epoch": 0.3672044305321454,
2532
+ "grad_norm": 32.75,
2533
+ "learning_rate": 9.756000000000001e-05,
2534
+ "loss": 0.8773,
2535
+ "step": 3050
2536
+ },
2537
+ {
2538
+ "epoch": 0.36840837948470984,
2539
+ "grad_norm": 8.625,
2540
+ "learning_rate": 9.751555555555556e-05,
2541
+ "loss": 0.6411,
2542
+ "step": 3060
2543
+ },
2544
+ {
2545
+ "epoch": 0.36961232843727426,
2546
+ "grad_norm": 10.0625,
2547
+ "learning_rate": 9.747111111111112e-05,
2548
+ "loss": 0.7757,
2549
+ "step": 3070
2550
+ },
2551
+ {
2552
+ "epoch": 0.3708162773898387,
2553
+ "grad_norm": 7.78125,
2554
+ "learning_rate": 9.742666666666667e-05,
2555
+ "loss": 0.8144,
2556
+ "step": 3080
2557
+ },
2558
+ {
2559
+ "epoch": 0.37202022634240306,
2560
+ "grad_norm": 8.25,
2561
+ "learning_rate": 9.738222222222223e-05,
2562
+ "loss": 0.7915,
2563
+ "step": 3090
2564
+ },
2565
+ {
2566
+ "epoch": 0.3732241752949675,
2567
+ "grad_norm": 9.5,
2568
+ "learning_rate": 9.733777777777778e-05,
2569
+ "loss": 0.7808,
2570
+ "step": 3100
2571
+ },
2572
+ {
2573
+ "epoch": 0.3732241752949675,
2574
+ "eval/acc": 39.53488540649414,
2575
+ "step": 3100
2576
+ },
2577
+ {
2578
+ "epoch": 0.3732241752949675,
2579
+ "eval_loss": 2.6263325214385986,
2580
+ "eval_runtime": 0.2107,
2581
+ "eval_samples_per_second": 204.065,
2582
+ "eval_steps_per_second": 4.746,
2583
+ "step": 3100
2584
+ },
2585
+ {
2586
+ "epoch": 0.3744281242475319,
2587
+ "grad_norm": 7.34375,
2588
+ "learning_rate": 9.729333333333334e-05,
2589
+ "loss": 0.6467,
2590
+ "step": 3110
2591
+ },
2592
+ {
2593
+ "epoch": 0.37563207320009634,
2594
+ "grad_norm": 10.5625,
2595
+ "learning_rate": 9.724888888888889e-05,
2596
+ "loss": 0.7271,
2597
+ "step": 3120
2598
+ },
2599
+ {
2600
+ "epoch": 0.3768360221526607,
2601
+ "grad_norm": 19.375,
2602
+ "learning_rate": 9.720444444444445e-05,
2603
+ "loss": 0.8248,
2604
+ "step": 3130
2605
+ },
2606
+ {
2607
+ "epoch": 0.37803997110522514,
2608
+ "grad_norm": 11.6875,
2609
+ "learning_rate": 9.716000000000001e-05,
2610
+ "loss": 0.7468,
2611
+ "step": 3140
2612
+ },
2613
+ {
2614
+ "epoch": 0.37924392005778956,
2615
+ "grad_norm": 6.71875,
2616
+ "learning_rate": 9.711555555555556e-05,
2617
+ "loss": 0.8189,
2618
+ "step": 3150
2619
+ },
2620
+ {
2621
+ "epoch": 0.38044786901035393,
2622
+ "grad_norm": 7.15625,
2623
+ "learning_rate": 9.707111111111111e-05,
2624
+ "loss": 0.7265,
2625
+ "step": 3160
2626
+ },
2627
+ {
2628
+ "epoch": 0.38165181796291836,
2629
+ "grad_norm": 11.9375,
2630
+ "learning_rate": 9.702666666666667e-05,
2631
+ "loss": 0.7502,
2632
+ "step": 3170
2633
+ },
2634
+ {
2635
+ "epoch": 0.3828557669154828,
2636
+ "grad_norm": 7.78125,
2637
+ "learning_rate": 9.698222222222223e-05,
2638
+ "loss": 0.8412,
2639
+ "step": 3180
2640
+ },
2641
+ {
2642
+ "epoch": 0.3840597158680472,
2643
+ "grad_norm": 6.75,
2644
+ "learning_rate": 9.693777777777778e-05,
2645
+ "loss": 0.8689,
2646
+ "step": 3190
2647
+ },
2648
+ {
2649
+ "epoch": 0.3852636648206116,
2650
+ "grad_norm": 7.6875,
2651
+ "learning_rate": 9.689333333333333e-05,
2652
+ "loss": 0.8053,
2653
+ "step": 3200
2654
+ },
2655
+ {
2656
+ "epoch": 0.3852636648206116,
2657
+ "eval/acc": 39.53488540649414,
2658
+ "step": 3200
2659
+ },
2660
+ {
2661
+ "epoch": 0.3852636648206116,
2662
+ "eval_loss": 2.6145706176757812,
2663
+ "eval_runtime": 0.2093,
2664
+ "eval_samples_per_second": 205.398,
2665
+ "eval_steps_per_second": 4.777,
2666
+ "step": 3200
2667
+ },
2668
+ {
2669
+ "epoch": 0.386467613773176,
2670
+ "grad_norm": 7.65625,
2671
+ "learning_rate": 9.684888888888889e-05,
2672
+ "loss": 0.7601,
2673
+ "step": 3210
2674
+ },
2675
+ {
2676
+ "epoch": 0.38767156272574044,
2677
+ "grad_norm": 19.25,
2678
+ "learning_rate": 9.680444444444445e-05,
2679
+ "loss": 0.7944,
2680
+ "step": 3220
2681
+ },
2682
+ {
2683
+ "epoch": 0.38887551167830486,
2684
+ "grad_norm": 9.375,
2685
+ "learning_rate": 9.676e-05,
2686
+ "loss": 0.839,
2687
+ "step": 3230
2688
+ },
2689
+ {
2690
+ "epoch": 0.39007946063086923,
2691
+ "grad_norm": 8.5,
2692
+ "learning_rate": 9.671555555555556e-05,
2693
+ "loss": 0.7794,
2694
+ "step": 3240
2695
+ },
2696
+ {
2697
+ "epoch": 0.39128340958343366,
2698
+ "grad_norm": 7.78125,
2699
+ "learning_rate": 9.667111111111111e-05,
2700
+ "loss": 0.753,
2701
+ "step": 3250
2702
+ },
2703
+ {
2704
+ "epoch": 0.3924873585359981,
2705
+ "grad_norm": 7.15625,
2706
+ "learning_rate": 9.662666666666667e-05,
2707
+ "loss": 0.7326,
2708
+ "step": 3260
2709
+ },
2710
+ {
2711
+ "epoch": 0.39369130748856246,
2712
+ "grad_norm": 13.4375,
2713
+ "learning_rate": 9.658222222222222e-05,
2714
+ "loss": 0.6754,
2715
+ "step": 3270
2716
+ },
2717
+ {
2718
+ "epoch": 0.3948952564411269,
2719
+ "grad_norm": 6.71875,
2720
+ "learning_rate": 9.653777777777778e-05,
2721
+ "loss": 0.757,
2722
+ "step": 3280
2723
+ },
2724
+ {
2725
+ "epoch": 0.3960992053936913,
2726
+ "grad_norm": 7.5625,
2727
+ "learning_rate": 9.649333333333333e-05,
2728
+ "loss": 0.9203,
2729
+ "step": 3290
2730
+ },
2731
+ {
2732
+ "epoch": 0.39730315434625574,
2733
+ "grad_norm": 8.375,
2734
+ "learning_rate": 9.64488888888889e-05,
2735
+ "loss": 0.8552,
2736
+ "step": 3300
2737
+ },
2738
+ {
2739
+ "epoch": 0.39730315434625574,
2740
+ "eval/acc": 44.1860466003418,
2741
+ "step": 3300
2742
+ },
2743
+ {
2744
+ "epoch": 0.39730315434625574,
2745
+ "eval_loss": 2.571866273880005,
2746
+ "eval_runtime": 0.2083,
2747
+ "eval_samples_per_second": 206.479,
2748
+ "eval_steps_per_second": 4.802,
2749
+ "step": 3300
2750
+ },
2751
+ {
2752
+ "epoch": 0.3985071032988201,
2753
+ "grad_norm": 7.5625,
2754
+ "learning_rate": 9.640444444444446e-05,
2755
+ "loss": 0.7811,
2756
+ "step": 3310
2757
+ },
2758
+ {
2759
+ "epoch": 0.39971105225138454,
2760
+ "grad_norm": 11.75,
2761
+ "learning_rate": 9.636e-05,
2762
+ "loss": 0.6717,
2763
+ "step": 3320
2764
+ },
2765
+ {
2766
+ "epoch": 0.40091500120394896,
2767
+ "grad_norm": 8.1875,
2768
+ "learning_rate": 9.631555555555555e-05,
2769
+ "loss": 0.838,
2770
+ "step": 3330
2771
+ },
2772
+ {
2773
+ "epoch": 0.4021189501565134,
2774
+ "grad_norm": 6.40625,
2775
+ "learning_rate": 9.627111111111112e-05,
2776
+ "loss": 0.8568,
2777
+ "step": 3340
2778
+ },
2779
+ {
2780
+ "epoch": 0.40332289910907776,
2781
+ "grad_norm": 7.3125,
2782
+ "learning_rate": 9.622666666666668e-05,
2783
+ "loss": 0.6742,
2784
+ "step": 3350
2785
+ },
2786
+ {
2787
+ "epoch": 0.4045268480616422,
2788
+ "grad_norm": 7.875,
2789
+ "learning_rate": 9.618222222222223e-05,
2790
+ "loss": 0.7849,
2791
+ "step": 3360
2792
+ },
2793
+ {
2794
+ "epoch": 0.4057307970142066,
2795
+ "grad_norm": 8.5625,
2796
+ "learning_rate": 9.613777777777779e-05,
2797
+ "loss": 0.7537,
2798
+ "step": 3370
2799
+ },
2800
+ {
2801
+ "epoch": 0.406934745966771,
2802
+ "grad_norm": 8.5625,
2803
+ "learning_rate": 9.609333333333334e-05,
2804
+ "loss": 0.6935,
2805
+ "step": 3380
2806
+ },
2807
+ {
2808
+ "epoch": 0.4081386949193354,
2809
+ "grad_norm": 6.3125,
2810
+ "learning_rate": 9.604888888888889e-05,
2811
+ "loss": 0.8065,
2812
+ "step": 3390
2813
+ },
2814
+ {
2815
+ "epoch": 0.40934264387189984,
2816
+ "grad_norm": 26.25,
2817
+ "learning_rate": 9.600444444444445e-05,
2818
+ "loss": 0.6558,
2819
+ "step": 3400
2820
+ },
2821
+ {
2822
+ "epoch": 0.40934264387189984,
2823
+ "eval/acc": 37.20930099487305,
2824
+ "step": 3400
2825
+ },
2826
+ {
2827
+ "epoch": 0.40934264387189984,
2828
+ "eval_loss": 2.7212982177734375,
2829
+ "eval_runtime": 0.2094,
2830
+ "eval_samples_per_second": 205.345,
2831
+ "eval_steps_per_second": 4.775,
2832
+ "step": 3400
2833
+ },
2834
+ {
2835
+ "epoch": 0.41054659282446426,
2836
+ "grad_norm": 6.84375,
2837
+ "learning_rate": 9.596000000000001e-05,
2838
+ "loss": 0.7642,
2839
+ "step": 3410
2840
+ },
2841
+ {
2842
+ "epoch": 0.41175054177702863,
2843
+ "grad_norm": 7.0625,
2844
+ "learning_rate": 9.591555555555556e-05,
2845
+ "loss": 0.7185,
2846
+ "step": 3420
2847
+ },
2848
+ {
2849
+ "epoch": 0.41295449072959306,
2850
+ "grad_norm": 7.15625,
2851
+ "learning_rate": 9.58711111111111e-05,
2852
+ "loss": 0.6634,
2853
+ "step": 3430
2854
+ },
2855
+ {
2856
+ "epoch": 0.4141584396821575,
2857
+ "grad_norm": 4.96875,
2858
+ "learning_rate": 9.582666666666668e-05,
2859
+ "loss": 0.6383,
2860
+ "step": 3440
2861
+ },
2862
+ {
2863
+ "epoch": 0.4153623886347219,
2864
+ "grad_norm": 7.15625,
2865
+ "learning_rate": 9.578222222222223e-05,
2866
+ "loss": 0.8032,
2867
+ "step": 3450
2868
+ },
2869
+ {
2870
+ "epoch": 0.4165663375872863,
2871
+ "grad_norm": 9.0625,
2872
+ "learning_rate": 9.573777777777778e-05,
2873
+ "loss": 0.7294,
2874
+ "step": 3460
2875
+ },
2876
+ {
2877
+ "epoch": 0.4177702865398507,
2878
+ "grad_norm": 9.5,
2879
+ "learning_rate": 9.569333333333334e-05,
2880
+ "loss": 0.802,
2881
+ "step": 3470
2882
+ },
2883
+ {
2884
+ "epoch": 0.41897423549241514,
2885
+ "grad_norm": 7.0,
2886
+ "learning_rate": 9.56488888888889e-05,
2887
+ "loss": 0.7307,
2888
+ "step": 3480
2889
+ },
2890
+ {
2891
+ "epoch": 0.4201781844449795,
2892
+ "grad_norm": 6.34375,
2893
+ "learning_rate": 9.560444444444445e-05,
2894
+ "loss": 0.7239,
2895
+ "step": 3490
2896
+ },
2897
+ {
2898
+ "epoch": 0.42138213339754393,
2899
+ "grad_norm": 6.5,
2900
+ "learning_rate": 9.556e-05,
2901
+ "loss": 0.6711,
2902
+ "step": 3500
2903
+ },
2904
+ {
2905
+ "epoch": 0.42138213339754393,
2906
+ "eval/acc": 39.53488540649414,
2907
+ "step": 3500
2908
+ },
2909
+ {
2910
+ "epoch": 0.42138213339754393,
2911
+ "eval_loss": 2.569326400756836,
2912
+ "eval_runtime": 0.2066,
2913
+ "eval_samples_per_second": 208.137,
2914
+ "eval_steps_per_second": 4.84,
2915
+ "step": 3500
2916
+ },
2917
+ {
2918
+ "epoch": 0.42258608235010836,
2919
+ "grad_norm": 8.125,
2920
+ "learning_rate": 9.551555555555556e-05,
2921
+ "loss": 0.695,
2922
+ "step": 3510
2923
+ },
2924
+ {
2925
+ "epoch": 0.4237900313026728,
2926
+ "grad_norm": 8.3125,
2927
+ "learning_rate": 9.547111111111111e-05,
2928
+ "loss": 0.8691,
2929
+ "step": 3520
2930
+ },
2931
+ {
2932
+ "epoch": 0.42499398025523716,
2933
+ "grad_norm": 8.6875,
2934
+ "learning_rate": 9.542666666666667e-05,
2935
+ "loss": 0.7582,
2936
+ "step": 3530
2937
+ },
2938
+ {
2939
+ "epoch": 0.4261979292078016,
2940
+ "grad_norm": 7.25,
2941
+ "learning_rate": 9.538222222222223e-05,
2942
+ "loss": 0.7143,
2943
+ "step": 3540
2944
+ },
2945
+ {
2946
+ "epoch": 0.427401878160366,
2947
+ "grad_norm": 8.6875,
2948
+ "learning_rate": 9.533777777777778e-05,
2949
+ "loss": 0.6754,
2950
+ "step": 3550
2951
+ },
2952
+ {
2953
+ "epoch": 0.42860582711293044,
2954
+ "grad_norm": 7.8125,
2955
+ "learning_rate": 9.529333333333333e-05,
2956
+ "loss": 0.7153,
2957
+ "step": 3560
2958
+ },
2959
+ {
2960
+ "epoch": 0.4298097760654948,
2961
+ "grad_norm": 7.5625,
2962
+ "learning_rate": 9.52488888888889e-05,
2963
+ "loss": 0.7293,
2964
+ "step": 3570
2965
+ },
2966
+ {
2967
+ "epoch": 0.43101372501805923,
2968
+ "grad_norm": 7.5625,
2969
+ "learning_rate": 9.520444444444446e-05,
2970
+ "loss": 0.7066,
2971
+ "step": 3580
2972
+ },
2973
+ {
2974
+ "epoch": 0.43221767397062366,
2975
+ "grad_norm": 8.1875,
2976
+ "learning_rate": 9.516e-05,
2977
+ "loss": 0.691,
2978
+ "step": 3590
2979
+ },
2980
+ {
2981
+ "epoch": 0.43342162292318803,
2982
+ "grad_norm": 7.125,
2983
+ "learning_rate": 9.511555555555555e-05,
2984
+ "loss": 0.8239,
2985
+ "step": 3600
2986
+ },
2987
+ {
2988
+ "epoch": 0.43342162292318803,
2989
+ "eval/acc": 44.1860466003418,
2990
+ "step": 3600
2991
+ },
2992
+ {
2993
+ "epoch": 0.43342162292318803,
2994
+ "eval_loss": 2.4877374172210693,
2995
+ "eval_runtime": 0.3957,
2996
+ "eval_samples_per_second": 108.658,
2997
+ "eval_steps_per_second": 2.527,
2998
+ "step": 3600
2999
+ },
3000
+ {
3001
+ "epoch": 0.43462557187575246,
3002
+ "grad_norm": 6.375,
3003
+ "learning_rate": 9.507111111111111e-05,
3004
+ "loss": 0.6782,
3005
+ "step": 3610
3006
+ },
3007
+ {
3008
+ "epoch": 0.4358295208283169,
3009
+ "grad_norm": 7.1875,
3010
+ "learning_rate": 9.502666666666668e-05,
3011
+ "loss": 0.7602,
3012
+ "step": 3620
3013
+ },
3014
+ {
3015
+ "epoch": 0.4370334697808813,
3016
+ "grad_norm": 8.125,
3017
+ "learning_rate": 9.498222222222222e-05,
3018
+ "loss": 0.7232,
3019
+ "step": 3630
3020
+ },
3021
+ {
3022
+ "epoch": 0.4382374187334457,
3023
+ "grad_norm": 7.84375,
3024
+ "learning_rate": 9.493777777777779e-05,
3025
+ "loss": 0.729,
3026
+ "step": 3640
3027
+ },
3028
+ {
3029
+ "epoch": 0.4394413676860101,
3030
+ "grad_norm": 8.375,
3031
+ "learning_rate": 9.489333333333334e-05,
3032
+ "loss": 0.8222,
3033
+ "step": 3650
3034
+ },
3035
+ {
3036
+ "epoch": 0.44064531663857454,
3037
+ "grad_norm": 8.125,
3038
+ "learning_rate": 9.48488888888889e-05,
3039
+ "loss": 0.6918,
3040
+ "step": 3660
3041
+ },
3042
+ {
3043
+ "epoch": 0.44184926559113896,
3044
+ "grad_norm": 8.1875,
3045
+ "learning_rate": 9.480444444444445e-05,
3046
+ "loss": 0.6761,
3047
+ "step": 3670
3048
+ },
3049
+ {
3050
+ "epoch": 0.44305321454370333,
3051
+ "grad_norm": 5.65625,
3052
+ "learning_rate": 9.476000000000001e-05,
3053
+ "loss": 0.7532,
3054
+ "step": 3680
3055
+ },
3056
+ {
3057
+ "epoch": 0.44425716349626776,
3058
+ "grad_norm": 8.8125,
3059
+ "learning_rate": 9.471555555555556e-05,
3060
+ "loss": 0.7072,
3061
+ "step": 3690
3062
+ },
3063
+ {
3064
+ "epoch": 0.4454611124488322,
3065
+ "grad_norm": 6.5625,
3066
+ "learning_rate": 9.46711111111111e-05,
3067
+ "loss": 0.8405,
3068
+ "step": 3700
3069
+ },
3070
+ {
3071
+ "epoch": 0.4454611124488322,
3072
+ "eval/acc": 39.53488540649414,
3073
+ "step": 3700
3074
+ },
3075
+ {
3076
+ "epoch": 0.4454611124488322,
3077
+ "eval_loss": 2.615053176879883,
3078
+ "eval_runtime": 4.8304,
3079
+ "eval_samples_per_second": 8.902,
3080
+ "eval_steps_per_second": 0.207,
3081
+ "step": 3700
3082
+ },
3083
+ {
3084
+ "epoch": 0.44666506140139656,
3085
+ "grad_norm": 8.6875,
3086
+ "learning_rate": 9.462666666666668e-05,
3087
+ "loss": 0.7249,
3088
+ "step": 3710
3089
+ },
3090
+ {
3091
+ "epoch": 0.447869010353961,
3092
+ "grad_norm": 8.4375,
3093
+ "learning_rate": 9.458222222222223e-05,
3094
+ "loss": 0.8561,
3095
+ "step": 3720
3096
+ },
3097
+ {
3098
+ "epoch": 0.4490729593065254,
3099
+ "grad_norm": 7.3125,
3100
+ "learning_rate": 9.453777777777778e-05,
3101
+ "loss": 0.7884,
3102
+ "step": 3730
3103
+ },
3104
+ {
3105
+ "epoch": 0.45027690825908984,
3106
+ "grad_norm": 7.34375,
3107
+ "learning_rate": 9.449333333333334e-05,
3108
+ "loss": 0.7169,
3109
+ "step": 3740
3110
+ },
3111
+ {
3112
+ "epoch": 0.4514808572116542,
3113
+ "grad_norm": 5.5,
3114
+ "learning_rate": 9.44488888888889e-05,
3115
+ "loss": 0.7542,
3116
+ "step": 3750
3117
+ },
3118
+ {
3119
+ "epoch": 0.45268480616421863,
3120
+ "grad_norm": 6.09375,
3121
+ "learning_rate": 9.440444444444445e-05,
3122
+ "loss": 0.6292,
3123
+ "step": 3760
3124
+ },
3125
+ {
3126
+ "epoch": 0.45388875511678306,
3127
+ "grad_norm": 8.9375,
3128
+ "learning_rate": 9.436e-05,
3129
+ "loss": 0.6682,
3130
+ "step": 3770
3131
+ },
3132
+ {
3133
+ "epoch": 0.4550927040693475,
3134
+ "grad_norm": 5.09375,
3135
+ "learning_rate": 9.431555555555556e-05,
3136
+ "loss": 0.6499,
3137
+ "step": 3780
3138
+ },
3139
+ {
3140
+ "epoch": 0.45629665302191186,
3141
+ "grad_norm": 8.5,
3142
+ "learning_rate": 9.427111111111112e-05,
3143
+ "loss": 0.7859,
3144
+ "step": 3790
3145
+ },
3146
+ {
3147
+ "epoch": 0.4575006019744763,
3148
+ "grad_norm": 14.5,
3149
+ "learning_rate": 9.422666666666667e-05,
3150
+ "loss": 0.7987,
3151
+ "step": 3800
3152
+ },
3153
+ {
3154
+ "epoch": 0.4575006019744763,
3155
+ "eval/acc": 39.53488540649414,
3156
+ "step": 3800
3157
+ },
3158
+ {
3159
+ "epoch": 0.4575006019744763,
3160
+ "eval_loss": 2.645066022872925,
3161
+ "eval_runtime": 0.6165,
3162
+ "eval_samples_per_second": 69.745,
3163
+ "eval_steps_per_second": 1.622,
3164
+ "step": 3800
3165
+ },
3166
+ {
3167
+ "epoch": 0.4587045509270407,
3168
+ "grad_norm": 6.25,
3169
+ "learning_rate": 9.418222222222223e-05,
3170
+ "loss": 0.7035,
3171
+ "step": 3810
3172
+ },
3173
+ {
3174
+ "epoch": 0.4599084998796051,
3175
+ "grad_norm": 6.46875,
3176
+ "learning_rate": 9.413777777777778e-05,
3177
+ "loss": 0.6329,
3178
+ "step": 3820
3179
+ },
3180
+ {
3181
+ "epoch": 0.4611124488321695,
3182
+ "grad_norm": 8.875,
3183
+ "learning_rate": 9.409333333333333e-05,
3184
+ "loss": 0.7553,
3185
+ "step": 3830
3186
+ },
3187
+ {
3188
+ "epoch": 0.46231639778473393,
3189
+ "grad_norm": 9.3125,
3190
+ "learning_rate": 9.404888888888889e-05,
3191
+ "loss": 0.6551,
3192
+ "step": 3840
3193
+ },
3194
+ {
3195
+ "epoch": 0.46352034673729836,
3196
+ "grad_norm": 11.0625,
3197
+ "learning_rate": 9.400444444444445e-05,
3198
+ "loss": 0.6634,
3199
+ "step": 3850
3200
+ },
3201
+ {
3202
+ "epoch": 0.46472429568986273,
3203
+ "grad_norm": 6.71875,
3204
+ "learning_rate": 9.396e-05,
3205
+ "loss": 0.6527,
3206
+ "step": 3860
3207
+ },
3208
+ {
3209
+ "epoch": 0.46592824464242716,
3210
+ "grad_norm": 6.75,
3211
+ "learning_rate": 9.391555555555555e-05,
3212
+ "loss": 0.8268,
3213
+ "step": 3870
3214
+ },
3215
+ {
3216
+ "epoch": 0.4671321935949916,
3217
+ "grad_norm": 7.78125,
3218
+ "learning_rate": 9.387111111111113e-05,
3219
+ "loss": 0.742,
3220
+ "step": 3880
3221
+ },
3222
+ {
3223
+ "epoch": 0.468336142547556,
3224
+ "grad_norm": 6.53125,
3225
+ "learning_rate": 9.382666666666667e-05,
3226
+ "loss": 0.7446,
3227
+ "step": 3890
3228
+ },
3229
+ {
3230
+ "epoch": 0.4695400915001204,
3231
+ "grad_norm": 7.0625,
3232
+ "learning_rate": 9.378222222222222e-05,
3233
+ "loss": 0.7764,
3234
+ "step": 3900
3235
+ },
3236
+ {
3237
+ "epoch": 0.4695400915001204,
3238
+ "eval/acc": 37.79069900512695,
3239
+ "step": 3900
3240
+ },
3241
+ {
3242
+ "epoch": 0.4695400915001204,
3243
+ "eval_loss": 2.6463897228240967,
3244
+ "eval_runtime": 1.4145,
3245
+ "eval_samples_per_second": 30.4,
3246
+ "eval_steps_per_second": 0.707,
3247
+ "step": 3900
3248
+ },
3249
+ {
3250
+ "epoch": 0.4707440404526848,
3251
+ "grad_norm": 5.625,
3252
+ "learning_rate": 9.373777777777778e-05,
3253
+ "loss": 0.7248,
3254
+ "step": 3910
3255
+ },
3256
+ {
3257
+ "epoch": 0.47194798940524924,
3258
+ "grad_norm": 7.09375,
3259
+ "learning_rate": 9.369333333333333e-05,
3260
+ "loss": 0.6977,
3261
+ "step": 3920
3262
+ },
3263
+ {
3264
+ "epoch": 0.4731519383578136,
3265
+ "grad_norm": 7.53125,
3266
+ "learning_rate": 9.36488888888889e-05,
3267
+ "loss": 0.6496,
3268
+ "step": 3930
3269
+ },
3270
+ {
3271
+ "epoch": 0.47435588731037803,
3272
+ "grad_norm": 11.0,
3273
+ "learning_rate": 9.360444444444444e-05,
3274
+ "loss": 0.7309,
3275
+ "step": 3940
3276
+ },
3277
+ {
3278
+ "epoch": 0.47555983626294246,
3279
+ "grad_norm": 10.5625,
3280
+ "learning_rate": 9.356e-05,
3281
+ "loss": 0.7837,
3282
+ "step": 3950
3283
+ },
3284
+ {
3285
+ "epoch": 0.4767637852155069,
3286
+ "grad_norm": 6.9375,
3287
+ "learning_rate": 9.351555555555555e-05,
3288
+ "loss": 0.6769,
3289
+ "step": 3960
3290
+ },
3291
+ {
3292
+ "epoch": 0.47796773416807126,
3293
+ "grad_norm": 6.84375,
3294
+ "learning_rate": 9.347111111111112e-05,
3295
+ "loss": 0.642,
3296
+ "step": 3970
3297
+ },
3298
+ {
3299
+ "epoch": 0.4791716831206357,
3300
+ "grad_norm": 9.125,
3301
+ "learning_rate": 9.342666666666668e-05,
3302
+ "loss": 0.6947,
3303
+ "step": 3980
3304
+ },
3305
+ {
3306
+ "epoch": 0.4803756320732001,
3307
+ "grad_norm": 7.4375,
3308
+ "learning_rate": 9.338222222222223e-05,
3309
+ "loss": 0.5902,
3310
+ "step": 3990
3311
+ },
3312
+ {
3313
+ "epoch": 0.4815795810257645,
3314
+ "grad_norm": 8.1875,
3315
+ "learning_rate": 9.333777777777777e-05,
3316
+ "loss": 0.6075,
3317
+ "step": 4000
3318
+ },
3319
+ {
3320
+ "epoch": 0.4815795810257645,
3321
+ "eval/acc": 34.88372039794922,
3322
+ "step": 4000
3323
+ },
3324
+ {
3325
+ "epoch": 0.4815795810257645,
3326
+ "eval_loss": 2.6985960006713867,
3327
+ "eval_runtime": 0.2767,
3328
+ "eval_samples_per_second": 155.399,
3329
+ "eval_steps_per_second": 3.614,
3330
+ "step": 4000
3331
+ },
3332
+ {
3333
+ "epoch": 0.4827835299783289,
3334
+ "grad_norm": 6.8125,
3335
+ "learning_rate": 9.329333333333334e-05,
3336
+ "loss": 0.7166,
3337
+ "step": 4010
3338
+ },
3339
+ {
3340
+ "epoch": 0.48398747893089333,
3341
+ "grad_norm": 6.375,
3342
+ "learning_rate": 9.32488888888889e-05,
3343
+ "loss": 0.6136,
3344
+ "step": 4020
3345
+ },
3346
+ {
3347
+ "epoch": 0.48519142788345776,
3348
+ "grad_norm": 6.09375,
3349
+ "learning_rate": 9.320444444444445e-05,
3350
+ "loss": 0.7948,
3351
+ "step": 4030
3352
+ },
3353
+ {
3354
+ "epoch": 0.48639537683602213,
3355
+ "grad_norm": 7.5625,
3356
+ "learning_rate": 9.316000000000001e-05,
3357
+ "loss": 0.7253,
3358
+ "step": 4040
3359
+ },
3360
+ {
3361
+ "epoch": 0.48759932578858656,
3362
+ "grad_norm": 7.1875,
3363
+ "learning_rate": 9.311555555555556e-05,
3364
+ "loss": 0.7386,
3365
+ "step": 4050
3366
+ },
3367
+ {
3368
+ "epoch": 0.488803274741151,
3369
+ "grad_norm": 7.71875,
3370
+ "learning_rate": 9.307111111111112e-05,
3371
+ "loss": 0.7222,
3372
+ "step": 4060
3373
+ },
3374
+ {
3375
+ "epoch": 0.4900072236937154,
3376
+ "grad_norm": 10.8125,
3377
+ "learning_rate": 9.302666666666667e-05,
3378
+ "loss": 0.6298,
3379
+ "step": 4070
3380
+ },
3381
+ {
3382
+ "epoch": 0.4912111726462798,
3383
+ "grad_norm": 14.25,
3384
+ "learning_rate": 9.298222222222223e-05,
3385
+ "loss": 0.6551,
3386
+ "step": 4080
3387
+ },
3388
+ {
3389
+ "epoch": 0.4924151215988442,
3390
+ "grad_norm": 7.75,
3391
+ "learning_rate": 9.293777777777778e-05,
3392
+ "loss": 0.7201,
3393
+ "step": 4090
3394
+ },
3395
+ {
3396
+ "epoch": 0.49361907055140863,
3397
+ "grad_norm": 9.0625,
3398
+ "learning_rate": 9.289333333333334e-05,
3399
+ "loss": 0.708,
3400
+ "step": 4100
3401
+ },
3402
+ {
3403
+ "epoch": 0.49361907055140863,
3404
+ "eval/acc": 34.88372039794922,
3405
+ "step": 4100
3406
+ },
3407
+ {
3408
+ "epoch": 0.49361907055140863,
3409
+ "eval_loss": 2.7673676013946533,
3410
+ "eval_runtime": 0.3468,
3411
+ "eval_samples_per_second": 124.003,
3412
+ "eval_steps_per_second": 2.884,
3413
+ "step": 4100
3414
+ },
3415
+ {
3416
+ "epoch": 0.494823019503973,
3417
+ "grad_norm": 7.9375,
3418
+ "learning_rate": 9.28488888888889e-05,
3419
+ "loss": 0.6997,
3420
+ "step": 4110
3421
+ },
3422
+ {
3423
+ "epoch": 0.49602696845653743,
3424
+ "grad_norm": 6.84375,
3425
+ "learning_rate": 9.280444444444445e-05,
3426
+ "loss": 0.6195,
3427
+ "step": 4120
3428
+ },
3429
+ {
3430
+ "epoch": 0.49723091740910186,
3431
+ "grad_norm": 7.40625,
3432
+ "learning_rate": 9.276e-05,
3433
+ "loss": 0.765,
3434
+ "step": 4130
3435
+ },
3436
+ {
3437
+ "epoch": 0.4984348663616663,
3438
+ "grad_norm": 7.8125,
3439
+ "learning_rate": 9.271555555555556e-05,
3440
+ "loss": 0.7097,
3441
+ "step": 4140
3442
+ },
3443
+ {
3444
+ "epoch": 0.49963881531423066,
3445
+ "grad_norm": 7.75,
3446
+ "learning_rate": 9.267111111111112e-05,
3447
+ "loss": 0.7067,
3448
+ "step": 4150
3449
+ },
3450
+ {
3451
+ "epoch": 0.5008427642667951,
3452
+ "grad_norm": 27.875,
3453
+ "learning_rate": 9.262666666666667e-05,
3454
+ "loss": 0.7989,
3455
+ "step": 4160
3456
+ },
3457
+ {
3458
+ "epoch": 0.5020467132193595,
3459
+ "grad_norm": 8.0,
3460
+ "learning_rate": 9.258222222222222e-05,
3461
+ "loss": 0.6744,
3462
+ "step": 4170
3463
+ },
3464
+ {
3465
+ "epoch": 0.5032506621719239,
3466
+ "grad_norm": 7.96875,
3467
+ "learning_rate": 9.253777777777778e-05,
3468
+ "loss": 0.738,
3469
+ "step": 4180
3470
+ },
3471
+ {
3472
+ "epoch": 0.5044546111244883,
3473
+ "grad_norm": 7.21875,
3474
+ "learning_rate": 9.249333333333334e-05,
3475
+ "loss": 0.7021,
3476
+ "step": 4190
3477
+ },
3478
+ {
3479
+ "epoch": 0.5056585600770528,
3480
+ "grad_norm": 9.6875,
3481
+ "learning_rate": 9.244888888888889e-05,
3482
+ "loss": 0.7133,
3483
+ "step": 4200
3484
+ },
3485
+ {
3486
+ "epoch": 0.5056585600770528,
3487
+ "eval/acc": 32.55813980102539,
3488
+ "step": 4200
3489
+ },
3490
+ {
3491
+ "epoch": 0.5056585600770528,
3492
+ "eval_loss": 2.7288577556610107,
3493
+ "eval_runtime": 0.2266,
3494
+ "eval_samples_per_second": 189.803,
3495
+ "eval_steps_per_second": 4.414,
3496
+ "step": 4200
3497
+ },
3498
+ {
3499
+ "epoch": 0.5068625090296172,
3500
+ "grad_norm": 10.5,
3501
+ "learning_rate": 9.240444444444445e-05,
3502
+ "loss": 0.6886,
3503
+ "step": 4210
3504
+ },
3505
+ {
3506
+ "epoch": 0.5080664579821815,
3507
+ "grad_norm": 9.0625,
3508
+ "learning_rate": 9.236e-05,
3509
+ "loss": 0.7944,
3510
+ "step": 4220
3511
+ },
3512
+ {
3513
+ "epoch": 0.509270406934746,
3514
+ "grad_norm": 7.78125,
3515
+ "learning_rate": 9.231555555555555e-05,
3516
+ "loss": 0.7869,
3517
+ "step": 4230
3518
+ },
3519
+ {
3520
+ "epoch": 0.5104743558873104,
3521
+ "grad_norm": 6.375,
3522
+ "learning_rate": 9.227111111111111e-05,
3523
+ "loss": 0.6245,
3524
+ "step": 4240
3525
+ },
3526
+ {
3527
+ "epoch": 0.5116783048398748,
3528
+ "grad_norm": 9.9375,
3529
+ "learning_rate": 9.222666666666668e-05,
3530
+ "loss": 0.7006,
3531
+ "step": 4250
3532
+ },
3533
+ {
3534
+ "epoch": 0.5128822537924392,
3535
+ "grad_norm": 6.1875,
3536
+ "learning_rate": 9.218222222222222e-05,
3537
+ "loss": 0.7588,
3538
+ "step": 4260
3539
+ },
3540
+ {
3541
+ "epoch": 0.5140862027450036,
3542
+ "grad_norm": 10.6875,
3543
+ "learning_rate": 9.213777777777777e-05,
3544
+ "loss": 0.737,
3545
+ "step": 4270
3546
+ },
3547
+ {
3548
+ "epoch": 0.515290151697568,
3549
+ "grad_norm": 6.15625,
3550
+ "learning_rate": 9.209333333333335e-05,
3551
+ "loss": 0.6774,
3552
+ "step": 4280
3553
+ },
3554
+ {
3555
+ "epoch": 0.5164941006501325,
3556
+ "grad_norm": 8.8125,
3557
+ "learning_rate": 9.20488888888889e-05,
3558
+ "loss": 0.6972,
3559
+ "step": 4290
3560
+ },
3561
+ {
3562
+ "epoch": 0.5176980496026968,
3563
+ "grad_norm": 6.40625,
3564
+ "learning_rate": 9.200444444444445e-05,
3565
+ "loss": 0.6423,
3566
+ "step": 4300
3567
+ },
3568
+ {
3569
+ "epoch": 0.5176980496026968,
3570
+ "eval/acc": 38.953487396240234,
3571
+ "step": 4300
3572
+ },
3573
+ {
3574
+ "epoch": 0.5176980496026968,
3575
+ "eval_loss": 2.7444300651550293,
3576
+ "eval_runtime": 0.2708,
3577
+ "eval_samples_per_second": 158.776,
3578
+ "eval_steps_per_second": 3.692,
3579
+ "step": 4300
3580
+ },
3581
+ {
3582
+ "epoch": 0.5189019985552613,
3583
+ "grad_norm": 6.8125,
3584
+ "learning_rate": 9.196000000000001e-05,
3585
+ "loss": 0.7705,
3586
+ "step": 4310
3587
+ },
3588
+ {
3589
+ "epoch": 0.5201059475078257,
3590
+ "grad_norm": 5.90625,
3591
+ "learning_rate": 9.191555555555556e-05,
3592
+ "loss": 0.7534,
3593
+ "step": 4320
3594
+ },
3595
+ {
3596
+ "epoch": 0.52130989646039,
3597
+ "grad_norm": 9.25,
3598
+ "learning_rate": 9.187111111111112e-05,
3599
+ "loss": 0.6586,
3600
+ "step": 4330
3601
+ },
3602
+ {
3603
+ "epoch": 0.5225138454129545,
3604
+ "grad_norm": 7.53125,
3605
+ "learning_rate": 9.182666666666667e-05,
3606
+ "loss": 0.7459,
3607
+ "step": 4340
3608
+ },
3609
+ {
3610
+ "epoch": 0.5237177943655189,
3611
+ "grad_norm": 6.09375,
3612
+ "learning_rate": 9.178222222222223e-05,
3613
+ "loss": 0.7088,
3614
+ "step": 4350
3615
+ },
3616
+ {
3617
+ "epoch": 0.5249217433180833,
3618
+ "grad_norm": 8.5,
3619
+ "learning_rate": 9.173777777777778e-05,
3620
+ "loss": 0.7313,
3621
+ "step": 4360
3622
+ },
3623
+ {
3624
+ "epoch": 0.5261256922706478,
3625
+ "grad_norm": 8.8125,
3626
+ "learning_rate": 9.169333333333334e-05,
3627
+ "loss": 0.7364,
3628
+ "step": 4370
3629
+ },
3630
+ {
3631
+ "epoch": 0.5273296412232121,
3632
+ "grad_norm": 7.09375,
3633
+ "learning_rate": 9.16488888888889e-05,
3634
+ "loss": 0.6962,
3635
+ "step": 4380
3636
+ },
3637
+ {
3638
+ "epoch": 0.5285335901757765,
3639
+ "grad_norm": 6.28125,
3640
+ "learning_rate": 9.160444444444445e-05,
3641
+ "loss": 0.6817,
3642
+ "step": 4390
3643
+ },
3644
+ {
3645
+ "epoch": 0.529737539128341,
3646
+ "grad_norm": 8.25,
3647
+ "learning_rate": 9.156e-05,
3648
+ "loss": 0.6786,
3649
+ "step": 4400
3650
+ },
3651
+ {
3652
+ "epoch": 0.529737539128341,
3653
+ "eval/acc": 34.88372039794922,
3654
+ "step": 4400
3655
+ },
3656
+ {
3657
+ "epoch": 0.529737539128341,
3658
+ "eval_loss": 2.728501081466675,
3659
+ "eval_runtime": 0.3599,
3660
+ "eval_samples_per_second": 119.474,
3661
+ "eval_steps_per_second": 2.778,
3662
+ "step": 4400
3663
+ },
3664
+ {
3665
+ "epoch": 0.5309414880809054,
3666
+ "grad_norm": 7.59375,
3667
+ "learning_rate": 9.151555555555556e-05,
3668
+ "loss": 0.6744,
3669
+ "step": 4410
3670
+ },
3671
+ {
3672
+ "epoch": 0.5321454370334697,
3673
+ "grad_norm": 8.0625,
3674
+ "learning_rate": 9.147111111111112e-05,
3675
+ "loss": 0.8287,
3676
+ "step": 4420
3677
+ },
3678
+ {
3679
+ "epoch": 0.5333493859860342,
3680
+ "grad_norm": 8.1875,
3681
+ "learning_rate": 9.142666666666667e-05,
3682
+ "loss": 0.7069,
3683
+ "step": 4430
3684
+ },
3685
+ {
3686
+ "epoch": 0.5345533349385986,
3687
+ "grad_norm": 8.125,
3688
+ "learning_rate": 9.138222222222222e-05,
3689
+ "loss": 0.662,
3690
+ "step": 4440
3691
+ },
3692
+ {
3693
+ "epoch": 0.5357572838911631,
3694
+ "grad_norm": 7.46875,
3695
+ "learning_rate": 9.133777777777778e-05,
3696
+ "loss": 0.7424,
3697
+ "step": 4450
3698
+ },
3699
+ {
3700
+ "epoch": 0.5369612328437274,
3701
+ "grad_norm": 6.96875,
3702
+ "learning_rate": 9.129333333333334e-05,
3703
+ "loss": 0.7308,
3704
+ "step": 4460
3705
+ },
3706
+ {
3707
+ "epoch": 0.5381651817962918,
3708
+ "grad_norm": 8.3125,
3709
+ "learning_rate": 9.124888888888889e-05,
3710
+ "loss": 0.7524,
3711
+ "step": 4470
3712
+ },
3713
+ {
3714
+ "epoch": 0.5393691307488563,
3715
+ "grad_norm": 6.40625,
3716
+ "learning_rate": 9.120444444444445e-05,
3717
+ "loss": 0.7523,
3718
+ "step": 4480
3719
+ },
3720
+ {
3721
+ "epoch": 0.5405730797014207,
3722
+ "grad_norm": 7.65625,
3723
+ "learning_rate": 9.116e-05,
3724
+ "loss": 0.647,
3725
+ "step": 4490
3726
+ },
3727
+ {
3728
+ "epoch": 0.541777028653985,
3729
+ "grad_norm": 6.875,
3730
+ "learning_rate": 9.111555555555556e-05,
3731
+ "loss": 0.6547,
3732
+ "step": 4500
3733
+ },
3734
+ {
3735
+ "epoch": 0.541777028653985,
3736
+ "eval/acc": 37.20930099487305,
3737
+ "step": 4500
3738
+ },
3739
+ {
3740
+ "epoch": 0.541777028653985,
3741
+ "eval_loss": 2.8390543460845947,
3742
+ "eval_runtime": 0.2096,
3743
+ "eval_samples_per_second": 205.2,
3744
+ "eval_steps_per_second": 4.772,
3745
+ "step": 4500
3746
+ },
3747
+ {
3748
+ "epoch": 0.5429809776065495,
3749
+ "grad_norm": 9.375,
3750
+ "learning_rate": 9.107111111111111e-05,
3751
+ "loss": 0.6773,
3752
+ "step": 4510
3753
+ },
3754
+ {
3755
+ "epoch": 0.5441849265591139,
3756
+ "grad_norm": 10.1875,
3757
+ "learning_rate": 9.102666666666667e-05,
3758
+ "loss": 0.704,
3759
+ "step": 4520
3760
+ },
3761
+ {
3762
+ "epoch": 0.5453888755116783,
3763
+ "grad_norm": 5.0625,
3764
+ "learning_rate": 9.098222222222222e-05,
3765
+ "loss": 0.6303,
3766
+ "step": 4530
3767
+ },
3768
+ {
3769
+ "epoch": 0.5465928244642427,
3770
+ "grad_norm": 8.25,
3771
+ "learning_rate": 9.093777777777777e-05,
3772
+ "loss": 0.7469,
3773
+ "step": 4540
3774
+ },
3775
+ {
3776
+ "epoch": 0.5477967734168071,
3777
+ "grad_norm": 7.375,
3778
+ "learning_rate": 9.089333333333335e-05,
3779
+ "loss": 0.6995,
3780
+ "step": 4550
3781
+ },
3782
+ {
3783
+ "epoch": 0.5490007223693716,
3784
+ "grad_norm": 7.78125,
3785
+ "learning_rate": 9.08488888888889e-05,
3786
+ "loss": 0.6965,
3787
+ "step": 4560
3788
+ },
3789
+ {
3790
+ "epoch": 0.550204671321936,
3791
+ "grad_norm": 13.625,
3792
+ "learning_rate": 9.080444444444444e-05,
3793
+ "loss": 0.759,
3794
+ "step": 4570
3795
+ },
3796
+ {
3797
+ "epoch": 0.5514086202745003,
3798
+ "grad_norm": 6.875,
3799
+ "learning_rate": 9.076e-05,
3800
+ "loss": 0.7284,
3801
+ "step": 4580
3802
+ },
3803
+ {
3804
+ "epoch": 0.5526125692270648,
3805
+ "grad_norm": 5.875,
3806
+ "learning_rate": 9.071555555555557e-05,
3807
+ "loss": 0.6721,
3808
+ "step": 4590
3809
+ },
3810
+ {
3811
+ "epoch": 0.5538165181796292,
3812
+ "grad_norm": 5.46875,
3813
+ "learning_rate": 9.067111111111112e-05,
3814
+ "loss": 0.6522,
3815
+ "step": 4600
3816
+ },
3817
+ {
3818
+ "epoch": 0.5538165181796292,
3819
+ "eval/acc": 39.53488540649414,
3820
+ "step": 4600
3821
+ },
3822
+ {
3823
+ "epoch": 0.5538165181796292,
3824
+ "eval_loss": 2.801618814468384,
3825
+ "eval_runtime": 0.2155,
3826
+ "eval_samples_per_second": 199.501,
3827
+ "eval_steps_per_second": 4.64,
3828
+ "step": 4600
3829
+ },
3830
+ {
3831
+ "epoch": 0.5550204671321936,
3832
+ "grad_norm": 8.5625,
3833
+ "learning_rate": 9.062666666666666e-05,
3834
+ "loss": 0.6399,
3835
+ "step": 4610
3836
+ },
3837
+ {
3838
+ "epoch": 0.556224416084758,
3839
+ "grad_norm": 7.40625,
3840
+ "learning_rate": 9.058222222222223e-05,
3841
+ "loss": 0.7303,
3842
+ "step": 4620
3843
+ },
3844
+ {
3845
+ "epoch": 0.5574283650373224,
3846
+ "grad_norm": 6.96875,
3847
+ "learning_rate": 9.053777777777777e-05,
3848
+ "loss": 0.7126,
3849
+ "step": 4630
3850
+ },
3851
+ {
3852
+ "epoch": 0.5586323139898868,
3853
+ "grad_norm": 7.15625,
3854
+ "learning_rate": 9.049333333333334e-05,
3855
+ "loss": 0.702,
3856
+ "step": 4640
3857
+ },
3858
+ {
3859
+ "epoch": 0.5598362629424513,
3860
+ "grad_norm": 6.625,
3861
+ "learning_rate": 9.04488888888889e-05,
3862
+ "loss": 0.6957,
3863
+ "step": 4650
3864
+ },
3865
+ {
3866
+ "epoch": 0.5610402118950156,
3867
+ "grad_norm": 7.90625,
3868
+ "learning_rate": 9.040444444444445e-05,
3869
+ "loss": 0.703,
3870
+ "step": 4660
3871
+ },
3872
+ {
3873
+ "epoch": 0.5622441608475801,
3874
+ "grad_norm": 7.75,
3875
+ "learning_rate": 9.036e-05,
3876
+ "loss": 0.7195,
3877
+ "step": 4670
3878
+ },
3879
+ {
3880
+ "epoch": 0.5634481098001445,
3881
+ "grad_norm": 6.59375,
3882
+ "learning_rate": 9.031555555555557e-05,
3883
+ "loss": 0.6445,
3884
+ "step": 4680
3885
+ },
3886
+ {
3887
+ "epoch": 0.5646520587527089,
3888
+ "grad_norm": 25.125,
3889
+ "learning_rate": 9.027111111111112e-05,
3890
+ "loss": 0.699,
3891
+ "step": 4690
3892
+ },
3893
+ {
3894
+ "epoch": 0.5658560077052733,
3895
+ "grad_norm": 8.125,
3896
+ "learning_rate": 9.022666666666667e-05,
3897
+ "loss": 0.716,
3898
+ "step": 4700
3899
+ },
3900
+ {
3901
+ "epoch": 0.5658560077052733,
3902
+ "eval/acc": 34.88372039794922,
3903
+ "step": 4700
3904
+ },
3905
+ {
3906
+ "epoch": 0.5658560077052733,
3907
+ "eval_loss": 2.777444839477539,
3908
+ "eval_runtime": 0.218,
3909
+ "eval_samples_per_second": 197.287,
3910
+ "eval_steps_per_second": 4.588,
3911
+ "step": 4700
3912
+ },
3913
+ {
3914
+ "epoch": 0.5670599566578377,
3915
+ "grad_norm": 7.0,
3916
+ "learning_rate": 9.018222222222223e-05,
3917
+ "loss": 0.693,
3918
+ "step": 4710
3919
+ },
3920
+ {
3921
+ "epoch": 0.5682639056104021,
3922
+ "grad_norm": 8.8125,
3923
+ "learning_rate": 9.013777777777779e-05,
3924
+ "loss": 0.7,
3925
+ "step": 4720
3926
+ },
3927
+ {
3928
+ "epoch": 0.5694678545629666,
3929
+ "grad_norm": 7.0,
3930
+ "learning_rate": 9.009333333333334e-05,
3931
+ "loss": 0.6616,
3932
+ "step": 4730
3933
+ },
3934
+ {
3935
+ "epoch": 0.5706718035155309,
3936
+ "grad_norm": 7.75,
3937
+ "learning_rate": 9.004888888888889e-05,
3938
+ "loss": 0.7987,
3939
+ "step": 4740
3940
+ },
3941
+ {
3942
+ "epoch": 0.5718757524680953,
3943
+ "grad_norm": 6.53125,
3944
+ "learning_rate": 9.000444444444445e-05,
3945
+ "loss": 0.7162,
3946
+ "step": 4750
3947
+ },
3948
+ {
3949
+ "epoch": 0.5730797014206598,
3950
+ "grad_norm": 8.6875,
3951
+ "learning_rate": 8.996e-05,
3952
+ "loss": 0.673,
3953
+ "step": 4760
3954
+ },
3955
+ {
3956
+ "epoch": 0.5742836503732242,
3957
+ "grad_norm": 6.5625,
3958
+ "learning_rate": 8.991555555555556e-05,
3959
+ "loss": 0.7389,
3960
+ "step": 4770
3961
+ },
3962
+ {
3963
+ "epoch": 0.5754875993257886,
3964
+ "grad_norm": 7.25,
3965
+ "learning_rate": 8.987111111111112e-05,
3966
+ "loss": 0.6674,
3967
+ "step": 4780
3968
+ },
3969
+ {
3970
+ "epoch": 0.576691548278353,
3971
+ "grad_norm": 8.8125,
3972
+ "learning_rate": 8.982666666666667e-05,
3973
+ "loss": 0.7464,
3974
+ "step": 4790
3975
+ },
3976
+ {
3977
+ "epoch": 0.5778954972309174,
3978
+ "grad_norm": 7.65625,
3979
+ "learning_rate": 8.978222222222222e-05,
3980
+ "loss": 0.6979,
3981
+ "step": 4800
3982
+ },
3983
+ {
3984
+ "epoch": 0.5778954972309174,
3985
+ "eval/acc": 37.20930099487305,
3986
+ "step": 4800
3987
+ },
3988
+ {
3989
+ "epoch": 0.5778954972309174,
3990
+ "eval_loss": 2.7990331649780273,
3991
+ "eval_runtime": 0.207,
3992
+ "eval_samples_per_second": 207.72,
3993
+ "eval_steps_per_second": 4.831,
3994
+ "step": 4800
3995
+ },
3996
+ {
3997
+ "epoch": 0.5790994461834819,
3998
+ "grad_norm": 6.90625,
3999
+ "learning_rate": 8.973777777777778e-05,
4000
+ "loss": 0.7292,
4001
+ "step": 4810
4002
+ },
4003
+ {
4004
+ "epoch": 0.5803033951360462,
4005
+ "grad_norm": 7.34375,
4006
+ "learning_rate": 8.969333333333334e-05,
4007
+ "loss": 0.6484,
4008
+ "step": 4820
4009
+ },
4010
+ {
4011
+ "epoch": 0.5815073440886106,
4012
+ "grad_norm": 7.96875,
4013
+ "learning_rate": 8.964888888888889e-05,
4014
+ "loss": 0.6246,
4015
+ "step": 4830
4016
+ },
4017
+ {
4018
+ "epoch": 0.5827112930411751,
4019
+ "grad_norm": 5.4375,
4020
+ "learning_rate": 8.960444444444444e-05,
4021
+ "loss": 0.6978,
4022
+ "step": 4840
4023
+ },
4024
+ {
4025
+ "epoch": 0.5839152419937395,
4026
+ "grad_norm": 7.25,
4027
+ "learning_rate": 8.956e-05,
4028
+ "loss": 0.6848,
4029
+ "step": 4850
4030
+ },
4031
+ {
4032
+ "epoch": 0.5851191909463038,
4033
+ "grad_norm": 8.9375,
4034
+ "learning_rate": 8.951555555555557e-05,
4035
+ "loss": 0.7541,
4036
+ "step": 4860
4037
+ },
4038
+ {
4039
+ "epoch": 0.5863231398988683,
4040
+ "grad_norm": 8.6875,
4041
+ "learning_rate": 8.947111111111111e-05,
4042
+ "loss": 0.6872,
4043
+ "step": 4870
4044
+ },
4045
+ {
4046
+ "epoch": 0.5875270888514327,
4047
+ "grad_norm": 6.375,
4048
+ "learning_rate": 8.942666666666668e-05,
4049
+ "loss": 0.7521,
4050
+ "step": 4880
4051
+ },
4052
+ {
4053
+ "epoch": 0.5887310378039972,
4054
+ "grad_norm": 7.34375,
4055
+ "learning_rate": 8.938222222222222e-05,
4056
+ "loss": 0.6741,
4057
+ "step": 4890
4058
+ },
4059
+ {
4060
+ "epoch": 0.5899349867565615,
4061
+ "grad_norm": 9.25,
4062
+ "learning_rate": 8.933777777777779e-05,
4063
+ "loss": 0.7085,
4064
+ "step": 4900
4065
+ },
4066
+ {
4067
+ "epoch": 0.5899349867565615,
4068
+ "eval/acc": 32.55813980102539,
4069
+ "step": 4900
4070
+ },
4071
+ {
4072
+ "epoch": 0.5899349867565615,
4073
+ "eval_loss": 2.822793483734131,
4074
+ "eval_runtime": 0.2077,
4075
+ "eval_samples_per_second": 206.985,
4076
+ "eval_steps_per_second": 4.814,
4077
+ "step": 4900
4078
+ },
4079
+ {
4080
+ "epoch": 0.5911389357091259,
4081
+ "grad_norm": 6.75,
4082
+ "learning_rate": 8.929333333333333e-05,
4083
+ "loss": 0.6908,
4084
+ "step": 4910
4085
+ },
4086
+ {
4087
+ "epoch": 0.5923428846616904,
4088
+ "grad_norm": 14.3125,
4089
+ "learning_rate": 8.92488888888889e-05,
4090
+ "loss": 0.6954,
4091
+ "step": 4920
4092
+ },
4093
+ {
4094
+ "epoch": 0.5935468336142548,
4095
+ "grad_norm": 5.03125,
4096
+ "learning_rate": 8.920444444444444e-05,
4097
+ "loss": 0.6255,
4098
+ "step": 4930
4099
+ },
4100
+ {
4101
+ "epoch": 0.5947507825668191,
4102
+ "grad_norm": 7.3125,
4103
+ "learning_rate": 8.916e-05,
4104
+ "loss": 0.6094,
4105
+ "step": 4940
4106
+ },
4107
+ {
4108
+ "epoch": 0.5959547315193836,
4109
+ "grad_norm": 6.875,
4110
+ "learning_rate": 8.911555555555557e-05,
4111
+ "loss": 0.6488,
4112
+ "step": 4950
4113
+ },
4114
+ {
4115
+ "epoch": 0.597158680471948,
4116
+ "grad_norm": 6.90625,
4117
+ "learning_rate": 8.907111111111112e-05,
4118
+ "loss": 0.6333,
4119
+ "step": 4960
4120
+ },
4121
+ {
4122
+ "epoch": 0.5983626294245123,
4123
+ "grad_norm": 7.0,
4124
+ "learning_rate": 8.902666666666667e-05,
4125
+ "loss": 0.6687,
4126
+ "step": 4970
4127
+ },
4128
+ {
4129
+ "epoch": 0.5995665783770768,
4130
+ "grad_norm": 8.9375,
4131
+ "learning_rate": 8.898222222222223e-05,
4132
+ "loss": 0.6762,
4133
+ "step": 4980
4134
+ },
4135
+ {
4136
+ "epoch": 0.6007705273296412,
4137
+ "grad_norm": 7.53125,
4138
+ "learning_rate": 8.893777777777779e-05,
4139
+ "loss": 0.6007,
4140
+ "step": 4990
4141
+ },
4142
+ {
4143
+ "epoch": 0.6019744762822057,
4144
+ "grad_norm": 5.78125,
4145
+ "learning_rate": 8.889333333333334e-05,
4146
+ "loss": 0.682,
4147
+ "step": 5000
4148
+ },
4149
+ {
4150
+ "epoch": 0.6019744762822057,
4151
+ "eval/acc": 32.55813980102539,
4152
+ "step": 5000
4153
+ },
4154
+ {
4155
+ "epoch": 0.6019744762822057,
4156
+ "eval_loss": 2.827073097229004,
4157
+ "eval_runtime": 0.2073,
4158
+ "eval_samples_per_second": 207.385,
4159
+ "eval_steps_per_second": 4.823,
4160
+ "step": 5000
4161
+ }
4162
+ ],
4163
+ "logging_steps": 10,
4164
+ "max_steps": 25000,
4165
+ "num_input_tokens_seen": 0,
4166
+ "num_train_epochs": 4,
4167
+ "save_steps": 5000,
4168
+ "stateful_callbacks": {
4169
+ "TrainerControl": {
4170
+ "args": {
4171
+ "should_epoch_stop": false,
4172
+ "should_evaluate": false,
4173
+ "should_log": false,
4174
+ "should_save": true,
4175
+ "should_training_stop": false
4176
+ },
4177
+ "attributes": {}
4178
+ }
4179
+ },
4180
+ "total_flos": 0.0,
4181
+ "train_batch_size": 16,
4182
+ "trial_name": null,
4183
+ "trial_params": null
4184
+ }
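
The log above ends at step 5,000 of 25,000, with training loss logged every 10 steps, an eval pass every 100 steps, and checkpoints saved every 5,000 steps. For anyone wanting to inspect these curves rather than read the raw diff, the snippet below is a minimal, illustrative sketch only: it assumes the standard 🤗 Trainer `trainer_state.json` layout (where the entries shown here sit under a `log_history` list) and a hypothetical local checkpoint path; it is not part of this repository.

```python
# Illustrative sketch: read a checkpoint's trainer_state.json and pull out
# the training/eval curves. Assumes the standard Hugging Face Trainer layout
# ("log_history" list); the path below is hypothetical.
import json

path = "checkpoint-5000/trainer_state.json"  # adjust to an actual checkpoint directory

with open(path) as f:
    state = json.load(f)

history = state["log_history"]

# Each entry carries a subset of keys, so filter by the key of interest.
train_loss = [(e["step"], e["loss"]) for e in history if "loss" in e]
eval_loss = [(e["step"], e["eval_loss"]) for e in history if "eval_loss" in e]
eval_acc = [(e["step"], e["eval/acc"]) for e in history if "eval/acc" in e]

print(f"{len(train_loss)} training logs, {len(eval_loss)} eval logs")
print("last train loss:", train_loss[-1])
print("last eval loss:", eval_loss[-1], "| last eval/acc:", eval_acc[-1])
```

Plotted this way, the tail shown in this diff has training loss hovering around 0.6–0.8 while eval_loss drifts from roughly 2.65 to 2.83 and eval/acc oscillates between about 32 and 40, which is the kind of pattern one would typically check before choosing among the saved checkpoints.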