qwen-click-dit / trainer_state.json
HusseinLezzaik's picture
Upload Qwen-Click-DiT model
5f12201 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 3750,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008,
"grad_norm": 4.96875,
"learning_rate": 7.964601769911505e-06,
"loss": 1.4112,
"step": 10
},
{
"epoch": 0.016,
"grad_norm": 5.3125,
"learning_rate": 1.6814159292035402e-05,
"loss": 1.4365,
"step": 20
},
{
"epoch": 0.024,
"grad_norm": 2.265625,
"learning_rate": 2.5663716814159294e-05,
"loss": 1.2118,
"step": 30
},
{
"epoch": 0.032,
"grad_norm": 2.78125,
"learning_rate": 3.451327433628319e-05,
"loss": 1.1222,
"step": 40
},
{
"epoch": 0.04,
"grad_norm": 1.953125,
"learning_rate": 4.3362831858407084e-05,
"loss": 1.0566,
"step": 50
},
{
"epoch": 0.048,
"grad_norm": 2.75,
"learning_rate": 5.221238938053098e-05,
"loss": 1.0392,
"step": 60
},
{
"epoch": 0.056,
"grad_norm": 2.578125,
"learning_rate": 6.106194690265487e-05,
"loss": 1.1753,
"step": 70
},
{
"epoch": 0.064,
"grad_norm": 1.421875,
"learning_rate": 6.991150442477876e-05,
"loss": 1.0422,
"step": 80
},
{
"epoch": 0.072,
"grad_norm": 3.765625,
"learning_rate": 7.876106194690266e-05,
"loss": 1.026,
"step": 90
},
{
"epoch": 0.08,
"grad_norm": 2.609375,
"learning_rate": 8.761061946902655e-05,
"loss": 1.0633,
"step": 100
},
{
"epoch": 0.088,
"grad_norm": 3.171875,
"learning_rate": 9.646017699115044e-05,
"loss": 1.1138,
"step": 110
},
{
"epoch": 0.096,
"grad_norm": 3.609375,
"learning_rate": 9.999932848660433e-05,
"loss": 1.1079,
"step": 120
},
{
"epoch": 0.104,
"grad_norm": 2.765625,
"learning_rate": 9.99952248589506e-05,
"loss": 1.0538,
"step": 130
},
{
"epoch": 0.112,
"grad_norm": 3.5625,
"learning_rate": 9.998739097245067e-05,
"loss": 0.9637,
"step": 140
},
{
"epoch": 0.12,
"grad_norm": 5.6875,
"learning_rate": 9.997582741160886e-05,
"loss": 0.8554,
"step": 150
},
{
"epoch": 0.128,
"grad_norm": 7.40625,
"learning_rate": 9.99605350392091e-05,
"loss": 0.6704,
"step": 160
},
{
"epoch": 0.136,
"grad_norm": 7.03125,
"learning_rate": 9.994151499625049e-05,
"loss": 0.8075,
"step": 170
},
{
"epoch": 0.144,
"grad_norm": 3.828125,
"learning_rate": 9.991876870186222e-05,
"loss": 0.7331,
"step": 180
},
{
"epoch": 0.152,
"grad_norm": 3.890625,
"learning_rate": 9.98922978531977e-05,
"loss": 0.7264,
"step": 190
},
{
"epoch": 0.16,
"grad_norm": 4.15625,
"learning_rate": 9.986210442530788e-05,
"loss": 0.5792,
"step": 200
},
{
"epoch": 0.168,
"grad_norm": 5.53125,
"learning_rate": 9.982819067099396e-05,
"loss": 0.6228,
"step": 210
},
{
"epoch": 0.176,
"grad_norm": 5.875,
"learning_rate": 9.979055912063925e-05,
"loss": 0.7417,
"step": 220
},
{
"epoch": 0.184,
"grad_norm": 3.71875,
"learning_rate": 9.974921258202036e-05,
"loss": 0.472,
"step": 230
},
{
"epoch": 0.192,
"grad_norm": 4.15625,
"learning_rate": 9.970415414009773e-05,
"loss": 0.6284,
"step": 240
},
{
"epoch": 0.2,
"grad_norm": 5.1875,
"learning_rate": 9.965538715678548e-05,
"loss": 0.5349,
"step": 250
},
{
"epoch": 0.208,
"grad_norm": 4.59375,
"learning_rate": 9.960291527070051e-05,
"loss": 0.5165,
"step": 260
},
{
"epoch": 0.216,
"grad_norm": 4.0625,
"learning_rate": 9.954674239689109e-05,
"loss": 0.5656,
"step": 270
},
{
"epoch": 0.224,
"grad_norm": 3.875,
"learning_rate": 9.948687272654464e-05,
"loss": 0.6713,
"step": 280
},
{
"epoch": 0.232,
"grad_norm": 5.375,
"learning_rate": 9.942331072667517e-05,
"loss": 0.4347,
"step": 290
},
{
"epoch": 0.24,
"grad_norm": 5.875,
"learning_rate": 9.935606113978981e-05,
"loss": 0.4404,
"step": 300
},
{
"epoch": 0.248,
"grad_norm": 4.28125,
"learning_rate": 9.92851289835351e-05,
"loss": 0.4865,
"step": 310
},
{
"epoch": 0.256,
"grad_norm": 6.46875,
"learning_rate": 9.921051955032253e-05,
"loss": 0.5393,
"step": 320
},
{
"epoch": 0.264,
"grad_norm": 9.0625,
"learning_rate": 9.913223840693375e-05,
"loss": 0.5358,
"step": 330
},
{
"epoch": 0.272,
"grad_norm": 5.0625,
"learning_rate": 9.905029139410508e-05,
"loss": 0.5756,
"step": 340
},
{
"epoch": 0.28,
"grad_norm": 10.625,
"learning_rate": 9.896468462609186e-05,
"loss": 0.4554,
"step": 350
},
{
"epoch": 0.288,
"grad_norm": 3.6875,
"learning_rate": 9.887542449021214e-05,
"loss": 0.3889,
"step": 360
},
{
"epoch": 0.296,
"grad_norm": 6.25,
"learning_rate": 9.878251764637023e-05,
"loss": 0.4468,
"step": 370
},
{
"epoch": 0.304,
"grad_norm": 5.5,
"learning_rate": 9.868597102655968e-05,
"loss": 0.4612,
"step": 380
},
{
"epoch": 0.312,
"grad_norm": 5.4375,
"learning_rate": 9.858579183434605e-05,
"loss": 0.6069,
"step": 390
},
{
"epoch": 0.32,
"grad_norm": 5.84375,
"learning_rate": 9.848198754432959e-05,
"loss": 0.4464,
"step": 400
},
{
"epoch": 0.328,
"grad_norm": 5.125,
"learning_rate": 9.837456590158738e-05,
"loss": 0.5598,
"step": 410
},
{
"epoch": 0.336,
"grad_norm": 3.671875,
"learning_rate": 9.826353492109555e-05,
"loss": 0.4534,
"step": 420
},
{
"epoch": 0.344,
"grad_norm": 7.34375,
"learning_rate": 9.814890288713121e-05,
"loss": 0.5155,
"step": 430
},
{
"epoch": 0.352,
"grad_norm": 6.875,
"learning_rate": 9.803067835265436e-05,
"loss": 0.484,
"step": 440
},
{
"epoch": 0.36,
"grad_norm": 3.875,
"learning_rate": 9.790887013866973e-05,
"loss": 0.4077,
"step": 450
},
{
"epoch": 0.368,
"grad_norm": 5.4375,
"learning_rate": 9.778348733356868e-05,
"loss": 0.3779,
"step": 460
},
{
"epoch": 0.376,
"grad_norm": 5.34375,
"learning_rate": 9.765453929245096e-05,
"loss": 0.521,
"step": 470
},
{
"epoch": 0.384,
"grad_norm": 4.8125,
"learning_rate": 9.752203563642688e-05,
"loss": 0.4114,
"step": 480
},
{
"epoch": 0.392,
"grad_norm": 8.25,
"learning_rate": 9.73859862518993e-05,
"loss": 0.4361,
"step": 490
},
{
"epoch": 0.4,
"grad_norm": 7.40625,
"learning_rate": 9.724640128982605e-05,
"loss": 0.518,
"step": 500
},
{
"epoch": 0.408,
"grad_norm": 5.5625,
"learning_rate": 9.710329116496259e-05,
"loss": 0.4413,
"step": 510
},
{
"epoch": 0.416,
"grad_norm": 5.46875,
"learning_rate": 9.695666655508483e-05,
"loss": 0.4276,
"step": 520
},
{
"epoch": 0.424,
"grad_norm": 5.375,
"learning_rate": 9.680653840019259e-05,
"loss": 0.4476,
"step": 530
},
{
"epoch": 0.432,
"grad_norm": 5.875,
"learning_rate": 9.665291790169311e-05,
"loss": 0.3562,
"step": 540
},
{
"epoch": 0.44,
"grad_norm": 6.4375,
"learning_rate": 9.649581652156559e-05,
"loss": 0.4511,
"step": 550
},
{
"epoch": 0.448,
"grad_norm": 5.1875,
"learning_rate": 9.633524598150568e-05,
"loss": 0.3985,
"step": 560
},
{
"epoch": 0.456,
"grad_norm": 4.1875,
"learning_rate": 9.617121826205116e-05,
"loss": 0.5117,
"step": 570
},
{
"epoch": 0.464,
"grad_norm": 5.5,
"learning_rate": 9.600374560168783e-05,
"loss": 0.4569,
"step": 580
},
{
"epoch": 0.472,
"grad_norm": 4.5,
"learning_rate": 9.583284049593652e-05,
"loss": 0.449,
"step": 590
},
{
"epoch": 0.48,
"grad_norm": 4.25,
"learning_rate": 9.56585156964207e-05,
"loss": 0.4769,
"step": 600
},
{
"epoch": 0.488,
"grad_norm": 5.9375,
"learning_rate": 9.548078420991506e-05,
"loss": 0.5081,
"step": 610
},
{
"epoch": 0.496,
"grad_norm": 7.4375,
"learning_rate": 9.529965929737506e-05,
"loss": 0.4803,
"step": 620
},
{
"epoch": 0.504,
"grad_norm": 4.59375,
"learning_rate": 9.511515447294748e-05,
"loss": 0.5015,
"step": 630
},
{
"epoch": 0.512,
"grad_norm": 6.71875,
"learning_rate": 9.49272835029621e-05,
"loss": 0.5174,
"step": 640
},
{
"epoch": 0.52,
"grad_norm": 4.875,
"learning_rate": 9.47360604049046e-05,
"loss": 0.4957,
"step": 650
},
{
"epoch": 0.528,
"grad_norm": 4.71875,
"learning_rate": 9.454149944637064e-05,
"loss": 0.379,
"step": 660
},
{
"epoch": 0.536,
"grad_norm": 4.96875,
"learning_rate": 9.434361514400132e-05,
"loss": 0.4857,
"step": 670
},
{
"epoch": 0.544,
"grad_norm": 5.71875,
"learning_rate": 9.414242226240012e-05,
"loss": 0.4595,
"step": 680
},
{
"epoch": 0.552,
"grad_norm": 6.8125,
"learning_rate": 9.393793581303116e-05,
"loss": 0.4157,
"step": 690
},
{
"epoch": 0.56,
"grad_norm": 5.03125,
"learning_rate": 9.37301710530993e-05,
"loss": 0.4432,
"step": 700
},
{
"epoch": 0.568,
"grad_norm": 6.75,
"learning_rate": 9.351914348441169e-05,
"loss": 0.4695,
"step": 710
},
{
"epoch": 0.576,
"grad_norm": 2.890625,
"learning_rate": 9.330486885222114e-05,
"loss": 0.3493,
"step": 720
},
{
"epoch": 0.584,
"grad_norm": 3.4375,
"learning_rate": 9.308736314405134e-05,
"loss": 0.4304,
"step": 730
},
{
"epoch": 0.592,
"grad_norm": 3.1875,
"learning_rate": 9.286664258850402e-05,
"loss": 0.5057,
"step": 740
},
{
"epoch": 0.6,
"grad_norm": 4.375,
"learning_rate": 9.264272365404805e-05,
"loss": 0.4159,
"step": 750
},
{
"epoch": 0.608,
"grad_norm": 4.3125,
"learning_rate": 9.241562304779072e-05,
"loss": 0.3647,
"step": 760
},
{
"epoch": 0.616,
"grad_norm": 10.3125,
"learning_rate": 9.21853577142312e-05,
"loss": 0.4851,
"step": 770
},
{
"epoch": 0.624,
"grad_norm": 6.3125,
"learning_rate": 9.195194483399625e-05,
"loss": 0.5071,
"step": 780
},
{
"epoch": 0.632,
"grad_norm": 4.34375,
"learning_rate": 9.17154018225583e-05,
"loss": 0.3939,
"step": 790
},
{
"epoch": 0.64,
"grad_norm": 4.03125,
"learning_rate": 9.147574632893611e-05,
"loss": 0.3762,
"step": 800
},
{
"epoch": 0.648,
"grad_norm": 3.1875,
"learning_rate": 9.12329962343779e-05,
"loss": 0.4427,
"step": 810
},
{
"epoch": 0.656,
"grad_norm": 3.890625,
"learning_rate": 9.098716965102716e-05,
"loss": 0.357,
"step": 820
},
{
"epoch": 0.664,
"grad_norm": 8.25,
"learning_rate": 9.073828492057133e-05,
"loss": 0.4071,
"step": 830
},
{
"epoch": 0.672,
"grad_norm": 5.78125,
"learning_rate": 9.048636061287325e-05,
"loss": 0.4037,
"step": 840
},
{
"epoch": 0.68,
"grad_norm": 3.84375,
"learning_rate": 9.023141552458559e-05,
"loss": 0.3884,
"step": 850
},
{
"epoch": 0.688,
"grad_norm": 3.453125,
"learning_rate": 8.997346867774839e-05,
"loss": 0.3641,
"step": 860
},
{
"epoch": 0.696,
"grad_norm": 7.875,
"learning_rate": 8.97125393183699e-05,
"loss": 0.5387,
"step": 870
},
{
"epoch": 0.704,
"grad_norm": 4.6875,
"learning_rate": 8.94486469149904e-05,
"loss": 0.4085,
"step": 880
},
{
"epoch": 0.712,
"grad_norm": 6.03125,
"learning_rate": 8.918181115722976e-05,
"loss": 0.4055,
"step": 890
},
{
"epoch": 0.72,
"grad_norm": 7.375,
"learning_rate": 8.891205195431831e-05,
"loss": 0.42,
"step": 900
},
{
"epoch": 0.728,
"grad_norm": 3.21875,
"learning_rate": 8.863938943361128e-05,
"loss": 0.3372,
"step": 910
},
{
"epoch": 0.736,
"grad_norm": 3.65625,
"learning_rate": 8.836384393908721e-05,
"loss": 0.4544,
"step": 920
},
{
"epoch": 0.744,
"grad_norm": 3.703125,
"learning_rate": 8.808543602982993e-05,
"loss": 0.4979,
"step": 930
},
{
"epoch": 0.752,
"grad_norm": 2.90625,
"learning_rate": 8.780418647849458e-05,
"loss": 0.3366,
"step": 940
},
{
"epoch": 0.76,
"grad_norm": 4.53125,
"learning_rate": 8.752011626975781e-05,
"loss": 0.3778,
"step": 950
},
{
"epoch": 0.768,
"grad_norm": 3.4375,
"learning_rate": 8.723324659875201e-05,
"loss": 0.4498,
"step": 960
},
{
"epoch": 0.776,
"grad_norm": 4.90625,
"learning_rate": 8.694359886948384e-05,
"loss": 0.4232,
"step": 970
},
{
"epoch": 0.784,
"grad_norm": 4.15625,
"learning_rate": 8.665119469323737e-05,
"loss": 0.3602,
"step": 980
},
{
"epoch": 0.792,
"grad_norm": 5.375,
"learning_rate": 8.635605588696148e-05,
"loss": 0.4095,
"step": 990
},
{
"epoch": 0.8,
"grad_norm": 3.5625,
"learning_rate": 8.605820447164206e-05,
"loss": 0.405,
"step": 1000
},
{
"epoch": 0.808,
"grad_norm": 4.40625,
"learning_rate": 8.575766267065905e-05,
"loss": 0.3137,
"step": 1010
},
{
"epoch": 0.816,
"grad_norm": 5.65625,
"learning_rate": 8.54544529081283e-05,
"loss": 0.3701,
"step": 1020
},
{
"epoch": 0.824,
"grad_norm": 5.0,
"learning_rate": 8.514859780722833e-05,
"loss": 0.3759,
"step": 1030
},
{
"epoch": 0.832,
"grad_norm": 4.9375,
"learning_rate": 8.484012018851246e-05,
"loss": 0.3032,
"step": 1040
},
{
"epoch": 0.84,
"grad_norm": 4.375,
"learning_rate": 8.452904306820618e-05,
"loss": 0.4171,
"step": 1050
},
{
"epoch": 0.848,
"grad_norm": 4.53125,
"learning_rate": 8.421538965648966e-05,
"loss": 0.374,
"step": 1060
},
{
"epoch": 0.856,
"grad_norm": 4.3125,
"learning_rate": 8.389918335576623e-05,
"loss": 0.3358,
"step": 1070
},
{
"epoch": 0.864,
"grad_norm": 4.21875,
"learning_rate": 8.358044775891605e-05,
"loss": 0.3586,
"step": 1080
},
{
"epoch": 0.872,
"grad_norm": 4.15625,
"learning_rate": 8.325920664753595e-05,
"loss": 0.4036,
"step": 1090
},
{
"epoch": 0.88,
"grad_norm": 4.625,
"learning_rate": 8.293548399016491e-05,
"loss": 0.3673,
"step": 1100
},
{
"epoch": 0.888,
"grad_norm": 2.625,
"learning_rate": 8.260930394049583e-05,
"loss": 0.3388,
"step": 1110
},
{
"epoch": 0.896,
"grad_norm": 4.78125,
"learning_rate": 8.228069083557328e-05,
"loss": 0.427,
"step": 1120
},
{
"epoch": 0.904,
"grad_norm": 4.21875,
"learning_rate": 8.194966919397767e-05,
"loss": 0.3926,
"step": 1130
},
{
"epoch": 0.912,
"grad_norm": 5.84375,
"learning_rate": 8.161626371399591e-05,
"loss": 0.3654,
"step": 1140
},
{
"epoch": 0.92,
"grad_norm": 3.71875,
"learning_rate": 8.128049927177854e-05,
"loss": 0.4047,
"step": 1150
},
{
"epoch": 0.928,
"grad_norm": 5.0,
"learning_rate": 8.094240091948375e-05,
"loss": 0.4114,
"step": 1160
},
{
"epoch": 0.936,
"grad_norm": 6.625,
"learning_rate": 8.06019938834081e-05,
"loss": 0.4485,
"step": 1170
},
{
"epoch": 0.944,
"grad_norm": 4.4375,
"learning_rate": 8.025930356210439e-05,
"loss": 0.3833,
"step": 1180
},
{
"epoch": 0.952,
"grad_norm": 3.171875,
"learning_rate": 7.991435552448657e-05,
"loss": 0.3742,
"step": 1190
},
{
"epoch": 0.96,
"grad_norm": 4.65625,
"learning_rate": 7.956717550792199e-05,
"loss": 0.3284,
"step": 1200
},
{
"epoch": 0.968,
"grad_norm": 4.4375,
"learning_rate": 7.921778941631113e-05,
"loss": 0.3862,
"step": 1210
},
{
"epoch": 0.976,
"grad_norm": 8.9375,
"learning_rate": 7.886622331815477e-05,
"loss": 0.4144,
"step": 1220
},
{
"epoch": 0.984,
"grad_norm": 3.875,
"learning_rate": 7.851250344460902e-05,
"loss": 0.3654,
"step": 1230
},
{
"epoch": 0.992,
"grad_norm": 2.59375,
"learning_rate": 7.815665618752812e-05,
"loss": 0.3808,
"step": 1240
},
{
"epoch": 1.0,
"grad_norm": 5.0625,
"learning_rate": 7.77987080974953e-05,
"loss": 0.3482,
"step": 1250
},
{
"epoch": 1.008,
"grad_norm": 3.28125,
"learning_rate": 7.743868588184176e-05,
"loss": 0.3312,
"step": 1260
},
{
"epoch": 1.016,
"grad_norm": 4.375,
"learning_rate": 7.707661640265401e-05,
"loss": 0.37,
"step": 1270
},
{
"epoch": 1.024,
"grad_norm": 4.40625,
"learning_rate": 7.67125266747696e-05,
"loss": 0.3253,
"step": 1280
},
{
"epoch": 1.032,
"grad_norm": 4.0625,
"learning_rate": 7.634644386376149e-05,
"loss": 0.4361,
"step": 1290
},
{
"epoch": 1.04,
"grad_norm": 3.953125,
"learning_rate": 7.597839528391114e-05,
"loss": 0.3981,
"step": 1300
},
{
"epoch": 1.048,
"grad_norm": 7.6875,
"learning_rate": 7.560840839617056e-05,
"loss": 0.3634,
"step": 1310
},
{
"epoch": 1.056,
"grad_norm": 4.03125,
"learning_rate": 7.523651080611341e-05,
"loss": 0.3653,
"step": 1320
},
{
"epoch": 1.064,
"grad_norm": 3.09375,
"learning_rate": 7.48627302618752e-05,
"loss": 0.3433,
"step": 1330
},
{
"epoch": 1.072,
"grad_norm": 4.21875,
"learning_rate": 7.448709465208299e-05,
"loss": 0.3587,
"step": 1340
},
{
"epoch": 1.08,
"grad_norm": 5.375,
"learning_rate": 7.410963200377458e-05,
"loss": 0.346,
"step": 1350
},
{
"epoch": 1.088,
"grad_norm": 4.0625,
"learning_rate": 7.373037048030731e-05,
"loss": 0.4562,
"step": 1360
},
{
"epoch": 1.096,
"grad_norm": 3.75,
"learning_rate": 7.334933837925675e-05,
"loss": 0.4333,
"step": 1370
},
{
"epoch": 1.104,
"grad_norm": 3.46875,
"learning_rate": 7.296656413030531e-05,
"loss": 0.306,
"step": 1380
},
{
"epoch": 1.112,
"grad_norm": 5.25,
"learning_rate": 7.25820762931211e-05,
"loss": 0.4095,
"step": 1390
},
{
"epoch": 1.12,
"grad_norm": 3.578125,
"learning_rate": 7.219590355522697e-05,
"loss": 0.369,
"step": 1400
},
{
"epoch": 1.1280000000000001,
"grad_norm": 4.96875,
"learning_rate": 7.180807472986009e-05,
"loss": 0.3763,
"step": 1410
},
{
"epoch": 1.1360000000000001,
"grad_norm": 5.25,
"learning_rate": 7.141861875382215e-05,
"loss": 0.4269,
"step": 1420
},
{
"epoch": 1.144,
"grad_norm": 3.53125,
"learning_rate": 7.102756468532027e-05,
"loss": 0.4017,
"step": 1430
},
{
"epoch": 1.152,
"grad_norm": 4.6875,
"learning_rate": 7.063494170179898e-05,
"loss": 0.3601,
"step": 1440
},
{
"epoch": 1.16,
"grad_norm": 3.8125,
"learning_rate": 7.024077909776309e-05,
"loss": 0.3678,
"step": 1450
},
{
"epoch": 1.168,
"grad_norm": 5.25,
"learning_rate": 6.984510628259212e-05,
"loss": 0.3732,
"step": 1460
},
{
"epoch": 1.176,
"grad_norm": 2.140625,
"learning_rate": 6.94479527783459e-05,
"loss": 0.3332,
"step": 1470
},
{
"epoch": 1.184,
"grad_norm": 4.5,
"learning_rate": 6.904934821756184e-05,
"loss": 0.3887,
"step": 1480
},
{
"epoch": 1.192,
"grad_norm": 2.296875,
"learning_rate": 6.864932234104409e-05,
"loss": 0.3196,
"step": 1490
},
{
"epoch": 1.2,
"grad_norm": 4.15625,
"learning_rate": 6.824790499564435e-05,
"loss": 0.3256,
"step": 1500
},
{
"epoch": 1.208,
"grad_norm": 3.6875,
"learning_rate": 6.784512613203511e-05,
"loss": 0.3074,
"step": 1510
},
{
"epoch": 1.216,
"grad_norm": 3.0,
"learning_rate": 6.744101580247481e-05,
"loss": 0.35,
"step": 1520
},
{
"epoch": 1.224,
"grad_norm": 3.828125,
"learning_rate": 6.703560415856565e-05,
"loss": 0.3731,
"step": 1530
},
{
"epoch": 1.232,
"grad_norm": 5.625,
"learning_rate": 6.662892144900388e-05,
"loss": 0.3769,
"step": 1540
},
{
"epoch": 1.24,
"grad_norm": 8.1875,
"learning_rate": 6.62209980173229e-05,
"loss": 0.4385,
"step": 1550
},
{
"epoch": 1.248,
"grad_norm": 5.9375,
"learning_rate": 6.581186429962922e-05,
"loss": 0.3563,
"step": 1560
},
{
"epoch": 1.256,
"grad_norm": 3.53125,
"learning_rate": 6.54015508223316e-05,
"loss": 0.3225,
"step": 1570
},
{
"epoch": 1.264,
"grad_norm": 4.125,
"learning_rate": 6.499008819986339e-05,
"loss": 0.3246,
"step": 1580
},
{
"epoch": 1.272,
"grad_norm": 4.34375,
"learning_rate": 6.457750713239828e-05,
"loss": 0.304,
"step": 1590
},
{
"epoch": 1.28,
"grad_norm": 4.75,
"learning_rate": 6.41638384035597e-05,
"loss": 0.383,
"step": 1600
},
{
"epoch": 1.288,
"grad_norm": 4.46875,
"learning_rate": 6.374911287812406e-05,
"loss": 0.331,
"step": 1610
},
{
"epoch": 1.296,
"grad_norm": 3.984375,
"learning_rate": 6.333336149971776e-05,
"loss": 0.3022,
"step": 1620
},
{
"epoch": 1.304,
"grad_norm": 5.53125,
"learning_rate": 6.291661528850844e-05,
"loss": 0.3257,
"step": 1630
},
{
"epoch": 1.312,
"grad_norm": 4.03125,
"learning_rate": 6.249890533889054e-05,
"loss": 0.3071,
"step": 1640
},
{
"epoch": 1.32,
"grad_norm": 4.25,
"learning_rate": 6.208026281716521e-05,
"loss": 0.3833,
"step": 1650
},
{
"epoch": 1.328,
"grad_norm": 3.109375,
"learning_rate": 6.166071895921496e-05,
"loss": 0.3378,
"step": 1660
},
{
"epoch": 1.336,
"grad_norm": 3.5,
"learning_rate": 6.124030506817309e-05,
"loss": 0.31,
"step": 1670
},
{
"epoch": 1.3439999999999999,
"grad_norm": 2.65625,
"learning_rate": 6.0819052512088057e-05,
"loss": 0.3139,
"step": 1680
},
{
"epoch": 1.3519999999999999,
"grad_norm": 4.90625,
"learning_rate": 6.039699272158305e-05,
"loss": 0.4388,
"step": 1690
},
{
"epoch": 1.3599999999999999,
"grad_norm": 5.59375,
"learning_rate": 5.997415718751086e-05,
"loss": 0.3989,
"step": 1700
},
{
"epoch": 1.3679999999999999,
"grad_norm": 5.5,
"learning_rate": 5.955057745860435e-05,
"loss": 0.3977,
"step": 1710
},
{
"epoch": 1.376,
"grad_norm": 6.25,
"learning_rate": 5.9126285139122406e-05,
"loss": 0.3527,
"step": 1720
},
{
"epoch": 1.384,
"grad_norm": 3.28125,
"learning_rate": 5.8701311886491947e-05,
"loss": 0.4044,
"step": 1730
},
{
"epoch": 1.392,
"grad_norm": 5.5625,
"learning_rate": 5.827568940894593e-05,
"loss": 0.374,
"step": 1740
},
{
"epoch": 1.4,
"grad_norm": 3.625,
"learning_rate": 5.7849449463157435e-05,
"loss": 0.3479,
"step": 1750
},
{
"epoch": 1.408,
"grad_norm": 4.84375,
"learning_rate": 5.742262385187028e-05,
"loss": 0.3666,
"step": 1760
},
{
"epoch": 1.416,
"grad_norm": 5.4375,
"learning_rate": 5.699524442152613e-05,
"loss": 0.3707,
"step": 1770
},
{
"epoch": 1.424,
"grad_norm": 4.4375,
"learning_rate": 5.656734305988839e-05,
"loss": 0.3847,
"step": 1780
},
{
"epoch": 1.432,
"grad_norm": 3.96875,
"learning_rate": 5.613895169366292e-05,
"loss": 0.3515,
"step": 1790
},
{
"epoch": 1.44,
"grad_norm": 3.734375,
"learning_rate": 5.571010228611597e-05,
"loss": 0.3763,
"step": 1800
},
{
"epoch": 1.448,
"grad_norm": 5.71875,
"learning_rate": 5.528082683468934e-05,
"loss": 0.3548,
"step": 1810
},
{
"epoch": 1.456,
"grad_norm": 5.03125,
"learning_rate": 5.485115736861288e-05,
"loss": 0.3903,
"step": 1820
},
{
"epoch": 1.464,
"grad_norm": 4.375,
"learning_rate": 5.442112594651484e-05,
"loss": 0.257,
"step": 1830
},
{
"epoch": 1.472,
"grad_norm": 5.46875,
"learning_rate": 5.399076465402979e-05,
"loss": 0.3424,
"step": 1840
},
{
"epoch": 1.48,
"grad_norm": 3.40625,
"learning_rate": 5.356010560140475e-05,
"loss": 0.3317,
"step": 1850
},
{
"epoch": 1.488,
"grad_norm": 5.0,
"learning_rate": 5.312918092110325e-05,
"loss": 0.2753,
"step": 1860
},
{
"epoch": 1.496,
"grad_norm": 3.84375,
"learning_rate": 5.269802276540795e-05,
"loss": 0.3318,
"step": 1870
},
{
"epoch": 1.504,
"grad_norm": 5.71875,
"learning_rate": 5.226666330402164e-05,
"loss": 0.3836,
"step": 1880
},
{
"epoch": 1.512,
"grad_norm": 3.671875,
"learning_rate": 5.1835134721666956e-05,
"loss": 0.3498,
"step": 1890
},
{
"epoch": 1.52,
"grad_norm": 2.90625,
"learning_rate": 5.1403469215685094e-05,
"loss": 0.4228,
"step": 1900
},
{
"epoch": 1.528,
"grad_norm": 4.4375,
"learning_rate": 5.097169899363342e-05,
"loss": 0.3703,
"step": 1910
},
{
"epoch": 1.536,
"grad_norm": 3.90625,
"learning_rate": 5.053985627088238e-05,
"loss": 0.3816,
"step": 1920
},
{
"epoch": 1.544,
"grad_norm": 2.71875,
"learning_rate": 5.010797326821189e-05,
"loss": 0.3842,
"step": 1930
},
{
"epoch": 1.552,
"grad_norm": 3.875,
"learning_rate": 4.9676082209407254e-05,
"loss": 0.3848,
"step": 1940
},
{
"epoch": 1.56,
"grad_norm": 3.484375,
"learning_rate": 4.924421531885481e-05,
"loss": 0.3416,
"step": 1950
},
{
"epoch": 1.568,
"grad_norm": 4.1875,
"learning_rate": 4.881240481913773e-05,
"loss": 0.3407,
"step": 1960
},
{
"epoch": 1.576,
"grad_norm": 3.921875,
"learning_rate": 4.838068292863164e-05,
"loss": 0.3319,
"step": 1970
},
{
"epoch": 1.584,
"grad_norm": 4.4375,
"learning_rate": 4.7949081859100896e-05,
"loss": 0.3979,
"step": 1980
},
{
"epoch": 1.592,
"grad_norm": 4.84375,
"learning_rate": 4.7517633813295114e-05,
"loss": 0.4184,
"step": 1990
},
{
"epoch": 1.6,
"grad_norm": 3.890625,
"learning_rate": 4.708637098254644e-05,
"loss": 0.3959,
"step": 2000
},
{
"epoch": 1.608,
"grad_norm": 4.8125,
"learning_rate": 4.6655325544367715e-05,
"loss": 0.313,
"step": 2010
},
{
"epoch": 1.616,
"grad_norm": 4.21875,
"learning_rate": 4.6224529660051593e-05,
"loss": 0.3012,
"step": 2020
},
{
"epoch": 1.624,
"grad_norm": 3.734375,
"learning_rate": 4.579401547227096e-05,
"loss": 0.2532,
"step": 2030
},
{
"epoch": 1.6320000000000001,
"grad_norm": 2.953125,
"learning_rate": 4.53638151026807e-05,
"loss": 0.2714,
"step": 2040
},
{
"epoch": 1.6400000000000001,
"grad_norm": 4.1875,
"learning_rate": 4.493396064952093e-05,
"loss": 0.3468,
"step": 2050
},
{
"epoch": 1.6480000000000001,
"grad_norm": 5.125,
"learning_rate": 4.450448418522221e-05,
"loss": 0.4547,
"step": 2060
},
{
"epoch": 1.6560000000000001,
"grad_norm": 3.5,
"learning_rate": 4.4075417754012475e-05,
"loss": 0.2839,
"step": 2070
},
{
"epoch": 1.6640000000000001,
"grad_norm": 5.84375,
"learning_rate": 4.364679336952609e-05,
"loss": 0.3426,
"step": 2080
},
{
"epoch": 1.6720000000000002,
"grad_norm": 4.84375,
"learning_rate": 4.321864301241535e-05,
"loss": 0.3325,
"step": 2090
},
{
"epoch": 1.6800000000000002,
"grad_norm": 4.46875,
"learning_rate": 4.279099862796427e-05,
"loss": 0.314,
"step": 2100
},
{
"epoch": 1.688,
"grad_norm": 2.8125,
"learning_rate": 4.23638921237051e-05,
"loss": 0.4189,
"step": 2110
},
{
"epoch": 1.696,
"grad_norm": 3.59375,
"learning_rate": 4.1937355367037516e-05,
"loss": 0.3436,
"step": 2120
},
{
"epoch": 1.704,
"grad_norm": 6.21875,
"learning_rate": 4.151142018285112e-05,
"loss": 0.3681,
"step": 2130
},
{
"epoch": 1.712,
"grad_norm": 4.0625,
"learning_rate": 4.1086118351150785e-05,
"loss": 0.3716,
"step": 2140
},
{
"epoch": 1.72,
"grad_norm": 2.703125,
"learning_rate": 4.066148160468543e-05,
"loss": 0.2761,
"step": 2150
},
{
"epoch": 1.728,
"grad_norm": 3.765625,
"learning_rate": 4.023754162658051e-05,
"loss": 0.2904,
"step": 2160
},
{
"epoch": 1.736,
"grad_norm": 5.4375,
"learning_rate": 3.981433004797395e-05,
"loss": 0.3563,
"step": 2170
},
{
"epoch": 1.744,
"grad_norm": 4.125,
"learning_rate": 3.939187844565616e-05,
"loss": 0.3248,
"step": 2180
},
{
"epoch": 1.752,
"grad_norm": 5.71875,
"learning_rate": 3.897021833971386e-05,
"loss": 0.3246,
"step": 2190
},
{
"epoch": 1.76,
"grad_norm": 4.21875,
"learning_rate": 3.8549381191178516e-05,
"loss": 0.4073,
"step": 2200
},
{
"epoch": 1.768,
"grad_norm": 4.4375,
"learning_rate": 3.8129398399678814e-05,
"loss": 0.4147,
"step": 2210
},
{
"epoch": 1.776,
"grad_norm": 4.4375,
"learning_rate": 3.771030130109785e-05,
"loss": 0.2378,
"step": 2220
},
{
"epoch": 1.784,
"grad_norm": 3.65625,
"learning_rate": 3.729212116523518e-05,
"loss": 0.3305,
"step": 2230
},
{
"epoch": 1.792,
"grad_norm": 3.09375,
"learning_rate": 3.6874889193473646e-05,
"loss": 0.3865,
"step": 2240
},
{
"epoch": 1.8,
"grad_norm": 4.5,
"learning_rate": 3.64586365164514e-05,
"loss": 0.3443,
"step": 2250
},
{
"epoch": 1.808,
"grad_norm": 2.71875,
"learning_rate": 3.604339419173912e-05,
"loss": 0.2762,
"step": 2260
},
{
"epoch": 1.8159999999999998,
"grad_norm": 5.15625,
"learning_rate": 3.5629193201522794e-05,
"loss": 0.3787,
"step": 2270
},
{
"epoch": 1.8239999999999998,
"grad_norm": 5.53125,
"learning_rate": 3.521606445029208e-05,
"loss": 0.4157,
"step": 2280
},
{
"epoch": 1.8319999999999999,
"grad_norm": 5.34375,
"learning_rate": 3.480403876253432e-05,
"loss": 0.3345,
"step": 2290
},
{
"epoch": 1.8399999999999999,
"grad_norm": 3.3125,
"learning_rate": 3.4393146880434845e-05,
"loss": 0.3111,
"step": 2300
},
{
"epoch": 1.8479999999999999,
"grad_norm": 4.03125,
"learning_rate": 3.398341946158311e-05,
"loss": 0.3763,
"step": 2310
},
{
"epoch": 1.8559999999999999,
"grad_norm": 3.703125,
"learning_rate": 3.357488707668529e-05,
"loss": 0.3246,
"step": 2320
},
{
"epoch": 1.8639999999999999,
"grad_norm": 4.65625,
"learning_rate": 3.316758020728327e-05,
"loss": 0.3852,
"step": 2330
},
{
"epoch": 1.8719999999999999,
"grad_norm": 3.859375,
"learning_rate": 3.276152924348046e-05,
"loss": 0.3295,
"step": 2340
},
{
"epoch": 1.88,
"grad_norm": 5.15625,
"learning_rate": 3.2356764481674254e-05,
"loss": 0.3567,
"step": 2350
},
{
"epoch": 1.888,
"grad_norm": 3.421875,
"learning_rate": 3.1953316122295554e-05,
"loss": 0.3091,
"step": 2360
},
{
"epoch": 1.896,
"grad_norm": 3.265625,
"learning_rate": 3.1551214267555416e-05,
"loss": 0.3847,
"step": 2370
},
{
"epoch": 1.904,
"grad_norm": 6.09375,
"learning_rate": 3.1150488919199124e-05,
"loss": 0.3958,
"step": 2380
},
{
"epoch": 1.912,
"grad_norm": 4.03125,
"learning_rate": 3.075116997626764e-05,
"loss": 0.384,
"step": 2390
},
{
"epoch": 1.92,
"grad_norm": 4.0625,
"learning_rate": 3.0353287232866736e-05,
"loss": 0.3349,
"step": 2400
},
{
"epoch": 1.928,
"grad_norm": 4.375,
"learning_rate": 2.995687037594408e-05,
"loss": 0.3801,
"step": 2410
},
{
"epoch": 1.936,
"grad_norm": 5.09375,
"learning_rate": 2.9561948983074174e-05,
"loss": 0.3281,
"step": 2420
},
{
"epoch": 1.944,
"grad_norm": 6.8125,
"learning_rate": 2.916855252025149e-05,
"loss": 0.3549,
"step": 2430
},
{
"epoch": 1.952,
"grad_norm": 4.125,
"learning_rate": 2.877671033969193e-05,
"loss": 0.4092,
"step": 2440
},
{
"epoch": 1.96,
"grad_norm": 4.21875,
"learning_rate": 2.8386451677642878e-05,
"loss": 0.3866,
"step": 2450
},
{
"epoch": 1.968,
"grad_norm": 3.078125,
"learning_rate": 2.7997805652201714e-05,
"loss": 0.3484,
"step": 2460
},
{
"epoch": 1.976,
"grad_norm": 4.09375,
"learning_rate": 2.7610801261143283e-05,
"loss": 0.3496,
"step": 2470
},
{
"epoch": 1.984,
"grad_norm": 4.6875,
"learning_rate": 2.7225467379756314e-05,
"loss": 0.3691,
"step": 2480
},
{
"epoch": 1.992,
"grad_norm": 5.5,
"learning_rate": 2.6841832758689002e-05,
"loss": 0.3698,
"step": 2490
},
{
"epoch": 2.0,
"grad_norm": 3.875,
"learning_rate": 2.645992602180377e-05,
"loss": 0.3577,
"step": 2500
},
{
"epoch": 2.008,
"grad_norm": 4.40625,
"learning_rate": 2.607977566404164e-05,
"loss": 0.2871,
"step": 2510
},
{
"epoch": 2.016,
"grad_norm": 4.0625,
"learning_rate": 2.570141004929612e-05,
"loss": 0.3426,
"step": 2520
},
{
"epoch": 2.024,
"grad_norm": 3.5,
"learning_rate": 2.5324857408296994e-05,
"loss": 0.2656,
"step": 2530
},
{
"epoch": 2.032,
"grad_norm": 3.578125,
"learning_rate": 2.4950145836503836e-05,
"loss": 0.3473,
"step": 2540
},
{
"epoch": 2.04,
"grad_norm": 3.953125,
"learning_rate": 2.4577303292009822e-05,
"loss": 0.3588,
"step": 2550
},
{
"epoch": 2.048,
"grad_norm": 4.75,
"learning_rate": 2.4206357593455743e-05,
"loss": 0.3953,
"step": 2560
},
{
"epoch": 2.056,
"grad_norm": 3.96875,
"learning_rate": 2.383733641795428e-05,
"loss": 0.3209,
"step": 2570
},
{
"epoch": 2.064,
"grad_norm": 2.515625,
"learning_rate": 2.3470267299025068e-05,
"loss": 0.3299,
"step": 2580
},
{
"epoch": 2.072,
"grad_norm": 3.375,
"learning_rate": 2.3105177624540252e-05,
"loss": 0.2311,
"step": 2590
},
{
"epoch": 2.08,
"grad_norm": 4.46875,
"learning_rate": 2.274209463468117e-05,
"loss": 0.3035,
"step": 2600
},
{
"epoch": 2.088,
"grad_norm": 2.578125,
"learning_rate": 2.2381045419905655e-05,
"loss": 0.3344,
"step": 2610
},
{
"epoch": 2.096,
"grad_norm": 5.28125,
"learning_rate": 2.2022056918927037e-05,
"loss": 0.2794,
"step": 2620
},
{
"epoch": 2.104,
"grad_norm": 5.25,
"learning_rate": 2.166515591670394e-05,
"loss": 0.3416,
"step": 2630
},
{
"epoch": 2.112,
"grad_norm": 3.96875,
"learning_rate": 2.1310369042441985e-05,
"loss": 0.3152,
"step": 2640
},
{
"epoch": 2.12,
"grad_norm": 3.203125,
"learning_rate": 2.0957722767606774e-05,
"loss": 0.3015,
"step": 2650
},
{
"epoch": 2.128,
"grad_norm": 3.78125,
"learning_rate": 2.0607243403948863e-05,
"loss": 0.3843,
"step": 2660
},
{
"epoch": 2.136,
"grad_norm": 6.34375,
"learning_rate": 2.0258957101540625e-05,
"loss": 0.3299,
"step": 2670
},
{
"epoch": 2.144,
"grad_norm": 4.1875,
"learning_rate": 1.9912889846825038e-05,
"loss": 0.3636,
"step": 2680
},
{
"epoch": 2.152,
"grad_norm": 3.6875,
"learning_rate": 1.956906746067683e-05,
"loss": 0.3596,
"step": 2690
},
{
"epoch": 2.16,
"grad_norm": 2.984375,
"learning_rate": 1.922751559647591e-05,
"loss": 0.3796,
"step": 2700
},
{
"epoch": 2.168,
"grad_norm": 3.21875,
"learning_rate": 1.888825973819336e-05,
"loss": 0.3175,
"step": 2710
},
{
"epoch": 2.176,
"grad_norm": 4.03125,
"learning_rate": 1.8551325198489887e-05,
"loss": 0.2928,
"step": 2720
},
{
"epoch": 2.184,
"grad_norm": 4.1875,
"learning_rate": 1.8216737116827378e-05,
"loss": 0.2791,
"step": 2730
},
{
"epoch": 2.192,
"grad_norm": 5.71875,
"learning_rate": 1.7884520457592984e-05,
"loss": 0.3925,
"step": 2740
},
{
"epoch": 2.2,
"grad_norm": 3.28125,
"learning_rate": 1.755470000823667e-05,
"loss": 0.2967,
"step": 2750
},
{
"epoch": 2.208,
"grad_norm": 2.203125,
"learning_rate": 1.7227300377421574e-05,
"loss": 0.2475,
"step": 2760
},
{
"epoch": 2.216,
"grad_norm": 3.515625,
"learning_rate": 1.6902345993188017e-05,
"loss": 0.34,
"step": 2770
},
{
"epoch": 2.224,
"grad_norm": 3.453125,
"learning_rate": 1.6579861101130896e-05,
"loss": 0.3418,
"step": 2780
},
{
"epoch": 2.232,
"grad_norm": 6.5,
"learning_rate": 1.6259869762590503e-05,
"loss": 0.4639,
"step": 2790
},
{
"epoch": 2.24,
"grad_norm": 5.0625,
"learning_rate": 1.5942395852857466e-05,
"loss": 0.4252,
"step": 2800
},
{
"epoch": 2.248,
"grad_norm": 4.71875,
"learning_rate": 1.5627463059391173e-05,
"loss": 0.3562,
"step": 2810
},
{
"epoch": 2.2560000000000002,
"grad_norm": 3.78125,
"learning_rate": 1.531509488005257e-05,
"loss": 0.2792,
"step": 2820
},
{
"epoch": 2.2640000000000002,
"grad_norm": 5.25,
"learning_rate": 1.5005314621350709e-05,
"loss": 0.2659,
"step": 2830
},
{
"epoch": 2.2720000000000002,
"grad_norm": 4.03125,
"learning_rate": 1.4698145396704044e-05,
"loss": 0.2647,
"step": 2840
},
{
"epoch": 2.2800000000000002,
"grad_norm": 5.3125,
"learning_rate": 1.4393610124715696e-05,
"loss": 0.2826,
"step": 2850
},
{
"epoch": 2.288,
"grad_norm": 4.59375,
"learning_rate": 1.4091731527463526e-05,
"loss": 0.2643,
"step": 2860
},
{
"epoch": 2.296,
"grad_norm": 5.75,
"learning_rate": 1.3792532128804803e-05,
"loss": 0.3758,
"step": 2870
},
{
"epoch": 2.304,
"grad_norm": 6.25,
"learning_rate": 1.3496034252695599e-05,
"loss": 0.3,
"step": 2880
},
{
"epoch": 2.312,
"grad_norm": 5.1875,
"learning_rate": 1.3202260021525158e-05,
"loss": 0.3376,
"step": 2890
},
{
"epoch": 2.32,
"grad_norm": 4.90625,
"learning_rate": 1.2911231354465303e-05,
"loss": 0.3686,
"step": 2900
},
{
"epoch": 2.328,
"grad_norm": 3.859375,
"learning_rate": 1.262296996583504e-05,
"loss": 0.3372,
"step": 2910
},
{
"epoch": 2.336,
"grad_norm": 3.421875,
"learning_rate": 1.2337497363480317e-05,
"loss": 0.3071,
"step": 2920
},
{
"epoch": 2.344,
"grad_norm": 4.15625,
"learning_rate": 1.2054834847169316e-05,
"loss": 0.3724,
"step": 2930
},
{
"epoch": 2.352,
"grad_norm": 4.21875,
"learning_rate": 1.1775003507003236e-05,
"loss": 0.2919,
"step": 2940
},
{
"epoch": 2.36,
"grad_norm": 4.0625,
"learning_rate": 1.1498024221842735e-05,
"loss": 0.2496,
"step": 2950
},
{
"epoch": 2.368,
"grad_norm": 5.28125,
"learning_rate": 1.1223917657750033e-05,
"loss": 0.265,
"step": 2960
},
{
"epoch": 2.376,
"grad_norm": 6.03125,
"learning_rate": 1.095270426644705e-05,
"loss": 0.3083,
"step": 2970
},
{
"epoch": 2.384,
"grad_norm": 4.25,
"learning_rate": 1.0684404283789385e-05,
"loss": 0.3392,
"step": 2980
},
{
"epoch": 2.392,
"grad_norm": 3.765625,
"learning_rate": 1.0419037728256564e-05,
"loss": 0.3743,
"step": 2990
},
{
"epoch": 2.4,
"grad_norm": 4.5625,
"learning_rate": 1.015662439945832e-05,
"loss": 0.3846,
"step": 3000
},
{
"epoch": 2.408,
"grad_norm": 4.28125,
"learning_rate": 9.89718387665734e-06,
"loss": 0.2818,
"step": 3010
},
{
"epoch": 2.416,
"grad_norm": 4.1875,
"learning_rate": 9.640735517308435e-06,
"loss": 0.3442,
"step": 3020
},
{
"epoch": 2.424,
"grad_norm": 5.28125,
"learning_rate": 9.387298455614191e-06,
"loss": 0.2982,
"step": 3030
},
{
"epoch": 2.432,
"grad_norm": 7.5,
"learning_rate": 9.136891601097347e-06,
"loss": 0.3924,
"step": 3040
},
{
"epoch": 2.44,
"grad_norm": 3.15625,
"learning_rate": 8.889533637189895e-06,
"loss": 0.2838,
"step": 3050
},
{
"epoch": 2.448,
"grad_norm": 6.125,
"learning_rate": 8.645243019839112e-06,
"loss": 0.3035,
"step": 3060
},
{
"epoch": 2.456,
"grad_norm": 3.609375,
"learning_rate": 8.404037976130458e-06,
"loss": 0.3713,
"step": 3070
},
{
"epoch": 2.464,
"grad_norm": 4.84375,
"learning_rate": 8.16593650292764e-06,
"loss": 0.3242,
"step": 3080
},
{
"epoch": 2.472,
"grad_norm": 3.21875,
"learning_rate": 7.930956365529818e-06,
"loss": 0.3214,
"step": 3090
},
{
"epoch": 2.48,
"grad_norm": 2.953125,
"learning_rate": 7.699115096346139e-06,
"loss": 0.3072,
"step": 3100
},
{
"epoch": 2.488,
"grad_norm": 4.5,
"learning_rate": 7.4704299935875185e-06,
"loss": 0.2528,
"step": 3110
},
{
"epoch": 2.496,
"grad_norm": 4.28125,
"learning_rate": 7.244918119976035e-06,
"loss": 0.3366,
"step": 3120
},
{
"epoch": 2.504,
"grad_norm": 5.53125,
"learning_rate": 7.022596301471868e-06,
"loss": 0.3603,
"step": 3130
},
{
"epoch": 2.512,
"grad_norm": 4.28125,
"learning_rate": 6.803481126017808e-06,
"loss": 0.2996,
"step": 3140
},
{
"epoch": 2.52,
"grad_norm": 3.90625,
"learning_rate": 6.587588942301626e-06,
"loss": 0.3519,
"step": 3150
},
{
"epoch": 2.528,
"grad_norm": 5.3125,
"learning_rate": 6.374935858536257e-06,
"loss": 0.2668,
"step": 3160
},
{
"epoch": 2.536,
"grad_norm": 5.25,
"learning_rate": 6.165537741257971e-06,
"loss": 0.3093,
"step": 3170
},
{
"epoch": 2.544,
"grad_norm": 5.0,
"learning_rate": 5.959410214142419e-06,
"loss": 0.3223,
"step": 3180
},
{
"epoch": 2.552,
"grad_norm": 5.15625,
"learning_rate": 5.756568656839056e-06,
"loss": 0.4137,
"step": 3190
},
{
"epoch": 2.56,
"grad_norm": 5.78125,
"learning_rate": 5.557028203823522e-06,
"loss": 0.3785,
"step": 3200
},
{
"epoch": 2.568,
"grad_norm": 3.59375,
"learning_rate": 5.360803743268494e-06,
"loss": 0.3343,
"step": 3210
},
{
"epoch": 2.576,
"grad_norm": 4.96875,
"learning_rate": 5.167909915932801e-06,
"loss": 0.3217,
"step": 3220
},
{
"epoch": 2.584,
"grad_norm": 3.6875,
"learning_rate": 4.9783611140690415e-06,
"loss": 0.3157,
"step": 3230
},
{
"epoch": 2.592,
"grad_norm": 3.84375,
"learning_rate": 4.7921714803498165e-06,
"loss": 0.2983,
"step": 3240
},
{
"epoch": 2.6,
"grad_norm": 5.3125,
"learning_rate": 4.609354906812374e-06,
"loss": 0.3362,
"step": 3250
},
{
"epoch": 2.608,
"grad_norm": 3.953125,
"learning_rate": 4.429925033822252e-06,
"loss": 0.3844,
"step": 3260
},
{
"epoch": 2.616,
"grad_norm": 4.28125,
"learning_rate": 4.253895249055412e-06,
"loss": 0.2974,
"step": 3270
},
{
"epoch": 2.624,
"grad_norm": 3.734375,
"learning_rate": 4.0812786864994566e-06,
"loss": 0.3442,
"step": 3280
},
{
"epoch": 2.632,
"grad_norm": 3.171875,
"learning_rate": 3.912088225473537e-06,
"loss": 0.3572,
"step": 3290
},
{
"epoch": 2.64,
"grad_norm": 4.59375,
"learning_rate": 3.7463364896675735e-06,
"loss": 0.3092,
"step": 3300
},
{
"epoch": 2.648,
"grad_norm": 4.03125,
"learning_rate": 3.584035846200201e-06,
"loss": 0.3093,
"step": 3310
},
{
"epoch": 2.656,
"grad_norm": 3.8125,
"learning_rate": 3.425198404696178e-06,
"loss": 0.3035,
"step": 3320
},
{
"epoch": 2.664,
"grad_norm": 4.90625,
"learning_rate": 3.2698360163827325e-06,
"loss": 0.3166,
"step": 3330
},
{
"epoch": 2.672,
"grad_norm": 4.03125,
"learning_rate": 3.1179602732053947e-06,
"loss": 0.2739,
"step": 3340
},
{
"epoch": 2.68,
"grad_norm": 6.4375,
"learning_rate": 2.969582506963098e-06,
"loss": 0.3551,
"step": 3350
},
{
"epoch": 2.6879999999999997,
"grad_norm": 4.625,
"learning_rate": 2.824713788462602e-06,
"loss": 0.3293,
"step": 3360
},
{
"epoch": 2.6959999999999997,
"grad_norm": 3.71875,
"learning_rate": 2.6833649266925943e-06,
"loss": 0.3278,
"step": 3370
},
{
"epoch": 2.7039999999999997,
"grad_norm": 3.984375,
"learning_rate": 2.5455464680171126e-06,
"loss": 0.2763,
"step": 3380
},
{
"epoch": 2.7119999999999997,
"grad_norm": 6.125,
"learning_rate": 2.411268695388719e-06,
"loss": 0.3378,
"step": 3390
},
{
"epoch": 2.7199999999999998,
"grad_norm": 5.09375,
"learning_rate": 2.28054162758119e-06,
"loss": 0.2644,
"step": 3400
},
{
"epoch": 2.7279999999999998,
"grad_norm": 4.09375,
"learning_rate": 2.1533750184420832e-06,
"loss": 0.3154,
"step": 3410
},
{
"epoch": 2.7359999999999998,
"grad_norm": 5.28125,
"learning_rate": 2.0297783561649244e-06,
"loss": 0.2217,
"step": 3420
},
{
"epoch": 2.7439999999999998,
"grad_norm": 5.59375,
"learning_rate": 1.9097608625812726e-06,
"loss": 0.3446,
"step": 3430
},
{
"epoch": 2.752,
"grad_norm": 5.25,
"learning_rate": 1.7933314924726886e-06,
"loss": 0.387,
"step": 3440
},
{
"epoch": 2.76,
"grad_norm": 5.1875,
"learning_rate": 1.6804989329025521e-06,
"loss": 0.3531,
"step": 3450
},
{
"epoch": 2.768,
"grad_norm": 5.9375,
"learning_rate": 1.5712716025679587e-06,
"loss": 0.2906,
"step": 3460
},
{
"epoch": 2.776,
"grad_norm": 5.28125,
"learning_rate": 1.4656576511715204e-06,
"loss": 0.2759,
"step": 3470
},
{
"epoch": 2.784,
"grad_norm": 4.0625,
"learning_rate": 1.3636649588133432e-06,
"loss": 0.3646,
"step": 3480
},
{
"epoch": 2.792,
"grad_norm": 4.4375,
"learning_rate": 1.265301135403052e-06,
"loss": 0.3467,
"step": 3490
},
{
"epoch": 2.8,
"grad_norm": 2.71875,
"learning_rate": 1.1705735200920053e-06,
"loss": 0.2817,
"step": 3500
},
{
"epoch": 2.808,
"grad_norm": 5.4375,
"learning_rate": 1.0794891807256956e-06,
"loss": 0.3304,
"step": 3510
},
{
"epoch": 2.816,
"grad_norm": 3.53125,
"learning_rate": 9.920549133164314e-07,
"loss": 0.3544,
"step": 3520
},
{
"epoch": 2.824,
"grad_norm": 6.21875,
"learning_rate": 9.08277241536215e-07,
"loss": 0.3082,
"step": 3530
},
{
"epoch": 2.832,
"grad_norm": 3.984375,
"learning_rate": 8.281624162300494e-07,
"loss": 0.2201,
"step": 3540
},
{
"epoch": 2.84,
"grad_norm": 6.9375,
"learning_rate": 7.517164149495326e-07,
"loss": 0.2885,
"step": 3550
},
{
"epoch": 2.848,
"grad_norm": 5.625,
"learning_rate": 6.789449415068316e-07,
"loss": 0.2716,
"step": 3560
},
{
"epoch": 2.856,
"grad_norm": 5.46875,
"learning_rate": 6.098534255491561e-07,
"loss": 0.2723,
"step": 3570
},
{
"epoch": 2.864,
"grad_norm": 2.765625,
"learning_rate": 5.44447022153588e-07,
"loss": 0.3245,
"step": 3580
},
{
"epoch": 2.872,
"grad_norm": 5.34375,
"learning_rate": 4.827306114425056e-07,
"loss": 0.2905,
"step": 3590
},
{
"epoch": 2.88,
"grad_norm": 5.59375,
"learning_rate": 4.2470879821941423e-07,
"loss": 0.3986,
"step": 3600
},
{
"epoch": 2.888,
"grad_norm": 3.203125,
"learning_rate": 3.703859116254038e-07,
"loss": 0.3328,
"step": 3610
},
{
"epoch": 2.896,
"grad_norm": 5.875,
"learning_rate": 3.197660048161133e-07,
"loss": 0.2893,
"step": 3620
},
{
"epoch": 2.904,
"grad_norm": 4.875,
"learning_rate": 2.728528546593667e-07,
"loss": 0.3573,
"step": 3630
},
{
"epoch": 2.912,
"grad_norm": 3.921875,
"learning_rate": 2.2964996145330986e-07,
"loss": 0.2721,
"step": 3640
},
{
"epoch": 2.92,
"grad_norm": 3.296875,
"learning_rate": 1.9016054866528576e-07,
"loss": 0.2943,
"step": 3650
},
{
"epoch": 2.928,
"grad_norm": 3.84375,
"learning_rate": 1.5438756269130495e-07,
"loss": 0.3179,
"step": 3660
},
{
"epoch": 2.936,
"grad_norm": 3.453125,
"learning_rate": 1.223336726362323e-07,
"loss": 0.3203,
"step": 3670
},
{
"epoch": 2.944,
"grad_norm": 3.34375,
"learning_rate": 9.400127011461312e-08,
"loss": 0.3184,
"step": 3680
},
{
"epoch": 2.952,
"grad_norm": 4.625,
"learning_rate": 6.939246907222696e-08,
"loss": 0.3581,
"step": 3690
},
{
"epoch": 2.96,
"grad_norm": 4.5,
"learning_rate": 4.850910562839151e-08,
"loss": 0.3222,
"step": 3700
},
{
"epoch": 2.968,
"grad_norm": 6.59375,
"learning_rate": 3.135273793893889e-08,
"loss": 0.2907,
"step": 3710
},
{
"epoch": 2.976,
"grad_norm": 3.5625,
"learning_rate": 1.7924646079964248e-08,
"loss": 0.3589,
"step": 3720
},
{
"epoch": 2.984,
"grad_norm": 4.625,
"learning_rate": 8.225831952324292e-09,
"loss": 0.3172,
"step": 3730
},
{
"epoch": 2.992,
"grad_norm": 5.75,
"learning_rate": 2.257019206874933e-09,
"loss": 0.29,
"step": 3740
},
{
"epoch": 3.0,
"grad_norm": 3.109375,
"learning_rate": 1.8653190470008242e-11,
"loss": 0.2897,
"step": 3750
},
{
"epoch": 3.0,
"step": 3750,
"total_flos": 6.386198055566157e+17,
"train_loss": 0.40305233942667645,
"train_runtime": 8900.4049,
"train_samples_per_second": 6.741,
"train_steps_per_second": 0.421
}
],
"logging_steps": 10,
"max_steps": 3750,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.386198055566157e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}