{
"best_metric": 0.60154372,
"best_model_checkpoint": "/nfs4/models/Qwen2.5-VL/Reject_sft_Qwen2.5-VL-3B-Instruct/v7-20250617-161549/checkpoint-800",
"epoch": 181.8372093023256,
"eval_steps": 200,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.09302325581395349,
"grad_norm": 2.441588224180784,
"learning_rate": 2e-09,
"loss": 0.7878831624984741,
"memory(GiB)": 65.48,
"step": 1,
"token_acc": 0.7861313342463778,
"train_speed(iter/s)": 0.055435
},
{
"epoch": 0.46511627906976744,
"grad_norm": 2.8780390737909576,
"learning_rate": 1e-08,
"loss": 0.8473173379898071,
"memory(GiB)": 65.5,
"step": 5,
"token_acc": 0.7882713944766451,
"train_speed(iter/s)": 0.101539
},
{
"epoch": 0.9302325581395349,
"grad_norm": 2.5438182772777616,
"learning_rate": 2e-08,
"loss": 0.8371296882629394,
"memory(GiB)": 66.93,
"step": 10,
"token_acc": 0.7700506562717737,
"train_speed(iter/s)": 0.110961
},
{
"epoch": 1.372093023255814,
"grad_norm": 2.4572297135546735,
"learning_rate": 3e-08,
"loss": 0.8263990402221679,
"memory(GiB)": 66.93,
"step": 15,
"token_acc": 0.7914247785857225,
"train_speed(iter/s)": 0.118864
},
{
"epoch": 1.8372093023255816,
"grad_norm": 2.817513169380205,
"learning_rate": 4e-08,
"loss": 0.8524192810058594,
"memory(GiB)": 66.93,
"step": 20,
"token_acc": 0.8086610622604439,
"train_speed(iter/s)": 0.120417
},
{
"epoch": 2.2790697674418605,
"grad_norm": 2.5728578917158496,
"learning_rate": 5e-08,
"loss": 0.8347753524780274,
"memory(GiB)": 66.93,
"step": 25,
"token_acc": 0.7975967163791022,
"train_speed(iter/s)": 0.1197
},
{
"epoch": 2.744186046511628,
"grad_norm": 7.697786718127836,
"learning_rate": 6e-08,
"loss": 0.8387296676635743,
"memory(GiB)": 66.93,
"step": 30,
"token_acc": 0.7844551282051282,
"train_speed(iter/s)": 0.121618
},
{
"epoch": 3.186046511627907,
"grad_norm": 2.6215878302524973,
"learning_rate": 6.999999999999999e-08,
"loss": 0.8212770462036133,
"memory(GiB)": 66.94,
"step": 35,
"token_acc": 0.7918978074644326,
"train_speed(iter/s)": 0.121919
},
{
"epoch": 3.6511627906976747,
"grad_norm": 2.3356396729850886,
"learning_rate": 8e-08,
"loss": 0.8247488021850586,
"memory(GiB)": 66.94,
"step": 40,
"token_acc": 0.7788534837627688,
"train_speed(iter/s)": 0.122256
},
{
"epoch": 4.093023255813954,
"grad_norm": 2.24036454294963,
"learning_rate": 9e-08,
"loss": 0.8493685722351074,
"memory(GiB)": 66.94,
"step": 45,
"token_acc": 0.7944452759188386,
"train_speed(iter/s)": 0.124476
},
{
"epoch": 4.558139534883721,
"grad_norm": 2.3710774237116135,
"learning_rate": 1e-07,
"loss": 0.8277470588684082,
"memory(GiB)": 66.94,
"step": 50,
"token_acc": 0.8012501821832845,
"train_speed(iter/s)": 0.124595
},
{
"epoch": 5.0,
"grad_norm": 3.0036536630441435,
"learning_rate": 1.1e-07,
"loss": 0.8450939178466796,
"memory(GiB)": 66.94,
"step": 55,
"token_acc": 0.7760649403867543,
"train_speed(iter/s)": 0.125053
},
{
"epoch": 5.465116279069767,
"grad_norm": 2.5881257114238627,
"learning_rate": 1.2e-07,
"loss": 0.8497460365295411,
"memory(GiB)": 66.94,
"step": 60,
"token_acc": 0.796596503868389,
"train_speed(iter/s)": 0.125941
},
{
"epoch": 5.930232558139535,
"grad_norm": 2.3744096454020376,
"learning_rate": 1.3e-07,
"loss": 0.8116294860839843,
"memory(GiB)": 66.94,
"step": 65,
"token_acc": 0.7651333807767786,
"train_speed(iter/s)": 0.125394
},
{
"epoch": 6.372093023255814,
"grad_norm": 2.235166994874194,
"learning_rate": 1.3999999999999998e-07,
"loss": 0.8369662284851074,
"memory(GiB)": 66.94,
"step": 70,
"token_acc": 0.7880281843764316,
"train_speed(iter/s)": 0.126206
},
{
"epoch": 6.837209302325581,
"grad_norm": 8.546638989645471,
"learning_rate": 1.5e-07,
"loss": 0.8286456108093262,
"memory(GiB)": 66.94,
"step": 75,
"token_acc": 0.7748851144806365,
"train_speed(iter/s)": 0.126162
},
{
"epoch": 7.27906976744186,
"grad_norm": 2.1654224438993,
"learning_rate": 1.6e-07,
"loss": 0.8329730033874512,
"memory(GiB)": 66.94,
"step": 80,
"token_acc": 0.80111933970264,
"train_speed(iter/s)": 0.126637
},
{
"epoch": 7.7441860465116275,
"grad_norm": 3.346508648878843,
"learning_rate": 1.7e-07,
"loss": 0.8077556610107421,
"memory(GiB)": 66.94,
"step": 85,
"token_acc": 0.7893712675300275,
"train_speed(iter/s)": 0.126958
},
{
"epoch": 8.186046511627907,
"grad_norm": 2.1195939617622908,
"learning_rate": 1.8e-07,
"loss": 0.8190940856933594,
"memory(GiB)": 66.94,
"step": 90,
"token_acc": 0.7837902316300859,
"train_speed(iter/s)": 0.127783
},
{
"epoch": 8.651162790697674,
"grad_norm": 2.196504569870541,
"learning_rate": 1.8999999999999998e-07,
"loss": 0.7897569179534912,
"memory(GiB)": 66.94,
"step": 95,
"token_acc": 0.8031453890349596,
"train_speed(iter/s)": 0.12748
},
{
"epoch": 9.093023255813954,
"grad_norm": 2.4902973786655798,
"learning_rate": 2e-07,
"loss": 0.8305625915527344,
"memory(GiB)": 66.94,
"step": 100,
"token_acc": 0.7491283167239546,
"train_speed(iter/s)": 0.127599
},
{
"epoch": 9.55813953488372,
"grad_norm": 1.9236232576368646,
"learning_rate": 1.9999658256641745e-07,
"loss": 0.8344329833984375,
"memory(GiB)": 66.94,
"step": 105,
"token_acc": 0.7713534087092802,
"train_speed(iter/s)": 0.128253
},
{
"epoch": 10.0,
"grad_norm": 1.9264695576885342,
"learning_rate": 1.999863304992469e-07,
"loss": 0.772977876663208,
"memory(GiB)": 66.94,
"step": 110,
"token_acc": 0.7893902319663306,
"train_speed(iter/s)": 0.128413
},
{
"epoch": 10.465116279069768,
"grad_norm": 1.7921071186450859,
"learning_rate": 1.9996924449920347e-07,
"loss": 0.7723042488098144,
"memory(GiB)": 66.94,
"step": 115,
"token_acc": 0.8070967633232802,
"train_speed(iter/s)": 0.128698
},
{
"epoch": 10.930232558139535,
"grad_norm": 2.0488653924721487,
"learning_rate": 1.999453257340926e-07,
"loss": 0.805912971496582,
"memory(GiB)": 66.94,
"step": 120,
"token_acc": 0.7839763435738668,
"train_speed(iter/s)": 0.128633
},
{
"epoch": 11.372093023255815,
"grad_norm": 1.8887448764254238,
"learning_rate": 1.9991457583873009e-07,
"loss": 0.7916177272796631,
"memory(GiB)": 66.94,
"step": 125,
"token_acc": 0.7835127698472789,
"train_speed(iter/s)": 0.129044
},
{
"epoch": 11.837209302325581,
"grad_norm": 2.087347255237122,
"learning_rate": 1.9987699691483047e-07,
"loss": 0.7750067710876465,
"memory(GiB)": 66.94,
"step": 130,
"token_acc": 0.79361802524478,
"train_speed(iter/s)": 0.128698
},
{
"epoch": 12.279069767441861,
"grad_norm": 1.8497770530709863,
"learning_rate": 1.9983259153086325e-07,
"loss": 0.7334749698638916,
"memory(GiB)": 66.94,
"step": 135,
"token_acc": 0.8016341430131004,
"train_speed(iter/s)": 0.129134
},
{
"epoch": 12.744186046511627,
"grad_norm": 1.3780662997872353,
"learning_rate": 1.9978136272187745e-07,
"loss": 0.7617583274841309,
"memory(GiB)": 66.94,
"step": 140,
"token_acc": 0.8071877904067482,
"train_speed(iter/s)": 0.128965
},
{
"epoch": 13.186046511627907,
"grad_norm": 1.4470094463921936,
"learning_rate": 1.997233139892941e-07,
"loss": 0.7472479820251465,
"memory(GiB)": 66.94,
"step": 145,
"token_acc": 0.7839292328474627,
"train_speed(iter/s)": 0.129158
},
{
"epoch": 13.651162790697674,
"grad_norm": 1.969343282689861,
"learning_rate": 1.9965844930066698e-07,
"loss": 0.7178962707519532,
"memory(GiB)": 66.94,
"step": 150,
"token_acc": 0.7930578931176141,
"train_speed(iter/s)": 0.129381
},
{
"epoch": 14.093023255813954,
"grad_norm": 1.659886865517498,
"learning_rate": 1.9958677308941136e-07,
"loss": 0.7550750255584717,
"memory(GiB)": 66.94,
"step": 155,
"token_acc": 0.7681622703125359,
"train_speed(iter/s)": 0.129371
},
{
"epoch": 14.55813953488372,
"grad_norm": 1.3482878555174083,
"learning_rate": 1.9950829025450114e-07,
"loss": 0.7135652542114258,
"memory(GiB)": 66.94,
"step": 160,
"token_acc": 0.7849006160641636,
"train_speed(iter/s)": 0.129416
},
{
"epoch": 15.0,
"grad_norm": 1.6524876656316168,
"learning_rate": 1.9942300616013377e-07,
"loss": 0.7475796699523926,
"memory(GiB)": 74.95,
"step": 165,
"token_acc": 0.796426354182834,
"train_speed(iter/s)": 0.129454
},
{
"epoch": 15.465116279069768,
"grad_norm": 1.4018266466879952,
"learning_rate": 1.993309266353638e-07,
"loss": 0.7252517223358155,
"memory(GiB)": 74.96,
"step": 170,
"token_acc": 0.8084311921640781,
"train_speed(iter/s)": 0.129786
},
{
"epoch": 15.930232558139535,
"grad_norm": 1.2395976325818243,
"learning_rate": 1.992320579737045e-07,
"loss": 0.7289777755737304,
"memory(GiB)": 74.96,
"step": 175,
"token_acc": 0.810318895442178,
"train_speed(iter/s)": 0.129608
},
{
"epoch": 16.372093023255815,
"grad_norm": 2.5675852224219553,
"learning_rate": 1.9912640693269751e-07,
"loss": 0.6915578365325927,
"memory(GiB)": 74.96,
"step": 180,
"token_acc": 0.7905717628859845,
"train_speed(iter/s)": 0.129489
},
{
"epoch": 16.837209302325583,
"grad_norm": 1.4358606025818346,
"learning_rate": 1.9901398073345117e-07,
"loss": 0.7248753547668457,
"memory(GiB)": 74.96,
"step": 185,
"token_acc": 0.8124335543968005,
"train_speed(iter/s)": 0.129395
},
{
"epoch": 17.27906976744186,
"grad_norm": 1.262748163163051,
"learning_rate": 1.9889478706014683e-07,
"loss": 0.7250626564025879,
"memory(GiB)": 74.96,
"step": 190,
"token_acc": 0.7930634826915087,
"train_speed(iter/s)": 0.129717
},
{
"epoch": 17.74418604651163,
"grad_norm": 1.467844482343943,
"learning_rate": 1.9876883405951376e-07,
"loss": 0.7151264190673828,
"memory(GiB)": 74.96,
"step": 195,
"token_acc": 0.8009663075081238,
"train_speed(iter/s)": 0.129659
},
{
"epoch": 18.186046511627907,
"grad_norm": 1.1965217379377007,
"learning_rate": 1.9863613034027222e-07,
"loss": 0.667814064025879,
"memory(GiB)": 74.96,
"step": 200,
"token_acc": 0.8150907451820857,
"train_speed(iter/s)": 0.12962
},
{
"epoch": 18.186046511627907,
"eval_loss": 0.708366334438324,
"eval_runtime": 0.7627,
"eval_samples_per_second": 17.045,
"eval_steps_per_second": 2.622,
"eval_token_acc": 0.8243126736277421,
"step": 200
},
{
"epoch": 18.651162790697676,
"grad_norm": 1.1855878327264966,
"learning_rate": 1.9849668497254518e-07,
"loss": 0.7150158882141113,
"memory(GiB)": 74.96,
"step": 205,
"token_acc": 0.8079891491231421,
"train_speed(iter/s)": 0.128041
},
{
"epoch": 19.093023255813954,
"grad_norm": 1.0438364306476957,
"learning_rate": 1.9835050748723822e-07,
"loss": 0.6731427669525146,
"memory(GiB)": 74.96,
"step": 210,
"token_acc": 0.8069213383230205,
"train_speed(iter/s)": 0.128417
},
{
"epoch": 19.558139534883722,
"grad_norm": 0.9853618641588676,
"learning_rate": 1.9819760787538837e-07,
"loss": 0.6843628883361816,
"memory(GiB)": 74.96,
"step": 215,
"token_acc": 0.8041327124563445,
"train_speed(iter/s)": 0.128537
},
{
"epoch": 20.0,
"grad_norm": 1.1619660544446906,
"learning_rate": 1.9803799658748093e-07,
"loss": 0.6671229839324951,
"memory(GiB)": 74.97,
"step": 220,
"token_acc": 0.8175119885190241,
"train_speed(iter/s)": 0.12854
},
{
"epoch": 20.46511627906977,
"grad_norm": 1.2626864222948397,
"learning_rate": 1.9787168453273545e-07,
"loss": 0.6970132827758789,
"memory(GiB)": 74.97,
"step": 225,
"token_acc": 0.8270612144784331,
"train_speed(iter/s)": 0.128495
},
{
"epoch": 20.930232558139537,
"grad_norm": 0.7941657042448518,
"learning_rate": 1.9769868307835993e-07,
"loss": 0.6455688953399659,
"memory(GiB)": 74.97,
"step": 230,
"token_acc": 0.8348736389299637,
"train_speed(iter/s)": 0.128518
},
{
"epoch": 21.372093023255815,
"grad_norm": 1.1822694017861601,
"learning_rate": 1.9751900404877398e-07,
"loss": 0.6348400115966797,
"memory(GiB)": 74.97,
"step": 235,
"token_acc": 0.8331182941735705,
"train_speed(iter/s)": 0.128694
},
{
"epoch": 21.837209302325583,
"grad_norm": 0.9081549570182597,
"learning_rate": 1.9733265972480058e-07,
"loss": 0.6620560646057129,
"memory(GiB)": 74.97,
"step": 240,
"token_acc": 0.8267432385239223,
"train_speed(iter/s)": 0.128483
},
{
"epoch": 22.27906976744186,
"grad_norm": 0.8139654483754177,
"learning_rate": 1.9713966284282674e-07,
"loss": 0.6350464820861816,
"memory(GiB)": 74.97,
"step": 245,
"token_acc": 0.8140620540628695,
"train_speed(iter/s)": 0.128624
},
{
"epoch": 22.74418604651163,
"grad_norm": 1.39238370567191,
"learning_rate": 1.9694002659393302e-07,
"loss": 0.6755290031433105,
"memory(GiB)": 74.97,
"step": 250,
"token_acc": 0.8166282714604026,
"train_speed(iter/s)": 0.128733
},
{
"epoch": 23.186046511627907,
"grad_norm": 1.0689850821114422,
"learning_rate": 1.9673376462299182e-07,
"loss": 0.6278616905212402,
"memory(GiB)": 74.97,
"step": 255,
"token_acc": 0.8235556962260989,
"train_speed(iter/s)": 0.128805
},
{
"epoch": 23.651162790697676,
"grad_norm": 0.826203146475013,
"learning_rate": 1.9652089102773487e-07,
"loss": 0.6573570728302002,
"memory(GiB)": 74.97,
"step": 260,
"token_acc": 0.8203604745946925,
"train_speed(iter/s)": 0.128858
},
{
"epoch": 24.093023255813954,
"grad_norm": 1.0075863589078984,
"learning_rate": 1.963014203577896e-07,
"loss": 0.6461727619171143,
"memory(GiB)": 74.97,
"step": 265,
"token_acc": 0.799577569399313,
"train_speed(iter/s)": 0.128878
},
{
"epoch": 24.558139534883722,
"grad_norm": 1.1793630828397141,
"learning_rate": 1.9607536761368482e-07,
"loss": 0.634314775466919,
"memory(GiB)": 74.97,
"step": 270,
"token_acc": 0.7944581869582389,
"train_speed(iter/s)": 0.128808
},
{
"epoch": 25.0,
"grad_norm": 0.9698025031342606,
"learning_rate": 1.9584274824582527e-07,
"loss": 0.6515589714050293,
"memory(GiB)": 74.97,
"step": 275,
"token_acc": 0.8235917962402285,
"train_speed(iter/s)": 0.128916
},
{
"epoch": 25.46511627906977,
"grad_norm": 3.191013934790137,
"learning_rate": 1.9560357815343574e-07,
"loss": 0.6280710697174072,
"memory(GiB)": 74.97,
"step": 280,
"token_acc": 0.8348407138350025,
"train_speed(iter/s)": 0.128889
},
{
"epoch": 25.930232558139537,
"grad_norm": 2.6857457017550184,
"learning_rate": 1.9535787368347442e-07,
"loss": 0.6403141498565674,
"memory(GiB)": 74.97,
"step": 285,
"token_acc": 0.8304662656603196,
"train_speed(iter/s)": 0.128946
},
{
"epoch": 26.372093023255815,
"grad_norm": 0.7179384373982066,
"learning_rate": 1.9510565162951537e-07,
"loss": 0.6483189582824707,
"memory(GiB)": 74.97,
"step": 290,
"token_acc": 0.8042392190472208,
"train_speed(iter/s)": 0.129194
},
{
"epoch": 26.837209302325583,
"grad_norm": 0.8336349818317007,
"learning_rate": 1.9484692923060094e-07,
"loss": 0.6260199546813965,
"memory(GiB)": 74.97,
"step": 295,
"token_acc": 0.8142812170144997,
"train_speed(iter/s)": 0.129275
},
{
"epoch": 27.27906976744186,
"grad_norm": 0.8073425015755772,
"learning_rate": 1.9458172417006346e-07,
"loss": 0.6319057464599609,
"memory(GiB)": 74.97,
"step": 300,
"token_acc": 0.8167601892733382,
"train_speed(iter/s)": 0.129283
},
{
"epoch": 27.74418604651163,
"grad_norm": 0.8293440372694967,
"learning_rate": 1.943100545743165e-07,
"loss": 0.6321963310241699,
"memory(GiB)": 74.97,
"step": 305,
"token_acc": 0.8145223890527623,
"train_speed(iter/s)": 0.129158
},
{
"epoch": 28.186046511627907,
"grad_norm": 0.8851271223039491,
"learning_rate": 1.9403193901161612e-07,
"loss": 0.6186152935028076,
"memory(GiB)": 74.97,
"step": 310,
"token_acc": 0.8423929547525053,
"train_speed(iter/s)": 0.129305
},
{
"epoch": 28.651162790697676,
"grad_norm": 0.9560469073452553,
"learning_rate": 1.9374739649079154e-07,
"loss": 0.6388277053833008,
"memory(GiB)": 74.97,
"step": 315,
"token_acc": 0.8255307825359716,
"train_speed(iter/s)": 0.129291
},
{
"epoch": 29.093023255813954,
"grad_norm": 1.0797696361091218,
"learning_rate": 1.9345644645994608e-07,
"loss": 0.6270732879638672,
"memory(GiB)": 74.97,
"step": 320,
"token_acc": 0.8329987798638171,
"train_speed(iter/s)": 0.129427
},
{
"epoch": 29.558139534883722,
"grad_norm": 1.035746534298127,
"learning_rate": 1.9315910880512788e-07,
"loss": 0.6154883861541748,
"memory(GiB)": 74.97,
"step": 325,
"token_acc": 0.8229807039658683,
"train_speed(iter/s)": 0.129368
},
{
"epoch": 30.0,
"grad_norm": 0.9448004935095479,
"learning_rate": 1.928554038489707e-07,
"loss": 0.6246993541717529,
"memory(GiB)": 74.97,
"step": 330,
"token_acc": 0.8252855659397715,
"train_speed(iter/s)": 0.129558
},
{
"epoch": 30.46511627906977,
"grad_norm": 0.7400543933440672,
"learning_rate": 1.9254535234930483e-07,
"loss": 0.6015793323516846,
"memory(GiB)": 74.97,
"step": 335,
"token_acc": 0.8212677580369298,
"train_speed(iter/s)": 0.129568
},
{
"epoch": 30.930232558139537,
"grad_norm": 0.6862921067098382,
"learning_rate": 1.9222897549773846e-07,
"loss": 0.627756404876709,
"memory(GiB)": 74.97,
"step": 340,
"token_acc": 0.8131175537754646,
"train_speed(iter/s)": 0.129532
},
{
"epoch": 31.372093023255815,
"grad_norm": 1.0706787922118046,
"learning_rate": 1.9190629491820908e-07,
"loss": 0.6050760269165039,
"memory(GiB)": 74.97,
"step": 345,
"token_acc": 0.8153731376034056,
"train_speed(iter/s)": 0.129658
},
{
"epoch": 31.837209302325583,
"grad_norm": 0.7747208875253631,
"learning_rate": 1.9157733266550572e-07,
"loss": 0.6289189338684082,
"memory(GiB)": 74.97,
"step": 350,
"token_acc": 0.8139119876370594,
"train_speed(iter/s)": 0.129542
},
{
"epoch": 32.27906976744186,
"grad_norm": 0.773459886431363,
"learning_rate": 1.9124211122376135e-07,
"loss": 0.6157156944274902,
"memory(GiB)": 74.97,
"step": 355,
"token_acc": 0.8152114721365039,
"train_speed(iter/s)": 0.129801
},
{
"epoch": 32.74418604651163,
"grad_norm": 1.1738935206395225,
"learning_rate": 1.9090065350491624e-07,
"loss": 0.6239834785461426,
"memory(GiB)": 74.97,
"step": 360,
"token_acc": 0.833327410355734,
"train_speed(iter/s)": 0.129897
},
{
"epoch": 33.18604651162791,
"grad_norm": 0.848966063311304,
"learning_rate": 1.905529828471519e-07,
"loss": 0.5887202262878418,
"memory(GiB)": 74.97,
"step": 365,
"token_acc": 0.8398133748055988,
"train_speed(iter/s)": 0.129873
},
{
"epoch": 33.651162790697676,
"grad_norm": 2.144137430723947,
"learning_rate": 1.901991230132959e-07,
"loss": 0.6359727859497071,
"memory(GiB)": 74.97,
"step": 370,
"token_acc": 0.8069930345126126,
"train_speed(iter/s)": 0.129943
},
{
"epoch": 34.093023255813954,
"grad_norm": 0.7367545693321746,
"learning_rate": 1.8983909818919788e-07,
"loss": 0.5804174900054931,
"memory(GiB)": 74.97,
"step": 375,
"token_acc": 0.8437245411415153,
"train_speed(iter/s)": 0.129967
},
{
"epoch": 34.55813953488372,
"grad_norm": 0.7507232728161667,
"learning_rate": 1.8947293298207635e-07,
"loss": 0.5902613639831543,
"memory(GiB)": 74.97,
"step": 380,
"token_acc": 0.8308984660336012,
"train_speed(iter/s)": 0.129921
},
{
"epoch": 35.0,
"grad_norm": 1.3494911901833562,
"learning_rate": 1.8910065241883678e-07,
"loss": 0.6213099479675293,
"memory(GiB)": 74.97,
"step": 385,
"token_acc": 0.8180765456329735,
"train_speed(iter/s)": 0.129994
},
{
"epoch": 35.46511627906977,
"grad_norm": 0.9327927885382011,
"learning_rate": 1.8872228194436116e-07,
"loss": 0.61426682472229,
"memory(GiB)": 74.97,
"step": 390,
"token_acc": 0.8016005335111704,
"train_speed(iter/s)": 0.130043
},
{
"epoch": 35.93023255813954,
"grad_norm": 0.8590493021171992,
"learning_rate": 1.8833784741976886e-07,
"loss": 0.5930656433105469,
"memory(GiB)": 74.97,
"step": 395,
"token_acc": 0.8238509177734666,
"train_speed(iter/s)": 0.129952
},
{
"epoch": 36.372093023255815,
"grad_norm": 0.692718053612059,
"learning_rate": 1.8794737512064888e-07,
"loss": 0.601491117477417,
"memory(GiB)": 74.97,
"step": 400,
"token_acc": 0.8390804597701149,
"train_speed(iter/s)": 0.130015
},
{
"epoch": 36.372093023255815,
"eval_loss": 0.6246538758277893,
"eval_runtime": 0.7289,
"eval_samples_per_second": 17.836,
"eval_steps_per_second": 2.744,
"eval_token_acc": 0.8392566337771817,
"step": 400
},
{
"epoch": 36.83720930232558,
"grad_norm": 0.8580896624897943,
"learning_rate": 1.875508917352643e-07,
"loss": 0.6003564834594727,
"memory(GiB)": 74.97,
"step": 405,
"token_acc": 0.8357933251629633,
"train_speed(iter/s)": 0.129242
},
{
"epoch": 37.27906976744186,
"grad_norm": 0.9684611433600051,
"learning_rate": 1.871484243627277e-07,
"loss": 0.6055225372314453,
"memory(GiB)": 74.97,
"step": 410,
"token_acc": 0.8125408092339449,
"train_speed(iter/s)": 0.129415
},
{
"epoch": 37.74418604651163,
"grad_norm": 0.8148508280992611,
"learning_rate": 1.867400005111495e-07,
"loss": 0.5952893257141113,
"memory(GiB)": 74.97,
"step": 415,
"token_acc": 0.8260123541523678,
"train_speed(iter/s)": 0.129433
},
{
"epoch": 38.18604651162791,
"grad_norm": 0.7992095356192797,
"learning_rate": 1.8632564809575738e-07,
"loss": 0.6156826496124268,
"memory(GiB)": 74.97,
"step": 420,
"token_acc": 0.8205879974118409,
"train_speed(iter/s)": 0.12938
},
{
"epoch": 38.651162790697676,
"grad_norm": 3.6721651326108775,
"learning_rate": 1.859053954369885e-07,
"loss": 0.610502815246582,
"memory(GiB)": 74.97,
"step": 425,
"token_acc": 0.8172398589065256,
"train_speed(iter/s)": 0.129325
},
{
"epoch": 39.093023255813954,
"grad_norm": 0.9272484168885702,
"learning_rate": 1.854792712585539e-07,
"loss": 0.5535663604736328,
"memory(GiB)": 74.97,
"step": 430,
"token_acc": 0.8236255683739807,
"train_speed(iter/s)": 0.129482
},
{
"epoch": 39.55813953488372,
"grad_norm": 0.8018214646006986,
"learning_rate": 1.8504730468547506e-07,
"loss": 0.5991367340087891,
"memory(GiB)": 74.97,
"step": 435,
"token_acc": 0.8261135086719322,
"train_speed(iter/s)": 0.129405
},
{
"epoch": 40.0,
"grad_norm": 1.2379263967079543,
"learning_rate": 1.846095252420935e-07,
"loss": 0.585663890838623,
"memory(GiB)": 74.97,
"step": 440,
"token_acc": 0.8266845321477151,
"train_speed(iter/s)": 0.129434
},
{
"epoch": 40.46511627906977,
"grad_norm": 0.833466025772104,
"learning_rate": 1.841659628500527e-07,
"loss": 0.5750086784362793,
"memory(GiB)": 74.97,
"step": 445,
"token_acc": 0.8343643862202814,
"train_speed(iter/s)": 0.129525
},
{
"epoch": 40.93023255813954,
"grad_norm": 0.7870441769315963,
"learning_rate": 1.8371664782625284e-07,
"loss": 0.5996095180511475,
"memory(GiB)": 74.97,
"step": 450,
"token_acc": 0.8262060770106785,
"train_speed(iter/s)": 0.129426
},
{
"epoch": 41.372093023255815,
"grad_norm": 0.7270750065258582,
"learning_rate": 1.8326161088077904e-07,
"loss": 0.5774937629699707,
"memory(GiB)": 74.97,
"step": 455,
"token_acc": 0.8339674588455729,
"train_speed(iter/s)": 0.129531
},
{
"epoch": 41.83720930232558,
"grad_norm": 0.6345626674708744,
"learning_rate": 1.82800883114802e-07,
"loss": 0.5982451438903809,
"memory(GiB)": 74.97,
"step": 460,
"token_acc": 0.83098393668337,
"train_speed(iter/s)": 0.129577
},
{
"epoch": 42.27906976744186,
"grad_norm": 0.8020609888197409,
"learning_rate": 1.8233449601845256e-07,
"loss": 0.5845087051391602,
"memory(GiB)": 74.97,
"step": 465,
"token_acc": 0.8175882797882081,
"train_speed(iter/s)": 0.129629
},
{
"epoch": 42.74418604651163,
"grad_norm": 0.8480884031667174,
"learning_rate": 1.8186248146866925e-07,
"loss": 0.591459846496582,
"memory(GiB)": 74.97,
"step": 470,
"token_acc": 0.8345550327140474,
"train_speed(iter/s)": 0.129656
},
{
"epoch": 43.18604651162791,
"grad_norm": 4.128756169670704,
"learning_rate": 1.8138487172701948e-07,
"loss": 0.5832277297973633,
"memory(GiB)": 74.97,
"step": 475,
"token_acc": 0.8327794561933535,
"train_speed(iter/s)": 0.129649
},
{
"epoch": 43.651162790697676,
"grad_norm": 0.691292587718326,
"learning_rate": 1.8090169943749475e-07,
"loss": 0.5771265029907227,
"memory(GiB)": 74.97,
"step": 480,
"token_acc": 0.8235854875792071,
"train_speed(iter/s)": 0.129594
},
{
"epoch": 44.093023255813954,
"grad_norm": 0.9411447489425482,
"learning_rate": 1.8041299762427914e-07,
"loss": 0.5849340438842774,
"memory(GiB)": 74.97,
"step": 485,
"token_acc": 0.8348119811167182,
"train_speed(iter/s)": 0.12971
},
{
"epoch": 44.55813953488372,
"grad_norm": 0.9906151143939281,
"learning_rate": 1.7991879968949247e-07,
"loss": 0.6044949531555176,
"memory(GiB)": 74.97,
"step": 490,
"token_acc": 0.8391592252658489,
"train_speed(iter/s)": 0.129794
},
{
"epoch": 45.0,
"grad_norm": 0.6320054379409873,
"learning_rate": 1.794191394109071e-07,
"loss": 0.5554977893829346,
"memory(GiB)": 74.97,
"step": 495,
"token_acc": 0.8345945945945946,
"train_speed(iter/s)": 0.12979
},
{
"epoch": 45.46511627906977,
"grad_norm": 0.7061286584704719,
"learning_rate": 1.7891405093963936e-07,
"loss": 0.5755014896392823,
"memory(GiB)": 74.97,
"step": 500,
"token_acc": 0.8254359194017598,
"train_speed(iter/s)": 0.129688
},
{
"epoch": 45.93023255813954,
"grad_norm": 0.7195669164082512,
"learning_rate": 1.7840356879781529e-07,
"loss": 0.5827363014221192,
"memory(GiB)": 74.97,
"step": 505,
"token_acc": 0.839882368874185,
"train_speed(iter/s)": 0.129779
},
{
"epoch": 46.372093023255815,
"grad_norm": 0.6968950428332337,
"learning_rate": 1.7788772787621125e-07,
"loss": 0.5568270683288574,
"memory(GiB)": 74.97,
"step": 510,
"token_acc": 0.8614190870002142,
"train_speed(iter/s)": 0.129886
},
{
"epoch": 46.83720930232558,
"grad_norm": 0.7064063028804808,
"learning_rate": 1.7736656343186894e-07,
"loss": 0.5865127563476562,
"memory(GiB)": 74.97,
"step": 515,
"token_acc": 0.8082950799781602,
"train_speed(iter/s)": 0.129852
},
{
"epoch": 47.27906976744186,
"grad_norm": 0.6403030213655208,
"learning_rate": 1.768401110856859e-07,
"loss": 0.5599156379699707,
"memory(GiB)": 74.97,
"step": 520,
"token_acc": 0.8375492061100334,
"train_speed(iter/s)": 0.129883
},
{
"epoch": 47.74418604651163,
"grad_norm": 0.9706781013185869,
"learning_rate": 1.7630840681998066e-07,
"loss": 0.5808145523071289,
"memory(GiB)": 74.97,
"step": 525,
"token_acc": 0.8431429663747747,
"train_speed(iter/s)": 0.129855
},
{
"epoch": 48.18604651162791,
"grad_norm": 0.7377603527859908,
"learning_rate": 1.7577148697603348e-07,
"loss": 0.5715710639953613,
"memory(GiB)": 74.97,
"step": 530,
"token_acc": 0.8220905089196077,
"train_speed(iter/s)": 0.129985
},
{
"epoch": 48.651162790697676,
"grad_norm": 0.8535997732414037,
"learning_rate": 1.7522938825160247e-07,
"loss": 0.5609760284423828,
"memory(GiB)": 74.97,
"step": 535,
"token_acc": 0.8485186181454867,
"train_speed(iter/s)": 0.12997
},
{
"epoch": 49.093023255813954,
"grad_norm": 1.6196917405162314,
"learning_rate": 1.7468214769841538e-07,
"loss": 0.5788634777069092,
"memory(GiB)": 74.97,
"step": 540,
"token_acc": 0.8494809430899153,
"train_speed(iter/s)": 0.129998
},
{
"epoch": 49.55813953488372,
"grad_norm": 0.8074628776305832,
"learning_rate": 1.7412980271963708e-07,
"loss": 0.5682050704956054,
"memory(GiB)": 74.97,
"step": 545,
"token_acc": 0.8164148196748201,
"train_speed(iter/s)": 0.129923
},
{
"epoch": 50.0,
"grad_norm": 0.9098109454481578,
"learning_rate": 1.7357239106731316e-07,
"loss": 0.5588317394256592,
"memory(GiB)": 74.97,
"step": 550,
"token_acc": 0.8282426370196996,
"train_speed(iter/s)": 0.130049
},
{
"epoch": 50.46511627906977,
"grad_norm": 0.8717894931304141,
"learning_rate": 1.7300995083978961e-07,
"loss": 0.560645866394043,
"memory(GiB)": 74.97,
"step": 555,
"token_acc": 0.8580127632625887,
"train_speed(iter/s)": 0.129978
},
{
"epoch": 50.93023255813954,
"grad_norm": 0.901907102378853,
"learning_rate": 1.724425204791089e-07,
"loss": 0.5699704647064209,
"memory(GiB)": 74.97,
"step": 560,
"token_acc": 0.8169467583456241,
"train_speed(iter/s)": 0.129949
},
{
"epoch": 51.372093023255815,
"grad_norm": 0.6850047743663971,
"learning_rate": 1.7187013876838238e-07,
"loss": 0.5511385917663574,
"memory(GiB)": 74.97,
"step": 565,
"token_acc": 0.8470804299681305,
"train_speed(iter/s)": 0.130045
},
{
"epoch": 51.83720930232558,
"grad_norm": 0.693108198878134,
"learning_rate": 1.712928448291397e-07,
"loss": 0.560858964920044,
"memory(GiB)": 74.97,
"step": 570,
"token_acc": 0.8014341226733077,
"train_speed(iter/s)": 0.130065
},
{
"epoch": 52.27906976744186,
"grad_norm": 1.168154430184055,
"learning_rate": 1.7071067811865473e-07,
"loss": 0.5584731578826905,
"memory(GiB)": 74.97,
"step": 575,
"token_acc": 0.8305429323128438,
"train_speed(iter/s)": 0.130047
},
{
"epoch": 52.74418604651163,
"grad_norm": 0.8940504753420614,
"learning_rate": 1.7012367842724884e-07,
"loss": 0.5449427127838135,
"memory(GiB)": 74.97,
"step": 580,
"token_acc": 0.8454416804487562,
"train_speed(iter/s)": 0.13012
},
{
"epoch": 53.18604651162791,
"grad_norm": 1.3182438739088296,
"learning_rate": 1.695318858755712e-07,
"loss": 0.5867147445678711,
"memory(GiB)": 74.97,
"step": 585,
"token_acc": 0.8317076233934776,
"train_speed(iter/s)": 0.130155
},
{
"epoch": 53.651162790697676,
"grad_norm": 5.89431265738365,
"learning_rate": 1.6893534091185658e-07,
"loss": 0.5429623603820801,
"memory(GiB)": 74.97,
"step": 590,
"token_acc": 0.8596458176337604,
"train_speed(iter/s)": 0.130144
},
{
"epoch": 54.093023255813954,
"grad_norm": 0.8228392742664287,
"learning_rate": 1.6833408430916082e-07,
"loss": 0.5783446311950684,
"memory(GiB)": 74.97,
"step": 595,
"token_acc": 0.8510537851964256,
"train_speed(iter/s)": 0.130222
},
{
"epoch": 54.55813953488372,
"grad_norm": 0.6782178805084175,
"learning_rate": 1.6772815716257412e-07,
"loss": 0.5568069458007813,
"memory(GiB)": 74.97,
"step": 600,
"token_acc": 0.8492520719628057,
"train_speed(iter/s)": 0.130138
},
{
"epoch": 54.55813953488372,
"eval_loss": 0.6069812774658203,
"eval_runtime": 0.7244,
"eval_samples_per_second": 17.946,
"eval_steps_per_second": 2.761,
"eval_token_acc": 0.8424178561164862,
"step": 600
},
{
"epoch": 55.0,
"grad_norm": 0.9571327352378861,
"learning_rate": 1.6711760088641197e-07,
"loss": 0.549845027923584,
"memory(GiB)": 74.97,
"step": 605,
"token_acc": 0.8441368444744543,
"train_speed(iter/s)": 0.129683
},
{
"epoch": 55.46511627906977,
"grad_norm": 0.6574337050432097,
"learning_rate": 1.665024572113848e-07,
"loss": 0.5540960311889649,
"memory(GiB)": 74.97,
"step": 610,
"token_acc": 0.8468528296996988,
"train_speed(iter/s)": 0.12963
},
{
"epoch": 55.93023255813954,
"grad_norm": 1.3701583003213704,
"learning_rate": 1.6588276818174578e-07,
"loss": 0.5496389389038085,
"memory(GiB)": 74.97,
"step": 615,
"token_acc": 0.8450532311656608,
"train_speed(iter/s)": 0.129682
},
{
"epoch": 56.372093023255815,
"grad_norm": 0.6379537701462664,
"learning_rate": 1.6525857615241686e-07,
"loss": 0.5491930484771729,
"memory(GiB)": 74.97,
"step": 620,
"token_acc": 0.8525308496423799,
"train_speed(iter/s)": 0.129772
},
{
"epoch": 56.83720930232558,
"grad_norm": 1.0493433605209441,
"learning_rate": 1.6462992378609406e-07,
"loss": 0.5360322952270508,
"memory(GiB)": 74.97,
"step": 625,
"token_acc": 0.8368756439119319,
"train_speed(iter/s)": 0.129772
},
{
"epoch": 57.27906976744186,
"grad_norm": 1.1362722651257062,
"learning_rate": 1.6399685405033166e-07,
"loss": 0.5665555000305176,
"memory(GiB)": 74.97,
"step": 630,
"token_acc": 0.8487739334900907,
"train_speed(iter/s)": 0.129826
},
{
"epoch": 57.74418604651163,
"grad_norm": 0.6512954800566325,
"learning_rate": 1.6335941021460504e-07,
"loss": 0.5384564399719238,
"memory(GiB)": 74.97,
"step": 635,
"token_acc": 0.8314396783289121,
"train_speed(iter/s)": 0.129882
},
{
"epoch": 58.18604651162791,
"grad_norm": 0.6514693162473681,
"learning_rate": 1.627176358473537e-07,
"loss": 0.5575238227844238,
"memory(GiB)": 74.97,
"step": 640,
"token_acc": 0.8362654193227916,
"train_speed(iter/s)": 0.129896
},
{
"epoch": 58.651162790697676,
"grad_norm": 0.6211369831346565,
"learning_rate": 1.6207157481300312e-07,
"loss": 0.5277935981750488,
"memory(GiB)": 74.97,
"step": 645,
"token_acc": 0.8476069720412159,
"train_speed(iter/s)": 0.129829
},
{
"epoch": 59.093023255813954,
"grad_norm": 0.93341409437694,
"learning_rate": 1.614212712689668e-07,
"loss": 0.5535923480987549,
"memory(GiB)": 74.97,
"step": 650,
"token_acc": 0.8373809799159632,
"train_speed(iter/s)": 0.129933
},
{
"epoch": 59.55813953488372,
"grad_norm": 0.7951026197647952,
"learning_rate": 1.607667696626281e-07,
"loss": 0.5427175045013428,
"memory(GiB)": 74.97,
"step": 655,
"token_acc": 0.845807408479236,
"train_speed(iter/s)": 0.129879
},
{
"epoch": 60.0,
"grad_norm": 0.8112289345971331,
"learning_rate": 1.601081147283025e-07,
"loss": 0.544118070602417,
"memory(GiB)": 74.97,
"step": 660,
"token_acc": 0.8465872536213518,
"train_speed(iter/s)": 0.130007
},
{
"epoch": 60.46511627906977,
"grad_norm": 0.8973071989809348,
"learning_rate": 1.594453514841798e-07,
"loss": 0.5551681041717529,
"memory(GiB)": 74.97,
"step": 665,
"token_acc": 0.8406223717409588,
"train_speed(iter/s)": 0.129985
},
{
"epoch": 60.93023255813954,
"grad_norm": 0.6961112129897833,
"learning_rate": 1.5877852522924732e-07,
"loss": 0.5278561592102051,
"memory(GiB)": 74.97,
"step": 670,
"token_acc": 0.8361272191105745,
"train_speed(iter/s)": 0.12996
},
{
"epoch": 61.372093023255815,
"grad_norm": 0.8454621530526435,
"learning_rate": 1.5810768154019382e-07,
"loss": 0.5304566383361816,
"memory(GiB)": 74.97,
"step": 675,
"token_acc": 0.8467184191954834,
"train_speed(iter/s)": 0.130101
},
{
"epoch": 61.83720930232558,
"grad_norm": 0.8048317682461219,
"learning_rate": 1.5743286626829435e-07,
"loss": 0.556386137008667,
"memory(GiB)": 74.97,
"step": 680,
"token_acc": 0.8513160602079739,
"train_speed(iter/s)": 0.130049
},
{
"epoch": 62.27906976744186,
"grad_norm": 1.2555197833096778,
"learning_rate": 1.5675412553627636e-07,
"loss": 0.5487345695495606,
"memory(GiB)": 74.97,
"step": 685,
"token_acc": 0.8283330021855752,
"train_speed(iter/s)": 0.130158
},
{
"epoch": 62.74418604651163,
"grad_norm": 0.6737924387221673,
"learning_rate": 1.5607150573516727e-07,
"loss": 0.5273719787597656,
"memory(GiB)": 74.97,
"step": 690,
"token_acc": 0.8344278568974075,
"train_speed(iter/s)": 0.130149
},
{
"epoch": 63.18604651162791,
"grad_norm": 0.6321277650100168,
"learning_rate": 1.5538505352112372e-07,
"loss": 0.5302412986755372,
"memory(GiB)": 74.97,
"step": 695,
"token_acc": 0.838855421686747,
"train_speed(iter/s)": 0.130224
},
{
"epoch": 63.651162790697676,
"grad_norm": 0.6665444493375449,
"learning_rate": 1.546948158122427e-07,
"loss": 0.5358945846557617,
"memory(GiB)": 74.97,
"step": 700,
"token_acc": 0.826061751191652,
"train_speed(iter/s)": 0.130179
},
{
"epoch": 64.09302325581395,
"grad_norm": 0.7540141255217923,
"learning_rate": 1.540008397853547e-07,
"loss": 0.5356395244598389,
"memory(GiB)": 74.97,
"step": 705,
"token_acc": 0.8476590569896634,
"train_speed(iter/s)": 0.130248
},
{
"epoch": 64.55813953488372,
"grad_norm": 0.7630626447995367,
"learning_rate": 1.5330317287279937e-07,
"loss": 0.5312513828277587,
"memory(GiB)": 74.97,
"step": 710,
"token_acc": 0.8489824739281576,
"train_speed(iter/s)": 0.130176
},
{
"epoch": 65.0,
"grad_norm": 1.2266930256462827,
"learning_rate": 1.526018627591834e-07,
"loss": 0.5403413295745849,
"memory(GiB)": 74.97,
"step": 715,
"token_acc": 0.8551674468851278,
"train_speed(iter/s)": 0.130251
},
{
"epoch": 65.46511627906976,
"grad_norm": 0.7496283095791967,
"learning_rate": 1.5189695737812152e-07,
"loss": 0.5270286560058594,
"memory(GiB)": 74.97,
"step": 720,
"token_acc": 0.8398781740525149,
"train_speed(iter/s)": 0.130162
},
{
"epoch": 65.93023255813954,
"grad_norm": 0.8680329950142557,
"learning_rate": 1.511885049089601e-07,
"loss": 0.5444748878479004,
"memory(GiB)": 74.97,
"step": 725,
"token_acc": 0.8464486183074266,
"train_speed(iter/s)": 0.130252
},
{
"epoch": 66.37209302325581,
"grad_norm": 0.6415609894652046,
"learning_rate": 1.5047655377348439e-07,
"loss": 0.5128337383270264,
"memory(GiB)": 74.97,
"step": 730,
"token_acc": 0.864701716521094,
"train_speed(iter/s)": 0.130315
},
{
"epoch": 66.83720930232558,
"grad_norm": 0.6939531108133022,
"learning_rate": 1.4976115263260874e-07,
"loss": 0.5571429252624511,
"memory(GiB)": 74.97,
"step": 735,
"token_acc": 0.8357370669769121,
"train_speed(iter/s)": 0.130322
},
{
"epoch": 67.27906976744185,
"grad_norm": 0.7218530264815206,
"learning_rate": 1.4904235038305082e-07,
"loss": 0.5194293975830078,
"memory(GiB)": 74.97,
"step": 740,
"token_acc": 0.8460596389007441,
"train_speed(iter/s)": 0.130299
},
{
"epoch": 67.74418604651163,
"grad_norm": 1.285168120381986,
"learning_rate": 1.483201961539896e-07,
"loss": 0.5455545425415039,
"memory(GiB)": 74.97,
"step": 745,
"token_acc": 0.8313979656134666,
"train_speed(iter/s)": 0.130272
},
{
"epoch": 68.18604651162791,
"grad_norm": 1.94952748533025,
"learning_rate": 1.4759473930370737e-07,
"loss": 0.5241846084594727,
"memory(GiB)": 74.97,
"step": 750,
"token_acc": 0.8599992655699178,
"train_speed(iter/s)": 0.130347
},
{
"epoch": 68.65116279069767,
"grad_norm": 0.7193543863488733,
"learning_rate": 1.4686602941621615e-07,
"loss": 0.5322785377502441,
"memory(GiB)": 74.97,
"step": 755,
"token_acc": 0.8371367656348705,
"train_speed(iter/s)": 0.130295
},
{
"epoch": 69.09302325581395,
"grad_norm": 1.0867783614431274,
"learning_rate": 1.4613411629786877e-07,
"loss": 0.521461296081543,
"memory(GiB)": 74.97,
"step": 760,
"token_acc": 0.8467171046810017,
"train_speed(iter/s)": 0.130339
},
{
"epoch": 69.55813953488372,
"grad_norm": 0.7455956742708548,
"learning_rate": 1.4539904997395468e-07,
"loss": 0.5118254661560059,
"memory(GiB)": 74.97,
"step": 765,
"token_acc": 0.8578669369898095,
"train_speed(iter/s)": 0.13034
},
{
"epoch": 70.0,
"grad_norm": 0.8528350805883835,
"learning_rate": 1.4466088068528067e-07,
"loss": 0.5299886703491211,
"memory(GiB)": 74.97,
"step": 770,
"token_acc": 0.8476385063027893,
"train_speed(iter/s)": 0.130365
},
{
"epoch": 70.46511627906976,
"grad_norm": 0.6395748070686201,
"learning_rate": 1.4391965888473702e-07,
"loss": 0.5287624359130859,
"memory(GiB)": 74.97,
"step": 775,
"token_acc": 0.8381954887218045,
"train_speed(iter/s)": 0.130324
},
{
"epoch": 70.93023255813954,
"grad_norm": 0.842531216333987,
"learning_rate": 1.4317543523384928e-07,
"loss": 0.5287698745727539,
"memory(GiB)": 74.97,
"step": 780,
"token_acc": 0.8566830651213208,
"train_speed(iter/s)": 0.130359
},
{
"epoch": 71.37209302325581,
"grad_norm": 0.722140572381901,
"learning_rate": 1.4242826059931536e-07,
"loss": 0.5152388572692871,
"memory(GiB)": 74.97,
"step": 785,
"token_acc": 0.8451972291311229,
"train_speed(iter/s)": 0.130398
},
{
"epoch": 71.83720930232558,
"grad_norm": 1.1033571214972513,
"learning_rate": 1.4167818604952903e-07,
"loss": 0.5234486579895019,
"memory(GiB)": 74.97,
"step": 790,
"token_acc": 0.8461698837673958,
"train_speed(iter/s)": 0.130382
},
{
"epoch": 72.27906976744185,
"grad_norm": 0.7546592396468452,
"learning_rate": 1.4092526285108939e-07,
"loss": 0.5231525897979736,
"memory(GiB)": 74.97,
"step": 795,
"token_acc": 0.8471460044061686,
"train_speed(iter/s)": 0.130495
},
{
"epoch": 72.74418604651163,
"grad_norm": 0.7665462491639092,
"learning_rate": 1.4016954246529695e-07,
"loss": 0.5139668941497803,
"memory(GiB)": 74.97,
"step": 800,
"token_acc": 0.8447760249371035,
"train_speed(iter/s)": 0.130432
},
{
"epoch": 72.74418604651163,
"eval_loss": 0.6015437245368958,
"eval_runtime": 0.7284,
"eval_samples_per_second": 17.847,
"eval_steps_per_second": 2.746,
"eval_token_acc": 0.8434715968962544,
"step": 800
},
{
"epoch": 73.18604651162791,
"grad_norm": 0.7272029873141171,
"learning_rate": 1.3941107654463616e-07,
"loss": 0.5379150390625,
"memory(GiB)": 74.97,
"step": 805,
"token_acc": 0.8408949295116442,
"train_speed(iter/s)": 0.130058
},
{
"epoch": 73.65116279069767,
"grad_norm": 0.7995205555897585,
"learning_rate": 1.3864991692924522e-07,
"loss": 0.5211355209350585,
"memory(GiB)": 74.97,
"step": 810,
"token_acc": 0.8419526596025093,
"train_speed(iter/s)": 0.130097
},
{
"epoch": 74.09302325581395,
"grad_norm": 0.9263844311361451,
"learning_rate": 1.3788611564337276e-07,
"loss": 0.5166553497314453,
"memory(GiB)": 74.97,
"step": 815,
"token_acc": 0.8460784079221183,
"train_speed(iter/s)": 0.130043
},
{
"epoch": 74.55813953488372,
"grad_norm": 0.9100848478509656,
"learning_rate": 1.3711972489182207e-07,
"loss": 0.5152887344360352,
"memory(GiB)": 74.97,
"step": 820,
"token_acc": 0.8641304347826086,
"train_speed(iter/s)": 0.129973
},
{
"epoch": 75.0,
"grad_norm": 0.8520157723565999,
"learning_rate": 1.3635079705638297e-07,
"loss": 0.5118432998657226,
"memory(GiB)": 74.97,
"step": 825,
"token_acc": 0.8406333086780081,
"train_speed(iter/s)": 0.130038
},
{
"epoch": 75.46511627906976,
"grad_norm": 1.89559334384708,
"learning_rate": 1.3557938469225164e-07,
"loss": 0.5238603591918946,
"memory(GiB)": 74.97,
"step": 830,
"token_acc": 0.8296420958151015,
"train_speed(iter/s)": 0.129953
},
{
"epoch": 75.93023255813954,
"grad_norm": 0.8445066662231647,
"learning_rate": 1.3480554052443843e-07,
"loss": 0.5140830516815186,
"memory(GiB)": 74.97,
"step": 835,
"token_acc": 0.8494189687565236,
"train_speed(iter/s)": 0.13002
},
{
"epoch": 76.37209302325581,
"grad_norm": 1.5322849456525907,
"learning_rate": 1.340293174441643e-07,
"loss": 0.5148379325866699,
"memory(GiB)": 74.97,
"step": 840,
"token_acc": 0.8386292834890966,
"train_speed(iter/s)": 0.13
},
{
"epoch": 76.83720930232558,
"grad_norm": 0.7284489005308602,
"learning_rate": 1.332507685052457e-07,
"loss": 0.5148776531219482,
"memory(GiB)": 74.97,
"step": 845,
"token_acc": 0.8438160869248159,
"train_speed(iter/s)": 0.130024
},
{
"epoch": 77.27906976744185,
"grad_norm": 0.8254251521761937,
"learning_rate": 1.3246994692046836e-07,
"loss": 0.5172486305236816,
"memory(GiB)": 74.97,
"step": 850,
"token_acc": 0.8467165799851403,
"train_speed(iter/s)": 0.130065
},
{
"epoch": 77.74418604651163,
"grad_norm": 0.888794754410688,
"learning_rate": 1.3168690605795043e-07,
"loss": 0.515445613861084,
"memory(GiB)": 74.97,
"step": 855,
"token_acc": 0.8480349170918368,
"train_speed(iter/s)": 0.130098
},
{
"epoch": 78.18604651162791,
"grad_norm": 0.8024083233168969,
"learning_rate": 1.3090169943749475e-07,
"loss": 0.5077299118041992,
"memory(GiB)": 74.97,
"step": 860,
"token_acc": 0.8461068818804495,
"train_speed(iter/s)": 0.130157
},
{
"epoch": 78.65116279069767,
"grad_norm": 0.7968691650808981,
"learning_rate": 1.3011438072693074e-07,
"loss": 0.5154001235961914,
"memory(GiB)": 74.97,
"step": 865,
"token_acc": 0.8603395311236863,
"train_speed(iter/s)": 0.130118
},
{
"epoch": 79.09302325581395,
"grad_norm": 1.4489088486628856,
"learning_rate": 1.2932500373844649e-07,
"loss": 0.5220766544342041,
"memory(GiB)": 74.97,
"step": 870,
"token_acc": 0.8575108409621586,
"train_speed(iter/s)": 0.130187
},
{
"epoch": 79.55813953488372,
"grad_norm": 0.833164944608322,
"learning_rate": 1.2853362242491051e-07,
"loss": 0.5146864414215088,
"memory(GiB)": 74.97,
"step": 875,
"token_acc": 0.8354072612769832,
"train_speed(iter/s)": 0.130231
},
{
"epoch": 80.0,
"grad_norm": 1.0334544104049193,
"learning_rate": 1.2774029087618446e-07,
"loss": 0.5196131706237793,
"memory(GiB)": 74.97,
"step": 880,
"token_acc": 0.8273188610093036,
"train_speed(iter/s)": 0.130243
},
{
"epoch": 80.46511627906976,
"grad_norm": 0.7327428116602168,
"learning_rate": 1.2694506331542577e-07,
"loss": 0.5012516975402832,
"memory(GiB)": 74.97,
"step": 885,
"token_acc": 0.8552629297640307,
"train_speed(iter/s)": 0.130266
},
{
"epoch": 80.93023255813954,
"grad_norm": 0.7823436928202996,
"learning_rate": 1.2614799409538198e-07,
"loss": 0.5132665634155273,
"memory(GiB)": 74.97,
"step": 890,
"token_acc": 0.8614560088497263,
"train_speed(iter/s)": 0.130236
},
{
"epoch": 81.37209302325581,
"grad_norm": 0.8496813139641767,
"learning_rate": 1.253491376946754e-07,
"loss": 0.5047847747802734,
"memory(GiB)": 74.97,
"step": 895,
"token_acc": 0.8672797358731915,
"train_speed(iter/s)": 0.130316
},
{
"epoch": 81.83720930232558,
"grad_norm": 0.7662540093111049,
"learning_rate": 1.2454854871407992e-07,
"loss": 0.5070115566253662,
"memory(GiB)": 74.97,
"step": 900,
"token_acc": 0.8437890633276128,
"train_speed(iter/s)": 0.130345
},
{
"epoch": 82.27906976744185,
"grad_norm": 1.1403186852474703,
"learning_rate": 1.2374628187278885e-07,
"loss": 0.5135304450988769,
"memory(GiB)": 74.97,
"step": 905,
"token_acc": 0.8760885832099473,
"train_speed(iter/s)": 0.130362
},
{
"epoch": 82.74418604651163,
"grad_norm": 0.6850775896882327,
"learning_rate": 1.2294239200467515e-07,
"loss": 0.48610854148864746,
"memory(GiB)": 74.97,
"step": 910,
"token_acc": 0.864081524616199,
"train_speed(iter/s)": 0.130315
},
{
"epoch": 83.18604651162791,
"grad_norm": 1.7277139603374756,
"learning_rate": 1.2213693405454345e-07,
"loss": 0.5195373058319092,
"memory(GiB)": 74.97,
"step": 915,
"token_acc": 0.842862242005585,
"train_speed(iter/s)": 0.130334
},
{
"epoch": 83.65116279069767,
"grad_norm": 1.562225291111122,
"learning_rate": 1.213299630743747e-07,
"loss": 0.5000184059143067,
"memory(GiB)": 74.97,
"step": 920,
"token_acc": 0.8502656832421286,
"train_speed(iter/s)": 0.130352
},
{
"epoch": 84.09302325581395,
"grad_norm": 0.7432167354378622,
"learning_rate": 1.205215342195634e-07,
"loss": 0.4978955745697021,
"memory(GiB)": 74.97,
"step": 925,
"token_acc": 0.8535459925769887,
"train_speed(iter/s)": 0.130407
},
{
"epoch": 84.55813953488372,
"grad_norm": 2.2667269366172267,
"learning_rate": 1.1971170274514802e-07,
"loss": 0.5232599258422852,
"memory(GiB)": 74.97,
"step": 930,
"token_acc": 0.8631094983089064,
"train_speed(iter/s)": 0.130392
},
{
"epoch": 85.0,
"grad_norm": 0.7640021499203492,
"learning_rate": 1.1890052400203402e-07,
"loss": 0.48494710922241213,
"memory(GiB)": 74.97,
"step": 935,
"token_acc": 0.8383060054320491,
"train_speed(iter/s)": 0.130461
},
{
"epoch": 85.46511627906976,
"grad_norm": 0.797825246843515,
"learning_rate": 1.18088053433211e-07,
"loss": 0.4894867897033691,
"memory(GiB)": 74.97,
"step": 940,
"token_acc": 0.862217698107348,
"train_speed(iter/s)": 0.130536
},
{
"epoch": 85.93023255813954,
"grad_norm": 1.118805326320862,
"learning_rate": 1.1727434656996305e-07,
"loss": 0.5085083961486816,
"memory(GiB)": 74.97,
"step": 945,
"token_acc": 0.8468460041903622,
"train_speed(iter/s)": 0.130472
},
{
"epoch": 86.37209302325581,
"grad_norm": 0.8642381524493187,
"learning_rate": 1.1645945902807339e-07,
"loss": 0.501039457321167,
"memory(GiB)": 74.97,
"step": 950,
"token_acc": 0.8637289013917678,
"train_speed(iter/s)": 0.130524
},
{
"epoch": 86.83720930232558,
"grad_norm": 0.876594093463965,
"learning_rate": 1.1564344650402309e-07,
"loss": 0.5047001838684082,
"memory(GiB)": 74.97,
"step": 955,
"token_acc": 0.8469405442884382,
"train_speed(iter/s)": 0.130517
},
{
"epoch": 87.27906976744185,
"grad_norm": 1.2339377952227535,
"learning_rate": 1.1482636477118419e-07,
"loss": 0.5183281898498535,
"memory(GiB)": 74.97,
"step": 960,
"token_acc": 0.848177734504658,
"train_speed(iter/s)": 0.130587
},
{
"epoch": 87.74418604651163,
"grad_norm": 0.631851683029857,
"learning_rate": 1.1400826967600779e-07,
"loss": 0.483397912979126,
"memory(GiB)": 74.97,
"step": 965,
"token_acc": 0.8719364241861677,
"train_speed(iter/s)": 0.130556
},
{
"epoch": 88.18604651162791,
"grad_norm": 1.0951446409255636,
"learning_rate": 1.131892171342069e-07,
"loss": 0.5028903007507324,
"memory(GiB)": 74.97,
"step": 970,
"token_acc": 0.8738672544697527,
"train_speed(iter/s)": 0.130594
},
{
"epoch": 88.65116279069767,
"grad_norm": 0.7683275760751048,
"learning_rate": 1.1236926312693478e-07,
"loss": 0.4880162239074707,
"memory(GiB)": 74.97,
"step": 975,
"token_acc": 0.8594904599095622,
"train_speed(iter/s)": 0.130573
},
{
"epoch": 89.09302325581395,
"grad_norm": 6.943858471099767,
"learning_rate": 1.1154846369695863e-07,
"loss": 0.5035033226013184,
"memory(GiB)": 74.97,
"step": 980,
"token_acc": 0.8718237375361853,
"train_speed(iter/s)": 0.130588
},
{
"epoch": 89.55813953488372,
"grad_norm": 0.722153826562248,
"learning_rate": 1.1072687494482918e-07,
"loss": 0.5015533447265625,
"memory(GiB)": 74.97,
"step": 985,
"token_acc": 0.8497986934062595,
"train_speed(iter/s)": 0.130571
},
{
"epoch": 90.0,
"grad_norm": 0.7791081924406386,
"learning_rate": 1.0990455302504628e-07,
"loss": 0.4978206157684326,
"memory(GiB)": 74.97,
"step": 990,
"token_acc": 0.8598159926863901,
"train_speed(iter/s)": 0.130602
},
{
"epoch": 90.46511627906976,
"grad_norm": 0.7993364463951824,
"learning_rate": 1.0908155414222082e-07,
"loss": 0.47749814987182615,
"memory(GiB)": 74.97,
"step": 995,
"token_acc": 0.8648952240771585,
"train_speed(iter/s)": 0.13055
},
{
"epoch": 90.93023255813954,
"grad_norm": 0.9293826361291836,
"learning_rate": 1.0825793454723325e-07,
"loss": 0.4996511936187744,
"memory(GiB)": 74.97,
"step": 1000,
"token_acc": 0.8631259732808786,
"train_speed(iter/s)": 0.13058
},
{
"epoch": 90.93023255813954,
"eval_loss": 0.602931022644043,
"eval_runtime": 0.716,
"eval_samples_per_second": 18.155,
"eval_steps_per_second": 2.793,
"eval_token_acc": 0.8428010345818565,
"step": 1000
},
{
"epoch": 91.37209302325581,
"grad_norm": 1.1841722413103843,
"learning_rate": 1.0743375053338877e-07,
"loss": 0.5005837440490722,
"memory(GiB)": 74.97,
"step": 1005,
"token_acc": 0.8554249955862447,
"train_speed(iter/s)": 0.130297
},
{
"epoch": 91.83720930232558,
"grad_norm": 1.4009925352920263,
"learning_rate": 1.0660905843256993e-07,
"loss": 0.504381799697876,
"memory(GiB)": 74.97,
"step": 1010,
"token_acc": 0.8340634861704103,
"train_speed(iter/s)": 0.130275
},
{
"epoch": 92.27906976744185,
"grad_norm": 0.7236486242500604,
"learning_rate": 1.057839146113864e-07,
"loss": 0.4767627716064453,
"memory(GiB)": 74.97,
"step": 1015,
"token_acc": 0.8686680165507527,
"train_speed(iter/s)": 0.130259
},
{
"epoch": 92.74418604651163,
"grad_norm": 1.436377509073585,
"learning_rate": 1.0495837546732223e-07,
"loss": 0.5024114131927491,
"memory(GiB)": 74.97,
"step": 1020,
"token_acc": 0.8457889431344258,
"train_speed(iter/s)": 0.1303
},
{
"epoch": 93.18604651162791,
"grad_norm": 0.9279689257580228,
"learning_rate": 1.0413249742488131e-07,
"loss": 0.48839874267578126,
"memory(GiB)": 74.97,
"step": 1025,
"token_acc": 0.8741351653515239,
"train_speed(iter/s)": 0.130339
},
{
"epoch": 93.65116279069767,
"grad_norm": 1.5611563038818324,
"learning_rate": 1.033063369317308e-07,
"loss": 0.48693456649780276,
"memory(GiB)": 74.97,
"step": 1030,
"token_acc": 0.8771067535162163,
"train_speed(iter/s)": 0.130324
},
{
"epoch": 94.09302325581395,
"grad_norm": 1.1172420689296867,
"learning_rate": 1.0247995045484301e-07,
"loss": 0.5037758350372314,
"memory(GiB)": 74.97,
"step": 1035,
"token_acc": 0.8510888627433569,
"train_speed(iter/s)": 0.130325
},
{
"epoch": 94.55813953488372,
"grad_norm": 0.8609018320733309,
"learning_rate": 1.0165339447663586e-07,
"loss": 0.4941869258880615,
"memory(GiB)": 74.97,
"step": 1040,
"token_acc": 0.8680718468508801,
"train_speed(iter/s)": 0.130309
},
{
"epoch": 95.0,
"grad_norm": 1.9807352700715366,
"learning_rate": 1.0082672549111248e-07,
"loss": 0.4907430648803711,
"memory(GiB)": 74.97,
"step": 1045,
"token_acc": 0.8594207248443011,
"train_speed(iter/s)": 0.130342
},
{
"epoch": 95.46511627906976,
"grad_norm": 1.2132659915520214,
"learning_rate": 1e-07,
"loss": 0.5144547462463379,
"memory(GiB)": 74.97,
"step": 1050,
"token_acc": 0.8396730861192019,
"train_speed(iter/s)": 0.130346
},
{
"epoch": 95.93023255813954,
"grad_norm": 1.6515430345069437,
"learning_rate": 9.917327450888751e-08,
"loss": 0.46764235496520995,
"memory(GiB)": 74.97,
"step": 1055,
"token_acc": 0.8370761686275335,
"train_speed(iter/s)": 0.130367
},
{
"epoch": 96.37209302325581,
"grad_norm": 0.7140536621397322,
"learning_rate": 9.834660552336415e-08,
"loss": 0.48370823860168455,
"memory(GiB)": 74.97,
"step": 1060,
"token_acc": 0.8530308955807587,
"train_speed(iter/s)": 0.13034
},
{
"epoch": 96.83720930232558,
"grad_norm": 1.0809702853567489,
"learning_rate": 9.752004954515699e-08,
"loss": 0.49426803588867185,
"memory(GiB)": 74.97,
"step": 1065,
"token_acc": 0.8571793110216901,
"train_speed(iter/s)": 0.130299
},
{
"epoch": 97.27906976744185,
"grad_norm": 0.7163522482069422,
"learning_rate": 9.669366306826918e-08,
"loss": 0.4718944072723389,
"memory(GiB)": 74.97,
"step": 1070,
"token_acc": 0.8619141314767166,
"train_speed(iter/s)": 0.130378
},
{
"epoch": 97.74418604651163,
"grad_norm": 0.83946396188462,
"learning_rate": 9.586750257511866e-08,
"loss": 0.4911818504333496,
"memory(GiB)": 74.97,
"step": 1075,
"token_acc": 0.8650800071189347,
"train_speed(iter/s)": 0.1303
},
{
"epoch": 98.18604651162791,
"grad_norm": 0.9728064150742605,
"learning_rate": 9.504162453267775e-08,
"loss": 0.4725058078765869,
"memory(GiB)": 74.97,
"step": 1080,
"token_acc": 0.876843910806175,
"train_speed(iter/s)": 0.130339
},
{
"epoch": 98.65116279069767,
"grad_norm": 0.7909880351612323,
"learning_rate": 9.421608538861361e-08,
"loss": 0.4865569114685059,
"memory(GiB)": 74.97,
"step": 1085,
"token_acc": 0.8610528723363702,
"train_speed(iter/s)": 0.130299
},
{
"epoch": 99.09302325581395,
"grad_norm": 0.8239796766786783,
"learning_rate": 9.339094156743006e-08,
"loss": 0.49038195610046387,
"memory(GiB)": 74.97,
"step": 1090,
"token_acc": 0.8451910122126125,
"train_speed(iter/s)": 0.130321
},
{
"epoch": 99.55813953488372,
"grad_norm": 0.714832653552484,
"learning_rate": 9.256624946661125e-08,
"loss": 0.47361068725585936,
"memory(GiB)": 74.97,
"step": 1095,
"token_acc": 0.8569815516103255,
"train_speed(iter/s)": 0.13029
},
{
"epoch": 100.0,
"grad_norm": 1.1976610090490132,
"learning_rate": 9.174206545276677e-08,
"loss": 0.49490890502929685,
"memory(GiB)": 74.97,
"step": 1100,
"token_acc": 0.8424860734638123,
"train_speed(iter/s)": 0.130347
},
{
"epoch": 100.46511627906976,
"grad_norm": 0.8008632586934444,
"learning_rate": 9.091844585777917e-08,
"loss": 0.4697834014892578,
"memory(GiB)": 74.97,
"step": 1105,
"token_acc": 0.8580395195660596,
"train_speed(iter/s)": 0.130358
},
{
"epoch": 100.93023255813954,
"grad_norm": 0.6845439357302979,
"learning_rate": 9.009544697495372e-08,
"loss": 0.48686370849609373,
"memory(GiB)": 74.97,
"step": 1110,
"token_acc": 0.8574517231821122,
"train_speed(iter/s)": 0.13037
},
{
"epoch": 101.37209302325581,
"grad_norm": 0.7958237623480675,
"learning_rate": 8.927312505517084e-08,
"loss": 0.4824103832244873,
"memory(GiB)": 74.97,
"step": 1115,
"token_acc": 0.8406652121643884,
"train_speed(iter/s)": 0.130425
},
{
"epoch": 101.83720930232558,
"grad_norm": 0.7562640332446442,
"learning_rate": 8.845153630304139e-08,
"loss": 0.4883410453796387,
"memory(GiB)": 74.97,
"step": 1120,
"token_acc": 0.8654994502241394,
"train_speed(iter/s)": 0.130404
},
{
"epoch": 102.27906976744185,
"grad_norm": 0.91816981994612,
"learning_rate": 8.763073687306523e-08,
"loss": 0.47723941802978515,
"memory(GiB)": 74.97,
"step": 1125,
"token_acc": 0.8617350394493566,
"train_speed(iter/s)": 0.13041
},
{
"epoch": 102.74418604651163,
"grad_norm": 1.0207292255363964,
"learning_rate": 8.68107828657931e-08,
"loss": 0.48489856719970703,
"memory(GiB)": 74.97,
"step": 1130,
"token_acc": 0.8609944029573764,
"train_speed(iter/s)": 0.130389
},
{
"epoch": 103.18604651162791,
"grad_norm": 1.0123784499736115,
"learning_rate": 8.59917303239922e-08,
"loss": 0.4814739227294922,
"memory(GiB)": 74.97,
"step": 1135,
"token_acc": 0.8705958429561201,
"train_speed(iter/s)": 0.130431
},
{
"epoch": 103.65116279069767,
"grad_norm": 0.7408077875426933,
"learning_rate": 8.517363522881579e-08,
"loss": 0.47219066619873046,
"memory(GiB)": 74.97,
"step": 1140,
"token_acc": 0.8524354155002799,
"train_speed(iter/s)": 0.130432
},
{
"epoch": 104.09302325581395,
"grad_norm": 0.7314596110135979,
"learning_rate": 8.435655349597689e-08,
"loss": 0.4839695930480957,
"memory(GiB)": 74.97,
"step": 1145,
"token_acc": 0.8638605778320128,
"train_speed(iter/s)": 0.130438
},
{
"epoch": 104.55813953488372,
"grad_norm": 0.8022214373595549,
"learning_rate": 8.354054097192658e-08,
"loss": 0.4761360168457031,
"memory(GiB)": 74.97,
"step": 1150,
"token_acc": 0.8594843717513341,
"train_speed(iter/s)": 0.130347
},
{
"epoch": 105.0,
"grad_norm": 0.9319907606891521,
"learning_rate": 8.2725653430037e-08,
"loss": 0.4859612941741943,
"memory(GiB)": 74.97,
"step": 1155,
"token_acc": 0.8549445575922154,
"train_speed(iter/s)": 0.130405
},
{
"epoch": 105.46511627906976,
"grad_norm": 1.2930176911390905,
"learning_rate": 8.191194656678904e-08,
"loss": 0.4661128044128418,
"memory(GiB)": 74.97,
"step": 1160,
"token_acc": 0.8626862925482981,
"train_speed(iter/s)": 0.130393
},
{
"epoch": 105.93023255813954,
"grad_norm": 0.9575779480555059,
"learning_rate": 8.109947599796598e-08,
"loss": 0.484060001373291,
"memory(GiB)": 74.97,
"step": 1165,
"token_acc": 0.8556487381611823,
"train_speed(iter/s)": 0.130404
},
{
"epoch": 106.37209302325581,
"grad_norm": 0.7486234774787734,
"learning_rate": 8.028829725485198e-08,
"loss": 0.4818765640258789,
"memory(GiB)": 74.97,
"step": 1170,
"token_acc": 0.8624224886316659,
"train_speed(iter/s)": 0.130438
},
{
"epoch": 106.83720930232558,
"grad_norm": 0.7280471700597845,
"learning_rate": 7.947846578043658e-08,
"loss": 0.48406553268432617,
"memory(GiB)": 74.97,
"step": 1175,
"token_acc": 0.8612848675893546,
"train_speed(iter/s)": 0.130403
},
{
"epoch": 107.27906976744185,
"grad_norm": 0.9703016724934369,
"learning_rate": 7.867003692562532e-08,
"loss": 0.46012191772460936,
"memory(GiB)": 74.97,
"step": 1180,
"token_acc": 0.8720765414599575,
"train_speed(iter/s)": 0.13046
},
{
"epoch": 107.74418604651163,
"grad_norm": 1.7207486244429357,
"learning_rate": 7.786306594545656e-08,
"loss": 0.47897043228149416,
"memory(GiB)": 74.97,
"step": 1185,
"token_acc": 0.8613559838243008,
"train_speed(iter/s)": 0.130449
},
{
"epoch": 108.18604651162791,
"grad_norm": 1.0944806454073215,
"learning_rate": 7.705760799532485e-08,
"loss": 0.48472142219543457,
"memory(GiB)": 74.97,
"step": 1190,
"token_acc": 0.8510737233682787,
"train_speed(iter/s)": 0.130447
},
{
"epoch": 108.65116279069767,
"grad_norm": 0.7340918962562681,
"learning_rate": 7.625371812721114e-08,
"loss": 0.46958436965942385,
"memory(GiB)": 74.97,
"step": 1195,
"token_acc": 0.8719202394209354,
"train_speed(iter/s)": 0.130463
},
{
"epoch": 109.09302325581395,
"grad_norm": 0.939464587476609,
"learning_rate": 7.545145128592009e-08,
"loss": 0.47149295806884767,
"memory(GiB)": 74.97,
"step": 1200,
"token_acc": 0.8800350262697023,
"train_speed(iter/s)": 0.130453
},
{
"epoch": 109.09302325581395,
"eval_loss": 0.6058527827262878,
"eval_runtime": 0.7066,
"eval_samples_per_second": 18.397,
"eval_steps_per_second": 2.83,
"eval_token_acc": 0.8434715968962544,
"step": 1200
},
{
"epoch": 109.55813953488372,
"grad_norm": 0.8652359563773929,
"learning_rate": 7.465086230532459e-08,
"loss": 0.476532506942749,
"memory(GiB)": 74.97,
"step": 1205,
"token_acc": 0.8694151027245068,
"train_speed(iter/s)": 0.130187
},
{
"epoch": 110.0,
"grad_norm": 0.8098360520222708,
"learning_rate": 7.385200590461802e-08,
"loss": 0.4804817199707031,
"memory(GiB)": 74.97,
"step": 1210,
"token_acc": 0.8504993058976311,
"train_speed(iter/s)": 0.130211
},
{
"epoch": 110.46511627906976,
"grad_norm": 0.7864179053648999,
"learning_rate": 7.305493668457419e-08,
"loss": 0.46163101196289064,
"memory(GiB)": 74.97,
"step": 1215,
"token_acc": 0.8520807581376184,
"train_speed(iter/s)": 0.130209
},
{
"epoch": 110.93023255813954,
"grad_norm": 1.2076707405286862,
"learning_rate": 7.225970912381556e-08,
"loss": 0.4753293991088867,
"memory(GiB)": 74.97,
"step": 1220,
"token_acc": 0.8554707472061939,
"train_speed(iter/s)": 0.1302
},
{
"epoch": 111.37209302325581,
"grad_norm": 0.871709312109685,
"learning_rate": 7.146637757508949e-08,
"loss": 0.47620530128479005,
"memory(GiB)": 74.97,
"step": 1225,
"token_acc": 0.8760574752720532,
"train_speed(iter/s)": 0.13021
},
{
"epoch": 111.83720930232558,
"grad_norm": 0.7334760311164147,
"learning_rate": 7.067499626155353e-08,
"loss": 0.46177024841308595,
"memory(GiB)": 74.97,
"step": 1230,
"token_acc": 0.8513760840189522,
"train_speed(iter/s)": 0.130203
},
{
"epoch": 112.27906976744185,
"grad_norm": 0.9429213919362676,
"learning_rate": 6.988561927306926e-08,
"loss": 0.4705217361450195,
"memory(GiB)": 74.97,
"step": 1235,
"token_acc": 0.8782852564102565,
"train_speed(iter/s)": 0.130244
},
{
"epoch": 112.74418604651163,
"grad_norm": 1.0006229504211153,
"learning_rate": 6.909830056250527e-08,
"loss": 0.46991333961486814,
"memory(GiB)": 74.97,
"step": 1240,
"token_acc": 0.8570367690462136,
"train_speed(iter/s)": 0.130239
},
{
"epoch": 113.18604651162791,
"grad_norm": 1.5600658321413452,
"learning_rate": 6.831309394204956e-08,
"loss": 0.5063477039337159,
"memory(GiB)": 74.97,
"step": 1245,
"token_acc": 0.8328871703351179,
"train_speed(iter/s)": 0.130265
},
{
"epoch": 113.65116279069767,
"grad_norm": 0.7100324996989047,
"learning_rate": 6.753005307953166e-08,
"loss": 0.4718203544616699,
"memory(GiB)": 74.97,
"step": 1250,
"token_acc": 0.846406587098945,
"train_speed(iter/s)": 0.130269
},
{
"epoch": 114.09302325581395,
"grad_norm": 1.02085122390004,
"learning_rate": 6.674923149475432e-08,
"loss": 0.46040911674499513,
"memory(GiB)": 74.97,
"step": 1255,
"token_acc": 0.8600188738597043,
"train_speed(iter/s)": 0.130273
},
{
"epoch": 114.55813953488372,
"grad_norm": 0.7602372463858895,
"learning_rate": 6.597068255583569e-08,
"loss": 0.4706200122833252,
"memory(GiB)": 74.97,
"step": 1260,
"token_acc": 0.850320256204964,
"train_speed(iter/s)": 0.130273
},
{
"epoch": 115.0,
"grad_norm": 0.880014706373256,
"learning_rate": 6.519445947556154e-08,
"loss": 0.4695608139038086,
"memory(GiB)": 74.97,
"step": 1265,
"token_acc": 0.8624032731477363,
"train_speed(iter/s)": 0.1303
},
{
"epoch": 115.46511627906976,
"grad_norm": 1.2127086778344998,
"learning_rate": 6.442061530774834e-08,
"loss": 0.47931528091430664,
"memory(GiB)": 74.97,
"step": 1270,
"token_acc": 0.844140842826416,
"train_speed(iter/s)": 0.130299
},
{
"epoch": 115.93023255813954,
"grad_norm": 1.083099670256692,
"learning_rate": 6.3649202943617e-08,
"loss": 0.4720285415649414,
"memory(GiB)": 74.97,
"step": 1275,
"token_acc": 0.8607366273040511,
"train_speed(iter/s)": 0.130282
},
{
"epoch": 116.37209302325581,
"grad_norm": 0.767737493501071,
"learning_rate": 6.288027510817791e-08,
"loss": 0.4558729648590088,
"memory(GiB)": 74.97,
"step": 1280,
"token_acc": 0.8583624139902605,
"train_speed(iter/s)": 0.130308
},
{
"epoch": 116.83720930232558,
"grad_norm": 1.6440663696409548,
"learning_rate": 6.211388435662721e-08,
"loss": 0.47510428428649903,
"memory(GiB)": 74.97,
"step": 1285,
"token_acc": 0.8627756653992396,
"train_speed(iter/s)": 0.130302
},
{
"epoch": 117.27906976744185,
"grad_norm": 0.8947253671514697,
"learning_rate": 6.135008307075479e-08,
"loss": 0.48160324096679685,
"memory(GiB)": 74.97,
"step": 1290,
"token_acc": 0.8668218530666949,
"train_speed(iter/s)": 0.130333
},
{
"epoch": 117.74418604651163,
"grad_norm": 0.7850295846326071,
"learning_rate": 6.058892345536387e-08,
"loss": 0.4656852722167969,
"memory(GiB)": 74.97,
"step": 1295,
"token_acc": 0.8760795485278474,
"train_speed(iter/s)": 0.13033
},
{
"epoch": 118.18604651162791,
"grad_norm": 0.7825259584750254,
"learning_rate": 5.983045753470308e-08,
"loss": 0.4575822830200195,
"memory(GiB)": 74.97,
"step": 1300,
"token_acc": 0.8609389541215373,
"train_speed(iter/s)": 0.130341
},
{
"epoch": 118.65116279069767,
"grad_norm": 1.3046914177444136,
"learning_rate": 5.9074737148910606e-08,
"loss": 0.45604352951049804,
"memory(GiB)": 74.97,
"step": 1305,
"token_acc": 0.8543227692364619,
"train_speed(iter/s)": 0.130361
},
{
"epoch": 119.09302325581395,
"grad_norm": 1.1212563362731731,
"learning_rate": 5.832181395047098e-08,
"loss": 0.4669440269470215,
"memory(GiB)": 74.97,
"step": 1310,
"token_acc": 0.868457034673772,
"train_speed(iter/s)": 0.130368
},
{
"epoch": 119.55813953488372,
"grad_norm": 0.8339959692059283,
"learning_rate": 5.7571739400684635e-08,
"loss": 0.47755279541015627,
"memory(GiB)": 74.97,
"step": 1315,
"token_acc": 0.8567007810897974,
"train_speed(iter/s)": 0.130336
},
{
"epoch": 120.0,
"grad_norm": 0.9954950376422352,
"learning_rate": 5.682456476615072e-08,
"loss": 0.4645816802978516,
"memory(GiB)": 74.97,
"step": 1320,
"token_acc": 0.8440125792344356,
"train_speed(iter/s)": 0.130377
},
{
"epoch": 120.46511627906976,
"grad_norm": 0.6612384359472665,
"learning_rate": 5.6080341115262976e-08,
"loss": 0.45533552169799807,
"memory(GiB)": 74.97,
"step": 1325,
"token_acc": 0.8586772074823821,
"train_speed(iter/s)": 0.130382
},
{
"epoch": 120.93023255813954,
"grad_norm": 1.098228237433943,
"learning_rate": 5.533911931471935e-08,
"loss": 0.4692089080810547,
"memory(GiB)": 74.97,
"step": 1330,
"token_acc": 0.8699830311690632,
"train_speed(iter/s)": 0.13038
},
{
"epoch": 121.37209302325581,
"grad_norm": 0.7854095634086957,
"learning_rate": 5.460095002604532e-08,
"loss": 0.46064138412475586,
"memory(GiB)": 74.97,
"step": 1335,
"token_acc": 0.8677652211026369,
"train_speed(iter/s)": 0.130369
},
{
"epoch": 121.83720930232558,
"grad_norm": 2.1438550225472506,
"learning_rate": 5.386588370213123e-08,
"loss": 0.47399129867553713,
"memory(GiB)": 74.97,
"step": 1340,
"token_acc": 0.8342529761205946,
"train_speed(iter/s)": 0.130402
},
{
"epoch": 122.27906976744185,
"grad_norm": 0.7685065811470108,
"learning_rate": 5.313397058378386e-08,
"loss": 0.46064081192016604,
"memory(GiB)": 74.97,
"step": 1345,
"token_acc": 0.8655901006480077,
"train_speed(iter/s)": 0.130445
},
{
"epoch": 122.74418604651163,
"grad_norm": 0.7484657906315015,
"learning_rate": 5.240526069629264e-08,
"loss": 0.4805141925811768,
"memory(GiB)": 74.97,
"step": 1350,
"token_acc": 0.8551861286142021,
"train_speed(iter/s)": 0.13041
},
{
"epoch": 123.18604651162791,
"grad_norm": 0.7881353244361399,
"learning_rate": 5.1679803846010403e-08,
"loss": 0.4467328071594238,
"memory(GiB)": 74.97,
"step": 1355,
"token_acc": 0.8620426261271331,
"train_speed(iter/s)": 0.13044
},
{
"epoch": 123.65116279069767,
"grad_norm": 0.9062139816497382,
"learning_rate": 5.0957649616949215e-08,
"loss": 0.4628152847290039,
"memory(GiB)": 74.97,
"step": 1360,
"token_acc": 0.8650447427293065,
"train_speed(iter/s)": 0.13041
},
{
"epoch": 124.09302325581395,
"grad_norm": 0.7919450228717162,
"learning_rate": 5.0238847367391314e-08,
"loss": 0.45865640640258787,
"memory(GiB)": 74.97,
"step": 1365,
"token_acc": 0.8601923709624354,
"train_speed(iter/s)": 0.130427
},
{
"epoch": 124.55813953488372,
"grad_norm": 0.6379402091206297,
"learning_rate": 4.952344622651565e-08,
"loss": 0.4563908576965332,
"memory(GiB)": 74.97,
"step": 1370,
"token_acc": 0.8681956209045869,
"train_speed(iter/s)": 0.130458
},
{
"epoch": 125.0,
"grad_norm": 1.558446245583931,
"learning_rate": 4.8811495091039923e-08,
"loss": 0.4724306106567383,
"memory(GiB)": 74.97,
"step": 1375,
"token_acc": 0.860769332539525,
"train_speed(iter/s)": 0.13045
},
{
"epoch": 125.46511627906976,
"grad_norm": 0.7458216850987389,
"learning_rate": 4.810304262187851e-08,
"loss": 0.46082763671875,
"memory(GiB)": 74.97,
"step": 1380,
"token_acc": 0.8532610918012676,
"train_speed(iter/s)": 0.130461
},
{
"epoch": 125.93023255813954,
"grad_norm": 3.0851446793520743,
"learning_rate": 4.739813724081661e-08,
"loss": 0.47005910873413087,
"memory(GiB)": 74.97,
"step": 1385,
"token_acc": 0.8665938394822649,
"train_speed(iter/s)": 0.13044
},
{
"epoch": 126.37209302325581,
"grad_norm": 0.6792623717144913,
"learning_rate": 4.6696827127200644e-08,
"loss": 0.44311208724975587,
"memory(GiB)": 74.97,
"step": 1390,
"token_acc": 0.8821935667868566,
"train_speed(iter/s)": 0.130476
},
{
"epoch": 126.83720930232558,
"grad_norm": 0.8236864003533888,
"learning_rate": 4.599916021464531e-08,
"loss": 0.4629988670349121,
"memory(GiB)": 74.97,
"step": 1395,
"token_acc": 0.8768711824231926,
"train_speed(iter/s)": 0.130449
},
{
"epoch": 127.27906976744185,
"grad_norm": 1.9627254088333494,
"learning_rate": 4.530518418775733e-08,
"loss": 0.48299736976623536,
"memory(GiB)": 74.97,
"step": 1400,
"token_acc": 0.8665078296300133,
"train_speed(iter/s)": 0.130474
},
{
"epoch": 127.27906976744185,
"eval_loss": 0.6098422408103943,
"eval_runtime": 0.6984,
"eval_samples_per_second": 18.613,
"eval_steps_per_second": 2.864,
"eval_token_acc": 0.8434715968962544,
"step": 1400
},
{
"epoch": 127.74418604651163,
"grad_norm": 0.8045410987121008,
"learning_rate": 4.4614946478876306e-08,
"loss": 0.45166778564453125,
"memory(GiB)": 74.97,
"step": 1405,
"token_acc": 0.8502284891267625,
"train_speed(iter/s)": 0.130135
},
{
"epoch": 128.1860465116279,
"grad_norm": 3.0860005142595193,
"learning_rate": 4.392849426483274e-08,
"loss": 0.4591231822967529,
"memory(GiB)": 74.97,
"step": 1410,
"token_acc": 0.8607654013690106,
"train_speed(iter/s)": 0.130182
},
{
"epoch": 128.65116279069767,
"grad_norm": 1.1087292922703431,
"learning_rate": 4.324587446372364e-08,
"loss": 0.474017858505249,
"memory(GiB)": 74.97,
"step": 1415,
"token_acc": 0.8642677323380807,
"train_speed(iter/s)": 0.130185
},
{
"epoch": 129.09302325581396,
"grad_norm": 1.0228218711643116,
"learning_rate": 4.256713373170564e-08,
"loss": 0.4518399715423584,
"memory(GiB)": 74.97,
"step": 1420,
"token_acc": 0.8715422807155804,
"train_speed(iter/s)": 0.130231
},
{
"epoch": 129.5581395348837,
"grad_norm": 0.8006830274772974,
"learning_rate": 4.1892318459806175e-08,
"loss": 0.46432695388793943,
"memory(GiB)": 74.97,
"step": 1425,
"token_acc": 0.8545170039641143,
"train_speed(iter/s)": 0.130209
},
{
"epoch": 130.0,
"grad_norm": 1.0265209199413956,
"learning_rate": 4.1221474770752695e-08,
"loss": 0.44231014251708983,
"memory(GiB)": 74.97,
"step": 1430,
"token_acc": 0.8699077672962582,
"train_speed(iter/s)": 0.130226
},
{
"epoch": 130.46511627906978,
"grad_norm": 0.8188302695487818,
"learning_rate": 4.055464851582021e-08,
"loss": 0.4583402156829834,
"memory(GiB)": 74.97,
"step": 1435,
"token_acc": 0.8707192214769637,
"train_speed(iter/s)": 0.130227
},
{
"epoch": 130.93023255813952,
"grad_norm": 0.8252804258656437,
"learning_rate": 3.989188527169749e-08,
"loss": 0.46385898590087893,
"memory(GiB)": 74.97,
"step": 1440,
"token_acc": 0.8788993882124901,
"train_speed(iter/s)": 0.130217
},
{
"epoch": 131.37209302325581,
"grad_norm": 0.8872565038088099,
"learning_rate": 3.923323033737188e-08,
"loss": 0.4746572017669678,
"memory(GiB)": 74.97,
"step": 1445,
"token_acc": 0.8457805814107371,
"train_speed(iter/s)": 0.130274
},
{
"epoch": 131.8372093023256,
"grad_norm": 1.1550562475118538,
"learning_rate": 3.857872873103322e-08,
"loss": 0.44470739364624023,
"memory(GiB)": 74.97,
"step": 1450,
"token_acc": 0.8544268219233085,
"train_speed(iter/s)": 0.130237
},
{
"epoch": 132.27906976744185,
"grad_norm": 0.7549641151940925,
"learning_rate": 3.7928425186996883e-08,
"loss": 0.46361541748046875,
"memory(GiB)": 74.97,
"step": 1455,
"token_acc": 0.8666913610733835,
"train_speed(iter/s)": 0.130263
},
{
"epoch": 132.74418604651163,
"grad_norm": 0.8290416052434509,
"learning_rate": 3.7282364152646295e-08,
"loss": 0.45833826065063477,
"memory(GiB)": 74.97,
"step": 1460,
"token_acc": 0.8540965869971476,
"train_speed(iter/s)": 0.130268
},
{
"epoch": 133.1860465116279,
"grad_norm": 0.8534414388843884,
"learning_rate": 3.664058978539495e-08,
"loss": 0.4486083507537842,
"memory(GiB)": 74.97,
"step": 1465,
"token_acc": 0.8745001477395844,
"train_speed(iter/s)": 0.130303
},
{
"epoch": 133.65116279069767,
"grad_norm": 0.8212409711926915,
"learning_rate": 3.600314594966833e-08,
"loss": 0.4511223316192627,
"memory(GiB)": 74.97,
"step": 1470,
"token_acc": 0.8836138231258182,
"train_speed(iter/s)": 0.130277
},
{
"epoch": 134.09302325581396,
"grad_norm": 1.0201258607355366,
"learning_rate": 3.53700762139059e-08,
"loss": 0.48140726089477537,
"memory(GiB)": 74.97,
"step": 1475,
"token_acc": 0.8690138329330979,
"train_speed(iter/s)": 0.130297
},
{
"epoch": 134.5581395348837,
"grad_norm": 0.7617400291414114,
"learning_rate": 3.474142384758313e-08,
"loss": 0.4485898017883301,
"memory(GiB)": 74.97,
"step": 1480,
"token_acc": 0.864516129032258,
"train_speed(iter/s)": 0.130302
},
{
"epoch": 135.0,
"grad_norm": 1.0462722965857336,
"learning_rate": 3.41172318182542e-08,
"loss": 0.45436367988586424,
"memory(GiB)": 74.97,
"step": 1485,
"token_acc": 0.8761111111111111,
"train_speed(iter/s)": 0.130327
},
{
"epoch": 135.46511627906978,
"grad_norm": 0.8458888970103854,
"learning_rate": 3.349754278861516e-08,
"loss": 0.4582218170166016,
"memory(GiB)": 74.97,
"step": 1490,
"token_acc": 0.8546937352291658,
"train_speed(iter/s)": 0.130305
},
{
"epoch": 135.93023255813952,
"grad_norm": 1.0227265853515555,
"learning_rate": 3.2882399113588066e-08,
"loss": 0.44946842193603515,
"memory(GiB)": 74.97,
"step": 1495,
"token_acc": 0.8783018139714396,
"train_speed(iter/s)": 0.130303
},
{
"epoch": 136.37209302325581,
"grad_norm": 0.9319234599915691,
"learning_rate": 3.227184283742591e-08,
"loss": 0.4635480880737305,
"memory(GiB)": 74.97,
"step": 1500,
"token_acc": 0.8700904636260837,
"train_speed(iter/s)": 0.130364
},
{
"epoch": 136.8372093023256,
"grad_norm": 0.7448189618376913,
"learning_rate": 3.166591569083916e-08,
"loss": 0.45705451965332033,
"memory(GiB)": 74.97,
"step": 1505,
"token_acc": 0.8637782801950199,
"train_speed(iter/s)": 0.130342
},
{
"epoch": 137.27906976744185,
"grad_norm": 1.1529755818910967,
"learning_rate": 3.106465908814342e-08,
"loss": 0.45585179328918457,
"memory(GiB)": 74.97,
"step": 1510,
"token_acc": 0.8591232839778012,
"train_speed(iter/s)": 0.130363
},
{
"epoch": 137.74418604651163,
"grad_norm": 0.8260584253674346,
"learning_rate": 3.04681141244288e-08,
"loss": 0.46056065559387205,
"memory(GiB)": 74.97,
"step": 1515,
"token_acc": 0.8692046456648592,
"train_speed(iter/s)": 0.130323
},
{
"epoch": 138.1860465116279,
"grad_norm": 1.052950893981831,
"learning_rate": 2.987632157275114e-08,
"loss": 0.45586233139038085,
"memory(GiB)": 74.97,
"step": 1520,
"token_acc": 0.863406408094435,
"train_speed(iter/s)": 0.130349
},
{
"epoch": 138.65116279069767,
"grad_norm": 0.7991045733474148,
"learning_rate": 2.928932188134525e-08,
"loss": 0.4538632869720459,
"memory(GiB)": 74.97,
"step": 1525,
"token_acc": 0.8717533864610406,
"train_speed(iter/s)": 0.130328
},
{
"epoch": 139.09302325581396,
"grad_norm": 1.434227664193626,
"learning_rate": 2.8707155170860297e-08,
"loss": 0.46680850982666017,
"memory(GiB)": 74.97,
"step": 1530,
"token_acc": 0.8410292981517798,
"train_speed(iter/s)": 0.130343
},
{
"epoch": 139.5581395348837,
"grad_norm": 0.7631653651545482,
"learning_rate": 2.8129861231617612e-08,
"loss": 0.44613943099975584,
"memory(GiB)": 74.97,
"step": 1535,
"token_acc": 0.8678071275982503,
"train_speed(iter/s)": 0.130345
},
{
"epoch": 140.0,
"grad_norm": 1.1321589909418222,
"learning_rate": 2.7557479520891104e-08,
"loss": 0.4599461078643799,
"memory(GiB)": 74.97,
"step": 1540,
"token_acc": 0.8688507394846334,
"train_speed(iter/s)": 0.130373
},
{
"epoch": 140.46511627906978,
"grad_norm": 0.9020009960345104,
"learning_rate": 2.699004916021038e-08,
"loss": 0.4559918403625488,
"memory(GiB)": 74.97,
"step": 1545,
"token_acc": 0.8607777938412606,
"train_speed(iter/s)": 0.130363
},
{
"epoch": 140.93023255813952,
"grad_norm": 0.7719651412897752,
"learning_rate": 2.642760893268684e-08,
"loss": 0.459440279006958,
"memory(GiB)": 74.97,
"step": 1550,
"token_acc": 0.881872014598279,
"train_speed(iter/s)": 0.130356
},
{
"epoch": 141.37209302325581,
"grad_norm": 0.7388402954698886,
"learning_rate": 2.5870197280362915e-08,
"loss": 0.42969484329223634,
"memory(GiB)": 74.97,
"step": 1555,
"token_acc": 0.8883613399742432,
"train_speed(iter/s)": 0.130344
},
{
"epoch": 141.8372093023256,
"grad_norm": 0.7267745408294942,
"learning_rate": 2.5317852301584643e-08,
"loss": 0.4578805923461914,
"memory(GiB)": 74.97,
"step": 1560,
"token_acc": 0.8614507600793126,
"train_speed(iter/s)": 0.130359
},
{
"epoch": 142.27906976744185,
"grad_norm": 0.7380347392311346,
"learning_rate": 2.477061174839755e-08,
"loss": 0.465103816986084,
"memory(GiB)": 74.97,
"step": 1565,
"token_acc": 0.8470271187879302,
"train_speed(iter/s)": 0.130395
},
{
"epoch": 142.74418604651163,
"grad_norm": 0.8136693379385729,
"learning_rate": 2.4228513023966547e-08,
"loss": 0.45352745056152344,
"memory(GiB)": 74.97,
"step": 1570,
"token_acc": 0.8680695298875026,
"train_speed(iter/s)": 0.130411
},
{
"epoch": 143.1860465116279,
"grad_norm": 0.8229594379364835,
"learning_rate": 2.3691593180019364e-08,
"loss": 0.46236839294433596,
"memory(GiB)": 74.97,
"step": 1575,
"token_acc": 0.8841950432568365,
"train_speed(iter/s)": 0.130422
},
{
"epoch": 143.65116279069767,
"grad_norm": 0.8764589511746724,
"learning_rate": 2.315988891431412e-08,
"loss": 0.44404191970825196,
"memory(GiB)": 74.97,
"step": 1580,
"token_acc": 0.8711640164847799,
"train_speed(iter/s)": 0.130418
},
{
"epoch": 144.09302325581396,
"grad_norm": 0.7989067686578916,
"learning_rate": 2.263343656813107e-08,
"loss": 0.46502885818481443,
"memory(GiB)": 74.97,
"step": 1585,
"token_acc": 0.8433810096689391,
"train_speed(iter/s)": 0.130431
},
{
"epoch": 144.5581395348837,
"grad_norm": 0.7139519442470533,
"learning_rate": 2.2112272123788767e-08,
"loss": 0.4445913314819336,
"memory(GiB)": 74.97,
"step": 1590,
"token_acc": 0.8719508074869924,
"train_speed(iter/s)": 0.130449
},
{
"epoch": 145.0,
"grad_norm": 0.854611201984833,
"learning_rate": 2.1596431202184705e-08,
"loss": 0.45667543411254885,
"memory(GiB)": 74.97,
"step": 1595,
"token_acc": 0.8543597957753529,
"train_speed(iter/s)": 0.130485
},
{
"epoch": 145.46511627906978,
"grad_norm": 0.8370879177525832,
"learning_rate": 2.108594906036065e-08,
"loss": 0.45542278289794924,
"memory(GiB)": 74.97,
"step": 1600,
"token_acc": 0.8767741127199183,
"train_speed(iter/s)": 0.130498
},
{
"epoch": 145.46511627906978,
"eval_loss": 0.612120509147644,
"eval_runtime": 0.6973,
"eval_samples_per_second": 18.643,
"eval_steps_per_second": 2.868,
"eval_token_acc": 0.8432800076635693,
"step": 1600
},
{
"epoch": 145.93023255813952,
"grad_norm": 1.8496079436558843,
"learning_rate": 2.0580860589092895e-08,
"loss": 0.4458228588104248,
"memory(GiB)": 74.97,
"step": 1605,
"token_acc": 0.8655583853748735,
"train_speed(iter/s)": 0.130281
},
{
"epoch": 146.37209302325581,
"grad_norm": 0.8949636135857424,
"learning_rate": 2.008120031050753e-08,
"loss": 0.4534448146820068,
"memory(GiB)": 74.97,
"step": 1610,
"token_acc": 0.8604011376099039,
"train_speed(iter/s)": 0.130303
},
{
"epoch": 146.8372093023256,
"grad_norm": 1.4041818864948623,
"learning_rate": 1.9587002375720862e-08,
"loss": 0.46073060035705565,
"memory(GiB)": 74.97,
"step": 1615,
"token_acc": 0.8637630263007214,
"train_speed(iter/s)": 0.130307
},
{
"epoch": 147.27906976744185,
"grad_norm": 0.8001120538073951,
"learning_rate": 1.9098300562505266e-08,
"loss": 0.44887795448303225,
"memory(GiB)": 74.97,
"step": 1620,
"token_acc": 0.8565026887074288,
"train_speed(iter/s)": 0.130322
},
{
"epoch": 147.74418604651163,
"grad_norm": 1.3994263394516653,
"learning_rate": 1.8615128272980507e-08,
"loss": 0.4529706001281738,
"memory(GiB)": 74.97,
"step": 1625,
"token_acc": 0.8614175728232399,
"train_speed(iter/s)": 0.130317
},
{
"epoch": 148.1860465116279,
"grad_norm": 0.7986626000623837,
"learning_rate": 1.8137518531330763e-08,
"loss": 0.45129976272583006,
"memory(GiB)": 74.97,
"step": 1630,
"token_acc": 0.88025613660619,
"train_speed(iter/s)": 0.130337
},
{
"epoch": 148.65116279069767,
"grad_norm": 0.7053069152982997,
"learning_rate": 1.7665503981547425e-08,
"loss": 0.45789132118225095,
"memory(GiB)": 74.97,
"step": 1635,
"token_acc": 0.8718905932360007,
"train_speed(iter/s)": 0.13032
},
{
"epoch": 149.09302325581396,
"grad_norm": 1.327231576897276,
"learning_rate": 1.7199116885197997e-08,
"loss": 0.45948057174682616,
"memory(GiB)": 74.97,
"step": 1640,
"token_acc": 0.8687992670776631,
"train_speed(iter/s)": 0.13036
},
{
"epoch": 149.5581395348837,
"grad_norm": 1.8690818535078901,
"learning_rate": 1.6738389119220965e-08,
"loss": 0.4487407684326172,
"memory(GiB)": 74.97,
"step": 1645,
"token_acc": 0.8717857813184292,
"train_speed(iter/s)": 0.130356
},
{
"epoch": 150.0,
"grad_norm": 1.4986410325133508,
"learning_rate": 1.6283352173747144e-08,
"loss": 0.46256265640258787,
"memory(GiB)": 74.97,
"step": 1650,
"token_acc": 0.8687363834422658,
"train_speed(iter/s)": 0.130382
},
{
"epoch": 150.46511627906978,
"grad_norm": 0.9212362231868645,
"learning_rate": 1.5834037149947288e-08,
"loss": 0.45532588958740233,
"memory(GiB)": 74.97,
"step": 1655,
"token_acc": 0.8637431617337635,
"train_speed(iter/s)": 0.130388
},
{
"epoch": 150.93023255813952,
"grad_norm": 0.7589204558012844,
"learning_rate": 1.5390474757906446e-08,
"loss": 0.4434979438781738,
"memory(GiB)": 74.97,
"step": 1660,
"token_acc": 0.8520731295389292,
"train_speed(iter/s)": 0.130379
},
{
"epoch": 151.37209302325581,
"grad_norm": 0.7171576824463824,
"learning_rate": 1.495269531452491e-08,
"loss": 0.45127115249633787,
"memory(GiB)": 74.97,
"step": 1665,
"token_acc": 0.8684483740245822,
"train_speed(iter/s)": 0.130418
},
{
"epoch": 151.8372093023256,
"grad_norm": 0.7991722745507821,
"learning_rate": 1.4520728741446087e-08,
"loss": 0.4588929176330566,
"memory(GiB)": 74.97,
"step": 1670,
"token_acc": 0.8637134079593206,
"train_speed(iter/s)": 0.130412
},
{
"epoch": 152.27906976744185,
"grad_norm": 1.041259181485301,
"learning_rate": 1.409460456301147e-08,
"loss": 0.4453131675720215,
"memory(GiB)": 74.97,
"step": 1675,
"token_acc": 0.8544123886296139,
"train_speed(iter/s)": 0.130435
},
{
"epoch": 152.74418604651163,
"grad_norm": 0.8313710748011637,
"learning_rate": 1.367435190424261e-08,
"loss": 0.45928287506103516,
"memory(GiB)": 74.97,
"step": 1680,
"token_acc": 0.8679964503247146,
"train_speed(iter/s)": 0.13042
},
{
"epoch": 153.1860465116279,
"grad_norm": 0.8698596114757391,
"learning_rate": 1.3259999488850471e-08,
"loss": 0.4635627746582031,
"memory(GiB)": 74.97,
"step": 1685,
"token_acc": 0.8450357565069091,
"train_speed(iter/s)": 0.130437
},
{
"epoch": 153.65116279069767,
"grad_norm": 0.8269395824162962,
"learning_rate": 1.285157563727226e-08,
"loss": 0.44847860336303713,
"memory(GiB)": 74.97,
"step": 1690,
"token_acc": 0.8680811179277437,
"train_speed(iter/s)": 0.130425
},
{
"epoch": 154.09302325581396,
"grad_norm": 0.9761994911989668,
"learning_rate": 1.244910826473572e-08,
"loss": 0.45370187759399416,
"memory(GiB)": 74.97,
"step": 1695,
"token_acc": 0.8793547562067264,
"train_speed(iter/s)": 0.130443
},
{
"epoch": 154.5581395348837,
"grad_norm": 1.6323959605839558,
"learning_rate": 1.2052624879351104e-08,
"loss": 0.4481173515319824,
"memory(GiB)": 74.97,
"step": 1700,
"token_acc": 0.8561244744199944,
"train_speed(iter/s)": 0.13047
},
{
"epoch": 155.0,
"grad_norm": 0.9207580708371824,
"learning_rate": 1.1662152580231144e-08,
"loss": 0.4539341926574707,
"memory(GiB)": 74.97,
"step": 1705,
"token_acc": 0.8649734464445824,
"train_speed(iter/s)": 0.130465
},
{
"epoch": 155.46511627906978,
"grad_norm": 0.745689965265747,
"learning_rate": 1.1277718055638818e-08,
"loss": 0.4519050598144531,
"memory(GiB)": 74.97,
"step": 1710,
"token_acc": 0.852934204004224,
"train_speed(iter/s)": 0.13047
},
{
"epoch": 155.93023255813952,
"grad_norm": 0.8506585464720108,
"learning_rate": 1.089934758116322e-08,
"loss": 0.4458354949951172,
"memory(GiB)": 74.97,
"step": 1715,
"token_acc": 0.874605201263356,
"train_speed(iter/s)": 0.130466
},
{
"epoch": 156.37209302325581,
"grad_norm": 0.8256841894574871,
"learning_rate": 1.0527067017923653e-08,
"loss": 0.4461174011230469,
"memory(GiB)": 74.97,
"step": 1720,
"token_acc": 0.8700296735905044,
"train_speed(iter/s)": 0.130504
},
{
"epoch": 156.8372093023256,
"grad_norm": 1.507219892035112,
"learning_rate": 1.0160901810802114e-08,
"loss": 0.45079655647277833,
"memory(GiB)": 74.97,
"step": 1725,
"token_acc": 0.8675626379955842,
"train_speed(iter/s)": 0.130482
},
{
"epoch": 157.27906976744185,
"grad_norm": 0.8360642824375936,
"learning_rate": 9.800876986704109e-09,
"loss": 0.46644229888916017,
"memory(GiB)": 74.97,
"step": 1730,
"token_acc": 0.8489071709233792,
"train_speed(iter/s)": 0.13048
},
{
"epoch": 157.74418604651163,
"grad_norm": 1.1246527506944004,
"learning_rate": 9.447017152848125e-09,
"loss": 0.4457961082458496,
"memory(GiB)": 74.97,
"step": 1735,
"token_acc": 0.8624011633190948,
"train_speed(iter/s)": 0.130482
},
{
"epoch": 158.1860465116279,
"grad_norm": 1.166595856803442,
"learning_rate": 9.099346495083749e-09,
"loss": 0.46271514892578125,
"memory(GiB)": 74.97,
"step": 1740,
"token_acc": 0.870665567772931,
"train_speed(iter/s)": 0.130529
},
{
"epoch": 158.65116279069767,
"grad_norm": 0.997579039313746,
"learning_rate": 8.75788877623862e-09,
"loss": 0.45302181243896483,
"memory(GiB)": 74.97,
"step": 1745,
"token_acc": 0.8601099764336214,
"train_speed(iter/s)": 0.130496
},
{
"epoch": 159.09302325581396,
"grad_norm": 0.8847455985487748,
"learning_rate": 8.422667334494249e-09,
"loss": 0.44652571678161623,
"memory(GiB)": 74.97,
"step": 1750,
"token_acc": 0.8695352691736444,
"train_speed(iter/s)": 0.130543
},
{
"epoch": 159.5581395348837,
"grad_norm": 0.7005559155585159,
"learning_rate": 8.093705081790891e-09,
"loss": 0.45291786193847655,
"memory(GiB)": 74.97,
"step": 1755,
"token_acc": 0.8535476796830787,
"train_speed(iter/s)": 0.130532
},
{
"epoch": 160.0,
"grad_norm": 1.2943680843819054,
"learning_rate": 7.771024502261525e-09,
"loss": 0.4609353542327881,
"memory(GiB)": 74.97,
"step": 1760,
"token_acc": 0.8666812131791403,
"train_speed(iter/s)": 0.130545
},
{
"epoch": 160.46511627906978,
"grad_norm": 0.7963107860861562,
"learning_rate": 7.454647650695157e-09,
"loss": 0.44596638679504397,
"memory(GiB)": 74.97,
"step": 1765,
"token_acc": 0.8749486582694413,
"train_speed(iter/s)": 0.130569
},
{
"epoch": 160.93023255813952,
"grad_norm": 0.8631048995115475,
"learning_rate": 7.144596151029303e-09,
"loss": 0.4524871826171875,
"memory(GiB)": 74.97,
"step": 1770,
"token_acc": 0.8677113770449089,
"train_speed(iter/s)": 0.130541
},
{
"epoch": 161.37209302325581,
"grad_norm": 2.75385310377207,
"learning_rate": 6.840891194872111e-09,
"loss": 0.4484891891479492,
"memory(GiB)": 74.97,
"step": 1775,
"token_acc": 0.8703089199652366,
"train_speed(iter/s)": 0.130568
},
{
"epoch": 161.8372093023256,
"grad_norm": 0.8722893432749486,
"learning_rate": 6.5435535400539254e-09,
"loss": 0.45218324661254883,
"memory(GiB)": 74.97,
"step": 1780,
"token_acc": 0.868553358560287,
"train_speed(iter/s)": 0.130557
},
{
"epoch": 162.27906976744185,
"grad_norm": 0.8215719185780701,
"learning_rate": 6.252603509208465e-09,
"loss": 0.4554037094116211,
"memory(GiB)": 74.97,
"step": 1785,
"token_acc": 0.8677862595419847,
"train_speed(iter/s)": 0.130539
},
{
"epoch": 162.74418604651163,
"grad_norm": 0.7384288783097476,
"learning_rate": 5.9680609883838825e-09,
"loss": 0.44667611122131345,
"memory(GiB)": 74.97,
"step": 1790,
"token_acc": 0.8610665481183679,
"train_speed(iter/s)": 0.130557
},
{
"epoch": 163.1860465116279,
"grad_norm": 1.061403944970525,
"learning_rate": 5.689945425683473e-09,
"loss": 0.44474124908447266,
"memory(GiB)": 74.97,
"step": 1795,
"token_acc": 0.8644834307992203,
"train_speed(iter/s)": 0.130593
},
{
"epoch": 163.65116279069767,
"grad_norm": 0.7777597746319437,
"learning_rate": 5.418275829936536e-09,
"loss": 0.44541053771972655,
"memory(GiB)": 74.97,
"step": 1800,
"token_acc": 0.8673607496095783,
"train_speed(iter/s)": 0.130589
},
{
"epoch": 163.65116279069767,
"eval_loss": 0.6119173765182495,
"eval_runtime": 0.6953,
"eval_samples_per_second": 18.698,
"eval_steps_per_second": 2.877,
"eval_token_acc": 0.8430884184308842,
"step": 1800
},
{
"epoch": 164.09302325581396,
"grad_norm": 0.7945916641293757,
"learning_rate": 5.15307076939906e-09,
"loss": 0.47254066467285155,
"memory(GiB)": 74.97,
"step": 1805,
"token_acc": 0.8580266386260077,
"train_speed(iter/s)": 0.130407
},
{
"epoch": 164.5581395348837,
"grad_norm": 1.0008056456948866,
"learning_rate": 4.8943483704846465e-09,
"loss": 0.45273590087890625,
"memory(GiB)": 74.97,
"step": 1810,
"token_acc": 0.8646250808015514,
"train_speed(iter/s)": 0.130426
},
{
"epoch": 165.0,
"grad_norm": 1.4076505417290193,
"learning_rate": 4.6421263165255855e-09,
"loss": 0.4405077934265137,
"memory(GiB)": 74.97,
"step": 1815,
"token_acc": 0.8686048572948059,
"train_speed(iter/s)": 0.130467
},
{
"epoch": 165.46511627906978,
"grad_norm": 0.9122077782409643,
"learning_rate": 4.396421846564236e-09,
"loss": 0.4534634590148926,
"memory(GiB)": 74.97,
"step": 1820,
"token_acc": 0.8500332069327506,
"train_speed(iter/s)": 0.13046
},
{
"epoch": 165.93023255813952,
"grad_norm": 1.0037794680637797,
"learning_rate": 4.157251754174729e-09,
"loss": 0.4450718402862549,
"memory(GiB)": 74.97,
"step": 1825,
"token_acc": 0.8572525948963915,
"train_speed(iter/s)": 0.130467
},
{
"epoch": 166.37209302325581,
"grad_norm": 0.7603089866068351,
"learning_rate": 3.924632386315185e-09,
"loss": 0.44524030685424804,
"memory(GiB)": 74.97,
"step": 1830,
"token_acc": 0.8798618132794068,
"train_speed(iter/s)": 0.130461
},
{
"epoch": 166.8372093023256,
"grad_norm": 0.7741124344133163,
"learning_rate": 3.6985796422103977e-09,
"loss": 0.4650570392608643,
"memory(GiB)": 74.97,
"step": 1835,
"token_acc": 0.8640802573718376,
"train_speed(iter/s)": 0.13049
},
{
"epoch": 167.27906976744185,
"grad_norm": 0.7769430246424489,
"learning_rate": 3.4791089722651433e-09,
"loss": 0.4513576507568359,
"memory(GiB)": 74.97,
"step": 1840,
"token_acc": 0.870817885379908,
"train_speed(iter/s)": 0.130488
},
{
"epoch": 167.74418604651163,
"grad_norm": 2.1575767592775823,
"learning_rate": 3.266235377008175e-09,
"loss": 0.4532448768615723,
"memory(GiB)": 74.97,
"step": 1845,
"token_acc": 0.8802010930626702,
"train_speed(iter/s)": 0.130491
},
{
"epoch": 168.1860465116279,
"grad_norm": 0.8581455080813751,
"learning_rate": 3.0599734060669624e-09,
"loss": 0.44078683853149414,
"memory(GiB)": 74.97,
"step": 1850,
"token_acc": 0.8616869584293079,
"train_speed(iter/s)": 0.130513
},
{
"epoch": 168.65116279069767,
"grad_norm": 0.7254996365029248,
"learning_rate": 2.860337157173243e-09,
"loss": 0.45212836265563966,
"memory(GiB)": 74.97,
"step": 1855,
"token_acc": 0.8733549684432675,
"train_speed(iter/s)": 0.130513
},
{
"epoch": 169.09302325581396,
"grad_norm": 1.0391226655473043,
"learning_rate": 2.6673402751994255e-09,
"loss": 0.45039982795715333,
"memory(GiB)": 74.97,
"step": 1860,
"token_acc": 0.8479883283766343,
"train_speed(iter/s)": 0.130544
},
{
"epoch": 169.5581395348837,
"grad_norm": 4.061114170885048,
"learning_rate": 2.480995951226028e-09,
"loss": 0.4557363510131836,
"memory(GiB)": 74.97,
"step": 1865,
"token_acc": 0.8549472607052897,
"train_speed(iter/s)": 0.130538
},
{
"epoch": 170.0,
"grad_norm": 0.7661862324584979,
"learning_rate": 2.301316921640073e-09,
"loss": 0.44440832138061526,
"memory(GiB)": 74.97,
"step": 1870,
"token_acc": 0.864181855416752,
"train_speed(iter/s)": 0.130548
},
{
"epoch": 170.46511627906978,
"grad_norm": 1.0139594423822822,
"learning_rate": 2.128315467264552e-09,
"loss": 0.44567031860351564,
"memory(GiB)": 74.97,
"step": 1875,
"token_acc": 0.8773299316489919,
"train_speed(iter/s)": 0.130532
},
{
"epoch": 170.93023255813952,
"grad_norm": 4.102723548733547,
"learning_rate": 1.962003412519064e-09,
"loss": 0.45189361572265624,
"memory(GiB)": 74.97,
"step": 1880,
"token_acc": 0.8593179414523178,
"train_speed(iter/s)": 0.130547
},
{
"epoch": 171.37209302325581,
"grad_norm": 0.8536208597740141,
"learning_rate": 1.8023921246116402e-09,
"loss": 0.45585269927978517,
"memory(GiB)": 74.97,
"step": 1885,
"token_acc": 0.8772535999691156,
"train_speed(iter/s)": 0.130545
},
{
"epoch": 171.8372093023256,
"grad_norm": 1.2245160632333336,
"learning_rate": 1.6494925127617632e-09,
"loss": 0.4523616790771484,
"memory(GiB)": 74.97,
"step": 1890,
"token_acc": 0.853437876960193,
"train_speed(iter/s)": 0.130555
},
{
"epoch": 172.27906976744185,
"grad_norm": 0.9530973263407838,
"learning_rate": 1.5033150274548324e-09,
"loss": 0.4454800605773926,
"memory(GiB)": 74.97,
"step": 1895,
"token_acc": 0.8595509191527256,
"train_speed(iter/s)": 0.130569
},
{
"epoch": 172.74418604651163,
"grad_norm": 0.7332081781662043,
"learning_rate": 1.3638696597277677e-09,
"loss": 0.443679666519165,
"memory(GiB)": 74.97,
"step": 1900,
"token_acc": 0.8559887049964703,
"train_speed(iter/s)": 0.130587
},
{
"epoch": 173.1860465116279,
"grad_norm": 0.920253466736325,
"learning_rate": 1.231165940486234e-09,
"loss": 0.469818115234375,
"memory(GiB)": 74.97,
"step": 1905,
"token_acc": 0.8453635280095352,
"train_speed(iter/s)": 0.130597
},
{
"epoch": 173.65116279069767,
"grad_norm": 0.8835815802653249,
"learning_rate": 1.1052129398531506e-09,
"loss": 0.44182252883911133,
"memory(GiB)": 74.97,
"step": 1910,
"token_acc": 0.8679900339010742,
"train_speed(iter/s)": 0.130609
},
{
"epoch": 174.09302325581396,
"grad_norm": 1.0067834379405356,
"learning_rate": 9.86019266548821e-10,
"loss": 0.4615338802337646,
"memory(GiB)": 74.97,
"step": 1915,
"token_acc": 0.8516490943498243,
"train_speed(iter/s)": 0.130627
},
{
"epoch": 174.5581395348837,
"grad_norm": 0.7527783775468317,
"learning_rate": 8.735930673024805e-10,
"loss": 0.4349226951599121,
"memory(GiB)": 74.97,
"step": 1920,
"token_acc": 0.8792523552149395,
"train_speed(iter/s)": 0.130639
},
{
"epoch": 175.0,
"grad_norm": 1.117914154380228,
"learning_rate": 7.679420262954983e-10,
"loss": 0.45952515602111815,
"memory(GiB)": 74.97,
"step": 1925,
"token_acc": 0.85297977378299,
"train_speed(iter/s)": 0.130658
},
{
"epoch": 175.46511627906978,
"grad_norm": 0.9275458758815365,
"learning_rate": 6.690733646361856e-10,
"loss": 0.4419642448425293,
"memory(GiB)": 74.97,
"step": 1930,
"token_acc": 0.8648351648351649,
"train_speed(iter/s)": 0.130648
},
{
"epoch": 175.93023255813952,
"grad_norm": 0.9626017497568045,
"learning_rate": 5.769938398662355e-10,
"loss": 0.4574889659881592,
"memory(GiB)": 74.97,
"step": 1935,
"token_acc": 0.862218660255126,
"train_speed(iter/s)": 0.130652
},
{
"epoch": 176.37209302325581,
"grad_norm": 0.7475798565408234,
"learning_rate": 4.917097454988583e-10,
"loss": 0.4532492637634277,
"memory(GiB)": 74.97,
"step": 1940,
"token_acc": 0.8698313950032691,
"train_speed(iter/s)": 0.130672
},
{
"epoch": 176.8372093023256,
"grad_norm": 0.8211466589757828,
"learning_rate": 4.132269105886155e-10,
"loss": 0.4510762691497803,
"memory(GiB)": 74.97,
"step": 1945,
"token_acc": 0.8704986701068692,
"train_speed(iter/s)": 0.130681
},
{
"epoch": 177.27906976744185,
"grad_norm": 1.7704160910518656,
"learning_rate": 3.4155069933301526e-10,
"loss": 0.44258646965026854,
"memory(GiB)": 74.97,
"step": 1950,
"token_acc": 0.8673919489954778,
"train_speed(iter/s)": 0.130706
},
{
"epoch": 177.74418604651163,
"grad_norm": 0.7984443068601499,
"learning_rate": 2.7668601070588436e-10,
"loss": 0.4494297027587891,
"memory(GiB)": 74.97,
"step": 1955,
"token_acc": 0.8809769787056883,
"train_speed(iter/s)": 0.130709
},
{
"epoch": 178.1860465116279,
"grad_norm": 1.034280653667984,
"learning_rate": 2.186372781225465e-10,
"loss": 0.4531251430511475,
"memory(GiB)": 74.97,
"step": 1960,
"token_acc": 0.8546573936837305,
"train_speed(iter/s)": 0.130721
},
{
"epoch": 178.65116279069767,
"grad_norm": 0.7838331091869942,
"learning_rate": 1.6740846913674279e-10,
"loss": 0.45207509994506834,
"memory(GiB)": 74.97,
"step": 1965,
"token_acc": 0.8692857883279776,
"train_speed(iter/s)": 0.130714
},
{
"epoch": 179.09302325581396,
"grad_norm": 1.030250317436395,
"learning_rate": 1.2300308516952628e-10,
"loss": 0.45918664932250974,
"memory(GiB)": 74.97,
"step": 1970,
"token_acc": 0.8727756076388888,
"train_speed(iter/s)": 0.130734
},
{
"epoch": 179.5581395348837,
"grad_norm": 0.9787925643257926,
"learning_rate": 8.542416126989804e-11,
"loss": 0.4371158599853516,
"memory(GiB)": 74.97,
"step": 1975,
"token_acc": 0.8778103770180585,
"train_speed(iter/s)": 0.130716
},
{
"epoch": 180.0,
"grad_norm": 15.04723370590281,
"learning_rate": 5.46742659073951e-11,
"loss": 0.4714357852935791,
"memory(GiB)": 74.97,
"step": 1980,
"token_acc": 0.8756799192508271,
"train_speed(iter/s)": 0.130752
},
{
"epoch": 180.46511627906978,
"grad_norm": 0.8235554306239253,
"learning_rate": 3.0755500796531e-11,
"loss": 0.44407100677490235,
"memory(GiB)": 74.97,
"step": 1985,
"token_acc": 0.8770921605870226,
"train_speed(iter/s)": 0.130773
},
{
"epoch": 180.93023255813952,
"grad_norm": 1.1709340797573902,
"learning_rate": 1.3669500753099584e-11,
"loss": 0.44757466316223143,
"memory(GiB)": 74.97,
"step": 1990,
"token_acc": 0.8692709656348659,
"train_speed(iter/s)": 0.130762
},
{
"epoch": 181.37209302325581,
"grad_norm": 1.040441435930544,
"learning_rate": 3.417433582542095e-12,
"loss": 0.4524868011474609,
"memory(GiB)": 74.97,
"step": 1995,
"token_acc": 0.8582090965920931,
"train_speed(iter/s)": 0.130794
},
{
"epoch": 181.8372093023256,
"grad_norm": 2.1432807975254313,
"learning_rate": 0.0,
"loss": 0.46004161834716795,
"memory(GiB)": 74.97,
"step": 2000,
"token_acc": 0.8837088162521162,
"train_speed(iter/s)": 0.130787
},
{
"epoch": 181.8372093023256,
"eval_loss": 0.6126144528388977,
"eval_runtime": 0.6963,
"eval_samples_per_second": 18.669,
"eval_steps_per_second": 2.872,
"eval_token_acc": 0.8433758022799118,
"step": 2000
}
],
"logging_steps": 5,
"max_steps": 2000,
"num_input_tokens_seen": 0,
"num_train_epochs": 200,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4205321758179328.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}