| { | |
| "best_metric": 0.60154372, | |
| "best_model_checkpoint": "/nfs4/models/Qwen2.5-VL/Reject_sft_Qwen2.5-VL-3B-Instruct/v7-20250617-161549/checkpoint-800", | |
| "epoch": 181.8372093023256, | |
| "eval_steps": 200, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.09302325581395349, | |
| "grad_norm": 2.441588224180784, | |
| "learning_rate": 2e-09, | |
| "loss": 0.7878831624984741, | |
| "memory(GiB)": 65.48, | |
| "step": 1, | |
| "token_acc": 0.7861313342463778, | |
| "train_speed(iter/s)": 0.055435 | |
| }, | |
| { | |
| "epoch": 0.46511627906976744, | |
| "grad_norm": 2.8780390737909576, | |
| "learning_rate": 1e-08, | |
| "loss": 0.8473173379898071, | |
| "memory(GiB)": 65.5, | |
| "step": 5, | |
| "token_acc": 0.7882713944766451, | |
| "train_speed(iter/s)": 0.101539 | |
| }, | |
| { | |
| "epoch": 0.9302325581395349, | |
| "grad_norm": 2.5438182772777616, | |
| "learning_rate": 2e-08, | |
| "loss": 0.8371296882629394, | |
| "memory(GiB)": 66.93, | |
| "step": 10, | |
| "token_acc": 0.7700506562717737, | |
| "train_speed(iter/s)": 0.110961 | |
| }, | |
| { | |
| "epoch": 1.372093023255814, | |
| "grad_norm": 2.4572297135546735, | |
| "learning_rate": 3e-08, | |
| "loss": 0.8263990402221679, | |
| "memory(GiB)": 66.93, | |
| "step": 15, | |
| "token_acc": 0.7914247785857225, | |
| "train_speed(iter/s)": 0.118864 | |
| }, | |
| { | |
| "epoch": 1.8372093023255816, | |
| "grad_norm": 2.817513169380205, | |
| "learning_rate": 4e-08, | |
| "loss": 0.8524192810058594, | |
| "memory(GiB)": 66.93, | |
| "step": 20, | |
| "token_acc": 0.8086610622604439, | |
| "train_speed(iter/s)": 0.120417 | |
| }, | |
| { | |
| "epoch": 2.2790697674418605, | |
| "grad_norm": 2.5728578917158496, | |
| "learning_rate": 5e-08, | |
| "loss": 0.8347753524780274, | |
| "memory(GiB)": 66.93, | |
| "step": 25, | |
| "token_acc": 0.7975967163791022, | |
| "train_speed(iter/s)": 0.1197 | |
| }, | |
| { | |
| "epoch": 2.744186046511628, | |
| "grad_norm": 7.697786718127836, | |
| "learning_rate": 6e-08, | |
| "loss": 0.8387296676635743, | |
| "memory(GiB)": 66.93, | |
| "step": 30, | |
| "token_acc": 0.7844551282051282, | |
| "train_speed(iter/s)": 0.121618 | |
| }, | |
| { | |
| "epoch": 3.186046511627907, | |
| "grad_norm": 2.6215878302524973, | |
| "learning_rate": 6.999999999999999e-08, | |
| "loss": 0.8212770462036133, | |
| "memory(GiB)": 66.94, | |
| "step": 35, | |
| "token_acc": 0.7918978074644326, | |
| "train_speed(iter/s)": 0.121919 | |
| }, | |
| { | |
| "epoch": 3.6511627906976747, | |
| "grad_norm": 2.3356396729850886, | |
| "learning_rate": 8e-08, | |
| "loss": 0.8247488021850586, | |
| "memory(GiB)": 66.94, | |
| "step": 40, | |
| "token_acc": 0.7788534837627688, | |
| "train_speed(iter/s)": 0.122256 | |
| }, | |
| { | |
| "epoch": 4.093023255813954, | |
| "grad_norm": 2.24036454294963, | |
| "learning_rate": 9e-08, | |
| "loss": 0.8493685722351074, | |
| "memory(GiB)": 66.94, | |
| "step": 45, | |
| "token_acc": 0.7944452759188386, | |
| "train_speed(iter/s)": 0.124476 | |
| }, | |
| { | |
| "epoch": 4.558139534883721, | |
| "grad_norm": 2.3710774237116135, | |
| "learning_rate": 1e-07, | |
| "loss": 0.8277470588684082, | |
| "memory(GiB)": 66.94, | |
| "step": 50, | |
| "token_acc": 0.8012501821832845, | |
| "train_speed(iter/s)": 0.124595 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 3.0036536630441435, | |
| "learning_rate": 1.1e-07, | |
| "loss": 0.8450939178466796, | |
| "memory(GiB)": 66.94, | |
| "step": 55, | |
| "token_acc": 0.7760649403867543, | |
| "train_speed(iter/s)": 0.125053 | |
| }, | |
| { | |
| "epoch": 5.465116279069767, | |
| "grad_norm": 2.5881257114238627, | |
| "learning_rate": 1.2e-07, | |
| "loss": 0.8497460365295411, | |
| "memory(GiB)": 66.94, | |
| "step": 60, | |
| "token_acc": 0.796596503868389, | |
| "train_speed(iter/s)": 0.125941 | |
| }, | |
| { | |
| "epoch": 5.930232558139535, | |
| "grad_norm": 2.3744096454020376, | |
| "learning_rate": 1.3e-07, | |
| "loss": 0.8116294860839843, | |
| "memory(GiB)": 66.94, | |
| "step": 65, | |
| "token_acc": 0.7651333807767786, | |
| "train_speed(iter/s)": 0.125394 | |
| }, | |
| { | |
| "epoch": 6.372093023255814, | |
| "grad_norm": 2.235166994874194, | |
| "learning_rate": 1.3999999999999998e-07, | |
| "loss": 0.8369662284851074, | |
| "memory(GiB)": 66.94, | |
| "step": 70, | |
| "token_acc": 0.7880281843764316, | |
| "train_speed(iter/s)": 0.126206 | |
| }, | |
| { | |
| "epoch": 6.837209302325581, | |
| "grad_norm": 8.546638989645471, | |
| "learning_rate": 1.5e-07, | |
| "loss": 0.8286456108093262, | |
| "memory(GiB)": 66.94, | |
| "step": 75, | |
| "token_acc": 0.7748851144806365, | |
| "train_speed(iter/s)": 0.126162 | |
| }, | |
| { | |
| "epoch": 7.27906976744186, | |
| "grad_norm": 2.1654224438993, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.8329730033874512, | |
| "memory(GiB)": 66.94, | |
| "step": 80, | |
| "token_acc": 0.80111933970264, | |
| "train_speed(iter/s)": 0.126637 | |
| }, | |
| { | |
| "epoch": 7.7441860465116275, | |
| "grad_norm": 3.346508648878843, | |
| "learning_rate": 1.7e-07, | |
| "loss": 0.8077556610107421, | |
| "memory(GiB)": 66.94, | |
| "step": 85, | |
| "token_acc": 0.7893712675300275, | |
| "train_speed(iter/s)": 0.126958 | |
| }, | |
| { | |
| "epoch": 8.186046511627907, | |
| "grad_norm": 2.1195939617622908, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.8190940856933594, | |
| "memory(GiB)": 66.94, | |
| "step": 90, | |
| "token_acc": 0.7837902316300859, | |
| "train_speed(iter/s)": 0.127783 | |
| }, | |
| { | |
| "epoch": 8.651162790697674, | |
| "grad_norm": 2.196504569870541, | |
| "learning_rate": 1.8999999999999998e-07, | |
| "loss": 0.7897569179534912, | |
| "memory(GiB)": 66.94, | |
| "step": 95, | |
| "token_acc": 0.8031453890349596, | |
| "train_speed(iter/s)": 0.12748 | |
| }, | |
| { | |
| "epoch": 9.093023255813954, | |
| "grad_norm": 2.4902973786655798, | |
| "learning_rate": 2e-07, | |
| "loss": 0.8305625915527344, | |
| "memory(GiB)": 66.94, | |
| "step": 100, | |
| "token_acc": 0.7491283167239546, | |
| "train_speed(iter/s)": 0.127599 | |
| }, | |
| { | |
| "epoch": 9.55813953488372, | |
| "grad_norm": 1.9236232576368646, | |
| "learning_rate": 1.9999658256641745e-07, | |
| "loss": 0.8344329833984375, | |
| "memory(GiB)": 66.94, | |
| "step": 105, | |
| "token_acc": 0.7713534087092802, | |
| "train_speed(iter/s)": 0.128253 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 1.9264695576885342, | |
| "learning_rate": 1.999863304992469e-07, | |
| "loss": 0.772977876663208, | |
| "memory(GiB)": 66.94, | |
| "step": 110, | |
| "token_acc": 0.7893902319663306, | |
| "train_speed(iter/s)": 0.128413 | |
| }, | |
| { | |
| "epoch": 10.465116279069768, | |
| "grad_norm": 1.7921071186450859, | |
| "learning_rate": 1.9996924449920347e-07, | |
| "loss": 0.7723042488098144, | |
| "memory(GiB)": 66.94, | |
| "step": 115, | |
| "token_acc": 0.8070967633232802, | |
| "train_speed(iter/s)": 0.128698 | |
| }, | |
| { | |
| "epoch": 10.930232558139535, | |
| "grad_norm": 2.0488653924721487, | |
| "learning_rate": 1.999453257340926e-07, | |
| "loss": 0.805912971496582, | |
| "memory(GiB)": 66.94, | |
| "step": 120, | |
| "token_acc": 0.7839763435738668, | |
| "train_speed(iter/s)": 0.128633 | |
| }, | |
| { | |
| "epoch": 11.372093023255815, | |
| "grad_norm": 1.8887448764254238, | |
| "learning_rate": 1.9991457583873009e-07, | |
| "loss": 0.7916177272796631, | |
| "memory(GiB)": 66.94, | |
| "step": 125, | |
| "token_acc": 0.7835127698472789, | |
| "train_speed(iter/s)": 0.129044 | |
| }, | |
| { | |
| "epoch": 11.837209302325581, | |
| "grad_norm": 2.087347255237122, | |
| "learning_rate": 1.9987699691483047e-07, | |
| "loss": 0.7750067710876465, | |
| "memory(GiB)": 66.94, | |
| "step": 130, | |
| "token_acc": 0.79361802524478, | |
| "train_speed(iter/s)": 0.128698 | |
| }, | |
| { | |
| "epoch": 12.279069767441861, | |
| "grad_norm": 1.8497770530709863, | |
| "learning_rate": 1.9983259153086325e-07, | |
| "loss": 0.7334749698638916, | |
| "memory(GiB)": 66.94, | |
| "step": 135, | |
| "token_acc": 0.8016341430131004, | |
| "train_speed(iter/s)": 0.129134 | |
| }, | |
| { | |
| "epoch": 12.744186046511627, | |
| "grad_norm": 1.3780662997872353, | |
| "learning_rate": 1.9978136272187745e-07, | |
| "loss": 0.7617583274841309, | |
| "memory(GiB)": 66.94, | |
| "step": 140, | |
| "token_acc": 0.8071877904067482, | |
| "train_speed(iter/s)": 0.128965 | |
| }, | |
| { | |
| "epoch": 13.186046511627907, | |
| "grad_norm": 1.4470094463921936, | |
| "learning_rate": 1.997233139892941e-07, | |
| "loss": 0.7472479820251465, | |
| "memory(GiB)": 66.94, | |
| "step": 145, | |
| "token_acc": 0.7839292328474627, | |
| "train_speed(iter/s)": 0.129158 | |
| }, | |
| { | |
| "epoch": 13.651162790697674, | |
| "grad_norm": 1.969343282689861, | |
| "learning_rate": 1.9965844930066698e-07, | |
| "loss": 0.7178962707519532, | |
| "memory(GiB)": 66.94, | |
| "step": 150, | |
| "token_acc": 0.7930578931176141, | |
| "train_speed(iter/s)": 0.129381 | |
| }, | |
| { | |
| "epoch": 14.093023255813954, | |
| "grad_norm": 1.659886865517498, | |
| "learning_rate": 1.9958677308941136e-07, | |
| "loss": 0.7550750255584717, | |
| "memory(GiB)": 66.94, | |
| "step": 155, | |
| "token_acc": 0.7681622703125359, | |
| "train_speed(iter/s)": 0.129371 | |
| }, | |
| { | |
| "epoch": 14.55813953488372, | |
| "grad_norm": 1.3482878555174083, | |
| "learning_rate": 1.9950829025450114e-07, | |
| "loss": 0.7135652542114258, | |
| "memory(GiB)": 66.94, | |
| "step": 160, | |
| "token_acc": 0.7849006160641636, | |
| "train_speed(iter/s)": 0.129416 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 1.6524876656316168, | |
| "learning_rate": 1.9942300616013377e-07, | |
| "loss": 0.7475796699523926, | |
| "memory(GiB)": 74.95, | |
| "step": 165, | |
| "token_acc": 0.796426354182834, | |
| "train_speed(iter/s)": 0.129454 | |
| }, | |
| { | |
| "epoch": 15.465116279069768, | |
| "grad_norm": 1.4018266466879952, | |
| "learning_rate": 1.993309266353638e-07, | |
| "loss": 0.7252517223358155, | |
| "memory(GiB)": 74.96, | |
| "step": 170, | |
| "token_acc": 0.8084311921640781, | |
| "train_speed(iter/s)": 0.129786 | |
| }, | |
| { | |
| "epoch": 15.930232558139535, | |
| "grad_norm": 1.2395976325818243, | |
| "learning_rate": 1.992320579737045e-07, | |
| "loss": 0.7289777755737304, | |
| "memory(GiB)": 74.96, | |
| "step": 175, | |
| "token_acc": 0.810318895442178, | |
| "train_speed(iter/s)": 0.129608 | |
| }, | |
| { | |
| "epoch": 16.372093023255815, | |
| "grad_norm": 2.5675852224219553, | |
| "learning_rate": 1.9912640693269751e-07, | |
| "loss": 0.6915578365325927, | |
| "memory(GiB)": 74.96, | |
| "step": 180, | |
| "token_acc": 0.7905717628859845, | |
| "train_speed(iter/s)": 0.129489 | |
| }, | |
| { | |
| "epoch": 16.837209302325583, | |
| "grad_norm": 1.4358606025818346, | |
| "learning_rate": 1.9901398073345117e-07, | |
| "loss": 0.7248753547668457, | |
| "memory(GiB)": 74.96, | |
| "step": 185, | |
| "token_acc": 0.8124335543968005, | |
| "train_speed(iter/s)": 0.129395 | |
| }, | |
| { | |
| "epoch": 17.27906976744186, | |
| "grad_norm": 1.262748163163051, | |
| "learning_rate": 1.9889478706014683e-07, | |
| "loss": 0.7250626564025879, | |
| "memory(GiB)": 74.96, | |
| "step": 190, | |
| "token_acc": 0.7930634826915087, | |
| "train_speed(iter/s)": 0.129717 | |
| }, | |
| { | |
| "epoch": 17.74418604651163, | |
| "grad_norm": 1.467844482343943, | |
| "learning_rate": 1.9876883405951376e-07, | |
| "loss": 0.7151264190673828, | |
| "memory(GiB)": 74.96, | |
| "step": 195, | |
| "token_acc": 0.8009663075081238, | |
| "train_speed(iter/s)": 0.129659 | |
| }, | |
| { | |
| "epoch": 18.186046511627907, | |
| "grad_norm": 1.1965217379377007, | |
| "learning_rate": 1.9863613034027222e-07, | |
| "loss": 0.667814064025879, | |
| "memory(GiB)": 74.96, | |
| "step": 200, | |
| "token_acc": 0.8150907451820857, | |
| "train_speed(iter/s)": 0.12962 | |
| }, | |
| { | |
| "epoch": 18.186046511627907, | |
| "eval_loss": 0.708366334438324, | |
| "eval_runtime": 0.7627, | |
| "eval_samples_per_second": 17.045, | |
| "eval_steps_per_second": 2.622, | |
| "eval_token_acc": 0.8243126736277421, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 18.651162790697676, | |
| "grad_norm": 1.1855878327264966, | |
| "learning_rate": 1.9849668497254518e-07, | |
| "loss": 0.7150158882141113, | |
| "memory(GiB)": 74.96, | |
| "step": 205, | |
| "token_acc": 0.8079891491231421, | |
| "train_speed(iter/s)": 0.128041 | |
| }, | |
| { | |
| "epoch": 19.093023255813954, | |
| "grad_norm": 1.0438364306476957, | |
| "learning_rate": 1.9835050748723822e-07, | |
| "loss": 0.6731427669525146, | |
| "memory(GiB)": 74.96, | |
| "step": 210, | |
| "token_acc": 0.8069213383230205, | |
| "train_speed(iter/s)": 0.128417 | |
| }, | |
| { | |
| "epoch": 19.558139534883722, | |
| "grad_norm": 0.9853618641588676, | |
| "learning_rate": 1.9819760787538837e-07, | |
| "loss": 0.6843628883361816, | |
| "memory(GiB)": 74.96, | |
| "step": 215, | |
| "token_acc": 0.8041327124563445, | |
| "train_speed(iter/s)": 0.128537 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 1.1619660544446906, | |
| "learning_rate": 1.9803799658748093e-07, | |
| "loss": 0.6671229839324951, | |
| "memory(GiB)": 74.97, | |
| "step": 220, | |
| "token_acc": 0.8175119885190241, | |
| "train_speed(iter/s)": 0.12854 | |
| }, | |
| { | |
| "epoch": 20.46511627906977, | |
| "grad_norm": 1.2626864222948397, | |
| "learning_rate": 1.9787168453273545e-07, | |
| "loss": 0.6970132827758789, | |
| "memory(GiB)": 74.97, | |
| "step": 225, | |
| "token_acc": 0.8270612144784331, | |
| "train_speed(iter/s)": 0.128495 | |
| }, | |
| { | |
| "epoch": 20.930232558139537, | |
| "grad_norm": 0.7941657042448518, | |
| "learning_rate": 1.9769868307835993e-07, | |
| "loss": 0.6455688953399659, | |
| "memory(GiB)": 74.97, | |
| "step": 230, | |
| "token_acc": 0.8348736389299637, | |
| "train_speed(iter/s)": 0.128518 | |
| }, | |
| { | |
| "epoch": 21.372093023255815, | |
| "grad_norm": 1.1822694017861601, | |
| "learning_rate": 1.9751900404877398e-07, | |
| "loss": 0.6348400115966797, | |
| "memory(GiB)": 74.97, | |
| "step": 235, | |
| "token_acc": 0.8331182941735705, | |
| "train_speed(iter/s)": 0.128694 | |
| }, | |
| { | |
| "epoch": 21.837209302325583, | |
| "grad_norm": 0.9081549570182597, | |
| "learning_rate": 1.9733265972480058e-07, | |
| "loss": 0.6620560646057129, | |
| "memory(GiB)": 74.97, | |
| "step": 240, | |
| "token_acc": 0.8267432385239223, | |
| "train_speed(iter/s)": 0.128483 | |
| }, | |
| { | |
| "epoch": 22.27906976744186, | |
| "grad_norm": 0.8139654483754177, | |
| "learning_rate": 1.9713966284282674e-07, | |
| "loss": 0.6350464820861816, | |
| "memory(GiB)": 74.97, | |
| "step": 245, | |
| "token_acc": 0.8140620540628695, | |
| "train_speed(iter/s)": 0.128624 | |
| }, | |
| { | |
| "epoch": 22.74418604651163, | |
| "grad_norm": 1.39238370567191, | |
| "learning_rate": 1.9694002659393302e-07, | |
| "loss": 0.6755290031433105, | |
| "memory(GiB)": 74.97, | |
| "step": 250, | |
| "token_acc": 0.8166282714604026, | |
| "train_speed(iter/s)": 0.128733 | |
| }, | |
| { | |
| "epoch": 23.186046511627907, | |
| "grad_norm": 1.0689850821114422, | |
| "learning_rate": 1.9673376462299182e-07, | |
| "loss": 0.6278616905212402, | |
| "memory(GiB)": 74.97, | |
| "step": 255, | |
| "token_acc": 0.8235556962260989, | |
| "train_speed(iter/s)": 0.128805 | |
| }, | |
| { | |
| "epoch": 23.651162790697676, | |
| "grad_norm": 0.826203146475013, | |
| "learning_rate": 1.9652089102773487e-07, | |
| "loss": 0.6573570728302002, | |
| "memory(GiB)": 74.97, | |
| "step": 260, | |
| "token_acc": 0.8203604745946925, | |
| "train_speed(iter/s)": 0.128858 | |
| }, | |
| { | |
| "epoch": 24.093023255813954, | |
| "grad_norm": 1.0075863589078984, | |
| "learning_rate": 1.963014203577896e-07, | |
| "loss": 0.6461727619171143, | |
| "memory(GiB)": 74.97, | |
| "step": 265, | |
| "token_acc": 0.799577569399313, | |
| "train_speed(iter/s)": 0.128878 | |
| }, | |
| { | |
| "epoch": 24.558139534883722, | |
| "grad_norm": 1.1793630828397141, | |
| "learning_rate": 1.9607536761368482e-07, | |
| "loss": 0.634314775466919, | |
| "memory(GiB)": 74.97, | |
| "step": 270, | |
| "token_acc": 0.7944581869582389, | |
| "train_speed(iter/s)": 0.128808 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "grad_norm": 0.9698025031342606, | |
| "learning_rate": 1.9584274824582527e-07, | |
| "loss": 0.6515589714050293, | |
| "memory(GiB)": 74.97, | |
| "step": 275, | |
| "token_acc": 0.8235917962402285, | |
| "train_speed(iter/s)": 0.128916 | |
| }, | |
| { | |
| "epoch": 25.46511627906977, | |
| "grad_norm": 3.191013934790137, | |
| "learning_rate": 1.9560357815343574e-07, | |
| "loss": 0.6280710697174072, | |
| "memory(GiB)": 74.97, | |
| "step": 280, | |
| "token_acc": 0.8348407138350025, | |
| "train_speed(iter/s)": 0.128889 | |
| }, | |
| { | |
| "epoch": 25.930232558139537, | |
| "grad_norm": 2.6857457017550184, | |
| "learning_rate": 1.9535787368347442e-07, | |
| "loss": 0.6403141498565674, | |
| "memory(GiB)": 74.97, | |
| "step": 285, | |
| "token_acc": 0.8304662656603196, | |
| "train_speed(iter/s)": 0.128946 | |
| }, | |
| { | |
| "epoch": 26.372093023255815, | |
| "grad_norm": 0.7179384373982066, | |
| "learning_rate": 1.9510565162951537e-07, | |
| "loss": 0.6483189582824707, | |
| "memory(GiB)": 74.97, | |
| "step": 290, | |
| "token_acc": 0.8042392190472208, | |
| "train_speed(iter/s)": 0.129194 | |
| }, | |
| { | |
| "epoch": 26.837209302325583, | |
| "grad_norm": 0.8336349818317007, | |
| "learning_rate": 1.9484692923060094e-07, | |
| "loss": 0.6260199546813965, | |
| "memory(GiB)": 74.97, | |
| "step": 295, | |
| "token_acc": 0.8142812170144997, | |
| "train_speed(iter/s)": 0.129275 | |
| }, | |
| { | |
| "epoch": 27.27906976744186, | |
| "grad_norm": 0.8073425015755772, | |
| "learning_rate": 1.9458172417006346e-07, | |
| "loss": 0.6319057464599609, | |
| "memory(GiB)": 74.97, | |
| "step": 300, | |
| "token_acc": 0.8167601892733382, | |
| "train_speed(iter/s)": 0.129283 | |
| }, | |
| { | |
| "epoch": 27.74418604651163, | |
| "grad_norm": 0.8293440372694967, | |
| "learning_rate": 1.943100545743165e-07, | |
| "loss": 0.6321963310241699, | |
| "memory(GiB)": 74.97, | |
| "step": 305, | |
| "token_acc": 0.8145223890527623, | |
| "train_speed(iter/s)": 0.129158 | |
| }, | |
| { | |
| "epoch": 28.186046511627907, | |
| "grad_norm": 0.8851271223039491, | |
| "learning_rate": 1.9403193901161612e-07, | |
| "loss": 0.6186152935028076, | |
| "memory(GiB)": 74.97, | |
| "step": 310, | |
| "token_acc": 0.8423929547525053, | |
| "train_speed(iter/s)": 0.129305 | |
| }, | |
| { | |
| "epoch": 28.651162790697676, | |
| "grad_norm": 0.9560469073452553, | |
| "learning_rate": 1.9374739649079154e-07, | |
| "loss": 0.6388277053833008, | |
| "memory(GiB)": 74.97, | |
| "step": 315, | |
| "token_acc": 0.8255307825359716, | |
| "train_speed(iter/s)": 0.129291 | |
| }, | |
| { | |
| "epoch": 29.093023255813954, | |
| "grad_norm": 1.0797696361091218, | |
| "learning_rate": 1.9345644645994608e-07, | |
| "loss": 0.6270732879638672, | |
| "memory(GiB)": 74.97, | |
| "step": 320, | |
| "token_acc": 0.8329987798638171, | |
| "train_speed(iter/s)": 0.129427 | |
| }, | |
| { | |
| "epoch": 29.558139534883722, | |
| "grad_norm": 1.035746534298127, | |
| "learning_rate": 1.9315910880512788e-07, | |
| "loss": 0.6154883861541748, | |
| "memory(GiB)": 74.97, | |
| "step": 325, | |
| "token_acc": 0.8229807039658683, | |
| "train_speed(iter/s)": 0.129368 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "grad_norm": 0.9448004935095479, | |
| "learning_rate": 1.928554038489707e-07, | |
| "loss": 0.6246993541717529, | |
| "memory(GiB)": 74.97, | |
| "step": 330, | |
| "token_acc": 0.8252855659397715, | |
| "train_speed(iter/s)": 0.129558 | |
| }, | |
| { | |
| "epoch": 30.46511627906977, | |
| "grad_norm": 0.7400543933440672, | |
| "learning_rate": 1.9254535234930483e-07, | |
| "loss": 0.6015793323516846, | |
| "memory(GiB)": 74.97, | |
| "step": 335, | |
| "token_acc": 0.8212677580369298, | |
| "train_speed(iter/s)": 0.129568 | |
| }, | |
| { | |
| "epoch": 30.930232558139537, | |
| "grad_norm": 0.6862921067098382, | |
| "learning_rate": 1.9222897549773846e-07, | |
| "loss": 0.627756404876709, | |
| "memory(GiB)": 74.97, | |
| "step": 340, | |
| "token_acc": 0.8131175537754646, | |
| "train_speed(iter/s)": 0.129532 | |
| }, | |
| { | |
| "epoch": 31.372093023255815, | |
| "grad_norm": 1.0706787922118046, | |
| "learning_rate": 1.9190629491820908e-07, | |
| "loss": 0.6050760269165039, | |
| "memory(GiB)": 74.97, | |
| "step": 345, | |
| "token_acc": 0.8153731376034056, | |
| "train_speed(iter/s)": 0.129658 | |
| }, | |
| { | |
| "epoch": 31.837209302325583, | |
| "grad_norm": 0.7747208875253631, | |
| "learning_rate": 1.9157733266550572e-07, | |
| "loss": 0.6289189338684082, | |
| "memory(GiB)": 74.97, | |
| "step": 350, | |
| "token_acc": 0.8139119876370594, | |
| "train_speed(iter/s)": 0.129542 | |
| }, | |
| { | |
| "epoch": 32.27906976744186, | |
| "grad_norm": 0.773459886431363, | |
| "learning_rate": 1.9124211122376135e-07, | |
| "loss": 0.6157156944274902, | |
| "memory(GiB)": 74.97, | |
| "step": 355, | |
| "token_acc": 0.8152114721365039, | |
| "train_speed(iter/s)": 0.129801 | |
| }, | |
| { | |
| "epoch": 32.74418604651163, | |
| "grad_norm": 1.1738935206395225, | |
| "learning_rate": 1.9090065350491624e-07, | |
| "loss": 0.6239834785461426, | |
| "memory(GiB)": 74.97, | |
| "step": 360, | |
| "token_acc": 0.833327410355734, | |
| "train_speed(iter/s)": 0.129897 | |
| }, | |
| { | |
| "epoch": 33.18604651162791, | |
| "grad_norm": 0.848966063311304, | |
| "learning_rate": 1.905529828471519e-07, | |
| "loss": 0.5887202262878418, | |
| "memory(GiB)": 74.97, | |
| "step": 365, | |
| "token_acc": 0.8398133748055988, | |
| "train_speed(iter/s)": 0.129873 | |
| }, | |
| { | |
| "epoch": 33.651162790697676, | |
| "grad_norm": 2.144137430723947, | |
| "learning_rate": 1.901991230132959e-07, | |
| "loss": 0.6359727859497071, | |
| "memory(GiB)": 74.97, | |
| "step": 370, | |
| "token_acc": 0.8069930345126126, | |
| "train_speed(iter/s)": 0.129943 | |
| }, | |
| { | |
| "epoch": 34.093023255813954, | |
| "grad_norm": 0.7367545693321746, | |
| "learning_rate": 1.8983909818919788e-07, | |
| "loss": 0.5804174900054931, | |
| "memory(GiB)": 74.97, | |
| "step": 375, | |
| "token_acc": 0.8437245411415153, | |
| "train_speed(iter/s)": 0.129967 | |
| }, | |
| { | |
| "epoch": 34.55813953488372, | |
| "grad_norm": 0.7507232728161667, | |
| "learning_rate": 1.8947293298207635e-07, | |
| "loss": 0.5902613639831543, | |
| "memory(GiB)": 74.97, | |
| "step": 380, | |
| "token_acc": 0.8308984660336012, | |
| "train_speed(iter/s)": 0.129921 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "grad_norm": 1.3494911901833562, | |
| "learning_rate": 1.8910065241883678e-07, | |
| "loss": 0.6213099479675293, | |
| "memory(GiB)": 74.97, | |
| "step": 385, | |
| "token_acc": 0.8180765456329735, | |
| "train_speed(iter/s)": 0.129994 | |
| }, | |
| { | |
| "epoch": 35.46511627906977, | |
| "grad_norm": 0.9327927885382011, | |
| "learning_rate": 1.8872228194436116e-07, | |
| "loss": 0.61426682472229, | |
| "memory(GiB)": 74.97, | |
| "step": 390, | |
| "token_acc": 0.8016005335111704, | |
| "train_speed(iter/s)": 0.130043 | |
| }, | |
| { | |
| "epoch": 35.93023255813954, | |
| "grad_norm": 0.8590493021171992, | |
| "learning_rate": 1.8833784741976886e-07, | |
| "loss": 0.5930656433105469, | |
| "memory(GiB)": 74.97, | |
| "step": 395, | |
| "token_acc": 0.8238509177734666, | |
| "train_speed(iter/s)": 0.129952 | |
| }, | |
| { | |
| "epoch": 36.372093023255815, | |
| "grad_norm": 0.692718053612059, | |
| "learning_rate": 1.8794737512064888e-07, | |
| "loss": 0.601491117477417, | |
| "memory(GiB)": 74.97, | |
| "step": 400, | |
| "token_acc": 0.8390804597701149, | |
| "train_speed(iter/s)": 0.130015 | |
| }, | |
| { | |
| "epoch": 36.372093023255815, | |
| "eval_loss": 0.6246538758277893, | |
| "eval_runtime": 0.7289, | |
| "eval_samples_per_second": 17.836, | |
| "eval_steps_per_second": 2.744, | |
| "eval_token_acc": 0.8392566337771817, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 36.83720930232558, | |
| "grad_norm": 0.8580896624897943, | |
| "learning_rate": 1.875508917352643e-07, | |
| "loss": 0.6003564834594727, | |
| "memory(GiB)": 74.97, | |
| "step": 405, | |
| "token_acc": 0.8357933251629633, | |
| "train_speed(iter/s)": 0.129242 | |
| }, | |
| { | |
| "epoch": 37.27906976744186, | |
| "grad_norm": 0.9684611433600051, | |
| "learning_rate": 1.871484243627277e-07, | |
| "loss": 0.6055225372314453, | |
| "memory(GiB)": 74.97, | |
| "step": 410, | |
| "token_acc": 0.8125408092339449, | |
| "train_speed(iter/s)": 0.129415 | |
| }, | |
| { | |
| "epoch": 37.74418604651163, | |
| "grad_norm": 0.8148508280992611, | |
| "learning_rate": 1.867400005111495e-07, | |
| "loss": 0.5952893257141113, | |
| "memory(GiB)": 74.97, | |
| "step": 415, | |
| "token_acc": 0.8260123541523678, | |
| "train_speed(iter/s)": 0.129433 | |
| }, | |
| { | |
| "epoch": 38.18604651162791, | |
| "grad_norm": 0.7992095356192797, | |
| "learning_rate": 1.8632564809575738e-07, | |
| "loss": 0.6156826496124268, | |
| "memory(GiB)": 74.97, | |
| "step": 420, | |
| "token_acc": 0.8205879974118409, | |
| "train_speed(iter/s)": 0.12938 | |
| }, | |
| { | |
| "epoch": 38.651162790697676, | |
| "grad_norm": 3.6721651326108775, | |
| "learning_rate": 1.859053954369885e-07, | |
| "loss": 0.610502815246582, | |
| "memory(GiB)": 74.97, | |
| "step": 425, | |
| "token_acc": 0.8172398589065256, | |
| "train_speed(iter/s)": 0.129325 | |
| }, | |
| { | |
| "epoch": 39.093023255813954, | |
| "grad_norm": 0.9272484168885702, | |
| "learning_rate": 1.854792712585539e-07, | |
| "loss": 0.5535663604736328, | |
| "memory(GiB)": 74.97, | |
| "step": 430, | |
| "token_acc": 0.8236255683739807, | |
| "train_speed(iter/s)": 0.129482 | |
| }, | |
| { | |
| "epoch": 39.55813953488372, | |
| "grad_norm": 0.8018214646006986, | |
| "learning_rate": 1.8504730468547506e-07, | |
| "loss": 0.5991367340087891, | |
| "memory(GiB)": 74.97, | |
| "step": 435, | |
| "token_acc": 0.8261135086719322, | |
| "train_speed(iter/s)": 0.129405 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "grad_norm": 1.2379263967079543, | |
| "learning_rate": 1.846095252420935e-07, | |
| "loss": 0.585663890838623, | |
| "memory(GiB)": 74.97, | |
| "step": 440, | |
| "token_acc": 0.8266845321477151, | |
| "train_speed(iter/s)": 0.129434 | |
| }, | |
| { | |
| "epoch": 40.46511627906977, | |
| "grad_norm": 0.833466025772104, | |
| "learning_rate": 1.841659628500527e-07, | |
| "loss": 0.5750086784362793, | |
| "memory(GiB)": 74.97, | |
| "step": 445, | |
| "token_acc": 0.8343643862202814, | |
| "train_speed(iter/s)": 0.129525 | |
| }, | |
| { | |
| "epoch": 40.93023255813954, | |
| "grad_norm": 0.7870441769315963, | |
| "learning_rate": 1.8371664782625284e-07, | |
| "loss": 0.5996095180511475, | |
| "memory(GiB)": 74.97, | |
| "step": 450, | |
| "token_acc": 0.8262060770106785, | |
| "train_speed(iter/s)": 0.129426 | |
| }, | |
| { | |
| "epoch": 41.372093023255815, | |
| "grad_norm": 0.7270750065258582, | |
| "learning_rate": 1.8326161088077904e-07, | |
| "loss": 0.5774937629699707, | |
| "memory(GiB)": 74.97, | |
| "step": 455, | |
| "token_acc": 0.8339674588455729, | |
| "train_speed(iter/s)": 0.129531 | |
| }, | |
| { | |
| "epoch": 41.83720930232558, | |
| "grad_norm": 0.6345626674708744, | |
| "learning_rate": 1.82800883114802e-07, | |
| "loss": 0.5982451438903809, | |
| "memory(GiB)": 74.97, | |
| "step": 460, | |
| "token_acc": 0.83098393668337, | |
| "train_speed(iter/s)": 0.129577 | |
| }, | |
| { | |
| "epoch": 42.27906976744186, | |
| "grad_norm": 0.8020609888197409, | |
| "learning_rate": 1.8233449601845256e-07, | |
| "loss": 0.5845087051391602, | |
| "memory(GiB)": 74.97, | |
| "step": 465, | |
| "token_acc": 0.8175882797882081, | |
| "train_speed(iter/s)": 0.129629 | |
| }, | |
| { | |
| "epoch": 42.74418604651163, | |
| "grad_norm": 0.8480884031667174, | |
| "learning_rate": 1.8186248146866925e-07, | |
| "loss": 0.591459846496582, | |
| "memory(GiB)": 74.97, | |
| "step": 470, | |
| "token_acc": 0.8345550327140474, | |
| "train_speed(iter/s)": 0.129656 | |
| }, | |
| { | |
| "epoch": 43.18604651162791, | |
| "grad_norm": 4.128756169670704, | |
| "learning_rate": 1.8138487172701948e-07, | |
| "loss": 0.5832277297973633, | |
| "memory(GiB)": 74.97, | |
| "step": 475, | |
| "token_acc": 0.8327794561933535, | |
| "train_speed(iter/s)": 0.129649 | |
| }, | |
| { | |
| "epoch": 43.651162790697676, | |
| "grad_norm": 0.691292587718326, | |
| "learning_rate": 1.8090169943749475e-07, | |
| "loss": 0.5771265029907227, | |
| "memory(GiB)": 74.97, | |
| "step": 480, | |
| "token_acc": 0.8235854875792071, | |
| "train_speed(iter/s)": 0.129594 | |
| }, | |
| { | |
| "epoch": 44.093023255813954, | |
| "grad_norm": 0.9411447489425482, | |
| "learning_rate": 1.8041299762427914e-07, | |
| "loss": 0.5849340438842774, | |
| "memory(GiB)": 74.97, | |
| "step": 485, | |
| "token_acc": 0.8348119811167182, | |
| "train_speed(iter/s)": 0.12971 | |
| }, | |
| { | |
| "epoch": 44.55813953488372, | |
| "grad_norm": 0.9906151143939281, | |
| "learning_rate": 1.7991879968949247e-07, | |
| "loss": 0.6044949531555176, | |
| "memory(GiB)": 74.97, | |
| "step": 490, | |
| "token_acc": 0.8391592252658489, | |
| "train_speed(iter/s)": 0.129794 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "grad_norm": 0.6320054379409873, | |
| "learning_rate": 1.794191394109071e-07, | |
| "loss": 0.5554977893829346, | |
| "memory(GiB)": 74.97, | |
| "step": 495, | |
| "token_acc": 0.8345945945945946, | |
| "train_speed(iter/s)": 0.12979 | |
| }, | |
| { | |
| "epoch": 45.46511627906977, | |
| "grad_norm": 0.7061286584704719, | |
| "learning_rate": 1.7891405093963936e-07, | |
| "loss": 0.5755014896392823, | |
| "memory(GiB)": 74.97, | |
| "step": 500, | |
| "token_acc": 0.8254359194017598, | |
| "train_speed(iter/s)": 0.129688 | |
| }, | |
| { | |
| "epoch": 45.93023255813954, | |
| "grad_norm": 0.7195669164082512, | |
| "learning_rate": 1.7840356879781529e-07, | |
| "loss": 0.5827363014221192, | |
| "memory(GiB)": 74.97, | |
| "step": 505, | |
| "token_acc": 0.839882368874185, | |
| "train_speed(iter/s)": 0.129779 | |
| }, | |
| { | |
| "epoch": 46.372093023255815, | |
| "grad_norm": 0.6968950428332337, | |
| "learning_rate": 1.7788772787621125e-07, | |
| "loss": 0.5568270683288574, | |
| "memory(GiB)": 74.97, | |
| "step": 510, | |
| "token_acc": 0.8614190870002142, | |
| "train_speed(iter/s)": 0.129886 | |
| }, | |
| { | |
| "epoch": 46.83720930232558, | |
| "grad_norm": 0.7064063028804808, | |
| "learning_rate": 1.7736656343186894e-07, | |
| "loss": 0.5865127563476562, | |
| "memory(GiB)": 74.97, | |
| "step": 515, | |
| "token_acc": 0.8082950799781602, | |
| "train_speed(iter/s)": 0.129852 | |
| }, | |
| { | |
| "epoch": 47.27906976744186, | |
| "grad_norm": 0.6403030213655208, | |
| "learning_rate": 1.768401110856859e-07, | |
| "loss": 0.5599156379699707, | |
| "memory(GiB)": 74.97, | |
| "step": 520, | |
| "token_acc": 0.8375492061100334, | |
| "train_speed(iter/s)": 0.129883 | |
| }, | |
| { | |
| "epoch": 47.74418604651163, | |
| "grad_norm": 0.9706781013185869, | |
| "learning_rate": 1.7630840681998066e-07, | |
| "loss": 0.5808145523071289, | |
| "memory(GiB)": 74.97, | |
| "step": 525, | |
| "token_acc": 0.8431429663747747, | |
| "train_speed(iter/s)": 0.129855 | |
| }, | |
| { | |
| "epoch": 48.18604651162791, | |
| "grad_norm": 0.7377603527859908, | |
| "learning_rate": 1.7577148697603348e-07, | |
| "loss": 0.5715710639953613, | |
| "memory(GiB)": 74.97, | |
| "step": 530, | |
| "token_acc": 0.8220905089196077, | |
| "train_speed(iter/s)": 0.129985 | |
| }, | |
| { | |
| "epoch": 48.651162790697676, | |
| "grad_norm": 0.8535997732414037, | |
| "learning_rate": 1.7522938825160247e-07, | |
| "loss": 0.5609760284423828, | |
| "memory(GiB)": 74.97, | |
| "step": 535, | |
| "token_acc": 0.8485186181454867, | |
| "train_speed(iter/s)": 0.12997 | |
| }, | |
| { | |
| "epoch": 49.093023255813954, | |
| "grad_norm": 1.6196917405162314, | |
| "learning_rate": 1.7468214769841538e-07, | |
| "loss": 0.5788634777069092, | |
| "memory(GiB)": 74.97, | |
| "step": 540, | |
| "token_acc": 0.8494809430899153, | |
| "train_speed(iter/s)": 0.129998 | |
| }, | |
| { | |
| "epoch": 49.55813953488372, | |
| "grad_norm": 0.8074628776305832, | |
| "learning_rate": 1.7412980271963708e-07, | |
| "loss": 0.5682050704956054, | |
| "memory(GiB)": 74.97, | |
| "step": 545, | |
| "token_acc": 0.8164148196748201, | |
| "train_speed(iter/s)": 0.129923 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "grad_norm": 0.9098109454481578, | |
| "learning_rate": 1.7357239106731316e-07, | |
| "loss": 0.5588317394256592, | |
| "memory(GiB)": 74.97, | |
| "step": 550, | |
| "token_acc": 0.8282426370196996, | |
| "train_speed(iter/s)": 0.130049 | |
| }, | |
| { | |
| "epoch": 50.46511627906977, | |
| "grad_norm": 0.8717894931304141, | |
| "learning_rate": 1.7300995083978961e-07, | |
| "loss": 0.560645866394043, | |
| "memory(GiB)": 74.97, | |
| "step": 555, | |
| "token_acc": 0.8580127632625887, | |
| "train_speed(iter/s)": 0.129978 | |
| }, | |
| { | |
| "epoch": 50.93023255813954, | |
| "grad_norm": 0.901907102378853, | |
| "learning_rate": 1.724425204791089e-07, | |
| "loss": 0.5699704647064209, | |
| "memory(GiB)": 74.97, | |
| "step": 560, | |
| "token_acc": 0.8169467583456241, | |
| "train_speed(iter/s)": 0.129949 | |
| }, | |
| { | |
| "epoch": 51.372093023255815, | |
| "grad_norm": 0.6850047743663971, | |
| "learning_rate": 1.7187013876838238e-07, | |
| "loss": 0.5511385917663574, | |
| "memory(GiB)": 74.97, | |
| "step": 565, | |
| "token_acc": 0.8470804299681305, | |
| "train_speed(iter/s)": 0.130045 | |
| }, | |
| { | |
| "epoch": 51.83720930232558, | |
| "grad_norm": 0.693108198878134, | |
| "learning_rate": 1.712928448291397e-07, | |
| "loss": 0.560858964920044, | |
| "memory(GiB)": 74.97, | |
| "step": 570, | |
| "token_acc": 0.8014341226733077, | |
| "train_speed(iter/s)": 0.130065 | |
| }, | |
| { | |
| "epoch": 52.27906976744186, | |
| "grad_norm": 1.168154430184055, | |
| "learning_rate": 1.7071067811865473e-07, | |
| "loss": 0.5584731578826905, | |
| "memory(GiB)": 74.97, | |
| "step": 575, | |
| "token_acc": 0.8305429323128438, | |
| "train_speed(iter/s)": 0.130047 | |
| }, | |
| { | |
| "epoch": 52.74418604651163, | |
| "grad_norm": 0.8940504753420614, | |
| "learning_rate": 1.7012367842724884e-07, | |
| "loss": 0.5449427127838135, | |
| "memory(GiB)": 74.97, | |
| "step": 580, | |
| "token_acc": 0.8454416804487562, | |
| "train_speed(iter/s)": 0.13012 | |
| }, | |
| { | |
| "epoch": 53.18604651162791, | |
| "grad_norm": 1.3182438739088296, | |
| "learning_rate": 1.695318858755712e-07, | |
| "loss": 0.5867147445678711, | |
| "memory(GiB)": 74.97, | |
| "step": 585, | |
| "token_acc": 0.8317076233934776, | |
| "train_speed(iter/s)": 0.130155 | |
| }, | |
| { | |
| "epoch": 53.651162790697676, | |
| "grad_norm": 5.89431265738365, | |
| "learning_rate": 1.6893534091185658e-07, | |
| "loss": 0.5429623603820801, | |
| "memory(GiB)": 74.97, | |
| "step": 590, | |
| "token_acc": 0.8596458176337604, | |
| "train_speed(iter/s)": 0.130144 | |
| }, | |
| { | |
| "epoch": 54.093023255813954, | |
| "grad_norm": 0.8228392742664287, | |
| "learning_rate": 1.6833408430916082e-07, | |
| "loss": 0.5783446311950684, | |
| "memory(GiB)": 74.97, | |
| "step": 595, | |
| "token_acc": 0.8510537851964256, | |
| "train_speed(iter/s)": 0.130222 | |
| }, | |
| { | |
| "epoch": 54.55813953488372, | |
| "grad_norm": 0.6782178805084175, | |
| "learning_rate": 1.6772815716257412e-07, | |
| "loss": 0.5568069458007813, | |
| "memory(GiB)": 74.97, | |
| "step": 600, | |
| "token_acc": 0.8492520719628057, | |
| "train_speed(iter/s)": 0.130138 | |
| }, | |
| { | |
| "epoch": 54.55813953488372, | |
| "eval_loss": 0.6069812774658203, | |
| "eval_runtime": 0.7244, | |
| "eval_samples_per_second": 17.946, | |
| "eval_steps_per_second": 2.761, | |
| "eval_token_acc": 0.8424178561164862, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 55.0, | |
| "grad_norm": 0.9571327352378861, | |
| "learning_rate": 1.6711760088641197e-07, | |
| "loss": 0.549845027923584, | |
| "memory(GiB)": 74.97, | |
| "step": 605, | |
| "token_acc": 0.8441368444744543, | |
| "train_speed(iter/s)": 0.129683 | |
| }, | |
| { | |
| "epoch": 55.46511627906977, | |
| "grad_norm": 0.6574337050432097, | |
| "learning_rate": 1.665024572113848e-07, | |
| "loss": 0.5540960311889649, | |
| "memory(GiB)": 74.97, | |
| "step": 610, | |
| "token_acc": 0.8468528296996988, | |
| "train_speed(iter/s)": 0.12963 | |
| }, | |
| { | |
| "epoch": 55.93023255813954, | |
| "grad_norm": 1.3701583003213704, | |
| "learning_rate": 1.6588276818174578e-07, | |
| "loss": 0.5496389389038085, | |
| "memory(GiB)": 74.97, | |
| "step": 615, | |
| "token_acc": 0.8450532311656608, | |
| "train_speed(iter/s)": 0.129682 | |
| }, | |
| { | |
| "epoch": 56.372093023255815, | |
| "grad_norm": 0.6379537701462664, | |
| "learning_rate": 1.6525857615241686e-07, | |
| "loss": 0.5491930484771729, | |
| "memory(GiB)": 74.97, | |
| "step": 620, | |
| "token_acc": 0.8525308496423799, | |
| "train_speed(iter/s)": 0.129772 | |
| }, | |
| { | |
| "epoch": 56.83720930232558, | |
| "grad_norm": 1.0493433605209441, | |
| "learning_rate": 1.6462992378609406e-07, | |
| "loss": 0.5360322952270508, | |
| "memory(GiB)": 74.97, | |
| "step": 625, | |
| "token_acc": 0.8368756439119319, | |
| "train_speed(iter/s)": 0.129772 | |
| }, | |
| { | |
| "epoch": 57.27906976744186, | |
| "grad_norm": 1.1362722651257062, | |
| "learning_rate": 1.6399685405033166e-07, | |
| "loss": 0.5665555000305176, | |
| "memory(GiB)": 74.97, | |
| "step": 630, | |
| "token_acc": 0.8487739334900907, | |
| "train_speed(iter/s)": 0.129826 | |
| }, | |
| { | |
| "epoch": 57.74418604651163, | |
| "grad_norm": 0.6512954800566325, | |
| "learning_rate": 1.6335941021460504e-07, | |
| "loss": 0.5384564399719238, | |
| "memory(GiB)": 74.97, | |
| "step": 635, | |
| "token_acc": 0.8314396783289121, | |
| "train_speed(iter/s)": 0.129882 | |
| }, | |
| { | |
| "epoch": 58.18604651162791, | |
| "grad_norm": 0.6514693162473681, | |
| "learning_rate": 1.627176358473537e-07, | |
| "loss": 0.5575238227844238, | |
| "memory(GiB)": 74.97, | |
| "step": 640, | |
| "token_acc": 0.8362654193227916, | |
| "train_speed(iter/s)": 0.129896 | |
| }, | |
| { | |
| "epoch": 58.651162790697676, | |
| "grad_norm": 0.6211369831346565, | |
| "learning_rate": 1.6207157481300312e-07, | |
| "loss": 0.5277935981750488, | |
| "memory(GiB)": 74.97, | |
| "step": 645, | |
| "token_acc": 0.8476069720412159, | |
| "train_speed(iter/s)": 0.129829 | |
| }, | |
| { | |
| "epoch": 59.093023255813954, | |
| "grad_norm": 0.93341409437694, | |
| "learning_rate": 1.614212712689668e-07, | |
| "loss": 0.5535923480987549, | |
| "memory(GiB)": 74.97, | |
| "step": 650, | |
| "token_acc": 0.8373809799159632, | |
| "train_speed(iter/s)": 0.129933 | |
| }, | |
| { | |
| "epoch": 59.55813953488372, | |
| "grad_norm": 0.7951026197647952, | |
| "learning_rate": 1.607667696626281e-07, | |
| "loss": 0.5427175045013428, | |
| "memory(GiB)": 74.97, | |
| "step": 655, | |
| "token_acc": 0.845807408479236, | |
| "train_speed(iter/s)": 0.129879 | |
| }, | |
| { | |
| "epoch": 60.0, | |
| "grad_norm": 0.8112289345971331, | |
| "learning_rate": 1.601081147283025e-07, | |
| "loss": 0.544118070602417, | |
| "memory(GiB)": 74.97, | |
| "step": 660, | |
| "token_acc": 0.8465872536213518, | |
| "train_speed(iter/s)": 0.130007 | |
| }, | |
| { | |
| "epoch": 60.46511627906977, | |
| "grad_norm": 0.8973071989809348, | |
| "learning_rate": 1.594453514841798e-07, | |
| "loss": 0.5551681041717529, | |
| "memory(GiB)": 74.97, | |
| "step": 665, | |
| "token_acc": 0.8406223717409588, | |
| "train_speed(iter/s)": 0.129985 | |
| }, | |
| { | |
| "epoch": 60.93023255813954, | |
| "grad_norm": 0.6961112129897833, | |
| "learning_rate": 1.5877852522924732e-07, | |
| "loss": 0.5278561592102051, | |
| "memory(GiB)": 74.97, | |
| "step": 670, | |
| "token_acc": 0.8361272191105745, | |
| "train_speed(iter/s)": 0.12996 | |
| }, | |
| { | |
| "epoch": 61.372093023255815, | |
| "grad_norm": 0.8454621530526435, | |
| "learning_rate": 1.5810768154019382e-07, | |
| "loss": 0.5304566383361816, | |
| "memory(GiB)": 74.97, | |
| "step": 675, | |
| "token_acc": 0.8467184191954834, | |
| "train_speed(iter/s)": 0.130101 | |
| }, | |
| { | |
| "epoch": 61.83720930232558, | |
| "grad_norm": 0.8048317682461219, | |
| "learning_rate": 1.5743286626829435e-07, | |
| "loss": 0.556386137008667, | |
| "memory(GiB)": 74.97, | |
| "step": 680, | |
| "token_acc": 0.8513160602079739, | |
| "train_speed(iter/s)": 0.130049 | |
| }, | |
| { | |
| "epoch": 62.27906976744186, | |
| "grad_norm": 1.2555197833096778, | |
| "learning_rate": 1.5675412553627636e-07, | |
| "loss": 0.5487345695495606, | |
| "memory(GiB)": 74.97, | |
| "step": 685, | |
| "token_acc": 0.8283330021855752, | |
| "train_speed(iter/s)": 0.130158 | |
| }, | |
| { | |
| "epoch": 62.74418604651163, | |
| "grad_norm": 0.6737924387221673, | |
| "learning_rate": 1.5607150573516727e-07, | |
| "loss": 0.5273719787597656, | |
| "memory(GiB)": 74.97, | |
| "step": 690, | |
| "token_acc": 0.8344278568974075, | |
| "train_speed(iter/s)": 0.130149 | |
| }, | |
| { | |
| "epoch": 63.18604651162791, | |
| "grad_norm": 0.6321277650100168, | |
| "learning_rate": 1.5538505352112372e-07, | |
| "loss": 0.5302412986755372, | |
| "memory(GiB)": 74.97, | |
| "step": 695, | |
| "token_acc": 0.838855421686747, | |
| "train_speed(iter/s)": 0.130224 | |
| }, | |
| { | |
| "epoch": 63.651162790697676, | |
| "grad_norm": 0.6665444493375449, | |
| "learning_rate": 1.546948158122427e-07, | |
| "loss": 0.5358945846557617, | |
| "memory(GiB)": 74.97, | |
| "step": 700, | |
| "token_acc": 0.826061751191652, | |
| "train_speed(iter/s)": 0.130179 | |
| }, | |
| { | |
| "epoch": 64.09302325581395, | |
| "grad_norm": 0.7540141255217923, | |
| "learning_rate": 1.540008397853547e-07, | |
| "loss": 0.5356395244598389, | |
| "memory(GiB)": 74.97, | |
| "step": 705, | |
| "token_acc": 0.8476590569896634, | |
| "train_speed(iter/s)": 0.130248 | |
| }, | |
| { | |
| "epoch": 64.55813953488372, | |
| "grad_norm": 0.7630626447995367, | |
| "learning_rate": 1.5330317287279937e-07, | |
| "loss": 0.5312513828277587, | |
| "memory(GiB)": 74.97, | |
| "step": 710, | |
| "token_acc": 0.8489824739281576, | |
| "train_speed(iter/s)": 0.130176 | |
| }, | |
| { | |
| "epoch": 65.0, | |
| "grad_norm": 1.2266930256462827, | |
| "learning_rate": 1.526018627591834e-07, | |
| "loss": 0.5403413295745849, | |
| "memory(GiB)": 74.97, | |
| "step": 715, | |
| "token_acc": 0.8551674468851278, | |
| "train_speed(iter/s)": 0.130251 | |
| }, | |
| { | |
| "epoch": 65.46511627906976, | |
| "grad_norm": 0.7496283095791967, | |
| "learning_rate": 1.5189695737812152e-07, | |
| "loss": 0.5270286560058594, | |
| "memory(GiB)": 74.97, | |
| "step": 720, | |
| "token_acc": 0.8398781740525149, | |
| "train_speed(iter/s)": 0.130162 | |
| }, | |
| { | |
| "epoch": 65.93023255813954, | |
| "grad_norm": 0.8680329950142557, | |
| "learning_rate": 1.511885049089601e-07, | |
| "loss": 0.5444748878479004, | |
| "memory(GiB)": 74.97, | |
| "step": 725, | |
| "token_acc": 0.8464486183074266, | |
| "train_speed(iter/s)": 0.130252 | |
| }, | |
| { | |
| "epoch": 66.37209302325581, | |
| "grad_norm": 0.6415609894652046, | |
| "learning_rate": 1.5047655377348439e-07, | |
| "loss": 0.5128337383270264, | |
| "memory(GiB)": 74.97, | |
| "step": 730, | |
| "token_acc": 0.864701716521094, | |
| "train_speed(iter/s)": 0.130315 | |
| }, | |
| { | |
| "epoch": 66.83720930232558, | |
| "grad_norm": 0.6939531108133022, | |
| "learning_rate": 1.4976115263260874e-07, | |
| "loss": 0.5571429252624511, | |
| "memory(GiB)": 74.97, | |
| "step": 735, | |
| "token_acc": 0.8357370669769121, | |
| "train_speed(iter/s)": 0.130322 | |
| }, | |
| { | |
| "epoch": 67.27906976744185, | |
| "grad_norm": 0.7218530264815206, | |
| "learning_rate": 1.4904235038305082e-07, | |
| "loss": 0.5194293975830078, | |
| "memory(GiB)": 74.97, | |
| "step": 740, | |
| "token_acc": 0.8460596389007441, | |
| "train_speed(iter/s)": 0.130299 | |
| }, | |
| { | |
| "epoch": 67.74418604651163, | |
| "grad_norm": 1.285168120381986, | |
| "learning_rate": 1.483201961539896e-07, | |
| "loss": 0.5455545425415039, | |
| "memory(GiB)": 74.97, | |
| "step": 745, | |
| "token_acc": 0.8313979656134666, | |
| "train_speed(iter/s)": 0.130272 | |
| }, | |
| { | |
| "epoch": 68.18604651162791, | |
| "grad_norm": 1.94952748533025, | |
| "learning_rate": 1.4759473930370737e-07, | |
| "loss": 0.5241846084594727, | |
| "memory(GiB)": 74.97, | |
| "step": 750, | |
| "token_acc": 0.8599992655699178, | |
| "train_speed(iter/s)": 0.130347 | |
| }, | |
| { | |
| "epoch": 68.65116279069767, | |
| "grad_norm": 0.7193543863488733, | |
| "learning_rate": 1.4686602941621615e-07, | |
| "loss": 0.5322785377502441, | |
| "memory(GiB)": 74.97, | |
| "step": 755, | |
| "token_acc": 0.8371367656348705, | |
| "train_speed(iter/s)": 0.130295 | |
| }, | |
| { | |
| "epoch": 69.09302325581395, | |
| "grad_norm": 1.0867783614431274, | |
| "learning_rate": 1.4613411629786877e-07, | |
| "loss": 0.521461296081543, | |
| "memory(GiB)": 74.97, | |
| "step": 760, | |
| "token_acc": 0.8467171046810017, | |
| "train_speed(iter/s)": 0.130339 | |
| }, | |
| { | |
| "epoch": 69.55813953488372, | |
| "grad_norm": 0.7455956742708548, | |
| "learning_rate": 1.4539904997395468e-07, | |
| "loss": 0.5118254661560059, | |
| "memory(GiB)": 74.97, | |
| "step": 765, | |
| "token_acc": 0.8578669369898095, | |
| "train_speed(iter/s)": 0.13034 | |
| }, | |
| { | |
| "epoch": 70.0, | |
| "grad_norm": 0.8528350805883835, | |
| "learning_rate": 1.4466088068528067e-07, | |
| "loss": 0.5299886703491211, | |
| "memory(GiB)": 74.97, | |
| "step": 770, | |
| "token_acc": 0.8476385063027893, | |
| "train_speed(iter/s)": 0.130365 | |
| }, | |
| { | |
| "epoch": 70.46511627906976, | |
| "grad_norm": 0.6395748070686201, | |
| "learning_rate": 1.4391965888473702e-07, | |
| "loss": 0.5287624359130859, | |
| "memory(GiB)": 74.97, | |
| "step": 775, | |
| "token_acc": 0.8381954887218045, | |
| "train_speed(iter/s)": 0.130324 | |
| }, | |
| { | |
| "epoch": 70.93023255813954, | |
| "grad_norm": 0.842531216333987, | |
| "learning_rate": 1.4317543523384928e-07, | |
| "loss": 0.5287698745727539, | |
| "memory(GiB)": 74.97, | |
| "step": 780, | |
| "token_acc": 0.8566830651213208, | |
| "train_speed(iter/s)": 0.130359 | |
| }, | |
| { | |
| "epoch": 71.37209302325581, | |
| "grad_norm": 0.722140572381901, | |
| "learning_rate": 1.4242826059931536e-07, | |
| "loss": 0.5152388572692871, | |
| "memory(GiB)": 74.97, | |
| "step": 785, | |
| "token_acc": 0.8451972291311229, | |
| "train_speed(iter/s)": 0.130398 | |
| }, | |
| { | |
| "epoch": 71.83720930232558, | |
| "grad_norm": 1.1033571214972513, | |
| "learning_rate": 1.4167818604952903e-07, | |
| "loss": 0.5234486579895019, | |
| "memory(GiB)": 74.97, | |
| "step": 790, | |
| "token_acc": 0.8461698837673958, | |
| "train_speed(iter/s)": 0.130382 | |
| }, | |
| { | |
| "epoch": 72.27906976744185, | |
| "grad_norm": 0.7546592396468452, | |
| "learning_rate": 1.4092526285108939e-07, | |
| "loss": 0.5231525897979736, | |
| "memory(GiB)": 74.97, | |
| "step": 795, | |
| "token_acc": 0.8471460044061686, | |
| "train_speed(iter/s)": 0.130495 | |
| }, | |
| { | |
| "epoch": 72.74418604651163, | |
| "grad_norm": 0.7665462491639092, | |
| "learning_rate": 1.4016954246529695e-07, | |
| "loss": 0.5139668941497803, | |
| "memory(GiB)": 74.97, | |
| "step": 800, | |
| "token_acc": 0.8447760249371035, | |
| "train_speed(iter/s)": 0.130432 | |
| }, | |
| { | |
| "epoch": 72.74418604651163, | |
| "eval_loss": 0.6015437245368958, | |
| "eval_runtime": 0.7284, | |
| "eval_samples_per_second": 17.847, | |
| "eval_steps_per_second": 2.746, | |
| "eval_token_acc": 0.8434715968962544, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 73.18604651162791, | |
| "grad_norm": 0.7272029873141171, | |
| "learning_rate": 1.3941107654463616e-07, | |
| "loss": 0.5379150390625, | |
| "memory(GiB)": 74.97, | |
| "step": 805, | |
| "token_acc": 0.8408949295116442, | |
| "train_speed(iter/s)": 0.130058 | |
| }, | |
| { | |
| "epoch": 73.65116279069767, | |
| "grad_norm": 0.7995205555897585, | |
| "learning_rate": 1.3864991692924522e-07, | |
| "loss": 0.5211355209350585, | |
| "memory(GiB)": 74.97, | |
| "step": 810, | |
| "token_acc": 0.8419526596025093, | |
| "train_speed(iter/s)": 0.130097 | |
| }, | |
| { | |
| "epoch": 74.09302325581395, | |
| "grad_norm": 0.9263844311361451, | |
| "learning_rate": 1.3788611564337276e-07, | |
| "loss": 0.5166553497314453, | |
| "memory(GiB)": 74.97, | |
| "step": 815, | |
| "token_acc": 0.8460784079221183, | |
| "train_speed(iter/s)": 0.130043 | |
| }, | |
| { | |
| "epoch": 74.55813953488372, | |
| "grad_norm": 0.9100848478509656, | |
| "learning_rate": 1.3711972489182207e-07, | |
| "loss": 0.5152887344360352, | |
| "memory(GiB)": 74.97, | |
| "step": 820, | |
| "token_acc": 0.8641304347826086, | |
| "train_speed(iter/s)": 0.129973 | |
| }, | |
| { | |
| "epoch": 75.0, | |
| "grad_norm": 0.8520157723565999, | |
| "learning_rate": 1.3635079705638297e-07, | |
| "loss": 0.5118432998657226, | |
| "memory(GiB)": 74.97, | |
| "step": 825, | |
| "token_acc": 0.8406333086780081, | |
| "train_speed(iter/s)": 0.130038 | |
| }, | |
| { | |
| "epoch": 75.46511627906976, | |
| "grad_norm": 1.89559334384708, | |
| "learning_rate": 1.3557938469225164e-07, | |
| "loss": 0.5238603591918946, | |
| "memory(GiB)": 74.97, | |
| "step": 830, | |
| "token_acc": 0.8296420958151015, | |
| "train_speed(iter/s)": 0.129953 | |
| }, | |
| { | |
| "epoch": 75.93023255813954, | |
| "grad_norm": 0.8445066662231647, | |
| "learning_rate": 1.3480554052443843e-07, | |
| "loss": 0.5140830516815186, | |
| "memory(GiB)": 74.97, | |
| "step": 835, | |
| "token_acc": 0.8494189687565236, | |
| "train_speed(iter/s)": 0.13002 | |
| }, | |
| { | |
| "epoch": 76.37209302325581, | |
| "grad_norm": 1.5322849456525907, | |
| "learning_rate": 1.340293174441643e-07, | |
| "loss": 0.5148379325866699, | |
| "memory(GiB)": 74.97, | |
| "step": 840, | |
| "token_acc": 0.8386292834890966, | |
| "train_speed(iter/s)": 0.13 | |
| }, | |
| { | |
| "epoch": 76.83720930232558, | |
| "grad_norm": 0.7284489005308602, | |
| "learning_rate": 1.332507685052457e-07, | |
| "loss": 0.5148776531219482, | |
| "memory(GiB)": 74.97, | |
| "step": 845, | |
| "token_acc": 0.8438160869248159, | |
| "train_speed(iter/s)": 0.130024 | |
| }, | |
| { | |
| "epoch": 77.27906976744185, | |
| "grad_norm": 0.8254251521761937, | |
| "learning_rate": 1.3246994692046836e-07, | |
| "loss": 0.5172486305236816, | |
| "memory(GiB)": 74.97, | |
| "step": 850, | |
| "token_acc": 0.8467165799851403, | |
| "train_speed(iter/s)": 0.130065 | |
| }, | |
| { | |
| "epoch": 77.74418604651163, | |
| "grad_norm": 0.888794754410688, | |
| "learning_rate": 1.3168690605795043e-07, | |
| "loss": 0.515445613861084, | |
| "memory(GiB)": 74.97, | |
| "step": 855, | |
| "token_acc": 0.8480349170918368, | |
| "train_speed(iter/s)": 0.130098 | |
| }, | |
| { | |
| "epoch": 78.18604651162791, | |
| "grad_norm": 0.8024083233168969, | |
| "learning_rate": 1.3090169943749475e-07, | |
| "loss": 0.5077299118041992, | |
| "memory(GiB)": 74.97, | |
| "step": 860, | |
| "token_acc": 0.8461068818804495, | |
| "train_speed(iter/s)": 0.130157 | |
| }, | |
| { | |
| "epoch": 78.65116279069767, | |
| "grad_norm": 0.7968691650808981, | |
| "learning_rate": 1.3011438072693074e-07, | |
| "loss": 0.5154001235961914, | |
| "memory(GiB)": 74.97, | |
| "step": 865, | |
| "token_acc": 0.8603395311236863, | |
| "train_speed(iter/s)": 0.130118 | |
| }, | |
| { | |
| "epoch": 79.09302325581395, | |
| "grad_norm": 1.4489088486628856, | |
| "learning_rate": 1.2932500373844649e-07, | |
| "loss": 0.5220766544342041, | |
| "memory(GiB)": 74.97, | |
| "step": 870, | |
| "token_acc": 0.8575108409621586, | |
| "train_speed(iter/s)": 0.130187 | |
| }, | |
| { | |
| "epoch": 79.55813953488372, | |
| "grad_norm": 0.833164944608322, | |
| "learning_rate": 1.2853362242491051e-07, | |
| "loss": 0.5146864414215088, | |
| "memory(GiB)": 74.97, | |
| "step": 875, | |
| "token_acc": 0.8354072612769832, | |
| "train_speed(iter/s)": 0.130231 | |
| }, | |
| { | |
| "epoch": 80.0, | |
| "grad_norm": 1.0334544104049193, | |
| "learning_rate": 1.2774029087618446e-07, | |
| "loss": 0.5196131706237793, | |
| "memory(GiB)": 74.97, | |
| "step": 880, | |
| "token_acc": 0.8273188610093036, | |
| "train_speed(iter/s)": 0.130243 | |
| }, | |
| { | |
| "epoch": 80.46511627906976, | |
| "grad_norm": 0.7327428116602168, | |
| "learning_rate": 1.2694506331542577e-07, | |
| "loss": 0.5012516975402832, | |
| "memory(GiB)": 74.97, | |
| "step": 885, | |
| "token_acc": 0.8552629297640307, | |
| "train_speed(iter/s)": 0.130266 | |
| }, | |
| { | |
| "epoch": 80.93023255813954, | |
| "grad_norm": 0.7823436928202996, | |
| "learning_rate": 1.2614799409538198e-07, | |
| "loss": 0.5132665634155273, | |
| "memory(GiB)": 74.97, | |
| "step": 890, | |
| "token_acc": 0.8614560088497263, | |
| "train_speed(iter/s)": 0.130236 | |
| }, | |
| { | |
| "epoch": 81.37209302325581, | |
| "grad_norm": 0.8496813139641767, | |
| "learning_rate": 1.253491376946754e-07, | |
| "loss": 0.5047847747802734, | |
| "memory(GiB)": 74.97, | |
| "step": 895, | |
| "token_acc": 0.8672797358731915, | |
| "train_speed(iter/s)": 0.130316 | |
| }, | |
| { | |
| "epoch": 81.83720930232558, | |
| "grad_norm": 0.7662540093111049, | |
| "learning_rate": 1.2454854871407992e-07, | |
| "loss": 0.5070115566253662, | |
| "memory(GiB)": 74.97, | |
| "step": 900, | |
| "token_acc": 0.8437890633276128, | |
| "train_speed(iter/s)": 0.130345 | |
| }, | |
| { | |
| "epoch": 82.27906976744185, | |
| "grad_norm": 1.1403186852474703, | |
| "learning_rate": 1.2374628187278885e-07, | |
| "loss": 0.5135304450988769, | |
| "memory(GiB)": 74.97, | |
| "step": 905, | |
| "token_acc": 0.8760885832099473, | |
| "train_speed(iter/s)": 0.130362 | |
| }, | |
| { | |
| "epoch": 82.74418604651163, | |
| "grad_norm": 0.6850775896882327, | |
| "learning_rate": 1.2294239200467515e-07, | |
| "loss": 0.48610854148864746, | |
| "memory(GiB)": 74.97, | |
| "step": 910, | |
| "token_acc": 0.864081524616199, | |
| "train_speed(iter/s)": 0.130315 | |
| }, | |
| { | |
| "epoch": 83.18604651162791, | |
| "grad_norm": 1.7277139603374756, | |
| "learning_rate": 1.2213693405454345e-07, | |
| "loss": 0.5195373058319092, | |
| "memory(GiB)": 74.97, | |
| "step": 915, | |
| "token_acc": 0.842862242005585, | |
| "train_speed(iter/s)": 0.130334 | |
| }, | |
| { | |
| "epoch": 83.65116279069767, | |
| "grad_norm": 1.562225291111122, | |
| "learning_rate": 1.213299630743747e-07, | |
| "loss": 0.5000184059143067, | |
| "memory(GiB)": 74.97, | |
| "step": 920, | |
| "token_acc": 0.8502656832421286, | |
| "train_speed(iter/s)": 0.130352 | |
| }, | |
| { | |
| "epoch": 84.09302325581395, | |
| "grad_norm": 0.7432167354378622, | |
| "learning_rate": 1.205215342195634e-07, | |
| "loss": 0.4978955745697021, | |
| "memory(GiB)": 74.97, | |
| "step": 925, | |
| "token_acc": 0.8535459925769887, | |
| "train_speed(iter/s)": 0.130407 | |
| }, | |
| { | |
| "epoch": 84.55813953488372, | |
| "grad_norm": 2.2667269366172267, | |
| "learning_rate": 1.1971170274514802e-07, | |
| "loss": 0.5232599258422852, | |
| "memory(GiB)": 74.97, | |
| "step": 930, | |
| "token_acc": 0.8631094983089064, | |
| "train_speed(iter/s)": 0.130392 | |
| }, | |
| { | |
| "epoch": 85.0, | |
| "grad_norm": 0.7640021499203492, | |
| "learning_rate": 1.1890052400203402e-07, | |
| "loss": 0.48494710922241213, | |
| "memory(GiB)": 74.97, | |
| "step": 935, | |
| "token_acc": 0.8383060054320491, | |
| "train_speed(iter/s)": 0.130461 | |
| }, | |
| { | |
| "epoch": 85.46511627906976, | |
| "grad_norm": 0.797825246843515, | |
| "learning_rate": 1.18088053433211e-07, | |
| "loss": 0.4894867897033691, | |
| "memory(GiB)": 74.97, | |
| "step": 940, | |
| "token_acc": 0.862217698107348, | |
| "train_speed(iter/s)": 0.130536 | |
| }, | |
| { | |
| "epoch": 85.93023255813954, | |
| "grad_norm": 1.118805326320862, | |
| "learning_rate": 1.1727434656996305e-07, | |
| "loss": 0.5085083961486816, | |
| "memory(GiB)": 74.97, | |
| "step": 945, | |
| "token_acc": 0.8468460041903622, | |
| "train_speed(iter/s)": 0.130472 | |
| }, | |
| { | |
| "epoch": 86.37209302325581, | |
| "grad_norm": 0.8642381524493187, | |
| "learning_rate": 1.1645945902807339e-07, | |
| "loss": 0.501039457321167, | |
| "memory(GiB)": 74.97, | |
| "step": 950, | |
| "token_acc": 0.8637289013917678, | |
| "train_speed(iter/s)": 0.130524 | |
| }, | |
| { | |
| "epoch": 86.83720930232558, | |
| "grad_norm": 0.876594093463965, | |
| "learning_rate": 1.1564344650402309e-07, | |
| "loss": 0.5047001838684082, | |
| "memory(GiB)": 74.97, | |
| "step": 955, | |
| "token_acc": 0.8469405442884382, | |
| "train_speed(iter/s)": 0.130517 | |
| }, | |
| { | |
| "epoch": 87.27906976744185, | |
| "grad_norm": 1.2339377952227535, | |
| "learning_rate": 1.1482636477118419e-07, | |
| "loss": 0.5183281898498535, | |
| "memory(GiB)": 74.97, | |
| "step": 960, | |
| "token_acc": 0.848177734504658, | |
| "train_speed(iter/s)": 0.130587 | |
| }, | |
| { | |
| "epoch": 87.74418604651163, | |
| "grad_norm": 0.631851683029857, | |
| "learning_rate": 1.1400826967600779e-07, | |
| "loss": 0.483397912979126, | |
| "memory(GiB)": 74.97, | |
| "step": 965, | |
| "token_acc": 0.8719364241861677, | |
| "train_speed(iter/s)": 0.130556 | |
| }, | |
| { | |
| "epoch": 88.18604651162791, | |
| "grad_norm": 1.0951446409255636, | |
| "learning_rate": 1.131892171342069e-07, | |
| "loss": 0.5028903007507324, | |
| "memory(GiB)": 74.97, | |
| "step": 970, | |
| "token_acc": 0.8738672544697527, | |
| "train_speed(iter/s)": 0.130594 | |
| }, | |
| { | |
| "epoch": 88.65116279069767, | |
| "grad_norm": 0.7683275760751048, | |
| "learning_rate": 1.1236926312693478e-07, | |
| "loss": 0.4880162239074707, | |
| "memory(GiB)": 74.97, | |
| "step": 975, | |
| "token_acc": 0.8594904599095622, | |
| "train_speed(iter/s)": 0.130573 | |
| }, | |
| { | |
| "epoch": 89.09302325581395, | |
| "grad_norm": 6.943858471099767, | |
| "learning_rate": 1.1154846369695863e-07, | |
| "loss": 0.5035033226013184, | |
| "memory(GiB)": 74.97, | |
| "step": 980, | |
| "token_acc": 0.8718237375361853, | |
| "train_speed(iter/s)": 0.130588 | |
| }, | |
| { | |
| "epoch": 89.55813953488372, | |
| "grad_norm": 0.722153826562248, | |
| "learning_rate": 1.1072687494482918e-07, | |
| "loss": 0.5015533447265625, | |
| "memory(GiB)": 74.97, | |
| "step": 985, | |
| "token_acc": 0.8497986934062595, | |
| "train_speed(iter/s)": 0.130571 | |
| }, | |
| { | |
| "epoch": 90.0, | |
| "grad_norm": 0.7791081924406386, | |
| "learning_rate": 1.0990455302504628e-07, | |
| "loss": 0.4978206157684326, | |
| "memory(GiB)": 74.97, | |
| "step": 990, | |
| "token_acc": 0.8598159926863901, | |
| "train_speed(iter/s)": 0.130602 | |
| }, | |
| { | |
| "epoch": 90.46511627906976, | |
| "grad_norm": 0.7993364463951824, | |
| "learning_rate": 1.0908155414222082e-07, | |
| "loss": 0.47749814987182615, | |
| "memory(GiB)": 74.97, | |
| "step": 995, | |
| "token_acc": 0.8648952240771585, | |
| "train_speed(iter/s)": 0.13055 | |
| }, | |
| { | |
| "epoch": 90.93023255813954, | |
| "grad_norm": 0.9293826361291836, | |
| "learning_rate": 1.0825793454723325e-07, | |
| "loss": 0.4996511936187744, | |
| "memory(GiB)": 74.97, | |
| "step": 1000, | |
| "token_acc": 0.8631259732808786, | |
| "train_speed(iter/s)": 0.13058 | |
| }, | |
| { | |
| "epoch": 90.93023255813954, | |
| "eval_loss": 0.602931022644043, | |
| "eval_runtime": 0.716, | |
| "eval_samples_per_second": 18.155, | |
| "eval_steps_per_second": 2.793, | |
| "eval_token_acc": 0.8428010345818565, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 91.37209302325581, | |
| "grad_norm": 1.1841722413103843, | |
| "learning_rate": 1.0743375053338877e-07, | |
| "loss": 0.5005837440490722, | |
| "memory(GiB)": 74.97, | |
| "step": 1005, | |
| "token_acc": 0.8554249955862447, | |
| "train_speed(iter/s)": 0.130297 | |
| }, | |
| { | |
| "epoch": 91.83720930232558, | |
| "grad_norm": 1.4009925352920263, | |
| "learning_rate": 1.0660905843256993e-07, | |
| "loss": 0.504381799697876, | |
| "memory(GiB)": 74.97, | |
| "step": 1010, | |
| "token_acc": 0.8340634861704103, | |
| "train_speed(iter/s)": 0.130275 | |
| }, | |
| { | |
| "epoch": 92.27906976744185, | |
| "grad_norm": 0.7236486242500604, | |
| "learning_rate": 1.057839146113864e-07, | |
| "loss": 0.4767627716064453, | |
| "memory(GiB)": 74.97, | |
| "step": 1015, | |
| "token_acc": 0.8686680165507527, | |
| "train_speed(iter/s)": 0.130259 | |
| }, | |
| { | |
| "epoch": 92.74418604651163, | |
| "grad_norm": 1.436377509073585, | |
| "learning_rate": 1.0495837546732223e-07, | |
| "loss": 0.5024114131927491, | |
| "memory(GiB)": 74.97, | |
| "step": 1020, | |
| "token_acc": 0.8457889431344258, | |
| "train_speed(iter/s)": 0.1303 | |
| }, | |
| { | |
| "epoch": 93.18604651162791, | |
| "grad_norm": 0.9279689257580228, | |
| "learning_rate": 1.0413249742488131e-07, | |
| "loss": 0.48839874267578126, | |
| "memory(GiB)": 74.97, | |
| "step": 1025, | |
| "token_acc": 0.8741351653515239, | |
| "train_speed(iter/s)": 0.130339 | |
| }, | |
| { | |
| "epoch": 93.65116279069767, | |
| "grad_norm": 1.5611563038818324, | |
| "learning_rate": 1.033063369317308e-07, | |
| "loss": 0.48693456649780276, | |
| "memory(GiB)": 74.97, | |
| "step": 1030, | |
| "token_acc": 0.8771067535162163, | |
| "train_speed(iter/s)": 0.130324 | |
| }, | |
| { | |
| "epoch": 94.09302325581395, | |
| "grad_norm": 1.1172420689296867, | |
| "learning_rate": 1.0247995045484301e-07, | |
| "loss": 0.5037758350372314, | |
| "memory(GiB)": 74.97, | |
| "step": 1035, | |
| "token_acc": 0.8510888627433569, | |
| "train_speed(iter/s)": 0.130325 | |
| }, | |
| { | |
| "epoch": 94.55813953488372, | |
| "grad_norm": 0.8609018320733309, | |
| "learning_rate": 1.0165339447663586e-07, | |
| "loss": 0.4941869258880615, | |
| "memory(GiB)": 74.97, | |
| "step": 1040, | |
| "token_acc": 0.8680718468508801, | |
| "train_speed(iter/s)": 0.130309 | |
| }, | |
| { | |
| "epoch": 95.0, | |
| "grad_norm": 1.9807352700715366, | |
| "learning_rate": 1.0082672549111248e-07, | |
| "loss": 0.4907430648803711, | |
| "memory(GiB)": 74.97, | |
| "step": 1045, | |
| "token_acc": 0.8594207248443011, | |
| "train_speed(iter/s)": 0.130342 | |
| }, | |
| { | |
| "epoch": 95.46511627906976, | |
| "grad_norm": 1.2132659915520214, | |
| "learning_rate": 1e-07, | |
| "loss": 0.5144547462463379, | |
| "memory(GiB)": 74.97, | |
| "step": 1050, | |
| "token_acc": 0.8396730861192019, | |
| "train_speed(iter/s)": 0.130346 | |
| }, | |
| { | |
| "epoch": 95.93023255813954, | |
| "grad_norm": 1.6515430345069437, | |
| "learning_rate": 9.917327450888751e-08, | |
| "loss": 0.46764235496520995, | |
| "memory(GiB)": 74.97, | |
| "step": 1055, | |
| "token_acc": 0.8370761686275335, | |
| "train_speed(iter/s)": 0.130367 | |
| }, | |
| { | |
| "epoch": 96.37209302325581, | |
| "grad_norm": 0.7140536621397322, | |
| "learning_rate": 9.834660552336415e-08, | |
| "loss": 0.48370823860168455, | |
| "memory(GiB)": 74.97, | |
| "step": 1060, | |
| "token_acc": 0.8530308955807587, | |
| "train_speed(iter/s)": 0.13034 | |
| }, | |
| { | |
| "epoch": 96.83720930232558, | |
| "grad_norm": 1.0809702853567489, | |
| "learning_rate": 9.752004954515699e-08, | |
| "loss": 0.49426803588867185, | |
| "memory(GiB)": 74.97, | |
| "step": 1065, | |
| "token_acc": 0.8571793110216901, | |
| "train_speed(iter/s)": 0.130299 | |
| }, | |
| { | |
| "epoch": 97.27906976744185, | |
| "grad_norm": 0.7163522482069422, | |
| "learning_rate": 9.669366306826918e-08, | |
| "loss": 0.4718944072723389, | |
| "memory(GiB)": 74.97, | |
| "step": 1070, | |
| "token_acc": 0.8619141314767166, | |
| "train_speed(iter/s)": 0.130378 | |
| }, | |
| { | |
| "epoch": 97.74418604651163, | |
| "grad_norm": 0.83946396188462, | |
| "learning_rate": 9.586750257511866e-08, | |
| "loss": 0.4911818504333496, | |
| "memory(GiB)": 74.97, | |
| "step": 1075, | |
| "token_acc": 0.8650800071189347, | |
| "train_speed(iter/s)": 0.1303 | |
| }, | |
| { | |
| "epoch": 98.18604651162791, | |
| "grad_norm": 0.9728064150742605, | |
| "learning_rate": 9.504162453267775e-08, | |
| "loss": 0.4725058078765869, | |
| "memory(GiB)": 74.97, | |
| "step": 1080, | |
| "token_acc": 0.876843910806175, | |
| "train_speed(iter/s)": 0.130339 | |
| }, | |
| { | |
| "epoch": 98.65116279069767, | |
| "grad_norm": 0.7909880351612323, | |
| "learning_rate": 9.421608538861361e-08, | |
| "loss": 0.4865569114685059, | |
| "memory(GiB)": 74.97, | |
| "step": 1085, | |
| "token_acc": 0.8610528723363702, | |
| "train_speed(iter/s)": 0.130299 | |
| }, | |
| { | |
| "epoch": 99.09302325581395, | |
| "grad_norm": 0.8239796766786783, | |
| "learning_rate": 9.339094156743006e-08, | |
| "loss": 0.49038195610046387, | |
| "memory(GiB)": 74.97, | |
| "step": 1090, | |
| "token_acc": 0.8451910122126125, | |
| "train_speed(iter/s)": 0.130321 | |
| }, | |
| { | |
| "epoch": 99.55813953488372, | |
| "grad_norm": 0.714832653552484, | |
| "learning_rate": 9.256624946661125e-08, | |
| "loss": 0.47361068725585936, | |
| "memory(GiB)": 74.97, | |
| "step": 1095, | |
| "token_acc": 0.8569815516103255, | |
| "train_speed(iter/s)": 0.13029 | |
| }, | |
| { | |
| "epoch": 100.0, | |
| "grad_norm": 1.1976610090490132, | |
| "learning_rate": 9.174206545276677e-08, | |
| "loss": 0.49490890502929685, | |
| "memory(GiB)": 74.97, | |
| "step": 1100, | |
| "token_acc": 0.8424860734638123, | |
| "train_speed(iter/s)": 0.130347 | |
| }, | |
| { | |
| "epoch": 100.46511627906976, | |
| "grad_norm": 0.8008632586934444, | |
| "learning_rate": 9.091844585777917e-08, | |
| "loss": 0.4697834014892578, | |
| "memory(GiB)": 74.97, | |
| "step": 1105, | |
| "token_acc": 0.8580395195660596, | |
| "train_speed(iter/s)": 0.130358 | |
| }, | |
| { | |
| "epoch": 100.93023255813954, | |
| "grad_norm": 0.6845439357302979, | |
| "learning_rate": 9.009544697495372e-08, | |
| "loss": 0.48686370849609373, | |
| "memory(GiB)": 74.97, | |
| "step": 1110, | |
| "token_acc": 0.8574517231821122, | |
| "train_speed(iter/s)": 0.13037 | |
| }, | |
| { | |
| "epoch": 101.37209302325581, | |
| "grad_norm": 0.7958237623480675, | |
| "learning_rate": 8.927312505517084e-08, | |
| "loss": 0.4824103832244873, | |
| "memory(GiB)": 74.97, | |
| "step": 1115, | |
| "token_acc": 0.8406652121643884, | |
| "train_speed(iter/s)": 0.130425 | |
| }, | |
| { | |
| "epoch": 101.83720930232558, | |
| "grad_norm": 0.7562640332446442, | |
| "learning_rate": 8.845153630304139e-08, | |
| "loss": 0.4883410453796387, | |
| "memory(GiB)": 74.97, | |
| "step": 1120, | |
| "token_acc": 0.8654994502241394, | |
| "train_speed(iter/s)": 0.130404 | |
| }, | |
| { | |
| "epoch": 102.27906976744185, | |
| "grad_norm": 0.91816981994612, | |
| "learning_rate": 8.763073687306523e-08, | |
| "loss": 0.47723941802978515, | |
| "memory(GiB)": 74.97, | |
| "step": 1125, | |
| "token_acc": 0.8617350394493566, | |
| "train_speed(iter/s)": 0.13041 | |
| }, | |
| { | |
| "epoch": 102.74418604651163, | |
| "grad_norm": 1.0207292255363964, | |
| "learning_rate": 8.68107828657931e-08, | |
| "loss": 0.48489856719970703, | |
| "memory(GiB)": 74.97, | |
| "step": 1130, | |
| "token_acc": 0.8609944029573764, | |
| "train_speed(iter/s)": 0.130389 | |
| }, | |
| { | |
| "epoch": 103.18604651162791, | |
| "grad_norm": 1.0123784499736115, | |
| "learning_rate": 8.59917303239922e-08, | |
| "loss": 0.4814739227294922, | |
| "memory(GiB)": 74.97, | |
| "step": 1135, | |
| "token_acc": 0.8705958429561201, | |
| "train_speed(iter/s)": 0.130431 | |
| }, | |
| { | |
| "epoch": 103.65116279069767, | |
| "grad_norm": 0.7408077875426933, | |
| "learning_rate": 8.517363522881579e-08, | |
| "loss": 0.47219066619873046, | |
| "memory(GiB)": 74.97, | |
| "step": 1140, | |
| "token_acc": 0.8524354155002799, | |
| "train_speed(iter/s)": 0.130432 | |
| }, | |
| { | |
| "epoch": 104.09302325581395, | |
| "grad_norm": 0.7314596110135979, | |
| "learning_rate": 8.435655349597689e-08, | |
| "loss": 0.4839695930480957, | |
| "memory(GiB)": 74.97, | |
| "step": 1145, | |
| "token_acc": 0.8638605778320128, | |
| "train_speed(iter/s)": 0.130438 | |
| }, | |
| { | |
| "epoch": 104.55813953488372, | |
| "grad_norm": 0.8022214373595549, | |
| "learning_rate": 8.354054097192658e-08, | |
| "loss": 0.4761360168457031, | |
| "memory(GiB)": 74.97, | |
| "step": 1150, | |
| "token_acc": 0.8594843717513341, | |
| "train_speed(iter/s)": 0.130347 | |
| }, | |
| { | |
| "epoch": 105.0, | |
| "grad_norm": 0.9319907606891521, | |
| "learning_rate": 8.2725653430037e-08, | |
| "loss": 0.4859612941741943, | |
| "memory(GiB)": 74.97, | |
| "step": 1155, | |
| "token_acc": 0.8549445575922154, | |
| "train_speed(iter/s)": 0.130405 | |
| }, | |
| { | |
| "epoch": 105.46511627906976, | |
| "grad_norm": 1.2930176911390905, | |
| "learning_rate": 8.191194656678904e-08, | |
| "loss": 0.4661128044128418, | |
| "memory(GiB)": 74.97, | |
| "step": 1160, | |
| "token_acc": 0.8626862925482981, | |
| "train_speed(iter/s)": 0.130393 | |
| }, | |
| { | |
| "epoch": 105.93023255813954, | |
| "grad_norm": 0.9575779480555059, | |
| "learning_rate": 8.109947599796598e-08, | |
| "loss": 0.484060001373291, | |
| "memory(GiB)": 74.97, | |
| "step": 1165, | |
| "token_acc": 0.8556487381611823, | |
| "train_speed(iter/s)": 0.130404 | |
| }, | |
| { | |
| "epoch": 106.37209302325581, | |
| "grad_norm": 0.7486234774787734, | |
| "learning_rate": 8.028829725485198e-08, | |
| "loss": 0.4818765640258789, | |
| "memory(GiB)": 74.97, | |
| "step": 1170, | |
| "token_acc": 0.8624224886316659, | |
| "train_speed(iter/s)": 0.130438 | |
| }, | |
| { | |
| "epoch": 106.83720930232558, | |
| "grad_norm": 0.7280471700597845, | |
| "learning_rate": 7.947846578043658e-08, | |
| "loss": 0.48406553268432617, | |
| "memory(GiB)": 74.97, | |
| "step": 1175, | |
| "token_acc": 0.8612848675893546, | |
| "train_speed(iter/s)": 0.130403 | |
| }, | |
| { | |
| "epoch": 107.27906976744185, | |
| "grad_norm": 0.9703016724934369, | |
| "learning_rate": 7.867003692562532e-08, | |
| "loss": 0.46012191772460936, | |
| "memory(GiB)": 74.97, | |
| "step": 1180, | |
| "token_acc": 0.8720765414599575, | |
| "train_speed(iter/s)": 0.13046 | |
| }, | |
| { | |
| "epoch": 107.74418604651163, | |
| "grad_norm": 1.7207486244429357, | |
| "learning_rate": 7.786306594545656e-08, | |
| "loss": 0.47897043228149416, | |
| "memory(GiB)": 74.97, | |
| "step": 1185, | |
| "token_acc": 0.8613559838243008, | |
| "train_speed(iter/s)": 0.130449 | |
| }, | |
| { | |
| "epoch": 108.18604651162791, | |
| "grad_norm": 1.0944806454073215, | |
| "learning_rate": 7.705760799532485e-08, | |
| "loss": 0.48472142219543457, | |
| "memory(GiB)": 74.97, | |
| "step": 1190, | |
| "token_acc": 0.8510737233682787, | |
| "train_speed(iter/s)": 0.130447 | |
| }, | |
| { | |
| "epoch": 108.65116279069767, | |
| "grad_norm": 0.7340918962562681, | |
| "learning_rate": 7.625371812721114e-08, | |
| "loss": 0.46958436965942385, | |
| "memory(GiB)": 74.97, | |
| "step": 1195, | |
| "token_acc": 0.8719202394209354, | |
| "train_speed(iter/s)": 0.130463 | |
| }, | |
| { | |
| "epoch": 109.09302325581395, | |
| "grad_norm": 0.939464587476609, | |
| "learning_rate": 7.545145128592009e-08, | |
| "loss": 0.47149295806884767, | |
| "memory(GiB)": 74.97, | |
| "step": 1200, | |
| "token_acc": 0.8800350262697023, | |
| "train_speed(iter/s)": 0.130453 | |
| }, | |
| { | |
| "epoch": 109.09302325581395, | |
| "eval_loss": 0.6058527827262878, | |
| "eval_runtime": 0.7066, | |
| "eval_samples_per_second": 18.397, | |
| "eval_steps_per_second": 2.83, | |
| "eval_token_acc": 0.8434715968962544, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 109.55813953488372, | |
| "grad_norm": 0.8652359563773929, | |
| "learning_rate": 7.465086230532459e-08, | |
| "loss": 0.476532506942749, | |
| "memory(GiB)": 74.97, | |
| "step": 1205, | |
| "token_acc": 0.8694151027245068, | |
| "train_speed(iter/s)": 0.130187 | |
| }, | |
| { | |
| "epoch": 110.0, | |
| "grad_norm": 0.8098360520222708, | |
| "learning_rate": 7.385200590461802e-08, | |
| "loss": 0.4804817199707031, | |
| "memory(GiB)": 74.97, | |
| "step": 1210, | |
| "token_acc": 0.8504993058976311, | |
| "train_speed(iter/s)": 0.130211 | |
| }, | |
| { | |
| "epoch": 110.46511627906976, | |
| "grad_norm": 0.7864179053648999, | |
| "learning_rate": 7.305493668457419e-08, | |
| "loss": 0.46163101196289064, | |
| "memory(GiB)": 74.97, | |
| "step": 1215, | |
| "token_acc": 0.8520807581376184, | |
| "train_speed(iter/s)": 0.130209 | |
| }, | |
| { | |
| "epoch": 110.93023255813954, | |
| "grad_norm": 1.2076707405286862, | |
| "learning_rate": 7.225970912381556e-08, | |
| "loss": 0.4753293991088867, | |
| "memory(GiB)": 74.97, | |
| "step": 1220, | |
| "token_acc": 0.8554707472061939, | |
| "train_speed(iter/s)": 0.1302 | |
| }, | |
| { | |
| "epoch": 111.37209302325581, | |
| "grad_norm": 0.871709312109685, | |
| "learning_rate": 7.146637757508949e-08, | |
| "loss": 0.47620530128479005, | |
| "memory(GiB)": 74.97, | |
| "step": 1225, | |
| "token_acc": 0.8760574752720532, | |
| "train_speed(iter/s)": 0.13021 | |
| }, | |
| { | |
| "epoch": 111.83720930232558, | |
| "grad_norm": 0.7334760311164147, | |
| "learning_rate": 7.067499626155353e-08, | |
| "loss": 0.46177024841308595, | |
| "memory(GiB)": 74.97, | |
| "step": 1230, | |
| "token_acc": 0.8513760840189522, | |
| "train_speed(iter/s)": 0.130203 | |
| }, | |
| { | |
| "epoch": 112.27906976744185, | |
| "grad_norm": 0.9429213919362676, | |
| "learning_rate": 6.988561927306926e-08, | |
| "loss": 0.4705217361450195, | |
| "memory(GiB)": 74.97, | |
| "step": 1235, | |
| "token_acc": 0.8782852564102565, | |
| "train_speed(iter/s)": 0.130244 | |
| }, | |
| { | |
| "epoch": 112.74418604651163, | |
| "grad_norm": 1.0006229504211153, | |
| "learning_rate": 6.909830056250527e-08, | |
| "loss": 0.46991333961486814, | |
| "memory(GiB)": 74.97, | |
| "step": 1240, | |
| "token_acc": 0.8570367690462136, | |
| "train_speed(iter/s)": 0.130239 | |
| }, | |
| { | |
| "epoch": 113.18604651162791, | |
| "grad_norm": 1.5600658321413452, | |
| "learning_rate": 6.831309394204956e-08, | |
| "loss": 0.5063477039337159, | |
| "memory(GiB)": 74.97, | |
| "step": 1245, | |
| "token_acc": 0.8328871703351179, | |
| "train_speed(iter/s)": 0.130265 | |
| }, | |
| { | |
| "epoch": 113.65116279069767, | |
| "grad_norm": 0.7100324996989047, | |
| "learning_rate": 6.753005307953166e-08, | |
| "loss": 0.4718203544616699, | |
| "memory(GiB)": 74.97, | |
| "step": 1250, | |
| "token_acc": 0.846406587098945, | |
| "train_speed(iter/s)": 0.130269 | |
| }, | |
| { | |
| "epoch": 114.09302325581395, | |
| "grad_norm": 1.02085122390004, | |
| "learning_rate": 6.674923149475432e-08, | |
| "loss": 0.46040911674499513, | |
| "memory(GiB)": 74.97, | |
| "step": 1255, | |
| "token_acc": 0.8600188738597043, | |
| "train_speed(iter/s)": 0.130273 | |
| }, | |
| { | |
| "epoch": 114.55813953488372, | |
| "grad_norm": 0.7602372463858895, | |
| "learning_rate": 6.597068255583569e-08, | |
| "loss": 0.4706200122833252, | |
| "memory(GiB)": 74.97, | |
| "step": 1260, | |
| "token_acc": 0.850320256204964, | |
| "train_speed(iter/s)": 0.130273 | |
| }, | |
| { | |
| "epoch": 115.0, | |
| "grad_norm": 0.880014706373256, | |
| "learning_rate": 6.519445947556154e-08, | |
| "loss": 0.4695608139038086, | |
| "memory(GiB)": 74.97, | |
| "step": 1265, | |
| "token_acc": 0.8624032731477363, | |
| "train_speed(iter/s)": 0.1303 | |
| }, | |
| { | |
| "epoch": 115.46511627906976, | |
| "grad_norm": 1.2127086778344998, | |
| "learning_rate": 6.442061530774834e-08, | |
| "loss": 0.47931528091430664, | |
| "memory(GiB)": 74.97, | |
| "step": 1270, | |
| "token_acc": 0.844140842826416, | |
| "train_speed(iter/s)": 0.130299 | |
| }, | |
| { | |
| "epoch": 115.93023255813954, | |
| "grad_norm": 1.083099670256692, | |
| "learning_rate": 6.3649202943617e-08, | |
| "loss": 0.4720285415649414, | |
| "memory(GiB)": 74.97, | |
| "step": 1275, | |
| "token_acc": 0.8607366273040511, | |
| "train_speed(iter/s)": 0.130282 | |
| }, | |
| { | |
| "epoch": 116.37209302325581, | |
| "grad_norm": 0.767737493501071, | |
| "learning_rate": 6.288027510817791e-08, | |
| "loss": 0.4558729648590088, | |
| "memory(GiB)": 74.97, | |
| "step": 1280, | |
| "token_acc": 0.8583624139902605, | |
| "train_speed(iter/s)": 0.130308 | |
| }, | |
| { | |
| "epoch": 116.83720930232558, | |
| "grad_norm": 1.6440663696409548, | |
| "learning_rate": 6.211388435662721e-08, | |
| "loss": 0.47510428428649903, | |
| "memory(GiB)": 74.97, | |
| "step": 1285, | |
| "token_acc": 0.8627756653992396, | |
| "train_speed(iter/s)": 0.130302 | |
| }, | |
| { | |
| "epoch": 117.27906976744185, | |
| "grad_norm": 0.8947253671514697, | |
| "learning_rate": 6.135008307075479e-08, | |
| "loss": 0.48160324096679685, | |
| "memory(GiB)": 74.97, | |
| "step": 1290, | |
| "token_acc": 0.8668218530666949, | |
| "train_speed(iter/s)": 0.130333 | |
| }, | |
| { | |
| "epoch": 117.74418604651163, | |
| "grad_norm": 0.7850295846326071, | |
| "learning_rate": 6.058892345536387e-08, | |
| "loss": 0.4656852722167969, | |
| "memory(GiB)": 74.97, | |
| "step": 1295, | |
| "token_acc": 0.8760795485278474, | |
| "train_speed(iter/s)": 0.13033 | |
| }, | |
| { | |
| "epoch": 118.18604651162791, | |
| "grad_norm": 0.7825259584750254, | |
| "learning_rate": 5.983045753470308e-08, | |
| "loss": 0.4575822830200195, | |
| "memory(GiB)": 74.97, | |
| "step": 1300, | |
| "token_acc": 0.8609389541215373, | |
| "train_speed(iter/s)": 0.130341 | |
| }, | |
| { | |
| "epoch": 118.65116279069767, | |
| "grad_norm": 1.3046914177444136, | |
| "learning_rate": 5.9074737148910606e-08, | |
| "loss": 0.45604352951049804, | |
| "memory(GiB)": 74.97, | |
| "step": 1305, | |
| "token_acc": 0.8543227692364619, | |
| "train_speed(iter/s)": 0.130361 | |
| }, | |
| { | |
| "epoch": 119.09302325581395, | |
| "grad_norm": 1.1212563362731731, | |
| "learning_rate": 5.832181395047098e-08, | |
| "loss": 0.4669440269470215, | |
| "memory(GiB)": 74.97, | |
| "step": 1310, | |
| "token_acc": 0.868457034673772, | |
| "train_speed(iter/s)": 0.130368 | |
| }, | |
| { | |
| "epoch": 119.55813953488372, | |
| "grad_norm": 0.8339959692059283, | |
| "learning_rate": 5.7571739400684635e-08, | |
| "loss": 0.47755279541015627, | |
| "memory(GiB)": 74.97, | |
| "step": 1315, | |
| "token_acc": 0.8567007810897974, | |
| "train_speed(iter/s)": 0.130336 | |
| }, | |
| { | |
| "epoch": 120.0, | |
| "grad_norm": 0.9954950376422352, | |
| "learning_rate": 5.682456476615072e-08, | |
| "loss": 0.4645816802978516, | |
| "memory(GiB)": 74.97, | |
| "step": 1320, | |
| "token_acc": 0.8440125792344356, | |
| "train_speed(iter/s)": 0.130377 | |
| }, | |
| { | |
| "epoch": 120.46511627906976, | |
| "grad_norm": 0.6612384359472665, | |
| "learning_rate": 5.6080341115262976e-08, | |
| "loss": 0.45533552169799807, | |
| "memory(GiB)": 74.97, | |
| "step": 1325, | |
| "token_acc": 0.8586772074823821, | |
| "train_speed(iter/s)": 0.130382 | |
| }, | |
| { | |
| "epoch": 120.93023255813954, | |
| "grad_norm": 1.098228237433943, | |
| "learning_rate": 5.533911931471935e-08, | |
| "loss": 0.4692089080810547, | |
| "memory(GiB)": 74.97, | |
| "step": 1330, | |
| "token_acc": 0.8699830311690632, | |
| "train_speed(iter/s)": 0.13038 | |
| }, | |
| { | |
| "epoch": 121.37209302325581, | |
| "grad_norm": 0.7854095634086957, | |
| "learning_rate": 5.460095002604532e-08, | |
| "loss": 0.46064138412475586, | |
| "memory(GiB)": 74.97, | |
| "step": 1335, | |
| "token_acc": 0.8677652211026369, | |
| "train_speed(iter/s)": 0.130369 | |
| }, | |
| { | |
| "epoch": 121.83720930232558, | |
| "grad_norm": 2.1438550225472506, | |
| "learning_rate": 5.386588370213123e-08, | |
| "loss": 0.47399129867553713, | |
| "memory(GiB)": 74.97, | |
| "step": 1340, | |
| "token_acc": 0.8342529761205946, | |
| "train_speed(iter/s)": 0.130402 | |
| }, | |
| { | |
| "epoch": 122.27906976744185, | |
| "grad_norm": 0.7685065811470108, | |
| "learning_rate": 5.313397058378386e-08, | |
| "loss": 0.46064081192016604, | |
| "memory(GiB)": 74.97, | |
| "step": 1345, | |
| "token_acc": 0.8655901006480077, | |
| "train_speed(iter/s)": 0.130445 | |
| }, | |
| { | |
| "epoch": 122.74418604651163, | |
| "grad_norm": 0.7484657906315015, | |
| "learning_rate": 5.240526069629264e-08, | |
| "loss": 0.4805141925811768, | |
| "memory(GiB)": 74.97, | |
| "step": 1350, | |
| "token_acc": 0.8551861286142021, | |
| "train_speed(iter/s)": 0.13041 | |
| }, | |
| { | |
| "epoch": 123.18604651162791, | |
| "grad_norm": 0.7881353244361399, | |
| "learning_rate": 5.1679803846010403e-08, | |
| "loss": 0.4467328071594238, | |
| "memory(GiB)": 74.97, | |
| "step": 1355, | |
| "token_acc": 0.8620426261271331, | |
| "train_speed(iter/s)": 0.13044 | |
| }, | |
| { | |
| "epoch": 123.65116279069767, | |
| "grad_norm": 0.9062139816497382, | |
| "learning_rate": 5.0957649616949215e-08, | |
| "loss": 0.4628152847290039, | |
| "memory(GiB)": 74.97, | |
| "step": 1360, | |
| "token_acc": 0.8650447427293065, | |
| "train_speed(iter/s)": 0.13041 | |
| }, | |
| { | |
| "epoch": 124.09302325581395, | |
| "grad_norm": 0.7919450228717162, | |
| "learning_rate": 5.0238847367391314e-08, | |
| "loss": 0.45865640640258787, | |
| "memory(GiB)": 74.97, | |
| "step": 1365, | |
| "token_acc": 0.8601923709624354, | |
| "train_speed(iter/s)": 0.130427 | |
| }, | |
| { | |
| "epoch": 124.55813953488372, | |
| "grad_norm": 0.6379402091206297, | |
| "learning_rate": 4.952344622651565e-08, | |
| "loss": 0.4563908576965332, | |
| "memory(GiB)": 74.97, | |
| "step": 1370, | |
| "token_acc": 0.8681956209045869, | |
| "train_speed(iter/s)": 0.130458 | |
| }, | |
| { | |
| "epoch": 125.0, | |
| "grad_norm": 1.558446245583931, | |
| "learning_rate": 4.8811495091039923e-08, | |
| "loss": 0.4724306106567383, | |
| "memory(GiB)": 74.97, | |
| "step": 1375, | |
| "token_acc": 0.860769332539525, | |
| "train_speed(iter/s)": 0.13045 | |
| }, | |
| { | |
| "epoch": 125.46511627906976, | |
| "grad_norm": 0.7458216850987389, | |
| "learning_rate": 4.810304262187851e-08, | |
| "loss": 0.46082763671875, | |
| "memory(GiB)": 74.97, | |
| "step": 1380, | |
| "token_acc": 0.8532610918012676, | |
| "train_speed(iter/s)": 0.130461 | |
| }, | |
| { | |
| "epoch": 125.93023255813954, | |
| "grad_norm": 3.0851446793520743, | |
| "learning_rate": 4.739813724081661e-08, | |
| "loss": 0.47005910873413087, | |
| "memory(GiB)": 74.97, | |
| "step": 1385, | |
| "token_acc": 0.8665938394822649, | |
| "train_speed(iter/s)": 0.13044 | |
| }, | |
| { | |
| "epoch": 126.37209302325581, | |
| "grad_norm": 0.6792623717144913, | |
| "learning_rate": 4.6696827127200644e-08, | |
| "loss": 0.44311208724975587, | |
| "memory(GiB)": 74.97, | |
| "step": 1390, | |
| "token_acc": 0.8821935667868566, | |
| "train_speed(iter/s)": 0.130476 | |
| }, | |
| { | |
| "epoch": 126.83720930232558, | |
| "grad_norm": 0.8236864003533888, | |
| "learning_rate": 4.599916021464531e-08, | |
| "loss": 0.4629988670349121, | |
| "memory(GiB)": 74.97, | |
| "step": 1395, | |
| "token_acc": 0.8768711824231926, | |
| "train_speed(iter/s)": 0.130449 | |
| }, | |
| { | |
| "epoch": 127.27906976744185, | |
| "grad_norm": 1.9627254088333494, | |
| "learning_rate": 4.530518418775733e-08, | |
| "loss": 0.48299736976623536, | |
| "memory(GiB)": 74.97, | |
| "step": 1400, | |
| "token_acc": 0.8665078296300133, | |
| "train_speed(iter/s)": 0.130474 | |
| }, | |
| { | |
| "epoch": 127.27906976744185, | |
| "eval_loss": 0.6098422408103943, | |
| "eval_runtime": 0.6984, | |
| "eval_samples_per_second": 18.613, | |
| "eval_steps_per_second": 2.864, | |
| "eval_token_acc": 0.8434715968962544, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 127.74418604651163, | |
| "grad_norm": 0.8045410987121008, | |
| "learning_rate": 4.4614946478876306e-08, | |
| "loss": 0.45166778564453125, | |
| "memory(GiB)": 74.97, | |
| "step": 1405, | |
| "token_acc": 0.8502284891267625, | |
| "train_speed(iter/s)": 0.130135 | |
| }, | |
| { | |
| "epoch": 128.1860465116279, | |
| "grad_norm": 3.0860005142595193, | |
| "learning_rate": 4.392849426483274e-08, | |
| "loss": 0.4591231822967529, | |
| "memory(GiB)": 74.97, | |
| "step": 1410, | |
| "token_acc": 0.8607654013690106, | |
| "train_speed(iter/s)": 0.130182 | |
| }, | |
| { | |
| "epoch": 128.65116279069767, | |
| "grad_norm": 1.1087292922703431, | |
| "learning_rate": 4.324587446372364e-08, | |
| "loss": 0.474017858505249, | |
| "memory(GiB)": 74.97, | |
| "step": 1415, | |
| "token_acc": 0.8642677323380807, | |
| "train_speed(iter/s)": 0.130185 | |
| }, | |
| { | |
| "epoch": 129.09302325581396, | |
| "grad_norm": 1.0228218711643116, | |
| "learning_rate": 4.256713373170564e-08, | |
| "loss": 0.4518399715423584, | |
| "memory(GiB)": 74.97, | |
| "step": 1420, | |
| "token_acc": 0.8715422807155804, | |
| "train_speed(iter/s)": 0.130231 | |
| }, | |
| { | |
| "epoch": 129.5581395348837, | |
| "grad_norm": 0.8006830274772974, | |
| "learning_rate": 4.1892318459806175e-08, | |
| "loss": 0.46432695388793943, | |
| "memory(GiB)": 74.97, | |
| "step": 1425, | |
| "token_acc": 0.8545170039641143, | |
| "train_speed(iter/s)": 0.130209 | |
| }, | |
| { | |
| "epoch": 130.0, | |
| "grad_norm": 1.0265209199413956, | |
| "learning_rate": 4.1221474770752695e-08, | |
| "loss": 0.44231014251708983, | |
| "memory(GiB)": 74.97, | |
| "step": 1430, | |
| "token_acc": 0.8699077672962582, | |
| "train_speed(iter/s)": 0.130226 | |
| }, | |
| { | |
| "epoch": 130.46511627906978, | |
| "grad_norm": 0.8188302695487818, | |
| "learning_rate": 4.055464851582021e-08, | |
| "loss": 0.4583402156829834, | |
| "memory(GiB)": 74.97, | |
| "step": 1435, | |
| "token_acc": 0.8707192214769637, | |
| "train_speed(iter/s)": 0.130227 | |
| }, | |
| { | |
| "epoch": 130.93023255813952, | |
| "grad_norm": 0.8252804258656437, | |
| "learning_rate": 3.989188527169749e-08, | |
| "loss": 0.46385898590087893, | |
| "memory(GiB)": 74.97, | |
| "step": 1440, | |
| "token_acc": 0.8788993882124901, | |
| "train_speed(iter/s)": 0.130217 | |
| }, | |
| { | |
| "epoch": 131.37209302325581, | |
| "grad_norm": 0.8872565038088099, | |
| "learning_rate": 3.923323033737188e-08, | |
| "loss": 0.4746572017669678, | |
| "memory(GiB)": 74.97, | |
| "step": 1445, | |
| "token_acc": 0.8457805814107371, | |
| "train_speed(iter/s)": 0.130274 | |
| }, | |
| { | |
| "epoch": 131.8372093023256, | |
| "grad_norm": 1.1550562475118538, | |
| "learning_rate": 3.857872873103322e-08, | |
| "loss": 0.44470739364624023, | |
| "memory(GiB)": 74.97, | |
| "step": 1450, | |
| "token_acc": 0.8544268219233085, | |
| "train_speed(iter/s)": 0.130237 | |
| }, | |
| { | |
| "epoch": 132.27906976744185, | |
| "grad_norm": 0.7549641151940925, | |
| "learning_rate": 3.7928425186996883e-08, | |
| "loss": 0.46361541748046875, | |
| "memory(GiB)": 74.97, | |
| "step": 1455, | |
| "token_acc": 0.8666913610733835, | |
| "train_speed(iter/s)": 0.130263 | |
| }, | |
| { | |
| "epoch": 132.74418604651163, | |
| "grad_norm": 0.8290416052434509, | |
| "learning_rate": 3.7282364152646295e-08, | |
| "loss": 0.45833826065063477, | |
| "memory(GiB)": 74.97, | |
| "step": 1460, | |
| "token_acc": 0.8540965869971476, | |
| "train_speed(iter/s)": 0.130268 | |
| }, | |
| { | |
| "epoch": 133.1860465116279, | |
| "grad_norm": 0.8534414388843884, | |
| "learning_rate": 3.664058978539495e-08, | |
| "loss": 0.4486083507537842, | |
| "memory(GiB)": 74.97, | |
| "step": 1465, | |
| "token_acc": 0.8745001477395844, | |
| "train_speed(iter/s)": 0.130303 | |
| }, | |
| { | |
| "epoch": 133.65116279069767, | |
| "grad_norm": 0.8212409711926915, | |
| "learning_rate": 3.600314594966833e-08, | |
| "loss": 0.4511223316192627, | |
| "memory(GiB)": 74.97, | |
| "step": 1470, | |
| "token_acc": 0.8836138231258182, | |
| "train_speed(iter/s)": 0.130277 | |
| }, | |
| { | |
| "epoch": 134.09302325581396, | |
| "grad_norm": 1.0201258607355366, | |
| "learning_rate": 3.53700762139059e-08, | |
| "loss": 0.48140726089477537, | |
| "memory(GiB)": 74.97, | |
| "step": 1475, | |
| "token_acc": 0.8690138329330979, | |
| "train_speed(iter/s)": 0.130297 | |
| }, | |
| { | |
| "epoch": 134.5581395348837, | |
| "grad_norm": 0.7617400291414114, | |
| "learning_rate": 3.474142384758313e-08, | |
| "loss": 0.4485898017883301, | |
| "memory(GiB)": 74.97, | |
| "step": 1480, | |
| "token_acc": 0.864516129032258, | |
| "train_speed(iter/s)": 0.130302 | |
| }, | |
| { | |
| "epoch": 135.0, | |
| "grad_norm": 1.0462722965857336, | |
| "learning_rate": 3.41172318182542e-08, | |
| "loss": 0.45436367988586424, | |
| "memory(GiB)": 74.97, | |
| "step": 1485, | |
| "token_acc": 0.8761111111111111, | |
| "train_speed(iter/s)": 0.130327 | |
| }, | |
| { | |
| "epoch": 135.46511627906978, | |
| "grad_norm": 0.8458888970103854, | |
| "learning_rate": 3.349754278861516e-08, | |
| "loss": 0.4582218170166016, | |
| "memory(GiB)": 74.97, | |
| "step": 1490, | |
| "token_acc": 0.8546937352291658, | |
| "train_speed(iter/s)": 0.130305 | |
| }, | |
| { | |
| "epoch": 135.93023255813952, | |
| "grad_norm": 1.0227265853515555, | |
| "learning_rate": 3.2882399113588066e-08, | |
| "loss": 0.44946842193603515, | |
| "memory(GiB)": 74.97, | |
| "step": 1495, | |
| "token_acc": 0.8783018139714396, | |
| "train_speed(iter/s)": 0.130303 | |
| }, | |
| { | |
| "epoch": 136.37209302325581, | |
| "grad_norm": 0.9319234599915691, | |
| "learning_rate": 3.227184283742591e-08, | |
| "loss": 0.4635480880737305, | |
| "memory(GiB)": 74.97, | |
| "step": 1500, | |
| "token_acc": 0.8700904636260837, | |
| "train_speed(iter/s)": 0.130364 | |
| }, | |
| { | |
| "epoch": 136.8372093023256, | |
| "grad_norm": 0.7448189618376913, | |
| "learning_rate": 3.166591569083916e-08, | |
| "loss": 0.45705451965332033, | |
| "memory(GiB)": 74.97, | |
| "step": 1505, | |
| "token_acc": 0.8637782801950199, | |
| "train_speed(iter/s)": 0.130342 | |
| }, | |
| { | |
| "epoch": 137.27906976744185, | |
| "grad_norm": 1.1529755818910967, | |
| "learning_rate": 3.106465908814342e-08, | |
| "loss": 0.45585179328918457, | |
| "memory(GiB)": 74.97, | |
| "step": 1510, | |
| "token_acc": 0.8591232839778012, | |
| "train_speed(iter/s)": 0.130363 | |
| }, | |
| { | |
| "epoch": 137.74418604651163, | |
| "grad_norm": 0.8260584253674346, | |
| "learning_rate": 3.04681141244288e-08, | |
| "loss": 0.46056065559387205, | |
| "memory(GiB)": 74.97, | |
| "step": 1515, | |
| "token_acc": 0.8692046456648592, | |
| "train_speed(iter/s)": 0.130323 | |
| }, | |
| { | |
| "epoch": 138.1860465116279, | |
| "grad_norm": 1.052950893981831, | |
| "learning_rate": 2.987632157275114e-08, | |
| "loss": 0.45586233139038085, | |
| "memory(GiB)": 74.97, | |
| "step": 1520, | |
| "token_acc": 0.863406408094435, | |
| "train_speed(iter/s)": 0.130349 | |
| }, | |
| { | |
| "epoch": 138.65116279069767, | |
| "grad_norm": 0.7991045733474148, | |
| "learning_rate": 2.928932188134525e-08, | |
| "loss": 0.4538632869720459, | |
| "memory(GiB)": 74.97, | |
| "step": 1525, | |
| "token_acc": 0.8717533864610406, | |
| "train_speed(iter/s)": 0.130328 | |
| }, | |
| { | |
| "epoch": 139.09302325581396, | |
| "grad_norm": 1.434227664193626, | |
| "learning_rate": 2.8707155170860297e-08, | |
| "loss": 0.46680850982666017, | |
| "memory(GiB)": 74.97, | |
| "step": 1530, | |
| "token_acc": 0.8410292981517798, | |
| "train_speed(iter/s)": 0.130343 | |
| }, | |
| { | |
| "epoch": 139.5581395348837, | |
| "grad_norm": 0.7631653651545482, | |
| "learning_rate": 2.8129861231617612e-08, | |
| "loss": 0.44613943099975584, | |
| "memory(GiB)": 74.97, | |
| "step": 1535, | |
| "token_acc": 0.8678071275982503, | |
| "train_speed(iter/s)": 0.130345 | |
| }, | |
| { | |
| "epoch": 140.0, | |
| "grad_norm": 1.1321589909418222, | |
| "learning_rate": 2.7557479520891104e-08, | |
| "loss": 0.4599461078643799, | |
| "memory(GiB)": 74.97, | |
| "step": 1540, | |
| "token_acc": 0.8688507394846334, | |
| "train_speed(iter/s)": 0.130373 | |
| }, | |
| { | |
| "epoch": 140.46511627906978, | |
| "grad_norm": 0.9020009960345104, | |
| "learning_rate": 2.699004916021038e-08, | |
| "loss": 0.4559918403625488, | |
| "memory(GiB)": 74.97, | |
| "step": 1545, | |
| "token_acc": 0.8607777938412606, | |
| "train_speed(iter/s)": 0.130363 | |
| }, | |
| { | |
| "epoch": 140.93023255813952, | |
| "grad_norm": 0.7719651412897752, | |
| "learning_rate": 2.642760893268684e-08, | |
| "loss": 0.459440279006958, | |
| "memory(GiB)": 74.97, | |
| "step": 1550, | |
| "token_acc": 0.881872014598279, | |
| "train_speed(iter/s)": 0.130356 | |
| }, | |
| { | |
| "epoch": 141.37209302325581, | |
| "grad_norm": 0.7388402954698886, | |
| "learning_rate": 2.5870197280362915e-08, | |
| "loss": 0.42969484329223634, | |
| "memory(GiB)": 74.97, | |
| "step": 1555, | |
| "token_acc": 0.8883613399742432, | |
| "train_speed(iter/s)": 0.130344 | |
| }, | |
| { | |
| "epoch": 141.8372093023256, | |
| "grad_norm": 0.7267745408294942, | |
| "learning_rate": 2.5317852301584643e-08, | |
| "loss": 0.4578805923461914, | |
| "memory(GiB)": 74.97, | |
| "step": 1560, | |
| "token_acc": 0.8614507600793126, | |
| "train_speed(iter/s)": 0.130359 | |
| }, | |
| { | |
| "epoch": 142.27906976744185, | |
| "grad_norm": 0.7380347392311346, | |
| "learning_rate": 2.477061174839755e-08, | |
| "loss": 0.465103816986084, | |
| "memory(GiB)": 74.97, | |
| "step": 1565, | |
| "token_acc": 0.8470271187879302, | |
| "train_speed(iter/s)": 0.130395 | |
| }, | |
| { | |
| "epoch": 142.74418604651163, | |
| "grad_norm": 0.8136693379385729, | |
| "learning_rate": 2.4228513023966547e-08, | |
| "loss": 0.45352745056152344, | |
| "memory(GiB)": 74.97, | |
| "step": 1570, | |
| "token_acc": 0.8680695298875026, | |
| "train_speed(iter/s)": 0.130411 | |
| }, | |
| { | |
| "epoch": 143.1860465116279, | |
| "grad_norm": 0.8229594379364835, | |
| "learning_rate": 2.3691593180019364e-08, | |
| "loss": 0.46236839294433596, | |
| "memory(GiB)": 74.97, | |
| "step": 1575, | |
| "token_acc": 0.8841950432568365, | |
| "train_speed(iter/s)": 0.130422 | |
| }, | |
| { | |
| "epoch": 143.65116279069767, | |
| "grad_norm": 0.8764589511746724, | |
| "learning_rate": 2.315988891431412e-08, | |
| "loss": 0.44404191970825196, | |
| "memory(GiB)": 74.97, | |
| "step": 1580, | |
| "token_acc": 0.8711640164847799, | |
| "train_speed(iter/s)": 0.130418 | |
| }, | |
| { | |
| "epoch": 144.09302325581396, | |
| "grad_norm": 0.7989067686578916, | |
| "learning_rate": 2.263343656813107e-08, | |
| "loss": 0.46502885818481443, | |
| "memory(GiB)": 74.97, | |
| "step": 1585, | |
| "token_acc": 0.8433810096689391, | |
| "train_speed(iter/s)": 0.130431 | |
| }, | |
| { | |
| "epoch": 144.5581395348837, | |
| "grad_norm": 0.7139519442470533, | |
| "learning_rate": 2.2112272123788767e-08, | |
| "loss": 0.4445913314819336, | |
| "memory(GiB)": 74.97, | |
| "step": 1590, | |
| "token_acc": 0.8719508074869924, | |
| "train_speed(iter/s)": 0.130449 | |
| }, | |
| { | |
| "epoch": 145.0, | |
| "grad_norm": 0.854611201984833, | |
| "learning_rate": 2.1596431202184705e-08, | |
| "loss": 0.45667543411254885, | |
| "memory(GiB)": 74.97, | |
| "step": 1595, | |
| "token_acc": 0.8543597957753529, | |
| "train_speed(iter/s)": 0.130485 | |
| }, | |
| { | |
| "epoch": 145.46511627906978, | |
| "grad_norm": 0.8370879177525832, | |
| "learning_rate": 2.108594906036065e-08, | |
| "loss": 0.45542278289794924, | |
| "memory(GiB)": 74.97, | |
| "step": 1600, | |
| "token_acc": 0.8767741127199183, | |
| "train_speed(iter/s)": 0.130498 | |
| }, | |
| { | |
| "epoch": 145.46511627906978, | |
| "eval_loss": 0.612120509147644, | |
| "eval_runtime": 0.6973, | |
| "eval_samples_per_second": 18.643, | |
| "eval_steps_per_second": 2.868, | |
| "eval_token_acc": 0.8432800076635693, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 145.93023255813952, | |
| "grad_norm": 1.8496079436558843, | |
| "learning_rate": 2.0580860589092895e-08, | |
| "loss": 0.4458228588104248, | |
| "memory(GiB)": 74.97, | |
| "step": 1605, | |
| "token_acc": 0.8655583853748735, | |
| "train_speed(iter/s)": 0.130281 | |
| }, | |
| { | |
| "epoch": 146.37209302325581, | |
| "grad_norm": 0.8949636135857424, | |
| "learning_rate": 2.008120031050753e-08, | |
| "loss": 0.4534448146820068, | |
| "memory(GiB)": 74.97, | |
| "step": 1610, | |
| "token_acc": 0.8604011376099039, | |
| "train_speed(iter/s)": 0.130303 | |
| }, | |
| { | |
| "epoch": 146.8372093023256, | |
| "grad_norm": 1.4041818864948623, | |
| "learning_rate": 1.9587002375720862e-08, | |
| "loss": 0.46073060035705565, | |
| "memory(GiB)": 74.97, | |
| "step": 1615, | |
| "token_acc": 0.8637630263007214, | |
| "train_speed(iter/s)": 0.130307 | |
| }, | |
| { | |
| "epoch": 147.27906976744185, | |
| "grad_norm": 0.8001120538073951, | |
| "learning_rate": 1.9098300562505266e-08, | |
| "loss": 0.44887795448303225, | |
| "memory(GiB)": 74.97, | |
| "step": 1620, | |
| "token_acc": 0.8565026887074288, | |
| "train_speed(iter/s)": 0.130322 | |
| }, | |
| { | |
| "epoch": 147.74418604651163, | |
| "grad_norm": 1.3994263394516653, | |
| "learning_rate": 1.8615128272980507e-08, | |
| "loss": 0.4529706001281738, | |
| "memory(GiB)": 74.97, | |
| "step": 1625, | |
| "token_acc": 0.8614175728232399, | |
| "train_speed(iter/s)": 0.130317 | |
| }, | |
| { | |
| "epoch": 148.1860465116279, | |
| "grad_norm": 0.7986626000623837, | |
| "learning_rate": 1.8137518531330763e-08, | |
| "loss": 0.45129976272583006, | |
| "memory(GiB)": 74.97, | |
| "step": 1630, | |
| "token_acc": 0.88025613660619, | |
| "train_speed(iter/s)": 0.130337 | |
| }, | |
| { | |
| "epoch": 148.65116279069767, | |
| "grad_norm": 0.7053069152982997, | |
| "learning_rate": 1.7665503981547425e-08, | |
| "loss": 0.45789132118225095, | |
| "memory(GiB)": 74.97, | |
| "step": 1635, | |
| "token_acc": 0.8718905932360007, | |
| "train_speed(iter/s)": 0.13032 | |
| }, | |
| { | |
| "epoch": 149.09302325581396, | |
| "grad_norm": 1.327231576897276, | |
| "learning_rate": 1.7199116885197997e-08, | |
| "loss": 0.45948057174682616, | |
| "memory(GiB)": 74.97, | |
| "step": 1640, | |
| "token_acc": 0.8687992670776631, | |
| "train_speed(iter/s)": 0.13036 | |
| }, | |
| { | |
| "epoch": 149.5581395348837, | |
| "grad_norm": 1.8690818535078901, | |
| "learning_rate": 1.6738389119220965e-08, | |
| "loss": 0.4487407684326172, | |
| "memory(GiB)": 74.97, | |
| "step": 1645, | |
| "token_acc": 0.8717857813184292, | |
| "train_speed(iter/s)": 0.130356 | |
| }, | |
| { | |
| "epoch": 150.0, | |
| "grad_norm": 1.4986410325133508, | |
| "learning_rate": 1.6283352173747144e-08, | |
| "loss": 0.46256265640258787, | |
| "memory(GiB)": 74.97, | |
| "step": 1650, | |
| "token_acc": 0.8687363834422658, | |
| "train_speed(iter/s)": 0.130382 | |
| }, | |
| { | |
| "epoch": 150.46511627906978, | |
| "grad_norm": 0.9212362231868645, | |
| "learning_rate": 1.5834037149947288e-08, | |
| "loss": 0.45532588958740233, | |
| "memory(GiB)": 74.97, | |
| "step": 1655, | |
| "token_acc": 0.8637431617337635, | |
| "train_speed(iter/s)": 0.130388 | |
| }, | |
| { | |
| "epoch": 150.93023255813952, | |
| "grad_norm": 0.7589204558012844, | |
| "learning_rate": 1.5390474757906446e-08, | |
| "loss": 0.4434979438781738, | |
| "memory(GiB)": 74.97, | |
| "step": 1660, | |
| "token_acc": 0.8520731295389292, | |
| "train_speed(iter/s)": 0.130379 | |
| }, | |
| { | |
| "epoch": 151.37209302325581, | |
| "grad_norm": 0.7171576824463824, | |
| "learning_rate": 1.495269531452491e-08, | |
| "loss": 0.45127115249633787, | |
| "memory(GiB)": 74.97, | |
| "step": 1665, | |
| "token_acc": 0.8684483740245822, | |
| "train_speed(iter/s)": 0.130418 | |
| }, | |
| { | |
| "epoch": 151.8372093023256, | |
| "grad_norm": 0.7991722745507821, | |
| "learning_rate": 1.4520728741446087e-08, | |
| "loss": 0.4588929176330566, | |
| "memory(GiB)": 74.97, | |
| "step": 1670, | |
| "token_acc": 0.8637134079593206, | |
| "train_speed(iter/s)": 0.130412 | |
| }, | |
| { | |
| "epoch": 152.27906976744185, | |
| "grad_norm": 1.041259181485301, | |
| "learning_rate": 1.409460456301147e-08, | |
| "loss": 0.4453131675720215, | |
| "memory(GiB)": 74.97, | |
| "step": 1675, | |
| "token_acc": 0.8544123886296139, | |
| "train_speed(iter/s)": 0.130435 | |
| }, | |
| { | |
| "epoch": 152.74418604651163, | |
| "grad_norm": 0.8313710748011637, | |
| "learning_rate": 1.367435190424261e-08, | |
| "loss": 0.45928287506103516, | |
| "memory(GiB)": 74.97, | |
| "step": 1680, | |
| "token_acc": 0.8679964503247146, | |
| "train_speed(iter/s)": 0.13042 | |
| }, | |
| { | |
| "epoch": 153.1860465116279, | |
| "grad_norm": 0.8698596114757391, | |
| "learning_rate": 1.3259999488850471e-08, | |
| "loss": 0.4635627746582031, | |
| "memory(GiB)": 74.97, | |
| "step": 1685, | |
| "token_acc": 0.8450357565069091, | |
| "train_speed(iter/s)": 0.130437 | |
| }, | |
| { | |
| "epoch": 153.65116279069767, | |
| "grad_norm": 0.8269395824162962, | |
| "learning_rate": 1.285157563727226e-08, | |
| "loss": 0.44847860336303713, | |
| "memory(GiB)": 74.97, | |
| "step": 1690, | |
| "token_acc": 0.8680811179277437, | |
| "train_speed(iter/s)": 0.130425 | |
| }, | |
| { | |
| "epoch": 154.09302325581396, | |
| "grad_norm": 0.9761994911989668, | |
| "learning_rate": 1.244910826473572e-08, | |
| "loss": 0.45370187759399416, | |
| "memory(GiB)": 74.97, | |
| "step": 1695, | |
| "token_acc": 0.8793547562067264, | |
| "train_speed(iter/s)": 0.130443 | |
| }, | |
| { | |
| "epoch": 154.5581395348837, | |
| "grad_norm": 1.6323959605839558, | |
| "learning_rate": 1.2052624879351104e-08, | |
| "loss": 0.4481173515319824, | |
| "memory(GiB)": 74.97, | |
| "step": 1700, | |
| "token_acc": 0.8561244744199944, | |
| "train_speed(iter/s)": 0.13047 | |
| }, | |
| { | |
| "epoch": 155.0, | |
| "grad_norm": 0.9207580708371824, | |
| "learning_rate": 1.1662152580231144e-08, | |
| "loss": 0.4539341926574707, | |
| "memory(GiB)": 74.97, | |
| "step": 1705, | |
| "token_acc": 0.8649734464445824, | |
| "train_speed(iter/s)": 0.130465 | |
| }, | |
| { | |
| "epoch": 155.46511627906978, | |
| "grad_norm": 0.745689965265747, | |
| "learning_rate": 1.1277718055638818e-08, | |
| "loss": 0.4519050598144531, | |
| "memory(GiB)": 74.97, | |
| "step": 1710, | |
| "token_acc": 0.852934204004224, | |
| "train_speed(iter/s)": 0.13047 | |
| }, | |
| { | |
| "epoch": 155.93023255813952, | |
| "grad_norm": 0.8506585464720108, | |
| "learning_rate": 1.089934758116322e-08, | |
| "loss": 0.4458354949951172, | |
| "memory(GiB)": 74.97, | |
| "step": 1715, | |
| "token_acc": 0.874605201263356, | |
| "train_speed(iter/s)": 0.130466 | |
| }, | |
| { | |
| "epoch": 156.37209302325581, | |
| "grad_norm": 0.8256841894574871, | |
| "learning_rate": 1.0527067017923653e-08, | |
| "loss": 0.4461174011230469, | |
| "memory(GiB)": 74.97, | |
| "step": 1720, | |
| "token_acc": 0.8700296735905044, | |
| "train_speed(iter/s)": 0.130504 | |
| }, | |
| { | |
| "epoch": 156.8372093023256, | |
| "grad_norm": 1.507219892035112, | |
| "learning_rate": 1.0160901810802114e-08, | |
| "loss": 0.45079655647277833, | |
| "memory(GiB)": 74.97, | |
| "step": 1725, | |
| "token_acc": 0.8675626379955842, | |
| "train_speed(iter/s)": 0.130482 | |
| }, | |
| { | |
| "epoch": 157.27906976744185, | |
| "grad_norm": 0.8360642824375936, | |
| "learning_rate": 9.800876986704109e-09, | |
| "loss": 0.46644229888916017, | |
| "memory(GiB)": 74.97, | |
| "step": 1730, | |
| "token_acc": 0.8489071709233792, | |
| "train_speed(iter/s)": 0.13048 | |
| }, | |
| { | |
| "epoch": 157.74418604651163, | |
| "grad_norm": 1.1246527506944004, | |
| "learning_rate": 9.447017152848125e-09, | |
| "loss": 0.4457961082458496, | |
| "memory(GiB)": 74.97, | |
| "step": 1735, | |
| "token_acc": 0.8624011633190948, | |
| "train_speed(iter/s)": 0.130482 | |
| }, | |
| { | |
| "epoch": 158.1860465116279, | |
| "grad_norm": 1.166595856803442, | |
| "learning_rate": 9.099346495083749e-09, | |
| "loss": 0.46271514892578125, | |
| "memory(GiB)": 74.97, | |
| "step": 1740, | |
| "token_acc": 0.870665567772931, | |
| "train_speed(iter/s)": 0.130529 | |
| }, | |
| { | |
| "epoch": 158.65116279069767, | |
| "grad_norm": 0.997579039313746, | |
| "learning_rate": 8.75788877623862e-09, | |
| "loss": 0.45302181243896483, | |
| "memory(GiB)": 74.97, | |
| "step": 1745, | |
| "token_acc": 0.8601099764336214, | |
| "train_speed(iter/s)": 0.130496 | |
| }, | |
| { | |
| "epoch": 159.09302325581396, | |
| "grad_norm": 0.8847455985487748, | |
| "learning_rate": 8.422667334494249e-09, | |
| "loss": 0.44652571678161623, | |
| "memory(GiB)": 74.97, | |
| "step": 1750, | |
| "token_acc": 0.8695352691736444, | |
| "train_speed(iter/s)": 0.130543 | |
| }, | |
| { | |
| "epoch": 159.5581395348837, | |
| "grad_norm": 0.7005559155585159, | |
| "learning_rate": 8.093705081790891e-09, | |
| "loss": 0.45291786193847655, | |
| "memory(GiB)": 74.97, | |
| "step": 1755, | |
| "token_acc": 0.8535476796830787, | |
| "train_speed(iter/s)": 0.130532 | |
| }, | |
| { | |
| "epoch": 160.0, | |
| "grad_norm": 1.2943680843819054, | |
| "learning_rate": 7.771024502261525e-09, | |
| "loss": 0.4609353542327881, | |
| "memory(GiB)": 74.97, | |
| "step": 1760, | |
| "token_acc": 0.8666812131791403, | |
| "train_speed(iter/s)": 0.130545 | |
| }, | |
| { | |
| "epoch": 160.46511627906978, | |
| "grad_norm": 0.7963107860861562, | |
| "learning_rate": 7.454647650695157e-09, | |
| "loss": 0.44596638679504397, | |
| "memory(GiB)": 74.97, | |
| "step": 1765, | |
| "token_acc": 0.8749486582694413, | |
| "train_speed(iter/s)": 0.130569 | |
| }, | |
| { | |
| "epoch": 160.93023255813952, | |
| "grad_norm": 0.8631048995115475, | |
| "learning_rate": 7.144596151029303e-09, | |
| "loss": 0.4524871826171875, | |
| "memory(GiB)": 74.97, | |
| "step": 1770, | |
| "token_acc": 0.8677113770449089, | |
| "train_speed(iter/s)": 0.130541 | |
| }, | |
| { | |
| "epoch": 161.37209302325581, | |
| "grad_norm": 2.75385310377207, | |
| "learning_rate": 6.840891194872111e-09, | |
| "loss": 0.4484891891479492, | |
| "memory(GiB)": 74.97, | |
| "step": 1775, | |
| "token_acc": 0.8703089199652366, | |
| "train_speed(iter/s)": 0.130568 | |
| }, | |
| { | |
| "epoch": 161.8372093023256, | |
| "grad_norm": 0.8722893432749486, | |
| "learning_rate": 6.5435535400539254e-09, | |
| "loss": 0.45218324661254883, | |
| "memory(GiB)": 74.97, | |
| "step": 1780, | |
| "token_acc": 0.868553358560287, | |
| "train_speed(iter/s)": 0.130557 | |
| }, | |
| { | |
| "epoch": 162.27906976744185, | |
| "grad_norm": 0.8215719185780701, | |
| "learning_rate": 6.252603509208465e-09, | |
| "loss": 0.4554037094116211, | |
| "memory(GiB)": 74.97, | |
| "step": 1785, | |
| "token_acc": 0.8677862595419847, | |
| "train_speed(iter/s)": 0.130539 | |
| }, | |
| { | |
| "epoch": 162.74418604651163, | |
| "grad_norm": 0.7384288783097476, | |
| "learning_rate": 5.9680609883838825e-09, | |
| "loss": 0.44667611122131345, | |
| "memory(GiB)": 74.97, | |
| "step": 1790, | |
| "token_acc": 0.8610665481183679, | |
| "train_speed(iter/s)": 0.130557 | |
| }, | |
| { | |
| "epoch": 163.1860465116279, | |
| "grad_norm": 1.061403944970525, | |
| "learning_rate": 5.689945425683473e-09, | |
| "loss": 0.44474124908447266, | |
| "memory(GiB)": 74.97, | |
| "step": 1795, | |
| "token_acc": 0.8644834307992203, | |
| "train_speed(iter/s)": 0.130593 | |
| }, | |
| { | |
| "epoch": 163.65116279069767, | |
| "grad_norm": 0.7777597746319437, | |
| "learning_rate": 5.418275829936536e-09, | |
| "loss": 0.44541053771972655, | |
| "memory(GiB)": 74.97, | |
| "step": 1800, | |
| "token_acc": 0.8673607496095783, | |
| "train_speed(iter/s)": 0.130589 | |
| }, | |
| { | |
| "epoch": 163.65116279069767, | |
| "eval_loss": 0.6119173765182495, | |
| "eval_runtime": 0.6953, | |
| "eval_samples_per_second": 18.698, | |
| "eval_steps_per_second": 2.877, | |
| "eval_token_acc": 0.8430884184308842, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 164.09302325581396, | |
| "grad_norm": 0.7945916641293757, | |
| "learning_rate": 5.15307076939906e-09, | |
| "loss": 0.47254066467285155, | |
| "memory(GiB)": 74.97, | |
| "step": 1805, | |
| "token_acc": 0.8580266386260077, | |
| "train_speed(iter/s)": 0.130407 | |
| }, | |
| { | |
| "epoch": 164.5581395348837, | |
| "grad_norm": 1.0008056456948866, | |
| "learning_rate": 4.8943483704846465e-09, | |
| "loss": 0.45273590087890625, | |
| "memory(GiB)": 74.97, | |
| "step": 1810, | |
| "token_acc": 0.8646250808015514, | |
| "train_speed(iter/s)": 0.130426 | |
| }, | |
| { | |
| "epoch": 165.0, | |
| "grad_norm": 1.4076505417290193, | |
| "learning_rate": 4.6421263165255855e-09, | |
| "loss": 0.4405077934265137, | |
| "memory(GiB)": 74.97, | |
| "step": 1815, | |
| "token_acc": 0.8686048572948059, | |
| "train_speed(iter/s)": 0.130467 | |
| }, | |
| { | |
| "epoch": 165.46511627906978, | |
| "grad_norm": 0.9122077782409643, | |
| "learning_rate": 4.396421846564236e-09, | |
| "loss": 0.4534634590148926, | |
| "memory(GiB)": 74.97, | |
| "step": 1820, | |
| "token_acc": 0.8500332069327506, | |
| "train_speed(iter/s)": 0.13046 | |
| }, | |
| { | |
| "epoch": 165.93023255813952, | |
| "grad_norm": 1.0037794680637797, | |
| "learning_rate": 4.157251754174729e-09, | |
| "loss": 0.4450718402862549, | |
| "memory(GiB)": 74.97, | |
| "step": 1825, | |
| "token_acc": 0.8572525948963915, | |
| "train_speed(iter/s)": 0.130467 | |
| }, | |
| { | |
| "epoch": 166.37209302325581, | |
| "grad_norm": 0.7603089866068351, | |
| "learning_rate": 3.924632386315185e-09, | |
| "loss": 0.44524030685424804, | |
| "memory(GiB)": 74.97, | |
| "step": 1830, | |
| "token_acc": 0.8798618132794068, | |
| "train_speed(iter/s)": 0.130461 | |
| }, | |
| { | |
| "epoch": 166.8372093023256, | |
| "grad_norm": 0.7741124344133163, | |
| "learning_rate": 3.6985796422103977e-09, | |
| "loss": 0.4650570392608643, | |
| "memory(GiB)": 74.97, | |
| "step": 1835, | |
| "token_acc": 0.8640802573718376, | |
| "train_speed(iter/s)": 0.13049 | |
| }, | |
| { | |
| "epoch": 167.27906976744185, | |
| "grad_norm": 0.7769430246424489, | |
| "learning_rate": 3.4791089722651433e-09, | |
| "loss": 0.4513576507568359, | |
| "memory(GiB)": 74.97, | |
| "step": 1840, | |
| "token_acc": 0.870817885379908, | |
| "train_speed(iter/s)": 0.130488 | |
| }, | |
| { | |
| "epoch": 167.74418604651163, | |
| "grad_norm": 2.1575767592775823, | |
| "learning_rate": 3.266235377008175e-09, | |
| "loss": 0.4532448768615723, | |
| "memory(GiB)": 74.97, | |
| "step": 1845, | |
| "token_acc": 0.8802010930626702, | |
| "train_speed(iter/s)": 0.130491 | |
| }, | |
| { | |
| "epoch": 168.1860465116279, | |
| "grad_norm": 0.8581455080813751, | |
| "learning_rate": 3.0599734060669624e-09, | |
| "loss": 0.44078683853149414, | |
| "memory(GiB)": 74.97, | |
| "step": 1850, | |
| "token_acc": 0.8616869584293079, | |
| "train_speed(iter/s)": 0.130513 | |
| }, | |
| { | |
| "epoch": 168.65116279069767, | |
| "grad_norm": 0.7254996365029248, | |
| "learning_rate": 2.860337157173243e-09, | |
| "loss": 0.45212836265563966, | |
| "memory(GiB)": 74.97, | |
| "step": 1855, | |
| "token_acc": 0.8733549684432675, | |
| "train_speed(iter/s)": 0.130513 | |
| }, | |
| { | |
| "epoch": 169.09302325581396, | |
| "grad_norm": 1.0391226655473043, | |
| "learning_rate": 2.6673402751994255e-09, | |
| "loss": 0.45039982795715333, | |
| "memory(GiB)": 74.97, | |
| "step": 1860, | |
| "token_acc": 0.8479883283766343, | |
| "train_speed(iter/s)": 0.130544 | |
| }, | |
| { | |
| "epoch": 169.5581395348837, | |
| "grad_norm": 4.061114170885048, | |
| "learning_rate": 2.480995951226028e-09, | |
| "loss": 0.4557363510131836, | |
| "memory(GiB)": 74.97, | |
| "step": 1865, | |
| "token_acc": 0.8549472607052897, | |
| "train_speed(iter/s)": 0.130538 | |
| }, | |
| { | |
| "epoch": 170.0, | |
| "grad_norm": 0.7661862324584979, | |
| "learning_rate": 2.301316921640073e-09, | |
| "loss": 0.44440832138061526, | |
| "memory(GiB)": 74.97, | |
| "step": 1870, | |
| "token_acc": 0.864181855416752, | |
| "train_speed(iter/s)": 0.130548 | |
| }, | |
| { | |
| "epoch": 170.46511627906978, | |
| "grad_norm": 1.0139594423822822, | |
| "learning_rate": 2.128315467264552e-09, | |
| "loss": 0.44567031860351564, | |
| "memory(GiB)": 74.97, | |
| "step": 1875, | |
| "token_acc": 0.8773299316489919, | |
| "train_speed(iter/s)": 0.130532 | |
| }, | |
| { | |
| "epoch": 170.93023255813952, | |
| "grad_norm": 4.102723548733547, | |
| "learning_rate": 1.962003412519064e-09, | |
| "loss": 0.45189361572265624, | |
| "memory(GiB)": 74.97, | |
| "step": 1880, | |
| "token_acc": 0.8593179414523178, | |
| "train_speed(iter/s)": 0.130547 | |
| }, | |
| { | |
| "epoch": 171.37209302325581, | |
| "grad_norm": 0.8536208597740141, | |
| "learning_rate": 1.8023921246116402e-09, | |
| "loss": 0.45585269927978517, | |
| "memory(GiB)": 74.97, | |
| "step": 1885, | |
| "token_acc": 0.8772535999691156, | |
| "train_speed(iter/s)": 0.130545 | |
| }, | |
| { | |
| "epoch": 171.8372093023256, | |
| "grad_norm": 1.2245160632333336, | |
| "learning_rate": 1.6494925127617632e-09, | |
| "loss": 0.4523616790771484, | |
| "memory(GiB)": 74.97, | |
| "step": 1890, | |
| "token_acc": 0.853437876960193, | |
| "train_speed(iter/s)": 0.130555 | |
| }, | |
| { | |
| "epoch": 172.27906976744185, | |
| "grad_norm": 0.9530973263407838, | |
| "learning_rate": 1.5033150274548324e-09, | |
| "loss": 0.4454800605773926, | |
| "memory(GiB)": 74.97, | |
| "step": 1895, | |
| "token_acc": 0.8595509191527256, | |
| "train_speed(iter/s)": 0.130569 | |
| }, | |
| { | |
| "epoch": 172.74418604651163, | |
| "grad_norm": 0.7332081781662043, | |
| "learning_rate": 1.3638696597277677e-09, | |
| "loss": 0.443679666519165, | |
| "memory(GiB)": 74.97, | |
| "step": 1900, | |
| "token_acc": 0.8559887049964703, | |
| "train_speed(iter/s)": 0.130587 | |
| }, | |
| { | |
| "epoch": 173.1860465116279, | |
| "grad_norm": 0.920253466736325, | |
| "learning_rate": 1.231165940486234e-09, | |
| "loss": 0.469818115234375, | |
| "memory(GiB)": 74.97, | |
| "step": 1905, | |
| "token_acc": 0.8453635280095352, | |
| "train_speed(iter/s)": 0.130597 | |
| }, | |
| { | |
| "epoch": 173.65116279069767, | |
| "grad_norm": 0.8835815802653249, | |
| "learning_rate": 1.1052129398531506e-09, | |
| "loss": 0.44182252883911133, | |
| "memory(GiB)": 74.97, | |
| "step": 1910, | |
| "token_acc": 0.8679900339010742, | |
| "train_speed(iter/s)": 0.130609 | |
| }, | |
| { | |
| "epoch": 174.09302325581396, | |
| "grad_norm": 1.0067834379405356, | |
| "learning_rate": 9.86019266548821e-10, | |
| "loss": 0.4615338802337646, | |
| "memory(GiB)": 74.97, | |
| "step": 1915, | |
| "token_acc": 0.8516490943498243, | |
| "train_speed(iter/s)": 0.130627 | |
| }, | |
| { | |
| "epoch": 174.5581395348837, | |
| "grad_norm": 0.7527783775468317, | |
| "learning_rate": 8.735930673024805e-10, | |
| "loss": 0.4349226951599121, | |
| "memory(GiB)": 74.97, | |
| "step": 1920, | |
| "token_acc": 0.8792523552149395, | |
| "train_speed(iter/s)": 0.130639 | |
| }, | |
| { | |
| "epoch": 175.0, | |
| "grad_norm": 1.117914154380228, | |
| "learning_rate": 7.679420262954983e-10, | |
| "loss": 0.45952515602111815, | |
| "memory(GiB)": 74.97, | |
| "step": 1925, | |
| "token_acc": 0.85297977378299, | |
| "train_speed(iter/s)": 0.130658 | |
| }, | |
| { | |
| "epoch": 175.46511627906978, | |
| "grad_norm": 0.9275458758815365, | |
| "learning_rate": 6.690733646361856e-10, | |
| "loss": 0.4419642448425293, | |
| "memory(GiB)": 74.97, | |
| "step": 1930, | |
| "token_acc": 0.8648351648351649, | |
| "train_speed(iter/s)": 0.130648 | |
| }, | |
| { | |
| "epoch": 175.93023255813952, | |
| "grad_norm": 0.9626017497568045, | |
| "learning_rate": 5.769938398662355e-10, | |
| "loss": 0.4574889659881592, | |
| "memory(GiB)": 74.97, | |
| "step": 1935, | |
| "token_acc": 0.862218660255126, | |
| "train_speed(iter/s)": 0.130652 | |
| }, | |
| { | |
| "epoch": 176.37209302325581, | |
| "grad_norm": 0.7475798565408234, | |
| "learning_rate": 4.917097454988583e-10, | |
| "loss": 0.4532492637634277, | |
| "memory(GiB)": 74.97, | |
| "step": 1940, | |
| "token_acc": 0.8698313950032691, | |
| "train_speed(iter/s)": 0.130672 | |
| }, | |
| { | |
| "epoch": 176.8372093023256, | |
| "grad_norm": 0.8211466589757828, | |
| "learning_rate": 4.132269105886155e-10, | |
| "loss": 0.4510762691497803, | |
| "memory(GiB)": 74.97, | |
| "step": 1945, | |
| "token_acc": 0.8704986701068692, | |
| "train_speed(iter/s)": 0.130681 | |
| }, | |
| { | |
| "epoch": 177.27906976744185, | |
| "grad_norm": 1.7704160910518656, | |
| "learning_rate": 3.4155069933301526e-10, | |
| "loss": 0.44258646965026854, | |
| "memory(GiB)": 74.97, | |
| "step": 1950, | |
| "token_acc": 0.8673919489954778, | |
| "train_speed(iter/s)": 0.130706 | |
| }, | |
| { | |
| "epoch": 177.74418604651163, | |
| "grad_norm": 0.7984443068601499, | |
| "learning_rate": 2.7668601070588436e-10, | |
| "loss": 0.4494297027587891, | |
| "memory(GiB)": 74.97, | |
| "step": 1955, | |
| "token_acc": 0.8809769787056883, | |
| "train_speed(iter/s)": 0.130709 | |
| }, | |
| { | |
| "epoch": 178.1860465116279, | |
| "grad_norm": 1.034280653667984, | |
| "learning_rate": 2.186372781225465e-10, | |
| "loss": 0.4531251430511475, | |
| "memory(GiB)": 74.97, | |
| "step": 1960, | |
| "token_acc": 0.8546573936837305, | |
| "train_speed(iter/s)": 0.130721 | |
| }, | |
| { | |
| "epoch": 178.65116279069767, | |
| "grad_norm": 0.7838331091869942, | |
| "learning_rate": 1.6740846913674279e-10, | |
| "loss": 0.45207509994506834, | |
| "memory(GiB)": 74.97, | |
| "step": 1965, | |
| "token_acc": 0.8692857883279776, | |
| "train_speed(iter/s)": 0.130714 | |
| }, | |
| { | |
| "epoch": 179.09302325581396, | |
| "grad_norm": 1.030250317436395, | |
| "learning_rate": 1.2300308516952628e-10, | |
| "loss": 0.45918664932250974, | |
| "memory(GiB)": 74.97, | |
| "step": 1970, | |
| "token_acc": 0.8727756076388888, | |
| "train_speed(iter/s)": 0.130734 | |
| }, | |
| { | |
| "epoch": 179.5581395348837, | |
| "grad_norm": 0.9787925643257926, | |
| "learning_rate": 8.542416126989804e-11, | |
| "loss": 0.4371158599853516, | |
| "memory(GiB)": 74.97, | |
| "step": 1975, | |
| "token_acc": 0.8778103770180585, | |
| "train_speed(iter/s)": 0.130716 | |
| }, | |
| { | |
| "epoch": 180.0, | |
| "grad_norm": 15.04723370590281, | |
| "learning_rate": 5.46742659073951e-11, | |
| "loss": 0.4714357852935791, | |
| "memory(GiB)": 74.97, | |
| "step": 1980, | |
| "token_acc": 0.8756799192508271, | |
| "train_speed(iter/s)": 0.130752 | |
| }, | |
| { | |
| "epoch": 180.46511627906978, | |
| "grad_norm": 0.8235554306239253, | |
| "learning_rate": 3.0755500796531e-11, | |
| "loss": 0.44407100677490235, | |
| "memory(GiB)": 74.97, | |
| "step": 1985, | |
| "token_acc": 0.8770921605870226, | |
| "train_speed(iter/s)": 0.130773 | |
| }, | |
| { | |
| "epoch": 180.93023255813952, | |
| "grad_norm": 1.1709340797573902, | |
| "learning_rate": 1.3669500753099584e-11, | |
| "loss": 0.44757466316223143, | |
| "memory(GiB)": 74.97, | |
| "step": 1990, | |
| "token_acc": 0.8692709656348659, | |
| "train_speed(iter/s)": 0.130762 | |
| }, | |
| { | |
| "epoch": 181.37209302325581, | |
| "grad_norm": 1.040441435930544, | |
| "learning_rate": 3.417433582542095e-12, | |
| "loss": 0.4524868011474609, | |
| "memory(GiB)": 74.97, | |
| "step": 1995, | |
| "token_acc": 0.8582090965920931, | |
| "train_speed(iter/s)": 0.130794 | |
| }, | |
| { | |
| "epoch": 181.8372093023256, | |
| "grad_norm": 2.1432807975254313, | |
| "learning_rate": 0.0, | |
| "loss": 0.46004161834716795, | |
| "memory(GiB)": 74.97, | |
| "step": 2000, | |
| "token_acc": 0.8837088162521162, | |
| "train_speed(iter/s)": 0.130787 | |
| }, | |
| { | |
| "epoch": 181.8372093023256, | |
| "eval_loss": 0.6126144528388977, | |
| "eval_runtime": 0.6963, | |
| "eval_samples_per_second": 18.669, | |
| "eval_steps_per_second": 2.872, | |
| "eval_token_acc": 0.8433758022799118, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 200, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4205321758179328.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |