diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5622 @@ +{ + "best_global_step": 557, + "best_metric": 0.00856661, + "best_model_checkpoint": "/workspace/output/v0-20250507-211218/checkpoint-557", + "epoch": 1.0, + "eval_steps": 500, + "global_step": 557, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0017953321364452424, + "grad_norm": 20.767723083496094, + "learning_rate": 3.5714285714285716e-07, + "loss": 1.101172685623169, + "memory(GiB)": 12.53, + "step": 1, + "token_acc": 0.7228915662650602, + "train_speed(iter/s)": 0.046884 + }, + { + "epoch": 0.003590664272890485, + "grad_norm": 15.578984260559082, + "learning_rate": 7.142857142857143e-07, + "loss": 1.0804812908172607, + "memory(GiB)": 12.53, + "step": 2, + "token_acc": 0.7261904761904762, + "train_speed(iter/s)": 0.089818 + }, + { + "epoch": 0.005385996409335727, + "grad_norm": 218.74844360351562, + "learning_rate": 1.0714285714285714e-06, + "loss": 1.1085731983184814, + "memory(GiB)": 12.53, + "step": 3, + "token_acc": 0.7407407407407407, + "train_speed(iter/s)": 0.129592 + }, + { + "epoch": 0.00718132854578097, + "grad_norm": 15.93343448638916, + "learning_rate": 1.4285714285714286e-06, + "loss": 1.103919506072998, + "memory(GiB)": 12.53, + "step": 4, + "token_acc": 0.7710843373493976, + "train_speed(iter/s)": 0.166464 + }, + { + "epoch": 0.008976660682226212, + "grad_norm": 86.19783020019531, + "learning_rate": 1.7857142857142859e-06, + "loss": 1.0932217836380005, + "memory(GiB)": 12.53, + "step": 5, + "token_acc": 0.7560975609756098, + "train_speed(iter/s)": 0.200716 + }, + { + "epoch": 0.010771992818671455, + "grad_norm": 16.776695251464844, + "learning_rate": 2.1428571428571427e-06, + "loss": 0.9729880094528198, + "memory(GiB)": 13.08, + "step": 6, + "token_acc": 0.7176470588235294, + "train_speed(iter/s)": 0.232591 + }, + { + "epoch": 0.012567324955116697, + "grad_norm": 21.773569107055664, + "learning_rate": 2.5e-06, + "loss": 0.9782532453536987, + "memory(GiB)": 13.08, + "step": 7, + "token_acc": 0.7325581395348837, + "train_speed(iter/s)": 0.262368 + }, + { + "epoch": 0.01436265709156194, + "grad_norm": 14.693713188171387, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.9148114323616028, + "memory(GiB)": 13.08, + "step": 8, + "token_acc": 0.7294117647058823, + "train_speed(iter/s)": 0.290226 + }, + { + "epoch": 0.01615798922800718, + "grad_norm": 13.920761108398438, + "learning_rate": 3.2142857142857147e-06, + "loss": 0.8653872013092041, + "memory(GiB)": 13.08, + "step": 9, + "token_acc": 0.7160493827160493, + "train_speed(iter/s)": 0.316373 + }, + { + "epoch": 0.017953321364452424, + "grad_norm": 12.907504081726074, + "learning_rate": 3.5714285714285718e-06, + "loss": 0.7095205783843994, + "memory(GiB)": 13.08, + "step": 10, + "token_acc": 0.7764705882352941, + "train_speed(iter/s)": 0.340916 + }, + { + "epoch": 0.019748653500897665, + "grad_norm": 12.996719360351562, + "learning_rate": 3.928571428571429e-06, + "loss": 0.5850164294242859, + "memory(GiB)": 13.08, + "step": 11, + "token_acc": 0.8117647058823529, + "train_speed(iter/s)": 0.363996 + }, + { + "epoch": 0.02154398563734291, + "grad_norm": 8.631417274475098, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.45549219846725464, + "memory(GiB)": 13.08, + "step": 12, + "token_acc": 0.8214285714285714, + "train_speed(iter/s)": 0.385737 + }, + { + "epoch": 0.02333931777378815, + 
"grad_norm": 12.419778823852539, + "learning_rate": 4.642857142857144e-06, + "loss": 0.39217233657836914, + "memory(GiB)": 13.08, + "step": 13, + "token_acc": 0.8333333333333334, + "train_speed(iter/s)": 0.406279 + }, + { + "epoch": 0.025134649910233394, + "grad_norm": 8.926778793334961, + "learning_rate": 5e-06, + "loss": 0.3340882658958435, + "memory(GiB)": 13.08, + "step": 14, + "token_acc": 0.9036144578313253, + "train_speed(iter/s)": 0.425754 + }, + { + "epoch": 0.026929982046678635, + "grad_norm": 9.134852409362793, + "learning_rate": 5.357142857142857e-06, + "loss": 0.2985474467277527, + "memory(GiB)": 13.08, + "step": 15, + "token_acc": 0.9634146341463414, + "train_speed(iter/s)": 0.44421 + }, + { + "epoch": 0.02872531418312388, + "grad_norm": 27.215181350708008, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.22333472967147827, + "memory(GiB)": 13.08, + "step": 16, + "token_acc": 0.926829268292683, + "train_speed(iter/s)": 0.461618 + }, + { + "epoch": 0.03052064631956912, + "grad_norm": 12.754878997802734, + "learning_rate": 6.071428571428571e-06, + "loss": 0.22393622994422913, + "memory(GiB)": 13.08, + "step": 17, + "token_acc": 0.891566265060241, + "train_speed(iter/s)": 0.478212 + }, + { + "epoch": 0.03231597845601436, + "grad_norm": 5.669656276702881, + "learning_rate": 6.4285714285714295e-06, + "loss": 0.18336020410060883, + "memory(GiB)": 13.08, + "step": 18, + "token_acc": 0.9512195121951219, + "train_speed(iter/s)": 0.493875 + }, + { + "epoch": 0.03411131059245961, + "grad_norm": 4.313683986663818, + "learning_rate": 6.785714285714287e-06, + "loss": 0.1597178429365158, + "memory(GiB)": 13.08, + "step": 19, + "token_acc": 0.9125, + "train_speed(iter/s)": 0.509084 + }, + { + "epoch": 0.03590664272890485, + "grad_norm": 2.369284152984619, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.12790489196777344, + "memory(GiB)": 13.08, + "step": 20, + "token_acc": 0.9294117647058824, + "train_speed(iter/s)": 0.523511 + }, + { + "epoch": 0.03770197486535009, + "grad_norm": 6.269874572753906, + "learning_rate": 7.500000000000001e-06, + "loss": 0.11981090903282166, + "memory(GiB)": 13.08, + "step": 21, + "token_acc": 0.9382716049382716, + "train_speed(iter/s)": 0.537299 + }, + { + "epoch": 0.03949730700179533, + "grad_norm": 2.469256639480591, + "learning_rate": 7.857142857142858e-06, + "loss": 0.09385544061660767, + "memory(GiB)": 13.08, + "step": 22, + "token_acc": 0.9404761904761905, + "train_speed(iter/s)": 0.550442 + }, + { + "epoch": 0.04129263913824058, + "grad_norm": 2.988142490386963, + "learning_rate": 8.214285714285714e-06, + "loss": 0.09932994097471237, + "memory(GiB)": 13.08, + "step": 23, + "token_acc": 0.9390243902439024, + "train_speed(iter/s)": 0.563072 + }, + { + "epoch": 0.04308797127468582, + "grad_norm": 3.13029146194458, + "learning_rate": 8.571428571428571e-06, + "loss": 0.10883159935474396, + "memory(GiB)": 13.08, + "step": 24, + "token_acc": 0.9642857142857143, + "train_speed(iter/s)": 0.574898 + }, + { + "epoch": 0.04488330341113106, + "grad_norm": 4.782556533813477, + "learning_rate": 8.92857142857143e-06, + "loss": 0.1130107045173645, + "memory(GiB)": 13.08, + "step": 25, + "token_acc": 0.9529411764705882, + "train_speed(iter/s)": 0.586515 + }, + { + "epoch": 0.0466786355475763, + "grad_norm": 4.633636474609375, + "learning_rate": 9.285714285714288e-06, + "loss": 0.10770687460899353, + "memory(GiB)": 13.08, + "step": 26, + "token_acc": 0.9397590361445783, + "train_speed(iter/s)": 0.597627 + }, + { + "epoch": 0.04847396768402154, + "grad_norm": 
3.6300508975982666, + "learning_rate": 9.642857142857144e-06, + "loss": 0.10155674815177917, + "memory(GiB)": 13.08, + "step": 27, + "token_acc": 0.9634146341463414, + "train_speed(iter/s)": 0.608103 + }, + { + "epoch": 0.05026929982046679, + "grad_norm": 7.393371105194092, + "learning_rate": 1e-05, + "loss": 0.09194546937942505, + "memory(GiB)": 13.08, + "step": 28, + "token_acc": 0.9512195121951219, + "train_speed(iter/s)": 0.618262 + }, + { + "epoch": 0.05206463195691203, + "grad_norm": 6.952455520629883, + "learning_rate": 9.999911828722437e-06, + "loss": 0.10959678888320923, + "memory(GiB)": 13.08, + "step": 29, + "token_acc": 0.9285714285714286, + "train_speed(iter/s)": 0.628137 + }, + { + "epoch": 0.05385996409335727, + "grad_norm": 3.8870465755462646, + "learning_rate": 9.999647317999416e-06, + "loss": 0.06351347267627716, + "memory(GiB)": 13.08, + "step": 30, + "token_acc": 0.9764705882352941, + "train_speed(iter/s)": 0.637656 + }, + { + "epoch": 0.05565529622980251, + "grad_norm": 5.015563011169434, + "learning_rate": 9.999206477159838e-06, + "loss": 0.10613317042589188, + "memory(GiB)": 13.08, + "step": 31, + "token_acc": 0.9294117647058824, + "train_speed(iter/s)": 0.646851 + }, + { + "epoch": 0.05745062836624776, + "grad_norm": 3.729865312576294, + "learning_rate": 9.998589321751502e-06, + "loss": 0.06902734935283661, + "memory(GiB)": 13.08, + "step": 32, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.655735 + }, + { + "epoch": 0.059245960502693, + "grad_norm": 4.560534954071045, + "learning_rate": 9.997795873540561e-06, + "loss": 0.0829491913318634, + "memory(GiB)": 13.08, + "step": 33, + "token_acc": 0.9759036144578314, + "train_speed(iter/s)": 0.664349 + }, + { + "epoch": 0.06104129263913824, + "grad_norm": 4.6493330001831055, + "learning_rate": 9.996826160510751e-06, + "loss": 0.08029845356941223, + "memory(GiB)": 13.08, + "step": 34, + "token_acc": 0.9397590361445783, + "train_speed(iter/s)": 0.672391 + }, + { + "epoch": 0.06283662477558348, + "grad_norm": 3.491983652114868, + "learning_rate": 9.995680216862407e-06, + "loss": 0.07587797939777374, + "memory(GiB)": 13.08, + "step": 35, + "token_acc": 0.9397590361445783, + "train_speed(iter/s)": 0.680398 + }, + { + "epoch": 0.06463195691202872, + "grad_norm": 4.87061071395874, + "learning_rate": 9.994358083011255e-06, + "loss": 0.07939267158508301, + "memory(GiB)": 13.08, + "step": 36, + "token_acc": 0.9285714285714286, + "train_speed(iter/s)": 0.687959 + }, + { + "epoch": 0.06642728904847396, + "grad_norm": 7.749454021453857, + "learning_rate": 9.992859805586988e-06, + "loss": 0.10009106993675232, + "memory(GiB)": 13.08, + "step": 37, + "token_acc": 0.9302325581395349, + "train_speed(iter/s)": 0.695378 + }, + { + "epoch": 0.06822262118491922, + "grad_norm": 6.152207851409912, + "learning_rate": 9.991185437431618e-06, + "loss": 0.08440906554460526, + "memory(GiB)": 13.08, + "step": 38, + "token_acc": 0.9759036144578314, + "train_speed(iter/s)": 0.702611 + }, + { + "epoch": 0.07001795332136446, + "grad_norm": 7.401669979095459, + "learning_rate": 9.989335037597621e-06, + "loss": 0.07661642879247665, + "memory(GiB)": 13.08, + "step": 39, + "token_acc": 0.9418604651162791, + "train_speed(iter/s)": 0.709583 + }, + { + "epoch": 0.0718132854578097, + "grad_norm": 3.6849474906921387, + "learning_rate": 9.98730867134584e-06, + "loss": 0.07675136625766754, + "memory(GiB)": 13.08, + "step": 40, + "token_acc": 0.9764705882352941, + "train_speed(iter/s)": 0.716245 + }, + { + "epoch": 0.07360861759425494, + "grad_norm": 
2.6268744468688965, + "learning_rate": 9.985106410143197e-06, + "loss": 0.0771910697221756, + "memory(GiB)": 13.08, + "step": 41, + "token_acc": 0.9518072289156626, + "train_speed(iter/s)": 0.722643 + }, + { + "epoch": 0.07540394973070018, + "grad_norm": 2.1452856063842773, + "learning_rate": 9.982728331660164e-06, + "loss": 0.0589277520775795, + "memory(GiB)": 13.08, + "step": 42, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 0.729008 + }, + { + "epoch": 0.07719928186714542, + "grad_norm": 8.447471618652344, + "learning_rate": 9.980174519768031e-06, + "loss": 0.06659379601478577, + "memory(GiB)": 13.08, + "step": 43, + "token_acc": 0.9518072289156626, + "train_speed(iter/s)": 0.735178 + }, + { + "epoch": 0.07899461400359066, + "grad_norm": 3.402913808822632, + "learning_rate": 9.977445064535938e-06, + "loss": 0.06866178661584854, + "memory(GiB)": 13.08, + "step": 44, + "token_acc": 0.9634146341463414, + "train_speed(iter/s)": 0.741148 + }, + { + "epoch": 0.0807899461400359, + "grad_norm": 5.51450777053833, + "learning_rate": 9.97454006222771e-06, + "loss": 0.06585047394037247, + "memory(GiB)": 13.08, + "step": 45, + "token_acc": 0.9647058823529412, + "train_speed(iter/s)": 0.746954 + }, + { + "epoch": 0.08258527827648116, + "grad_norm": 13.785122871398926, + "learning_rate": 9.971459615298449e-06, + "loss": 0.07860522717237473, + "memory(GiB)": 13.08, + "step": 46, + "token_acc": 0.9753086419753086, + "train_speed(iter/s)": 0.752434 + }, + { + "epoch": 0.0843806104129264, + "grad_norm": 4.304253578186035, + "learning_rate": 9.968203832390935e-06, + "loss": 0.07840836048126221, + "memory(GiB)": 13.08, + "step": 47, + "token_acc": 0.9759036144578314, + "train_speed(iter/s)": 0.757739 + }, + { + "epoch": 0.08617594254937164, + "grad_norm": 4.706284999847412, + "learning_rate": 9.964772828331781e-06, + "loss": 0.059265222400426865, + "memory(GiB)": 13.08, + "step": 48, + "token_acc": 0.9642857142857143, + "train_speed(iter/s)": 0.763084 + }, + { + "epoch": 0.08797127468581688, + "grad_norm": 5.815506458282471, + "learning_rate": 9.961166724127393e-06, + "loss": 0.053193286061286926, + "memory(GiB)": 13.08, + "step": 49, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 0.768261 + }, + { + "epoch": 0.08976660682226212, + "grad_norm": 4.573439121246338, + "learning_rate": 9.957385646959697e-06, + "loss": 0.06922228634357452, + "memory(GiB)": 13.08, + "step": 50, + "token_acc": 0.9523809523809523, + "train_speed(iter/s)": 0.773332 + }, + { + "epoch": 0.09156193895870736, + "grad_norm": 3.102543354034424, + "learning_rate": 9.953429730181653e-06, + "loss": 0.050379615277051926, + "memory(GiB)": 13.08, + "step": 51, + "token_acc": 0.9642857142857143, + "train_speed(iter/s)": 0.778213 + }, + { + "epoch": 0.0933572710951526, + "grad_norm": 7.190049648284912, + "learning_rate": 9.94929911331256e-06, + "loss": 0.04821791872382164, + "memory(GiB)": 13.08, + "step": 52, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.783039 + }, + { + "epoch": 0.09515260323159784, + "grad_norm": 10.347494125366211, + "learning_rate": 9.944993942033118e-06, + "loss": 0.06359973549842834, + "memory(GiB)": 13.08, + "step": 53, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 0.787683 + }, + { + "epoch": 0.09694793536804308, + "grad_norm": 6.274273872375488, + "learning_rate": 9.940514368180312e-06, + "loss": 0.06939861178398132, + "memory(GiB)": 13.08, + "step": 54, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.79225 + }, + { + "epoch": 0.09874326750448834, + 
"grad_norm": 5.799787521362305, + "learning_rate": 9.93586054974204e-06, + "loss": 0.06817148625850677, + "memory(GiB)": 13.08, + "step": 55, + "token_acc": 0.9642857142857143, + "train_speed(iter/s)": 0.796398 + }, + { + "epoch": 0.10053859964093358, + "grad_norm": 2.639753818511963, + "learning_rate": 9.931032650851551e-06, + "loss": 0.047845788300037384, + "memory(GiB)": 13.08, + "step": 56, + "token_acc": 0.9875, + "train_speed(iter/s)": 0.800695 + }, + { + "epoch": 0.10233393177737882, + "grad_norm": 3.7379603385925293, + "learning_rate": 9.92603084178165e-06, + "loss": 0.048428915441036224, + "memory(GiB)": 13.08, + "step": 57, + "token_acc": 1.0, + "train_speed(iter/s)": 0.804922 + }, + { + "epoch": 0.10412926391382406, + "grad_norm": 9.594407081604004, + "learning_rate": 9.920855298938692e-06, + "loss": 0.03507259860634804, + "memory(GiB)": 13.08, + "step": 58, + "token_acc": 1.0, + "train_speed(iter/s)": 0.809046 + }, + { + "epoch": 0.1059245960502693, + "grad_norm": 3.0264976024627686, + "learning_rate": 9.915506204856368e-06, + "loss": 0.043764326721429825, + "memory(GiB)": 13.08, + "step": 59, + "token_acc": 0.9759036144578314, + "train_speed(iter/s)": 0.813025 + }, + { + "epoch": 0.10771992818671454, + "grad_norm": 4.240968227386475, + "learning_rate": 9.909983748189266e-06, + "loss": 0.06347043067216873, + "memory(GiB)": 13.08, + "step": 60, + "token_acc": 0.9885057471264368, + "train_speed(iter/s)": 0.816912 + }, + { + "epoch": 0.10951526032315978, + "grad_norm": 8.781189918518066, + "learning_rate": 9.904288123706206e-06, + "loss": 0.04532075673341751, + "memory(GiB)": 13.08, + "step": 61, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.820753 + }, + { + "epoch": 0.11131059245960502, + "grad_norm": 8.886838912963867, + "learning_rate": 9.898419532283382e-06, + "loss": 0.061723217368125916, + "memory(GiB)": 13.08, + "step": 62, + "token_acc": 0.9634146341463414, + "train_speed(iter/s)": 0.824242 + }, + { + "epoch": 0.11310592459605028, + "grad_norm": 1.337734341621399, + "learning_rate": 9.892378180897278e-06, + "loss": 0.02629430592060089, + "memory(GiB)": 13.08, + "step": 63, + "token_acc": 1.0, + "train_speed(iter/s)": 0.827641 + }, + { + "epoch": 0.11490125673249552, + "grad_norm": 2.174626350402832, + "learning_rate": 9.88616428261736e-06, + "loss": 0.0340137779712677, + "memory(GiB)": 13.08, + "step": 64, + "token_acc": 1.0, + "train_speed(iter/s)": 0.831084 + }, + { + "epoch": 0.11669658886894076, + "grad_norm": 2.469825506210327, + "learning_rate": 9.879778056598568e-06, + "loss": 0.04367762431502342, + "memory(GiB)": 13.08, + "step": 65, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.834149 + }, + { + "epoch": 0.118491921005386, + "grad_norm": 4.995214462280273, + "learning_rate": 9.873219728073586e-06, + "loss": 0.07332038134336472, + "memory(GiB)": 13.08, + "step": 66, + "token_acc": 0.975609756097561, + "train_speed(iter/s)": 0.837342 + }, + { + "epoch": 0.12028725314183124, + "grad_norm": 2.368252992630005, + "learning_rate": 9.866489528344896e-06, + "loss": 0.027399607002735138, + "memory(GiB)": 13.08, + "step": 67, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.840522 + }, + { + "epoch": 0.12208258527827648, + "grad_norm": 1.84979248046875, + "learning_rate": 9.85958769477662e-06, + "loss": 0.03163766860961914, + "memory(GiB)": 13.08, + "step": 68, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 0.843714 + }, + { + "epoch": 0.12387791741472172, + "grad_norm": 5.827478408813477, + "learning_rate": 
9.852514470786154e-06, + "loss": 0.0383148267865181, + "memory(GiB)": 13.08, + "step": 69, + "token_acc": 1.0, + "train_speed(iter/s)": 0.846847 + }, + { + "epoch": 0.12567324955116696, + "grad_norm": 3.431469678878784, + "learning_rate": 9.845270105835572e-06, + "loss": 0.02153998613357544, + "memory(GiB)": 13.08, + "step": 70, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 0.849875 + }, + { + "epoch": 0.12746858168761221, + "grad_norm": 7.782017230987549, + "learning_rate": 9.837854855422844e-06, + "loss": 0.03378487378358841, + "memory(GiB)": 13.08, + "step": 71, + "token_acc": 1.0, + "train_speed(iter/s)": 0.852894 + }, + { + "epoch": 0.12926391382405744, + "grad_norm": 3.0074715614318848, + "learning_rate": 9.830268981072806e-06, + "loss": 0.06558628380298615, + "memory(GiB)": 13.08, + "step": 72, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.855842 + }, + { + "epoch": 0.1310592459605027, + "grad_norm": 4.273418426513672, + "learning_rate": 9.822512750327954e-06, + "loss": 0.0692499652504921, + "memory(GiB)": 13.08, + "step": 73, + "token_acc": 0.9523809523809523, + "train_speed(iter/s)": 0.858669 + }, + { + "epoch": 0.13285457809694792, + "grad_norm": 4.4961838722229, + "learning_rate": 9.814586436738998e-06, + "loss": 0.0451151505112648, + "memory(GiB)": 13.08, + "step": 74, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.861569 + }, + { + "epoch": 0.13464991023339318, + "grad_norm": 4.5969977378845215, + "learning_rate": 9.806490319855214e-06, + "loss": 0.07539676874876022, + "memory(GiB)": 13.08, + "step": 75, + "token_acc": 0.9506172839506173, + "train_speed(iter/s)": 0.864445 + }, + { + "epoch": 0.13644524236983843, + "grad_norm": 3.9814624786376953, + "learning_rate": 9.798224685214592e-06, + "loss": 0.060120776295661926, + "memory(GiB)": 13.08, + "step": 76, + "token_acc": 0.9390243902439024, + "train_speed(iter/s)": 0.867183 + }, + { + "epoch": 0.13824057450628366, + "grad_norm": 2.3811380863189697, + "learning_rate": 9.789789824333756e-06, + "loss": 0.042882394045591354, + "memory(GiB)": 13.08, + "step": 77, + "token_acc": 0.9651162790697675, + "train_speed(iter/s)": 0.869894 + }, + { + "epoch": 0.1400359066427289, + "grad_norm": 7.900166034698486, + "learning_rate": 9.781186034697692e-06, + "loss": 0.030882596969604492, + "memory(GiB)": 13.08, + "step": 78, + "token_acc": 1.0, + "train_speed(iter/s)": 0.87221 + }, + { + "epoch": 0.14183123877917414, + "grad_norm": 2.502838373184204, + "learning_rate": 9.772413619749249e-06, + "loss": 0.04548752307891846, + "memory(GiB)": 13.08, + "step": 79, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 0.874757 + }, + { + "epoch": 0.1436265709156194, + "grad_norm": 2.647627592086792, + "learning_rate": 9.76347288887844e-06, + "loss": 0.03633784130215645, + "memory(GiB)": 13.08, + "step": 80, + "token_acc": 1.0, + "train_speed(iter/s)": 0.876962 + }, + { + "epoch": 0.14542190305206462, + "grad_norm": 2.948970317840576, + "learning_rate": 9.754364157411531e-06, + "loss": 0.0495162233710289, + "memory(GiB)": 13.08, + "step": 81, + "token_acc": 0.9759036144578314, + "train_speed(iter/s)": 0.879368 + }, + { + "epoch": 0.14721723518850988, + "grad_norm": 1.9277405738830566, + "learning_rate": 9.745087746599917e-06, + "loss": 0.037241458892822266, + "memory(GiB)": 13.08, + "step": 82, + "token_acc": 1.0, + "train_speed(iter/s)": 0.881635 + }, + { + "epoch": 0.1490125673249551, + "grad_norm": 3.646090507507324, + "learning_rate": 9.735643983608797e-06, + "loss": 0.061003878712654114, + 
"memory(GiB)": 13.08, + "step": 83, + "token_acc": 0.963855421686747, + "train_speed(iter/s)": 0.883914 + }, + { + "epoch": 0.15080789946140036, + "grad_norm": 2.8315680027008057, + "learning_rate": 9.726033201505626e-06, + "loss": 0.0555310919880867, + "memory(GiB)": 13.08, + "step": 84, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.886167 + }, + { + "epoch": 0.1526032315978456, + "grad_norm": 3.2616848945617676, + "learning_rate": 9.716255739248385e-06, + "loss": 0.04406466335058212, + "memory(GiB)": 13.08, + "step": 85, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 0.888483 + }, + { + "epoch": 0.15439856373429084, + "grad_norm": 2.3111984729766846, + "learning_rate": 9.706311941673604e-06, + "loss": 0.03145933151245117, + "memory(GiB)": 13.08, + "step": 86, + "token_acc": 0.9875, + "train_speed(iter/s)": 0.890719 + }, + { + "epoch": 0.1561938958707361, + "grad_norm": 1.5045764446258545, + "learning_rate": 9.696202159484221e-06, + "loss": 0.028421945869922638, + "memory(GiB)": 13.08, + "step": 87, + "token_acc": 0.9767441860465116, + "train_speed(iter/s)": 0.892949 + }, + { + "epoch": 0.15798922800718132, + "grad_norm": 2.7039079666137695, + "learning_rate": 9.685926749237196e-06, + "loss": 0.04249519482254982, + "memory(GiB)": 13.08, + "step": 88, + "token_acc": 0.9512195121951219, + "train_speed(iter/s)": 0.895116 + }, + { + "epoch": 0.15978456014362658, + "grad_norm": 6.6029534339904785, + "learning_rate": 9.675486073330953e-06, + "loss": 0.047118138521909714, + "memory(GiB)": 13.08, + "step": 89, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 0.897155 + }, + { + "epoch": 0.1615798922800718, + "grad_norm": 4.033291816711426, + "learning_rate": 9.664880499992585e-06, + "loss": 0.05433673784136772, + "memory(GiB)": 13.08, + "step": 90, + "token_acc": 0.9764705882352941, + "train_speed(iter/s)": 0.89925 + }, + { + "epoch": 0.16337522441651706, + "grad_norm": 2.3289759159088135, + "learning_rate": 9.65411040326487e-06, + "loss": 0.039471667259931564, + "memory(GiB)": 13.08, + "step": 91, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 0.90127 + }, + { + "epoch": 0.1651705565529623, + "grad_norm": 2.7999486923217773, + "learning_rate": 9.643176162993082e-06, + "loss": 0.0371071957051754, + "memory(GiB)": 13.08, + "step": 92, + "token_acc": 1.0, + "train_speed(iter/s)": 0.903087 + }, + { + "epoch": 0.16696588868940754, + "grad_norm": 5.384402275085449, + "learning_rate": 9.632078164811599e-06, + "loss": 0.060693953186273575, + "memory(GiB)": 13.08, + "step": 93, + "token_acc": 0.9759036144578314, + "train_speed(iter/s)": 0.905023 + }, + { + "epoch": 0.1687612208258528, + "grad_norm": 3.9353551864624023, + "learning_rate": 9.62081680013029e-06, + "loss": 0.048364944756031036, + "memory(GiB)": 13.08, + "step": 94, + "token_acc": 0.9759036144578314, + "train_speed(iter/s)": 0.906987 + }, + { + "epoch": 0.17055655296229802, + "grad_norm": 1.1935069561004639, + "learning_rate": 9.609392466120718e-06, + "loss": 0.026935912668704987, + "memory(GiB)": 13.08, + "step": 95, + "token_acc": 1.0, + "train_speed(iter/s)": 0.908911 + }, + { + "epoch": 0.17235188509874327, + "grad_norm": 2.617582321166992, + "learning_rate": 9.597805565702135e-06, + "loss": 0.025530723854899406, + "memory(GiB)": 13.08, + "step": 96, + "token_acc": 1.0, + "train_speed(iter/s)": 0.910804 + }, + { + "epoch": 0.1741472172351885, + "grad_norm": 2.125988245010376, + "learning_rate": 9.586056507527266e-06, + "loss": 0.04142545536160469, + "memory(GiB)": 13.08, + "step": 97, + 
"token_acc": 0.9642857142857143, + "train_speed(iter/s)": 0.912709 + }, + { + "epoch": 0.17594254937163376, + "grad_norm": 3.551936388015747, + "learning_rate": 9.574145705967899e-06, + "loss": 0.04358229041099548, + "memory(GiB)": 13.08, + "step": 98, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.914565 + }, + { + "epoch": 0.17773788150807898, + "grad_norm": 11.684738159179688, + "learning_rate": 9.562073581100268e-06, + "loss": 0.038947056978940964, + "memory(GiB)": 13.08, + "step": 99, + "token_acc": 0.9764705882352941, + "train_speed(iter/s)": 0.916346 + }, + { + "epoch": 0.17953321364452424, + "grad_norm": 2.4400858879089355, + "learning_rate": 9.549840558690242e-06, + "loss": 0.021702440455555916, + "memory(GiB)": 13.08, + "step": 100, + "token_acc": 0.9759036144578314, + "train_speed(iter/s)": 0.918095 + }, + { + "epoch": 0.1813285457809695, + "grad_norm": 10.91565227508545, + "learning_rate": 9.537447070178313e-06, + "loss": 0.033810004591941833, + "memory(GiB)": 13.08, + "step": 101, + "token_acc": 1.0, + "train_speed(iter/s)": 0.91984 + }, + { + "epoch": 0.18312387791741472, + "grad_norm": 2.4178123474121094, + "learning_rate": 9.524893552664358e-06, + "loss": 0.03188528120517731, + "memory(GiB)": 13.08, + "step": 102, + "token_acc": 0.9753086419753086, + "train_speed(iter/s)": 0.921562 + }, + { + "epoch": 0.18491921005385997, + "grad_norm": 3.2008867263793945, + "learning_rate": 9.512180448892255e-06, + "loss": 0.04083285480737686, + "memory(GiB)": 13.08, + "step": 103, + "token_acc": 0.9647058823529412, + "train_speed(iter/s)": 0.923273 + }, + { + "epoch": 0.1867145421903052, + "grad_norm": 2.8384220600128174, + "learning_rate": 9.49930820723424e-06, + "loss": 0.029252585023641586, + "memory(GiB)": 13.08, + "step": 104, + "token_acc": 1.0, + "train_speed(iter/s)": 0.92491 + }, + { + "epoch": 0.18850987432675045, + "grad_norm": 25.84834098815918, + "learning_rate": 9.486277281675111e-06, + "loss": 0.03789442032575607, + "memory(GiB)": 13.08, + "step": 105, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.92651 + }, + { + "epoch": 0.19030520646319568, + "grad_norm": 2.426431655883789, + "learning_rate": 9.47308813179621e-06, + "loss": 0.04464688152074814, + "memory(GiB)": 13.08, + "step": 106, + "token_acc": 1.0, + "train_speed(iter/s)": 0.928152 + }, + { + "epoch": 0.19210053859964094, + "grad_norm": 5.1038737297058105, + "learning_rate": 9.459741222759219e-06, + "loss": 0.04975412040948868, + "memory(GiB)": 13.08, + "step": 107, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.929696 + }, + { + "epoch": 0.19389587073608616, + "grad_norm": 3.114001512527466, + "learning_rate": 9.44623702528974e-06, + "loss": 0.02191930264234543, + "memory(GiB)": 13.08, + "step": 108, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 0.931247 + }, + { + "epoch": 0.19569120287253142, + "grad_norm": 1.3519690036773682, + "learning_rate": 9.432576015660714e-06, + "loss": 0.01995695009827614, + "memory(GiB)": 13.08, + "step": 109, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.932815 + }, + { + "epoch": 0.19748653500897667, + "grad_norm": 2.478172779083252, + "learning_rate": 9.418758675675608e-06, + "loss": 0.024603135883808136, + "memory(GiB)": 13.08, + "step": 110, + "token_acc": 1.0, + "train_speed(iter/s)": 0.934313 + }, + { + "epoch": 0.1992818671454219, + "grad_norm": 3.4675686359405518, + "learning_rate": 9.404785492651432e-06, + "loss": 0.033767543733119965, + "memory(GiB)": 13.08, + "step": 111, + "token_acc": 
0.9761904761904762, + "train_speed(iter/s)": 0.935857 + }, + { + "epoch": 0.20107719928186715, + "grad_norm": 3.052138566970825, + "learning_rate": 9.390656959401544e-06, + "loss": 0.009343144483864307, + "memory(GiB)": 13.08, + "step": 112, + "token_acc": 1.0, + "train_speed(iter/s)": 0.937371 + }, + { + "epoch": 0.20287253141831238, + "grad_norm": 5.359823703765869, + "learning_rate": 9.376373574218272e-06, + "loss": 0.04563562572002411, + "memory(GiB)": 13.08, + "step": 113, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.938852 + }, + { + "epoch": 0.20466786355475763, + "grad_norm": 6.652003765106201, + "learning_rate": 9.361935840855348e-06, + "loss": 0.028069501742720604, + "memory(GiB)": 13.08, + "step": 114, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.940205 + }, + { + "epoch": 0.20646319569120286, + "grad_norm": 3.7444705963134766, + "learning_rate": 9.34734426851013e-06, + "loss": 0.03850091993808746, + "memory(GiB)": 13.08, + "step": 115, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 0.941625 + }, + { + "epoch": 0.20825852782764812, + "grad_norm": 0.6508049368858337, + "learning_rate": 9.332599371805646e-06, + "loss": 0.0033617676235735416, + "memory(GiB)": 13.08, + "step": 116, + "token_acc": 1.0, + "train_speed(iter/s)": 0.943037 + }, + { + "epoch": 0.21005385996409337, + "grad_norm": 6.506423473358154, + "learning_rate": 9.31770167077245e-06, + "loss": 0.020989812910556793, + "memory(GiB)": 13.08, + "step": 117, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 0.944433 + }, + { + "epoch": 0.2118491921005386, + "grad_norm": 4.432605743408203, + "learning_rate": 9.302651690830272e-06, + "loss": 0.023440973833203316, + "memory(GiB)": 13.08, + "step": 118, + "token_acc": 0.9761904761904762, + "train_speed(iter/s)": 0.945802 + }, + { + "epoch": 0.21364452423698385, + "grad_norm": 5.686257839202881, + "learning_rate": 9.287449962769499e-06, + "loss": 0.019691113382577896, + "memory(GiB)": 13.08, + "step": 119, + "token_acc": 1.0, + "train_speed(iter/s)": 0.947146 + }, + { + "epoch": 0.21543985637342908, + "grad_norm": 4.269283771514893, + "learning_rate": 9.272097022732444e-06, + "loss": 0.045453161001205444, + "memory(GiB)": 13.08, + "step": 120, + "token_acc": 0.9764705882352941, + "train_speed(iter/s)": 0.948459 + }, + { + "epoch": 0.21723518850987433, + "grad_norm": 6.724762439727783, + "learning_rate": 9.25659341219444e-06, + "loss": 0.05681533366441727, + "memory(GiB)": 13.08, + "step": 121, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 0.949676 + }, + { + "epoch": 0.21903052064631956, + "grad_norm": 3.360192060470581, + "learning_rate": 9.240939677944747e-06, + "loss": 0.026886530220508575, + "memory(GiB)": 13.08, + "step": 122, + "token_acc": 1.0, + "train_speed(iter/s)": 0.950997 + }, + { + "epoch": 0.22082585278276481, + "grad_norm": 2.599660873413086, + "learning_rate": 9.225136372067266e-06, + "loss": 0.02804575487971306, + "memory(GiB)": 13.08, + "step": 123, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 0.952175 + }, + { + "epoch": 0.22262118491921004, + "grad_norm": 1.821930170059204, + "learning_rate": 9.209184051921062e-06, + "loss": 0.013691036961972713, + "memory(GiB)": 13.08, + "step": 124, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.953415 + }, + { + "epoch": 0.2244165170556553, + "grad_norm": 1.5606189966201782, + "learning_rate": 9.193083280120716e-06, + "loss": 0.015006945468485355, + "memory(GiB)": 13.08, + "step": 125, + "token_acc": 1.0, + 
"train_speed(iter/s)": 0.954525 + }, + { + "epoch": 0.22621184919210055, + "grad_norm": 3.290184259414673, + "learning_rate": 9.176834624516475e-06, + "loss": 0.029213648289442062, + "memory(GiB)": 13.08, + "step": 126, + "token_acc": 0.975609756097561, + "train_speed(iter/s)": 0.955743 + }, + { + "epoch": 0.22800718132854578, + "grad_norm": 2.5756008625030518, + "learning_rate": 9.160438658174228e-06, + "loss": 0.05053117871284485, + "memory(GiB)": 13.08, + "step": 127, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.956923 + }, + { + "epoch": 0.22980251346499103, + "grad_norm": 0.9768053889274597, + "learning_rate": 9.143895959355293e-06, + "loss": 0.009400786831974983, + "memory(GiB)": 13.08, + "step": 128, + "token_acc": 1.0, + "train_speed(iter/s)": 0.958093 + }, + { + "epoch": 0.23159784560143626, + "grad_norm": 2.3841869831085205, + "learning_rate": 9.12720711149603e-06, + "loss": 0.04403943568468094, + "memory(GiB)": 13.08, + "step": 129, + "token_acc": 0.9759036144578314, + "train_speed(iter/s)": 0.959153 + }, + { + "epoch": 0.2333931777378815, + "grad_norm": 2.4007785320281982, + "learning_rate": 9.11037270318725e-06, + "loss": 0.03857174515724182, + "memory(GiB)": 13.08, + "step": 130, + "token_acc": 0.975609756097561, + "train_speed(iter/s)": 0.960201 + }, + { + "epoch": 0.23518850987432674, + "grad_norm": 1.6931179761886597, + "learning_rate": 9.093393328153468e-06, + "loss": 0.031723953783512115, + "memory(GiB)": 13.08, + "step": 131, + "token_acc": 0.9642857142857143, + "train_speed(iter/s)": 0.96129 + }, + { + "epoch": 0.236983842010772, + "grad_norm": 1.7476893663406372, + "learning_rate": 9.076269585231963e-06, + "loss": 0.01917055808007717, + "memory(GiB)": 13.08, + "step": 132, + "token_acc": 1.0, + "train_speed(iter/s)": 0.962341 + }, + { + "epoch": 0.23877917414721722, + "grad_norm": 1.5303798913955688, + "learning_rate": 9.059002078351648e-06, + "loss": 0.024301765486598015, + "memory(GiB)": 13.08, + "step": 133, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 0.963357 + }, + { + "epoch": 0.24057450628366248, + "grad_norm": 1.0618665218353271, + "learning_rate": 9.041591416511781e-06, + "loss": 0.015223094262182713, + "memory(GiB)": 13.08, + "step": 134, + "token_acc": 1.0, + "train_speed(iter/s)": 0.964435 + }, + { + "epoch": 0.24236983842010773, + "grad_norm": 1.078162431716919, + "learning_rate": 9.02403821376048e-06, + "loss": 0.015651162713766098, + "memory(GiB)": 13.08, + "step": 135, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 0.96548 + }, + { + "epoch": 0.24416517055655296, + "grad_norm": 0.4840225875377655, + "learning_rate": 9.00634308917307e-06, + "loss": 0.003601664211601019, + "memory(GiB)": 13.08, + "step": 136, + "token_acc": 1.0, + "train_speed(iter/s)": 0.966519 + }, + { + "epoch": 0.2459605026929982, + "grad_norm": 2.905595064163208, + "learning_rate": 8.98850666683025e-06, + "loss": 0.05516817420721054, + "memory(GiB)": 13.08, + "step": 137, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 0.967519 + }, + { + "epoch": 0.24775583482944344, + "grad_norm": 4.5008015632629395, + "learning_rate": 8.970529575796075e-06, + "loss": 0.020421452820301056, + "memory(GiB)": 13.08, + "step": 138, + "token_acc": 0.9764705882352941, + "train_speed(iter/s)": 0.968588 + }, + { + "epoch": 0.2495511669658887, + "grad_norm": 0.9016497731208801, + "learning_rate": 8.952412450095778e-06, + "loss": 0.011048251762986183, + "memory(GiB)": 13.08, + "step": 139, + "token_acc": 1.0, + "train_speed(iter/s)": 0.969612 + }, + 
{ + "epoch": 0.2513464991023339, + "grad_norm": 6.823260307312012, + "learning_rate": 8.93415592869341e-06, + "loss": 0.02468924969434738, + "memory(GiB)": 13.08, + "step": 140, + "token_acc": 0.9759036144578314, + "train_speed(iter/s)": 0.970635 + }, + { + "epoch": 0.25314183123877915, + "grad_norm": 0.8994376063346863, + "learning_rate": 8.915760655469295e-06, + "loss": 0.00472388556227088, + "memory(GiB)": 13.08, + "step": 141, + "token_acc": 1.0, + "train_speed(iter/s)": 0.971431 + }, + { + "epoch": 0.25493716337522443, + "grad_norm": 1.9290436506271362, + "learning_rate": 8.897227279197329e-06, + "loss": 0.006992727518081665, + "memory(GiB)": 13.08, + "step": 142, + "token_acc": 1.0, + "train_speed(iter/s)": 0.972462 + }, + { + "epoch": 0.25673249551166966, + "grad_norm": 6.983469486236572, + "learning_rate": 8.8785564535221e-06, + "loss": 0.029196854680776596, + "memory(GiB)": 13.08, + "step": 143, + "token_acc": 0.9764705882352941, + "train_speed(iter/s)": 0.973469 + }, + { + "epoch": 0.2585278276481149, + "grad_norm": 1.7266432046890259, + "learning_rate": 8.859748836935827e-06, + "loss": 0.011162776499986649, + "memory(GiB)": 13.08, + "step": 144, + "token_acc": 1.0, + "train_speed(iter/s)": 0.974429 + }, + { + "epoch": 0.26032315978456017, + "grad_norm": 0.831814169883728, + "learning_rate": 8.840805092755143e-06, + "loss": 0.003509755712002516, + "memory(GiB)": 13.08, + "step": 145, + "token_acc": 1.0, + "train_speed(iter/s)": 0.975411 + }, + { + "epoch": 0.2621184919210054, + "grad_norm": 2.8398990631103516, + "learning_rate": 8.821725889097697e-06, + "loss": 0.02796313166618347, + "memory(GiB)": 13.08, + "step": 146, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.976248 + }, + { + "epoch": 0.2639138240574506, + "grad_norm": 1.4536628723144531, + "learning_rate": 8.802511898858598e-06, + "loss": 0.0070991357788443565, + "memory(GiB)": 13.08, + "step": 147, + "token_acc": 1.0, + "train_speed(iter/s)": 0.97713 + }, + { + "epoch": 0.26570915619389585, + "grad_norm": 9.953302383422852, + "learning_rate": 8.78316379968667e-06, + "loss": 0.04780750721693039, + "memory(GiB)": 13.08, + "step": 148, + "token_acc": 1.0, + "train_speed(iter/s)": 0.978034 + }, + { + "epoch": 0.26750448833034113, + "grad_norm": 3.7487027645111084, + "learning_rate": 8.76368227396056e-06, + "loss": 0.03020842745900154, + "memory(GiB)": 13.08, + "step": 149, + "token_acc": 1.0, + "train_speed(iter/s)": 0.978951 + }, + { + "epoch": 0.26929982046678635, + "grad_norm": 3.94450306892395, + "learning_rate": 8.744068008764676e-06, + "loss": 0.05766075849533081, + "memory(GiB)": 13.08, + "step": 150, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.979826 + }, + { + "epoch": 0.2710951526032316, + "grad_norm": 3.3216402530670166, + "learning_rate": 8.724321695864945e-06, + "loss": 0.04996117204427719, + "memory(GiB)": 13.08, + "step": 151, + "token_acc": 1.0, + "train_speed(iter/s)": 0.980739 + }, + { + "epoch": 0.27289048473967686, + "grad_norm": 0.44808074831962585, + "learning_rate": 8.70444403168442e-06, + "loss": 0.0032409292180091143, + "memory(GiB)": 13.08, + "step": 152, + "token_acc": 1.0, + "train_speed(iter/s)": 0.981611 + }, + { + "epoch": 0.2746858168761221, + "grad_norm": 1.278113842010498, + "learning_rate": 8.684435717278723e-06, + "loss": 0.007217491511255503, + "memory(GiB)": 13.08, + "step": 153, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 0.982437 + }, + { + "epoch": 0.2764811490125673, + "grad_norm": 2.0689215660095215, + "learning_rate": 
8.664297458311308e-06, + "loss": 0.022039521485567093, + "memory(GiB)": 13.08, + "step": 154, + "token_acc": 0.9875, + "train_speed(iter/s)": 0.98334 + }, + { + "epoch": 0.27827648114901254, + "grad_norm": 1.3855514526367188, + "learning_rate": 8.644029965028584e-06, + "loss": 0.0207529254257679, + "memory(GiB)": 13.08, + "step": 155, + "token_acc": 1.0, + "train_speed(iter/s)": 0.984229 + }, + { + "epoch": 0.2800718132854578, + "grad_norm": 3.6675665378570557, + "learning_rate": 8.623633952234863e-06, + "loss": 0.02834094688296318, + "memory(GiB)": 13.08, + "step": 156, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.985091 + }, + { + "epoch": 0.28186714542190305, + "grad_norm": 2.6226563453674316, + "learning_rate": 8.603110139267143e-06, + "loss": 0.019242525100708008, + "memory(GiB)": 13.08, + "step": 157, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 0.985975 + }, + { + "epoch": 0.2836624775583483, + "grad_norm": 1.0710657835006714, + "learning_rate": 8.582459249969752e-06, + "loss": 0.02997148036956787, + "memory(GiB)": 13.08, + "step": 158, + "token_acc": 1.0, + "train_speed(iter/s)": 0.986813 + }, + { + "epoch": 0.28545780969479356, + "grad_norm": 0.5638286471366882, + "learning_rate": 8.561682012668806e-06, + "loss": 0.02546517550945282, + "memory(GiB)": 13.08, + "step": 159, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 0.987659 + }, + { + "epoch": 0.2872531418312388, + "grad_norm": 0.8079213500022888, + "learning_rate": 8.540779160146528e-06, + "loss": 0.016487473621964455, + "memory(GiB)": 13.08, + "step": 160, + "token_acc": 1.0, + "train_speed(iter/s)": 0.988498 + }, + { + "epoch": 0.289048473967684, + "grad_norm": 0.9314688444137573, + "learning_rate": 8.519751429615399e-06, + "loss": 0.02036382630467415, + "memory(GiB)": 13.08, + "step": 161, + "token_acc": 1.0, + "train_speed(iter/s)": 0.989278 + }, + { + "epoch": 0.29084380610412924, + "grad_norm": 1.7946116924285889, + "learning_rate": 8.498599562692172e-06, + "loss": 0.018707189708948135, + "memory(GiB)": 13.08, + "step": 162, + "token_acc": 1.0, + "train_speed(iter/s)": 0.989987 + }, + { + "epoch": 0.2926391382405745, + "grad_norm": 0.9974428415298462, + "learning_rate": 8.477324305371693e-06, + "loss": 0.027415761724114418, + "memory(GiB)": 13.08, + "step": 163, + "token_acc": 1.0, + "train_speed(iter/s)": 0.990774 + }, + { + "epoch": 0.29443447037701975, + "grad_norm": 0.9948506355285645, + "learning_rate": 8.45592640800061e-06, + "loss": 0.024100039154291153, + "memory(GiB)": 13.08, + "step": 164, + "token_acc": 1.0, + "train_speed(iter/s)": 0.991555 + }, + { + "epoch": 0.296229802513465, + "grad_norm": 0.8982878923416138, + "learning_rate": 8.434406625250905e-06, + "loss": 0.021193422377109528, + "memory(GiB)": 13.08, + "step": 165, + "token_acc": 1.0, + "train_speed(iter/s)": 0.992333 + }, + { + "epoch": 0.2980251346499102, + "grad_norm": 0.8243660926818848, + "learning_rate": 8.412765716093273e-06, + "loss": 0.01835045963525772, + "memory(GiB)": 13.08, + "step": 166, + "token_acc": 1.0, + "train_speed(iter/s)": 0.99314 + }, + { + "epoch": 0.2998204667863555, + "grad_norm": 0.5059536099433899, + "learning_rate": 8.391004443770352e-06, + "loss": 0.00947535876184702, + "memory(GiB)": 13.08, + "step": 167, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 0.993923 + }, + { + "epoch": 0.3016157989228007, + "grad_norm": 0.45886659622192383, + "learning_rate": 8.369123575769821e-06, + "loss": 0.01513146236538887, + "memory(GiB)": 13.08, + "step": 168, + "token_acc": 
1.0, + "train_speed(iter/s)": 0.994701 + }, + { + "epoch": 0.30341113105924594, + "grad_norm": 1.261873483657837, + "learning_rate": 8.347123883797313e-06, + "loss": 0.036124758422374725, + "memory(GiB)": 13.08, + "step": 169, + "token_acc": 0.9759036144578314, + "train_speed(iter/s)": 0.995486 + }, + { + "epoch": 0.3052064631956912, + "grad_norm": 1.327781081199646, + "learning_rate": 8.325006143749206e-06, + "loss": 0.02721594274044037, + "memory(GiB)": 13.08, + "step": 170, + "token_acc": 0.9759036144578314, + "train_speed(iter/s)": 0.996245 + }, + { + "epoch": 0.30700179533213645, + "grad_norm": 1.111107587814331, + "learning_rate": 8.302771135685258e-06, + "loss": 0.008845215663313866, + "memory(GiB)": 13.08, + "step": 171, + "token_acc": 1.0, + "train_speed(iter/s)": 0.996998 + }, + { + "epoch": 0.3087971274685817, + "grad_norm": 1.819555401802063, + "learning_rate": 8.280419643801096e-06, + "loss": 0.02785387821495533, + "memory(GiB)": 13.08, + "step": 172, + "token_acc": 0.9629629629629629, + "train_speed(iter/s)": 0.997659 + }, + { + "epoch": 0.3105924596050269, + "grad_norm": 2.7433362007141113, + "learning_rate": 8.25795245640056e-06, + "loss": 0.02410004287958145, + "memory(GiB)": 13.08, + "step": 173, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 0.998384 + }, + { + "epoch": 0.3123877917414722, + "grad_norm": 3.860645294189453, + "learning_rate": 8.235370365867894e-06, + "loss": 0.027394965291023254, + "memory(GiB)": 13.08, + "step": 174, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 0.999111 + }, + { + "epoch": 0.3141831238779174, + "grad_norm": 2.0909423828125, + "learning_rate": 8.212674168639809e-06, + "loss": 0.05655178800225258, + "memory(GiB)": 13.08, + "step": 175, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 0.999816 + }, + { + "epoch": 0.31597845601436264, + "grad_norm": 1.8992338180541992, + "learning_rate": 8.189864665177387e-06, + "loss": 0.03129833936691284, + "memory(GiB)": 13.08, + "step": 176, + "token_acc": 1.0, + "train_speed(iter/s)": 1.000452 + }, + { + "epoch": 0.3177737881508079, + "grad_norm": 1.2347586154937744, + "learning_rate": 8.166942659937851e-06, + "loss": 0.012607865035533905, + "memory(GiB)": 13.08, + "step": 177, + "token_acc": 1.0, + "train_speed(iter/s)": 1.001126 + }, + { + "epoch": 0.31956912028725315, + "grad_norm": 0.8827244639396667, + "learning_rate": 8.143908961346197e-06, + "loss": 0.01872306875884533, + "memory(GiB)": 13.08, + "step": 178, + "token_acc": 1.0, + "train_speed(iter/s)": 1.001801 + }, + { + "epoch": 0.3213644524236984, + "grad_norm": 3.7868669033050537, + "learning_rate": 8.120764381766677e-06, + "loss": 0.017799578607082367, + "memory(GiB)": 13.08, + "step": 179, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.00247 + }, + { + "epoch": 0.3231597845601436, + "grad_norm": 0.7789050936698914, + "learning_rate": 8.09750973747415e-06, + "loss": 0.011477293446660042, + "memory(GiB)": 13.08, + "step": 180, + "token_acc": 1.0, + "train_speed(iter/s)": 1.003123 + }, + { + "epoch": 0.3249551166965889, + "grad_norm": 0.9042924642562866, + "learning_rate": 8.074145848625297e-06, + "loss": 0.006420719437301159, + "memory(GiB)": 13.08, + "step": 181, + "token_acc": 1.0, + "train_speed(iter/s)": 1.003797 + }, + { + "epoch": 0.3267504488330341, + "grad_norm": 2.8096652030944824, + "learning_rate": 8.050673539229687e-06, + "loss": 0.031101427972316742, + "memory(GiB)": 13.08, + "step": 182, + "token_acc": 0.9764705882352941, + "train_speed(iter/s)": 1.004418 + }, + { + "epoch": 
0.32854578096947934, + "grad_norm": 1.5251837968826294, + "learning_rate": 8.027093637120725e-06, + "loss": 0.011296382173895836, + "memory(GiB)": 13.08, + "step": 183, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.005109 + }, + { + "epoch": 0.3303411131059246, + "grad_norm": 1.3062682151794434, + "learning_rate": 8.003406973926447e-06, + "loss": 0.011857806704938412, + "memory(GiB)": 13.08, + "step": 184, + "token_acc": 1.0, + "train_speed(iter/s)": 1.005753 + }, + { + "epoch": 0.33213644524236985, + "grad_norm": 1.8101391792297363, + "learning_rate": 7.979614385040196e-06, + "loss": 0.012419183738529682, + "memory(GiB)": 13.08, + "step": 185, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.00636 + }, + { + "epoch": 0.3339317773788151, + "grad_norm": 0.9654263854026794, + "learning_rate": 7.955716709591153e-06, + "loss": 0.02620360068976879, + "memory(GiB)": 13.08, + "step": 186, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.006996 + }, + { + "epoch": 0.3357271095152603, + "grad_norm": 1.9752559661865234, + "learning_rate": 7.93171479041475e-06, + "loss": 0.035749442875385284, + "memory(GiB)": 13.08, + "step": 187, + "token_acc": 0.9759036144578314, + "train_speed(iter/s)": 1.007513 + }, + { + "epoch": 0.3375224416517056, + "grad_norm": 3.0109922885894775, + "learning_rate": 7.907609474022939e-06, + "loss": 0.035250578075647354, + "memory(GiB)": 13.08, + "step": 188, + "token_acc": 0.9761904761904762, + "train_speed(iter/s)": 1.008135 + }, + { + "epoch": 0.3393177737881508, + "grad_norm": 1.4333490133285522, + "learning_rate": 7.883401610574338e-06, + "loss": 0.01735576055943966, + "memory(GiB)": 13.08, + "step": 189, + "token_acc": 1.0, + "train_speed(iter/s)": 1.008702 + }, + { + "epoch": 0.34111310592459604, + "grad_norm": 1.7182650566101074, + "learning_rate": 7.85909205384424e-06, + "loss": 0.01646241545677185, + "memory(GiB)": 13.08, + "step": 190, + "token_acc": 0.975609756097561, + "train_speed(iter/s)": 1.009331 + }, + { + "epoch": 0.34290843806104127, + "grad_norm": 3.3415398597717285, + "learning_rate": 7.834681661194521e-06, + "loss": 0.014337165281176567, + "memory(GiB)": 13.08, + "step": 191, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.009703 + }, + { + "epoch": 0.34470377019748655, + "grad_norm": 1.8141905069351196, + "learning_rate": 7.810171293543379e-06, + "loss": 0.019885510206222534, + "memory(GiB)": 13.08, + "step": 192, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.010312 + }, + { + "epoch": 0.3464991023339318, + "grad_norm": 13.775602340698242, + "learning_rate": 7.78556181533499e-06, + "loss": 0.029758915305137634, + "memory(GiB)": 13.08, + "step": 193, + "token_acc": 1.0, + "train_speed(iter/s)": 1.010906 + }, + { + "epoch": 0.348294434470377, + "grad_norm": 0.6618063449859619, + "learning_rate": 7.760854094509004e-06, + "loss": 0.015286603942513466, + "memory(GiB)": 13.08, + "step": 194, + "token_acc": 1.0, + "train_speed(iter/s)": 1.011507 + }, + { + "epoch": 0.3500897666068223, + "grad_norm": 1.301867961883545, + "learning_rate": 7.736049002469944e-06, + "loss": 0.03050239384174347, + "memory(GiB)": 13.08, + "step": 195, + "token_acc": 0.9759036144578314, + "train_speed(iter/s)": 1.012005 + }, + { + "epoch": 0.3518850987432675, + "grad_norm": 0.6352006793022156, + "learning_rate": 7.711147414056478e-06, + "loss": 0.02017250843346119, + "memory(GiB)": 13.08, + "step": 196, + "token_acc": 1.0, + "train_speed(iter/s)": 1.012589 + }, + { + "epoch": 0.35368043087971274, + "grad_norm": 
1.7065843343734741, + "learning_rate": 7.686150207510545e-06, + "loss": 0.026215817779302597, + "memory(GiB)": 13.08, + "step": 197, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.013108 + }, + { + "epoch": 0.35547576301615796, + "grad_norm": 1.2364556789398193, + "learning_rate": 7.661058264446404e-06, + "loss": 0.0134409349411726, + "memory(GiB)": 13.08, + "step": 198, + "token_acc": 1.0, + "train_speed(iter/s)": 1.01363 + }, + { + "epoch": 0.35727109515260325, + "grad_norm": 1.366138219833374, + "learning_rate": 7.635872469819526e-06, + "loss": 0.02837909385561943, + "memory(GiB)": 13.08, + "step": 199, + "token_acc": 0.9876543209876543, + "train_speed(iter/s)": 1.01408 + }, + { + "epoch": 0.3590664272890485, + "grad_norm": 2.986611843109131, + "learning_rate": 7.61059371189538e-06, + "loss": 0.0414729118347168, + "memory(GiB)": 13.08, + "step": 200, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 1.014489 + }, + { + "epoch": 0.3608617594254937, + "grad_norm": 1.6066268682479858, + "learning_rate": 7.585222882218126e-06, + "loss": 0.03649783879518509, + "memory(GiB)": 13.08, + "step": 201, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.014953 + }, + { + "epoch": 0.362657091561939, + "grad_norm": 1.8554317951202393, + "learning_rate": 7.559760875579148e-06, + "loss": 0.028122469782829285, + "memory(GiB)": 13.08, + "step": 202, + "token_acc": 0.9761904761904762, + "train_speed(iter/s)": 1.015512 + }, + { + "epoch": 0.3644524236983842, + "grad_norm": 2.0980942249298096, + "learning_rate": 7.534208589985508e-06, + "loss": 0.037487149238586426, + "memory(GiB)": 13.08, + "step": 203, + "token_acc": 0.9764705882352941, + "train_speed(iter/s)": 1.016057 + }, + { + "epoch": 0.36624775583482944, + "grad_norm": 0.988807737827301, + "learning_rate": 7.508566926628269e-06, + "loss": 0.017745109274983406, + "memory(GiB)": 13.08, + "step": 204, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.016588 + }, + { + "epoch": 0.36804308797127466, + "grad_norm": 1.0523693561553955, + "learning_rate": 7.482836789850723e-06, + "loss": 0.008650293573737144, + "memory(GiB)": 13.08, + "step": 205, + "token_acc": 1.0, + "train_speed(iter/s)": 1.017031 + }, + { + "epoch": 0.36983842010771995, + "grad_norm": 1.1179697513580322, + "learning_rate": 7.457019087116478e-06, + "loss": 0.013809667900204659, + "memory(GiB)": 13.08, + "step": 206, + "token_acc": 1.0, + "train_speed(iter/s)": 1.017529 + }, + { + "epoch": 0.37163375224416517, + "grad_norm": 0.5934966802597046, + "learning_rate": 7.43111472897747e-06, + "loss": 0.027622466906905174, + "memory(GiB)": 13.08, + "step": 207, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.018066 + }, + { + "epoch": 0.3734290843806104, + "grad_norm": 2.456352710723877, + "learning_rate": 7.405124629041839e-06, + "loss": 0.07925088703632355, + "memory(GiB)": 13.08, + "step": 208, + "token_acc": 0.9642857142857143, + "train_speed(iter/s)": 1.018599 + }, + { + "epoch": 0.3752244165170557, + "grad_norm": 2.269296407699585, + "learning_rate": 7.3790497039417105e-06, + "loss": 0.019099529832601547, + "memory(GiB)": 13.08, + "step": 209, + "token_acc": 1.0, + "train_speed(iter/s)": 1.019139 + }, + { + "epoch": 0.3770197486535009, + "grad_norm": 2.8380684852600098, + "learning_rate": 7.35289087330087e-06, + "loss": 0.054111234843730927, + "memory(GiB)": 13.08, + "step": 210, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 1.01965 + }, + { + "epoch": 0.37881508078994613, + "grad_norm": 0.9783629179000854, + 
"learning_rate": 7.326649059702323e-06, + "loss": 0.028021687641739845, + "memory(GiB)": 13.08, + "step": 211, + "token_acc": 1.0, + "train_speed(iter/s)": 1.020099 + }, + { + "epoch": 0.38061041292639136, + "grad_norm": 0.9726735353469849, + "learning_rate": 7.300325188655762e-06, + "loss": 0.015542379580438137, + "memory(GiB)": 13.08, + "step": 212, + "token_acc": 1.0, + "train_speed(iter/s)": 1.020607 + }, + { + "epoch": 0.38240574506283664, + "grad_norm": 1.104114294052124, + "learning_rate": 7.273920188564922e-06, + "loss": 0.025392023846507072, + "memory(GiB)": 13.08, + "step": 213, + "token_acc": 1.0, + "train_speed(iter/s)": 1.021111 + }, + { + "epoch": 0.38420107719928187, + "grad_norm": 0.34777626395225525, + "learning_rate": 7.24743499069484e-06, + "loss": 0.007279836107045412, + "memory(GiB)": 13.08, + "step": 214, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.021421 + }, + { + "epoch": 0.3859964093357271, + "grad_norm": 2.9164483547210693, + "learning_rate": 7.220870529139012e-06, + "loss": 0.014113265089690685, + "memory(GiB)": 13.08, + "step": 215, + "token_acc": 1.0, + "train_speed(iter/s)": 1.02169 + }, + { + "epoch": 0.3877917414721723, + "grad_norm": 1.2801505327224731, + "learning_rate": 7.19422774078644e-06, + "loss": 0.032644063234329224, + "memory(GiB)": 13.08, + "step": 216, + "token_acc": 0.9761904761904762, + "train_speed(iter/s)": 1.022088 + }, + { + "epoch": 0.3895870736086176, + "grad_norm": 3.49938702583313, + "learning_rate": 7.167507565288599e-06, + "loss": 0.03756846487522125, + "memory(GiB)": 13.08, + "step": 217, + "token_acc": 0.9761904761904762, + "train_speed(iter/s)": 1.022535 + }, + { + "epoch": 0.39138240574506283, + "grad_norm": 1.3962315320968628, + "learning_rate": 7.140710945026296e-06, + "loss": 0.01906348019838333, + "memory(GiB)": 13.08, + "step": 218, + "token_acc": 1.0, + "train_speed(iter/s)": 1.022981 + }, + { + "epoch": 0.39317773788150806, + "grad_norm": 1.123113751411438, + "learning_rate": 7.1138388250764245e-06, + "loss": 0.04802602156996727, + "memory(GiB)": 13.08, + "step": 219, + "token_acc": 0.9759036144578314, + "train_speed(iter/s)": 1.023259 + }, + { + "epoch": 0.39497307001795334, + "grad_norm": 0.8331385850906372, + "learning_rate": 7.086892153178646e-06, + "loss": 0.012340272776782513, + "memory(GiB)": 13.08, + "step": 220, + "token_acc": 1.0, + "train_speed(iter/s)": 1.023688 + }, + { + "epoch": 0.39676840215439857, + "grad_norm": 0.7761565446853638, + "learning_rate": 7.059871879701954e-06, + "loss": 0.025298362597823143, + "memory(GiB)": 13.08, + "step": 221, + "token_acc": 0.9875, + "train_speed(iter/s)": 1.024114 + }, + { + "epoch": 0.3985637342908438, + "grad_norm": 3.903602361679077, + "learning_rate": 7.032778957611162e-06, + "loss": 0.01550825871527195, + "memory(GiB)": 13.08, + "step": 222, + "token_acc": 0.9764705882352941, + "train_speed(iter/s)": 1.024492 + }, + { + "epoch": 0.400359066427289, + "grad_norm": 0.5995408296585083, + "learning_rate": 7.005614342433291e-06, + "loss": 0.018919674679636955, + "memory(GiB)": 13.08, + "step": 223, + "token_acc": 1.0, + "train_speed(iter/s)": 1.024915 + }, + { + "epoch": 0.4021543985637343, + "grad_norm": 1.0118721723556519, + "learning_rate": 6.978378992223873e-06, + "loss": 0.022716499865055084, + "memory(GiB)": 13.08, + "step": 224, + "token_acc": 1.0, + "train_speed(iter/s)": 1.025376 + }, + { + "epoch": 0.40394973070017953, + "grad_norm": 1.8966789245605469, + "learning_rate": 6.9510738675331535e-06, + "loss": 0.031317099928855896, + "memory(GiB)": 
13.08, + "step": 225, + "token_acc": 0.9876543209876543, + "train_speed(iter/s)": 1.025843 + }, + { + "epoch": 0.40574506283662476, + "grad_norm": 0.30010876059532166, + "learning_rate": 6.923699931372225e-06, + "loss": 0.01530955545604229, + "memory(GiB)": 13.08, + "step": 226, + "token_acc": 1.0, + "train_speed(iter/s)": 1.026296 + }, + { + "epoch": 0.40754039497307004, + "grad_norm": 1.6241896152496338, + "learning_rate": 6.896258149179058e-06, + "loss": 0.047767072916030884, + "memory(GiB)": 13.08, + "step": 227, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 1.026742 + }, + { + "epoch": 0.40933572710951527, + "grad_norm": 0.7032896280288696, + "learning_rate": 6.868749488784451e-06, + "loss": 0.006980127654969692, + "memory(GiB)": 13.08, + "step": 228, + "token_acc": 0.9875, + "train_speed(iter/s)": 1.027186 + }, + { + "epoch": 0.4111310592459605, + "grad_norm": 0.6982834935188293, + "learning_rate": 6.841174920377893e-06, + "loss": 0.02293953113257885, + "memory(GiB)": 13.08, + "step": 229, + "token_acc": 1.0, + "train_speed(iter/s)": 1.02763 + }, + { + "epoch": 0.4129263913824057, + "grad_norm": 1.1076542139053345, + "learning_rate": 6.813535416473359e-06, + "loss": 0.011044427752494812, + "memory(GiB)": 13.08, + "step": 230, + "token_acc": 1.0, + "train_speed(iter/s)": 1.028067 + }, + { + "epoch": 0.414721723518851, + "grad_norm": 4.531601905822754, + "learning_rate": 6.7858319518749925e-06, + "loss": 0.03839945048093796, + "memory(GiB)": 13.08, + "step": 231, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 1.028444 + }, + { + "epoch": 0.41651705565529623, + "grad_norm": 0.40456172823905945, + "learning_rate": 6.758065503642741e-06, + "loss": 0.012002535164356232, + "memory(GiB)": 13.08, + "step": 232, + "token_acc": 0.9876543209876543, + "train_speed(iter/s)": 1.02884 + }, + { + "epoch": 0.41831238779174146, + "grad_norm": 0.9251776933670044, + "learning_rate": 6.730237051057893e-06, + "loss": 0.019714243710041046, + "memory(GiB)": 13.08, + "step": 233, + "token_acc": 1.0, + "train_speed(iter/s)": 1.0292 + }, + { + "epoch": 0.42010771992818674, + "grad_norm": 1.1103687286376953, + "learning_rate": 6.70234757558853e-06, + "loss": 0.031476739794015884, + "memory(GiB)": 13.08, + "step": 234, + "token_acc": 1.0, + "train_speed(iter/s)": 1.029637 + }, + { + "epoch": 0.42190305206463197, + "grad_norm": 1.796712040901184, + "learning_rate": 6.674398060854931e-06, + "loss": 0.01579088345170021, + "memory(GiB)": 13.08, + "step": 235, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 1.030074 + }, + { + "epoch": 0.4236983842010772, + "grad_norm": 0.1217975988984108, + "learning_rate": 6.646389492594863e-06, + "loss": 0.002822080859914422, + "memory(GiB)": 13.08, + "step": 236, + "token_acc": 1.0, + "train_speed(iter/s)": 1.030504 + }, + { + "epoch": 0.4254937163375224, + "grad_norm": 4.095233917236328, + "learning_rate": 6.618322858628821e-06, + "loss": 0.0496731698513031, + "memory(GiB)": 13.08, + "step": 237, + "token_acc": 0.9512195121951219, + "train_speed(iter/s)": 1.030881 + }, + { + "epoch": 0.4272890484739677, + "grad_norm": 0.6630756855010986, + "learning_rate": 6.590199148825197e-06, + "loss": 0.006224961951375008, + "memory(GiB)": 13.08, + "step": 238, + "token_acc": 1.0, + "train_speed(iter/s)": 1.031284 + }, + { + "epoch": 0.42908438061041293, + "grad_norm": 0.39497628808021545, + "learning_rate": 6.562019355065361e-06, + "loss": 0.0033654342405498028, + "memory(GiB)": 13.08, + "step": 239, + "token_acc": 1.0, + "train_speed(iter/s)": 1.031578 
+ }, + { + "epoch": 0.43087971274685816, + "grad_norm": 2.465054750442505, + "learning_rate": 6.5337844712086785e-06, + "loss": 0.02996770292520523, + "memory(GiB)": 13.08, + "step": 240, + "token_acc": 0.9764705882352941, + "train_speed(iter/s)": 1.03195 + }, + { + "epoch": 0.4326750448833034, + "grad_norm": 1.4782965183258057, + "learning_rate": 6.505495493057463e-06, + "loss": 0.027920585125684738, + "memory(GiB)": 13.08, + "step": 241, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 1.032367 + }, + { + "epoch": 0.43447037701974867, + "grad_norm": 0.9718492031097412, + "learning_rate": 6.477153418321851e-06, + "loss": 0.019636746495962143, + "memory(GiB)": 13.08, + "step": 242, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 1.032769 + }, + { + "epoch": 0.4362657091561939, + "grad_norm": 0.37164536118507385, + "learning_rate": 6.448759246584619e-06, + "loss": 0.008408846333622932, + "memory(GiB)": 13.08, + "step": 243, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 1.033174 + }, + { + "epoch": 0.4380610412926391, + "grad_norm": 0.7316699028015137, + "learning_rate": 6.420313979265926e-06, + "loss": 0.00858224369585514, + "memory(GiB)": 13.08, + "step": 244, + "token_acc": 1.0, + "train_speed(iter/s)": 1.03356 + }, + { + "epoch": 0.4398563734290844, + "grad_norm": 2.820814847946167, + "learning_rate": 6.391818619587997e-06, + "loss": 0.03275878354907036, + "memory(GiB)": 13.08, + "step": 245, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 1.033959 + }, + { + "epoch": 0.44165170556552963, + "grad_norm": 0.5600858926773071, + "learning_rate": 6.3632741725397375e-06, + "loss": 0.01049504429101944, + "memory(GiB)": 13.08, + "step": 246, + "token_acc": 1.0, + "train_speed(iter/s)": 1.034354 + }, + { + "epoch": 0.44344703770197486, + "grad_norm": 3.131997585296631, + "learning_rate": 6.334681644841292e-06, + "loss": 0.01592497155070305, + "memory(GiB)": 13.08, + "step": 247, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.034734 + }, + { + "epoch": 0.4452423698384201, + "grad_norm": 0.47706976532936096, + "learning_rate": 6.30604204490854e-06, + "loss": 0.0029080186504870653, + "memory(GiB)": 13.08, + "step": 248, + "token_acc": 1.0, + "train_speed(iter/s)": 1.034957 + }, + { + "epoch": 0.44703770197486536, + "grad_norm": 1.283147931098938, + "learning_rate": 6.277356382817529e-06, + "loss": 0.012551498599350452, + "memory(GiB)": 13.08, + "step": 249, + "token_acc": 0.9767441860465116, + "train_speed(iter/s)": 1.035309 + }, + { + "epoch": 0.4488330341113106, + "grad_norm": 0.13286444544792175, + "learning_rate": 6.248625670268846e-06, + "loss": 0.001932018087245524, + "memory(GiB)": 13.08, + "step": 250, + "token_acc": 1.0, + "train_speed(iter/s)": 1.035695 + }, + { + "epoch": 0.4506283662477558, + "grad_norm": 0.6445930600166321, + "learning_rate": 6.219850920551944e-06, + "loss": 0.010524374432861805, + "memory(GiB)": 13.08, + "step": 251, + "token_acc": 1.0, + "train_speed(iter/s)": 1.036065 + }, + { + "epoch": 0.4524236983842011, + "grad_norm": 2.483046531677246, + "learning_rate": 6.191033148509402e-06, + "loss": 0.04819817095994949, + "memory(GiB)": 13.08, + "step": 252, + "token_acc": 1.0, + "train_speed(iter/s)": 1.036443 + }, + { + "epoch": 0.4542190305206463, + "grad_norm": 0.5036169290542603, + "learning_rate": 6.16217337050113e-06, + "loss": 0.009306352585554123, + "memory(GiB)": 13.08, + "step": 253, + "token_acc": 1.0, + "train_speed(iter/s)": 1.036839 + }, + { + "epoch": 0.45601436265709155, + "grad_norm": 
4.299251079559326, + "learning_rate": 6.1332726043685275e-06, + "loss": 0.022741444408893585, + "memory(GiB)": 13.08, + "step": 254, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.037213 + }, + { + "epoch": 0.4578096947935368, + "grad_norm": 0.6425156593322754, + "learning_rate": 6.104331869398583e-06, + "loss": 0.005235548131167889, + "memory(GiB)": 13.08, + "step": 255, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.037573 + }, + { + "epoch": 0.45960502692998206, + "grad_norm": 0.49873191118240356, + "learning_rate": 6.075352186287927e-06, + "loss": 0.011929428204894066, + "memory(GiB)": 13.08, + "step": 256, + "token_acc": 1.0, + "train_speed(iter/s)": 1.037947 + }, + { + "epoch": 0.4614003590664273, + "grad_norm": 1.8537715673446655, + "learning_rate": 6.0463345771068324e-06, + "loss": 0.008838113397359848, + "memory(GiB)": 13.08, + "step": 257, + "token_acc": 1.0, + "train_speed(iter/s)": 1.038315 + }, + { + "epoch": 0.4631956912028725, + "grad_norm": 1.9158146381378174, + "learning_rate": 6.0172800652631706e-06, + "loss": 0.03769341856241226, + "memory(GiB)": 13.08, + "step": 258, + "token_acc": 1.0, + "train_speed(iter/s)": 1.038596 + }, + { + "epoch": 0.4649910233393178, + "grad_norm": 2.642102003097534, + "learning_rate": 5.988189675466309e-06, + "loss": 0.029616905376315117, + "memory(GiB)": 13.08, + "step": 259, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 1.038892 + }, + { + "epoch": 0.466786355475763, + "grad_norm": 1.6327120065689087, + "learning_rate": 5.959064433690983e-06, + "loss": 0.0126350624486804, + "memory(GiB)": 13.08, + "step": 260, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 1.039164 + }, + { + "epoch": 0.46858168761220825, + "grad_norm": 1.0380727052688599, + "learning_rate": 5.929905367141104e-06, + "loss": 0.01608528569340706, + "memory(GiB)": 13.08, + "step": 261, + "token_acc": 1.0, + "train_speed(iter/s)": 1.039508 + }, + { + "epoch": 0.4703770197486535, + "grad_norm": 1.927999496459961, + "learning_rate": 5.900713504213532e-06, + "loss": 0.01229726243764162, + "memory(GiB)": 13.08, + "step": 262, + "token_acc": 1.0, + "train_speed(iter/s)": 1.039889 + }, + { + "epoch": 0.47217235188509876, + "grad_norm": 1.053334355354309, + "learning_rate": 5.871489874461806e-06, + "loss": 0.007304510101675987, + "memory(GiB)": 13.08, + "step": 263, + "token_acc": 1.0, + "train_speed(iter/s)": 1.040231 + }, + { + "epoch": 0.473967684021544, + "grad_norm": 4.6686272621154785, + "learning_rate": 5.842235508559834e-06, + "loss": 0.03626105934381485, + "memory(GiB)": 13.08, + "step": 264, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 1.040574 + }, + { + "epoch": 0.4757630161579892, + "grad_norm": 1.6118460893630981, + "learning_rate": 5.812951438265542e-06, + "loss": 0.022392109036445618, + "memory(GiB)": 13.08, + "step": 265, + "token_acc": 1.0, + "train_speed(iter/s)": 1.040701 + }, + { + "epoch": 0.47755834829443444, + "grad_norm": 1.6628257036209106, + "learning_rate": 5.783638696384486e-06, + "loss": 0.01382455788552761, + "memory(GiB)": 13.08, + "step": 266, + "token_acc": 1.0, + "train_speed(iter/s)": 1.040983 + }, + { + "epoch": 0.4793536804308797, + "grad_norm": 1.5874329805374146, + "learning_rate": 5.754298316733427e-06, + "loss": 0.01736612245440483, + "memory(GiB)": 13.08, + "step": 267, + "token_acc": 0.975609756097561, + "train_speed(iter/s)": 1.041288 + }, + { + "epoch": 0.48114901256732495, + "grad_norm": 1.0512102842330933, + "learning_rate": 5.724931334103867e-06, + "loss": 
0.008069690316915512, + "memory(GiB)": 13.08, + "step": 268, + "token_acc": 1.0, + "train_speed(iter/s)": 1.041599 + }, + { + "epoch": 0.4829443447037702, + "grad_norm": 0.8843129873275757, + "learning_rate": 5.695538784225558e-06, + "loss": 0.015562628395855427, + "memory(GiB)": 13.08, + "step": 269, + "token_acc": 0.9876543209876543, + "train_speed(iter/s)": 1.041887 + }, + { + "epoch": 0.48473967684021546, + "grad_norm": 1.8927440643310547, + "learning_rate": 5.66612170372997e-06, + "loss": 0.038457706570625305, + "memory(GiB)": 13.08, + "step": 270, + "token_acc": 1.0, + "train_speed(iter/s)": 1.042194 + }, + { + "epoch": 0.4865350089766607, + "grad_norm": 1.4727522134780884, + "learning_rate": 5.636681130113729e-06, + "loss": 0.01323632150888443, + "memory(GiB)": 13.08, + "step": 271, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.042511 + }, + { + "epoch": 0.4883303411131059, + "grad_norm": 0.6518436670303345, + "learning_rate": 5.607218101702031e-06, + "loss": 0.01099499873816967, + "memory(GiB)": 13.08, + "step": 272, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 1.042799 + }, + { + "epoch": 0.49012567324955114, + "grad_norm": 1.1592682600021362, + "learning_rate": 5.577733657612019e-06, + "loss": 0.018731923773884773, + "memory(GiB)": 13.08, + "step": 273, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.043082 + }, + { + "epoch": 0.4919210053859964, + "grad_norm": 2.8617472648620605, + "learning_rate": 5.548228837716133e-06, + "loss": 0.03782326728105545, + "memory(GiB)": 13.08, + "step": 274, + "token_acc": 0.9759036144578314, + "train_speed(iter/s)": 1.043338 + }, + { + "epoch": 0.49371633752244165, + "grad_norm": 0.5289320349693298, + "learning_rate": 5.5187046826054406e-06, + "loss": 0.021123163402080536, + "memory(GiB)": 13.08, + "step": 275, + "token_acc": 0.975609756097561, + "train_speed(iter/s)": 1.043646 + }, + { + "epoch": 0.4955116696588869, + "grad_norm": 1.0439578294754028, + "learning_rate": 5.489162233552929e-06, + "loss": 0.010797679424285889, + "memory(GiB)": 13.08, + "step": 276, + "token_acc": 1.0, + "train_speed(iter/s)": 1.043968 + }, + { + "epoch": 0.49730700179533216, + "grad_norm": 0.5113866925239563, + "learning_rate": 5.459602532476791e-06, + "loss": 0.00704155070707202, + "memory(GiB)": 13.08, + "step": 277, + "token_acc": 1.0, + "train_speed(iter/s)": 1.044295 + }, + { + "epoch": 0.4991023339317774, + "grad_norm": 0.4965183138847351, + "learning_rate": 5.430026621903669e-06, + "loss": 0.008296813815832138, + "memory(GiB)": 13.08, + "step": 278, + "token_acc": 1.0, + "train_speed(iter/s)": 1.04461 + }, + { + "epoch": 0.5008976660682226, + "grad_norm": 1.5550410747528076, + "learning_rate": 5.400435544931892e-06, + "loss": 0.020601091906428337, + "memory(GiB)": 13.08, + "step": 279, + "token_acc": 1.0, + "train_speed(iter/s)": 1.044918 + }, + { + "epoch": 0.5026929982046678, + "grad_norm": 1.4697859287261963, + "learning_rate": 5.370830345194683e-06, + "loss": 0.018957141786813736, + "memory(GiB)": 13.08, + "step": 280, + "token_acc": 0.975609756097561, + "train_speed(iter/s)": 1.045225 + }, + { + "epoch": 0.5044883303411131, + "grad_norm": 0.8836929798126221, + "learning_rate": 5.341212066823356e-06, + "loss": 0.024562539532780647, + "memory(GiB)": 13.08, + "step": 281, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.045479 + }, + { + "epoch": 0.5062836624775583, + "grad_norm": 2.1081759929656982, + "learning_rate": 5.311581754410489e-06, + "loss": 0.024414584040641785, + "memory(GiB)": 13.08, + 
"step": 282, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.045698 + }, + { + "epoch": 0.5080789946140036, + "grad_norm": 1.2182546854019165, + "learning_rate": 5.28194045297308e-06, + "loss": 0.018110880628228188, + "memory(GiB)": 13.08, + "step": 283, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 1.045918 + }, + { + "epoch": 0.5098743267504489, + "grad_norm": 3.6139252185821533, + "learning_rate": 5.252289207915698e-06, + "loss": 0.042631037533283234, + "memory(GiB)": 13.08, + "step": 284, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.046196 + }, + { + "epoch": 0.5116696588868941, + "grad_norm": 0.8846884965896606, + "learning_rate": 5.222629064993603e-06, + "loss": 0.01252009067684412, + "memory(GiB)": 13.08, + "step": 285, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.046454 + }, + { + "epoch": 0.5134649910233393, + "grad_norm": 0.5470329523086548, + "learning_rate": 5.192961070275876e-06, + "loss": 0.012717966921627522, + "memory(GiB)": 13.08, + "step": 286, + "token_acc": 1.0, + "train_speed(iter/s)": 1.046698 + }, + { + "epoch": 0.5152603231597845, + "grad_norm": 0.24770720303058624, + "learning_rate": 5.163286270108514e-06, + "loss": 0.00739861186593771, + "memory(GiB)": 13.08, + "step": 287, + "token_acc": 1.0, + "train_speed(iter/s)": 1.046981 + }, + { + "epoch": 0.5170556552962298, + "grad_norm": 0.1064792275428772, + "learning_rate": 5.133605711077536e-06, + "loss": 0.0020551832858473063, + "memory(GiB)": 13.08, + "step": 288, + "token_acc": 1.0, + "train_speed(iter/s)": 1.047179 + }, + { + "epoch": 0.518850987432675, + "grad_norm": 1.2083576917648315, + "learning_rate": 5.103920439972062e-06, + "loss": 0.03096047230064869, + "memory(GiB)": 13.08, + "step": 289, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.047445 + }, + { + "epoch": 0.5206463195691203, + "grad_norm": 1.0709620714187622, + "learning_rate": 5.074231503747406e-06, + "loss": 0.01954200491309166, + "memory(GiB)": 13.08, + "step": 290, + "token_acc": 1.0, + "train_speed(iter/s)": 1.047748 + }, + { + "epoch": 0.5224416517055656, + "grad_norm": 0.6843818426132202, + "learning_rate": 5.044539949488142e-06, + "loss": 0.006242586299777031, + "memory(GiB)": 13.08, + "step": 291, + "token_acc": 1.0, + "train_speed(iter/s)": 1.04804 + }, + { + "epoch": 0.5242369838420108, + "grad_norm": 0.7698149085044861, + "learning_rate": 5.014846824371178e-06, + "loss": 0.012828817591071129, + "memory(GiB)": 13.08, + "step": 292, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.048319 + }, + { + "epoch": 0.526032315978456, + "grad_norm": 1.413210391998291, + "learning_rate": 4.985153175628823e-06, + "loss": 0.014457614161074162, + "memory(GiB)": 13.08, + "step": 293, + "token_acc": 0.9876543209876543, + "train_speed(iter/s)": 1.048609 + }, + { + "epoch": 0.5278276481149012, + "grad_norm": 0.3292270600795746, + "learning_rate": 4.955460050511859e-06, + "loss": 0.003923403564840555, + "memory(GiB)": 13.08, + "step": 294, + "token_acc": 1.0, + "train_speed(iter/s)": 1.048893 + }, + { + "epoch": 0.5296229802513465, + "grad_norm": 1.2557495832443237, + "learning_rate": 4.925768496252594e-06, + "loss": 0.015292594209313393, + "memory(GiB)": 13.08, + "step": 295, + "token_acc": 1.0, + "train_speed(iter/s)": 1.04915 + }, + { + "epoch": 0.5314183123877917, + "grad_norm": 0.3847571015357971, + "learning_rate": 4.89607956002794e-06, + "loss": 0.008807068690657616, + "memory(GiB)": 13.08, + "step": 296, + "token_acc": 1.0, + "train_speed(iter/s)": 1.049445 + 
}, + { + "epoch": 0.533213644524237, + "grad_norm": 2.2386951446533203, + "learning_rate": 4.866394288922467e-06, + "loss": 0.03345731645822525, + "memory(GiB)": 13.08, + "step": 297, + "token_acc": 1.0, + "train_speed(iter/s)": 1.049685 + }, + { + "epoch": 0.5350089766606823, + "grad_norm": 3.930828809738159, + "learning_rate": 4.836713729891488e-06, + "loss": 0.00945297535508871, + "memory(GiB)": 13.08, + "step": 298, + "token_acc": 1.0, + "train_speed(iter/s)": 1.04992 + }, + { + "epoch": 0.5368043087971275, + "grad_norm": 0.3558104336261749, + "learning_rate": 4.807038929724125e-06, + "loss": 0.004208588972687721, + "memory(GiB)": 13.08, + "step": 299, + "token_acc": 1.0, + "train_speed(iter/s)": 1.050191 + }, + { + "epoch": 0.5385996409335727, + "grad_norm": 0.7918807864189148, + "learning_rate": 4.777370935006399e-06, + "loss": 0.019519593566656113, + "memory(GiB)": 13.08, + "step": 300, + "token_acc": 1.0, + "train_speed(iter/s)": 1.050463 + }, + { + "epoch": 0.5403949730700179, + "grad_norm": 1.5282371044158936, + "learning_rate": 4.747710792084305e-06, + "loss": 0.011254837736487389, + "memory(GiB)": 13.08, + "step": 301, + "token_acc": 1.0, + "train_speed(iter/s)": 1.050663 + }, + { + "epoch": 0.5421903052064632, + "grad_norm": 0.08211596310138702, + "learning_rate": 4.7180595470269235e-06, + "loss": 0.0009409271879121661, + "memory(GiB)": 13.08, + "step": 302, + "token_acc": 1.0, + "train_speed(iter/s)": 1.050886 + }, + { + "epoch": 0.5439856373429084, + "grad_norm": 0.5311585068702698, + "learning_rate": 4.688418245589514e-06, + "loss": 0.008699169382452965, + "memory(GiB)": 13.08, + "step": 303, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.051143 + }, + { + "epoch": 0.5457809694793537, + "grad_norm": 0.9953644275665283, + "learning_rate": 4.6587879331766465e-06, + "loss": 0.004411707166582346, + "memory(GiB)": 13.08, + "step": 304, + "token_acc": 1.0, + "train_speed(iter/s)": 1.051397 + }, + { + "epoch": 0.547576301615799, + "grad_norm": 0.07225799560546875, + "learning_rate": 4.629169654805319e-06, + "loss": 0.0008859566878527403, + "memory(GiB)": 13.08, + "step": 305, + "token_acc": 1.0, + "train_speed(iter/s)": 1.051657 + }, + { + "epoch": 0.5493716337522442, + "grad_norm": 0.7513275146484375, + "learning_rate": 4.599564455068109e-06, + "loss": 0.021926749497652054, + "memory(GiB)": 13.08, + "step": 306, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.051927 + }, + { + "epoch": 0.5511669658886894, + "grad_norm": 1.2470500469207764, + "learning_rate": 4.5699733780963314e-06, + "loss": 0.011935271322727203, + "memory(GiB)": 13.08, + "step": 307, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 1.0522 + }, + { + "epoch": 0.5529622980251346, + "grad_norm": 0.7168118357658386, + "learning_rate": 4.5403974675232106e-06, + "loss": 0.033140841871500015, + "memory(GiB)": 13.08, + "step": 308, + "token_acc": 1.0, + "train_speed(iter/s)": 1.052467 + }, + { + "epoch": 0.5547576301615799, + "grad_norm": 0.20917074382305145, + "learning_rate": 4.5108377664470725e-06, + "loss": 0.0014592789812013507, + "memory(GiB)": 13.08, + "step": 309, + "token_acc": 1.0, + "train_speed(iter/s)": 1.052732 + }, + { + "epoch": 0.5565529622980251, + "grad_norm": 2.651141405105591, + "learning_rate": 4.481295317394562e-06, + "loss": 0.012496023438870907, + "memory(GiB)": 13.08, + "step": 310, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 1.052961 + }, + { + "epoch": 0.5583482944344704, + "grad_norm": 2.2486155033111572, + "learning_rate": 
4.451771162283868e-06, + "loss": 0.04139062762260437, + "memory(GiB)": 13.08, + "step": 311, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 1.053211 + }, + { + "epoch": 0.5601436265709157, + "grad_norm": 0.15514300763607025, + "learning_rate": 4.422266342387982e-06, + "loss": 0.002122478326782584, + "memory(GiB)": 13.08, + "step": 312, + "token_acc": 1.0, + "train_speed(iter/s)": 1.053459 + }, + { + "epoch": 0.5619389587073609, + "grad_norm": 1.281379222869873, + "learning_rate": 4.392781898297969e-06, + "loss": 0.015271011739969254, + "memory(GiB)": 13.08, + "step": 313, + "token_acc": 0.9761904761904762, + "train_speed(iter/s)": 1.053703 + }, + { + "epoch": 0.5637342908438061, + "grad_norm": 4.335942268371582, + "learning_rate": 4.363318869886271e-06, + "loss": 0.011696171015501022, + "memory(GiB)": 13.08, + "step": 314, + "token_acc": 1.0, + "train_speed(iter/s)": 1.053954 + }, + { + "epoch": 0.5655296229802513, + "grad_norm": 1.2185554504394531, + "learning_rate": 4.333878296270032e-06, + "loss": 0.006226523779332638, + "memory(GiB)": 13.08, + "step": 315, + "token_acc": 1.0, + "train_speed(iter/s)": 1.054214 + }, + { + "epoch": 0.5673249551166966, + "grad_norm": 1.0457981824874878, + "learning_rate": 4.3044612157744436e-06, + "loss": 0.009365251287817955, + "memory(GiB)": 13.08, + "step": 316, + "token_acc": 1.0, + "train_speed(iter/s)": 1.054472 + }, + { + "epoch": 0.5691202872531418, + "grad_norm": 0.6282471418380737, + "learning_rate": 4.275068665896135e-06, + "loss": 0.017887648195028305, + "memory(GiB)": 13.08, + "step": 317, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 1.054723 + }, + { + "epoch": 0.5709156193895871, + "grad_norm": 0.7714171409606934, + "learning_rate": 4.245701683266576e-06, + "loss": 0.00833840947598219, + "memory(GiB)": 13.08, + "step": 318, + "token_acc": 1.0, + "train_speed(iter/s)": 1.054974 + }, + { + "epoch": 0.5727109515260324, + "grad_norm": 0.9541565775871277, + "learning_rate": 4.216361303615515e-06, + "loss": 0.01835642382502556, + "memory(GiB)": 13.08, + "step": 319, + "token_acc": 1.0, + "train_speed(iter/s)": 1.055198 + }, + { + "epoch": 0.5745062836624776, + "grad_norm": 2.7340147495269775, + "learning_rate": 4.187048561734459e-06, + "loss": 0.0161958746612072, + "memory(GiB)": 13.08, + "step": 320, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 1.055353 + }, + { + "epoch": 0.5763016157989228, + "grad_norm": 1.0336790084838867, + "learning_rate": 4.157764491440169e-06, + "loss": 0.005718417000025511, + "memory(GiB)": 13.08, + "step": 321, + "token_acc": 1.0, + "train_speed(iter/s)": 1.055572 + }, + { + "epoch": 0.578096947935368, + "grad_norm": 3.488442897796631, + "learning_rate": 4.128510125538197e-06, + "loss": 0.015117792412638664, + "memory(GiB)": 13.08, + "step": 322, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 1.05581 + }, + { + "epoch": 0.5798922800718133, + "grad_norm": 1.1137819290161133, + "learning_rate": 4.09928649578647e-06, + "loss": 0.010323446244001389, + "memory(GiB)": 13.08, + "step": 323, + "token_acc": 1.0, + "train_speed(iter/s)": 1.056054 + }, + { + "epoch": 0.5816876122082585, + "grad_norm": 3.996793746948242, + "learning_rate": 4.070094632858897e-06, + "loss": 0.011526363901793957, + "memory(GiB)": 13.08, + "step": 324, + "token_acc": 1.0, + "train_speed(iter/s)": 1.056306 + }, + { + "epoch": 0.5834829443447038, + "grad_norm": 0.7782546281814575, + "learning_rate": 4.040935566309018e-06, + "loss": 0.010373005643486977, + "memory(GiB)": 13.08, + "step": 325, + 
"token_acc": 0.9764705882352941, + "train_speed(iter/s)": 1.05655 + }, + { + "epoch": 0.585278276481149, + "grad_norm": 0.2521572411060333, + "learning_rate": 4.011810324533692e-06, + "loss": 0.0026296856813132763, + "memory(GiB)": 13.08, + "step": 326, + "token_acc": 1.0, + "train_speed(iter/s)": 1.056696 + }, + { + "epoch": 0.5870736086175943, + "grad_norm": 0.4131295382976532, + "learning_rate": 3.982719934736832e-06, + "loss": 0.007535616867244244, + "memory(GiB)": 13.08, + "step": 327, + "token_acc": 1.0, + "train_speed(iter/s)": 1.056926 + }, + { + "epoch": 0.5888689407540395, + "grad_norm": 1.1312165260314941, + "learning_rate": 3.953665422893168e-06, + "loss": 0.01614595577120781, + "memory(GiB)": 13.08, + "step": 328, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.057169 + }, + { + "epoch": 0.5906642728904847, + "grad_norm": 2.325788974761963, + "learning_rate": 3.9246478137120745e-06, + "loss": 0.020757349207997322, + "memory(GiB)": 13.08, + "step": 329, + "token_acc": 0.9642857142857143, + "train_speed(iter/s)": 1.057327 + }, + { + "epoch": 0.59245960502693, + "grad_norm": 0.49573013186454773, + "learning_rate": 3.895668130601418e-06, + "loss": 0.00733250891789794, + "memory(GiB)": 13.08, + "step": 330, + "token_acc": 1.0, + "train_speed(iter/s)": 1.057543 + }, + { + "epoch": 0.5942549371633752, + "grad_norm": 1.4779912233352661, + "learning_rate": 3.866727395631473e-06, + "loss": 0.008500034920871258, + "memory(GiB)": 13.08, + "step": 331, + "token_acc": 1.0, + "train_speed(iter/s)": 1.05778 + }, + { + "epoch": 0.5960502692998204, + "grad_norm": 1.9343817234039307, + "learning_rate": 3.83782662949887e-06, + "loss": 0.013538424856960773, + "memory(GiB)": 13.08, + "step": 332, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 1.058012 + }, + { + "epoch": 0.5978456014362658, + "grad_norm": 1.9285494089126587, + "learning_rate": 3.8089668514905994e-06, + "loss": 0.018029402941465378, + "memory(GiB)": 13.08, + "step": 333, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.05817 + }, + { + "epoch": 0.599640933572711, + "grad_norm": 0.5474541783332825, + "learning_rate": 3.780149079448057e-06, + "loss": 0.009142300114035606, + "memory(GiB)": 13.08, + "step": 334, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.058353 + }, + { + "epoch": 0.6014362657091562, + "grad_norm": 0.5142211318016052, + "learning_rate": 3.7513743297311556e-06, + "loss": 0.003718828782439232, + "memory(GiB)": 13.08, + "step": 335, + "token_acc": 1.0, + "train_speed(iter/s)": 1.05851 + }, + { + "epoch": 0.6032315978456014, + "grad_norm": 0.4826161861419678, + "learning_rate": 3.7226436171824732e-06, + "loss": 0.0026040368247777224, + "memory(GiB)": 13.08, + "step": 336, + "token_acc": 1.0, + "train_speed(iter/s)": 1.058652 + }, + { + "epoch": 0.6050269299820467, + "grad_norm": 1.3914278745651245, + "learning_rate": 3.6939579550914607e-06, + "loss": 0.010664897970855236, + "memory(GiB)": 13.08, + "step": 337, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.058827 + }, + { + "epoch": 0.6068222621184919, + "grad_norm": 0.7463139891624451, + "learning_rate": 3.665318355158709e-06, + "loss": 0.004821094684302807, + "memory(GiB)": 13.08, + "step": 338, + "token_acc": 1.0, + "train_speed(iter/s)": 1.058997 + }, + { + "epoch": 0.6086175942549371, + "grad_norm": 0.45180100202560425, + "learning_rate": 3.636725827460266e-06, + "loss": 0.002875033998861909, + "memory(GiB)": 13.08, + "step": 339, + "token_acc": 1.0, + "train_speed(iter/s)": 1.059193 + }, + { + 
"epoch": 0.6104129263913824, + "grad_norm": 1.6877645254135132, + "learning_rate": 3.608181380412005e-06, + "loss": 0.015995891764760017, + "memory(GiB)": 13.08, + "step": 340, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.059384 + }, + { + "epoch": 0.6122082585278277, + "grad_norm": 1.410149335861206, + "learning_rate": 3.579686020734075e-06, + "loss": 0.014103513211011887, + "memory(GiB)": 13.08, + "step": 341, + "token_acc": 1.0, + "train_speed(iter/s)": 1.059566 + }, + { + "epoch": 0.6140035906642729, + "grad_norm": 0.3852950632572174, + "learning_rate": 3.551240753415382e-06, + "loss": 0.0018481770530343056, + "memory(GiB)": 13.08, + "step": 342, + "token_acc": 1.0, + "train_speed(iter/s)": 1.059767 + }, + { + "epoch": 0.6157989228007181, + "grad_norm": 1.8844478130340576, + "learning_rate": 3.5228465816781497e-06, + "loss": 0.009936138056218624, + "memory(GiB)": 13.08, + "step": 343, + "token_acc": 1.0, + "train_speed(iter/s)": 1.059992 + }, + { + "epoch": 0.6175942549371634, + "grad_norm": 1.1303751468658447, + "learning_rate": 3.4945045069425387e-06, + "loss": 0.004107779823243618, + "memory(GiB)": 13.08, + "step": 344, + "token_acc": 1.0, + "train_speed(iter/s)": 1.060167 + }, + { + "epoch": 0.6193895870736086, + "grad_norm": 1.322853446006775, + "learning_rate": 3.4662155287913235e-06, + "loss": 0.01510065607726574, + "memory(GiB)": 13.08, + "step": 345, + "token_acc": 1.0, + "train_speed(iter/s)": 1.060384 + }, + { + "epoch": 0.6211849192100538, + "grad_norm": 0.9100804328918457, + "learning_rate": 3.4379806449346416e-06, + "loss": 0.027893947437405586, + "memory(GiB)": 13.08, + "step": 346, + "token_acc": 1.0, + "train_speed(iter/s)": 1.060606 + }, + { + "epoch": 0.6229802513464991, + "grad_norm": 0.563826322555542, + "learning_rate": 3.409800851174805e-06, + "loss": 0.0048735919408500195, + "memory(GiB)": 13.08, + "step": 347, + "token_acc": 1.0, + "train_speed(iter/s)": 1.060809 + }, + { + "epoch": 0.6247755834829444, + "grad_norm": 0.7323080897331238, + "learning_rate": 3.3816771413711814e-06, + "loss": 0.016326840966939926, + "memory(GiB)": 13.08, + "step": 348, + "token_acc": 1.0, + "train_speed(iter/s)": 1.060976 + }, + { + "epoch": 0.6265709156193896, + "grad_norm": 0.10170638561248779, + "learning_rate": 3.3536105074051393e-06, + "loss": 0.001500750076957047, + "memory(GiB)": 13.08, + "step": 349, + "token_acc": 1.0, + "train_speed(iter/s)": 1.061187 + }, + { + "epoch": 0.6283662477558348, + "grad_norm": 1.1254711151123047, + "learning_rate": 3.3256019391450696e-06, + "loss": 0.004378891550004482, + "memory(GiB)": 13.08, + "step": 350, + "token_acc": 1.0, + "train_speed(iter/s)": 1.061396 + }, + { + "epoch": 0.63016157989228, + "grad_norm": 0.34284642338752747, + "learning_rate": 3.2976524244114704e-06, + "loss": 0.0009745637653395534, + "memory(GiB)": 13.08, + "step": 351, + "token_acc": 1.0, + "train_speed(iter/s)": 1.061593 + }, + { + "epoch": 0.6319569120287253, + "grad_norm": 0.10504211485385895, + "learning_rate": 3.2697629489421097e-06, + "loss": 0.0015449959319084883, + "memory(GiB)": 13.08, + "step": 352, + "token_acc": 1.0, + "train_speed(iter/s)": 1.061764 + }, + { + "epoch": 0.6337522441651705, + "grad_norm": 3.402330160140991, + "learning_rate": 3.2419344963572603e-06, + "loss": 0.00964401662349701, + "memory(GiB)": 13.08, + "step": 353, + "token_acc": 1.0, + "train_speed(iter/s)": 1.061961 + }, + { + "epoch": 0.6355475763016158, + "grad_norm": 0.567754328250885, + "learning_rate": 3.214168048125009e-06, + "loss": 0.00816022977232933, + 
"memory(GiB)": 13.08, + "step": 354, + "token_acc": 1.0, + "train_speed(iter/s)": 1.062135 + }, + { + "epoch": 0.6373429084380611, + "grad_norm": 0.7188345193862915, + "learning_rate": 3.1864645835266426e-06, + "loss": 0.013497605919837952, + "memory(GiB)": 13.08, + "step": 355, + "token_acc": 1.0, + "train_speed(iter/s)": 1.062304 + }, + { + "epoch": 0.6391382405745063, + "grad_norm": 1.3711332082748413, + "learning_rate": 3.1588250796221065e-06, + "loss": 0.0018809286411851645, + "memory(GiB)": 13.08, + "step": 356, + "token_acc": 1.0, + "train_speed(iter/s)": 1.062429 + }, + { + "epoch": 0.6409335727109515, + "grad_norm": 1.239640712738037, + "learning_rate": 3.13125051121555e-06, + "loss": 0.006427587475627661, + "memory(GiB)": 13.08, + "step": 357, + "token_acc": 1.0, + "train_speed(iter/s)": 1.062585 + }, + { + "epoch": 0.6427289048473968, + "grad_norm": 0.025559641420841217, + "learning_rate": 3.103741850820944e-06, + "loss": 0.00034689123276621103, + "memory(GiB)": 13.08, + "step": 358, + "token_acc": 1.0, + "train_speed(iter/s)": 1.062777 + }, + { + "epoch": 0.644524236983842, + "grad_norm": 1.0653990507125854, + "learning_rate": 3.0763000686277754e-06, + "loss": 0.016017021611332893, + "memory(GiB)": 13.08, + "step": 359, + "token_acc": 1.0, + "train_speed(iter/s)": 1.062978 + }, + { + "epoch": 0.6463195691202872, + "grad_norm": 1.222447395324707, + "learning_rate": 3.0489261324668486e-06, + "loss": 0.011437255889177322, + "memory(GiB)": 13.08, + "step": 360, + "token_acc": 1.0, + "train_speed(iter/s)": 1.063175 + }, + { + "epoch": 0.6481149012567325, + "grad_norm": 1.3343725204467773, + "learning_rate": 3.0216210077761287e-06, + "loss": 0.010193916969001293, + "memory(GiB)": 13.08, + "step": 361, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.06334 + }, + { + "epoch": 0.6499102333931778, + "grad_norm": 1.5795645713806152, + "learning_rate": 2.994385657566709e-06, + "loss": 0.016673408448696136, + "memory(GiB)": 13.08, + "step": 362, + "token_acc": 1.0, + "train_speed(iter/s)": 1.063539 + }, + { + "epoch": 0.651705565529623, + "grad_norm": 0.3886328339576721, + "learning_rate": 2.967221042388838e-06, + "loss": 0.005804315209388733, + "memory(GiB)": 13.08, + "step": 363, + "token_acc": 1.0, + "train_speed(iter/s)": 1.06374 + }, + { + "epoch": 0.6535008976660682, + "grad_norm": 0.5594945549964905, + "learning_rate": 2.940128120298048e-06, + "loss": 0.008250754326581955, + "memory(GiB)": 13.08, + "step": 364, + "token_acc": 1.0, + "train_speed(iter/s)": 1.063943 + }, + { + "epoch": 0.6552962298025135, + "grad_norm": 0.7763862013816833, + "learning_rate": 2.9131078468213547e-06, + "loss": 0.024658694863319397, + "memory(GiB)": 13.08, + "step": 365, + "token_acc": 1.0, + "train_speed(iter/s)": 1.0641 + }, + { + "epoch": 0.6570915619389587, + "grad_norm": 0.9000593423843384, + "learning_rate": 2.8861611749235767e-06, + "loss": 0.007464740425348282, + "memory(GiB)": 13.08, + "step": 366, + "token_acc": 1.0, + "train_speed(iter/s)": 1.064274 + }, + { + "epoch": 0.6588868940754039, + "grad_norm": 0.5618321299552917, + "learning_rate": 2.8592890549737052e-06, + "loss": 0.011171810328960419, + "memory(GiB)": 13.08, + "step": 367, + "token_acc": 1.0, + "train_speed(iter/s)": 1.064469 + }, + { + "epoch": 0.6606822262118492, + "grad_norm": 0.5419095754623413, + "learning_rate": 2.8324924347114012e-06, + "loss": 0.00791275966912508, + "memory(GiB)": 13.08, + "step": 368, + "token_acc": 1.0, + "train_speed(iter/s)": 1.064655 + }, + { + "epoch": 0.6624775583482945, + "grad_norm": 
1.0393513441085815, + "learning_rate": 2.8057722592135605e-06, + "loss": 0.012819621711969376, + "memory(GiB)": 13.08, + "step": 369, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.064841 + }, + { + "epoch": 0.6642728904847397, + "grad_norm": 0.9286229014396667, + "learning_rate": 2.779129470860991e-06, + "loss": 0.011815537698566914, + "memory(GiB)": 13.08, + "step": 370, + "token_acc": 1.0, + "train_speed(iter/s)": 1.065025 + }, + { + "epoch": 0.6660682226211849, + "grad_norm": 0.667698323726654, + "learning_rate": 2.7525650093051607e-06, + "loss": 0.0029592120554298162, + "memory(GiB)": 13.08, + "step": 371, + "token_acc": 1.0, + "train_speed(iter/s)": 1.065214 + }, + { + "epoch": 0.6678635547576302, + "grad_norm": 0.9357712864875793, + "learning_rate": 2.726079811435079e-06, + "loss": 0.023475248366594315, + "memory(GiB)": 13.08, + "step": 372, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.065412 + }, + { + "epoch": 0.6696588868940754, + "grad_norm": 2.2222044467926025, + "learning_rate": 2.6996748113442397e-06, + "loss": 0.012070901691913605, + "memory(GiB)": 13.08, + "step": 373, + "token_acc": 1.0, + "train_speed(iter/s)": 1.065592 + }, + { + "epoch": 0.6714542190305206, + "grad_norm": 0.7210403680801392, + "learning_rate": 2.673350940297677e-06, + "loss": 0.006681766360998154, + "memory(GiB)": 13.08, + "step": 374, + "token_acc": 1.0, + "train_speed(iter/s)": 1.065793 + }, + { + "epoch": 0.6732495511669659, + "grad_norm": 0.2000521570444107, + "learning_rate": 2.64710912669913e-06, + "loss": 0.00258549558930099, + "memory(GiB)": 13.08, + "step": 375, + "token_acc": 1.0, + "train_speed(iter/s)": 1.065992 + }, + { + "epoch": 0.6750448833034112, + "grad_norm": 1.6960115432739258, + "learning_rate": 2.62095029605829e-06, + "loss": 0.010029124096035957, + "memory(GiB)": 13.08, + "step": 376, + "token_acc": 1.0, + "train_speed(iter/s)": 1.066155 + }, + { + "epoch": 0.6768402154398564, + "grad_norm": 2.0170063972473145, + "learning_rate": 2.594875370958163e-06, + "loss": 0.015618421137332916, + "memory(GiB)": 13.08, + "step": 377, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.066346 + }, + { + "epoch": 0.6786355475763016, + "grad_norm": 0.5755393505096436, + "learning_rate": 2.5688852710225316e-06, + "loss": 0.013920176774263382, + "memory(GiB)": 13.08, + "step": 378, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.066544 + }, + { + "epoch": 0.6804308797127468, + "grad_norm": 0.6749535799026489, + "learning_rate": 2.5429809128835247e-06, + "loss": 0.009063899517059326, + "memory(GiB)": 13.08, + "step": 379, + "token_acc": 1.0, + "train_speed(iter/s)": 1.066713 + }, + { + "epoch": 0.6822262118491921, + "grad_norm": 0.9033764600753784, + "learning_rate": 2.517163210149279e-06, + "loss": 0.016781218349933624, + "memory(GiB)": 13.08, + "step": 380, + "token_acc": 1.0, + "train_speed(iter/s)": 1.066903 + }, + { + "epoch": 0.6840215439856373, + "grad_norm": 0.3434348702430725, + "learning_rate": 2.4914330733717323e-06, + "loss": 0.009626403450965881, + "memory(GiB)": 13.08, + "step": 381, + "token_acc": 1.0, + "train_speed(iter/s)": 1.067082 + }, + { + "epoch": 0.6858168761220825, + "grad_norm": 0.7772487998008728, + "learning_rate": 2.4657914100144954e-06, + "loss": 0.02452985942363739, + "memory(GiB)": 13.08, + "step": 382, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.067269 + }, + { + "epoch": 0.6876122082585279, + "grad_norm": 0.9500385522842407, + "learning_rate": 2.440239124420853e-06, + "loss": 
0.02504763752222061, + "memory(GiB)": 13.08, + "step": 383, + "token_acc": 0.963855421686747, + "train_speed(iter/s)": 1.067459 + }, + { + "epoch": 0.6894075403949731, + "grad_norm": 0.7816272377967834, + "learning_rate": 2.414777117781875e-06, + "loss": 0.017525769770145416, + "memory(GiB)": 13.08, + "step": 384, + "token_acc": 1.0, + "train_speed(iter/s)": 1.067639 + }, + { + "epoch": 0.6912028725314183, + "grad_norm": 0.8934208154678345, + "learning_rate": 2.3894062881046203e-06, + "loss": 0.009042469784617424, + "memory(GiB)": 13.08, + "step": 385, + "token_acc": 0.9875, + "train_speed(iter/s)": 1.067782 + }, + { + "epoch": 0.6929982046678635, + "grad_norm": 1.5824229717254639, + "learning_rate": 2.364127530180478e-06, + "loss": 0.009145749732851982, + "memory(GiB)": 13.08, + "step": 386, + "token_acc": 1.0, + "train_speed(iter/s)": 1.067963 + }, + { + "epoch": 0.6947935368043088, + "grad_norm": 0.6038988828659058, + "learning_rate": 2.3389417355535966e-06, + "loss": 0.013612976297736168, + "memory(GiB)": 13.08, + "step": 387, + "token_acc": 1.0, + "train_speed(iter/s)": 1.068133 + }, + { + "epoch": 0.696588868940754, + "grad_norm": 1.5376991033554077, + "learning_rate": 2.313849792489456e-06, + "loss": 0.015092341229319572, + "memory(GiB)": 13.08, + "step": 388, + "token_acc": 1.0, + "train_speed(iter/s)": 1.068317 + }, + { + "epoch": 0.6983842010771992, + "grad_norm": 1.0677285194396973, + "learning_rate": 2.2888525859435253e-06, + "loss": 0.005899708718061447, + "memory(GiB)": 13.08, + "step": 389, + "token_acc": 1.0, + "train_speed(iter/s)": 1.0685 + }, + { + "epoch": 0.7001795332136446, + "grad_norm": 0.33772847056388855, + "learning_rate": 2.2639509975300566e-06, + "loss": 0.006135785952210426, + "memory(GiB)": 13.08, + "step": 390, + "token_acc": 1.0, + "train_speed(iter/s)": 1.068677 + }, + { + "epoch": 0.7019748653500898, + "grad_norm": 0.822481632232666, + "learning_rate": 2.239145905490999e-06, + "loss": 0.007328001782298088, + "memory(GiB)": 13.08, + "step": 391, + "token_acc": 1.0, + "train_speed(iter/s)": 1.068822 + }, + { + "epoch": 0.703770197486535, + "grad_norm": 0.610440194606781, + "learning_rate": 2.2144381846650114e-06, + "loss": 0.009720534086227417, + "memory(GiB)": 13.08, + "step": 392, + "token_acc": 1.0, + "train_speed(iter/s)": 1.068989 + }, + { + "epoch": 0.7055655296229802, + "grad_norm": 0.2826550006866455, + "learning_rate": 2.1898287064566214e-06, + "loss": 0.0070832036435604095, + "memory(GiB)": 13.08, + "step": 393, + "token_acc": 1.0, + "train_speed(iter/s)": 1.069158 + }, + { + "epoch": 0.7073608617594255, + "grad_norm": 0.4414781928062439, + "learning_rate": 2.165318338805481e-06, + "loss": 0.008607835508883, + "memory(GiB)": 13.08, + "step": 394, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.069304 + }, + { + "epoch": 0.7091561938958707, + "grad_norm": 0.26323202252388, + "learning_rate": 2.1409079461557625e-06, + "loss": 0.0023532358463853598, + "memory(GiB)": 13.08, + "step": 395, + "token_acc": 1.0, + "train_speed(iter/s)": 1.069466 + }, + { + "epoch": 0.7109515260323159, + "grad_norm": 2.6713178157806396, + "learning_rate": 2.1165983894256647e-06, + "loss": 0.016681954264640808, + "memory(GiB)": 13.08, + "step": 396, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 1.0696 + }, + { + "epoch": 0.7127468581687613, + "grad_norm": 0.213148832321167, + "learning_rate": 2.0923905259770626e-06, + "loss": 0.002749420003965497, + "memory(GiB)": 13.08, + "step": 397, + "token_acc": 1.0, + "train_speed(iter/s)": 1.069775 + }, + 
{ + "epoch": 0.7145421903052065, + "grad_norm": 0.7741344571113586, + "learning_rate": 2.068285209585251e-06, + "loss": 0.007679448463022709, + "memory(GiB)": 13.08, + "step": 398, + "token_acc": 1.0, + "train_speed(iter/s)": 1.06995 + }, + { + "epoch": 0.7163375224416517, + "grad_norm": 0.9930524230003357, + "learning_rate": 2.0442832904088474e-06, + "loss": 0.007036359049379826, + "memory(GiB)": 13.08, + "step": 399, + "token_acc": 1.0, + "train_speed(iter/s)": 1.070129 + }, + { + "epoch": 0.718132854578097, + "grad_norm": 2.348698139190674, + "learning_rate": 2.020385614959806e-06, + "loss": 0.012709174305200577, + "memory(GiB)": 13.08, + "step": 400, + "token_acc": 0.9876543209876543, + "train_speed(iter/s)": 1.070318 + }, + { + "epoch": 0.7199281867145422, + "grad_norm": 1.2392982244491577, + "learning_rate": 1.996593026073555e-06, + "loss": 0.018436726182699203, + "memory(GiB)": 13.08, + "step": 401, + "token_acc": 1.0, + "train_speed(iter/s)": 1.070435 + }, + { + "epoch": 0.7217235188509874, + "grad_norm": 0.5735259056091309, + "learning_rate": 1.972906362879277e-06, + "loss": 0.0042767999693751335, + "memory(GiB)": 13.08, + "step": 402, + "token_acc": 1.0, + "train_speed(iter/s)": 1.070537 + }, + { + "epoch": 0.7235188509874326, + "grad_norm": 0.8973569869995117, + "learning_rate": 1.9493264607703138e-06, + "loss": 0.017068825662136078, + "memory(GiB)": 13.08, + "step": 403, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.070676 + }, + { + "epoch": 0.725314183123878, + "grad_norm": 0.5175889134407043, + "learning_rate": 1.925854151374705e-06, + "loss": 0.013104497455060482, + "memory(GiB)": 13.08, + "step": 404, + "token_acc": 1.0, + "train_speed(iter/s)": 1.07078 + }, + { + "epoch": 0.7271095152603232, + "grad_norm": 0.3876878619194031, + "learning_rate": 1.9024902625258502e-06, + "loss": 0.006141915451735258, + "memory(GiB)": 13.08, + "step": 405, + "token_acc": 0.9883720930232558, + "train_speed(iter/s)": 1.070921 + }, + { + "epoch": 0.7289048473967684, + "grad_norm": 0.37326645851135254, + "learning_rate": 1.879235618233325e-06, + "loss": 0.009587346576154232, + "memory(GiB)": 13.08, + "step": 406, + "token_acc": 1.0, + "train_speed(iter/s)": 1.07107 + }, + { + "epoch": 0.7307001795332136, + "grad_norm": 0.748358428478241, + "learning_rate": 1.856091038653805e-06, + "loss": 0.01084985677152872, + "memory(GiB)": 13.08, + "step": 407, + "token_acc": 1.0, + "train_speed(iter/s)": 1.071233 + }, + { + "epoch": 0.7324955116696589, + "grad_norm": 0.48986369371414185, + "learning_rate": 1.8330573400621493e-06, + "loss": 0.003840287681668997, + "memory(GiB)": 13.08, + "step": 408, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.071373 + }, + { + "epoch": 0.7342908438061041, + "grad_norm": 4.525958061218262, + "learning_rate": 1.8101353348226141e-06, + "loss": 0.01381802000105381, + "memory(GiB)": 13.08, + "step": 409, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.071537 + }, + { + "epoch": 0.7360861759425493, + "grad_norm": 0.906284511089325, + "learning_rate": 1.787325831360191e-06, + "loss": 0.010759145952761173, + "memory(GiB)": 13.08, + "step": 410, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 1.071701 + }, + { + "epoch": 0.7378815080789947, + "grad_norm": 0.8138025403022766, + "learning_rate": 1.7646296341321068e-06, + "loss": 0.026156552135944366, + "memory(GiB)": 13.08, + "step": 411, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.071859 + }, + { + "epoch": 0.7396768402154399, + "grad_norm": 
0.07587321102619171, + "learning_rate": 1.7420475435994406e-06, + "loss": 0.0009182463400065899, + "memory(GiB)": 13.08, + "step": 412, + "token_acc": 1.0, + "train_speed(iter/s)": 1.072036 + }, + { + "epoch": 0.7414721723518851, + "grad_norm": 0.5184074640274048, + "learning_rate": 1.719580356198905e-06, + "loss": 0.005103666335344315, + "memory(GiB)": 13.08, + "step": 413, + "token_acc": 1.0, + "train_speed(iter/s)": 1.072196 + }, + { + "epoch": 0.7432675044883303, + "grad_norm": 1.3357967138290405, + "learning_rate": 1.6972288643147444e-06, + "loss": 0.007269741035997868, + "memory(GiB)": 13.08, + "step": 414, + "token_acc": 1.0, + "train_speed(iter/s)": 1.072359 + }, + { + "epoch": 0.7450628366247756, + "grad_norm": 0.18689586222171783, + "learning_rate": 1.6749938562507957e-06, + "loss": 0.0023068576119840145, + "memory(GiB)": 13.08, + "step": 415, + "token_acc": 1.0, + "train_speed(iter/s)": 1.072531 + }, + { + "epoch": 0.7468581687612208, + "grad_norm": 1.077985405921936, + "learning_rate": 1.6528761162026885e-06, + "loss": 0.015352905727922916, + "memory(GiB)": 13.08, + "step": 416, + "token_acc": 0.9876543209876543, + "train_speed(iter/s)": 1.072677 + }, + { + "epoch": 0.748653500897666, + "grad_norm": 0.327054888010025, + "learning_rate": 1.6308764242301788e-06, + "loss": 0.0031434802804142237, + "memory(GiB)": 13.08, + "step": 417, + "token_acc": 1.0, + "train_speed(iter/s)": 1.072815 + }, + { + "epoch": 0.7504488330341114, + "grad_norm": 0.4002721905708313, + "learning_rate": 1.6089955562296484e-06, + "loss": 0.01052124798297882, + "memory(GiB)": 13.08, + "step": 418, + "token_acc": 1.0, + "train_speed(iter/s)": 1.072894 + }, + { + "epoch": 0.7522441651705566, + "grad_norm": 0.3296772837638855, + "learning_rate": 1.5872342839067305e-06, + "loss": 0.0038328436203300953, + "memory(GiB)": 13.08, + "step": 419, + "token_acc": 1.0, + "train_speed(iter/s)": 1.073033 + }, + { + "epoch": 0.7540394973070018, + "grad_norm": 0.4562620520591736, + "learning_rate": 1.5655933747490975e-06, + "loss": 0.00482065649703145, + "memory(GiB)": 13.08, + "step": 420, + "token_acc": 1.0, + "train_speed(iter/s)": 1.07317 + }, + { + "epoch": 0.755834829443447, + "grad_norm": 2.2710041999816895, + "learning_rate": 1.544073591999391e-06, + "loss": 0.008696787059307098, + "memory(GiB)": 13.08, + "step": 421, + "token_acc": 0.9875, + "train_speed(iter/s)": 1.073308 + }, + { + "epoch": 0.7576301615798923, + "grad_norm": 0.5438786745071411, + "learning_rate": 1.52267569462831e-06, + "loss": 0.004525571130216122, + "memory(GiB)": 13.08, + "step": 422, + "token_acc": 1.0, + "train_speed(iter/s)": 1.073427 + }, + { + "epoch": 0.7594254937163375, + "grad_norm": 0.1277576982975006, + "learning_rate": 1.5014004373078295e-06, + "loss": 0.0009216235484927893, + "memory(GiB)": 13.08, + "step": 423, + "token_acc": 1.0, + "train_speed(iter/s)": 1.073526 + }, + { + "epoch": 0.7612208258527827, + "grad_norm": 0.12262847274541855, + "learning_rate": 1.4802485703846015e-06, + "loss": 0.0007877249736338854, + "memory(GiB)": 13.08, + "step": 424, + "token_acc": 1.0, + "train_speed(iter/s)": 1.073664 + }, + { + "epoch": 0.7630161579892281, + "grad_norm": 0.47984498739242554, + "learning_rate": 1.4592208398534747e-06, + "loss": 0.008510983549058437, + "memory(GiB)": 13.08, + "step": 425, + "token_acc": 1.0, + "train_speed(iter/s)": 1.073781 + }, + { + "epoch": 0.7648114901256733, + "grad_norm": 1.3471852540969849, + "learning_rate": 1.4383179873311947e-06, + "loss": 0.02714439108967781, + "memory(GiB)": 13.08, + "step": 426, + 
"token_acc": 0.9882352941176471, + "train_speed(iter/s)": 1.073912 + }, + { + "epoch": 0.7666068222621185, + "grad_norm": 0.3143499791622162, + "learning_rate": 1.417540750030249e-06, + "loss": 0.005050563719123602, + "memory(GiB)": 13.08, + "step": 427, + "token_acc": 0.9875, + "train_speed(iter/s)": 1.074038 + }, + { + "epoch": 0.7684021543985637, + "grad_norm": 0.3487984538078308, + "learning_rate": 1.3968898607328573e-06, + "loss": 0.0054231989197432995, + "memory(GiB)": 13.08, + "step": 428, + "token_acc": 0.9883720930232558, + "train_speed(iter/s)": 1.074178 + }, + { + "epoch": 0.770197486535009, + "grad_norm": 2.098447322845459, + "learning_rate": 1.3763660477651397e-06, + "loss": 0.01812714897096157, + "memory(GiB)": 13.08, + "step": 429, + "token_acc": 1.0, + "train_speed(iter/s)": 1.074317 + }, + { + "epoch": 0.7719928186714542, + "grad_norm": 0.25104638934135437, + "learning_rate": 1.3559700349714167e-06, + "loss": 0.0029485849663615227, + "memory(GiB)": 13.08, + "step": 430, + "token_acc": 1.0, + "train_speed(iter/s)": 1.074466 + }, + { + "epoch": 0.7737881508078994, + "grad_norm": 0.28429216146469116, + "learning_rate": 1.3357025416886932e-06, + "loss": 0.002281719585880637, + "memory(GiB)": 13.08, + "step": 431, + "token_acc": 1.0, + "train_speed(iter/s)": 1.074611 + }, + { + "epoch": 0.7755834829443446, + "grad_norm": 0.35956239700317383, + "learning_rate": 1.3155642827212788e-06, + "loss": 0.0056508975103497505, + "memory(GiB)": 13.08, + "step": 432, + "token_acc": 1.0, + "train_speed(iter/s)": 1.074737 + }, + { + "epoch": 0.77737881508079, + "grad_norm": 0.12542538344860077, + "learning_rate": 1.29555596831558e-06, + "loss": 0.0020140784326940775, + "memory(GiB)": 13.08, + "step": 433, + "token_acc": 1.0, + "train_speed(iter/s)": 1.074868 + }, + { + "epoch": 0.7791741472172352, + "grad_norm": 0.8066336512565613, + "learning_rate": 1.2756783041350568e-06, + "loss": 0.0037097863387316465, + "memory(GiB)": 13.08, + "step": 434, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.075006 + }, + { + "epoch": 0.7809694793536804, + "grad_norm": 0.013342740945518017, + "learning_rate": 1.2559319912353253e-06, + "loss": 0.00019833751139231026, + "memory(GiB)": 13.08, + "step": 435, + "token_acc": 1.0, + "train_speed(iter/s)": 1.075144 + }, + { + "epoch": 0.7827648114901257, + "grad_norm": 0.4184013605117798, + "learning_rate": 1.2363177260394415e-06, + "loss": 0.004341490566730499, + "memory(GiB)": 13.08, + "step": 436, + "token_acc": 1.0, + "train_speed(iter/s)": 1.075279 + }, + { + "epoch": 0.7845601436265709, + "grad_norm": 0.10948548465967178, + "learning_rate": 1.2168362003133316e-06, + "loss": 0.001354630570858717, + "memory(GiB)": 13.08, + "step": 437, + "token_acc": 1.0, + "train_speed(iter/s)": 1.075427 + }, + { + "epoch": 0.7863554757630161, + "grad_norm": 0.4735148549079895, + "learning_rate": 1.1974881011414046e-06, + "loss": 0.00516703212633729, + "memory(GiB)": 13.08, + "step": 438, + "token_acc": 1.0, + "train_speed(iter/s)": 1.075576 + }, + { + "epoch": 0.7881508078994613, + "grad_norm": 0.20596560835838318, + "learning_rate": 1.1782741109023039e-06, + "loss": 0.0023033865727484226, + "memory(GiB)": 13.08, + "step": 439, + "token_acc": 1.0, + "train_speed(iter/s)": 1.075707 + }, + { + "epoch": 0.7899461400359067, + "grad_norm": 0.2670016288757324, + "learning_rate": 1.159194907244859e-06, + "loss": 0.0029851882718503475, + "memory(GiB)": 13.08, + "step": 440, + "token_acc": 1.0, + "train_speed(iter/s)": 1.075845 + }, + { + "epoch": 0.7917414721723519, + 
"grad_norm": 2.4961907863616943, + "learning_rate": 1.140251163064175e-06, + "loss": 0.024585047736763954, + "memory(GiB)": 13.08, + "step": 441, + "token_acc": 1.0, + "train_speed(iter/s)": 1.075942 + }, + { + "epoch": 0.7935368043087971, + "grad_norm": 1.1317187547683716, + "learning_rate": 1.1214435464779006e-06, + "loss": 0.00982555653899908, + "memory(GiB)": 13.08, + "step": 442, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.076073 + }, + { + "epoch": 0.7953321364452424, + "grad_norm": 0.9067276120185852, + "learning_rate": 1.102772720802671e-06, + "loss": 0.013734947890043259, + "memory(GiB)": 13.08, + "step": 443, + "token_acc": 1.0, + "train_speed(iter/s)": 1.076211 + }, + { + "epoch": 0.7971274685816876, + "grad_norm": 0.38522371649742126, + "learning_rate": 1.0842393445307065e-06, + "loss": 0.0016170066082850099, + "memory(GiB)": 13.08, + "step": 444, + "token_acc": 1.0, + "train_speed(iter/s)": 1.076355 + }, + { + "epoch": 0.7989228007181328, + "grad_norm": 0.4796266257762909, + "learning_rate": 1.0658440713065915e-06, + "loss": 0.0040134042501449585, + "memory(GiB)": 13.08, + "step": 445, + "token_acc": 1.0, + "train_speed(iter/s)": 1.076489 + }, + { + "epoch": 0.800718132854578, + "grad_norm": 1.127416968345642, + "learning_rate": 1.047587549904222e-06, + "loss": 0.021780934184789658, + "memory(GiB)": 13.08, + "step": 446, + "token_acc": 0.9759036144578314, + "train_speed(iter/s)": 1.076607 + }, + { + "epoch": 0.8025134649910234, + "grad_norm": 0.6868329048156738, + "learning_rate": 1.0294704242039271e-06, + "loss": 0.006480331066995859, + "memory(GiB)": 13.08, + "step": 447, + "token_acc": 1.0, + "train_speed(iter/s)": 1.076744 + }, + { + "epoch": 0.8043087971274686, + "grad_norm": 0.509554386138916, + "learning_rate": 1.0114933331697514e-06, + "loss": 0.019758323207497597, + "memory(GiB)": 13.08, + "step": 448, + "token_acc": 1.0, + "train_speed(iter/s)": 1.076878 + }, + { + "epoch": 0.8061041292639138, + "grad_norm": 2.4400320053100586, + "learning_rate": 9.936569108269306e-07, + "loss": 0.01969185099005699, + "memory(GiB)": 13.08, + "step": 449, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 1.077006 + }, + { + "epoch": 0.8078994614003591, + "grad_norm": 2.120880365371704, + "learning_rate": 9.75961786239522e-07, + "loss": 0.02465156279504299, + "memory(GiB)": 13.08, + "step": 450, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 1.077132 + }, + { + "epoch": 0.8096947935368043, + "grad_norm": 1.2014790773391724, + "learning_rate": 9.584085834882206e-07, + "loss": 0.010115174576640129, + "memory(GiB)": 13.08, + "step": 451, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.077267 + }, + { + "epoch": 0.8114901256732495, + "grad_norm": 0.12177525460720062, + "learning_rate": 9.409979216483539e-07, + "loss": 0.0012937367428094149, + "memory(GiB)": 13.08, + "step": 452, + "token_acc": 1.0, + "train_speed(iter/s)": 1.077407 + }, + { + "epoch": 0.8132854578096947, + "grad_norm": 1.024206519126892, + "learning_rate": 9.237304147680376e-07, + "loss": 0.013832255266606808, + "memory(GiB)": 13.08, + "step": 453, + "token_acc": 1.0, + "train_speed(iter/s)": 1.077543 + }, + { + "epoch": 0.8150807899461401, + "grad_norm": 0.6342349052429199, + "learning_rate": 9.066066718465321e-07, + "loss": 0.0029691816307604313, + "memory(GiB)": 13.08, + "step": 454, + "token_acc": 1.0, + "train_speed(iter/s)": 1.077654 + }, + { + "epoch": 0.8168761220825853, + "grad_norm": 1.3011318445205688, + "learning_rate": 8.896272968127506e-07, + "loss": 
0.014536377042531967, + "memory(GiB)": 13.08, + "step": 455, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 1.077781 + }, + { + "epoch": 0.8186714542190305, + "grad_norm": 0.19679167866706848, + "learning_rate": 8.727928885039715e-07, + "loss": 0.0014049422461539507, + "memory(GiB)": 13.08, + "step": 456, + "token_acc": 1.0, + "train_speed(iter/s)": 1.07791 + }, + { + "epoch": 0.8204667863554758, + "grad_norm": 1.3811694383621216, + "learning_rate": 8.561040406447075e-07, + "loss": 0.01814611628651619, + "memory(GiB)": 13.08, + "step": 457, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 1.078043 + }, + { + "epoch": 0.822262118491921, + "grad_norm": 0.4900592565536499, + "learning_rate": 8.395613418257731e-07, + "loss": 0.00789629016071558, + "memory(GiB)": 13.08, + "step": 458, + "token_acc": 1.0, + "train_speed(iter/s)": 1.078173 + }, + { + "epoch": 0.8240574506283662, + "grad_norm": 0.48402440547943115, + "learning_rate": 8.231653754835262e-07, + "loss": 0.00701717147603631, + "memory(GiB)": 13.08, + "step": 459, + "token_acc": 1.0, + "train_speed(iter/s)": 1.078292 + }, + { + "epoch": 0.8258527827648114, + "grad_norm": 0.38999930024147034, + "learning_rate": 8.069167198792838e-07, + "loss": 0.004503586329519749, + "memory(GiB)": 13.08, + "step": 460, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 1.078422 + }, + { + "epoch": 0.8276481149012568, + "grad_norm": 0.8795774579048157, + "learning_rate": 7.908159480789379e-07, + "loss": 0.01637558825314045, + "memory(GiB)": 13.08, + "step": 461, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.078551 + }, + { + "epoch": 0.829443447037702, + "grad_norm": 0.1430443823337555, + "learning_rate": 7.748636279327349e-07, + "loss": 0.0018170611001551151, + "memory(GiB)": 13.08, + "step": 462, + "token_acc": 1.0, + "train_speed(iter/s)": 1.078632 + }, + { + "epoch": 0.8312387791741472, + "grad_norm": 0.5654152035713196, + "learning_rate": 7.590603220552539e-07, + "loss": 0.0028146901167929173, + "memory(GiB)": 13.08, + "step": 463, + "token_acc": 1.0, + "train_speed(iter/s)": 1.078754 + }, + { + "epoch": 0.8330341113105925, + "grad_norm": 0.7268990874290466, + "learning_rate": 7.434065878055613e-07, + "loss": 0.007614848203957081, + "memory(GiB)": 13.08, + "step": 464, + "token_acc": 1.0, + "train_speed(iter/s)": 1.078847 + }, + { + "epoch": 0.8348294434470377, + "grad_norm": 1.0380241870880127, + "learning_rate": 7.279029772675572e-07, + "loss": 0.008555954322218895, + "memory(GiB)": 13.08, + "step": 465, + "token_acc": 1.0, + "train_speed(iter/s)": 1.078975 + }, + { + "epoch": 0.8366247755834829, + "grad_norm": 0.6397264003753662, + "learning_rate": 7.125500372305017e-07, + "loss": 0.006588224787265062, + "memory(GiB)": 13.08, + "step": 466, + "token_acc": 1.0, + "train_speed(iter/s)": 1.079102 + }, + { + "epoch": 0.8384201077199281, + "grad_norm": 0.4453122615814209, + "learning_rate": 6.97348309169728e-07, + "loss": 0.001982884481549263, + "memory(GiB)": 13.08, + "step": 467, + "token_acc": 1.0, + "train_speed(iter/s)": 1.079228 + }, + { + "epoch": 0.8402154398563735, + "grad_norm": 0.48022353649139404, + "learning_rate": 6.822983292275525e-07, + "loss": 0.008024324662983418, + "memory(GiB)": 13.08, + "step": 468, + "token_acc": 1.0, + "train_speed(iter/s)": 1.079353 + }, + { + "epoch": 0.8420107719928187, + "grad_norm": 0.360468327999115, + "learning_rate": 6.674006281943551e-07, + "loss": 0.002145705744624138, + "memory(GiB)": 13.08, + "step": 469, + "token_acc": 1.0, + "train_speed(iter/s)": 
1.079449 + }, + { + "epoch": 0.8438061041292639, + "grad_norm": 0.7237960696220398, + "learning_rate": 6.526557314898718e-07, + "loss": 0.0017190808430314064, + "memory(GiB)": 13.08, + "step": 470, + "token_acc": 1.0, + "train_speed(iter/s)": 1.079469 + }, + { + "epoch": 0.8456014362657092, + "grad_norm": 0.5253114700317383, + "learning_rate": 6.380641591446524e-07, + "loss": 0.006616916507482529, + "memory(GiB)": 13.08, + "step": 471, + "token_acc": 1.0, + "train_speed(iter/s)": 1.079542 + }, + { + "epoch": 0.8473967684021544, + "grad_norm": 1.0282394886016846, + "learning_rate": 6.236264257817292e-07, + "loss": 0.003393652383238077, + "memory(GiB)": 13.08, + "step": 472, + "token_acc": 1.0, + "train_speed(iter/s)": 1.079626 + }, + { + "epoch": 0.8491921005385996, + "grad_norm": 0.21432991325855255, + "learning_rate": 6.093430405984585e-07, + "loss": 0.0026423251256346703, + "memory(GiB)": 13.08, + "step": 473, + "token_acc": 1.0, + "train_speed(iter/s)": 1.079731 + }, + { + "epoch": 0.8509874326750448, + "grad_norm": 0.16753339767456055, + "learning_rate": 5.952145073485699e-07, + "loss": 0.0014232571702450514, + "memory(GiB)": 13.08, + "step": 474, + "token_acc": 1.0, + "train_speed(iter/s)": 1.079832 + }, + { + "epoch": 0.8527827648114902, + "grad_norm": 0.29233884811401367, + "learning_rate": 5.812413243243936e-07, + "loss": 0.002251636702567339, + "memory(GiB)": 13.08, + "step": 475, + "token_acc": 1.0, + "train_speed(iter/s)": 1.079931 + }, + { + "epoch": 0.8545780969479354, + "grad_norm": 3.4666576385498047, + "learning_rate": 5.674239843392876e-07, + "loss": 0.015009160153567791, + "memory(GiB)": 13.08, + "step": 476, + "token_acc": 1.0, + "train_speed(iter/s)": 1.080029 + }, + { + "epoch": 0.8563734290843806, + "grad_norm": 3.7881627082824707, + "learning_rate": 5.537629747102613e-07, + "loss": 0.011072278022766113, + "memory(GiB)": 13.08, + "step": 477, + "token_acc": 1.0, + "train_speed(iter/s)": 1.080131 + }, + { + "epoch": 0.8581687612208259, + "grad_norm": 0.2701643407344818, + "learning_rate": 5.402587772407825e-07, + "loss": 0.0017975243972614408, + "memory(GiB)": 13.08, + "step": 478, + "token_acc": 1.0, + "train_speed(iter/s)": 1.08023 + }, + { + "epoch": 0.8599640933572711, + "grad_norm": 0.6450809240341187, + "learning_rate": 5.269118682037894e-07, + "loss": 0.016003768891096115, + "memory(GiB)": 13.08, + "step": 479, + "token_acc": 1.0, + "train_speed(iter/s)": 1.080348 + }, + { + "epoch": 0.8617594254937163, + "grad_norm": 0.2792893350124359, + "learning_rate": 5.137227183248894e-07, + "loss": 0.002935483818873763, + "memory(GiB)": 13.08, + "step": 480, + "token_acc": 1.0, + "train_speed(iter/s)": 1.08047 + }, + { + "epoch": 0.8635547576301615, + "grad_norm": 0.4301268458366394, + "learning_rate": 5.006917927657617e-07, + "loss": 0.014643090777099133, + "memory(GiB)": 13.08, + "step": 481, + "token_acc": 0.9878048780487805, + "train_speed(iter/s)": 1.080511 + }, + { + "epoch": 0.8653500897666068, + "grad_norm": 1.9902009963989258, + "learning_rate": 4.878195511077477e-07, + "loss": 0.007139670196920633, + "memory(GiB)": 13.08, + "step": 482, + "token_acc": 1.0, + "train_speed(iter/s)": 1.080627 + }, + { + "epoch": 0.8671454219030521, + "grad_norm": 0.7385199069976807, + "learning_rate": 4.7510644733564316e-07, + "loss": 0.010074608027935028, + "memory(GiB)": 13.08, + "step": 483, + "token_acc": 1.0, + "train_speed(iter/s)": 1.080743 + }, + { + "epoch": 0.8689407540394973, + "grad_norm": 0.23309572041034698, + "learning_rate": 4.625529298216896e-07, + "loss": 
0.000950617715716362, + "memory(GiB)": 13.08, + "step": 484, + "token_acc": 1.0, + "train_speed(iter/s)": 1.080856 + }, + { + "epoch": 0.8707360861759426, + "grad_norm": 0.42583540081977844, + "learning_rate": 4.501594413097571e-07, + "loss": 0.0033356505446135998, + "memory(GiB)": 13.08, + "step": 485, + "token_acc": 1.0, + "train_speed(iter/s)": 1.080975 + }, + { + "epoch": 0.8725314183123878, + "grad_norm": 0.7843057513237, + "learning_rate": 4.379264188997334e-07, + "loss": 0.022459395229816437, + "memory(GiB)": 13.08, + "step": 486, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.081081 + }, + { + "epoch": 0.874326750448833, + "grad_norm": 0.9665271639823914, + "learning_rate": 4.258542940321031e-07, + "loss": 0.02070029079914093, + "memory(GiB)": 13.08, + "step": 487, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.081171 + }, + { + "epoch": 0.8761220825852782, + "grad_norm": 0.4193180501461029, + "learning_rate": 4.139434924727359e-07, + "loss": 0.0020543683785945177, + "memory(GiB)": 13.08, + "step": 488, + "token_acc": 1.0, + "train_speed(iter/s)": 1.081244 + }, + { + "epoch": 0.8779174147217235, + "grad_norm": 1.000474214553833, + "learning_rate": 4.0219443429786567e-07, + "loss": 0.008471306413412094, + "memory(GiB)": 13.08, + "step": 489, + "token_acc": 1.0, + "train_speed(iter/s)": 1.081347 + }, + { + "epoch": 0.8797127468581688, + "grad_norm": 0.6812896132469177, + "learning_rate": 3.9060753387928365e-07, + "loss": 0.006206291262060404, + "memory(GiB)": 13.08, + "step": 490, + "token_acc": 1.0, + "train_speed(iter/s)": 1.081449 + }, + { + "epoch": 0.881508078994614, + "grad_norm": 0.6977695822715759, + "learning_rate": 3.791831998697121e-07, + "loss": 0.0099110072478652, + "memory(GiB)": 13.08, + "step": 491, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.081525 + }, + { + "epoch": 0.8833034111310593, + "grad_norm": 0.6331995129585266, + "learning_rate": 3.6792183518840174e-07, + "loss": 0.016056664288043976, + "memory(GiB)": 13.08, + "step": 492, + "token_acc": 0.9876543209876543, + "train_speed(iter/s)": 1.081624 + }, + { + "epoch": 0.8850987432675045, + "grad_norm": 1.1182827949523926, + "learning_rate": 3.5682383700691934e-07, + "loss": 0.0118872644379735, + "memory(GiB)": 13.08, + "step": 493, + "token_acc": 1.0, + "train_speed(iter/s)": 1.081727 + }, + { + "epoch": 0.8868940754039497, + "grad_norm": 1.3639663457870483, + "learning_rate": 3.458895967351328e-07, + "loss": 0.03659486398100853, + "memory(GiB)": 13.08, + "step": 494, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.081804 + }, + { + "epoch": 0.8886894075403949, + "grad_norm": 0.4585486352443695, + "learning_rate": 3.3511950000741756e-07, + "loss": 0.006072197575122118, + "memory(GiB)": 13.08, + "step": 495, + "token_acc": 1.0, + "train_speed(iter/s)": 1.081902 + }, + { + "epoch": 0.8904847396768402, + "grad_norm": 0.8180314898490906, + "learning_rate": 3.245139266690478e-07, + "loss": 0.006535394582897425, + "memory(GiB)": 13.08, + "step": 496, + "token_acc": 1.0, + "train_speed(iter/s)": 1.08201 + }, + { + "epoch": 0.8922800718132855, + "grad_norm": 1.3128434419631958, + "learning_rate": 3.140732507628047e-07, + "loss": 0.01039805170148611, + "memory(GiB)": 13.08, + "step": 497, + "token_acc": 1.0, + "train_speed(iter/s)": 1.082117 + }, + { + "epoch": 0.8940754039497307, + "grad_norm": 0.1865408569574356, + "learning_rate": 3.03797840515781e-07, + "loss": 0.0013660588301718235, + "memory(GiB)": 13.08, + "step": 498, + "token_acc": 1.0, + 
"train_speed(iter/s)": 1.082232 + }, + { + "epoch": 0.895870736086176, + "grad_norm": 0.5128012299537659, + "learning_rate": 2.936880583263968e-07, + "loss": 0.010234466753900051, + "memory(GiB)": 13.08, + "step": 499, + "token_acc": 1.0, + "train_speed(iter/s)": 1.082248 + }, + { + "epoch": 0.8976660682226212, + "grad_norm": 0.09025649726390839, + "learning_rate": 2.837442607516161e-07, + "loss": 0.0008039847016334534, + "memory(GiB)": 13.08, + "step": 500, + "token_acc": 1.0, + "train_speed(iter/s)": 1.082347 + }, + { + "epoch": 0.8976660682226212, + "eval_loss": 0.008655626326799393, + "eval_runtime": 3.5021, + "eval_samples_per_second": 25.699, + "eval_steps_per_second": 1.713, + "eval_token_acc": 0.9959859508278976, + "step": 500 + }, + { + "epoch": 0.8994614003590664, + "grad_norm": 0.8232449889183044, + "learning_rate": 2.7396679849437336e-07, + "loss": 0.012742449529469013, + "memory(GiB)": 13.08, + "step": 501, + "token_acc": 0.9931389365351629, + "train_speed(iter/s)": 1.05518 + }, + { + "epoch": 0.9012567324955116, + "grad_norm": 0.49068835377693176, + "learning_rate": 2.6435601639120457e-07, + "loss": 0.005841577425599098, + "memory(GiB)": 13.61, + "step": 502, + "token_acc": 1.0, + "train_speed(iter/s)": 1.055309 + }, + { + "epoch": 0.9030520646319569, + "grad_norm": 0.11432085186243057, + "learning_rate": 2.5491225340008306e-07, + "loss": 0.000775289605371654, + "memory(GiB)": 13.61, + "step": 503, + "token_acc": 1.0, + "train_speed(iter/s)": 1.055468 + }, + { + "epoch": 0.9048473967684022, + "grad_norm": 0.1800203174352646, + "learning_rate": 2.456358425884697e-07, + "loss": 0.0009322683326900005, + "memory(GiB)": 13.61, + "step": 504, + "token_acc": 1.0, + "train_speed(iter/s)": 1.055613 + }, + { + "epoch": 0.9066427289048474, + "grad_norm": 0.21769851446151733, + "learning_rate": 2.3652711112156113e-07, + "loss": 0.0023422411177307367, + "memory(GiB)": 13.61, + "step": 505, + "token_acc": 1.0, + "train_speed(iter/s)": 1.055772 + }, + { + "epoch": 0.9084380610412927, + "grad_norm": 0.8723134398460388, + "learning_rate": 2.2758638025075165e-07, + "loss": 0.007289735134691, + "memory(GiB)": 13.61, + "step": 506, + "token_acc": 1.0, + "train_speed(iter/s)": 1.055918 + }, + { + "epoch": 0.9102333931777379, + "grad_norm": 0.9933255910873413, + "learning_rate": 2.1881396530230915e-07, + "loss": 0.017019910737872124, + "memory(GiB)": 13.61, + "step": 507, + "token_acc": 1.0, + "train_speed(iter/s)": 1.056079 + }, + { + "epoch": 0.9120287253141831, + "grad_norm": 1.2228353023529053, + "learning_rate": 2.1021017566624446e-07, + "loss": 0.015079186297953129, + "memory(GiB)": 13.61, + "step": 508, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.05623 + }, + { + "epoch": 0.9138240574506283, + "grad_norm": 0.9666206240653992, + "learning_rate": 2.0177531478540957e-07, + "loss": 0.013778849504888058, + "memory(GiB)": 13.61, + "step": 509, + "token_acc": 0.9876543209876543, + "train_speed(iter/s)": 1.056394 + }, + { + "epoch": 0.9156193895870736, + "grad_norm": 1.708090901374817, + "learning_rate": 1.9350968014478667e-07, + "loss": 0.010647699236869812, + "memory(GiB)": 13.61, + "step": 510, + "token_acc": 1.0, + "train_speed(iter/s)": 1.05654 + }, + { + "epoch": 0.9174147217235189, + "grad_norm": 0.06319945305585861, + "learning_rate": 1.8541356326100436e-07, + "loss": 0.0006756841903552413, + "memory(GiB)": 13.61, + "step": 511, + "token_acc": 1.0, + "train_speed(iter/s)": 1.056696 + }, + { + "epoch": 0.9192100538599641, + "grad_norm": 1.419111728668213, + "learning_rate": 
1.7748724967204701e-07, + "loss": 0.014690598472952843, + "memory(GiB)": 13.61, + "step": 512, + "token_acc": 1.0, + "train_speed(iter/s)": 1.056854 + }, + { + "epoch": 0.9210053859964094, + "grad_norm": 0.08039558678865433, + "learning_rate": 1.6973101892719558e-07, + "loss": 0.0007967624696902931, + "memory(GiB)": 13.61, + "step": 513, + "token_acc": 1.0, + "train_speed(iter/s)": 1.056986 + }, + { + "epoch": 0.9228007181328546, + "grad_norm": 1.2945536375045776, + "learning_rate": 1.6214514457715768e-07, + "loss": 0.017802400514483452, + "memory(GiB)": 13.61, + "step": 514, + "token_acc": 0.9882352941176471, + "train_speed(iter/s)": 1.057139 + }, + { + "epoch": 0.9245960502692998, + "grad_norm": 0.13030825555324554, + "learning_rate": 1.5472989416442873e-07, + "loss": 0.0010276761604472995, + "memory(GiB)": 13.61, + "step": 515, + "token_acc": 1.0, + "train_speed(iter/s)": 1.057281 + }, + { + "epoch": 0.926391382405745, + "grad_norm": 0.6033478379249573, + "learning_rate": 1.4748552921384717e-07, + "loss": 0.004695372190326452, + "memory(GiB)": 13.61, + "step": 516, + "token_acc": 1.0, + "train_speed(iter/s)": 1.05742 + }, + { + "epoch": 0.9281867145421903, + "grad_norm": 0.22921574115753174, + "learning_rate": 1.4041230522337913e-07, + "loss": 0.00219919066876173, + "memory(GiB)": 13.61, + "step": 517, + "token_acc": 1.0, + "train_speed(iter/s)": 1.057539 + }, + { + "epoch": 0.9299820466786356, + "grad_norm": 0.35983556509017944, + "learning_rate": 1.3351047165510444e-07, + "loss": 0.011062784120440483, + "memory(GiB)": 13.61, + "step": 518, + "token_acc": 1.0, + "train_speed(iter/s)": 1.057677 + }, + { + "epoch": 0.9317773788150808, + "grad_norm": 1.0101219415664673, + "learning_rate": 1.2678027192641384e-07, + "loss": 0.006366117391735315, + "memory(GiB)": 13.61, + "step": 519, + "token_acc": 1.0, + "train_speed(iter/s)": 1.057828 + }, + { + "epoch": 0.933572710951526, + "grad_norm": 0.9938783645629883, + "learning_rate": 1.202219434014329e-07, + "loss": 0.01133299246430397, + "memory(GiB)": 13.61, + "step": 520, + "token_acc": 0.9875, + "train_speed(iter/s)": 1.057966 + }, + { + "epoch": 0.9353680430879713, + "grad_norm": 0.5778594017028809, + "learning_rate": 1.1383571738264154e-07, + "loss": 0.006956363562494516, + "memory(GiB)": 13.61, + "step": 521, + "token_acc": 1.0, + "train_speed(iter/s)": 1.05813 + }, + { + "epoch": 0.9371633752244165, + "grad_norm": 0.26744845509529114, + "learning_rate": 1.0762181910272396e-07, + "loss": 0.0030226772651076317, + "memory(GiB)": 13.61, + "step": 522, + "token_acc": 1.0, + "train_speed(iter/s)": 1.058269 + }, + { + "epoch": 0.9389587073608617, + "grad_norm": 0.6555060148239136, + "learning_rate": 1.0158046771661878e-07, + "loss": 0.013531736098229885, + "memory(GiB)": 13.61, + "step": 523, + "token_acc": 1.0, + "train_speed(iter/s)": 1.0584 + }, + { + "epoch": 0.940754039497307, + "grad_norm": 0.3874221444129944, + "learning_rate": 9.571187629379586e-08, + "loss": 0.005351795349270105, + "memory(GiB)": 13.61, + "step": 524, + "token_acc": 1.0, + "train_speed(iter/s)": 1.058444 + }, + { + "epoch": 0.9425493716337523, + "grad_norm": 2.077939033508301, + "learning_rate": 9.001625181073503e-08, + "loss": 0.010931195691227913, + "memory(GiB)": 13.61, + "step": 525, + "token_acc": 1.0, + "train_speed(iter/s)": 1.058474 + }, + { + "epoch": 0.9443447037701975, + "grad_norm": 1.9851182699203491, + "learning_rate": 8.449379514363143e-08, + "loss": 0.03175752982497215, + "memory(GiB)": 13.61, + "step": 526, + "token_acc": 1.0, + "train_speed(iter/s)": 
1.058591 + }, + { + "epoch": 0.9461400359066428, + "grad_norm": 0.44924452900886536, + "learning_rate": 7.914470106130945e-08, + "loss": 0.009974591434001923, + "memory(GiB)": 13.61, + "step": 527, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.05869 + }, + { + "epoch": 0.947935368043088, + "grad_norm": 0.6397302746772766, + "learning_rate": 7.396915821835216e-08, + "loss": 0.012457557022571564, + "memory(GiB)": 13.61, + "step": 528, + "token_acc": 1.0, + "train_speed(iter/s)": 1.058795 + }, + { + "epoch": 0.9497307001795332, + "grad_norm": 0.5017023086547852, + "learning_rate": 6.896734914844994e-08, + "loss": 0.0051448168233036995, + "memory(GiB)": 13.61, + "step": 529, + "token_acc": 1.0, + "train_speed(iter/s)": 1.058916 + }, + { + "epoch": 0.9515260323159784, + "grad_norm": 1.213315486907959, + "learning_rate": 6.413945025796065e-08, + "loss": 0.016828639432787895, + "memory(GiB)": 13.61, + "step": 530, + "token_acc": 0.9885057471264368, + "train_speed(iter/s)": 1.059019 + }, + { + "epoch": 0.9533213644524237, + "grad_norm": 0.4171786606311798, + "learning_rate": 5.948563181968903e-08, + "loss": 0.005509679205715656, + "memory(GiB)": 13.61, + "step": 531, + "token_acc": 1.0, + "train_speed(iter/s)": 1.059147 + }, + { + "epoch": 0.9551166965888689, + "grad_norm": 0.6889182329177856, + "learning_rate": 5.500605796688263e-08, + "loss": 0.0044509898871183395, + "memory(GiB)": 13.61, + "step": 532, + "token_acc": 1.0, + "train_speed(iter/s)": 1.059274 + }, + { + "epoch": 0.9569120287253142, + "grad_norm": 0.4290013015270233, + "learning_rate": 5.070088668744144e-08, + "loss": 0.0043940432369709015, + "memory(GiB)": 13.61, + "step": 533, + "token_acc": 1.0, + "train_speed(iter/s)": 1.059416 + }, + { + "epoch": 0.9587073608617595, + "grad_norm": 0.2148108333349228, + "learning_rate": 4.657026981834623e-08, + "loss": 0.0025503088254481554, + "memory(GiB)": 13.61, + "step": 534, + "token_acc": 1.0, + "train_speed(iter/s)": 1.059554 + }, + { + "epoch": 0.9605026929982047, + "grad_norm": 0.5286439657211304, + "learning_rate": 4.261435304030281e-08, + "loss": 0.00757583137601614, + "memory(GiB)": 13.61, + "step": 535, + "token_acc": 0.9883720930232558, + "train_speed(iter/s)": 1.059703 + }, + { + "epoch": 0.9622980251346499, + "grad_norm": 0.5139573216438293, + "learning_rate": 3.8833275872607326e-08, + "loss": 0.005370498634874821, + "memory(GiB)": 13.61, + "step": 536, + "token_acc": 1.0, + "train_speed(iter/s)": 1.059841 + }, + { + "epoch": 0.9640933572710951, + "grad_norm": 0.4864700436592102, + "learning_rate": 3.5227171668218985e-08, + "loss": 0.0072228992357850075, + "memory(GiB)": 13.61, + "step": 537, + "token_acc": 1.0, + "train_speed(iter/s)": 1.059992 + }, + { + "epoch": 0.9658886894075404, + "grad_norm": 0.5774115920066833, + "learning_rate": 3.179616760906612e-08, + "loss": 0.008702301420271397, + "memory(GiB)": 13.61, + "step": 538, + "token_acc": 1.0, + "train_speed(iter/s)": 1.060132 + }, + { + "epoch": 0.9676840215439856, + "grad_norm": 0.6363269090652466, + "learning_rate": 2.8540384701551405e-08, + "loss": 0.008673271164298058, + "memory(GiB)": 13.61, + "step": 539, + "token_acc": 1.0, + "train_speed(iter/s)": 1.060272 + }, + { + "epoch": 0.9694793536804309, + "grad_norm": 0.3189695477485657, + "learning_rate": 2.5459937772291942e-08, + "loss": 0.0038987579755485058, + "memory(GiB)": 13.61, + "step": 540, + "token_acc": 1.0, + "train_speed(iter/s)": 1.060345 + }, + { + "epoch": 0.9712746858168761, + "grad_norm": 0.1294567584991455, + "learning_rate": 
2.255493546406251e-08, + "loss": 0.0015540863387286663, + "memory(GiB)": 13.61, + "step": 541, + "token_acc": 1.0, + "train_speed(iter/s)": 1.060473 + }, + { + "epoch": 0.9730700179533214, + "grad_norm": 0.5199005007743835, + "learning_rate": 1.9825480231970284e-08, + "loss": 0.004626214504241943, + "memory(GiB)": 13.61, + "step": 542, + "token_acc": 1.0, + "train_speed(iter/s)": 1.060604 + }, + { + "epoch": 0.9748653500897666, + "grad_norm": 1.8426061868667603, + "learning_rate": 1.7271668339836624e-08, + "loss": 0.0367659255862236, + "memory(GiB)": 13.61, + "step": 543, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.060731 + }, + { + "epoch": 0.9766606822262118, + "grad_norm": 0.7304965257644653, + "learning_rate": 1.489358985680478e-08, + "loss": 0.00937891099601984, + "memory(GiB)": 13.61, + "step": 544, + "token_acc": 1.0, + "train_speed(iter/s)": 1.06087 + }, + { + "epoch": 0.9784560143626571, + "grad_norm": 3.2844491004943848, + "learning_rate": 1.269132865416134e-08, + "loss": 0.008662230335175991, + "memory(GiB)": 13.61, + "step": 545, + "token_acc": 0.9880952380952381, + "train_speed(iter/s)": 1.060993 + }, + { + "epoch": 0.9802513464991023, + "grad_norm": 0.2824939489364624, + "learning_rate": 1.0664962402380241e-08, + "loss": 0.004621861968189478, + "memory(GiB)": 13.61, + "step": 546, + "token_acc": 1.0, + "train_speed(iter/s)": 1.061126 + }, + { + "epoch": 0.9820466786355476, + "grad_norm": 1.1427395343780518, + "learning_rate": 8.814562568382202e-09, + "loss": 0.011854426003992558, + "memory(GiB)": 13.61, + "step": 547, + "token_acc": 1.0, + "train_speed(iter/s)": 1.061258 + }, + { + "epoch": 0.9838420107719928, + "grad_norm": 0.8954042196273804, + "learning_rate": 7.14019441301339e-09, + "loss": 0.006638347636908293, + "memory(GiB)": 13.61, + "step": 548, + "token_acc": 1.0, + "train_speed(iter/s)": 1.061387 + }, + { + "epoch": 0.9856373429084381, + "grad_norm": 0.09067913889884949, + "learning_rate": 5.641916988746166e-09, + "loss": 0.0008715628646314144, + "memory(GiB)": 13.61, + "step": 549, + "token_acc": 1.0, + "train_speed(iter/s)": 1.061527 + }, + { + "epoch": 0.9874326750448833, + "grad_norm": 1.0949702262878418, + "learning_rate": 4.319783137594624e-09, + "loss": 0.012085100635886192, + "memory(GiB)": 13.61, + "step": 550, + "token_acc": 1.0, + "train_speed(iter/s)": 1.06167 + }, + { + "epoch": 0.9892280071813285, + "grad_norm": 1.4783289432525635, + "learning_rate": 3.1738394892505407e-09, + "loss": 0.011535527184605598, + "memory(GiB)": 13.61, + "step": 551, + "token_acc": 1.0, + "train_speed(iter/s)": 1.061796 + }, + { + "epoch": 0.9910233393177738, + "grad_norm": 0.8619922399520874, + "learning_rate": 2.204126459440237e-09, + "loss": 0.016422592103481293, + "memory(GiB)": 13.61, + "step": 552, + "token_acc": 0.9879518072289156, + "train_speed(iter/s)": 1.061942 + }, + { + "epoch": 0.992818671454219, + "grad_norm": 0.8307982683181763, + "learning_rate": 1.4106782484984982e-09, + "loss": 0.012019624933600426, + "memory(GiB)": 13.61, + "step": 553, + "token_acc": 1.0, + "train_speed(iter/s)": 1.062078 + }, + { + "epoch": 0.9946140035906643, + "grad_norm": 0.2896110415458679, + "learning_rate": 7.935228401623196e-10, + "loss": 0.0030490243807435036, + "memory(GiB)": 13.61, + "step": 554, + "token_acc": 1.0, + "train_speed(iter/s)": 1.062216 + }, + { + "epoch": 0.9964093357271095, + "grad_norm": 0.9369227886199951, + "learning_rate": 3.526820005839149e-10, + "loss": 0.017497211694717407, + "memory(GiB)": 13.61, + "step": 555, + "token_acc": 1.0, + 
"train_speed(iter/s)": 1.062355 + }, + { + "epoch": 0.9982046678635548, + "grad_norm": 0.33920952677726746, + "learning_rate": 8.817127756355348e-11, + "loss": 0.004344519227743149, + "memory(GiB)": 13.61, + "step": 556, + "token_acc": 1.0, + "train_speed(iter/s)": 1.062482 + }, + { + "epoch": 1.0, + "grad_norm": 0.05820717290043831, + "learning_rate": 0.0, + "loss": 0.000651389651466161, + "memory(GiB)": 13.61, + "step": 557, + "token_acc": 1.0, + "train_speed(iter/s)": 1.062823 + }, + { + "epoch": 1.0, + "eval_loss": 0.008566614240407944, + "eval_runtime": 3.6415, + "eval_samples_per_second": 24.715, + "eval_steps_per_second": 1.648, + "eval_token_acc": 0.9964877069744105, + "step": 557 + } + ], + "logging_steps": 1, + "max_steps": 557, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.472664234970317e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}