{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 8.0,
  "eval_steps": 500,
  "global_step": 1431,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011180992313067784,
      "grad_norm": 0.0,
      "learning_rate": 5e-06,
      "loss": 1.2878,
      "step": 2
    },
    {
      "epoch": 0.02236198462613557,
      "grad_norm": 0.0,
      "learning_rate": 5e-06,
      "loss": 1.392,
      "step": 4
    },
    {
      "epoch": 0.033542976939203356,
      "grad_norm": 0.0,
      "learning_rate": 5e-06,
      "loss": 1.3594,
      "step": 6
    },
    {
      "epoch": 0.04472396925227114,
      "grad_norm": 0.0,
      "learning_rate": 5e-06,
      "loss": 1.2958,
      "step": 8
    },
    {
      "epoch": 0.055904961565338925,
      "grad_norm": 0.0,
      "learning_rate": 5e-06,
      "loss": 1.3475,
      "step": 10
    },
    {
      "epoch": 0.06708595387840671,
      "grad_norm": 0.0,
      "learning_rate": 5e-06,
      "loss": 1.2303,
      "step": 12
    },
    {
      "epoch": 0.07826694619147449,
      "grad_norm": 0.0,
      "learning_rate": 5e-06,
      "loss": 1.1964,
      "step": 14
    },
    {
      "epoch": 0.08944793850454227,
      "grad_norm": 0.0,
      "learning_rate": 5e-06,
      "loss": 1.3328,
      "step": 16
    },
    {
      "epoch": 0.10062893081761007,
      "grad_norm": 6.944002672340658,
      "learning_rate": 4.999996106235862e-06,
      "loss": 1.3134,
      "step": 18
    },
    {
      "epoch": 0.11180992313067785,
      "grad_norm": 7.800497498064014,
      "learning_rate": 4.999964956195521e-06,
      "loss": 1.1147,
      "step": 20
    },
    {
      "epoch": 0.12299091544374563,
      "grad_norm": 4.4662495771497355,
      "learning_rate": 4.999902656502973e-06,
      "loss": 1.025,
      "step": 22
    },
    {
      "epoch": 0.13417190775681342,
      "grad_norm": 4.026851738528776,
      "learning_rate": 4.999809207934472e-06,
      "loss": 1.0448,
      "step": 24
    },
    {
      "epoch": 0.1453529000698812,
      "grad_norm": 5.658278761851693,
      "learning_rate": 4.999684611654392e-06,
      "loss": 0.9826,
      "step": 26
    },
    {
      "epoch": 0.15653389238294899,
      "grad_norm": 3.9275877006609505,
      "learning_rate": 4.9995288692152046e-06,
      "loss": 0.9627,
      "step": 28
    },
    {
      "epoch": 0.16771488469601678,
      "grad_norm": 3.634771950296262,
      "learning_rate": 4.9993419825574686e-06,
      "loss": 0.9476,
      "step": 30
    },
    {
      "epoch": 0.17889587700908455,
      "grad_norm": 4.604406424526374,
      "learning_rate": 4.9992368608591775e-06,
      "loss": 0.9414,
      "step": 32
    },
    {
      "epoch": 0.19007686932215234,
      "grad_norm": 5.708200502114745,
      "learning_rate": 4.999003262361029e-06,
      "loss": 0.9572,
      "step": 34
    },
    {
      "epoch": 0.20125786163522014,
      "grad_norm": 5.020134712294459,
      "learning_rate": 4.998738526193412e-06,
      "loss": 0.9544,
      "step": 36
    },
    {
      "epoch": 0.2124388539482879,
      "grad_norm": 4.643332496496484,
      "learning_rate": 4.998442655654946e-06,
      "loss": 0.8504,
      "step": 38
    },
    {
      "epoch": 0.2236198462613557,
      "grad_norm": 4.7843514072232125,
      "learning_rate": 4.998115654432191e-06,
      "loss": 0.914,
      "step": 40
    },
    {
      "epoch": 0.2348008385744235,
      "grad_norm": 3.973113705087721,
      "learning_rate": 4.997757526599592e-06,
      "loss": 0.8303,
      "step": 42
    },
    {
      "epoch": 0.24598183088749126,
      "grad_norm": 5.753323652117126,
      "learning_rate": 4.9973682766194355e-06,
      "loss": 0.8916,
      "step": 44
    },
    {
      "epoch": 0.25716282320055905,
      "grad_norm": 4.00607759948128,
      "learning_rate": 4.996947909341789e-06,
      "loss": 0.9391,
      "step": 46
    },
    {
      "epoch": 0.26834381551362685,
      "grad_norm": 4.73751358896988,
      "learning_rate": 4.996496430004446e-06,
      "loss": 0.8445,
      "step": 48
    },
    {
      "epoch": 0.27952480782669464,
      "grad_norm": 3.801634673248135,
      "learning_rate": 4.9960138442328535e-06,
      "loss": 0.8354,
      "step": 50
    },
    {
      "epoch": 0.2907058001397624,
      "grad_norm": 4.998706656181077,
      "learning_rate": 4.9955001580400475e-06,
      "loss": 0.8556,
      "step": 52
    },
    {
      "epoch": 0.3018867924528302,
      "grad_norm": 5.235396952388322,
      "learning_rate": 4.994955377826577e-06,
      "loss": 0.8821,
      "step": 54
    },
    {
      "epoch": 0.31306778476589797,
      "grad_norm": 4.593843550283633,
      "learning_rate": 4.994379510380421e-06,
      "loss": 0.7965,
      "step": 56
    },
    {
      "epoch": 0.32424877707896577,
      "grad_norm": 4.636040406542864,
      "learning_rate": 4.993772562876909e-06,
      "loss": 0.8576,
      "step": 58
    },
    {
      "epoch": 0.33542976939203356,
      "grad_norm": 4.422458900120915,
      "learning_rate": 4.993134542878631e-06,
      "loss": 0.8388,
      "step": 60
    },
    {
      "epoch": 0.3466107617051013,
      "grad_norm": 4.88515796654498,
      "learning_rate": 4.992465458335335e-06,
      "loss": 0.8427,
      "step": 62
    },
    {
      "epoch": 0.3577917540181691,
      "grad_norm": 4.620642626620232,
      "learning_rate": 4.991765317583841e-06,
      "loss": 0.8088,
      "step": 64
    },
    {
      "epoch": 0.3689727463312369,
      "grad_norm": 3.0164501013815146,
      "learning_rate": 4.991034129347927e-06,
      "loss": 0.7643,
      "step": 66
    },
    {
      "epoch": 0.3801537386443047,
      "grad_norm": 4.0807085306410915,
      "learning_rate": 4.990271902738223e-06,
      "loss": 0.8304,
      "step": 68
    },
    {
      "epoch": 0.3913347309573725,
      "grad_norm": 4.913983348963418,
      "learning_rate": 4.989478647252101e-06,
      "loss": 0.8694,
      "step": 70
    },
    {
      "epoch": 0.4025157232704403,
      "grad_norm": 5.427166275548586,
      "learning_rate": 4.988654372773552e-06,
      "loss": 0.8031,
      "step": 72
    },
    {
      "epoch": 0.413696715583508,
      "grad_norm": 4.976699288607289,
      "learning_rate": 4.987799089573066e-06,
      "loss": 0.7548,
      "step": 74
    },
    {
      "epoch": 0.4248777078965758,
      "grad_norm": 5.035712861337141,
      "learning_rate": 4.986912808307502e-06,
      "loss": 0.7769,
      "step": 76
    },
    {
      "epoch": 0.4360587002096436,
      "grad_norm": 5.703104314189732,
      "learning_rate": 4.985995540019956e-06,
      "loss": 0.7744,
      "step": 78
    },
    {
      "epoch": 0.4472396925227114,
      "grad_norm": 3.6174332203212938,
      "learning_rate": 4.985047296139622e-06,
      "loss": 0.7215,
      "step": 80
    },
    {
      "epoch": 0.4584206848357792,
      "grad_norm": 5.084461038739496,
      "learning_rate": 4.984068088481654e-06,
      "loss": 0.7462,
      "step": 82
    },
    {
      "epoch": 0.469601677148847,
      "grad_norm": 5.500722673783384,
      "learning_rate": 4.983057929247014e-06,
      "loss": 0.7937,
      "step": 84
    },
    {
      "epoch": 0.4807826694619147,
      "grad_norm": 5.76928743736382,
      "learning_rate": 4.9820168310223215e-06,
      "loss": 0.7701,
      "step": 86
    },
    {
      "epoch": 0.4919636617749825,
      "grad_norm": 4.3638410984754366,
      "learning_rate": 4.980944806779698e-06,
      "loss": 0.7063,
      "step": 88
    },
    {
      "epoch": 0.5031446540880503,
      "grad_norm": 6.6022312070502664,
      "learning_rate": 4.979841869876603e-06,
      "loss": 0.7829,
      "step": 90
    },
    {
      "epoch": 0.5143256464011181,
      "grad_norm": 5.114853414480892,
      "learning_rate": 4.97870803405567e-06,
      "loss": 0.7419,
      "step": 92
    },
    {
      "epoch": 0.5255066387141859,
      "grad_norm": 5.450293615821356,
      "learning_rate": 4.977543313444534e-06,
      "loss": 0.7428,
      "step": 94
    },
    {
      "epoch": 0.5366876310272537,
      "grad_norm": 3.888671786201343,
      "learning_rate": 4.976347722555655e-06,
      "loss": 0.763,
      "step": 96
    },
    {
      "epoch": 0.5478686233403215,
      "grad_norm": 5.580018062591517,
      "learning_rate": 4.975121276286136e-06,
      "loss": 0.7451,
      "step": 98
    },
    {
      "epoch": 0.5590496156533893,
      "grad_norm": 5.244409209125885,
      "learning_rate": 4.973863989917545e-06,
      "loss": 0.6658,
      "step": 100
    },
    {
      "epoch": 0.570230607966457,
      "grad_norm": 6.341201782490113,
      "learning_rate": 4.9725758791157105e-06,
      "loss": 0.7042,
      "step": 102
    },
    {
      "epoch": 0.5814116002795248,
      "grad_norm": 3.63864440598579,
      "learning_rate": 4.9712569599305415e-06,
      "loss": 0.6859,
      "step": 104
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 5.643540415249962,
      "learning_rate": 4.9699072487958185e-06,
      "loss": 0.7072,
      "step": 106
    },
    {
      "epoch": 0.6037735849056604,
      "grad_norm": 4.518214836889502,
      "learning_rate": 4.968526762528988e-06,
      "loss": 0.6989,
      "step": 108
    },
    {
      "epoch": 0.6149545772187281,
      "grad_norm": 4.813780988459217,
      "learning_rate": 4.96711551833096e-06,
      "loss": 0.6213,
      "step": 110
    },
    {
      "epoch": 0.6261355695317959,
      "grad_norm": 6.534716960952802,
      "learning_rate": 4.965673533785887e-06,
      "loss": 0.6603,
      "step": 112
    },
    {
      "epoch": 0.6373165618448637,
      "grad_norm": 4.694700268634709,
      "learning_rate": 4.9642008268609455e-06,
      "loss": 0.6458,
      "step": 114
    },
    {
      "epoch": 0.6484975541579315,
      "grad_norm": 3.797163997052886,
      "learning_rate": 4.962697415906118e-06,
      "loss": 0.6208,
      "step": 116
    },
    {
      "epoch": 0.6596785464709993,
      "grad_norm": 5.303604758140139,
      "learning_rate": 4.961163319653959e-06,
      "loss": 0.6175,
      "step": 118
    },
    {
      "epoch": 0.6708595387840671,
      "grad_norm": 3.8308857949946398,
      "learning_rate": 4.959598557219361e-06,
      "loss": 0.6178,
      "step": 120
    },
    {
      "epoch": 0.6820405310971349,
      "grad_norm": 5.611339241664303,
      "learning_rate": 4.95800314809932e-06,
      "loss": 0.617,
      "step": 122
    },
    {
      "epoch": 0.6932215234102026,
      "grad_norm": 5.234511261826922,
      "learning_rate": 4.956377112172691e-06,
      "loss": 0.6557,
      "step": 124
    },
    {
      "epoch": 0.7044025157232704,
      "grad_norm": 4.381066733905507,
      "learning_rate": 4.954720469699939e-06,
      "loss": 0.6343,
      "step": 126
    },
    {
      "epoch": 0.7155835080363382,
      "grad_norm": 5.113989443684452,
      "learning_rate": 4.953033241322887e-06,
      "loss": 0.6135,
      "step": 128
    },
    {
      "epoch": 0.726764500349406,
      "grad_norm": 5.138987950069777,
      "learning_rate": 4.951315448064462e-06,
      "loss": 0.6403,
      "step": 130
    },
    {
      "epoch": 0.7379454926624738,
      "grad_norm": 4.43583718290579,
      "learning_rate": 4.949567111328428e-06,
      "loss": 0.6226,
      "step": 132
    },
    {
      "epoch": 0.7491264849755416,
      "grad_norm": 4.391597448273059,
      "learning_rate": 4.947788252899124e-06,
      "loss": 0.6333,
      "step": 134
    },
    {
      "epoch": 0.7603074772886094,
      "grad_norm": 4.193385817962468,
      "learning_rate": 4.945978894941189e-06,
      "loss": 0.6884,
      "step": 136
    },
    {
      "epoch": 0.7714884696016772,
      "grad_norm": 5.03154779607414,
      "learning_rate": 4.944139059999286e-06,
      "loss": 0.5783,
      "step": 138
    },
    {
      "epoch": 0.782669461914745,
      "grad_norm": 6.345004441163444,
      "learning_rate": 4.942268770997825e-06,
      "loss": 0.5314,
      "step": 140
    },
    {
      "epoch": 0.7938504542278128,
      "grad_norm": 4.800013540838224,
      "learning_rate": 4.940368051240675e-06,
      "loss": 0.5876,
      "step": 142
    },
    {
      "epoch": 0.8050314465408805,
      "grad_norm": 5.229387760297341,
      "learning_rate": 4.938436924410869e-06,
      "loss": 0.6266,
      "step": 144
    },
    {
      "epoch": 0.8162124388539483,
      "grad_norm": 5.663117027843187,
      "learning_rate": 4.936475414570317e-06,
      "loss": 0.5407,
      "step": 146
    },
    {
      "epoch": 0.827393431167016,
      "grad_norm": 4.355698674662869,
      "learning_rate": 4.9344835461595016e-06,
      "loss": 0.5757,
      "step": 148
    },
    {
      "epoch": 0.8385744234800838,
      "grad_norm": 3.73012661577406,
      "learning_rate": 4.932461343997174e-06,
      "loss": 0.5671,
      "step": 150
    },
    {
      "epoch": 0.8497554157931516,
      "grad_norm": 5.17610307953933,
      "learning_rate": 4.930408833280044e-06,
      "loss": 0.5552,
      "step": 152
    },
    {
      "epoch": 0.8609364081062194,
      "grad_norm": 4.8108290286110575,
      "learning_rate": 4.928326039582468e-06,
      "loss": 0.5455,
      "step": 154
    },
    {
      "epoch": 0.8721174004192872,
      "grad_norm": 4.143977047297293,
      "learning_rate": 4.926212988856131e-06,
      "loss": 0.5865,
      "step": 156
    },
    {
      "epoch": 0.883298392732355,
      "grad_norm": 4.809016102192773,
      "learning_rate": 4.9240697074297205e-06,
      "loss": 0.5904,
      "step": 158
    },
    {
      "epoch": 0.8944793850454228,
      "grad_norm": 4.329310274878485,
      "learning_rate": 4.921896222008598e-06,
      "loss": 0.5213,
      "step": 160
    },
    {
      "epoch": 0.9056603773584906,
      "grad_norm": 6.082276125346202,
      "learning_rate": 4.919692559674469e-06,
      "loss": 0.5321,
      "step": 162
    },
    {
      "epoch": 0.9168413696715584,
      "grad_norm": 3.595682377289556,
      "learning_rate": 4.917458747885045e-06,
      "loss": 0.5589,
      "step": 164
    },
    {
      "epoch": 0.9280223619846262,
      "grad_norm": 4.759398027424621,
      "learning_rate": 4.9151948144737e-06,
      "loss": 0.5252,
      "step": 166
    },
    {
      "epoch": 0.939203354297694,
      "grad_norm": 4.925856740501272,
      "learning_rate": 4.912900787649124e-06,
      "loss": 0.5688,
      "step": 168
    },
    {
      "epoch": 0.9503843466107617,
      "grad_norm": 4.9751554778931695,
      "learning_rate": 4.910576695994976e-06,
      "loss": 0.49,
      "step": 170
    },
    {
      "epoch": 0.9615653389238294,
      "grad_norm": 4.404002437196143,
      "learning_rate": 4.908222568469516e-06,
      "loss": 0.5031,
      "step": 172
    },
    {
      "epoch": 0.9727463312368972,
      "grad_norm": 4.438458089119356,
      "learning_rate": 4.905838434405259e-06,
      "loss": 0.5015,
      "step": 174
    },
    {
      "epoch": 0.983927323549965,
      "grad_norm": 3.7675300141289205,
      "learning_rate": 4.903424323508601e-06,
      "loss": 0.5133,
      "step": 176
    },
    {
      "epoch": 0.9951083158630328,
      "grad_norm": 5.557474516168906,
      "learning_rate": 4.900980265859449e-06,
      "loss": 0.4913,
      "step": 178
    },
    {
      "epoch": 1.0062893081761006,
      "grad_norm": 4.4806858821540585,
      "learning_rate": 4.898506291910847e-06,
      "loss": 0.4446,
      "step": 180
    },
    {
      "epoch": 1.0174703004891683,
      "grad_norm": 4.605929975666356,
      "learning_rate": 4.896002432488599e-06,
      "loss": 0.3632,
      "step": 182
    },
    {
      "epoch": 1.0286512928022362,
      "grad_norm": 4.9794341930411665,
      "learning_rate": 4.893468718790883e-06,
      "loss": 0.3868,
      "step": 184
    },
    {
      "epoch": 1.039832285115304,
      "grad_norm": 3.5317296745452733,
      "learning_rate": 4.890905182387862e-06,
      "loss": 0.4334,
      "step": 186
    },
    {
      "epoch": 1.0510132774283718,
      "grad_norm": 4.568181420141649,
      "learning_rate": 4.88831185522129e-06,
      "loss": 0.456,
      "step": 188
    },
    {
      "epoch": 1.0621942697414395,
      "grad_norm": 3.570260813698039,
      "learning_rate": 4.885688769604115e-06,
      "loss": 0.3846,
      "step": 190
    },
    {
      "epoch": 1.0733752620545074,
      "grad_norm": 3.639759353451614,
      "learning_rate": 4.883035958220077e-06,
      "loss": 0.4363,
      "step": 192
    },
    {
      "epoch": 1.084556254367575,
      "grad_norm": 4.074741691986429,
      "learning_rate": 4.8803534541233016e-06,
      "loss": 0.3782,
      "step": 194
    },
    {
      "epoch": 1.095737246680643,
      "grad_norm": 4.875221867832197,
      "learning_rate": 4.8776412907378845e-06,
      "loss": 0.3815,
      "step": 196
    },
    {
      "epoch": 1.1069182389937107,
      "grad_norm": 3.575182053435755,
      "learning_rate": 4.874899501857477e-06,
      "loss": 0.4023,
      "step": 198
    },
    {
      "epoch": 1.1180992313067786,
      "grad_norm": 3.984785984285916,
      "learning_rate": 4.8721281216448675e-06,
      "loss": 0.305,
      "step": 200
    },
    {
      "epoch": 1.1292802236198463,
      "grad_norm": 3.997235184408756,
      "learning_rate": 4.869327184631552e-06,
      "loss": 0.3896,
      "step": 202
    },
    {
      "epoch": 1.140461215932914,
      "grad_norm": 3.403723018382878,
      "learning_rate": 4.866496725717304e-06,
      "loss": 0.3332,
      "step": 204
    },
    {
      "epoch": 1.1516422082459818,
      "grad_norm": 3.5740869992425917,
      "learning_rate": 4.8636367801697415e-06,
      "loss": 0.3299,
      "step": 206
    },
    {
      "epoch": 1.1628232005590495,
      "grad_norm": 3.8789874672120033,
      "learning_rate": 4.860747383623889e-06,
      "loss": 0.4145,
      "step": 208
    },
    {
      "epoch": 1.1740041928721174,
      "grad_norm": 3.8038820435820084,
      "learning_rate": 4.857828572081731e-06,
      "loss": 0.3171,
      "step": 210
    },
    {
      "epoch": 1.1851851851851851,
      "grad_norm": 3.260333619392394,
      "learning_rate": 4.854880381911762e-06,
      "loss": 0.3474,
      "step": 212
    },
    {
      "epoch": 1.196366177498253,
      "grad_norm": 2.8989963280714925,
      "learning_rate": 4.851902849848536e-06,
      "loss": 0.3931,
      "step": 214
    },
    {
      "epoch": 1.2075471698113207,
      "grad_norm": 3.6383247911373773,
      "learning_rate": 4.848896012992208e-06,
      "loss": 0.3822,
      "step": 216
    },
    {
      "epoch": 1.2187281621243886,
      "grad_norm": 3.0864181531286734,
      "learning_rate": 4.845859908808074e-06,
      "loss": 0.378,
      "step": 218
    },
    {
      "epoch": 1.2299091544374563,
      "grad_norm": 2.494513481207721,
      "learning_rate": 4.842794575126099e-06,
      "loss": 0.3655,
      "step": 220
    },
    {
      "epoch": 1.2410901467505242,
      "grad_norm": 2.6074910342756334,
      "learning_rate": 4.839700050140448e-06,
      "loss": 0.3973,
      "step": 222
    },
    {
      "epoch": 1.2522711390635919,
      "grad_norm": 2.2421870374103285,
      "learning_rate": 4.836576372409015e-06,
      "loss": 0.3784,
      "step": 224
    },
    {
      "epoch": 1.2634521313766598,
      "grad_norm": 2.451559449193117,
      "learning_rate": 4.833423580852933e-06,
      "loss": 0.3805,
      "step": 226
    },
    {
      "epoch": 1.2746331236897275,
      "grad_norm": 2.5374184019501285,
      "learning_rate": 4.830241714756099e-06,
      "loss": 0.293,
      "step": 228
    },
    {
      "epoch": 1.2858141160027952,
      "grad_norm": 2.525807489259318,
      "learning_rate": 4.827030813764677e-06,
      "loss": 0.2665,
      "step": 230
    },
    {
      "epoch": 1.296995108315863,
      "grad_norm": 2.3755504317471523,
      "learning_rate": 4.8237909178866075e-06,
      "loss": 0.4108,
      "step": 232
    },
    {
      "epoch": 1.3081761006289307,
      "grad_norm": 2.7662660096000793,
      "learning_rate": 4.8205220674911075e-06,
      "loss": 0.3928,
      "step": 234
    },
    {
      "epoch": 1.3193570929419987,
      "grad_norm": 2.245517906271987,
      "learning_rate": 4.81722430330817e-06,
      "loss": 0.355,
      "step": 236
    },
    {
      "epoch": 1.3305380852550663,
      "grad_norm": 2.684087860818518,
      "learning_rate": 4.813897666428054e-06,
      "loss": 0.3624,
      "step": 238
    },
    {
      "epoch": 1.3417190775681342,
      "grad_norm": 2.5507370157459865,
      "learning_rate": 4.810542198300772e-06,
      "loss": 0.3494,
      "step": 240
    },
    {
      "epoch": 1.352900069881202,
      "grad_norm": 2.157612559104276,
      "learning_rate": 4.807157940735577e-06,
      "loss": 0.3064,
      "step": 242
    },
    {
      "epoch": 1.3640810621942698,
      "grad_norm": 1.9389355017962189,
      "learning_rate": 4.803744935900439e-06,
      "loss": 0.3331,
      "step": 244
    },
    {
      "epoch": 1.3752620545073375,
      "grad_norm": 2.3147558047608867,
      "learning_rate": 4.8003032263215185e-06,
      "loss": 0.3538,
      "step": 246
    },
    {
      "epoch": 1.3864430468204052,
      "grad_norm": 2.414181223767401,
      "learning_rate": 4.79683285488264e-06,
      "loss": 0.3237,
      "step": 248
    },
    {
      "epoch": 1.397624039133473,
      "grad_norm": 2.0498128676624368,
      "learning_rate": 4.793333864824756e-06,
      "loss": 0.3742,
      "step": 250
    },
    {
      "epoch": 1.408805031446541,
      "grad_norm": 2.2294049255917416,
      "learning_rate": 4.789806299745405e-06,
      "loss": 0.2948,
      "step": 252
    },
    {
      "epoch": 1.4199860237596087,
      "grad_norm": 2.2210196470155923,
      "learning_rate": 4.786250203598174e-06,
      "loss": 0.28,
      "step": 254
    },
    {
      "epoch": 1.4311670160726764,
      "grad_norm": 2.6896787603814816,
      "learning_rate": 4.782665620692147e-06,
      "loss": 0.3513,
      "step": 256
    },
    {
      "epoch": 1.4423480083857443,
      "grad_norm": 2.1151921249556644,
      "learning_rate": 4.779052595691355e-06,
      "loss": 0.3598,
      "step": 258
    },
    {
      "epoch": 1.453529000698812,
      "grad_norm": 2.6404538176276047,
      "learning_rate": 4.775411173614218e-06,
      "loss": 0.3075,
      "step": 260
    },
    {
      "epoch": 1.4647099930118799,
      "grad_norm": 1.9888888421343762,
      "learning_rate": 4.771741399832984e-06,
      "loss": 0.356,
      "step": 262
    },
    {
      "epoch": 1.4758909853249476,
      "grad_norm": 2.284642426340359,
      "learning_rate": 4.768043320073165e-06,
      "loss": 0.2765,
      "step": 264
    },
    {
      "epoch": 1.4870719776380152,
      "grad_norm": 2.135563450656965,
      "learning_rate": 4.764316980412966e-06,
      "loss": 0.2825,
      "step": 266
    },
    {
      "epoch": 1.4982529699510831,
      "grad_norm": 1.8267552790003188,
      "learning_rate": 4.7605624272827125e-06,
      "loss": 0.3915,
      "step": 268
    },
    {
      "epoch": 1.509433962264151,
      "grad_norm": 2.26569092336033,
      "learning_rate": 4.75677970746427e-06,
      "loss": 0.3859,
      "step": 270
    },
    {
      "epoch": 1.5206149545772187,
      "grad_norm": 2.3510908940666346,
      "learning_rate": 4.75296886809046e-06,
      "loss": 0.312,
      "step": 272
    },
    {
      "epoch": 1.5317959468902864,
      "grad_norm": 2.1562478846600883,
      "learning_rate": 4.749129956644477e-06,
      "loss": 0.4398,
      "step": 274
    },
    {
      "epoch": 1.5429769392033543,
      "grad_norm": 2.1811966726037655,
      "learning_rate": 4.745263020959296e-06,
      "loss": 0.3221,
      "step": 276
    },
    {
      "epoch": 1.5541579315164222,
      "grad_norm": 2.035643810106488,
      "learning_rate": 4.741368109217072e-06,
      "loss": 0.3317,
      "step": 278
    },
    {
      "epoch": 1.56533892382949,
      "grad_norm": 2.0722038381676824,
      "learning_rate": 4.737445269948543e-06,
      "loss": 0.4627,
      "step": 280
    },
    {
      "epoch": 1.5765199161425576,
      "grad_norm": 2.2584403073433212,
      "learning_rate": 4.733494552032426e-06,
      "loss": 0.352,
      "step": 282
    },
    {
      "epoch": 1.5877009084556253,
      "grad_norm": 3.1127410509937783,
      "learning_rate": 4.729516004694808e-06,
      "loss": 0.3109,
      "step": 284
    },
    {
      "epoch": 1.5988819007686932,
      "grad_norm": 1.6930738402579835,
      "learning_rate": 4.725509677508528e-06,
      "loss": 0.3723,
      "step": 286
    },
    {
      "epoch": 1.610062893081761,
      "grad_norm": 2.6225330496610573,
      "learning_rate": 4.721475620392567e-06,
      "loss": 0.2853,
      "step": 288
    },
    {
      "epoch": 1.6212438853948288,
      "grad_norm": 1.998954970455011,
      "learning_rate": 4.71741388361142e-06,
      "loss": 0.323,
      "step": 290
    },
    {
      "epoch": 1.6324248777078965,
      "grad_norm": 2.3952745413220677,
      "learning_rate": 4.713324517774471e-06,
      "loss": 0.4057,
      "step": 292
    },
    {
      "epoch": 1.6436058700209644,
      "grad_norm": 1.7339961999135642,
      "learning_rate": 4.7092075738353625e-06,
      "loss": 0.2855,
      "step": 294
    },
    {
      "epoch": 1.6547868623340323,
      "grad_norm": 2.3672466509243075,
      "learning_rate": 4.705063103091365e-06,
      "loss": 0.277,
      "step": 296
    },
    {
      "epoch": 1.6659678546471,
      "grad_norm": 1.92096238087282,
      "learning_rate": 4.700891157182729e-06,
      "loss": 0.2699,
      "step": 298
    },
    {
      "epoch": 1.6771488469601676,
      "grad_norm": 1.6478187267877538,
      "learning_rate": 4.696691788092049e-06,
      "loss": 0.2875,
      "step": 300
    },
    {
      "epoch": 1.6883298392732355,
      "grad_norm": 2.6637144089516545,
      "learning_rate": 4.692465048143615e-06,
      "loss": 0.3229,
      "step": 302
    },
    {
      "epoch": 1.6995108315863034,
      "grad_norm": 2.0530281428374084,
      "learning_rate": 4.688210990002755e-06,
      "loss": 0.3546,
      "step": 304
    },
    {
      "epoch": 1.7106918238993711,
      "grad_norm": 2.150198399781322,
      "learning_rate": 4.683929666675185e-06,
      "loss": 0.4021,
      "step": 306
    },
    {
      "epoch": 1.7218728162124388,
      "grad_norm": 2.1752313572704542,
      "learning_rate": 4.679621131506347e-06,
      "loss": 0.3299,
      "step": 308
    },
    {
      "epoch": 1.7330538085255065,
      "grad_norm": 1.9055889494341978,
      "learning_rate": 4.6752854381807414e-06,
      "loss": 0.2514,
      "step": 310
    },
    {
      "epoch": 1.7442348008385744,
      "grad_norm": 2.469483649303522,
      "learning_rate": 4.670922640721261e-06,
      "loss": 0.332,
      "step": 312
    },
    {
      "epoch": 1.7554157931516423,
      "grad_norm": 2.327049750502898,
      "learning_rate": 4.666532793488518e-06,
      "loss": 0.3482,
      "step": 314
    },
    {
      "epoch": 1.76659678546471,
      "grad_norm": 2.0224582609864674,
      "learning_rate": 4.662115951180164e-06,
      "loss": 0.3192,
      "step": 316
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 1.9568416201882894,
      "learning_rate": 4.657672168830211e-06,
      "loss": 0.2682,
      "step": 318
    },
    {
      "epoch": 1.7889587700908456,
      "grad_norm": 1.919410926201314,
      "learning_rate": 4.653201501808346e-06,
      "loss": 0.3602,
      "step": 320
    },
    {
      "epoch": 1.8001397624039135,
      "grad_norm": 2.239752835185363,
      "learning_rate": 4.6487040058192385e-06,
      "loss": 0.346,
      "step": 322
    },
    {
      "epoch": 1.8113207547169812,
      "grad_norm": 2.3820790461811643,
      "learning_rate": 4.644179736901848e-06,
      "loss": 0.393,
      "step": 324
    },
    {
      "epoch": 1.8225017470300489,
      "grad_norm": 2.100652056063807,
      "learning_rate": 4.639628751428728e-06,
      "loss": 0.3348,
      "step": 326
    },
    {
      "epoch": 1.8336827393431165,
      "grad_norm": 1.839587786014522,
      "learning_rate": 4.635051106105316e-06,
      "loss": 0.297,
      "step": 328
    },
    {
      "epoch": 1.8448637316561844,
      "grad_norm": 1.460937373317575,
      "learning_rate": 4.630446857969238e-06,
      "loss": 0.3291,
      "step": 330
    },
    {
      "epoch": 1.8560447239692524,
      "grad_norm": 3.066440662132836,
      "learning_rate": 4.625816064389589e-06,
      "loss": 0.2752,
      "step": 332
    },
    {
      "epoch": 1.86722571628232,
      "grad_norm": 1.9596525632755366,
      "learning_rate": 4.62115878306622e-06,
      "loss": 0.3444,
      "step": 334
    },
    {
      "epoch": 1.8784067085953877,
      "grad_norm": 2.2835299782118335,
      "learning_rate": 4.616475072029024e-06,
      "loss": 0.3013,
      "step": 336
    },
    {
      "epoch": 1.8895877009084556,
      "grad_norm": 2.1330589159921756,
      "learning_rate": 4.6117649896372055e-06,
      "loss": 0.3811,
      "step": 338
    },
    {
      "epoch": 1.9007686932215235,
      "grad_norm": 2.28792058261577,
      "learning_rate": 4.607028594578559e-06,
      "loss": 0.304,
      "step": 340
    },
    {
      "epoch": 1.9119496855345912,
      "grad_norm": 1.8457539990364031,
      "learning_rate": 4.602265945868735e-06,
      "loss": 0.2817,
      "step": 342
    },
    {
      "epoch": 1.923130677847659,
      "grad_norm": 1.7860630390403116,
      "learning_rate": 4.597477102850506e-06,
      "loss": 0.3166,
      "step": 344
    },
    {
      "epoch": 1.9343116701607268,
      "grad_norm": 1.988441202911347,
      "learning_rate": 4.592662125193027e-06,
      "loss": 0.2881,
      "step": 346
    },
    {
      "epoch": 1.9454926624737947,
      "grad_norm": 1.7341207391896365,
      "learning_rate": 4.587821072891089e-06,
      "loss": 0.3126,
      "step": 348
    },
    {
      "epoch": 1.9566736547868624,
      "grad_norm": 1.8960045369195677,
      "learning_rate": 4.582954006264377e-06,
      "loss": 0.32,
      "step": 350
    },
    {
      "epoch": 1.96785464709993,
      "grad_norm": 1.8028316706058551,
      "learning_rate": 4.578060985956714e-06,
      "loss": 0.3308,
      "step": 352
    },
    {
      "epoch": 1.9790356394129978,
      "grad_norm": 1.7537644172052635,
      "learning_rate": 4.573142072935307e-06,
      "loss": 0.325,
      "step": 354
    },
    {
      "epoch": 1.9902166317260657,
      "grad_norm": 1.5291097261080726,
      "learning_rate": 4.568197328489986e-06,
      "loss": 0.3418,
      "step": 356
    },
    {
      "epoch": 2.0013976240391336,
      "grad_norm": 2.703429613422267,
      "learning_rate": 4.563226814232444e-06,
      "loss": 0.316,
      "step": 358
    },
    {
      "epoch": 2.0125786163522013,
      "grad_norm": 1.6677019482039983,
      "learning_rate": 4.558230592095465e-06,
      "loss": 0.2242,
      "step": 360
    },
    {
      "epoch": 2.023759608665269,
      "grad_norm": 2.1855279147060527,
      "learning_rate": 4.5532087243321536e-06,
      "loss": 0.1706,
      "step": 362
    },
    {
      "epoch": 2.0349406009783366,
      "grad_norm": 1.433260386596143,
      "learning_rate": 4.548161273515161e-06,
      "loss": 0.2597,
      "step": 364
    },
    {
      "epoch": 2.0461215932914047,
      "grad_norm": 1.9528007044032762,
      "learning_rate": 4.543088302535903e-06,
      "loss": 0.2321,
      "step": 366
    },
    {
      "epoch": 2.0573025856044724,
      "grad_norm": 1.508509476663671,
      "learning_rate": 4.53798987460378e-06,
      "loss": 0.1975,
      "step": 368
    },
    {
      "epoch": 2.06848357791754,
      "grad_norm": 1.4870411030447606,
      "learning_rate": 4.532866053245385e-06,
      "loss": 0.218,
      "step": 370
    },
    {
      "epoch": 2.079664570230608,
      "grad_norm": 1.984299603467917,
      "learning_rate": 4.527716902303713e-06,
      "loss": 0.1866,
      "step": 372
    },
    {
      "epoch": 2.090845562543676,
      "grad_norm": 1.7502708144873231,
      "learning_rate": 4.522542485937369e-06,
      "loss": 0.2128,
      "step": 374
    },
    {
      "epoch": 2.1020265548567436,
      "grad_norm": 1.131006072907252,
      "learning_rate": 4.517342868619764e-06,
      "loss": 0.2418,
      "step": 376
    },
    {
      "epoch": 2.1132075471698113,
      "grad_norm": 2.365723778930082,
      "learning_rate": 4.512118115138315e-06,
      "loss": 0.2249,
      "step": 378
    },
    {
      "epoch": 2.124388539482879,
      "grad_norm": 1.7739738087900154,
      "learning_rate": 4.506868290593635e-06,
      "loss": 0.225,
      "step": 380
    },
    {
      "epoch": 2.135569531795947,
      "grad_norm": 2.3920039733015197,
      "learning_rate": 4.501593460398726e-06,
      "loss": 0.207,
      "step": 382
    },
    {
      "epoch": 2.146750524109015,
      "grad_norm": 1.3961875749075527,
      "learning_rate": 4.49629369027816e-06,
      "loss": 0.1847,
      "step": 384
    },
    {
      "epoch": 2.1579315164220825,
      "grad_norm": 1.740079266616333,
      "learning_rate": 4.490969046267258e-06,
      "loss": 0.2092,
      "step": 386
    },
    {
      "epoch": 2.16911250873515,
      "grad_norm": 1.716849109423316,
      "learning_rate": 4.485619594711278e-06,
      "loss": 0.2512,
      "step": 388
    },
    {
      "epoch": 2.180293501048218,
      "grad_norm": 2.2256205473256836,
      "learning_rate": 4.4802454022645725e-06,
      "loss": 0.2212,
      "step": 390
    },
    {
      "epoch": 2.191474493361286,
      "grad_norm": 1.5080548485099736,
      "learning_rate": 4.474846535889773e-06,
      "loss": 0.2577,
      "step": 392
    },
    {
      "epoch": 2.2026554856743537,
      "grad_norm": 1.849350001917602,
      "learning_rate": 4.469423062856946e-06,
      "loss": 0.2518,
      "step": 394
    },
    {
      "epoch": 2.2138364779874213,
      "grad_norm": 2.0456903454646937,
      "learning_rate": 4.463975050742757e-06,
      "loss": 0.2666,
      "step": 396
    },
    {
      "epoch": 2.225017470300489,
      "grad_norm": 2.1576955140860172,
      "learning_rate": 4.4585025674296315e-06,
      "loss": 0.1881,
      "step": 398
    },
    {
      "epoch": 2.236198462613557,
      "grad_norm": 1.959825305986428,
      "learning_rate": 4.453005681104906e-06,
      "loss": 0.1912,
      "step": 400
    },
    {
      "epoch": 2.247379454926625,
      "grad_norm": 1.8263078605633967,
      "learning_rate": 4.44748446025998e-06,
      "loss": 0.177,
      "step": 402
    },
    {
      "epoch": 2.2585604472396925,
      "grad_norm": 1.3737693376807456,
      "learning_rate": 4.44193897368946e-06,
      "loss": 0.2083,
      "step": 404
    },
    {
      "epoch": 2.26974143955276,
      "grad_norm": 1.9216745648550881,
      "learning_rate": 4.436369290490307e-06,
      "loss": 0.269,
      "step": 406
    },
    {
      "epoch": 2.280922431865828,
      "grad_norm": 1.5225068983698562,
      "learning_rate": 4.430775480060973e-06,
      "loss": 0.2043,
      "step": 408
    },
    {
      "epoch": 2.292103424178896,
      "grad_norm": 1.958524495155971,
      "learning_rate": 4.425157612100531e-06,
      "loss": 0.2735,
      "step": 410
    },
    {
      "epoch": 2.3032844164919637,
      "grad_norm": 2.020109840115744,
      "learning_rate": 4.419515756607819e-06,
      "loss": 0.2623,
      "step": 412
    },
    {
      "epoch": 2.3144654088050314,
      "grad_norm": 1.6832635446278787,
      "learning_rate": 4.413849983880554e-06,
      "loss": 0.2122,
      "step": 414
    },
    {
      "epoch": 2.325646401118099,
      "grad_norm": 1.8238819367042174,
      "learning_rate": 4.4081603645144685e-06,
      "loss": 0.2141,
      "step": 416
    },
    {
      "epoch": 2.336827393431167,
      "grad_norm": 1.636664838162331,
      "learning_rate": 4.4024469694024194e-06,
      "loss": 0.2159,
      "step": 418
    },
    {
      "epoch": 2.348008385744235,
      "grad_norm": 1.563361723149053,
      "learning_rate": 4.396709869733515e-06,
      "loss": 0.2636,
      "step": 420
    },
    {
      "epoch": 2.3591893780573026,
      "grad_norm": 1.7104549540666967,
      "learning_rate": 4.39094913699222e-06,
      "loss": 0.2059,
      "step": 422
    },
    {
      "epoch": 2.3703703703703702,
      "grad_norm": 1.7448299629844894,
      "learning_rate": 4.385164842957469e-06,
      "loss": 0.2076,
      "step": 424
    },
    {
      "epoch": 2.381551362683438,
      "grad_norm": 2.0760771369111812,
      "learning_rate": 4.379357059701771e-06,
      "loss": 0.2241,
      "step": 426
    },
    {
      "epoch": 2.392732354996506,
      "grad_norm": 1.4610379659131663,
      "learning_rate": 4.373525859590313e-06,
      "loss": 0.2135,
      "step": 428
    },
    {
      "epoch": 2.4039133473095737,
      "grad_norm": 1.9763200369365506,
      "learning_rate": 4.367671315280055e-06,
      "loss": 0.2225,
      "step": 430
    },
    {
      "epoch": 2.4150943396226414,
      "grad_norm": 2.138415914668256,
      "learning_rate": 4.3617934997188274e-06,
      "loss": 0.2618,
      "step": 432
    },
    {
      "epoch": 2.426275331935709,
      "grad_norm": 1.6842725394389781,
      "learning_rate": 4.355892486144419e-06,
      "loss": 0.1691,
      "step": 434
    },
    {
      "epoch": 2.4374563242487772,
      "grad_norm": 2.056626946764254,
      "learning_rate": 4.349968348083673e-06,
      "loss": 0.1922,
      "step": 436
    },
    {
      "epoch": 2.448637316561845,
      "grad_norm": 1.2423274511146358,
      "learning_rate": 4.3440211593515556e-06,
      "loss": 0.2061,
      "step": 438
    },
    {
      "epoch": 2.4598183088749126,
      "grad_norm": 1.465237522133527,
      "learning_rate": 4.338050994050253e-06,
      "loss": 0.1996,
      "step": 440
    },
    {
      "epoch": 2.4709993011879803,
      "grad_norm": 2.1451900105983315,
      "learning_rate": 4.332057926568235e-06,
      "loss": 0.2441,
      "step": 442
    },
    {
      "epoch": 2.4821802935010484,
      "grad_norm": 1.5259606296511572,
      "learning_rate": 4.326042031579337e-06,
      "loss": 0.2066,
      "step": 444
    },
    {
      "epoch": 2.493361285814116,
      "grad_norm": 2.4163109674867784,
      "learning_rate": 4.320003384041823e-06,
      "loss": 0.2393,
      "step": 446
    },
    {
      "epoch": 2.5045422781271838,
      "grad_norm": 2.1518283309231907,
      "learning_rate": 4.313942059197457e-06,
      "loss": 0.2467,
      "step": 448
    },
    {
      "epoch": 2.5157232704402515,
      "grad_norm": 1.6715387204280183,
      "learning_rate": 4.3078581325705614e-06,
      "loss": 0.2495,
      "step": 450
    },
    {
      "epoch": 2.5269042627533196,
      "grad_norm": 1.7729216990478125,
      "learning_rate": 4.3017516799670785e-06,
      "loss": 0.1586,
      "step": 452
    },
    {
      "epoch": 2.5380852550663873,
      "grad_norm": 1.7853923740535589,
      "learning_rate": 4.295622777473625e-06,
      "loss": 0.2216,
      "step": 454
    },
    {
      "epoch": 2.549266247379455,
      "grad_norm": 1.7001940457803237,
      "learning_rate": 4.289471501456543e-06,
      "loss": 0.2288,
      "step": 456
    },
    {
      "epoch": 2.5604472396925226,
      "grad_norm": 2.5868877625212354,
      "learning_rate": 4.283297928560951e-06,
      "loss": 0.2075,
      "step": 458
    },
    {
      "epoch": 2.5716282320055903,
      "grad_norm": 2.1990912649669823,
      "learning_rate": 4.277102135709786e-06,
      "loss": 0.2017,
      "step": 460
    },
    {
      "epoch": 2.582809224318658,
      "grad_norm": 2.2627396419665273,
      "learning_rate": 4.270884200102848e-06,
      "loss": 0.2144,
      "step": 462
    },
    {
      "epoch": 2.593990216631726,
      "grad_norm": 2.2283930780278505,
      "learning_rate": 4.2646441992158356e-06,
      "loss": 0.3,
      "step": 464
    },
    {
      "epoch": 2.605171208944794,
      "grad_norm": 2.6765537923336087,
      "learning_rate": 4.258382210799381e-06,
      "loss": 0.2441,
      "step": 466
    },
    {
      "epoch": 2.6163522012578615,
      "grad_norm": 2.0124117535310706,
      "learning_rate": 4.252098312878083e-06,
      "loss": 0.2667,
      "step": 468
    },
    {
      "epoch": 2.6275331935709296,
      "grad_norm": 2.0622543839995586,
      "learning_rate": 4.245792583749533e-06,
      "loss": 0.2209,
      "step": 470
    },
    {
      "epoch": 2.6387141858839973,
      "grad_norm": 1.7479329049755916,
      "learning_rate": 4.2394651019833385e-06,
      "loss": 0.2045,
      "step": 472
    },
    {
      "epoch": 2.649895178197065,
      "grad_norm": 2.223724201139868,
      "learning_rate": 4.23311594642015e-06,
      "loss": 0.2283,
      "step": 474
    },
    {
      "epoch": 2.6610761705101327,
      "grad_norm": 1.8280919056271019,
      "learning_rate": 4.226745196170669e-06,
      "loss": 0.2319,
      "step": 476
    },
    {
      "epoch": 2.6722571628232004,
      "grad_norm": 1.6911807333452673,
      "learning_rate": 4.220352930614672e-06,
      "loss": 0.232,
      "step": 478
    },
    {
      "epoch": 2.6834381551362685,
      "grad_norm": 1.9242468593637576,
      "learning_rate": 4.213939229400014e-06,
      "loss": 0.2733,
      "step": 480
    },
    {
      "epoch": 2.694619147449336,
      "grad_norm": 2.1223012349945254,
      "learning_rate": 4.20750417244164e-06,
      "loss": 0.2529,
      "step": 482
    },
    {
      "epoch": 2.705800139762404,
      "grad_norm": 2.1921742273194313,
      "learning_rate": 4.201047839920589e-06,
      "loss": 0.257,
      "step": 484
    },
    {
      "epoch": 2.7169811320754715,
      "grad_norm": 2.118251084662083,
      "learning_rate": 4.194570312282993e-06,
      "loss": 0.235,
      "step": 486
    },
    {
      "epoch": 2.7281621243885397,
      "grad_norm": 1.9816644323530734,
      "learning_rate": 4.1880716702390764e-06,
      "loss": 0.1839,
      "step": 488
    },
    {
      "epoch": 2.7393431167016074,
      "grad_norm": 1.8891363830208663,
      "learning_rate": 4.181551994762151e-06,
      "loss": 0.2301,
      "step": 490
    },
    {
      "epoch": 2.750524109014675,
      "grad_norm": 1.7502840233703516,
      "learning_rate": 4.1750113670876045e-06,
      "loss": 0.1883,
      "step": 492
    },
    {
      "epoch": 2.7617051013277427,
      "grad_norm": 1.5627429248705165,
      "learning_rate": 4.16844986871189e-06,
      "loss": 0.2042,
      "step": 494
    },
    {
      "epoch": 2.7728860936408104,
      "grad_norm": 1.8631447011251083,
      "learning_rate": 4.161867581391511e-06,
      "loss": 0.2018,
      "step": 496
    },
    {
      "epoch": 2.7840670859538785,
      "grad_norm": 2.0906363974353765,
      "learning_rate": 4.155264587142002e-06,
      "loss": 0.2319,
      "step": 498
    },
    {
      "epoch": 2.795248078266946,
      "grad_norm": 1.7819164584799931,
      "learning_rate": 4.148640968236903e-06,
      "loss": 0.1703,
      "step": 500
    },
    {
      "epoch": 2.806429070580014,
      "grad_norm": 1.7607086842324982,
      "learning_rate": 4.141996807206745e-06,
      "loss": 0.2264,
      "step": 502
    },
    {
      "epoch": 2.817610062893082,
      "grad_norm": 1.5277530729360727,
      "learning_rate": 4.135332186838008e-06,
      "loss": 0.2134,
      "step": 504
    },
    {
      "epoch": 2.8287910552061497,
      "grad_norm": 1.739277840645659,
      "learning_rate": 4.128647190172099e-06,
      "loss": 0.1952,
      "step": 506
    },
    {
      "epoch": 2.8399720475192174,
      "grad_norm": 1.9987218712547774,
      "learning_rate": 4.121941900504316e-06,
      "loss": 0.2364,
      "step": 508
    },
    {
      "epoch": 2.851153039832285,
      "grad_norm": 2.2244662318443225,
      "learning_rate": 4.1152164013828035e-06,
      "loss": 0.2072,
      "step": 510
    },
    {
      "epoch": 2.8623340321453528,
      "grad_norm": 1.526547678145968,
      "learning_rate": 4.108470776607521e-06,
      "loss": 0.2047,
      "step": 512
    },
    {
      "epoch": 2.8735150244584204,
      "grad_norm": 2.005093613185987,
      "learning_rate": 4.1017051102291946e-06,
      "loss": 0.2789,
      "step": 514
    },
    {
      "epoch": 2.8846960167714886,
      "grad_norm": 2.2990829029486624,
      "learning_rate": 4.094919486548266e-06,
      "loss": 0.2414,
      "step": 516
    },
    {
      "epoch": 2.8958770090845563,
      "grad_norm": 2.13743283403912,
      "learning_rate": 4.088113990113846e-06,
      "loss": 0.2029,
      "step": 518
    },
    {
      "epoch": 2.907058001397624,
      "grad_norm": 1.9027626030017704,
      "learning_rate": 4.081288705722666e-06,
      "loss": 0.2229,
      "step": 520
    },
    {
      "epoch": 2.918238993710692,
      "grad_norm": 2.0076859155071745,
      "learning_rate": 4.074443718418009e-06,
      "loss": 0.1995,
      "step": 522
    },
    {
      "epoch": 2.9294199860237597,
      "grad_norm": 1.7985240007466619,
      "learning_rate": 4.067579113488661e-06,
      "loss": 0.1807,
      "step": 524
    },
    {
      "epoch": 2.9406009783368274,
      "grad_norm": 2.140934337000471,
      "learning_rate": 4.060694976467844e-06,
      "loss": 0.2532,
      "step": 526
    },
    {
      "epoch": 2.951781970649895,
      "grad_norm": 2.323003193893417,
      "learning_rate": 4.0537913931321495e-06,
      "loss": 0.2421,
      "step": 528
    },
    {
      "epoch": 2.962962962962963,
      "grad_norm": 1.4532319163010707,
      "learning_rate": 4.04686844950047e-06,
      "loss": 0.2267,
      "step": 530
    },
    {
      "epoch": 2.9741439552760305,
      "grad_norm": 2.0854922336923023,
      "learning_rate": 4.039926231832931e-06,
      "loss": 0.266,
      "step": 532
    },
    {
      "epoch": 2.9853249475890986,
      "grad_norm": 2.882533995321225,
      "learning_rate": 4.032964826629811e-06,
      "loss": 0.2079,
      "step": 534
    },
    {
      "epoch": 2.9965059399021663,
      "grad_norm": 2.7236955724192873,
      "learning_rate": 4.025984320630465e-06,
      "loss": 0.1657,
      "step": 536
    },
    {
      "epoch": 3.007686932215234,
      "grad_norm": 1.8432900490614266,
      "learning_rate": 4.018984800812248e-06,
      "loss": 0.1354,
      "step": 538
    },
    {
      "epoch": 3.018867924528302,
      "grad_norm": 2.0142515580054017,
      "learning_rate": 4.011966354389424e-06,
      "loss": 0.1542,
      "step": 540
    },
    {
      "epoch": 3.03004891684137,
      "grad_norm": 2.756352182005047,
      "learning_rate": 4.004929068812086e-06,
      "loss": 0.1638,
      "step": 542
    },
    {
      "epoch": 3.0412299091544375,
      "grad_norm": 2.048077691313813,
      "learning_rate": 3.997873031765061e-06,
      "loss": 0.156,
      "step": 544
    },
    {
      "epoch": 3.052410901467505,
      "grad_norm": 1.7442233155652336,
      "learning_rate": 3.990798331166822e-06,
      "loss": 0.1095,
      "step": 546
    },
    {
      "epoch": 3.063591893780573,
      "grad_norm": 1.826861973142375,
      "learning_rate": 3.983705055168391e-06,
      "loss": 0.1195,
      "step": 548
    },
    {
      "epoch": 3.074772886093641,
      "grad_norm": 1.943175517862748,
      "learning_rate": 3.976593292152238e-06,
      "loss": 0.1638,
      "step": 550
    },
    {
      "epoch": 3.0859538784067087,
      "grad_norm": 1.5477727978546996,
      "learning_rate": 3.969463130731183e-06,
      "loss": 0.1291,
      "step": 552
    },
    {
      "epoch": 3.0971348707197763,
      "grad_norm": 2.3918080397656034,
      "learning_rate": 3.9623146597472915e-06,
      "loss": 0.1333,
      "step": 554
    },
    {
      "epoch": 3.108315863032844,
      "grad_norm": 2.0592865934704,
      "learning_rate": 3.955147968270764e-06,
      "loss": 0.1692,
      "step": 556
    },
    {
      "epoch": 3.119496855345912,
      "grad_norm": 1.280306245998938,
      "learning_rate": 3.947963145598833e-06,
      "loss": 0.1695,
      "step": 558
    },
    {
      "epoch": 3.13067784765898,
      "grad_norm": 1.5568837418874426,
      "learning_rate": 3.940760281254645e-06,
      "loss": 0.1614,
      "step": 560
    },
    {
      "epoch": 3.1418588399720475,
      "grad_norm": 1.6248982612645957,
      "learning_rate": 3.933539464986143e-06,
      "loss": 0.1184,
      "step": 562
    },
    {
      "epoch": 3.153039832285115,
      "grad_norm": 1.657284019650329,
      "learning_rate": 3.926300786764957e-06,
      "loss": 0.1523,
      "step": 564
    },
    {
      "epoch": 3.164220824598183,
      "grad_norm": 1.9315037734198213,
      "learning_rate": 3.919044336785274e-06,
      "loss": 0.1411,
      "step": 566
    },
    {
      "epoch": 3.175401816911251,
      "grad_norm": 1.7456382044347782,
      "learning_rate": 3.911770205462717e-06,
      "loss": 0.1764,
      "step": 568
    },
    {
      "epoch": 3.1865828092243187,
      "grad_norm": 1.4045398532057205,
      "learning_rate": 3.904478483433223e-06,
      "loss": 0.1241,
      "step": 570
    },
    {
      "epoch": 3.1977638015373864,
      "grad_norm": 2.0886459168414895,
      "learning_rate": 3.897169261551907e-06,
      "loss": 0.1475,
      "step": 572
    },
    {
      "epoch": 3.208944793850454,
      "grad_norm": 1.9098750157027404,
      "learning_rate": 3.889842630891934e-06,
      "loss": 0.138,
      "step": 574
    },
    {
      "epoch": 3.220125786163522,
      "grad_norm": 2.184899827108709,
      "learning_rate": 3.8824986827433804e-06,
      "loss": 0.1315,
      "step": 576
    },
    {
      "epoch": 3.23130677847659,
      "grad_norm": 1.528868394326383,
      "learning_rate": 3.875137508612104e-06,
      "loss": 0.1447,
      "step": 578
    },
    {
      "epoch": 3.2424877707896576,
      "grad_norm": 1.6893708687857107,
      "learning_rate": 3.867759200218594e-06,
      "loss": 0.1746,
      "step": 580
    },
    {
      "epoch": 3.2536687631027252,
      "grad_norm": 1.2610411246909474,
      "learning_rate": 3.860363849496836e-06,
      "loss": 0.1301,
      "step": 582
    },
    {
      "epoch": 3.264849755415793,
      "grad_norm": 1.397542140556738,
      "learning_rate": 3.852951548593161e-06,
      "loss": 0.1373,
      "step": 584
    },
    {
      "epoch": 3.276030747728861,
      "grad_norm": 1.9903353672741917,
      "learning_rate": 3.845522389865106e-06,
      "loss": 0.1609,
      "step": 586
    },
    {
      "epoch": 3.2872117400419287,
      "grad_norm": 1.8370941337314268,
      "learning_rate": 3.838076465880248e-06,
      "loss": 0.148,
      "step": 588
    },
    {
      "epoch": 3.2983927323549964,
      "grad_norm": 2.058865100613852,
      "learning_rate": 3.830613869415069e-06,
      "loss": 0.1483,
      "step": 590
    },
    {
      "epoch": 3.309573724668064,
      "grad_norm": 1.5232253694216566,
      "learning_rate": 3.823134693453782e-06,
      "loss": 0.1621,
      "step": 592
    },
    {
      "epoch": 3.3207547169811322,
      "grad_norm": 1.4993049111722665,
      "learning_rate": 3.8156390311871885e-06,
      "loss": 0.1433,
      "step": 594
    },
    {
      "epoch": 3.3319357092942,
      "grad_norm": 1.555934394379587,
      "learning_rate": 3.808126976011505e-06,
      "loss": 0.1426,
      "step": 596
    },
    {
      "epoch": 3.3431167016072676,
      "grad_norm": 1.3356473446523094,
      "learning_rate": 3.8005986215272056e-06,
      "loss": 0.1706,
      "step": 598
    },
    {
      "epoch": 3.3542976939203353,
      "grad_norm": 1.9137688829035275,
      "learning_rate": 3.7930540615378565e-06,
      "loss": 0.1268,
      "step": 600
    },
    {
      "epoch": 3.3654786862334034,
      "grad_norm": 1.5344748040953766,
      "learning_rate": 3.785493390048942e-06,
      "loss": 0.1458,
      "step": 602
    },
    {
      "epoch": 3.376659678546471,
      "grad_norm": 1.602087497610558,
      "learning_rate": 3.777916701266699e-06,
      "loss": 0.1697,
      "step": 604
    },
    {
      "epoch": 3.3878406708595388,
      "grad_norm": 1.4842568873334896,
      "learning_rate": 3.7703240895969373e-06,
      "loss": 0.1519,
      "step": 606
    },
    {
      "epoch": 3.3990216631726065,
      "grad_norm": 1.53860971256147,
      "learning_rate": 3.7627156496438686e-06,
      "loss": 0.1691,
      "step": 608
    },
    {
      "epoch": 3.4102026554856746,
      "grad_norm": 1.4193083610134813,
      "learning_rate": 3.755091476208925e-06,
      "loss": 0.1211,
      "step": 610
    },
    {
      "epoch": 3.4213836477987423,
      "grad_norm": 1.8053625548432577,
      "learning_rate": 3.7474516642895804e-06,
      "loss": 0.131,
      "step": 612
    },
    {
      "epoch": 3.43256464011181,
      "grad_norm": 1.9235537907938398,
      "learning_rate": 3.7397963090781606e-06,
      "loss": 0.163,
      "step": 614
    },
    {
      "epoch": 3.4437456324248776,
      "grad_norm": 1.6022979215271898,
      "learning_rate": 3.732125505960665e-06,
      "loss": 0.1479,
      "step": 616
    },
    {
      "epoch": 3.4549266247379453,
      "grad_norm": 1.663918706474492,
      "learning_rate": 3.7244393505155713e-06,
      "loss": 0.1376,
      "step": 618
    },
    {
      "epoch": 3.4661076170510134,
      "grad_norm": 1.7974067820999995,
      "learning_rate": 3.716737938512651e-06,
      "loss": 0.1281,
      "step": 620
    },
    {
      "epoch": 3.477288609364081,
      "grad_norm": 2.10108609081228,
      "learning_rate": 3.709021365911772e-06,
      "loss": 0.1388,
      "step": 622
    },
    {
      "epoch": 3.488469601677149,
      "grad_norm": 1.367826215107555,
      "learning_rate": 3.701289728861701e-06,
      "loss": 0.1191,
      "step": 624
    },
    {
      "epoch": 3.4996505939902165,
      "grad_norm": 1.7959553374302317,
      "learning_rate": 3.693543123698913e-06,
      "loss": 0.1758,
      "step": 626
    },
    {
      "epoch": 3.5108315863032846,
      "grad_norm": 1.7389366148854988,
      "learning_rate": 3.6857816469463806e-06,
      "loss": 0.1405,
      "step": 628
    },
    {
      "epoch": 3.5220125786163523,
      "grad_norm": 2.871162474790627,
      "learning_rate": 3.6780053953123836e-06,
      "loss": 0.1549,
      "step": 630
    },
    {
      "epoch": 3.53319357092942,
      "grad_norm": 1.478751565339363,
      "learning_rate": 3.6702144656892907e-06,
      "loss": 0.1759,
      "step": 632
    },
    {
      "epoch": 3.5443745632424877,
      "grad_norm": 1.4974413518112613,
      "learning_rate": 3.662408955152364e-06,
      "loss": 0.1078,
      "step": 634
    },
    {
      "epoch": 3.5555555555555554,
      "grad_norm": 1.7006067350332152,
      "learning_rate": 3.6545889609585405e-06,
      "loss": 0.1427,
      "step": 636
    },
    {
      "epoch": 3.5667365478686235,
      "grad_norm": 1.8754398825641954,
      "learning_rate": 3.6467545805452266e-06,
      "loss": 0.1893,
      "step": 638
    },
    {
      "epoch": 3.577917540181691,
      "grad_norm": 1.7762501705151392,
      "learning_rate": 3.6389059115290813e-06,
      "loss": 0.1109,
      "step": 640
    },
    {
      "epoch": 3.589098532494759,
      "grad_norm": 2.0251975300449327,
      "learning_rate": 3.631043051704799e-06,
      "loss": 0.121,
      "step": 642
    },
    {
      "epoch": 3.6002795248078265,
      "grad_norm": 1.3531681902278672,
      "learning_rate": 3.6231660990438922e-06,
      "loss": 0.1348,
      "step": 644
    },
    {
      "epoch": 3.6114605171208947,
      "grad_norm": 1.9724391202631109,
      "learning_rate": 3.615275151693471e-06,
      "loss": 0.1449,
      "step": 646
    },
    {
      "epoch": 3.6226415094339623,
      "grad_norm": 1.785158595271644,
      "learning_rate": 3.6073703079750204e-06,
      "loss": 0.1485,
      "step": 648
    },
    {
      "epoch": 3.63382250174703,
      "grad_norm": 1.829166278099355,
      "learning_rate": 3.5994516663831734e-06,
      "loss": 0.1192,
      "step": 650
    },
    {
      "epoch": 3.6450034940600977,
      "grad_norm": 1.9222881871208803,
      "learning_rate": 3.591519325584487e-06,
      "loss": 0.1635,
      "step": 652
    },
    {
      "epoch": 3.6561844863731654,
      "grad_norm": 2.052453811112636,
      "learning_rate": 3.583573384416209e-06,
      "loss": 0.1561,
      "step": 654
    },
    {
      "epoch": 3.6673654786862335,
      "grad_norm": 1.9190051036571132,
      "learning_rate": 3.575613941885047e-06,
      "loss": 0.1051,
      "step": 656
    },
    {
      "epoch": 3.678546470999301,
      "grad_norm": 1.4736638642637576,
      "learning_rate": 3.5676410971659404e-06,
      "loss": 0.123,
      "step": 658
    },
    {
      "epoch": 3.689727463312369,
      "grad_norm": 1.7325761695268906,
      "learning_rate": 3.5596549496008165e-06,
      "loss": 0.1446,
      "step": 660
    },
    {
      "epoch": 3.700908455625437,
      "grad_norm": 2.0344810615726288,
      "learning_rate": 3.551655598697358e-06,
      "loss": 0.1629,
      "step": 662
    },
    {
      "epoch": 3.7120894479385047,
      "grad_norm": 1.936581123166174,
      "learning_rate": 3.54364314412776e-06,
      "loss": 0.1569,
      "step": 664
    },
    {
      "epoch": 3.7232704402515724,
      "grad_norm": 1.3525874354992642,
      "learning_rate": 3.535617685727494e-06,
      "loss": 0.1082,
      "step": 666
    },
    {
      "epoch": 3.73445143256464,
      "grad_norm": 1.6514309403224916,
      "learning_rate": 3.527579323494055e-06,
      "loss": 0.1431,
      "step": 668
    },
    {
      "epoch": 3.7456324248777078,
      "grad_norm": 1.8602451468342234,
      "learning_rate": 3.5195281575857228e-06,
      "loss": 0.1639,
      "step": 670
    },
    {
      "epoch": 3.7568134171907754,
      "grad_norm": 1.4731268992440232,
      "learning_rate": 3.511464288320311e-06,
      "loss": 0.1271,
      "step": 672
    },
    {
      "epoch": 3.7679944095038436,
      "grad_norm": 1.37724516129253,
      "learning_rate": 3.503387816173916e-06,
      "loss": 0.1597,
      "step": 674
    },
    {
      "epoch": 3.7791754018169113,
      "grad_norm": 1.7200144334067748,
      "learning_rate": 3.495298841779669e-06,
      "loss": 0.117,
      "step": 676
    },
    {
      "epoch": 3.790356394129979,
      "grad_norm": 1.92538314164391,
      "learning_rate": 3.4871974659264786e-06,
      "loss": 0.1584,
      "step": 678
    },
    {
      "epoch": 3.801537386443047,
      "grad_norm": 1.4718208788605616,
      "learning_rate": 3.4790837895577752e-06,
      "loss": 0.1333,
      "step": 680
    },
    {
      "epoch": 3.8127183787561147,
      "grad_norm": 1.5582481918696203,
      "learning_rate": 3.470957913770255e-06,
      "loss": 0.1464,
      "step": 682
    },
    {
      "epoch": 3.8238993710691824,
      "grad_norm": 1.4618275028428347,
      "learning_rate": 3.462819939812618e-06,
      "loss": 0.0995,
      "step": 684
    },
    {
      "epoch": 3.83508036338225,
      "grad_norm": 1.3366351935592664,
      "learning_rate": 3.4546699690843123e-06,
      "loss": 0.1204,
      "step": 686
    },
    {
      "epoch": 3.846261355695318,
| "grad_norm": 1.3780079667316787, | |
| "learning_rate": 3.446508103134259e-06, | |
| "loss": 0.1701, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 3.8574423480083855, | |
| "grad_norm": 1.7451718870626607, | |
| "learning_rate": 3.4383344436595992e-06, | |
| "loss": 0.1158, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 3.8686233403214536, | |
| "grad_norm": 2.019474198008684, | |
| "learning_rate": 3.430149092504422e-06, | |
| "loss": 0.1304, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 3.8798043326345213, | |
| "grad_norm": 1.6820935429062616, | |
| "learning_rate": 3.4219521516584912e-06, | |
| "loss": 0.1334, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 3.890985324947589, | |
| "grad_norm": 2.2578057319721236, | |
| "learning_rate": 3.4137437232559834e-06, | |
| "loss": 0.1557, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 3.902166317260657, | |
| "grad_norm": 1.3610116271561221, | |
| "learning_rate": 3.4055239095742067e-06, | |
| "loss": 0.1644, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 3.913347309573725, | |
| "grad_norm": 1.3397050224861815, | |
| "learning_rate": 3.3972928130323322e-06, | |
| "loss": 0.1471, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 3.9245283018867925, | |
| "grad_norm": 1.5234658664307734, | |
| "learning_rate": 3.3890505361901153e-06, | |
| "loss": 0.1195, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 3.93570929419986, | |
| "grad_norm": 1.763362220735128, | |
| "learning_rate": 3.380797181746619e-06, | |
| "loss": 0.1363, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 3.946890286512928, | |
| "grad_norm": 2.038986301246902, | |
| "learning_rate": 3.3725328525389324e-06, | |
| "loss": 0.1203, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 3.958071278825996, | |
| "grad_norm": 1.9046513315579439, | |
| "learning_rate": 3.364257651540891e-06, | |
| "loss": 0.1578, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 3.9692522711390636, | |
| "grad_norm": 1.423399143627221, | |
| "learning_rate": 3.355971681861794e-06, | |
| "loss": 0.1211, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 3.9804332634521313, | |
| "grad_norm": 1.5586817639667492, | |
| "learning_rate": 3.3476750467451176e-06, | |
| "loss": 0.153, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 3.991614255765199, | |
| "grad_norm": 1.4814888460752178, | |
| "learning_rate": 3.33936784956723e-06, | |
| "loss": 0.1288, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 4.002795248078267, | |
| "grad_norm": 1.6561127976965244, | |
| "learning_rate": 3.331050193836104e-06, | |
| "loss": 0.1196, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 4.013976240391335, | |
| "grad_norm": 1.8246755797846792, | |
| "learning_rate": 3.322722183190025e-06, | |
| "loss": 0.0983, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 4.0251572327044025, | |
| "grad_norm": 1.2508646883720782, | |
| "learning_rate": 3.3143839213963026e-06, | |
| "loss": 0.1132, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 4.03633822501747, | |
| "grad_norm": 1.3174073933660169, | |
| "learning_rate": 3.306035512349974e-06, | |
| "loss": 0.0886, | |
| "step": 722 | |
| }, | |
| { | |
| "epoch": 4.047519217330538, | |
| "grad_norm": 1.4006843207756257, | |
| "learning_rate": 3.297677060072513e-06, | |
| "loss": 0.0907, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 4.058700209643606, | |
| "grad_norm": 2.147633002379955, | |
| "learning_rate": 3.2893086687105324e-06, | |
| "loss": 0.0814, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 4.069881201956673, | |
| "grad_norm": 1.8499679148666142, | |
| "learning_rate": 3.280930442534486e-06, | |
| "loss": 0.0916, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 4.081062194269742, | |
| "grad_norm": 1.5576608674855401, | |
| "learning_rate": 3.272542485937369e-06, | |
| "loss": 0.0814, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 4.0922431865828095, | |
| "grad_norm": 1.5258204722757824, | |
| "learning_rate": 3.264144903433419e-06, | |
| "loss": 0.0929, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 4.103424178895877, | |
| "grad_norm": 1.2377371189448831, | |
| "learning_rate": 3.2557377996568135e-06, | |
| "loss": 0.0933, | |
| "step": 734 | |
| }, | |
| { | |
| "epoch": 4.114605171208945, | |
| "grad_norm": 1.6706792363129992, | |
| "learning_rate": 3.247321279360363e-06, | |
| "loss": 0.0957, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 4.1257861635220126, | |
| "grad_norm": 1.5205095000978939, | |
| "learning_rate": 3.238895447414211e-06, | |
| "loss": 0.1094, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 4.13696715583508, | |
| "grad_norm": 1.8218111131497405, | |
| "learning_rate": 3.2304604088045206e-06, | |
| "loss": 0.0866, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 4.148148148148148, | |
| "grad_norm": 1.5060146063158792, | |
| "learning_rate": 3.222016268632175e-06, | |
| "loss": 0.0974, | |
| "step": 742 | |
| }, | |
| { | |
| "epoch": 4.159329140461216, | |
| "grad_norm": 2.33394735696618, | |
| "learning_rate": 3.2135631321114603e-06, | |
| "loss": 0.0767, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 4.170510132774284, | |
| "grad_norm": 1.8304481485687374, | |
| "learning_rate": 3.2051011045687574e-06, | |
| "loss": 0.1027, | |
| "step": 746 | |
| }, | |
| { | |
| "epoch": 4.181691125087352, | |
| "grad_norm": 1.4496933516097028, | |
| "learning_rate": 3.196630291441231e-06, | |
| "loss": 0.073, | |
| "step": 748 | |
| }, | |
| { | |
| "epoch": 4.1928721174004195, | |
| "grad_norm": 1.5989097781751378, | |
| "learning_rate": 3.1881507982755126e-06, | |
| "loss": 0.074, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 4.204053109713487, | |
| "grad_norm": 1.5479651084913313, | |
| "learning_rate": 3.17966273072639e-06, | |
| "loss": 0.0941, | |
| "step": 752 | |
| }, | |
| { | |
| "epoch": 4.215234102026555, | |
| "grad_norm": 1.4844971201883568, | |
| "learning_rate": 3.1711661945554857e-06, | |
| "loss": 0.1171, | |
| "step": 754 | |
| }, | |
| { | |
| "epoch": 4.226415094339623, | |
| "grad_norm": 1.538555100844062, | |
| "learning_rate": 3.162661295629942e-06, | |
| "loss": 0.0839, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 4.23759608665269, | |
| "grad_norm": 1.511356916861757, | |
| "learning_rate": 3.154148139921102e-06, | |
| "loss": 0.1039, | |
| "step": 758 | |
| }, | |
| { | |
| "epoch": 4.248777078965758, | |
| "grad_norm": 1.811476489190878, | |
| "learning_rate": 3.1456268335031886e-06, | |
| "loss": 0.0794, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 4.259958071278826, | |
| "grad_norm": 1.6229333309674812, | |
| "learning_rate": 3.137097482551983e-06, | |
| "loss": 0.1152, | |
| "step": 762 | |
| }, | |
| { | |
| "epoch": 4.271139063591894, | |
| "grad_norm": 1.4723017587041405, | |
| "learning_rate": 3.128560193343501e-06, | |
| "loss": 0.0944, | |
| "step": 764 | |
| }, | |
| { | |
| "epoch": 4.282320055904962, | |
| "grad_norm": 1.0034690245189755, | |
| "learning_rate": 3.1200150722526693e-06, | |
| "loss": 0.0663, | |
| "step": 766 | |
| }, | |
| { | |
| "epoch": 4.29350104821803, | |
| "grad_norm": 1.5551415143149132, | |
| "learning_rate": 3.1114622257520004e-06, | |
| "loss": 0.1021, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 4.304682040531097, | |
| "grad_norm": 1.836559018121584, | |
| "learning_rate": 3.1029017604102655e-06, | |
| "loss": 0.099, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 4.315863032844165, | |
| "grad_norm": 1.0818921388079483, | |
| "learning_rate": 3.0943337828911673e-06, | |
| "loss": 0.0899, | |
| "step": 772 | |
| }, | |
| { | |
| "epoch": 4.327044025157233, | |
| "grad_norm": 0.9784785751112162, | |
| "learning_rate": 3.085758399952011e-06, | |
| "loss": 0.1016, | |
| "step": 774 | |
| }, | |
| { | |
| "epoch": 4.3382250174703, | |
| "grad_norm": 1.348338975607883, | |
| "learning_rate": 3.0771757184423716e-06, | |
| "loss": 0.1063, | |
| "step": 776 | |
| }, | |
| { | |
| "epoch": 4.349406009783368, | |
| "grad_norm": 2.1529902019434455, | |
| "learning_rate": 3.0685858453027668e-06, | |
| "loss": 0.089, | |
| "step": 778 | |
| }, | |
| { | |
| "epoch": 4.360587002096436, | |
| "grad_norm": 1.3031273077449874, | |
| "learning_rate": 3.0599888875633192e-06, | |
| "loss": 0.1077, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 4.371767994409504, | |
| "grad_norm": 1.3772043306307704, | |
| "learning_rate": 3.0513849523424298e-06, | |
| "loss": 0.0879, | |
| "step": 782 | |
| }, | |
| { | |
| "epoch": 4.382948986722572, | |
| "grad_norm": 1.7829225937512299, | |
| "learning_rate": 3.0427741468454375e-06, | |
| "loss": 0.1099, | |
| "step": 784 | |
| }, | |
| { | |
| "epoch": 4.39412997903564, | |
| "grad_norm": 1.1143653742483424, | |
| "learning_rate": 3.034156578363284e-06, | |
| "loss": 0.0908, | |
| "step": 786 | |
| }, | |
| { | |
| "epoch": 4.405310971348707, | |
| "grad_norm": 1.9841896768408593, | |
| "learning_rate": 3.0255323542711784e-06, | |
| "loss": 0.0846, | |
| "step": 788 | |
| }, | |
| { | |
| "epoch": 4.416491963661775, | |
| "grad_norm": 1.1622503242476587, | |
| "learning_rate": 3.0169015820272595e-06, | |
| "loss": 0.0809, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 4.427672955974843, | |
| "grad_norm": 1.4138977756081776, | |
| "learning_rate": 3.0082643691712572e-06, | |
| "loss": 0.0832, | |
| "step": 792 | |
| }, | |
| { | |
| "epoch": 4.43885394828791, | |
| "grad_norm": 1.3694425414816003, | |
| "learning_rate": 2.9996208233231506e-06, | |
| "loss": 0.1015, | |
| "step": 794 | |
| }, | |
| { | |
| "epoch": 4.450034940600978, | |
| "grad_norm": 1.8252502558409327, | |
| "learning_rate": 2.9909710521818265e-06, | |
| "loss": 0.1049, | |
| "step": 796 | |
| }, | |
| { | |
| "epoch": 4.461215932914046, | |
| "grad_norm": 1.4396307405101365, | |
| "learning_rate": 2.9823151635237424e-06, | |
| "loss": 0.0613, | |
| "step": 798 | |
| }, | |
| { | |
| "epoch": 4.472396925227114, | |
| "grad_norm": 1.3667673153541864, | |
| "learning_rate": 2.973653265201578e-06, | |
| "loss": 0.1081, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 4.483577917540182, | |
| "grad_norm": 1.761976942384573, | |
| "learning_rate": 2.964985465142895e-06, | |
| "loss": 0.1002, | |
| "step": 802 | |
| }, | |
| { | |
| "epoch": 4.49475890985325, | |
| "grad_norm": 1.6343471974417978, | |
| "learning_rate": 2.9563118713487895e-06, | |
| "loss": 0.0749, | |
| "step": 804 | |
| }, | |
| { | |
| "epoch": 4.505939902166317, | |
| "grad_norm": 2.0454570442431046, | |
| "learning_rate": 2.9476325918925484e-06, | |
| "loss": 0.0857, | |
| "step": 806 | |
| }, | |
| { | |
| "epoch": 4.517120894479385, | |
| "grad_norm": 1.7007295640066746, | |
| "learning_rate": 2.938947734918302e-06, | |
| "loss": 0.1085, | |
| "step": 808 | |
| }, | |
| { | |
| "epoch": 4.528301886792453, | |
| "grad_norm": 1.5611422829954795, | |
| "learning_rate": 2.9302574086396774e-06, | |
| "loss": 0.0775, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 4.53948287910552, | |
| "grad_norm": 1.7913016893140525, | |
| "learning_rate": 2.9215617213384494e-06, | |
| "loss": 0.0875, | |
| "step": 812 | |
| }, | |
| { | |
| "epoch": 4.550663871418588, | |
| "grad_norm": 1.5753063947599002, | |
| "learning_rate": 2.91286078136319e-06, | |
| "loss": 0.0805, | |
| "step": 814 | |
| }, | |
| { | |
| "epoch": 4.561844863731656, | |
| "grad_norm": 1.8942921897754963, | |
| "learning_rate": 2.904154697127921e-06, | |
| "loss": 0.0806, | |
| "step": 816 | |
| }, | |
| { | |
| "epoch": 4.573025856044724, | |
| "grad_norm": 1.791394910046461, | |
| "learning_rate": 2.8954435771107604e-06, | |
| "loss": 0.0992, | |
| "step": 818 | |
| }, | |
| { | |
| "epoch": 4.584206848357792, | |
| "grad_norm": 1.245790765054016, | |
| "learning_rate": 2.8867275298525743e-06, | |
| "loss": 0.0886, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 4.59538784067086, | |
| "grad_norm": 1.5133863011334676, | |
| "learning_rate": 2.878006663955621e-06, | |
| "loss": 0.0886, | |
| "step": 822 | |
| }, | |
| { | |
| "epoch": 4.606568832983927, | |
| "grad_norm": 2.0502622868705993, | |
| "learning_rate": 2.8692810880821997e-06, | |
| "loss": 0.0716, | |
| "step": 824 | |
| }, | |
| { | |
| "epoch": 4.617749825296995, | |
| "grad_norm": 1.2876873289352964, | |
| "learning_rate": 2.860550910953296e-06, | |
| "loss": 0.0943, | |
| "step": 826 | |
| }, | |
| { | |
| "epoch": 4.628930817610063, | |
| "grad_norm": 1.440475980645125, | |
| "learning_rate": 2.8518162413472266e-06, | |
| "loss": 0.1083, | |
| "step": 828 | |
| }, | |
| { | |
| "epoch": 4.64011180992313, | |
| "grad_norm": 1.3754262878787067, | |
| "learning_rate": 2.843077188098286e-06, | |
| "loss": 0.1041, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 4.651292802236198, | |
| "grad_norm": 1.4424213259038674, | |
| "learning_rate": 2.834333860095388e-06, | |
| "loss": 0.0807, | |
| "step": 832 | |
| }, | |
| { | |
| "epoch": 4.662473794549266, | |
| "grad_norm": 1.994638545215632, | |
| "learning_rate": 2.8255863662807097e-06, | |
| "loss": 0.0819, | |
| "step": 834 | |
| }, | |
| { | |
| "epoch": 4.673654786862334, | |
| "grad_norm": 1.5478645240921063, | |
| "learning_rate": 2.8168348156483356e-06, | |
| "loss": 0.113, | |
| "step": 836 | |
| }, | |
| { | |
| "epoch": 4.684835779175402, | |
| "grad_norm": 1.324879005941319, | |
| "learning_rate": 2.8124575531000226e-06, | |
| "loss": 0.11, | |
| "step": 838 | |
| }, | |
| { | |
| "epoch": 4.69601677148847, | |
| "grad_norm": 1.5993247352100177, | |
| "learning_rate": 2.803700121715214e-06, | |
| "loss": 0.0903, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 4.707197763801537, | |
| "grad_norm": 1.256541482417978, | |
| "learning_rate": 2.7949389062160946e-06, | |
| "loss": 0.0925, | |
| "step": 842 | |
| }, | |
| { | |
| "epoch": 4.718378756114605, | |
| "grad_norm": 2.706891920194882, | |
| "learning_rate": 2.786174015767721e-06, | |
| "loss": 0.084, | |
| "step": 844 | |
| }, | |
| { | |
| "epoch": 4.729559748427673, | |
| "grad_norm": 1.3220515828132557, | |
| "learning_rate": 2.7774055595809395e-06, | |
| "loss": 0.0801, | |
| "step": 846 | |
| }, | |
| { | |
| "epoch": 4.7407407407407405, | |
| "grad_norm": 1.5911477732332153, | |
| "learning_rate": 2.768633646911027e-06, | |
| "loss": 0.0938, | |
| "step": 848 | |
| }, | |
| { | |
| "epoch": 4.751921733053808, | |
| "grad_norm": 1.1333988378482527, | |
| "learning_rate": 2.759858387056325e-06, | |
| "loss": 0.0721, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 4.763102725366876, | |
| "grad_norm": 1.4690260920140663, | |
| "learning_rate": 2.7510798893568846e-06, | |
| "loss": 0.0769, | |
| "step": 852 | |
| }, | |
| { | |
| "epoch": 4.774283717679944, | |
| "grad_norm": 1.3785131166774844, | |
| "learning_rate": 2.742298263193099e-06, | |
| "loss": 0.1064, | |
| "step": 854 | |
| }, | |
| { | |
| "epoch": 4.785464709993012, | |
| "grad_norm": 1.39128795327872, | |
| "learning_rate": 2.733513617984342e-06, | |
| "loss": 0.075, | |
| "step": 856 | |
| }, | |
| { | |
| "epoch": 4.79664570230608, | |
| "grad_norm": 1.6826021403482612, | |
| "learning_rate": 2.724726063187605e-06, | |
| "loss": 0.1175, | |
| "step": 858 | |
| }, | |
| { | |
| "epoch": 4.8078266946191475, | |
| "grad_norm": 1.353741266830404, | |
| "learning_rate": 2.715935708296134e-06, | |
| "loss": 0.1146, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 4.819007686932215, | |
| "grad_norm": 1.4488179633464906, | |
| "learning_rate": 2.707142662838062e-06, | |
| "loss": 0.1033, | |
| "step": 862 | |
| }, | |
| { | |
| "epoch": 4.830188679245283, | |
| "grad_norm": 1.307354977462126, | |
| "learning_rate": 2.6983470363750497e-06, | |
| "loss": 0.093, | |
| "step": 864 | |
| }, | |
| { | |
| "epoch": 4.8413696715583505, | |
| "grad_norm": 1.4753004858703918, | |
| "learning_rate": 2.689548938500914e-06, | |
| "loss": 0.0905, | |
| "step": 866 | |
| }, | |
| { | |
| "epoch": 4.852550663871418, | |
| "grad_norm": 1.551558439927485, | |
| "learning_rate": 2.6807484788402676e-06, | |
| "loss": 0.075, | |
| "step": 868 | |
| }, | |
| { | |
| "epoch": 4.863731656184486, | |
| "grad_norm": 1.499892261020302, | |
| "learning_rate": 2.67194576704715e-06, | |
| "loss": 0.0876, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 4.8749126484975545, | |
| "grad_norm": 1.82643381640813, | |
| "learning_rate": 2.6631409128036637e-06, | |
| "loss": 0.0892, | |
| "step": 872 | |
| }, | |
| { | |
| "epoch": 4.886093640810622, | |
| "grad_norm": 1.3480606493487655, | |
| "learning_rate": 2.6543340258186063e-06, | |
| "loss": 0.0816, | |
| "step": 874 | |
| }, | |
| { | |
| "epoch": 4.89727463312369, | |
| "grad_norm": 2.2307067144092407, | |
| "learning_rate": 2.6455252158261015e-06, | |
| "loss": 0.0994, | |
| "step": 876 | |
| }, | |
| { | |
| "epoch": 4.9084556254367575, | |
| "grad_norm": 1.8646868858712458, | |
| "learning_rate": 2.636714592584235e-06, | |
| "loss": 0.0902, | |
| "step": 878 | |
| }, | |
| { | |
| "epoch": 4.919636617749825, | |
| "grad_norm": 1.535171207325978, | |
| "learning_rate": 2.6279022658736856e-06, | |
| "loss": 0.0911, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 4.930817610062893, | |
| "grad_norm": 1.1594360070916991, | |
| "learning_rate": 2.619088345496358e-06, | |
| "loss": 0.066, | |
| "step": 882 | |
| }, | |
| { | |
| "epoch": 4.941998602375961, | |
| "grad_norm": 1.6526631394475477, | |
| "learning_rate": 2.610272941274012e-06, | |
| "loss": 0.1014, | |
| "step": 884 | |
| }, | |
| { | |
| "epoch": 4.953179594689029, | |
| "grad_norm": 1.8240816325874138, | |
| "learning_rate": 2.6014561630468993e-06, | |
| "loss": 0.0928, | |
| "step": 886 | |
| }, | |
| { | |
| "epoch": 4.964360587002097, | |
| "grad_norm": 1.3816438884334348, | |
| "learning_rate": 2.5926381206723885e-06, | |
| "loss": 0.088, | |
| "step": 888 | |
| }, | |
| { | |
| "epoch": 4.9755415793151645, | |
| "grad_norm": 1.3157397283692482, | |
| "learning_rate": 2.583818924023601e-06, | |
| "loss": 0.0938, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 4.986722571628232, | |
| "grad_norm": 1.464557516575305, | |
| "learning_rate": 2.5749986829880423e-06, | |
| "loss": 0.0781, | |
| "step": 892 | |
| }, | |
| { | |
| "epoch": 4.9979035639413, | |
| "grad_norm": 1.8481309973872981, | |
| "learning_rate": 2.5661775074662276e-06, | |
| "loss": 0.0708, | |
| "step": 894 | |
| }, | |
| { | |
| "epoch": 5.0090845562543675, | |
| "grad_norm": 1.3777408578534927, | |
| "learning_rate": 2.5573555073703172e-06, | |
| "loss": 0.0574, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 5.020265548567435, | |
| "grad_norm": 1.5585565063610693, | |
| "learning_rate": 2.5485327926227464e-06, | |
| "loss": 0.0533, | |
| "step": 898 | |
| }, | |
| { | |
| "epoch": 5.031446540880503, | |
| "grad_norm": 3.8488829032344403, | |
| "learning_rate": 2.539709473154855e-06, | |
| "loss": 0.0524, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 5.042627533193571, | |
| "grad_norm": 1.360678519326562, | |
| "learning_rate": 2.5308856589055164e-06, | |
| "loss": 0.0608, | |
| "step": 902 | |
| }, | |
| { | |
| "epoch": 5.053808525506638, | |
| "grad_norm": 1.4720850175627471, | |
| "learning_rate": 2.5220614598197708e-06, | |
| "loss": 0.0527, | |
| "step": 904 | |
| }, | |
| { | |
| "epoch": 5.064989517819707, | |
| "grad_norm": 1.2412662972591795, | |
| "learning_rate": 2.513236985847451e-06, | |
| "loss": 0.0488, | |
| "step": 906 | |
| }, | |
| { | |
| "epoch": 5.0761705101327745, | |
| "grad_norm": 1.3236580966844242, | |
| "learning_rate": 2.5044123469418174e-06, | |
| "loss": 0.0638, | |
| "step": 908 | |
| }, | |
| { | |
| "epoch": 5.087351502445842, | |
| "grad_norm": 1.8348241342651854, | |
| "learning_rate": 2.495587653058184e-06, | |
| "loss": 0.0629, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 5.09853249475891, | |
| "grad_norm": 0.9662213920921242, | |
| "learning_rate": 2.4867630141525493e-06, | |
| "loss": 0.0722, | |
| "step": 912 | |
| }, | |
| { | |
| "epoch": 5.109713487071978, | |
| "grad_norm": 1.6784486385619315, | |
| "learning_rate": 2.477938540180231e-06, | |
| "loss": 0.0482, | |
| "step": 914 | |
| }, | |
| { | |
| "epoch": 5.120894479385045, | |
| "grad_norm": 1.386742744607905, | |
| "learning_rate": 2.4691143410944844e-06, | |
| "loss": 0.0596, | |
| "step": 916 | |
| }, | |
| { | |
| "epoch": 5.132075471698113, | |
| "grad_norm": 1.5375835898995094, | |
| "learning_rate": 2.4602905268451455e-06, | |
| "loss": 0.0592, | |
| "step": 918 | |
| }, | |
| { | |
| "epoch": 5.143256464011181, | |
| "grad_norm": 1.334707574114043, | |
| "learning_rate": 2.451467207377254e-06, | |
| "loss": 0.0493, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 5.154437456324249, | |
| "grad_norm": 1.018606004126685, | |
| "learning_rate": 2.442644492629683e-06, | |
| "loss": 0.0544, | |
| "step": 922 | |
| }, | |
| { | |
| "epoch": 5.165618448637317, | |
| "grad_norm": 1.0236510244569192, | |
| "learning_rate": 2.433822492533774e-06, | |
| "loss": 0.0501, | |
| "step": 924 | |
| }, | |
| { | |
| "epoch": 5.176799440950385, | |
| "grad_norm": 0.8191759766926784, | |
| "learning_rate": 2.4250013170119585e-06, | |
| "loss": 0.0594, | |
| "step": 926 | |
| }, | |
| { | |
| "epoch": 5.187980433263452, | |
| "grad_norm": 1.0938612787512558, | |
| "learning_rate": 2.4161810759763993e-06, | |
| "loss": 0.0544, | |
| "step": 928 | |
| }, | |
| { | |
| "epoch": 5.19916142557652, | |
| "grad_norm": 1.3602285379082586, | |
| "learning_rate": 2.407361879327612e-06, | |
| "loss": 0.0442, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 5.210342417889588, | |
| "grad_norm": 1.1380441045618945, | |
| "learning_rate": 2.398543836953101e-06, | |
| "loss": 0.0563, | |
| "step": 932 | |
| }, | |
| { | |
| "epoch": 5.221523410202655, | |
| "grad_norm": 1.1080478505241853, | |
| "learning_rate": 2.389727058725989e-06, | |
| "loss": 0.0515, | |
| "step": 934 | |
| }, | |
| { | |
| "epoch": 5.232704402515723, | |
| "grad_norm": 1.2558697950305333, | |
| "learning_rate": 2.380911654503643e-06, | |
| "loss": 0.0507, | |
| "step": 936 | |
| }, | |
| { | |
| "epoch": 5.243885394828791, | |
| "grad_norm": 1.2293644348010904, | |
| "learning_rate": 2.3720977341263152e-06, | |
| "loss": 0.0607, | |
| "step": 938 | |
| }, | |
| { | |
| "epoch": 5.255066387141859, | |
| "grad_norm": 1.292488994918762, | |
| "learning_rate": 2.3632854074157653e-06, | |
| "loss": 0.0474, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 5.266247379454927, | |
| "grad_norm": 1.2671492916227067, | |
| "learning_rate": 2.3544747841738998e-06, | |
| "loss": 0.0769, | |
| "step": 942 | |
| }, | |
| { | |
| "epoch": 5.277428371767995, | |
| "grad_norm": 1.6102887076835615, | |
| "learning_rate": 2.3456659741813945e-06, | |
| "loss": 0.0496, | |
| "step": 944 | |
| }, | |
| { | |
| "epoch": 5.288609364081062, | |
| "grad_norm": 1.577997048333656, | |
| "learning_rate": 2.3368590871963367e-06, | |
| "loss": 0.0796, | |
| "step": 946 | |
| }, | |
| { | |
| "epoch": 5.29979035639413, | |
| "grad_norm": 2.278441135480121, | |
| "learning_rate": 2.328054232952851e-06, | |
| "loss": 0.0679, | |
| "step": 948 | |
| }, | |
| { | |
| "epoch": 5.310971348707198, | |
| "grad_norm": 1.1443796744340577, | |
| "learning_rate": 2.3192515211597332e-06, | |
| "loss": 0.0589, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 5.322152341020265, | |
| "grad_norm": 1.3246252050774938, | |
| "learning_rate": 2.3104510614990875e-06, | |
| "loss": 0.0711, | |
| "step": 952 | |
| }, | |
| { | |
| "epoch": 5.333333333333333, | |
| "grad_norm": 2.3404125762291574, | |
| "learning_rate": 2.301652963624951e-06, | |
| "loss": 0.0571, | |
| "step": 954 | |
| }, | |
| { | |
| "epoch": 5.344514325646401, | |
| "grad_norm": 1.6173224098499974, | |
| "learning_rate": 2.292857337161938e-06, | |
| "loss": 0.0715, | |
| "step": 956 | |
| }, | |
| { | |
| "epoch": 5.355695317959469, | |
| "grad_norm": 1.416375080557459, | |
| "learning_rate": 2.2840642917038666e-06, | |
| "loss": 0.0555, | |
| "step": 958 | |
| }, | |
| { | |
| "epoch": 5.366876310272537, | |
| "grad_norm": 1.2819320119071211, | |
| "learning_rate": 2.2752739368123948e-06, | |
| "loss": 0.0486, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 5.378057302585605, | |
| "grad_norm": 1.1198977788924485, | |
| "learning_rate": 2.2664863820156593e-06, | |
| "loss": 0.0408, | |
| "step": 962 | |
| }, | |
| { | |
| "epoch": 5.389238294898672, | |
| "grad_norm": 1.1451798114445098, | |
| "learning_rate": 2.2577017368069017e-06, | |
| "loss": 0.0626, | |
| "step": 964 | |
| }, | |
| { | |
| "epoch": 5.40041928721174, | |
| "grad_norm": 1.3380127274735694, | |
| "learning_rate": 2.248920110643116e-06, | |
| "loss": 0.0568, | |
| "step": 966 | |
| }, | |
| { | |
| "epoch": 5.411600279524808, | |
| "grad_norm": 1.4489239240672898, | |
| "learning_rate": 2.2401416129436753e-06, | |
| "loss": 0.059, | |
| "step": 968 | |
| }, | |
| { | |
| "epoch": 5.422781271837875, | |
| "grad_norm": 1.3130908635170957, | |
| "learning_rate": 2.2313663530889734e-06, | |
| "loss": 0.0444, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 5.433962264150943, | |
| "grad_norm": 1.2045728193533076, | |
| "learning_rate": 2.222594440419061e-06, | |
| "loss": 0.0952, | |
| "step": 972 | |
| }, | |
| { | |
| "epoch": 5.445143256464011, | |
| "grad_norm": 1.1505612686257871, | |
| "learning_rate": 2.2138259842322794e-06, | |
| "loss": 0.0536, | |
| "step": 974 | |
| }, | |
| { | |
| "epoch": 5.456324248777079, | |
| "grad_norm": 1.521719008832957, | |
| "learning_rate": 2.2050610937839058e-06, | |
| "loss": 0.073, | |
| "step": 976 | |
| }, | |
| { | |
| "epoch": 5.467505241090147, | |
| "grad_norm": 1.3381824532405695, | |
| "learning_rate": 2.1962998782847863e-06, | |
| "loss": 0.0583, | |
| "step": 978 | |
| }, | |
| { | |
| "epoch": 5.478686233403215, | |
| "grad_norm": 1.1782879600371732, | |
| "learning_rate": 2.1875424468999787e-06, | |
| "loss": 0.052, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 5.489867225716282, | |
| "grad_norm": 1.1689516819440322, | |
| "learning_rate": 2.178788908747387e-06, | |
| "loss": 0.0515, | |
| "step": 982 | |
| }, | |
| { | |
| "epoch": 5.50104821802935, | |
| "grad_norm": 1.1479989981730907, | |
| "learning_rate": 2.170039372896409e-06, | |
| "loss": 0.055, | |
| "step": 984 | |
| }, | |
| { | |
| "epoch": 5.512229210342418, | |
| "grad_norm": 1.3922562574409854, | |
| "learning_rate": 2.161293948366573e-06, | |
| "loss": 0.0554, | |
| "step": 986 | |
| }, | |
| { | |
| "epoch": 5.523410202655485, | |
| "grad_norm": 1.409490849880991, | |
| "learning_rate": 2.152552744126178e-06, | |
| "loss": 0.0392, | |
| "step": 988 | |
| }, | |
| { | |
| "epoch": 5.534591194968553, | |
| "grad_norm": 1.2479629003574995, | |
| "learning_rate": 2.1438158690909413e-06, | |
| "loss": 0.0599, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 5.545772187281621, | |
| "grad_norm": 1.2371376050465024, | |
| "learning_rate": 2.1350834321226344e-06, | |
| "loss": 0.0664, | |
| "step": 992 | |
| }, | |
| { | |
| "epoch": 5.556953179594689, | |
| "grad_norm": 1.593505278104288, | |
| "learning_rate": 2.126355542027734e-06, | |
| "loss": 0.0479, | |
| "step": 994 | |
| }, | |
| { | |
| "epoch": 5.568134171907757, | |
| "grad_norm": 1.2742537988695015, | |
| "learning_rate": 2.117632307556059e-06, | |
| "loss": 0.0803, | |
| "step": 996 | |
| }, | |
| { | |
| "epoch": 5.579315164220825, | |
| "grad_norm": 1.3748039610126324, | |
| "learning_rate": 2.1089138373994226e-06, | |
| "loss": 0.0416, | |
| "step": 998 | |
| }, | |
| { | |
| "epoch": 5.590496156533892, | |
| "grad_norm": 2.4084571636039755, | |
| "learning_rate": 2.100200240190273e-06, | |
| "loss": 0.0514, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 5.60167714884696, | |
| "grad_norm": 1.1933752040503858, | |
| "learning_rate": 2.09149162450034e-06, | |
| "loss": 0.0625, | |
| "step": 1002 | |
| }, | |
| { | |
| "epoch": 5.612858141160028, | |
| "grad_norm": 1.037709039674537, | |
| "learning_rate": 2.0827880988392856e-06, | |
| "loss": 0.0514, | |
| "step": 1004 | |
| }, | |
| { | |
| "epoch": 5.6240391334730955, | |
| "grad_norm": 1.315142680072312, | |
| "learning_rate": 2.0740897716533475e-06, | |
| "loss": 0.0593, | |
| "step": 1006 | |
| }, | |
| { | |
| "epoch": 5.635220125786163, | |
| "grad_norm": 1.0531660230737552, | |
| "learning_rate": 2.0653967513239934e-06, | |
| "loss": 0.0543, | |
| "step": 1008 | |
| }, | |
| { | |
| "epoch": 5.646401118099231, | |
| "grad_norm": 1.2633776013551097, | |
| "learning_rate": 2.0567091461665636e-06, | |
| "loss": 0.0431, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 5.657582110412299, | |
| "grad_norm": 1.449959564050197, | |
| "learning_rate": 2.0480270644289282e-06, | |
| "loss": 0.0482, | |
| "step": 1012 | |
| }, | |
| { | |
| "epoch": 5.668763102725367, | |
| "grad_norm": 1.1071912059302882, | |
| "learning_rate": 2.0393506142901347e-06, | |
| "loss": 0.0564, | |
| "step": 1014 | |
| }, | |
| { | |
| "epoch": 5.679944095038435, | |
| "grad_norm": 0.9876137346535111, | |
| "learning_rate": 2.0306799038590595e-06, | |
| "loss": 0.0391, | |
| "step": 1016 | |
| }, | |
| { | |
| "epoch": 5.6911250873515025, | |
| "grad_norm": 1.1071464038310999, | |
| "learning_rate": 2.0220150411730638e-06, | |
| "loss": 0.0636, | |
| "step": 1018 | |
| }, | |
| { | |
| "epoch": 5.70230607966457, | |
| "grad_norm": 1.0473491285671832, | |
| "learning_rate": 2.013356134196643e-06, | |
| "loss": 0.0581, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 5.713487071977638, | |
| "grad_norm": 1.1296902267336801, | |
| "learning_rate": 2.004703290820086e-06, | |
| "loss": 0.0604, | |
| "step": 1022 | |
| }, | |
| { | |
| "epoch": 5.7246680642907055, | |
| "grad_norm": 1.309317661735025, | |
| "learning_rate": 1.9960566188581306e-06, | |
| "loss": 0.0438, | |
| "step": 1024 | |
| }, | |
| { | |
| "epoch": 5.735849056603773, | |
| "grad_norm": 0.8918766336417149, | |
| "learning_rate": 1.9874162260486146e-06, | |
| "loss": 0.0475, | |
| "step": 1026 | |
| }, | |
| { | |
| "epoch": 5.747030048916841, | |
| "grad_norm": 1.2095534019736167, | |
| "learning_rate": 1.978782220051142e-06, | |
| "loss": 0.0454, | |
| "step": 1028 | |
| }, | |
| { | |
| "epoch": 5.7582110412299095, | |
| "grad_norm": 1.1967009451687045, | |
| "learning_rate": 1.9701547084457314e-06, | |
| "loss": 0.0697, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 5.769392033542977, | |
| "grad_norm": 1.8160556667087309, | |
| "learning_rate": 1.961533798731486e-06, | |
| "loss": 0.0422, | |
| "step": 1032 | |
| }, | |
| { | |
| "epoch": 5.780573025856045, | |
| "grad_norm": 1.590627053883797, | |
| "learning_rate": 1.952919598325247e-06, | |
| "loss": 0.0602, | |
| "step": 1034 | |
| }, | |
| { | |
| "epoch": 5.7917540181691125, | |
| "grad_norm": 1.4584761134724722, | |
| "learning_rate": 1.944312214560256e-06, | |
| "loss": 0.0575, | |
| "step": 1036 | |
| }, | |
| { | |
| "epoch": 5.80293501048218, | |
| "grad_norm": 1.6093909025543798, | |
| "learning_rate": 1.935711754684824e-06, | |
| "loss": 0.0814, | |
| "step": 1038 | |
| }, | |
| { | |
| "epoch": 5.814116002795248, | |
| "grad_norm": 1.7715253484509736, | |
| "learning_rate": 1.9271183258609836e-06, | |
| "loss": 0.0608, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 5.825296995108316, | |
| "grad_norm": 0.850327251905485, | |
| "learning_rate": 1.9185320351631654e-06, | |
| "loss": 0.0388, | |
| "step": 1042 | |
| }, | |
| { | |
| "epoch": 5.836477987421384, | |
| "grad_norm": 1.4837292387797913, | |
| "learning_rate": 1.9099529895768552e-06, | |
| "loss": 0.0567, | |
| "step": 1044 | |
| }, | |
| { | |
| "epoch": 5.847658979734452, | |
| "grad_norm": 1.0384213631474088, | |
| "learning_rate": 1.901381295997267e-06, | |
| "loss": 0.0661, | |
| "step": 1046 | |
| }, | |
| { | |
| "epoch": 5.8588399720475195, | |
| "grad_norm": 1.2071171218984706, | |
| "learning_rate": 1.8928170612280067e-06, | |
| "loss": 0.0665, | |
| "step": 1048 | |
| }, | |
| { | |
| "epoch": 5.870020964360587, | |
| "grad_norm": 1.2020194163974407, | |
| "learning_rate": 1.8842603919797436e-06, | |
| "loss": 0.0466, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 5.881201956673655, | |
| "grad_norm": 1.141150946131999, | |
| "learning_rate": 1.8757113948688827e-06, | |
| "loss": 0.0562, | |
| "step": 1052 | |
| }, | |
| { | |
| "epoch": 5.8923829489867225, | |
| "grad_norm": 1.583487458549684, | |
| "learning_rate": 1.8671701764162287e-06, | |
| "loss": 0.0589, | |
| "step": 1054 | |
| }, | |
| { | |
| "epoch": 5.90356394129979, | |
| "grad_norm": 1.3417276690702418, | |
| "learning_rate": 1.8586368430456708e-06, | |
| "loss": 0.0604, | |
| "step": 1056 | |
| }, | |
| { | |
| "epoch": 5.914744933612858, | |
| "grad_norm": 1.3294273305641617, | |
| "learning_rate": 1.8501115010828423e-06, | |
| "loss": 0.0628, | |
| "step": 1058 | |
| }, | |
| { | |
| "epoch": 5.925925925925926, | |
| "grad_norm": 1.2448945324282268, | |
| "learning_rate": 1.8415942567538106e-06, | |
| "loss": 0.0554, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 5.937106918238994, | |
| "grad_norm": 0.960687093766239, | |
| "learning_rate": 1.8330852161837399e-06, | |
| "loss": 0.0532, | |
| "step": 1062 | |
| }, | |
| { | |
| "epoch": 5.948287910552062, | |
| "grad_norm": 1.4656893110825278, | |
| "learning_rate": 1.8245844853955786e-06, | |
| "loss": 0.0719, | |
| "step": 1064 | |
| }, | |
| { | |
| "epoch": 5.9594689028651295, | |
| "grad_norm": 1.6634277575338297, | |
| "learning_rate": 1.8160921703087368e-06, | |
| "loss": 0.0565, | |
| "step": 1066 | |
| }, | |
| { | |
| "epoch": 5.970649895178197, | |
| "grad_norm": 1.7257111050609335, | |
| "learning_rate": 1.8076083767377595e-06, | |
| "loss": 0.068, | |
| "step": 1068 | |
| }, | |
| { | |
| "epoch": 5.981830887491265, | |
| "grad_norm": 1.42483183153276, | |
| "learning_rate": 1.7991332103910184e-06, | |
| "loss": 0.0613, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 5.993011879804333, | |
| "grad_norm": 1.4316025881020678, | |
| "learning_rate": 1.7906667768693853e-06, | |
| "loss": 0.0481, | |
| "step": 1072 | |
| }, | |
| { | |
| "epoch": 6.0041928721174, | |
| "grad_norm": 1.037376667784287, | |
| "learning_rate": 1.782209181664924e-06, | |
| "loss": 0.0483, | |
| "step": 1074 | |
| }, | |
| { | |
| "epoch": 6.015373864430468, | |
| "grad_norm": 1.0336168566598631, | |
| "learning_rate": 1.773760530159571e-06, | |
| "loss": 0.0347, | |
| "step": 1076 | |
| }, | |
| { | |
| "epoch": 6.026554856743536, | |
| "grad_norm": 0.7872905184564322, | |
| "learning_rate": 1.7653209276238242e-06, | |
| "loss": 0.0355, | |
| "step": 1078 | |
| }, | |
| { | |
| "epoch": 6.037735849056604, | |
| "grad_norm": 1.772389302776251, | |
| "learning_rate": 1.7568904792154328e-06, | |
| "loss": 0.0542, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 6.048916841369672, | |
| "grad_norm": 1.3577848873845724, | |
| "learning_rate": 1.7484692899780812e-06, | |
| "loss": 0.0583, | |
| "step": 1082 | |
| }, | |
| { | |
| "epoch": 6.06009783368274, | |
| "grad_norm": 0.7840766650439943, | |
| "learning_rate": 1.740057464840088e-06, | |
| "loss": 0.0289, | |
| "step": 1084 | |
| }, | |
| { | |
| "epoch": 6.071278825995807, | |
| "grad_norm": 0.9255675051401594, | |
| "learning_rate": 1.7316551086130925e-06, | |
| "loss": 0.0417, | |
| "step": 1086 | |
| }, | |
| { | |
| "epoch": 6.082459818308875, | |
| "grad_norm": 0.9107219582827843, | |
| "learning_rate": 1.7232623259907538e-06, | |
| "loss": 0.0429, | |
| "step": 1088 | |
| }, | |
| { | |
| "epoch": 6.093640810621943, | |
| "grad_norm": 1.0296310110561282, | |
| "learning_rate": 1.714879221547439e-06, | |
| "loss": 0.0362, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 6.10482180293501, | |
| "grad_norm": 0.9575340239366315, | |
| "learning_rate": 1.7065058997369288e-06, | |
| "loss": 0.0471, | |
| "step": 1092 | |
| }, | |
| { | |
| "epoch": 6.116002795248078, | |
| "grad_norm": 0.7430183397758778, | |
| "learning_rate": 1.6981424648911112e-06, | |
| "loss": 0.0351, | |
| "step": 1094 | |
| }, | |
| { | |
| "epoch": 6.127183787561146, | |
| "grad_norm": 0.9807593854080312, | |
| "learning_rate": 1.6897890212186804e-06, | |
| "loss": 0.0334, | |
| "step": 1096 | |
| }, | |
| { | |
| "epoch": 6.138364779874214, | |
| "grad_norm": 1.2961448011313597, | |
| "learning_rate": 1.6814456728038431e-06, | |
| "loss": 0.025, | |
| "step": 1098 | |
| }, | |
| { | |
| "epoch": 6.149545772187282, | |
| "grad_norm": 0.961636779671174, | |
| "learning_rate": 1.673112523605015e-06, | |
| "loss": 0.0285, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 6.16072676450035, | |
| "grad_norm": 0.9647606646620928, | |
| "learning_rate": 1.6647896774535324e-06, | |
| "loss": 0.0303, | |
| "step": 1102 | |
| }, | |
| { | |
| "epoch": 6.171907756813417, | |
| "grad_norm": 1.1381988477100318, | |
| "learning_rate": 1.6564772380523546e-06, | |
| "loss": 0.0358, | |
| "step": 1104 | |
| }, | |
| { | |
| "epoch": 6.183088749126485, | |
| "grad_norm": 0.7901346245952422, | |
| "learning_rate": 1.648175308974771e-06, | |
| "loss": 0.0279, | |
| "step": 1106 | |
| }, | |
| { | |
| "epoch": 6.194269741439553, | |
| "grad_norm": 1.2717247572933381, | |
| "learning_rate": 1.6398839936631142e-06, | |
| "loss": 0.0328, | |
| "step": 1108 | |
| }, | |
| { | |
| "epoch": 6.20545073375262, | |
| "grad_norm": 1.2916496315117834, | |
| "learning_rate": 1.631603395427466e-06, | |
| "loss": 0.055, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 6.216631726065688, | |
| "grad_norm": 0.9740099844597652, | |
| "learning_rate": 1.6233336174443762e-06, | |
| "loss": 0.048, | |
| "step": 1112 | |
| }, | |
| { | |
| "epoch": 6.227812718378756, | |
| "grad_norm": 1.0103830292004847, | |
| "learning_rate": 1.6150747627555713e-06, | |
| "loss": 0.0434, | |
| "step": 1114 | |
| }, | |
| { | |
| "epoch": 6.238993710691824, | |
| "grad_norm": 1.1350854047223082, | |
| "learning_rate": 1.6068269342666749e-06, | |
| "loss": 0.0389, | |
| "step": 1116 | |
| }, | |
| { | |
| "epoch": 6.250174703004892, | |
| "grad_norm": 0.7884154494279628, | |
| "learning_rate": 1.5985902347459239e-06, | |
| "loss": 0.0432, | |
| "step": 1118 | |
| }, | |
| { | |
| "epoch": 6.26135569531796, | |
| "grad_norm": 0.8788178903528164, | |
| "learning_rate": 1.5903647668228855e-06, | |
| "loss": 0.0432, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 6.272536687631027, | |
| "grad_norm": 0.6393918351108393, | |
| "learning_rate": 1.5821506329871834e-06, | |
| "loss": 0.0253, | |
| "step": 1122 | |
| }, | |
| { | |
| "epoch": 6.283717679944095, | |
| "grad_norm": 1.0870268262489273, | |
| "learning_rate": 1.5739479355872162e-06, | |
| "loss": 0.0364, | |
| "step": 1124 | |
| }, | |
| { | |
| "epoch": 6.294898672257163, | |
| "grad_norm": 1.1679875063936556, | |
| "learning_rate": 1.5657567768288868e-06, | |
| "loss": 0.0333, | |
| "step": 1126 | |
| }, | |
| { | |
| "epoch": 6.30607966457023, | |
| "grad_norm": 0.8388447320245327, | |
| "learning_rate": 1.5575772587743222e-06, | |
| "loss": 0.0316, | |
| "step": 1128 | |
| }, | |
| { | |
| "epoch": 6.317260656883298, | |
| "grad_norm": 0.7710273725047172, | |
| "learning_rate": 1.5494094833406092e-06, | |
| "loss": 0.0308, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 6.328441649196366, | |
| "grad_norm": 1.3107972415612894, | |
| "learning_rate": 1.5412535522985205e-06, | |
| "loss": 0.0186, | |
| "step": 1132 | |
| }, | |
| { | |
| "epoch": 6.339622641509434, | |
| "grad_norm": 0.8488196487806184, | |
| "learning_rate": 1.5331095672712463e-06, | |
| "loss": 0.023, | |
| "step": 1134 | |
| }, | |
| { | |
| "epoch": 6.350803633822502, | |
| "grad_norm": 1.014050814471419, | |
| "learning_rate": 1.5249776297331302e-06, | |
| "loss": 0.0425, | |
| "step": 1136 | |
| }, | |
| { | |
| "epoch": 6.36198462613557, | |
| "grad_norm": 0.8160528908459946, | |
| "learning_rate": 1.516857841008401e-06, | |
| "loss": 0.0407, | |
| "step": 1138 | |
| }, | |
| { | |
| "epoch": 6.373165618448637, | |
| "grad_norm": 0.6924190623075557, | |
| "learning_rate": 1.5087503022699168e-06, | |
| "loss": 0.0527, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 6.384346610761705, | |
| "grad_norm": 1.0149043689805195, | |
| "learning_rate": 1.5006551145378967e-06, | |
| "loss": 0.0367, | |
| "step": 1142 | |
| }, | |
| { | |
| "epoch": 6.395527603074773, | |
| "grad_norm": 1.5920991707794845, | |
| "learning_rate": 1.4925723786786691e-06, | |
| "loss": 0.0319, | |
| "step": 1144 | |
| }, | |
| { | |
| "epoch": 6.40670859538784, | |
| "grad_norm": 0.8834798218634231, | |
| "learning_rate": 1.4845021954034106e-06, | |
| "loss": 0.0372, | |
| "step": 1146 | |
| }, | |
| { | |
| "epoch": 6.417889587700908, | |
| "grad_norm": 1.072104658850445, | |
| "learning_rate": 1.476444665266889e-06, | |
| "loss": 0.0413, | |
| "step": 1148 | |
| }, | |
| { | |
| "epoch": 6.429070580013976, | |
| "grad_norm": 1.1893734124292998, | |
| "learning_rate": 1.4683998886662187e-06, | |
| "loss": 0.0307, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 6.440251572327044, | |
| "grad_norm": 1.1513167005422524, | |
| "learning_rate": 1.4603679658396006e-06, | |
| "loss": 0.0402, | |
| "step": 1152 | |
| }, | |
| { | |
| "epoch": 6.451432564640112, | |
| "grad_norm": 1.0586602700365229, | |
| "learning_rate": 1.4523489968650795e-06, | |
| "loss": 0.0303, | |
| "step": 1154 | |
| }, | |
| { | |
| "epoch": 6.46261355695318, | |
| "grad_norm": 0.7650987855999634, | |
| "learning_rate": 1.4443430816592936e-06, | |
| "loss": 0.0312, | |
| "step": 1156 | |
| }, | |
| { | |
| "epoch": 6.473794549266247, | |
| "grad_norm": 0.7470083708652993, | |
| "learning_rate": 1.4363503199762296e-06, | |
| "loss": 0.0298, | |
| "step": 1158 | |
| }, | |
| { | |
| "epoch": 6.484975541579315, | |
| "grad_norm": 1.2247183517462086, | |
| "learning_rate": 1.4283708114059853e-06, | |
| "loss": 0.0476, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 6.496156533892383, | |
| "grad_norm": 1.0042001049340177, | |
| "learning_rate": 1.4204046553735174e-06, | |
| "loss": 0.0421, | |
| "step": 1162 | |
| }, | |
| { | |
| "epoch": 6.5073375262054505, | |
| "grad_norm": 1.0066856707214424, | |
| "learning_rate": 1.4124519511374158e-06, | |
| "loss": 0.0277, | |
| "step": 1164 | |
| }, | |
| { | |
| "epoch": 6.518518518518518, | |
| "grad_norm": 1.3761888161849996, | |
| "learning_rate": 1.404512797788657e-06, | |
| "loss": 0.0251, | |
| "step": 1166 | |
| }, | |
| { | |
| "epoch": 6.529699510831586, | |
| "grad_norm": 0.7445041473181229, | |
| "learning_rate": 1.396587294249374e-06, | |
| "loss": 0.0383, | |
| "step": 1168 | |
| }, | |
| { | |
| "epoch": 6.540880503144654, | |
| "grad_norm": 1.0231799225570892, | |
| "learning_rate": 1.3886755392716225e-06, | |
| "loss": 0.0289, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 6.552061495457722, | |
| "grad_norm": 1.0842064444530823, | |
| "learning_rate": 1.3807776314361498e-06, | |
| "loss": 0.0341, | |
| "step": 1172 | |
| }, | |
| { | |
| "epoch": 6.56324248777079, | |
| "grad_norm": 0.9409388421938562, | |
| "learning_rate": 1.3728936691511704e-06, | |
| "loss": 0.0413, | |
| "step": 1174 | |
| }, | |
| { | |
| "epoch": 6.5744234800838575, | |
| "grad_norm": 0.8052329748698783, | |
| "learning_rate": 1.3650237506511333e-06, | |
| "loss": 0.0399, | |
| "step": 1176 | |
| }, | |
| { | |
| "epoch": 6.585604472396925, | |
| "grad_norm": 0.6879172446908371, | |
| "learning_rate": 1.3571679739955029e-06, | |
| "loss": 0.0288, | |
| "step": 1178 | |
| }, | |
| { | |
| "epoch": 6.596785464709993, | |
| "grad_norm": 0.8737080494275846, | |
| "learning_rate": 1.3493264370675352e-06, | |
| "loss": 0.0181, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 6.6079664570230605, | |
| "grad_norm": 0.8744184416405667, | |
| "learning_rate": 1.3414992375730587e-06, | |
| "loss": 0.0432, | |
| "step": 1182 | |
| }, | |
| { | |
| "epoch": 6.619147449336128, | |
| "grad_norm": 0.9265074156931595, | |
| "learning_rate": 1.3336864730392587e-06, | |
| "loss": 0.0464, | |
| "step": 1184 | |
| }, | |
| { | |
| "epoch": 6.630328441649196, | |
| "grad_norm": 1.14003149718633, | |
| "learning_rate": 1.3258882408134582e-06, | |
| "loss": 0.0271, | |
| "step": 1186 | |
| }, | |
| { | |
| "epoch": 6.6415094339622645, | |
| "grad_norm": 0.8949105583359471, | |
| "learning_rate": 1.3181046380619078e-06, | |
| "loss": 0.0276, | |
| "step": 1188 | |
| }, | |
| { | |
| "epoch": 6.652690426275332, | |
| "grad_norm": 1.0602768370905677, | |
| "learning_rate": 1.3103357617685746e-06, | |
| "loss": 0.0352, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 6.6638714185884, | |
| "grad_norm": 1.187406942024327, | |
| "learning_rate": 1.3025817087339335e-06, | |
| "loss": 0.0597, | |
| "step": 1192 | |
| }, | |
| { | |
| "epoch": 6.6750524109014675, | |
| "grad_norm": 0.8451020033143687, | |
| "learning_rate": 1.2948425755737592e-06, | |
| "loss": 0.0359, | |
| "step": 1194 | |
| }, | |
| { | |
| "epoch": 6.686233403214535, | |
| "grad_norm": 1.2760921925255864, | |
| "learning_rate": 1.2871184587179286e-06, | |
| "loss": 0.0285, | |
| "step": 1196 | |
| }, | |
| { | |
| "epoch": 6.697414395527603, | |
| "grad_norm": 0.7781748766075295, | |
| "learning_rate": 1.2794094544092111e-06, | |
| "loss": 0.0346, | |
| "step": 1198 | |
| }, | |
| { | |
| "epoch": 6.7085953878406706, | |
| "grad_norm": 1.1832623077309767, | |
| "learning_rate": 1.2717156587020746e-06, | |
| "loss": 0.041, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 6.719776380153739, | |
| "grad_norm": 1.3133094357866473, | |
| "learning_rate": 1.2640371674614866e-06, | |
| "loss": 0.0629, | |
| "step": 1202 | |
| }, | |
| { | |
| "epoch": 6.730957372466807, | |
| "grad_norm": 0.7218331862903847, | |
| "learning_rate": 1.2563740763617198e-06, | |
| "loss": 0.0366, | |
| "step": 1204 | |
| }, | |
| { | |
| "epoch": 6.7421383647798745, | |
| "grad_norm": 0.9560652150388108, | |
| "learning_rate": 1.2487264808851654e-06, | |
| "loss": 0.044, | |
| "step": 1206 | |
| }, | |
| { | |
| "epoch": 6.753319357092942, | |
| "grad_norm": 1.1190106870390395, | |
| "learning_rate": 1.2410944763211302e-06, | |
| "loss": 0.0517, | |
| "step": 1208 | |
| }, | |
| { | |
| "epoch": 6.76450034940601, | |
| "grad_norm": 0.7835985914687663, | |
| "learning_rate": 1.2334781577646653e-06, | |
| "loss": 0.0272, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 6.7756813417190775, | |
| "grad_norm": 2.056446636497986, | |
| "learning_rate": 1.2258776201153702e-06, | |
| "loss": 0.0239, | |
| "step": 1212 | |
| }, | |
| { | |
| "epoch": 6.786862334032145, | |
| "grad_norm": 0.8485551422736736, | |
| "learning_rate": 1.218292958076213e-06, | |
| "loss": 0.0206, | |
| "step": 1214 | |
| }, | |
| { | |
| "epoch": 6.798043326345213, | |
| "grad_norm": 1.2531964534501892, | |
| "learning_rate": 1.2107242661523544e-06, | |
| "loss": 0.0254, | |
| "step": 1216 | |
| }, | |
| { | |
| "epoch": 6.809224318658281, | |
| "grad_norm": 1.269537638790587, | |
| "learning_rate": 1.203171638649962e-06, | |
| "loss": 0.0299, | |
| "step": 1218 | |
| }, | |
| { | |
| "epoch": 6.820405310971349, | |
| "grad_norm": 1.1178764385402225, | |
| "learning_rate": 1.195635169675045e-06, | |
| "loss": 0.0396, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 6.831586303284417, | |
| "grad_norm": 0.6920818283019613, | |
| "learning_rate": 1.1881149531322744e-06, | |
| "loss": 0.0268, | |
| "step": 1222 | |
| }, | |
| { | |
| "epoch": 6.8427672955974845, | |
| "grad_norm": 0.80369354175751, | |
| "learning_rate": 1.180611082723814e-06, | |
| "loss": 0.031, | |
| "step": 1224 | |
| }, | |
| { | |
| "epoch": 6.853948287910552, | |
| "grad_norm": 0.7447389756775401, | |
| "learning_rate": 1.1731236519481593e-06, | |
| "loss": 0.0345, | |
| "step": 1226 | |
| }, | |
| { | |
| "epoch": 6.86512928022362, | |
| "grad_norm": 1.1115305000722167, | |
| "learning_rate": 1.1656527540989595e-06, | |
| "loss": 0.0283, | |
| "step": 1228 | |
| }, | |
| { | |
| "epoch": 6.876310272536688, | |
| "grad_norm": 1.2279572164110593, | |
| "learning_rate": 1.1581984822638706e-06, | |
| "loss": 0.0452, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 6.887491264849755, | |
| "grad_norm": 0.8467749629186313, | |
| "learning_rate": 1.1507609293233837e-06, | |
| "loss": 0.0283, | |
| "step": 1232 | |
| }, | |
| { | |
| "epoch": 6.898672257162823, | |
| "grad_norm": 1.355703618365484, | |
| "learning_rate": 1.1433401879496723e-06, | |
| "loss": 0.0366, | |
| "step": 1234 | |
| }, | |
| { | |
| "epoch": 6.909853249475891, | |
| "grad_norm": 1.004917827499692, | |
| "learning_rate": 1.135936350605438e-06, | |
| "loss": 0.0496, | |
| "step": 1236 | |
| }, | |
| { | |
| "epoch": 6.921034241788959, | |
| "grad_norm": 1.2615070307313305, | |
| "learning_rate": 1.1285495095427563e-06, | |
| "loss": 0.0461, | |
| "step": 1238 | |
| }, | |
| { | |
| "epoch": 6.932215234102027, | |
| "grad_norm": 0.9861185460727813, | |
| "learning_rate": 1.1211797568019312e-06, | |
| "loss": 0.0366, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 6.943396226415095, | |
| "grad_norm": 1.6576290169923233, | |
| "learning_rate": 1.113827184210343e-06, | |
| "loss": 0.0337, | |
| "step": 1242 | |
| }, | |
| { | |
| "epoch": 6.954577218728162, | |
| "grad_norm": 1.1363579065284033, | |
| "learning_rate": 1.1064918833813073e-06, | |
| "loss": 0.0406, | |
| "step": 1244 | |
| }, | |
| { | |
| "epoch": 6.96575821104123, | |
| "grad_norm": 1.3125191134965577, | |
| "learning_rate": 1.0991739457129333e-06, | |
| "loss": 0.0397, | |
| "step": 1246 | |
| }, | |
| { | |
| "epoch": 6.976939203354298, | |
| "grad_norm": 0.8904462468667067, | |
| "learning_rate": 1.0918734623869835e-06, | |
| "loss": 0.0407, | |
| "step": 1248 | |
| }, | |
| { | |
| "epoch": 6.988120195667365, | |
| "grad_norm": 2.263233580582389, | |
| "learning_rate": 1.0845905243677416e-06, | |
| "loss": 0.0307, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 6.999301187980433, | |
| "grad_norm": 0.791294534235276, | |
| "learning_rate": 1.0773252224008726e-06, | |
| "loss": 0.0387, | |
| "step": 1252 | |
| }, | |
| { | |
| "epoch": 7.010482180293501, | |
| "grad_norm": 0.76599595030522, | |
| "learning_rate": 1.0700776470122981e-06, | |
| "loss": 0.0269, | |
| "step": 1254 | |
| }, | |
| { | |
| "epoch": 7.021663172606569, | |
| "grad_norm": 0.7331796337642835, | |
| "learning_rate": 1.0628478885070647e-06, | |
| "loss": 0.0221, | |
| "step": 1256 | |
| }, | |
| { | |
| "epoch": 7.032844164919637, | |
| "grad_norm": 0.6845784469587074, | |
| "learning_rate": 1.05563603696822e-06, | |
| "loss": 0.0291, | |
| "step": 1258 | |
| }, | |
| { | |
| "epoch": 7.044025157232705, | |
| "grad_norm": 0.8176233505690059, | |
| "learning_rate": 1.0484421822556904e-06, | |
| "loss": 0.0364, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 7.055206149545772, | |
| "grad_norm": 0.8629657573128657, | |
| "learning_rate": 1.041266414005162e-06, | |
| "loss": 0.0265, | |
| "step": 1262 | |
| }, | |
| { | |
| "epoch": 7.06638714185884, | |
| "grad_norm": 1.1172499462707595, | |
| "learning_rate": 1.0341088216269625e-06, | |
| "loss": 0.0157, | |
| "step": 1264 | |
| }, | |
| { | |
| "epoch": 7.077568134171908, | |
| "grad_norm": 0.5230775744769823, | |
| "learning_rate": 1.0269694943049462e-06, | |
| "loss": 0.0157, | |
| "step": 1266 | |
| }, | |
| { | |
| "epoch": 7.088749126484975, | |
| "grad_norm": 0.8978199171663125, | |
| "learning_rate": 1.0198485209953865e-06, | |
| "loss": 0.0275, | |
| "step": 1268 | |
| }, | |
| { | |
| "epoch": 7.099930118798043, | |
| "grad_norm": 0.815308309594077, | |
| "learning_rate": 1.0127459904258621e-06, | |
| "loss": 0.0237, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 7.111111111111111, | |
| "grad_norm": 0.8967571058386815, | |
| "learning_rate": 1.0056619910941592e-06, | |
| "loss": 0.019, | |
| "step": 1272 | |
| }, | |
| { | |
| "epoch": 7.122292103424179, | |
| "grad_norm": 0.7843358442700527, | |
| "learning_rate": 9.98596611267158e-07, | |
| "loss": 0.021, | |
| "step": 1274 | |
| }, | |
| { | |
| "epoch": 7.133473095737247, | |
| "grad_norm": 0.6797830063456453, | |
| "learning_rate": 9.915499389797444e-07, | |
| "loss": 0.0316, | |
| "step": 1276 | |
| }, | |
| { | |
| "epoch": 7.144654088050315, | |
| "grad_norm": 0.6688875199025872, | |
| "learning_rate": 9.845220620337054e-07, | |
| "loss": 0.0303, | |
| "step": 1278 | |
| }, | |
| { | |
| "epoch": 7.155835080363382, | |
| "grad_norm": 0.6664970872749731, | |
| "learning_rate": 9.77513067996636e-07, | |
| "loss": 0.0219, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 7.16701607267645, | |
| "grad_norm": 0.7973098520727987, | |
| "learning_rate": 9.705230442008542e-07, | |
| "loss": 0.0376, | |
| "step": 1282 | |
| }, | |
| { | |
| "epoch": 7.178197064989518, | |
| "grad_norm": 0.8759703504057706, | |
| "learning_rate": 9.63552077742301e-07, | |
| "loss": 0.0385, | |
| "step": 1284 | |
| }, | |
| { | |
| "epoch": 7.189378057302585, | |
| "grad_norm": 1.0267904937054426, | |
| "learning_rate": 9.56600255479469e-07, | |
| "loss": 0.0222, | |
| "step": 1286 | |
| }, | |
| { | |
| "epoch": 7.200559049615653, | |
| "grad_norm": 0.6389768145894307, | |
| "learning_rate": 9.4966766403231e-07, | |
| "loss": 0.018, | |
| "step": 1288 | |
| }, | |
| { | |
| "epoch": 7.211740041928721, | |
| "grad_norm": 0.5762313893158477, | |
| "learning_rate": 9.427543897811584e-07, | |
| "loss": 0.0165, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 7.222921034241789, | |
| "grad_norm": 0.5902518126138557, | |
| "learning_rate": 9.358605188656603e-07, | |
| "loss": 0.02, | |
| "step": 1292 | |
| }, | |
| { | |
| "epoch": 7.234102026554857, | |
| "grad_norm": 0.824105561963567, | |
| "learning_rate": 9.289861371836886e-07, | |
| "loss": 0.0337, | |
| "step": 1294 | |
| }, | |
| { | |
| "epoch": 7.245283018867925, | |
| "grad_norm": 0.504698332550927, | |
| "learning_rate": 9.22131330390286e-07, | |
| "loss": 0.0283, | |
| "step": 1296 | |
| }, | |
| { | |
| "epoch": 7.256464011180992, | |
| "grad_norm": 0.5789695393721453, | |
| "learning_rate": 9.152961838965879e-07, | |
| "loss": 0.0169, | |
| "step": 1298 | |
| }, | |
| { | |
| "epoch": 7.26764500349406, | |
| "grad_norm": 1.4892687104014115, | |
| "learning_rate": 9.084807828687628e-07, | |
| "loss": 0.0314, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 7.278825995807128, | |
| "grad_norm": 1.0727067281323632, | |
| "learning_rate": 9.016852122269493e-07, | |
| "loss": 0.0274, | |
| "step": 1302 | |
| }, | |
| { | |
| "epoch": 7.290006988120195, | |
| "grad_norm": 0.7309629553367788, | |
| "learning_rate": 8.949095566441985e-07, | |
| "loss": 0.0219, | |
| "step": 1304 | |
| }, | |
| { | |
| "epoch": 7.301187980433263, | |
| "grad_norm": 0.6871990809680889, | |
| "learning_rate": 8.881539005454215e-07, | |
| "loss": 0.0339, | |
| "step": 1306 | |
| }, | |
| { | |
| "epoch": 7.312368972746331, | |
| "grad_norm": 0.8530617423198913, | |
| "learning_rate": 8.814183281063326e-07, | |
| "loss": 0.0248, | |
| "step": 1308 | |
| }, | |
| { | |
| "epoch": 7.323549965059399, | |
| "grad_norm": 0.76651991997128, | |
| "learning_rate": 8.747029232524037e-07, | |
| "loss": 0.023, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 7.334730957372467, | |
| "grad_norm": 0.6966547986519114, | |
| "learning_rate": 8.680077696578182e-07, | |
| "loss": 0.0332, | |
| "step": 1312 | |
| }, | |
| { | |
| "epoch": 7.345911949685535, | |
| "grad_norm": 1.0873098335521205, | |
| "learning_rate": 8.613329507444274e-07, | |
| "loss": 0.0234, | |
| "step": 1314 | |
| }, | |
| { | |
| "epoch": 7.357092941998602, | |
| "grad_norm": 0.6461932986017782, | |
| "learning_rate": 8.546785496807116e-07, | |
| "loss": 0.0242, | |
| "step": 1316 | |
| }, | |
| { | |
| "epoch": 7.36827393431167, | |
| "grad_norm": 0.7614414460885182, | |
| "learning_rate": 8.480446493807464e-07, | |
| "loss": 0.031, | |
| "step": 1318 | |
| }, | |
| { | |
| "epoch": 7.379454926624738, | |
| "grad_norm": 0.641294466328584, | |
| "learning_rate": 8.414313325031642e-07, | |
| "loss": 0.028, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 7.3906359189378055, | |
| "grad_norm": 0.47088954187562415, | |
| "learning_rate": 8.348386814501286e-07, | |
| "loss": 0.0186, | |
| "step": 1322 | |
| }, | |
| { | |
| "epoch": 7.401816911250873, | |
| "grad_norm": 0.7909087034714356, | |
| "learning_rate": 8.282667783663056e-07, | |
| "loss": 0.0212, | |
| "step": 1324 | |
| }, | |
| { | |
| "epoch": 7.412997903563941, | |
| "grad_norm": 0.8059238279425677, | |
| "learning_rate": 8.217157051378411e-07, | |
| "loss": 0.0239, | |
| "step": 1326 | |
| }, | |
| { | |
| "epoch": 7.424178895877009, | |
| "grad_norm": 0.788531385863816, | |
| "learning_rate": 8.151855433913414e-07, | |
| "loss": 0.0199, | |
| "step": 1328 | |
| }, | |
| { | |
| "epoch": 7.435359888190077, | |
| "grad_norm": 1.1393964476120448, | |
| "learning_rate": 8.086763744928536e-07, | |
| "loss": 0.0292, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 7.446540880503145, | |
| "grad_norm": 0.5408108502649198, | |
| "learning_rate": 8.02188279546853e-07, | |
| "loss": 0.0146, | |
| "step": 1332 | |
| }, | |
| { | |
| "epoch": 7.4577218728162125, | |
| "grad_norm": 0.8749206113652656, | |
| "learning_rate": 7.957213393952335e-07, | |
| "loss": 0.0247, | |
| "step": 1334 | |
| }, | |
| { | |
| "epoch": 7.46890286512928, | |
| "grad_norm": 0.7053824386402378, | |
| "learning_rate": 7.892756346162986e-07, | |
| "loss": 0.02, | |
| "step": 1336 | |
| }, | |
| { | |
| "epoch": 7.480083857442348, | |
| "grad_norm": 0.6965900833846856, | |
| "learning_rate": 7.82851245523761e-07, | |
| "loss": 0.0315, | |
| "step": 1338 | |
| }, | |
| { | |
| "epoch": 7.4912648497554155, | |
| "grad_norm": 0.9392067120327887, | |
| "learning_rate": 7.764482521657343e-07, | |
| "loss": 0.0308, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 7.502445842068483, | |
| "grad_norm": 0.7074561491918046, | |
| "learning_rate": 7.700667343237453e-07, | |
| "loss": 0.0171, | |
| "step": 1342 | |
| }, | |
| { | |
| "epoch": 7.513626834381551, | |
| "grad_norm": 0.7697005768650605, | |
| "learning_rate": 7.637067715117327e-07, | |
| "loss": 0.0302, | |
| "step": 1344 | |
| }, | |
| { | |
| "epoch": 7.5248078266946195, | |
| "grad_norm": 1.176668146060272, | |
| "learning_rate": 7.573684429750583e-07, | |
| "loss": 0.0265, | |
| "step": 1346 | |
| }, | |
| { | |
| "epoch": 7.535988819007687, | |
| "grad_norm": 0.7258573280389607, | |
| "learning_rate": 7.510518276895234e-07, | |
| "loss": 0.0257, | |
| "step": 1348 | |
| }, | |
| { | |
| "epoch": 7.547169811320755, | |
| "grad_norm": 1.1195611459347754, | |
| "learning_rate": 7.447570043603755e-07, | |
| "loss": 0.0261, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 7.5583508036338225, | |
| "grad_norm": 0.9527258409378455, | |
| "learning_rate": 7.384840514213404e-07, | |
| "loss": 0.0524, | |
| "step": 1352 | |
| }, | |
| { | |
| "epoch": 7.56953179594689, | |
| "grad_norm": 0.7074898357644916, | |
| "learning_rate": 7.322330470336314e-07, | |
| "loss": 0.0205, | |
| "step": 1354 | |
| }, | |
| { | |
| "epoch": 7.580712788259958, | |
| "grad_norm": 0.9361424266631929, | |
| "learning_rate": 7.26004069084987e-07, | |
| "loss": 0.0217, | |
| "step": 1356 | |
| }, | |
| { | |
| "epoch": 7.5918937805730256, | |
| "grad_norm": 1.7048958108176762, | |
| "learning_rate": 7.197971951886956e-07, | |
| "loss": 0.0225, | |
| "step": 1358 | |
| }, | |
| { | |
| "epoch": 7.603074772886094, | |
| "grad_norm": 0.8812767707258257, | |
| "learning_rate": 7.13612502682623e-07, | |
| "loss": 0.0196, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 7.614255765199162, | |
| "grad_norm": 0.5682027618905875, | |
| "learning_rate": 7.074500686282609e-07, | |
| "loss": 0.019, | |
| "step": 1362 | |
| }, | |
| { | |
| "epoch": 7.6254367575122295, | |
| "grad_norm": 0.4475598932931596, | |
| "learning_rate": 7.013099698097539e-07, | |
| "loss": 0.0171, | |
| "step": 1364 | |
| }, | |
| { | |
| "epoch": 7.636617749825297, | |
| "grad_norm": 0.5527498039813922, | |
| "learning_rate": 6.951922827329535e-07, | |
| "loss": 0.0217, | |
| "step": 1366 | |
| }, | |
| { | |
| "epoch": 7.647798742138365, | |
| "grad_norm": 0.7984442985333638, | |
| "learning_rate": 6.890970836244574e-07, | |
| "loss": 0.0361, | |
| "step": 1368 | |
| }, | |
| { | |
| "epoch": 7.6589797344514325, | |
| "grad_norm": 0.624268450810696, | |
| "learning_rate": 6.830244484306623e-07, | |
| "loss": 0.0158, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 7.6701607267645, | |
| "grad_norm": 0.7493822409267487, | |
| "learning_rate": 6.769744528168207e-07, | |
| "loss": 0.0286, | |
| "step": 1372 | |
| }, | |
| { | |
| "epoch": 7.681341719077568, | |
| "grad_norm": 0.6787647092695418, | |
| "learning_rate": 6.709471721660904e-07, | |
| "loss": 0.0215, | |
| "step": 1374 | |
| }, | |
| { | |
| "epoch": 7.692522711390636, | |
| "grad_norm": 0.7321502006735149, | |
| "learning_rate": 6.649426815786045e-07, | |
| "loss": 0.0311, | |
| "step": 1376 | |
| }, | |
| { | |
| "epoch": 7.703703703703704, | |
| "grad_norm": 0.701610396870259, | |
| "learning_rate": 6.589610558705284e-07, | |
| "loss": 0.0235, | |
| "step": 1378 | |
| }, | |
| { | |
| "epoch": 7.714884696016772, | |
| "grad_norm": 0.6530846520546149, | |
| "learning_rate": 6.53002369573131e-07, | |
| "loss": 0.0245, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 7.7260656883298395, | |
| "grad_norm": 0.7531427984254183, | |
| "learning_rate": 6.470666969318554e-07, | |
| "loss": 0.0315, | |
| "step": 1382 | |
| }, | |
| { | |
| "epoch": 7.737246680642907, | |
| "grad_norm": 0.7301669272251805, | |
| "learning_rate": 6.41154111905393e-07, | |
| "loss": 0.0225, | |
| "step": 1384 | |
| }, | |
| { | |
| "epoch": 7.748427672955975, | |
| "grad_norm": 0.8707140120777088, | |
| "learning_rate": 6.352646881647647e-07, | |
| "loss": 0.0259, | |
| "step": 1386 | |
| }, | |
| { | |
| "epoch": 7.759608665269043, | |
| "grad_norm": 0.837200588883093, | |
| "learning_rate": 6.29398499092399e-07, | |
| "loss": 0.0474, | |
| "step": 1388 | |
| }, | |
| { | |
| "epoch": 7.77078965758211, | |
| "grad_norm": 0.973530488120086, | |
| "learning_rate": 6.235556177812205e-07, | |
| "loss": 0.0329, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 7.781970649895178, | |
| "grad_norm": 0.5813627298678434, | |
| "learning_rate": 6.177361170337376e-07, | |
| "loss": 0.0194, | |
| "step": 1392 | |
| }, | |
| { | |
| "epoch": 7.793151642208246, | |
| "grad_norm": 0.8597088367336019, | |
| "learning_rate": 6.119400693611358e-07, | |
| "loss": 0.0123, | |
| "step": 1394 | |
| }, | |
| { | |
| "epoch": 7.804332634521314, | |
| "grad_norm": 0.8368570476462492, | |
| "learning_rate": 6.061675469823763e-07, | |
| "loss": 0.0227, | |
| "step": 1396 | |
| }, | |
| { | |
| "epoch": 7.815513626834382, | |
| "grad_norm": 0.5203392914919558, | |
| "learning_rate": 6.004186218232933e-07, | |
| "loss": 0.0217, | |
| "step": 1398 | |
| }, | |
| { | |
| "epoch": 7.82669461914745, | |
| "grad_norm": 0.8572153440435842, | |
| "learning_rate": 5.946933655156976e-07, | |
| "loss": 0.0294, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 7.837875611460517, | |
| "grad_norm": 0.6862577628733875, | |
| "learning_rate": 5.889918493964869e-07, | |
| "loss": 0.0228, | |
| "step": 1402 | |
| }, | |
| { | |
| "epoch": 7.849056603773585, | |
| "grad_norm": 0.7097594226614418, | |
| "learning_rate": 5.833141445067541e-07, | |
| "loss": 0.0113, | |
| "step": 1404 | |
| }, | |
| { | |
| "epoch": 7.860237596086653, | |
| "grad_norm": 0.6322499286175502, | |
| "learning_rate": 5.776603215909041e-07, | |
| "loss": 0.0229, | |
| "step": 1406 | |
| }, | |
| { | |
| "epoch": 7.87141858839972, | |
| "grad_norm": 0.6798739232739857, | |
| "learning_rate": 5.720304510957722e-07, | |
| "loss": 0.0257, | |
| "step": 1408 | |
| }, | |
| { | |
| "epoch": 7.882599580712788, | |
| "grad_norm": 0.6568708401714163, | |
| "learning_rate": 5.66424603169744e-07, | |
| "loss": 0.0285, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 7.893780573025856, | |
| "grad_norm": 1.1483908878505031, | |
| "learning_rate": 5.608428476618843e-07, | |
| "loss": 0.0235, | |
| "step": 1412 | |
| }, | |
| { | |
| "epoch": 7.904961565338924, | |
| "grad_norm": 0.9297111790590921, | |
| "learning_rate": 5.552852541210651e-07, | |
| "loss": 0.022, | |
| "step": 1414 | |
| }, | |
| { | |
| "epoch": 7.916142557651992, | |
| "grad_norm": 0.7288896652277049, | |
| "learning_rate": 5.497518917950986e-07, | |
| "loss": 0.033, | |
| "step": 1416 | |
| }, | |
| { | |
| "epoch": 7.92732354996506, | |
| "grad_norm": 1.3241630685241197, | |
| "learning_rate": 5.44242829629878e-07, | |
| "loss": 0.0236, | |
| "step": 1418 | |
| }, | |
| { | |
| "epoch": 7.938504542278127, | |
| "grad_norm": 0.6616696784338312, | |
| "learning_rate": 5.387581362685112e-07, | |
| "loss": 0.03, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 7.949685534591195, | |
| "grad_norm": 0.9223806906428696, | |
| "learning_rate": 5.332978800504742e-07, | |
| "loss": 0.0234, | |
| "step": 1422 | |
| }, | |
| { | |
| "epoch": 7.960866526904263, | |
| "grad_norm": 1.1302104401143789, | |
| "learning_rate": 5.278621290107533e-07, | |
| "loss": 0.0334, | |
| "step": 1424 | |
| }, | |
| { | |
| "epoch": 7.97204751921733, | |
| "grad_norm": 0.6145924647383543, | |
| "learning_rate": 5.224509508789987e-07, | |
| "loss": 0.0205, | |
| "step": 1426 | |
| }, | |
| { | |
| "epoch": 7.983228511530398, | |
| "grad_norm": 0.6724718918142113, | |
| "learning_rate": 5.170644130786842e-07, | |
| "loss": 0.0315, | |
| "step": 1428 | |
| }, | |
| { | |
| "epoch": 7.994409503843466, | |
| "grad_norm": 0.5897709957691004, | |
| "learning_rate": 5.117025827262598e-07, | |
| "loss": 0.0189, | |
| "step": 1430 | |
| } | |
| ], | |
| "logging_steps": 2, | |
| "max_steps": 1780, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 598197676277760.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |