| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 282, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.6387848258018494, | |
| "epoch": 0.0071301247771836, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 2.6515, | |
| "mean_token_accuracy": 0.445863775908947, | |
| "num_tokens": 1051.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 1.6529399156570435, | |
| "epoch": 0.0142602495543672, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4e-05, | |
| "loss": 2.7153, | |
| "mean_token_accuracy": 0.4470880478620529, | |
| "num_tokens": 2187.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 1.5581220388412476, | |
| "epoch": 0.0213903743315508, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8e-05, | |
| "loss": 2.6255, | |
| "mean_token_accuracy": 0.4684518948197365, | |
| "num_tokens": 3295.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 1.5885010659694672, | |
| "epoch": 0.0285204991087344, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00012, | |
| "loss": 2.5733, | |
| "mean_token_accuracy": 0.4601749926805496, | |
| "num_tokens": 4387.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 1.6543156504631042, | |
| "epoch": 0.035650623885918005, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016, | |
| "loss": 2.6477, | |
| "mean_token_accuracy": 0.4556595981121063, | |
| "num_tokens": 5511.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.5543809533119202, | |
| "epoch": 0.0427807486631016, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0002, | |
| "loss": 2.6579, | |
| "mean_token_accuracy": 0.46554840356111526, | |
| "num_tokens": 6598.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 1.5990856289863586, | |
| "epoch": 0.049910873440285206, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019976218787158144, | |
| "loss": 2.6618, | |
| "mean_token_accuracy": 0.4601997211575508, | |
| "num_tokens": 7673.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 1.6176924407482147, | |
| "epoch": 0.0570409982174688, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019952437574316292, | |
| "loss": 2.7365, | |
| "mean_token_accuracy": 0.4649490937590599, | |
| "num_tokens": 8774.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 1.5261479020118713, | |
| "epoch": 0.06417112299465241, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019928656361474436, | |
| "loss": 2.5824, | |
| "mean_token_accuracy": 0.47935400158166885, | |
| "num_tokens": 9886.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 1.6173927783966064, | |
| "epoch": 0.07130124777183601, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001990487514863258, | |
| "loss": 2.53, | |
| "mean_token_accuracy": 0.47585250437259674, | |
| "num_tokens": 10983.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.5592622458934784, | |
| "epoch": 0.0784313725490196, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019881093935790727, | |
| "loss": 2.6422, | |
| "mean_token_accuracy": 0.46385403722524643, | |
| "num_tokens": 12102.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 1.5775817930698395, | |
| "epoch": 0.0855614973262032, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001985731272294887, | |
| "loss": 2.6095, | |
| "mean_token_accuracy": 0.49216530472040176, | |
| "num_tokens": 13219.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 1.6316154599189758, | |
| "epoch": 0.09269162210338681, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019833531510107019, | |
| "loss": 2.7746, | |
| "mean_token_accuracy": 0.4526786059141159, | |
| "num_tokens": 14309.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 1.5752335786819458, | |
| "epoch": 0.09982174688057041, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019809750297265162, | |
| "loss": 2.6994, | |
| "mean_token_accuracy": 0.4809372276067734, | |
| "num_tokens": 15373.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 1.5528400540351868, | |
| "epoch": 0.10695187165775401, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019785969084423305, | |
| "loss": 2.6607, | |
| "mean_token_accuracy": 0.4793297126889229, | |
| "num_tokens": 16485.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 1.6115093529224396, | |
| "epoch": 0.1140819964349376, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019762187871581453, | |
| "loss": 2.6006, | |
| "mean_token_accuracy": 0.48629891872406006, | |
| "num_tokens": 17654.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 1.6650608479976654, | |
| "epoch": 0.12121212121212122, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019738406658739596, | |
| "loss": 2.8227, | |
| "mean_token_accuracy": 0.4469343200325966, | |
| "num_tokens": 18705.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 1.5981396436691284, | |
| "epoch": 0.12834224598930483, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019714625445897742, | |
| "loss": 2.6602, | |
| "mean_token_accuracy": 0.48277803510427475, | |
| "num_tokens": 19799.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 1.6768196523189545, | |
| "epoch": 0.1354723707664884, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019690844233055888, | |
| "loss": 2.7339, | |
| "mean_token_accuracy": 0.45305337756872177, | |
| "num_tokens": 20799.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 1.6276516318321228, | |
| "epoch": 0.14260249554367202, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001966706302021403, | |
| "loss": 2.6103, | |
| "mean_token_accuracy": 0.45762092620134354, | |
| "num_tokens": 21918.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.5985995829105377, | |
| "epoch": 0.1497326203208556, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019643281807372177, | |
| "loss": 2.6705, | |
| "mean_token_accuracy": 0.474429652094841, | |
| "num_tokens": 23021.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 1.6575533151626587, | |
| "epoch": 0.1568627450980392, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019619500594530322, | |
| "loss": 2.7187, | |
| "mean_token_accuracy": 0.4531744047999382, | |
| "num_tokens": 24138.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 1.6001036167144775, | |
| "epoch": 0.16399286987522282, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019595719381688465, | |
| "loss": 2.6272, | |
| "mean_token_accuracy": 0.4594630151987076, | |
| "num_tokens": 25229.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 1.6082253158092499, | |
| "epoch": 0.1711229946524064, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019571938168846614, | |
| "loss": 2.5586, | |
| "mean_token_accuracy": 0.4784083887934685, | |
| "num_tokens": 26339.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 1.6208438277244568, | |
| "epoch": 0.17825311942959002, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019548156956004757, | |
| "loss": 2.6095, | |
| "mean_token_accuracy": 0.4685538485646248, | |
| "num_tokens": 27389.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 1.6018637418746948, | |
| "epoch": 0.18538324420677363, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019524375743162903, | |
| "loss": 2.7047, | |
| "mean_token_accuracy": 0.4501718729734421, | |
| "num_tokens": 28491.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 1.566238820552826, | |
| "epoch": 0.1925133689839572, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019500594530321049, | |
| "loss": 2.5167, | |
| "mean_token_accuracy": 0.4768272563815117, | |
| "num_tokens": 29617.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 1.5953451693058014, | |
| "epoch": 0.19964349376114082, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019476813317479192, | |
| "loss": 2.6907, | |
| "mean_token_accuracy": 0.46527716517448425, | |
| "num_tokens": 30709.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 1.6036897897720337, | |
| "epoch": 0.20677361853832443, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019453032104637337, | |
| "loss": 2.621, | |
| "mean_token_accuracy": 0.47111422568559647, | |
| "num_tokens": 31780.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 1.6660344302654266, | |
| "epoch": 0.21390374331550802, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019429250891795483, | |
| "loss": 2.6351, | |
| "mean_token_accuracy": 0.45088133215904236, | |
| "num_tokens": 32827.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.5515300929546356, | |
| "epoch": 0.22103386809269163, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019405469678953626, | |
| "loss": 2.5804, | |
| "mean_token_accuracy": 0.47831422835588455, | |
| "num_tokens": 33977.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 1.4917864799499512, | |
| "epoch": 0.2281639928698752, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019381688466111775, | |
| "loss": 2.5906, | |
| "mean_token_accuracy": 0.4912087470293045, | |
| "num_tokens": 35070.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 1.591080754995346, | |
| "epoch": 0.23529411764705882, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019357907253269918, | |
| "loss": 2.7147, | |
| "mean_token_accuracy": 0.4610803797841072, | |
| "num_tokens": 36137.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 1.5161702930927277, | |
| "epoch": 0.24242424242424243, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019334126040428064, | |
| "loss": 2.4586, | |
| "mean_token_accuracy": 0.5084197968244553, | |
| "num_tokens": 37239.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 1.5857904851436615, | |
| "epoch": 0.24955436720142601, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001931034482758621, | |
| "loss": 2.5836, | |
| "mean_token_accuracy": 0.48300980031490326, | |
| "num_tokens": 38330.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 1.6427140533924103, | |
| "epoch": 0.25668449197860965, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019286563614744352, | |
| "loss": 2.6629, | |
| "mean_token_accuracy": 0.4738333150744438, | |
| "num_tokens": 39368.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 1.6292171776294708, | |
| "epoch": 0.2638146167557932, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019262782401902498, | |
| "loss": 2.7382, | |
| "mean_token_accuracy": 0.46079348772764206, | |
| "num_tokens": 40474.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 1.5661435425281525, | |
| "epoch": 0.2709447415329768, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019239001189060644, | |
| "loss": 2.4905, | |
| "mean_token_accuracy": 0.49719007313251495, | |
| "num_tokens": 41558.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 1.6255074739456177, | |
| "epoch": 0.27807486631016043, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019215219976218787, | |
| "loss": 2.6303, | |
| "mean_token_accuracy": 0.4578748494386673, | |
| "num_tokens": 42674.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 1.4836209118366241, | |
| "epoch": 0.28520499108734404, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019191438763376933, | |
| "loss": 2.5324, | |
| "mean_token_accuracy": 0.4868450313806534, | |
| "num_tokens": 43815.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.5689969956874847, | |
| "epoch": 0.29233511586452765, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019167657550535079, | |
| "loss": 2.6125, | |
| "mean_token_accuracy": 0.45523975044488907, | |
| "num_tokens": 44890.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 1.5794390439987183, | |
| "epoch": 0.2994652406417112, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019143876337693222, | |
| "loss": 2.6012, | |
| "mean_token_accuracy": 0.47063395380973816, | |
| "num_tokens": 45951.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 1.6168677806854248, | |
| "epoch": 0.3065953654188948, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001912009512485137, | |
| "loss": 2.6358, | |
| "mean_token_accuracy": 0.46999557316303253, | |
| "num_tokens": 47083.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 1.6417001783847809, | |
| "epoch": 0.3137254901960784, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019096313912009513, | |
| "loss": 2.6473, | |
| "mean_token_accuracy": 0.4624517187476158, | |
| "num_tokens": 48088.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 1.6053949892520905, | |
| "epoch": 0.32085561497326204, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001907253269916766, | |
| "loss": 2.6728, | |
| "mean_token_accuracy": 0.4565429463982582, | |
| "num_tokens": 49213.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 1.5239336490631104, | |
| "epoch": 0.32798573975044565, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019048751486325805, | |
| "loss": 2.5886, | |
| "mean_token_accuracy": 0.47441017627716064, | |
| "num_tokens": 50287.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 1.5953067243099213, | |
| "epoch": 0.33511586452762926, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019024970273483948, | |
| "loss": 2.6498, | |
| "mean_token_accuracy": 0.46358855813741684, | |
| "num_tokens": 51379.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 1.5651679933071136, | |
| "epoch": 0.3422459893048128, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019001189060642093, | |
| "loss": 2.6206, | |
| "mean_token_accuracy": 0.47789186984300613, | |
| "num_tokens": 52456.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 1.5568871796131134, | |
| "epoch": 0.3493761140819964, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001897740784780024, | |
| "loss": 2.4854, | |
| "mean_token_accuracy": 0.46421028673648834, | |
| "num_tokens": 53606.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 1.5674110054969788, | |
| "epoch": 0.35650623885918004, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018953626634958382, | |
| "loss": 2.6727, | |
| "mean_token_accuracy": 0.4703332930803299, | |
| "num_tokens": 54675.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.6391582489013672, | |
| "epoch": 0.36363636363636365, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018929845422116528, | |
| "loss": 2.737, | |
| "mean_token_accuracy": 0.4556998685002327, | |
| "num_tokens": 55814.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 1.6075606942176819, | |
| "epoch": 0.37076648841354726, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018906064209274674, | |
| "loss": 2.7034, | |
| "mean_token_accuracy": 0.46947986632585526, | |
| "num_tokens": 56856.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 1.663224458694458, | |
| "epoch": 0.3778966131907308, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001888228299643282, | |
| "loss": 2.6482, | |
| "mean_token_accuracy": 0.46404948085546494, | |
| "num_tokens": 57913.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 1.5472119748592377, | |
| "epoch": 0.3850267379679144, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018858501783590965, | |
| "loss": 2.6376, | |
| "mean_token_accuracy": 0.46229933202266693, | |
| "num_tokens": 58974.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 1.576840728521347, | |
| "epoch": 0.39215686274509803, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018834720570749108, | |
| "loss": 2.5702, | |
| "mean_token_accuracy": 0.4657410681247711, | |
| "num_tokens": 60045.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 1.5661181211471558, | |
| "epoch": 0.39928698752228164, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018810939357907254, | |
| "loss": 2.5967, | |
| "mean_token_accuracy": 0.46670273691415787, | |
| "num_tokens": 61139.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 1.694770723581314, | |
| "epoch": 0.40641711229946526, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000187871581450654, | |
| "loss": 2.7852, | |
| "mean_token_accuracy": 0.4311066195368767, | |
| "num_tokens": 62182.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 1.6365650296211243, | |
| "epoch": 0.41354723707664887, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018763376932223543, | |
| "loss": 2.6211, | |
| "mean_token_accuracy": 0.45912958681583405, | |
| "num_tokens": 63294.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 1.676701843738556, | |
| "epoch": 0.4206773618538324, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001873959571938169, | |
| "loss": 2.727, | |
| "mean_token_accuracy": 0.4495629146695137, | |
| "num_tokens": 64345.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 1.6373060643672943, | |
| "epoch": 0.42780748663101603, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018715814506539835, | |
| "loss": 2.5274, | |
| "mean_token_accuracy": 0.49557945132255554, | |
| "num_tokens": 65514.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.60589137673378, | |
| "epoch": 0.43493761140819964, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001869203329369798, | |
| "loss": 2.5985, | |
| "mean_token_accuracy": 0.47936083376407623, | |
| "num_tokens": 66582.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 1.588552087545395, | |
| "epoch": 0.44206773618538325, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018668252080856126, | |
| "loss": 2.6723, | |
| "mean_token_accuracy": 0.4559238702058792, | |
| "num_tokens": 67675.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 1.6571227610111237, | |
| "epoch": 0.44919786096256686, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001864447086801427, | |
| "loss": 2.6471, | |
| "mean_token_accuracy": 0.45193884521722794, | |
| "num_tokens": 68786.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 1.624506950378418, | |
| "epoch": 0.4563279857397504, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018620689655172415, | |
| "loss": 2.6798, | |
| "mean_token_accuracy": 0.4612022116780281, | |
| "num_tokens": 69860.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 1.621217519044876, | |
| "epoch": 0.46345811051693403, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001859690844233056, | |
| "loss": 2.7073, | |
| "mean_token_accuracy": 0.4635338932275772, | |
| "num_tokens": 71015.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 1.613840103149414, | |
| "epoch": 0.47058823529411764, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018573127229488704, | |
| "loss": 2.5895, | |
| "mean_token_accuracy": 0.4725746735930443, | |
| "num_tokens": 72174.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 1.6597147285938263, | |
| "epoch": 0.47771836007130125, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001854934601664685, | |
| "loss": 2.6713, | |
| "mean_token_accuracy": 0.453179232776165, | |
| "num_tokens": 73201.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 1.5728524029254913, | |
| "epoch": 0.48484848484848486, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018525564803804995, | |
| "loss": 2.5245, | |
| "mean_token_accuracy": 0.4796541631221771, | |
| "num_tokens": 74317.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 1.6241073608398438, | |
| "epoch": 0.4919786096256685, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001850178359096314, | |
| "loss": 2.6027, | |
| "mean_token_accuracy": 0.45817650109529495, | |
| "num_tokens": 75379.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 1.6420445144176483, | |
| "epoch": 0.49910873440285203, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018478002378121284, | |
| "loss": 2.6937, | |
| "mean_token_accuracy": 0.4578876346349716, | |
| "num_tokens": 76447.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 1.5464614629745483, | |
| "epoch": 0.5062388591800356, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001845422116527943, | |
| "loss": 2.6239, | |
| "mean_token_accuracy": 0.4877299517393112, | |
| "num_tokens": 77527.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 1.650261253118515, | |
| "epoch": 0.5133689839572193, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018430439952437576, | |
| "loss": 2.7438, | |
| "mean_token_accuracy": 0.44178425520658493, | |
| "num_tokens": 78571.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 1.5863160490989685, | |
| "epoch": 0.5204991087344029, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018406658739595721, | |
| "loss": 2.6262, | |
| "mean_token_accuracy": 0.4780261069536209, | |
| "num_tokens": 79679.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 1.5492512583732605, | |
| "epoch": 0.5276292335115864, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018382877526753865, | |
| "loss": 2.6273, | |
| "mean_token_accuracy": 0.4793827310204506, | |
| "num_tokens": 80814.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 1.641315221786499, | |
| "epoch": 0.5347593582887701, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001835909631391201, | |
| "loss": 2.7035, | |
| "mean_token_accuracy": 0.4648021087050438, | |
| "num_tokens": 81899.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 1.6433144211769104, | |
| "epoch": 0.5418894830659536, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018335315101070156, | |
| "loss": 2.6694, | |
| "mean_token_accuracy": 0.46280162036418915, | |
| "num_tokens": 83027.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 1.5979108810424805, | |
| "epoch": 0.5490196078431373, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018311533888228302, | |
| "loss": 2.6064, | |
| "mean_token_accuracy": 0.4912242665886879, | |
| "num_tokens": 84157.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 1.6251288056373596, | |
| "epoch": 0.5561497326203209, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018287752675386445, | |
| "loss": 2.7191, | |
| "mean_token_accuracy": 0.451671302318573, | |
| "num_tokens": 85250.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 1.5671390295028687, | |
| "epoch": 0.5632798573975044, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001826397146254459, | |
| "loss": 2.613, | |
| "mean_token_accuracy": 0.4815371036529541, | |
| "num_tokens": 86331.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 1.6086040139198303, | |
| "epoch": 0.5704099821746881, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018240190249702736, | |
| "loss": 2.6629, | |
| "mean_token_accuracy": 0.4586881175637245, | |
| "num_tokens": 87430.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.643826961517334, | |
| "epoch": 0.5775401069518716, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001821640903686088, | |
| "loss": 2.5622, | |
| "mean_token_accuracy": 0.46292658150196075, | |
| "num_tokens": 88542.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 1.6064577996730804, | |
| "epoch": 0.5846702317290553, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018192627824019025, | |
| "loss": 2.6491, | |
| "mean_token_accuracy": 0.47355010360479355, | |
| "num_tokens": 89674.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 1.628090888261795, | |
| "epoch": 0.5918003565062389, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001816884661117717, | |
| "loss": 2.5738, | |
| "mean_token_accuracy": 0.44704899936914444, | |
| "num_tokens": 90762.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 1.558334767818451, | |
| "epoch": 0.5989304812834224, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018145065398335317, | |
| "loss": 2.5863, | |
| "mean_token_accuracy": 0.4635235145688057, | |
| "num_tokens": 91891.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 1.5913751423358917, | |
| "epoch": 0.6060606060606061, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001812128418549346, | |
| "loss": 2.5779, | |
| "mean_token_accuracy": 0.49088721722364426, | |
| "num_tokens": 92975.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 1.6400005519390106, | |
| "epoch": 0.6131907308377896, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018097502972651606, | |
| "loss": 2.7314, | |
| "mean_token_accuracy": 0.4603074938058853, | |
| "num_tokens": 94049.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 1.6752884984016418, | |
| "epoch": 0.6203208556149733, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018073721759809751, | |
| "loss": 2.6624, | |
| "mean_token_accuracy": 0.4621705859899521, | |
| "num_tokens": 95148.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 1.6517191529273987, | |
| "epoch": 0.6274509803921569, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018049940546967897, | |
| "loss": 2.6835, | |
| "mean_token_accuracy": 0.4539112225174904, | |
| "num_tokens": 96181.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 1.7114574909210205, | |
| "epoch": 0.6345811051693404, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001802615933412604, | |
| "loss": 2.7193, | |
| "mean_token_accuracy": 0.4676526263356209, | |
| "num_tokens": 97301.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 1.5929251611232758, | |
| "epoch": 0.6417112299465241, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018002378121284186, | |
| "loss": 2.5851, | |
| "mean_token_accuracy": 0.47982004284858704, | |
| "num_tokens": 98436.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 1.6745183765888214, | |
| "epoch": 0.6488413547237076, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017978596908442332, | |
| "loss": 2.7709, | |
| "mean_token_accuracy": 0.4384681358933449, | |
| "num_tokens": 99477.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 1.59268319606781, | |
| "epoch": 0.6559714795008913, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017954815695600475, | |
| "loss": 2.5459, | |
| "mean_token_accuracy": 0.47278715670108795, | |
| "num_tokens": 100622.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 1.542839676141739, | |
| "epoch": 0.6631016042780749, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001793103448275862, | |
| "loss": 2.6196, | |
| "mean_token_accuracy": 0.48384036868810654, | |
| "num_tokens": 101710.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 1.675879418849945, | |
| "epoch": 0.6702317290552585, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017907253269916766, | |
| "loss": 2.7075, | |
| "mean_token_accuracy": 0.46232420206069946, | |
| "num_tokens": 102745.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 1.520689070224762, | |
| "epoch": 0.6773618538324421, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017883472057074912, | |
| "loss": 2.5151, | |
| "mean_token_accuracy": 0.48868096619844437, | |
| "num_tokens": 103923.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 1.5939531326293945, | |
| "epoch": 0.6844919786096256, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017859690844233058, | |
| "loss": 2.5655, | |
| "mean_token_accuracy": 0.48160815238952637, | |
| "num_tokens": 105063.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 1.6622373461723328, | |
| "epoch": 0.6916221033868093, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000178359096313912, | |
| "loss": 2.6854, | |
| "mean_token_accuracy": 0.4513147324323654, | |
| "num_tokens": 106176.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 1.596898078918457, | |
| "epoch": 0.6987522281639929, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017812128418549347, | |
| "loss": 2.6575, | |
| "mean_token_accuracy": 0.46170156449079514, | |
| "num_tokens": 107276.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 1.6209412813186646, | |
| "epoch": 0.7058823529411765, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017788347205707493, | |
| "loss": 2.6675, | |
| "mean_token_accuracy": 0.4495192915201187, | |
| "num_tokens": 108387.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 1.5751018524169922, | |
| "epoch": 0.7130124777183601, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017764565992865636, | |
| "loss": 2.6782, | |
| "mean_token_accuracy": 0.47493766248226166, | |
| "num_tokens": 109464.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.5404111742973328, | |
| "epoch": 0.7201426024955436, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001774078478002378, | |
| "loss": 2.5644, | |
| "mean_token_accuracy": 0.4800315648317337, | |
| "num_tokens": 110545.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 1.5617071688175201, | |
| "epoch": 0.7272727272727273, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017717003567181927, | |
| "loss": 2.4542, | |
| "mean_token_accuracy": 0.486552469432354, | |
| "num_tokens": 111644.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 1.6221667528152466, | |
| "epoch": 0.7344028520499108, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017693222354340073, | |
| "loss": 2.6873, | |
| "mean_token_accuracy": 0.4583618715405464, | |
| "num_tokens": 112683.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 1.6229766011238098, | |
| "epoch": 0.7415329768270945, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001766944114149822, | |
| "loss": 2.7595, | |
| "mean_token_accuracy": 0.4478139355778694, | |
| "num_tokens": 113734.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 1.6290508806705475, | |
| "epoch": 0.7486631016042781, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017645659928656362, | |
| "loss": 2.648, | |
| "mean_token_accuracy": 0.4640928953886032, | |
| "num_tokens": 114840.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 1.5602010190486908, | |
| "epoch": 0.7557932263814616, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017621878715814507, | |
| "loss": 2.6151, | |
| "mean_token_accuracy": 0.47945626825094223, | |
| "num_tokens": 115918.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 1.561210036277771, | |
| "epoch": 0.7629233511586453, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017598097502972653, | |
| "loss": 2.6753, | |
| "mean_token_accuracy": 0.4755818694829941, | |
| "num_tokens": 116992.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 1.5463439524173737, | |
| "epoch": 0.7700534759358288, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017574316290130796, | |
| "loss": 2.5312, | |
| "mean_token_accuracy": 0.4788963794708252, | |
| "num_tokens": 118163.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 1.545140415430069, | |
| "epoch": 0.7771836007130125, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017550535077288942, | |
| "loss": 2.4861, | |
| "mean_token_accuracy": 0.48517899960279465, | |
| "num_tokens": 119246.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 1.6490318477153778, | |
| "epoch": 0.7843137254901961, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017526753864447088, | |
| "loss": 2.6319, | |
| "mean_token_accuracy": 0.4522154629230499, | |
| "num_tokens": 120352.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.5705026686191559, | |
| "epoch": 0.7914438502673797, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001750297265160523, | |
| "loss": 2.5889, | |
| "mean_token_accuracy": 0.4755884185433388, | |
| "num_tokens": 121495.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 1.5974452495574951, | |
| "epoch": 0.7985739750445633, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001747919143876338, | |
| "loss": 2.5626, | |
| "mean_token_accuracy": 0.4795725420117378, | |
| "num_tokens": 122564.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 1.5680693686008453, | |
| "epoch": 0.8057040998217468, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017455410225921522, | |
| "loss": 2.6273, | |
| "mean_token_accuracy": 0.4590470939874649, | |
| "num_tokens": 123650.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 1.6800198554992676, | |
| "epoch": 0.8128342245989305, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017431629013079668, | |
| "loss": 2.737, | |
| "mean_token_accuracy": 0.45866189897060394, | |
| "num_tokens": 124779.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 1.6031546890735626, | |
| "epoch": 0.8199643493761141, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017407847800237814, | |
| "loss": 2.5439, | |
| "mean_token_accuracy": 0.472602941095829, | |
| "num_tokens": 125900.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 1.6303865611553192, | |
| "epoch": 0.8270944741532977, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017384066587395957, | |
| "loss": 2.6487, | |
| "mean_token_accuracy": 0.4673057347536087, | |
| "num_tokens": 126993.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 1.6584992706775665, | |
| "epoch": 0.8342245989304813, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017360285374554103, | |
| "loss": 2.6956, | |
| "mean_token_accuracy": 0.46377309411764145, | |
| "num_tokens": 128138.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 1.591305822134018, | |
| "epoch": 0.8413547237076648, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017336504161712249, | |
| "loss": 2.713, | |
| "mean_token_accuracy": 0.46212588995695114, | |
| "num_tokens": 129222.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 1.6993348598480225, | |
| "epoch": 0.8484848484848485, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017312722948870392, | |
| "loss": 2.7792, | |
| "mean_token_accuracy": 0.4319414496421814, | |
| "num_tokens": 130249.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 1.5929858684539795, | |
| "epoch": 0.8556149732620321, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001728894173602854, | |
| "loss": 2.6866, | |
| "mean_token_accuracy": 0.47090228646993637, | |
| "num_tokens": 131310.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.667660892009735, | |
| "epoch": 0.8627450980392157, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017265160523186683, | |
| "loss": 2.6699, | |
| "mean_token_accuracy": 0.45984991639852524, | |
| "num_tokens": 132431.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 1.548890471458435, | |
| "epoch": 0.8698752228163993, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017241379310344826, | |
| "loss": 2.6131, | |
| "mean_token_accuracy": 0.4870634377002716, | |
| "num_tokens": 133545.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 1.5599585175514221, | |
| "epoch": 0.8770053475935828, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017217598097502975, | |
| "loss": 2.5476, | |
| "mean_token_accuracy": 0.4777110442519188, | |
| "num_tokens": 134678.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 1.5160918235778809, | |
| "epoch": 0.8841354723707665, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017193816884661118, | |
| "loss": 2.5731, | |
| "mean_token_accuracy": 0.478961318731308, | |
| "num_tokens": 135821.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 1.5480628907680511, | |
| "epoch": 0.8912655971479501, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017170035671819264, | |
| "loss": 2.5291, | |
| "mean_token_accuracy": 0.4755103513598442, | |
| "num_tokens": 136971.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 1.5862606167793274, | |
| "epoch": 0.8983957219251337, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001714625445897741, | |
| "loss": 2.6396, | |
| "mean_token_accuracy": 0.46081865578889847, | |
| "num_tokens": 138007.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 1.669043391942978, | |
| "epoch": 0.9055258467023173, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017122473246135552, | |
| "loss": 2.6321, | |
| "mean_token_accuracy": 0.4644095078110695, | |
| "num_tokens": 139053.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 1.5860805809497833, | |
| "epoch": 0.9126559714795008, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017098692033293698, | |
| "loss": 2.5403, | |
| "mean_token_accuracy": 0.4631856083869934, | |
| "num_tokens": 140147.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 1.6290316879749298, | |
| "epoch": 0.9197860962566845, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017074910820451844, | |
| "loss": 2.6503, | |
| "mean_token_accuracy": 0.46986210346221924, | |
| "num_tokens": 141269.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 1.5800673365592957, | |
| "epoch": 0.9269162210338681, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017051129607609987, | |
| "loss": 2.5817, | |
| "mean_token_accuracy": 0.46785174310207367, | |
| "num_tokens": 142404.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 1.5903257727622986, | |
| "epoch": 0.9340463458110517, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017027348394768135, | |
| "loss": 2.5134, | |
| "mean_token_accuracy": 0.4654293358325958, | |
| "num_tokens": 143534.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 1.5758326351642609, | |
| "epoch": 0.9411764705882353, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017003567181926279, | |
| "loss": 2.6744, | |
| "mean_token_accuracy": 0.47502800077199936, | |
| "num_tokens": 144634.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 1.555185467004776, | |
| "epoch": 0.948306595365419, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016979785969084424, | |
| "loss": 2.5801, | |
| "mean_token_accuracy": 0.47467251121997833, | |
| "num_tokens": 145695.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 1.58922079205513, | |
| "epoch": 0.9554367201426025, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001695600475624257, | |
| "loss": 2.619, | |
| "mean_token_accuracy": 0.4779004603624344, | |
| "num_tokens": 146787.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 1.6058009564876556, | |
| "epoch": 0.9625668449197861, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016932223543400713, | |
| "loss": 2.683, | |
| "mean_token_accuracy": 0.4785980358719826, | |
| "num_tokens": 147817.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 1.6281861662864685, | |
| "epoch": 0.9696969696969697, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001690844233055886, | |
| "loss": 2.7144, | |
| "mean_token_accuracy": 0.4627036154270172, | |
| "num_tokens": 148907.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 1.619028925895691, | |
| "epoch": 0.9768270944741533, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016884661117717005, | |
| "loss": 2.6079, | |
| "mean_token_accuracy": 0.45756662636995316, | |
| "num_tokens": 149993.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 1.5888321995735168, | |
| "epoch": 0.983957219251337, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016860879904875148, | |
| "loss": 2.6093, | |
| "mean_token_accuracy": 0.4724939614534378, | |
| "num_tokens": 151098.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 1.6436494290828705, | |
| "epoch": 0.9910873440285205, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016837098692033296, | |
| "loss": 2.6575, | |
| "mean_token_accuracy": 0.4623269736766815, | |
| "num_tokens": 152163.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 1.5655477941036224, | |
| "epoch": 0.9982174688057041, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001681331747919144, | |
| "loss": 2.6335, | |
| "mean_token_accuracy": 0.47587042301893234, | |
| "num_tokens": 153275.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.7136651277542114, | |
| "epoch": 1.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016789536266349582, | |
| "loss": 2.6691, | |
| "mean_token_accuracy": 0.4723247289657593, | |
| "num_tokens": 153550.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 1.6066825091838837, | |
| "epoch": 1.0071301247771836, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001676575505350773, | |
| "loss": 2.5409, | |
| "mean_token_accuracy": 0.48296716809272766, | |
| "num_tokens": 154641.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 1.5699058771133423, | |
| "epoch": 1.014260249554367, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016741973840665874, | |
| "loss": 2.5472, | |
| "mean_token_accuracy": 0.47184478491544724, | |
| "num_tokens": 155762.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 1.5619142949581146, | |
| "epoch": 1.0213903743315509, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001671819262782402, | |
| "loss": 2.6692, | |
| "mean_token_accuracy": 0.4779776930809021, | |
| "num_tokens": 156859.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 1.5965659022331238, | |
| "epoch": 1.0285204991087344, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016694411414982165, | |
| "loss": 2.621, | |
| "mean_token_accuracy": 0.4778958112001419, | |
| "num_tokens": 157991.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 1.6152032911777496, | |
| "epoch": 1.035650623885918, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016670630202140308, | |
| "loss": 2.5771, | |
| "mean_token_accuracy": 0.4681878834962845, | |
| "num_tokens": 159092.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 1.5952391922473907, | |
| "epoch": 1.0427807486631016, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016646848989298457, | |
| "loss": 2.6372, | |
| "mean_token_accuracy": 0.48037248849868774, | |
| "num_tokens": 160160.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 1.565784215927124, | |
| "epoch": 1.049910873440285, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000166230677764566, | |
| "loss": 2.6739, | |
| "mean_token_accuracy": 0.4823187068104744, | |
| "num_tokens": 161263.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 1.5616013407707214, | |
| "epoch": 1.0570409982174689, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016599286563614743, | |
| "loss": 2.538, | |
| "mean_token_accuracy": 0.4936261996626854, | |
| "num_tokens": 162366.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 1.550614982843399, | |
| "epoch": 1.0641711229946524, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016575505350772892, | |
| "loss": 2.5405, | |
| "mean_token_accuracy": 0.4571392834186554, | |
| "num_tokens": 163510.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.6245464980602264, | |
| "epoch": 1.071301247771836, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016551724137931035, | |
| "loss": 2.6446, | |
| "mean_token_accuracy": 0.46846461296081543, | |
| "num_tokens": 164565.0, | |
| "step": 151 | |
| }, | |
| { | |
| "entropy": 1.4947470128536224, | |
| "epoch": 1.0784313725490196, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001652794292508918, | |
| "loss": 2.5193, | |
| "mean_token_accuracy": 0.4816185459494591, | |
| "num_tokens": 165718.0, | |
| "step": 152 | |
| }, | |
| { | |
| "entropy": 1.5682013034820557, | |
| "epoch": 1.085561497326203, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016504161712247326, | |
| "loss": 2.5916, | |
| "mean_token_accuracy": 0.46111301332712173, | |
| "num_tokens": 166812.0, | |
| "step": 153 | |
| }, | |
| { | |
| "entropy": 1.6156157553195953, | |
| "epoch": 1.0926916221033869, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001648038049940547, | |
| "loss": 2.6128, | |
| "mean_token_accuracy": 0.4597594439983368, | |
| "num_tokens": 167926.0, | |
| "step": 154 | |
| }, | |
| { | |
| "entropy": 1.59586563706398, | |
| "epoch": 1.0998217468805704, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016456599286563618, | |
| "loss": 2.6021, | |
| "mean_token_accuracy": 0.4618517607450485, | |
| "num_tokens": 168982.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 1.6061933636665344, | |
| "epoch": 1.106951871657754, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001643281807372176, | |
| "loss": 2.6854, | |
| "mean_token_accuracy": 0.43533751368522644, | |
| "num_tokens": 170010.0, | |
| "step": 156 | |
| }, | |
| { | |
| "entropy": 1.6085174083709717, | |
| "epoch": 1.1140819964349375, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016409036860879904, | |
| "loss": 2.6981, | |
| "mean_token_accuracy": 0.4599795266985893, | |
| "num_tokens": 171113.0, | |
| "step": 157 | |
| }, | |
| { | |
| "entropy": 1.5913650691509247, | |
| "epoch": 1.121212121212121, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016385255648038052, | |
| "loss": 2.5902, | |
| "mean_token_accuracy": 0.4770422652363777, | |
| "num_tokens": 172251.0, | |
| "step": 158 | |
| }, | |
| { | |
| "entropy": 1.5887981355190277, | |
| "epoch": 1.1283422459893049, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016361474435196195, | |
| "loss": 2.6519, | |
| "mean_token_accuracy": 0.46707142144441605, | |
| "num_tokens": 173342.0, | |
| "step": 159 | |
| }, | |
| { | |
| "entropy": 1.574705809354782, | |
| "epoch": 1.1354723707664884, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001633769322235434, | |
| "loss": 2.6085, | |
| "mean_token_accuracy": 0.478085033595562, | |
| "num_tokens": 174460.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.5885936915874481, | |
| "epoch": 1.142602495543672, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016313912009512487, | |
| "loss": 2.7184, | |
| "mean_token_accuracy": 0.45598648488521576, | |
| "num_tokens": 175576.0, | |
| "step": 161 | |
| }, | |
| { | |
| "entropy": 1.5892296731472015, | |
| "epoch": 1.1497326203208555, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001629013079667063, | |
| "loss": 2.7047, | |
| "mean_token_accuracy": 0.46477679163217545, | |
| "num_tokens": 176675.0, | |
| "step": 162 | |
| }, | |
| { | |
| "entropy": 1.6486813724040985, | |
| "epoch": 1.156862745098039, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016266349583828776, | |
| "loss": 2.7061, | |
| "mean_token_accuracy": 0.46870480477809906, | |
| "num_tokens": 177740.0, | |
| "step": 163 | |
| }, | |
| { | |
| "entropy": 1.5759459435939789, | |
| "epoch": 1.1639928698752229, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016242568370986922, | |
| "loss": 2.5503, | |
| "mean_token_accuracy": 0.46419017761945724, | |
| "num_tokens": 178796.0, | |
| "step": 164 | |
| }, | |
| { | |
| "entropy": 1.6036653220653534, | |
| "epoch": 1.1711229946524064, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016218787158145065, | |
| "loss": 2.6642, | |
| "mean_token_accuracy": 0.47088219970464706, | |
| "num_tokens": 179879.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 1.5834835767745972, | |
| "epoch": 1.17825311942959, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016195005945303213, | |
| "loss": 2.6007, | |
| "mean_token_accuracy": 0.4636633098125458, | |
| "num_tokens": 181061.0, | |
| "step": 166 | |
| }, | |
| { | |
| "entropy": 1.7047175765037537, | |
| "epoch": 1.1853832442067735, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016171224732461356, | |
| "loss": 2.6283, | |
| "mean_token_accuracy": 0.46398939192295074, | |
| "num_tokens": 182160.0, | |
| "step": 167 | |
| }, | |
| { | |
| "entropy": 1.649744689464569, | |
| "epoch": 1.192513368983957, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016147443519619502, | |
| "loss": 2.7414, | |
| "mean_token_accuracy": 0.44918210059404373, | |
| "num_tokens": 183238.0, | |
| "step": 168 | |
| }, | |
| { | |
| "entropy": 1.4582054913043976, | |
| "epoch": 1.1996434937611409, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016123662306777648, | |
| "loss": 2.4485, | |
| "mean_token_accuracy": 0.5028558596968651, | |
| "num_tokens": 184343.0, | |
| "step": 169 | |
| }, | |
| { | |
| "entropy": 1.5874928832054138, | |
| "epoch": 1.2067736185383244, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001609988109393579, | |
| "loss": 2.6477, | |
| "mean_token_accuracy": 0.47324957698583603, | |
| "num_tokens": 185432.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 1.5870967209339142, | |
| "epoch": 1.213903743315508, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016076099881093936, | |
| "loss": 2.6929, | |
| "mean_token_accuracy": 0.459871307015419, | |
| "num_tokens": 186527.0, | |
| "step": 171 | |
| }, | |
| { | |
| "entropy": 1.6552450358867645, | |
| "epoch": 1.2210338680926915, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016052318668252082, | |
| "loss": 2.6735, | |
| "mean_token_accuracy": 0.45555536448955536, | |
| "num_tokens": 187577.0, | |
| "step": 172 | |
| }, | |
| { | |
| "entropy": 1.6631282269954681, | |
| "epoch": 1.228163992869875, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016028537455410225, | |
| "loss": 2.6354, | |
| "mean_token_accuracy": 0.47852643579244614, | |
| "num_tokens": 188730.0, | |
| "step": 173 | |
| }, | |
| { | |
| "entropy": 1.6064516603946686, | |
| "epoch": 1.2352941176470589, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016004756242568374, | |
| "loss": 2.6498, | |
| "mean_token_accuracy": 0.4627776965498924, | |
| "num_tokens": 189851.0, | |
| "step": 174 | |
| }, | |
| { | |
| "entropy": 1.6195379793643951, | |
| "epoch": 1.2424242424242424, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015980975029726517, | |
| "loss": 2.5955, | |
| "mean_token_accuracy": 0.4755994454026222, | |
| "num_tokens": 190954.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 1.583428680896759, | |
| "epoch": 1.249554367201426, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015957193816884663, | |
| "loss": 2.5734, | |
| "mean_token_accuracy": 0.486144557595253, | |
| "num_tokens": 192084.0, | |
| "step": 176 | |
| }, | |
| { | |
| "entropy": 1.61075758934021, | |
| "epoch": 1.2566844919786098, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015933412604042808, | |
| "loss": 2.699, | |
| "mean_token_accuracy": 0.4629248157143593, | |
| "num_tokens": 193111.0, | |
| "step": 177 | |
| }, | |
| { | |
| "entropy": 1.5986732840538025, | |
| "epoch": 1.263814616755793, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015909631391200951, | |
| "loss": 2.6723, | |
| "mean_token_accuracy": 0.4575885683298111, | |
| "num_tokens": 194260.0, | |
| "step": 178 | |
| }, | |
| { | |
| "entropy": 1.5930440127849579, | |
| "epoch": 1.2709447415329769, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015885850178359097, | |
| "loss": 2.7138, | |
| "mean_token_accuracy": 0.4715413674712181, | |
| "num_tokens": 195316.0, | |
| "step": 179 | |
| }, | |
| { | |
| "entropy": 1.605830729007721, | |
| "epoch": 1.2780748663101604, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015862068965517243, | |
| "loss": 2.6979, | |
| "mean_token_accuracy": 0.4691004157066345, | |
| "num_tokens": 196406.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 1.6044515669345856, | |
| "epoch": 1.285204991087344, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015838287752675386, | |
| "loss": 2.6274, | |
| "mean_token_accuracy": 0.4783453792333603, | |
| "num_tokens": 197500.0, | |
| "step": 181 | |
| }, | |
| { | |
| "entropy": 1.5863282978534698, | |
| "epoch": 1.2923351158645278, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015814506539833532, | |
| "loss": 2.5774, | |
| "mean_token_accuracy": 0.48002146929502487, | |
| "num_tokens": 198661.0, | |
| "step": 182 | |
| }, | |
| { | |
| "entropy": 1.673670768737793, | |
| "epoch": 1.299465240641711, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015790725326991678, | |
| "loss": 2.6731, | |
| "mean_token_accuracy": 0.46556977182626724, | |
| "num_tokens": 199760.0, | |
| "step": 183 | |
| }, | |
| { | |
| "entropy": 1.5792707800865173, | |
| "epoch": 1.3065953654188949, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001576694411414982, | |
| "loss": 2.598, | |
| "mean_token_accuracy": 0.4617435112595558, | |
| "num_tokens": 200837.0, | |
| "step": 184 | |
| }, | |
| { | |
| "entropy": 1.6057993471622467, | |
| "epoch": 1.3137254901960784, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001574316290130797, | |
| "loss": 2.5975, | |
| "mean_token_accuracy": 0.469496913254261, | |
| "num_tokens": 201963.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 1.6113585829734802, | |
| "epoch": 1.320855614973262, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015719381688466112, | |
| "loss": 2.7397, | |
| "mean_token_accuracy": 0.46607375144958496, | |
| "num_tokens": 203043.0, | |
| "step": 186 | |
| }, | |
| { | |
| "entropy": 1.6389738619327545, | |
| "epoch": 1.3279857397504458, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015695600475624258, | |
| "loss": 2.6493, | |
| "mean_token_accuracy": 0.4742783457040787, | |
| "num_tokens": 204118.0, | |
| "step": 187 | |
| }, | |
| { | |
| "entropy": 1.6438090801239014, | |
| "epoch": 1.3351158645276293, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015671819262782404, | |
| "loss": 2.7208, | |
| "mean_token_accuracy": 0.4482914060354233, | |
| "num_tokens": 205174.0, | |
| "step": 188 | |
| }, | |
| { | |
| "entropy": 1.5948124527931213, | |
| "epoch": 1.3422459893048129, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015648038049940547, | |
| "loss": 2.6397, | |
| "mean_token_accuracy": 0.4829149469733238, | |
| "num_tokens": 206322.0, | |
| "step": 189 | |
| }, | |
| { | |
| "entropy": 1.5886778235435486, | |
| "epoch": 1.3493761140819964, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015624256837098693, | |
| "loss": 2.4934, | |
| "mean_token_accuracy": 0.4762219786643982, | |
| "num_tokens": 207430.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 1.5851022601127625, | |
| "epoch": 1.35650623885918, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015600475624256838, | |
| "loss": 2.5747, | |
| "mean_token_accuracy": 0.44965071976184845, | |
| "num_tokens": 208572.0, | |
| "step": 191 | |
| }, | |
| { | |
| "entropy": 1.6201273202896118, | |
| "epoch": 1.3636363636363638, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015576694411414981, | |
| "loss": 2.6311, | |
| "mean_token_accuracy": 0.462075375020504, | |
| "num_tokens": 209649.0, | |
| "step": 192 | |
| }, | |
| { | |
| "entropy": 1.5876831114292145, | |
| "epoch": 1.3707664884135473, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015552913198573127, | |
| "loss": 2.6102, | |
| "mean_token_accuracy": 0.48762883245944977, | |
| "num_tokens": 210713.0, | |
| "step": 193 | |
| }, | |
| { | |
| "entropy": 1.5845709443092346, | |
| "epoch": 1.3778966131907309, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015529131985731273, | |
| "loss": 2.6816, | |
| "mean_token_accuracy": 0.48001401126384735, | |
| "num_tokens": 211837.0, | |
| "step": 194 | |
| }, | |
| { | |
| "entropy": 1.6651998162269592, | |
| "epoch": 1.3850267379679144, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001550535077288942, | |
| "loss": 2.6997, | |
| "mean_token_accuracy": 0.4684208855032921, | |
| "num_tokens": 212932.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 1.6087098717689514, | |
| "epoch": 1.392156862745098, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015481569560047564, | |
| "loss": 2.6176, | |
| "mean_token_accuracy": 0.46499625593423843, | |
| "num_tokens": 214054.0, | |
| "step": 196 | |
| }, | |
| { | |
| "entropy": 1.6280483305454254, | |
| "epoch": 1.3992869875222818, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015457788347205708, | |
| "loss": 2.7304, | |
| "mean_token_accuracy": 0.4452286586165428, | |
| "num_tokens": 215200.0, | |
| "step": 197 | |
| }, | |
| { | |
| "entropy": 1.5492911636829376, | |
| "epoch": 1.4064171122994653, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015434007134363853, | |
| "loss": 2.5217, | |
| "mean_token_accuracy": 0.49339231103658676, | |
| "num_tokens": 216317.0, | |
| "step": 198 | |
| }, | |
| { | |
| "entropy": 1.5941800773143768, | |
| "epoch": 1.4135472370766489, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015410225921522, | |
| "loss": 2.4819, | |
| "mean_token_accuracy": 0.4754658564925194, | |
| "num_tokens": 217405.0, | |
| "step": 199 | |
| }, | |
| { | |
| "entropy": 1.5978778004646301, | |
| "epoch": 1.4206773618538324, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015386444708680142, | |
| "loss": 2.7044, | |
| "mean_token_accuracy": 0.45556849241256714, | |
| "num_tokens": 218504.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.5847291052341461, | |
| "epoch": 1.427807486631016, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015362663495838288, | |
| "loss": 2.6487, | |
| "mean_token_accuracy": 0.46325846016407013, | |
| "num_tokens": 219640.0, | |
| "step": 201 | |
| }, | |
| { | |
| "entropy": 1.5988908410072327, | |
| "epoch": 1.4349376114081998, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015338882282996434, | |
| "loss": 2.664, | |
| "mean_token_accuracy": 0.46224743872880936, | |
| "num_tokens": 220642.0, | |
| "step": 202 | |
| }, | |
| { | |
| "entropy": 1.5120367109775543, | |
| "epoch": 1.4420677361853833, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001531510107015458, | |
| "loss": 2.5895, | |
| "mean_token_accuracy": 0.48113133013248444, | |
| "num_tokens": 221735.0, | |
| "step": 203 | |
| }, | |
| { | |
| "entropy": 1.5507851243019104, | |
| "epoch": 1.4491978609625669, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015291319857312725, | |
| "loss": 2.5664, | |
| "mean_token_accuracy": 0.48080600798130035, | |
| "num_tokens": 222814.0, | |
| "step": 204 | |
| }, | |
| { | |
| "entropy": 1.6826366484165192, | |
| "epoch": 1.4563279857397504, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015267538644470868, | |
| "loss": 2.7612, | |
| "mean_token_accuracy": 0.4607429876923561, | |
| "num_tokens": 223905.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 1.6085248589515686, | |
| "epoch": 1.463458110516934, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015243757431629014, | |
| "loss": 2.6815, | |
| "mean_token_accuracy": 0.45429935306310654, | |
| "num_tokens": 224999.0, | |
| "step": 206 | |
| }, | |
| { | |
| "entropy": 1.6905607879161835, | |
| "epoch": 1.4705882352941178, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001521997621878716, | |
| "loss": 2.7331, | |
| "mean_token_accuracy": 0.4660051390528679, | |
| "num_tokens": 226068.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 1.6017423570156097, | |
| "epoch": 1.4777183600713013, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015196195005945303, | |
| "loss": 2.6021, | |
| "mean_token_accuracy": 0.46242382377386093, | |
| "num_tokens": 227126.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 1.493167519569397, | |
| "epoch": 1.4848484848484849, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015172413793103449, | |
| "loss": 2.5246, | |
| "mean_token_accuracy": 0.48408984392881393, | |
| "num_tokens": 228245.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 1.5619983673095703, | |
| "epoch": 1.4919786096256684, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015148632580261594, | |
| "loss": 2.6306, | |
| "mean_token_accuracy": 0.4775094836950302, | |
| "num_tokens": 229336.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.587768405675888, | |
| "epoch": 1.499108734402852, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001512485136741974, | |
| "loss": 2.6375, | |
| "mean_token_accuracy": 0.47264014929533005, | |
| "num_tokens": 230431.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 1.6371296346187592, | |
| "epoch": 1.5062388591800357, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015101070154577883, | |
| "loss": 2.6987, | |
| "mean_token_accuracy": 0.45142069458961487, | |
| "num_tokens": 231538.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 1.615380883216858, | |
| "epoch": 1.5133689839572193, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001507728894173603, | |
| "loss": 2.5931, | |
| "mean_token_accuracy": 0.46151647716760635, | |
| "num_tokens": 232692.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 1.6657347679138184, | |
| "epoch": 1.5204991087344029, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015053507728894175, | |
| "loss": 2.767, | |
| "mean_token_accuracy": 0.45710520446300507, | |
| "num_tokens": 233752.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 1.5707046389579773, | |
| "epoch": 1.5276292335115864, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001502972651605232, | |
| "loss": 2.575, | |
| "mean_token_accuracy": 0.47789450734853745, | |
| "num_tokens": 234860.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 1.6575550436973572, | |
| "epoch": 1.53475935828877, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015005945303210464, | |
| "loss": 2.6208, | |
| "mean_token_accuracy": 0.46910084784030914, | |
| "num_tokens": 235953.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 1.5557922422885895, | |
| "epoch": 1.5418894830659537, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001498216409036861, | |
| "loss": 2.6316, | |
| "mean_token_accuracy": 0.47147487103939056, | |
| "num_tokens": 237113.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 1.5967816710472107, | |
| "epoch": 1.5490196078431373, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014958382877526755, | |
| "loss": 2.6435, | |
| "mean_token_accuracy": 0.4659326821565628, | |
| "num_tokens": 238183.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 1.5619607269763947, | |
| "epoch": 1.5561497326203209, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000149346016646849, | |
| "loss": 2.6573, | |
| "mean_token_accuracy": 0.4648478552699089, | |
| "num_tokens": 239250.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 1.5970399975776672, | |
| "epoch": 1.5632798573975044, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014910820451843044, | |
| "loss": 2.5817, | |
| "mean_token_accuracy": 0.47652987390756607, | |
| "num_tokens": 240379.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.6112593710422516, | |
| "epoch": 1.570409982174688, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001488703923900119, | |
| "loss": 2.6912, | |
| "mean_token_accuracy": 0.46139268577098846, | |
| "num_tokens": 241489.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 1.6720410883426666, | |
| "epoch": 1.5775401069518717, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014863258026159336, | |
| "loss": 2.6818, | |
| "mean_token_accuracy": 0.438846156001091, | |
| "num_tokens": 242546.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 1.6308137476444244, | |
| "epoch": 1.5846702317290553, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014839476813317479, | |
| "loss": 2.7512, | |
| "mean_token_accuracy": 0.4592900201678276, | |
| "num_tokens": 243638.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 1.5938476026058197, | |
| "epoch": 1.5918003565062389, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014815695600475624, | |
| "loss": 2.5778, | |
| "mean_token_accuracy": 0.47352661937475204, | |
| "num_tokens": 244751.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 1.5609805583953857, | |
| "epoch": 1.5989304812834224, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001479191438763377, | |
| "loss": 2.6435, | |
| "mean_token_accuracy": 0.4681161344051361, | |
| "num_tokens": 245826.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 1.6306316256523132, | |
| "epoch": 1.606060606060606, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014768133174791916, | |
| "loss": 2.6791, | |
| "mean_token_accuracy": 0.4553253725171089, | |
| "num_tokens": 246871.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 1.5593467950820923, | |
| "epoch": 1.6131907308377897, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014744351961950062, | |
| "loss": 2.5499, | |
| "mean_token_accuracy": 0.47761671245098114, | |
| "num_tokens": 247955.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 1.55764502286911, | |
| "epoch": 1.6203208556149733, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014720570749108205, | |
| "loss": 2.5377, | |
| "mean_token_accuracy": 0.49300002306699753, | |
| "num_tokens": 249061.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 1.5418426096439362, | |
| "epoch": 1.6274509803921569, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001469678953626635, | |
| "loss": 2.6276, | |
| "mean_token_accuracy": 0.4840138778090477, | |
| "num_tokens": 250157.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 1.6196385324001312, | |
| "epoch": 1.6345811051693404, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014673008323424496, | |
| "loss": 2.5876, | |
| "mean_token_accuracy": 0.440780833363533, | |
| "num_tokens": 251267.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.4990095794200897, | |
| "epoch": 1.641711229946524, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001464922711058264, | |
| "loss": 2.528, | |
| "mean_token_accuracy": 0.49401114135980606, | |
| "num_tokens": 252371.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 1.5892696976661682, | |
| "epoch": 1.6488413547237077, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014625445897740785, | |
| "loss": 2.5374, | |
| "mean_token_accuracy": 0.4945196509361267, | |
| "num_tokens": 253491.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 1.6785107553005219, | |
| "epoch": 1.6559714795008913, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001460166468489893, | |
| "loss": 2.6625, | |
| "mean_token_accuracy": 0.4550144597887993, | |
| "num_tokens": 254598.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 1.5320258140563965, | |
| "epoch": 1.6631016042780749, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014577883472057077, | |
| "loss": 2.6136, | |
| "mean_token_accuracy": 0.48827097564935684, | |
| "num_tokens": 255724.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 1.5822700262069702, | |
| "epoch": 1.6702317290552586, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001455410225921522, | |
| "loss": 2.5751, | |
| "mean_token_accuracy": 0.4698340594768524, | |
| "num_tokens": 256830.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 1.6102330684661865, | |
| "epoch": 1.677361853832442, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014530321046373365, | |
| "loss": 2.6332, | |
| "mean_token_accuracy": 0.47240811586380005, | |
| "num_tokens": 257916.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 1.5286598205566406, | |
| "epoch": 1.6844919786096257, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001450653983353151, | |
| "loss": 2.5149, | |
| "mean_token_accuracy": 0.48652924597263336, | |
| "num_tokens": 259019.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 1.535717934370041, | |
| "epoch": 1.6916221033868093, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014482758620689657, | |
| "loss": 2.5691, | |
| "mean_token_accuracy": 0.4858531951904297, | |
| "num_tokens": 260128.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 1.6363422572612762, | |
| "epoch": 1.6987522281639929, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000144589774078478, | |
| "loss": 2.6366, | |
| "mean_token_accuracy": 0.4610147476196289, | |
| "num_tokens": 261228.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 1.6290847063064575, | |
| "epoch": 1.7058823529411766, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014435196195005946, | |
| "loss": 2.7225, | |
| "mean_token_accuracy": 0.47234660387039185, | |
| "num_tokens": 262270.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.6157021522521973, | |
| "epoch": 1.71301247771836, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014411414982164092, | |
| "loss": 2.6575, | |
| "mean_token_accuracy": 0.45069990307092667, | |
| "num_tokens": 263361.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 1.6559931337833405, | |
| "epoch": 1.7201426024955437, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014387633769322235, | |
| "loss": 2.5864, | |
| "mean_token_accuracy": 0.4479330778121948, | |
| "num_tokens": 264391.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 1.6161220967769623, | |
| "epoch": 1.7272727272727273, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001436385255648038, | |
| "loss": 2.6456, | |
| "mean_token_accuracy": 0.47140543162822723, | |
| "num_tokens": 265517.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 1.6491332352161407, | |
| "epoch": 1.7344028520499108, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014340071343638526, | |
| "loss": 2.7443, | |
| "mean_token_accuracy": 0.4643326923251152, | |
| "num_tokens": 266569.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 1.6256919205188751, | |
| "epoch": 1.7415329768270946, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014316290130796672, | |
| "loss": 2.6762, | |
| "mean_token_accuracy": 0.4532029777765274, | |
| "num_tokens": 267611.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 1.619517207145691, | |
| "epoch": 1.748663101604278, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014292508917954818, | |
| "loss": 2.7279, | |
| "mean_token_accuracy": 0.45651988685131073, | |
| "num_tokens": 268692.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 1.5403787195682526, | |
| "epoch": 1.7557932263814617, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001426872770511296, | |
| "loss": 2.5525, | |
| "mean_token_accuracy": 0.4840865433216095, | |
| "num_tokens": 269788.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 1.6027557253837585, | |
| "epoch": 1.7629233511586453, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014244946492271107, | |
| "loss": 2.574, | |
| "mean_token_accuracy": 0.48409949243068695, | |
| "num_tokens": 270843.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 1.5946480631828308, | |
| "epoch": 1.7700534759358288, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014221165279429252, | |
| "loss": 2.5633, | |
| "mean_token_accuracy": 0.4869764968752861, | |
| "num_tokens": 271964.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 1.6484409868717194, | |
| "epoch": 1.7771836007130126, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014197384066587395, | |
| "loss": 2.6964, | |
| "mean_token_accuracy": 0.4450046420097351, | |
| "num_tokens": 273054.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.6375533640384674, | |
| "epoch": 1.784313725490196, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001417360285374554, | |
| "loss": 2.6718, | |
| "mean_token_accuracy": 0.4587417542934418, | |
| "num_tokens": 274133.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 1.5944361984729767, | |
| "epoch": 1.7914438502673797, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014149821640903687, | |
| "loss": 2.5917, | |
| "mean_token_accuracy": 0.4714890420436859, | |
| "num_tokens": 275243.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 1.6059032678604126, | |
| "epoch": 1.7985739750445633, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001412604042806183, | |
| "loss": 2.6893, | |
| "mean_token_accuracy": 0.45041677355766296, | |
| "num_tokens": 276280.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 1.6716269254684448, | |
| "epoch": 1.8057040998217468, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014102259215219978, | |
| "loss": 2.7229, | |
| "mean_token_accuracy": 0.45646892488002777, | |
| "num_tokens": 277383.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 1.6591968834400177, | |
| "epoch": 1.8128342245989306, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014078478002378122, | |
| "loss": 2.6642, | |
| "mean_token_accuracy": 0.457190565764904, | |
| "num_tokens": 278468.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 1.6405368149280548, | |
| "epoch": 1.819964349376114, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014054696789536267, | |
| "loss": 2.6468, | |
| "mean_token_accuracy": 0.4580274894833565, | |
| "num_tokens": 279544.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 1.6274996399879456, | |
| "epoch": 1.8270944741532977, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014030915576694413, | |
| "loss": 2.6112, | |
| "mean_token_accuracy": 0.4569789469242096, | |
| "num_tokens": 280607.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 1.5651440620422363, | |
| "epoch": 1.8342245989304813, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014007134363852556, | |
| "loss": 2.6034, | |
| "mean_token_accuracy": 0.4507257267832756, | |
| "num_tokens": 281691.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 1.5355416238307953, | |
| "epoch": 1.8413547237076648, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013983353151010702, | |
| "loss": 2.5293, | |
| "mean_token_accuracy": 0.4817895218729973, | |
| "num_tokens": 282815.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 1.592032641172409, | |
| "epoch": 1.8484848484848486, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013959571938168848, | |
| "loss": 2.6128, | |
| "mean_token_accuracy": 0.4846939668059349, | |
| "num_tokens": 283931.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.6933747231960297, | |
| "epoch": 1.855614973262032, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001393579072532699, | |
| "loss": 2.8414, | |
| "mean_token_accuracy": 0.42922718822956085, | |
| "num_tokens": 284997.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 1.549606055021286, | |
| "epoch": 1.8627450980392157, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001391200951248514, | |
| "loss": 2.6134, | |
| "mean_token_accuracy": 0.47253087162971497, | |
| "num_tokens": 286090.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 1.6243177652359009, | |
| "epoch": 1.8698752228163993, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013888228299643282, | |
| "loss": 2.7223, | |
| "mean_token_accuracy": 0.45962000638246536, | |
| "num_tokens": 287183.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 1.5972241163253784, | |
| "epoch": 1.8770053475935828, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013864447086801425, | |
| "loss": 2.6094, | |
| "mean_token_accuracy": 0.47941046208143234, | |
| "num_tokens": 288231.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 1.6008118391036987, | |
| "epoch": 1.8841354723707666, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013840665873959574, | |
| "loss": 2.6731, | |
| "mean_token_accuracy": 0.4635191634297371, | |
| "num_tokens": 289284.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 1.5784848630428314, | |
| "epoch": 1.89126559714795, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013816884661117717, | |
| "loss": 2.6694, | |
| "mean_token_accuracy": 0.47262611985206604, | |
| "num_tokens": 290386.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 1.6606462597846985, | |
| "epoch": 1.8983957219251337, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013793103448275863, | |
| "loss": 2.7134, | |
| "mean_token_accuracy": 0.44980067759752274, | |
| "num_tokens": 291460.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 1.6004058420658112, | |
| "epoch": 1.9055258467023173, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013769322235434008, | |
| "loss": 2.5894, | |
| "mean_token_accuracy": 0.4649396017193794, | |
| "num_tokens": 292530.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 1.6282309591770172, | |
| "epoch": 1.9126559714795008, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013745541022592151, | |
| "loss": 2.6314, | |
| "mean_token_accuracy": 0.4572133645415306, | |
| "num_tokens": 293655.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 1.5889360904693604, | |
| "epoch": 1.9197860962566846, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000137217598097503, | |
| "loss": 2.6898, | |
| "mean_token_accuracy": 0.4732029587030411, | |
| "num_tokens": 294730.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.6428880095481873, | |
| "epoch": 1.926916221033868, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013697978596908443, | |
| "loss": 2.6273, | |
| "mean_token_accuracy": 0.4647170379757881, | |
| "num_tokens": 295818.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 1.5940674245357513, | |
| "epoch": 1.9340463458110517, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013674197384066586, | |
| "loss": 2.5628, | |
| "mean_token_accuracy": 0.47556307166814804, | |
| "num_tokens": 296920.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 1.5898773968219757, | |
| "epoch": 1.9411764705882353, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013650416171224735, | |
| "loss": 2.6949, | |
| "mean_token_accuracy": 0.47456144541502, | |
| "num_tokens": 298028.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 1.651709407567978, | |
| "epoch": 1.9483065953654188, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013626634958382878, | |
| "loss": 2.6833, | |
| "mean_token_accuracy": 0.45697759836912155, | |
| "num_tokens": 299111.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 1.6460674107074738, | |
| "epoch": 1.9554367201426026, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013602853745541023, | |
| "loss": 2.7098, | |
| "mean_token_accuracy": 0.45604951679706573, | |
| "num_tokens": 300236.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 1.645053118467331, | |
| "epoch": 1.962566844919786, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001357907253269917, | |
| "loss": 2.5929, | |
| "mean_token_accuracy": 0.467997670173645, | |
| "num_tokens": 301315.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 1.6668168604373932, | |
| "epoch": 1.9696969696969697, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013555291319857312, | |
| "loss": 2.711, | |
| "mean_token_accuracy": 0.4511336088180542, | |
| "num_tokens": 302441.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 1.6053387224674225, | |
| "epoch": 1.9768270944741533, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013531510107015458, | |
| "loss": 2.5949, | |
| "mean_token_accuracy": 0.4798056557774544, | |
| "num_tokens": 303551.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 1.6160295009613037, | |
| "epoch": 1.9839572192513368, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013507728894173604, | |
| "loss": 2.7294, | |
| "mean_token_accuracy": 0.45531073212623596, | |
| "num_tokens": 304641.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 1.6171720921993256, | |
| "epoch": 1.9910873440285206, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013483947681331747, | |
| "loss": 2.5741, | |
| "mean_token_accuracy": 0.4656490460038185, | |
| "num_tokens": 305726.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 1.6310756206512451, | |
| "epoch": 1.998217468805704, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013460166468489895, | |
| "loss": 2.59, | |
| "mean_token_accuracy": 0.45554178953170776, | |
| "num_tokens": 306833.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 1.6477800607681274, | |
| "epoch": 2.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013436385255648038, | |
| "loss": 2.7133, | |
| "mean_token_accuracy": 0.4486691951751709, | |
| "num_tokens": 307100.0, | |
| "step": 282 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 846, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 6, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.450573084483584e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |