Lalu-Prakash's picture
Upload folder using huggingface_hub
3e778d1 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 282,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.6387848258018494,
"epoch": 0.0071301247771836,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 2.6515,
"mean_token_accuracy": 0.445863775908947,
"num_tokens": 1051.0,
"step": 1
},
{
"entropy": 1.6529399156570435,
"epoch": 0.0142602495543672,
"grad_norm": 0.0,
"learning_rate": 4e-05,
"loss": 2.7153,
"mean_token_accuracy": 0.4470880478620529,
"num_tokens": 2187.0,
"step": 2
},
{
"entropy": 1.5581220388412476,
"epoch": 0.0213903743315508,
"grad_norm": 0.0,
"learning_rate": 8e-05,
"loss": 2.6255,
"mean_token_accuracy": 0.4684518948197365,
"num_tokens": 3295.0,
"step": 3
},
{
"entropy": 1.5885010659694672,
"epoch": 0.0285204991087344,
"grad_norm": 0.0,
"learning_rate": 0.00012,
"loss": 2.5733,
"mean_token_accuracy": 0.4601749926805496,
"num_tokens": 4387.0,
"step": 4
},
{
"entropy": 1.6543156504631042,
"epoch": 0.035650623885918005,
"grad_norm": 0.0,
"learning_rate": 0.00016,
"loss": 2.6477,
"mean_token_accuracy": 0.4556595981121063,
"num_tokens": 5511.0,
"step": 5
},
{
"entropy": 1.5543809533119202,
"epoch": 0.0427807486631016,
"grad_norm": 0.0,
"learning_rate": 0.0002,
"loss": 2.6579,
"mean_token_accuracy": 0.46554840356111526,
"num_tokens": 6598.0,
"step": 6
},
{
"entropy": 1.5990856289863586,
"epoch": 0.049910873440285206,
"grad_norm": 0.0,
"learning_rate": 0.00019976218787158144,
"loss": 2.6618,
"mean_token_accuracy": 0.4601997211575508,
"num_tokens": 7673.0,
"step": 7
},
{
"entropy": 1.6176924407482147,
"epoch": 0.0570409982174688,
"grad_norm": 0.0,
"learning_rate": 0.00019952437574316292,
"loss": 2.7365,
"mean_token_accuracy": 0.4649490937590599,
"num_tokens": 8774.0,
"step": 8
},
{
"entropy": 1.5261479020118713,
"epoch": 0.06417112299465241,
"grad_norm": 0.0,
"learning_rate": 0.00019928656361474436,
"loss": 2.5824,
"mean_token_accuracy": 0.47935400158166885,
"num_tokens": 9886.0,
"step": 9
},
{
"entropy": 1.6173927783966064,
"epoch": 0.07130124777183601,
"grad_norm": 0.0,
"learning_rate": 0.0001990487514863258,
"loss": 2.53,
"mean_token_accuracy": 0.47585250437259674,
"num_tokens": 10983.0,
"step": 10
},
{
"entropy": 1.5592622458934784,
"epoch": 0.0784313725490196,
"grad_norm": 0.0,
"learning_rate": 0.00019881093935790727,
"loss": 2.6422,
"mean_token_accuracy": 0.46385403722524643,
"num_tokens": 12102.0,
"step": 11
},
{
"entropy": 1.5775817930698395,
"epoch": 0.0855614973262032,
"grad_norm": 0.0,
"learning_rate": 0.0001985731272294887,
"loss": 2.6095,
"mean_token_accuracy": 0.49216530472040176,
"num_tokens": 13219.0,
"step": 12
},
{
"entropy": 1.6316154599189758,
"epoch": 0.09269162210338681,
"grad_norm": 0.0,
"learning_rate": 0.00019833531510107019,
"loss": 2.7746,
"mean_token_accuracy": 0.4526786059141159,
"num_tokens": 14309.0,
"step": 13
},
{
"entropy": 1.5752335786819458,
"epoch": 0.09982174688057041,
"grad_norm": 0.0,
"learning_rate": 0.00019809750297265162,
"loss": 2.6994,
"mean_token_accuracy": 0.4809372276067734,
"num_tokens": 15373.0,
"step": 14
},
{
"entropy": 1.5528400540351868,
"epoch": 0.10695187165775401,
"grad_norm": 0.0,
"learning_rate": 0.00019785969084423305,
"loss": 2.6607,
"mean_token_accuracy": 0.4793297126889229,
"num_tokens": 16485.0,
"step": 15
},
{
"entropy": 1.6115093529224396,
"epoch": 0.1140819964349376,
"grad_norm": 0.0,
"learning_rate": 0.00019762187871581453,
"loss": 2.6006,
"mean_token_accuracy": 0.48629891872406006,
"num_tokens": 17654.0,
"step": 16
},
{
"entropy": 1.6650608479976654,
"epoch": 0.12121212121212122,
"grad_norm": 0.0,
"learning_rate": 0.00019738406658739596,
"loss": 2.8227,
"mean_token_accuracy": 0.4469343200325966,
"num_tokens": 18705.0,
"step": 17
},
{
"entropy": 1.5981396436691284,
"epoch": 0.12834224598930483,
"grad_norm": 0.0,
"learning_rate": 0.00019714625445897742,
"loss": 2.6602,
"mean_token_accuracy": 0.48277803510427475,
"num_tokens": 19799.0,
"step": 18
},
{
"entropy": 1.6768196523189545,
"epoch": 0.1354723707664884,
"grad_norm": 0.0,
"learning_rate": 0.00019690844233055888,
"loss": 2.7339,
"mean_token_accuracy": 0.45305337756872177,
"num_tokens": 20799.0,
"step": 19
},
{
"entropy": 1.6276516318321228,
"epoch": 0.14260249554367202,
"grad_norm": 0.0,
"learning_rate": 0.0001966706302021403,
"loss": 2.6103,
"mean_token_accuracy": 0.45762092620134354,
"num_tokens": 21918.0,
"step": 20
},
{
"entropy": 1.5985995829105377,
"epoch": 0.1497326203208556,
"grad_norm": 0.0,
"learning_rate": 0.00019643281807372177,
"loss": 2.6705,
"mean_token_accuracy": 0.474429652094841,
"num_tokens": 23021.0,
"step": 21
},
{
"entropy": 1.6575533151626587,
"epoch": 0.1568627450980392,
"grad_norm": 0.0,
"learning_rate": 0.00019619500594530322,
"loss": 2.7187,
"mean_token_accuracy": 0.4531744047999382,
"num_tokens": 24138.0,
"step": 22
},
{
"entropy": 1.6001036167144775,
"epoch": 0.16399286987522282,
"grad_norm": 0.0,
"learning_rate": 0.00019595719381688465,
"loss": 2.6272,
"mean_token_accuracy": 0.4594630151987076,
"num_tokens": 25229.0,
"step": 23
},
{
"entropy": 1.6082253158092499,
"epoch": 0.1711229946524064,
"grad_norm": 0.0,
"learning_rate": 0.00019571938168846614,
"loss": 2.5586,
"mean_token_accuracy": 0.4784083887934685,
"num_tokens": 26339.0,
"step": 24
},
{
"entropy": 1.6208438277244568,
"epoch": 0.17825311942959002,
"grad_norm": 0.0,
"learning_rate": 0.00019548156956004757,
"loss": 2.6095,
"mean_token_accuracy": 0.4685538485646248,
"num_tokens": 27389.0,
"step": 25
},
{
"entropy": 1.6018637418746948,
"epoch": 0.18538324420677363,
"grad_norm": 0.0,
"learning_rate": 0.00019524375743162903,
"loss": 2.7047,
"mean_token_accuracy": 0.4501718729734421,
"num_tokens": 28491.0,
"step": 26
},
{
"entropy": 1.566238820552826,
"epoch": 0.1925133689839572,
"grad_norm": 0.0,
"learning_rate": 0.00019500594530321049,
"loss": 2.5167,
"mean_token_accuracy": 0.4768272563815117,
"num_tokens": 29617.0,
"step": 27
},
{
"entropy": 1.5953451693058014,
"epoch": 0.19964349376114082,
"grad_norm": 0.0,
"learning_rate": 0.00019476813317479192,
"loss": 2.6907,
"mean_token_accuracy": 0.46527716517448425,
"num_tokens": 30709.0,
"step": 28
},
{
"entropy": 1.6036897897720337,
"epoch": 0.20677361853832443,
"grad_norm": 0.0,
"learning_rate": 0.00019453032104637337,
"loss": 2.621,
"mean_token_accuracy": 0.47111422568559647,
"num_tokens": 31780.0,
"step": 29
},
{
"entropy": 1.6660344302654266,
"epoch": 0.21390374331550802,
"grad_norm": 0.0,
"learning_rate": 0.00019429250891795483,
"loss": 2.6351,
"mean_token_accuracy": 0.45088133215904236,
"num_tokens": 32827.0,
"step": 30
},
{
"entropy": 1.5515300929546356,
"epoch": 0.22103386809269163,
"grad_norm": 0.0,
"learning_rate": 0.00019405469678953626,
"loss": 2.5804,
"mean_token_accuracy": 0.47831422835588455,
"num_tokens": 33977.0,
"step": 31
},
{
"entropy": 1.4917864799499512,
"epoch": 0.2281639928698752,
"grad_norm": 0.0,
"learning_rate": 0.00019381688466111775,
"loss": 2.5906,
"mean_token_accuracy": 0.4912087470293045,
"num_tokens": 35070.0,
"step": 32
},
{
"entropy": 1.591080754995346,
"epoch": 0.23529411764705882,
"grad_norm": 0.0,
"learning_rate": 0.00019357907253269918,
"loss": 2.7147,
"mean_token_accuracy": 0.4610803797841072,
"num_tokens": 36137.0,
"step": 33
},
{
"entropy": 1.5161702930927277,
"epoch": 0.24242424242424243,
"grad_norm": 0.0,
"learning_rate": 0.00019334126040428064,
"loss": 2.4586,
"mean_token_accuracy": 0.5084197968244553,
"num_tokens": 37239.0,
"step": 34
},
{
"entropy": 1.5857904851436615,
"epoch": 0.24955436720142601,
"grad_norm": 0.0,
"learning_rate": 0.0001931034482758621,
"loss": 2.5836,
"mean_token_accuracy": 0.48300980031490326,
"num_tokens": 38330.0,
"step": 35
},
{
"entropy": 1.6427140533924103,
"epoch": 0.25668449197860965,
"grad_norm": 0.0,
"learning_rate": 0.00019286563614744352,
"loss": 2.6629,
"mean_token_accuracy": 0.4738333150744438,
"num_tokens": 39368.0,
"step": 36
},
{
"entropy": 1.6292171776294708,
"epoch": 0.2638146167557932,
"grad_norm": 0.0,
"learning_rate": 0.00019262782401902498,
"loss": 2.7382,
"mean_token_accuracy": 0.46079348772764206,
"num_tokens": 40474.0,
"step": 37
},
{
"entropy": 1.5661435425281525,
"epoch": 0.2709447415329768,
"grad_norm": 0.0,
"learning_rate": 0.00019239001189060644,
"loss": 2.4905,
"mean_token_accuracy": 0.49719007313251495,
"num_tokens": 41558.0,
"step": 38
},
{
"entropy": 1.6255074739456177,
"epoch": 0.27807486631016043,
"grad_norm": 0.0,
"learning_rate": 0.00019215219976218787,
"loss": 2.6303,
"mean_token_accuracy": 0.4578748494386673,
"num_tokens": 42674.0,
"step": 39
},
{
"entropy": 1.4836209118366241,
"epoch": 0.28520499108734404,
"grad_norm": 0.0,
"learning_rate": 0.00019191438763376933,
"loss": 2.5324,
"mean_token_accuracy": 0.4868450313806534,
"num_tokens": 43815.0,
"step": 40
},
{
"entropy": 1.5689969956874847,
"epoch": 0.29233511586452765,
"grad_norm": 0.0,
"learning_rate": 0.00019167657550535079,
"loss": 2.6125,
"mean_token_accuracy": 0.45523975044488907,
"num_tokens": 44890.0,
"step": 41
},
{
"entropy": 1.5794390439987183,
"epoch": 0.2994652406417112,
"grad_norm": 0.0,
"learning_rate": 0.00019143876337693222,
"loss": 2.6012,
"mean_token_accuracy": 0.47063395380973816,
"num_tokens": 45951.0,
"step": 42
},
{
"entropy": 1.6168677806854248,
"epoch": 0.3065953654188948,
"grad_norm": 0.0,
"learning_rate": 0.0001912009512485137,
"loss": 2.6358,
"mean_token_accuracy": 0.46999557316303253,
"num_tokens": 47083.0,
"step": 43
},
{
"entropy": 1.6417001783847809,
"epoch": 0.3137254901960784,
"grad_norm": 0.0,
"learning_rate": 0.00019096313912009513,
"loss": 2.6473,
"mean_token_accuracy": 0.4624517187476158,
"num_tokens": 48088.0,
"step": 44
},
{
"entropy": 1.6053949892520905,
"epoch": 0.32085561497326204,
"grad_norm": 0.0,
"learning_rate": 0.0001907253269916766,
"loss": 2.6728,
"mean_token_accuracy": 0.4565429463982582,
"num_tokens": 49213.0,
"step": 45
},
{
"entropy": 1.5239336490631104,
"epoch": 0.32798573975044565,
"grad_norm": 0.0,
"learning_rate": 0.00019048751486325805,
"loss": 2.5886,
"mean_token_accuracy": 0.47441017627716064,
"num_tokens": 50287.0,
"step": 46
},
{
"entropy": 1.5953067243099213,
"epoch": 0.33511586452762926,
"grad_norm": 0.0,
"learning_rate": 0.00019024970273483948,
"loss": 2.6498,
"mean_token_accuracy": 0.46358855813741684,
"num_tokens": 51379.0,
"step": 47
},
{
"entropy": 1.5651679933071136,
"epoch": 0.3422459893048128,
"grad_norm": 0.0,
"learning_rate": 0.00019001189060642093,
"loss": 2.6206,
"mean_token_accuracy": 0.47789186984300613,
"num_tokens": 52456.0,
"step": 48
},
{
"entropy": 1.5568871796131134,
"epoch": 0.3493761140819964,
"grad_norm": 0.0,
"learning_rate": 0.0001897740784780024,
"loss": 2.4854,
"mean_token_accuracy": 0.46421028673648834,
"num_tokens": 53606.0,
"step": 49
},
{
"entropy": 1.5674110054969788,
"epoch": 0.35650623885918004,
"grad_norm": 0.0,
"learning_rate": 0.00018953626634958382,
"loss": 2.6727,
"mean_token_accuracy": 0.4703332930803299,
"num_tokens": 54675.0,
"step": 50
},
{
"entropy": 1.6391582489013672,
"epoch": 0.36363636363636365,
"grad_norm": 0.0,
"learning_rate": 0.00018929845422116528,
"loss": 2.737,
"mean_token_accuracy": 0.4556998685002327,
"num_tokens": 55814.0,
"step": 51
},
{
"entropy": 1.6075606942176819,
"epoch": 0.37076648841354726,
"grad_norm": 0.0,
"learning_rate": 0.00018906064209274674,
"loss": 2.7034,
"mean_token_accuracy": 0.46947986632585526,
"num_tokens": 56856.0,
"step": 52
},
{
"entropy": 1.663224458694458,
"epoch": 0.3778966131907308,
"grad_norm": 0.0,
"learning_rate": 0.0001888228299643282,
"loss": 2.6482,
"mean_token_accuracy": 0.46404948085546494,
"num_tokens": 57913.0,
"step": 53
},
{
"entropy": 1.5472119748592377,
"epoch": 0.3850267379679144,
"grad_norm": 0.0,
"learning_rate": 0.00018858501783590965,
"loss": 2.6376,
"mean_token_accuracy": 0.46229933202266693,
"num_tokens": 58974.0,
"step": 54
},
{
"entropy": 1.576840728521347,
"epoch": 0.39215686274509803,
"grad_norm": 0.0,
"learning_rate": 0.00018834720570749108,
"loss": 2.5702,
"mean_token_accuracy": 0.4657410681247711,
"num_tokens": 60045.0,
"step": 55
},
{
"entropy": 1.5661181211471558,
"epoch": 0.39928698752228164,
"grad_norm": 0.0,
"learning_rate": 0.00018810939357907254,
"loss": 2.5967,
"mean_token_accuracy": 0.46670273691415787,
"num_tokens": 61139.0,
"step": 56
},
{
"entropy": 1.694770723581314,
"epoch": 0.40641711229946526,
"grad_norm": 0.0,
"learning_rate": 0.000187871581450654,
"loss": 2.7852,
"mean_token_accuracy": 0.4311066195368767,
"num_tokens": 62182.0,
"step": 57
},
{
"entropy": 1.6365650296211243,
"epoch": 0.41354723707664887,
"grad_norm": 0.0,
"learning_rate": 0.00018763376932223543,
"loss": 2.6211,
"mean_token_accuracy": 0.45912958681583405,
"num_tokens": 63294.0,
"step": 58
},
{
"entropy": 1.676701843738556,
"epoch": 0.4206773618538324,
"grad_norm": 0.0,
"learning_rate": 0.0001873959571938169,
"loss": 2.727,
"mean_token_accuracy": 0.4495629146695137,
"num_tokens": 64345.0,
"step": 59
},
{
"entropy": 1.6373060643672943,
"epoch": 0.42780748663101603,
"grad_norm": 0.0,
"learning_rate": 0.00018715814506539835,
"loss": 2.5274,
"mean_token_accuracy": 0.49557945132255554,
"num_tokens": 65514.0,
"step": 60
},
{
"entropy": 1.60589137673378,
"epoch": 0.43493761140819964,
"grad_norm": 0.0,
"learning_rate": 0.0001869203329369798,
"loss": 2.5985,
"mean_token_accuracy": 0.47936083376407623,
"num_tokens": 66582.0,
"step": 61
},
{
"entropy": 1.588552087545395,
"epoch": 0.44206773618538325,
"grad_norm": 0.0,
"learning_rate": 0.00018668252080856126,
"loss": 2.6723,
"mean_token_accuracy": 0.4559238702058792,
"num_tokens": 67675.0,
"step": 62
},
{
"entropy": 1.6571227610111237,
"epoch": 0.44919786096256686,
"grad_norm": 0.0,
"learning_rate": 0.0001864447086801427,
"loss": 2.6471,
"mean_token_accuracy": 0.45193884521722794,
"num_tokens": 68786.0,
"step": 63
},
{
"entropy": 1.624506950378418,
"epoch": 0.4563279857397504,
"grad_norm": 0.0,
"learning_rate": 0.00018620689655172415,
"loss": 2.6798,
"mean_token_accuracy": 0.4612022116780281,
"num_tokens": 69860.0,
"step": 64
},
{
"entropy": 1.621217519044876,
"epoch": 0.46345811051693403,
"grad_norm": 0.0,
"learning_rate": 0.0001859690844233056,
"loss": 2.7073,
"mean_token_accuracy": 0.4635338932275772,
"num_tokens": 71015.0,
"step": 65
},
{
"entropy": 1.613840103149414,
"epoch": 0.47058823529411764,
"grad_norm": 0.0,
"learning_rate": 0.00018573127229488704,
"loss": 2.5895,
"mean_token_accuracy": 0.4725746735930443,
"num_tokens": 72174.0,
"step": 66
},
{
"entropy": 1.6597147285938263,
"epoch": 0.47771836007130125,
"grad_norm": 0.0,
"learning_rate": 0.0001854934601664685,
"loss": 2.6713,
"mean_token_accuracy": 0.453179232776165,
"num_tokens": 73201.0,
"step": 67
},
{
"entropy": 1.5728524029254913,
"epoch": 0.48484848484848486,
"grad_norm": 0.0,
"learning_rate": 0.00018525564803804995,
"loss": 2.5245,
"mean_token_accuracy": 0.4796541631221771,
"num_tokens": 74317.0,
"step": 68
},
{
"entropy": 1.6241073608398438,
"epoch": 0.4919786096256685,
"grad_norm": 0.0,
"learning_rate": 0.0001850178359096314,
"loss": 2.6027,
"mean_token_accuracy": 0.45817650109529495,
"num_tokens": 75379.0,
"step": 69
},
{
"entropy": 1.6420445144176483,
"epoch": 0.49910873440285203,
"grad_norm": 0.0,
"learning_rate": 0.00018478002378121284,
"loss": 2.6937,
"mean_token_accuracy": 0.4578876346349716,
"num_tokens": 76447.0,
"step": 70
},
{
"entropy": 1.5464614629745483,
"epoch": 0.5062388591800356,
"grad_norm": 0.0,
"learning_rate": 0.0001845422116527943,
"loss": 2.6239,
"mean_token_accuracy": 0.4877299517393112,
"num_tokens": 77527.0,
"step": 71
},
{
"entropy": 1.650261253118515,
"epoch": 0.5133689839572193,
"grad_norm": 0.0,
"learning_rate": 0.00018430439952437576,
"loss": 2.7438,
"mean_token_accuracy": 0.44178425520658493,
"num_tokens": 78571.0,
"step": 72
},
{
"entropy": 1.5863160490989685,
"epoch": 0.5204991087344029,
"grad_norm": 0.0,
"learning_rate": 0.00018406658739595721,
"loss": 2.6262,
"mean_token_accuracy": 0.4780261069536209,
"num_tokens": 79679.0,
"step": 73
},
{
"entropy": 1.5492512583732605,
"epoch": 0.5276292335115864,
"grad_norm": 0.0,
"learning_rate": 0.00018382877526753865,
"loss": 2.6273,
"mean_token_accuracy": 0.4793827310204506,
"num_tokens": 80814.0,
"step": 74
},
{
"entropy": 1.641315221786499,
"epoch": 0.5347593582887701,
"grad_norm": 0.0,
"learning_rate": 0.0001835909631391201,
"loss": 2.7035,
"mean_token_accuracy": 0.4648021087050438,
"num_tokens": 81899.0,
"step": 75
},
{
"entropy": 1.6433144211769104,
"epoch": 0.5418894830659536,
"grad_norm": 0.0,
"learning_rate": 0.00018335315101070156,
"loss": 2.6694,
"mean_token_accuracy": 0.46280162036418915,
"num_tokens": 83027.0,
"step": 76
},
{
"entropy": 1.5979108810424805,
"epoch": 0.5490196078431373,
"grad_norm": 0.0,
"learning_rate": 0.00018311533888228302,
"loss": 2.6064,
"mean_token_accuracy": 0.4912242665886879,
"num_tokens": 84157.0,
"step": 77
},
{
"entropy": 1.6251288056373596,
"epoch": 0.5561497326203209,
"grad_norm": 0.0,
"learning_rate": 0.00018287752675386445,
"loss": 2.7191,
"mean_token_accuracy": 0.451671302318573,
"num_tokens": 85250.0,
"step": 78
},
{
"entropy": 1.5671390295028687,
"epoch": 0.5632798573975044,
"grad_norm": 0.0,
"learning_rate": 0.0001826397146254459,
"loss": 2.613,
"mean_token_accuracy": 0.4815371036529541,
"num_tokens": 86331.0,
"step": 79
},
{
"entropy": 1.6086040139198303,
"epoch": 0.5704099821746881,
"grad_norm": 0.0,
"learning_rate": 0.00018240190249702736,
"loss": 2.6629,
"mean_token_accuracy": 0.4586881175637245,
"num_tokens": 87430.0,
"step": 80
},
{
"entropy": 1.643826961517334,
"epoch": 0.5775401069518716,
"grad_norm": 0.0,
"learning_rate": 0.0001821640903686088,
"loss": 2.5622,
"mean_token_accuracy": 0.46292658150196075,
"num_tokens": 88542.0,
"step": 81
},
{
"entropy": 1.6064577996730804,
"epoch": 0.5846702317290553,
"grad_norm": 0.0,
"learning_rate": 0.00018192627824019025,
"loss": 2.6491,
"mean_token_accuracy": 0.47355010360479355,
"num_tokens": 89674.0,
"step": 82
},
{
"entropy": 1.628090888261795,
"epoch": 0.5918003565062389,
"grad_norm": 0.0,
"learning_rate": 0.0001816884661117717,
"loss": 2.5738,
"mean_token_accuracy": 0.44704899936914444,
"num_tokens": 90762.0,
"step": 83
},
{
"entropy": 1.558334767818451,
"epoch": 0.5989304812834224,
"grad_norm": 0.0,
"learning_rate": 0.00018145065398335317,
"loss": 2.5863,
"mean_token_accuracy": 0.4635235145688057,
"num_tokens": 91891.0,
"step": 84
},
{
"entropy": 1.5913751423358917,
"epoch": 0.6060606060606061,
"grad_norm": 0.0,
"learning_rate": 0.0001812128418549346,
"loss": 2.5779,
"mean_token_accuracy": 0.49088721722364426,
"num_tokens": 92975.0,
"step": 85
},
{
"entropy": 1.6400005519390106,
"epoch": 0.6131907308377896,
"grad_norm": 0.0,
"learning_rate": 0.00018097502972651606,
"loss": 2.7314,
"mean_token_accuracy": 0.4603074938058853,
"num_tokens": 94049.0,
"step": 86
},
{
"entropy": 1.6752884984016418,
"epoch": 0.6203208556149733,
"grad_norm": 0.0,
"learning_rate": 0.00018073721759809751,
"loss": 2.6624,
"mean_token_accuracy": 0.4621705859899521,
"num_tokens": 95148.0,
"step": 87
},
{
"entropy": 1.6517191529273987,
"epoch": 0.6274509803921569,
"grad_norm": 0.0,
"learning_rate": 0.00018049940546967897,
"loss": 2.6835,
"mean_token_accuracy": 0.4539112225174904,
"num_tokens": 96181.0,
"step": 88
},
{
"entropy": 1.7114574909210205,
"epoch": 0.6345811051693404,
"grad_norm": 0.0,
"learning_rate": 0.0001802615933412604,
"loss": 2.7193,
"mean_token_accuracy": 0.4676526263356209,
"num_tokens": 97301.0,
"step": 89
},
{
"entropy": 1.5929251611232758,
"epoch": 0.6417112299465241,
"grad_norm": 0.0,
"learning_rate": 0.00018002378121284186,
"loss": 2.5851,
"mean_token_accuracy": 0.47982004284858704,
"num_tokens": 98436.0,
"step": 90
},
{
"entropy": 1.6745183765888214,
"epoch": 0.6488413547237076,
"grad_norm": 0.0,
"learning_rate": 0.00017978596908442332,
"loss": 2.7709,
"mean_token_accuracy": 0.4384681358933449,
"num_tokens": 99477.0,
"step": 91
},
{
"entropy": 1.59268319606781,
"epoch": 0.6559714795008913,
"grad_norm": 0.0,
"learning_rate": 0.00017954815695600475,
"loss": 2.5459,
"mean_token_accuracy": 0.47278715670108795,
"num_tokens": 100622.0,
"step": 92
},
{
"entropy": 1.542839676141739,
"epoch": 0.6631016042780749,
"grad_norm": 0.0,
"learning_rate": 0.0001793103448275862,
"loss": 2.6196,
"mean_token_accuracy": 0.48384036868810654,
"num_tokens": 101710.0,
"step": 93
},
{
"entropy": 1.675879418849945,
"epoch": 0.6702317290552585,
"grad_norm": 0.0,
"learning_rate": 0.00017907253269916766,
"loss": 2.7075,
"mean_token_accuracy": 0.46232420206069946,
"num_tokens": 102745.0,
"step": 94
},
{
"entropy": 1.520689070224762,
"epoch": 0.6773618538324421,
"grad_norm": 0.0,
"learning_rate": 0.00017883472057074912,
"loss": 2.5151,
"mean_token_accuracy": 0.48868096619844437,
"num_tokens": 103923.0,
"step": 95
},
{
"entropy": 1.5939531326293945,
"epoch": 0.6844919786096256,
"grad_norm": 0.0,
"learning_rate": 0.00017859690844233058,
"loss": 2.5655,
"mean_token_accuracy": 0.48160815238952637,
"num_tokens": 105063.0,
"step": 96
},
{
"entropy": 1.6622373461723328,
"epoch": 0.6916221033868093,
"grad_norm": 0.0,
"learning_rate": 0.000178359096313912,
"loss": 2.6854,
"mean_token_accuracy": 0.4513147324323654,
"num_tokens": 106176.0,
"step": 97
},
{
"entropy": 1.596898078918457,
"epoch": 0.6987522281639929,
"grad_norm": 0.0,
"learning_rate": 0.00017812128418549347,
"loss": 2.6575,
"mean_token_accuracy": 0.46170156449079514,
"num_tokens": 107276.0,
"step": 98
},
{
"entropy": 1.6209412813186646,
"epoch": 0.7058823529411765,
"grad_norm": 0.0,
"learning_rate": 0.00017788347205707493,
"loss": 2.6675,
"mean_token_accuracy": 0.4495192915201187,
"num_tokens": 108387.0,
"step": 99
},
{
"entropy": 1.5751018524169922,
"epoch": 0.7130124777183601,
"grad_norm": 0.0,
"learning_rate": 0.00017764565992865636,
"loss": 2.6782,
"mean_token_accuracy": 0.47493766248226166,
"num_tokens": 109464.0,
"step": 100
},
{
"entropy": 1.5404111742973328,
"epoch": 0.7201426024955436,
"grad_norm": 0.0,
"learning_rate": 0.0001774078478002378,
"loss": 2.5644,
"mean_token_accuracy": 0.4800315648317337,
"num_tokens": 110545.0,
"step": 101
},
{
"entropy": 1.5617071688175201,
"epoch": 0.7272727272727273,
"grad_norm": 0.0,
"learning_rate": 0.00017717003567181927,
"loss": 2.4542,
"mean_token_accuracy": 0.486552469432354,
"num_tokens": 111644.0,
"step": 102
},
{
"entropy": 1.6221667528152466,
"epoch": 0.7344028520499108,
"grad_norm": 0.0,
"learning_rate": 0.00017693222354340073,
"loss": 2.6873,
"mean_token_accuracy": 0.4583618715405464,
"num_tokens": 112683.0,
"step": 103
},
{
"entropy": 1.6229766011238098,
"epoch": 0.7415329768270945,
"grad_norm": 0.0,
"learning_rate": 0.0001766944114149822,
"loss": 2.7595,
"mean_token_accuracy": 0.4478139355778694,
"num_tokens": 113734.0,
"step": 104
},
{
"entropy": 1.6290508806705475,
"epoch": 0.7486631016042781,
"grad_norm": 0.0,
"learning_rate": 0.00017645659928656362,
"loss": 2.648,
"mean_token_accuracy": 0.4640928953886032,
"num_tokens": 114840.0,
"step": 105
},
{
"entropy": 1.5602010190486908,
"epoch": 0.7557932263814616,
"grad_norm": 0.0,
"learning_rate": 0.00017621878715814507,
"loss": 2.6151,
"mean_token_accuracy": 0.47945626825094223,
"num_tokens": 115918.0,
"step": 106
},
{
"entropy": 1.561210036277771,
"epoch": 0.7629233511586453,
"grad_norm": 0.0,
"learning_rate": 0.00017598097502972653,
"loss": 2.6753,
"mean_token_accuracy": 0.4755818694829941,
"num_tokens": 116992.0,
"step": 107
},
{
"entropy": 1.5463439524173737,
"epoch": 0.7700534759358288,
"grad_norm": 0.0,
"learning_rate": 0.00017574316290130796,
"loss": 2.5312,
"mean_token_accuracy": 0.4788963794708252,
"num_tokens": 118163.0,
"step": 108
},
{
"entropy": 1.545140415430069,
"epoch": 0.7771836007130125,
"grad_norm": 0.0,
"learning_rate": 0.00017550535077288942,
"loss": 2.4861,
"mean_token_accuracy": 0.48517899960279465,
"num_tokens": 119246.0,
"step": 109
},
{
"entropy": 1.6490318477153778,
"epoch": 0.7843137254901961,
"grad_norm": 0.0,
"learning_rate": 0.00017526753864447088,
"loss": 2.6319,
"mean_token_accuracy": 0.4522154629230499,
"num_tokens": 120352.0,
"step": 110
},
{
"entropy": 1.5705026686191559,
"epoch": 0.7914438502673797,
"grad_norm": 0.0,
"learning_rate": 0.0001750297265160523,
"loss": 2.5889,
"mean_token_accuracy": 0.4755884185433388,
"num_tokens": 121495.0,
"step": 111
},
{
"entropy": 1.5974452495574951,
"epoch": 0.7985739750445633,
"grad_norm": 0.0,
"learning_rate": 0.0001747919143876338,
"loss": 2.5626,
"mean_token_accuracy": 0.4795725420117378,
"num_tokens": 122564.0,
"step": 112
},
{
"entropy": 1.5680693686008453,
"epoch": 0.8057040998217468,
"grad_norm": 0.0,
"learning_rate": 0.00017455410225921522,
"loss": 2.6273,
"mean_token_accuracy": 0.4590470939874649,
"num_tokens": 123650.0,
"step": 113
},
{
"entropy": 1.6800198554992676,
"epoch": 0.8128342245989305,
"grad_norm": 0.0,
"learning_rate": 0.00017431629013079668,
"loss": 2.737,
"mean_token_accuracy": 0.45866189897060394,
"num_tokens": 124779.0,
"step": 114
},
{
"entropy": 1.6031546890735626,
"epoch": 0.8199643493761141,
"grad_norm": 0.0,
"learning_rate": 0.00017407847800237814,
"loss": 2.5439,
"mean_token_accuracy": 0.472602941095829,
"num_tokens": 125900.0,
"step": 115
},
{
"entropy": 1.6303865611553192,
"epoch": 0.8270944741532977,
"grad_norm": 0.0,
"learning_rate": 0.00017384066587395957,
"loss": 2.6487,
"mean_token_accuracy": 0.4673057347536087,
"num_tokens": 126993.0,
"step": 116
},
{
"entropy": 1.6584992706775665,
"epoch": 0.8342245989304813,
"grad_norm": 0.0,
"learning_rate": 0.00017360285374554103,
"loss": 2.6956,
"mean_token_accuracy": 0.46377309411764145,
"num_tokens": 128138.0,
"step": 117
},
{
"entropy": 1.591305822134018,
"epoch": 0.8413547237076648,
"grad_norm": 0.0,
"learning_rate": 0.00017336504161712249,
"loss": 2.713,
"mean_token_accuracy": 0.46212588995695114,
"num_tokens": 129222.0,
"step": 118
},
{
"entropy": 1.6993348598480225,
"epoch": 0.8484848484848485,
"grad_norm": 0.0,
"learning_rate": 0.00017312722948870392,
"loss": 2.7792,
"mean_token_accuracy": 0.4319414496421814,
"num_tokens": 130249.0,
"step": 119
},
{
"entropy": 1.5929858684539795,
"epoch": 0.8556149732620321,
"grad_norm": 0.0,
"learning_rate": 0.0001728894173602854,
"loss": 2.6866,
"mean_token_accuracy": 0.47090228646993637,
"num_tokens": 131310.0,
"step": 120
},
{
"entropy": 1.667660892009735,
"epoch": 0.8627450980392157,
"grad_norm": 0.0,
"learning_rate": 0.00017265160523186683,
"loss": 2.6699,
"mean_token_accuracy": 0.45984991639852524,
"num_tokens": 132431.0,
"step": 121
},
{
"entropy": 1.548890471458435,
"epoch": 0.8698752228163993,
"grad_norm": 0.0,
"learning_rate": 0.00017241379310344826,
"loss": 2.6131,
"mean_token_accuracy": 0.4870634377002716,
"num_tokens": 133545.0,
"step": 122
},
{
"entropy": 1.5599585175514221,
"epoch": 0.8770053475935828,
"grad_norm": 0.0,
"learning_rate": 0.00017217598097502975,
"loss": 2.5476,
"mean_token_accuracy": 0.4777110442519188,
"num_tokens": 134678.0,
"step": 123
},
{
"entropy": 1.5160918235778809,
"epoch": 0.8841354723707665,
"grad_norm": 0.0,
"learning_rate": 0.00017193816884661118,
"loss": 2.5731,
"mean_token_accuracy": 0.478961318731308,
"num_tokens": 135821.0,
"step": 124
},
{
"entropy": 1.5480628907680511,
"epoch": 0.8912655971479501,
"grad_norm": 0.0,
"learning_rate": 0.00017170035671819264,
"loss": 2.5291,
"mean_token_accuracy": 0.4755103513598442,
"num_tokens": 136971.0,
"step": 125
},
{
"entropy": 1.5862606167793274,
"epoch": 0.8983957219251337,
"grad_norm": 0.0,
"learning_rate": 0.0001714625445897741,
"loss": 2.6396,
"mean_token_accuracy": 0.46081865578889847,
"num_tokens": 138007.0,
"step": 126
},
{
"entropy": 1.669043391942978,
"epoch": 0.9055258467023173,
"grad_norm": 0.0,
"learning_rate": 0.00017122473246135552,
"loss": 2.6321,
"mean_token_accuracy": 0.4644095078110695,
"num_tokens": 139053.0,
"step": 127
},
{
"entropy": 1.5860805809497833,
"epoch": 0.9126559714795008,
"grad_norm": 0.0,
"learning_rate": 0.00017098692033293698,
"loss": 2.5403,
"mean_token_accuracy": 0.4631856083869934,
"num_tokens": 140147.0,
"step": 128
},
{
"entropy": 1.6290316879749298,
"epoch": 0.9197860962566845,
"grad_norm": 0.0,
"learning_rate": 0.00017074910820451844,
"loss": 2.6503,
"mean_token_accuracy": 0.46986210346221924,
"num_tokens": 141269.0,
"step": 129
},
{
"entropy": 1.5800673365592957,
"epoch": 0.9269162210338681,
"grad_norm": 0.0,
"learning_rate": 0.00017051129607609987,
"loss": 2.5817,
"mean_token_accuracy": 0.46785174310207367,
"num_tokens": 142404.0,
"step": 130
},
{
"entropy": 1.5903257727622986,
"epoch": 0.9340463458110517,
"grad_norm": 0.0,
"learning_rate": 0.00017027348394768135,
"loss": 2.5134,
"mean_token_accuracy": 0.4654293358325958,
"num_tokens": 143534.0,
"step": 131
},
{
"entropy": 1.5758326351642609,
"epoch": 0.9411764705882353,
"grad_norm": 0.0,
"learning_rate": 0.00017003567181926279,
"loss": 2.6744,
"mean_token_accuracy": 0.47502800077199936,
"num_tokens": 144634.0,
"step": 132
},
{
"entropy": 1.555185467004776,
"epoch": 0.948306595365419,
"grad_norm": 0.0,
"learning_rate": 0.00016979785969084424,
"loss": 2.5801,
"mean_token_accuracy": 0.47467251121997833,
"num_tokens": 145695.0,
"step": 133
},
{
"entropy": 1.58922079205513,
"epoch": 0.9554367201426025,
"grad_norm": 0.0,
"learning_rate": 0.0001695600475624257,
"loss": 2.619,
"mean_token_accuracy": 0.4779004603624344,
"num_tokens": 146787.0,
"step": 134
},
{
"entropy": 1.6058009564876556,
"epoch": 0.9625668449197861,
"grad_norm": 0.0,
"learning_rate": 0.00016932223543400713,
"loss": 2.683,
"mean_token_accuracy": 0.4785980358719826,
"num_tokens": 147817.0,
"step": 135
},
{
"entropy": 1.6281861662864685,
"epoch": 0.9696969696969697,
"grad_norm": 0.0,
"learning_rate": 0.0001690844233055886,
"loss": 2.7144,
"mean_token_accuracy": 0.4627036154270172,
"num_tokens": 148907.0,
"step": 136
},
{
"entropy": 1.619028925895691,
"epoch": 0.9768270944741533,
"grad_norm": 0.0,
"learning_rate": 0.00016884661117717005,
"loss": 2.6079,
"mean_token_accuracy": 0.45756662636995316,
"num_tokens": 149993.0,
"step": 137
},
{
"entropy": 1.5888321995735168,
"epoch": 0.983957219251337,
"grad_norm": 0.0,
"learning_rate": 0.00016860879904875148,
"loss": 2.6093,
"mean_token_accuracy": 0.4724939614534378,
"num_tokens": 151098.0,
"step": 138
},
{
"entropy": 1.6436494290828705,
"epoch": 0.9910873440285205,
"grad_norm": 0.0,
"learning_rate": 0.00016837098692033296,
"loss": 2.6575,
"mean_token_accuracy": 0.4623269736766815,
"num_tokens": 152163.0,
"step": 139
},
{
"entropy": 1.5655477941036224,
"epoch": 0.9982174688057041,
"grad_norm": 0.0,
"learning_rate": 0.0001681331747919144,
"loss": 2.6335,
"mean_token_accuracy": 0.47587042301893234,
"num_tokens": 153275.0,
"step": 140
},
{
"entropy": 1.7136651277542114,
"epoch": 1.0,
"grad_norm": 0.0,
"learning_rate": 0.00016789536266349582,
"loss": 2.6691,
"mean_token_accuracy": 0.4723247289657593,
"num_tokens": 153550.0,
"step": 141
},
{
"entropy": 1.6066825091838837,
"epoch": 1.0071301247771836,
"grad_norm": 0.0,
"learning_rate": 0.0001676575505350773,
"loss": 2.5409,
"mean_token_accuracy": 0.48296716809272766,
"num_tokens": 154641.0,
"step": 142
},
{
"entropy": 1.5699058771133423,
"epoch": 1.014260249554367,
"grad_norm": 0.0,
"learning_rate": 0.00016741973840665874,
"loss": 2.5472,
"mean_token_accuracy": 0.47184478491544724,
"num_tokens": 155762.0,
"step": 143
},
{
"entropy": 1.5619142949581146,
"epoch": 1.0213903743315509,
"grad_norm": 0.0,
"learning_rate": 0.0001671819262782402,
"loss": 2.6692,
"mean_token_accuracy": 0.4779776930809021,
"num_tokens": 156859.0,
"step": 144
},
{
"entropy": 1.5965659022331238,
"epoch": 1.0285204991087344,
"grad_norm": 0.0,
"learning_rate": 0.00016694411414982165,
"loss": 2.621,
"mean_token_accuracy": 0.4778958112001419,
"num_tokens": 157991.0,
"step": 145
},
{
"entropy": 1.6152032911777496,
"epoch": 1.035650623885918,
"grad_norm": 0.0,
"learning_rate": 0.00016670630202140308,
"loss": 2.5771,
"mean_token_accuracy": 0.4681878834962845,
"num_tokens": 159092.0,
"step": 146
},
{
"entropy": 1.5952391922473907,
"epoch": 1.0427807486631016,
"grad_norm": 0.0,
"learning_rate": 0.00016646848989298457,
"loss": 2.6372,
"mean_token_accuracy": 0.48037248849868774,
"num_tokens": 160160.0,
"step": 147
},
{
"entropy": 1.565784215927124,
"epoch": 1.049910873440285,
"grad_norm": 0.0,
"learning_rate": 0.000166230677764566,
"loss": 2.6739,
"mean_token_accuracy": 0.4823187068104744,
"num_tokens": 161263.0,
"step": 148
},
{
"entropy": 1.5616013407707214,
"epoch": 1.0570409982174689,
"grad_norm": 0.0,
"learning_rate": 0.00016599286563614743,
"loss": 2.538,
"mean_token_accuracy": 0.4936261996626854,
"num_tokens": 162366.0,
"step": 149
},
{
"entropy": 1.550614982843399,
"epoch": 1.0641711229946524,
"grad_norm": 0.0,
"learning_rate": 0.00016575505350772892,
"loss": 2.5405,
"mean_token_accuracy": 0.4571392834186554,
"num_tokens": 163510.0,
"step": 150
},
{
"entropy": 1.6245464980602264,
"epoch": 1.071301247771836,
"grad_norm": 0.0,
"learning_rate": 0.00016551724137931035,
"loss": 2.6446,
"mean_token_accuracy": 0.46846461296081543,
"num_tokens": 164565.0,
"step": 151
},
{
"entropy": 1.4947470128536224,
"epoch": 1.0784313725490196,
"grad_norm": 0.0,
"learning_rate": 0.0001652794292508918,
"loss": 2.5193,
"mean_token_accuracy": 0.4816185459494591,
"num_tokens": 165718.0,
"step": 152
},
{
"entropy": 1.5682013034820557,
"epoch": 1.085561497326203,
"grad_norm": 0.0,
"learning_rate": 0.00016504161712247326,
"loss": 2.5916,
"mean_token_accuracy": 0.46111301332712173,
"num_tokens": 166812.0,
"step": 153
},
{
"entropy": 1.6156157553195953,
"epoch": 1.0926916221033869,
"grad_norm": 0.0,
"learning_rate": 0.0001648038049940547,
"loss": 2.6128,
"mean_token_accuracy": 0.4597594439983368,
"num_tokens": 167926.0,
"step": 154
},
{
"entropy": 1.59586563706398,
"epoch": 1.0998217468805704,
"grad_norm": 0.0,
"learning_rate": 0.00016456599286563618,
"loss": 2.6021,
"mean_token_accuracy": 0.4618517607450485,
"num_tokens": 168982.0,
"step": 155
},
{
"entropy": 1.6061933636665344,
"epoch": 1.106951871657754,
"grad_norm": 0.0,
"learning_rate": 0.0001643281807372176,
"loss": 2.6854,
"mean_token_accuracy": 0.43533751368522644,
"num_tokens": 170010.0,
"step": 156
},
{
"entropy": 1.6085174083709717,
"epoch": 1.1140819964349375,
"grad_norm": 0.0,
"learning_rate": 0.00016409036860879904,
"loss": 2.6981,
"mean_token_accuracy": 0.4599795266985893,
"num_tokens": 171113.0,
"step": 157
},
{
"entropy": 1.5913650691509247,
"epoch": 1.121212121212121,
"grad_norm": 0.0,
"learning_rate": 0.00016385255648038052,
"loss": 2.5902,
"mean_token_accuracy": 0.4770422652363777,
"num_tokens": 172251.0,
"step": 158
},
{
"entropy": 1.5887981355190277,
"epoch": 1.1283422459893049,
"grad_norm": 0.0,
"learning_rate": 0.00016361474435196195,
"loss": 2.6519,
"mean_token_accuracy": 0.46707142144441605,
"num_tokens": 173342.0,
"step": 159
},
{
"entropy": 1.574705809354782,
"epoch": 1.1354723707664884,
"grad_norm": 0.0,
"learning_rate": 0.0001633769322235434,
"loss": 2.6085,
"mean_token_accuracy": 0.478085033595562,
"num_tokens": 174460.0,
"step": 160
},
{
"entropy": 1.5885936915874481,
"epoch": 1.142602495543672,
"grad_norm": 0.0,
"learning_rate": 0.00016313912009512487,
"loss": 2.7184,
"mean_token_accuracy": 0.45598648488521576,
"num_tokens": 175576.0,
"step": 161
},
{
"entropy": 1.5892296731472015,
"epoch": 1.1497326203208555,
"grad_norm": 0.0,
"learning_rate": 0.0001629013079667063,
"loss": 2.7047,
"mean_token_accuracy": 0.46477679163217545,
"num_tokens": 176675.0,
"step": 162
},
{
"entropy": 1.6486813724040985,
"epoch": 1.156862745098039,
"grad_norm": 0.0,
"learning_rate": 0.00016266349583828776,
"loss": 2.7061,
"mean_token_accuracy": 0.46870480477809906,
"num_tokens": 177740.0,
"step": 163
},
{
"entropy": 1.5759459435939789,
"epoch": 1.1639928698752229,
"grad_norm": 0.0,
"learning_rate": 0.00016242568370986922,
"loss": 2.5503,
"mean_token_accuracy": 0.46419017761945724,
"num_tokens": 178796.0,
"step": 164
},
{
"entropy": 1.6036653220653534,
"epoch": 1.1711229946524064,
"grad_norm": 0.0,
"learning_rate": 0.00016218787158145065,
"loss": 2.6642,
"mean_token_accuracy": 0.47088219970464706,
"num_tokens": 179879.0,
"step": 165
},
{
"entropy": 1.5834835767745972,
"epoch": 1.17825311942959,
"grad_norm": 0.0,
"learning_rate": 0.00016195005945303213,
"loss": 2.6007,
"mean_token_accuracy": 0.4636633098125458,
"num_tokens": 181061.0,
"step": 166
},
{
"entropy": 1.7047175765037537,
"epoch": 1.1853832442067735,
"grad_norm": 0.0,
"learning_rate": 0.00016171224732461356,
"loss": 2.6283,
"mean_token_accuracy": 0.46398939192295074,
"num_tokens": 182160.0,
"step": 167
},
{
"entropy": 1.649744689464569,
"epoch": 1.192513368983957,
"grad_norm": 0.0,
"learning_rate": 0.00016147443519619502,
"loss": 2.7414,
"mean_token_accuracy": 0.44918210059404373,
"num_tokens": 183238.0,
"step": 168
},
{
"entropy": 1.4582054913043976,
"epoch": 1.1996434937611409,
"grad_norm": 0.0,
"learning_rate": 0.00016123662306777648,
"loss": 2.4485,
"mean_token_accuracy": 0.5028558596968651,
"num_tokens": 184343.0,
"step": 169
},
{
"entropy": 1.5874928832054138,
"epoch": 1.2067736185383244,
"grad_norm": 0.0,
"learning_rate": 0.0001609988109393579,
"loss": 2.6477,
"mean_token_accuracy": 0.47324957698583603,
"num_tokens": 185432.0,
"step": 170
},
{
"entropy": 1.5870967209339142,
"epoch": 1.213903743315508,
"grad_norm": 0.0,
"learning_rate": 0.00016076099881093936,
"loss": 2.6929,
"mean_token_accuracy": 0.459871307015419,
"num_tokens": 186527.0,
"step": 171
},
{
"entropy": 1.6552450358867645,
"epoch": 1.2210338680926915,
"grad_norm": 0.0,
"learning_rate": 0.00016052318668252082,
"loss": 2.6735,
"mean_token_accuracy": 0.45555536448955536,
"num_tokens": 187577.0,
"step": 172
},
{
"entropy": 1.6631282269954681,
"epoch": 1.228163992869875,
"grad_norm": 0.0,
"learning_rate": 0.00016028537455410225,
"loss": 2.6354,
"mean_token_accuracy": 0.47852643579244614,
"num_tokens": 188730.0,
"step": 173
},
{
"entropy": 1.6064516603946686,
"epoch": 1.2352941176470589,
"grad_norm": 0.0,
"learning_rate": 0.00016004756242568374,
"loss": 2.6498,
"mean_token_accuracy": 0.4627776965498924,
"num_tokens": 189851.0,
"step": 174
},
{
"entropy": 1.6195379793643951,
"epoch": 1.2424242424242424,
"grad_norm": 0.0,
"learning_rate": 0.00015980975029726517,
"loss": 2.5955,
"mean_token_accuracy": 0.4755994454026222,
"num_tokens": 190954.0,
"step": 175
},
{
"entropy": 1.583428680896759,
"epoch": 1.249554367201426,
"grad_norm": 0.0,
"learning_rate": 0.00015957193816884663,
"loss": 2.5734,
"mean_token_accuracy": 0.486144557595253,
"num_tokens": 192084.0,
"step": 176
},
{
"entropy": 1.61075758934021,
"epoch": 1.2566844919786098,
"grad_norm": 0.0,
"learning_rate": 0.00015933412604042808,
"loss": 2.699,
"mean_token_accuracy": 0.4629248157143593,
"num_tokens": 193111.0,
"step": 177
},
{
"entropy": 1.5986732840538025,
"epoch": 1.263814616755793,
"grad_norm": 0.0,
"learning_rate": 0.00015909631391200951,
"loss": 2.6723,
"mean_token_accuracy": 0.4575885683298111,
"num_tokens": 194260.0,
"step": 178
},
{
"entropy": 1.5930440127849579,
"epoch": 1.2709447415329769,
"grad_norm": 0.0,
"learning_rate": 0.00015885850178359097,
"loss": 2.7138,
"mean_token_accuracy": 0.4715413674712181,
"num_tokens": 195316.0,
"step": 179
},
{
"entropy": 1.605830729007721,
"epoch": 1.2780748663101604,
"grad_norm": 0.0,
"learning_rate": 0.00015862068965517243,
"loss": 2.6979,
"mean_token_accuracy": 0.4691004157066345,
"num_tokens": 196406.0,
"step": 180
},
{
"entropy": 1.6044515669345856,
"epoch": 1.285204991087344,
"grad_norm": 0.0,
"learning_rate": 0.00015838287752675386,
"loss": 2.6274,
"mean_token_accuracy": 0.4783453792333603,
"num_tokens": 197500.0,
"step": 181
},
{
"entropy": 1.5863282978534698,
"epoch": 1.2923351158645278,
"grad_norm": 0.0,
"learning_rate": 0.00015814506539833532,
"loss": 2.5774,
"mean_token_accuracy": 0.48002146929502487,
"num_tokens": 198661.0,
"step": 182
},
{
"entropy": 1.673670768737793,
"epoch": 1.299465240641711,
"grad_norm": 0.0,
"learning_rate": 0.00015790725326991678,
"loss": 2.6731,
"mean_token_accuracy": 0.46556977182626724,
"num_tokens": 199760.0,
"step": 183
},
{
"entropy": 1.5792707800865173,
"epoch": 1.3065953654188949,
"grad_norm": 0.0,
"learning_rate": 0.0001576694411414982,
"loss": 2.598,
"mean_token_accuracy": 0.4617435112595558,
"num_tokens": 200837.0,
"step": 184
},
{
"entropy": 1.6057993471622467,
"epoch": 1.3137254901960784,
"grad_norm": 0.0,
"learning_rate": 0.0001574316290130797,
"loss": 2.5975,
"mean_token_accuracy": 0.469496913254261,
"num_tokens": 201963.0,
"step": 185
},
{
"entropy": 1.6113585829734802,
"epoch": 1.320855614973262,
"grad_norm": 0.0,
"learning_rate": 0.00015719381688466112,
"loss": 2.7397,
"mean_token_accuracy": 0.46607375144958496,
"num_tokens": 203043.0,
"step": 186
},
{
"entropy": 1.6389738619327545,
"epoch": 1.3279857397504458,
"grad_norm": 0.0,
"learning_rate": 0.00015695600475624258,
"loss": 2.6493,
"mean_token_accuracy": 0.4742783457040787,
"num_tokens": 204118.0,
"step": 187
},
{
"entropy": 1.6438090801239014,
"epoch": 1.3351158645276293,
"grad_norm": 0.0,
"learning_rate": 0.00015671819262782404,
"loss": 2.7208,
"mean_token_accuracy": 0.4482914060354233,
"num_tokens": 205174.0,
"step": 188
},
{
"entropy": 1.5948124527931213,
"epoch": 1.3422459893048129,
"grad_norm": 0.0,
"learning_rate": 0.00015648038049940547,
"loss": 2.6397,
"mean_token_accuracy": 0.4829149469733238,
"num_tokens": 206322.0,
"step": 189
},
{
"entropy": 1.5886778235435486,
"epoch": 1.3493761140819964,
"grad_norm": 0.0,
"learning_rate": 0.00015624256837098693,
"loss": 2.4934,
"mean_token_accuracy": 0.4762219786643982,
"num_tokens": 207430.0,
"step": 190
},
{
"entropy": 1.5851022601127625,
"epoch": 1.35650623885918,
"grad_norm": 0.0,
"learning_rate": 0.00015600475624256838,
"loss": 2.5747,
"mean_token_accuracy": 0.44965071976184845,
"num_tokens": 208572.0,
"step": 191
},
{
"entropy": 1.6201273202896118,
"epoch": 1.3636363636363638,
"grad_norm": 0.0,
"learning_rate": 0.00015576694411414981,
"loss": 2.6311,
"mean_token_accuracy": 0.462075375020504,
"num_tokens": 209649.0,
"step": 192
},
{
"entropy": 1.5876831114292145,
"epoch": 1.3707664884135473,
"grad_norm": 0.0,
"learning_rate": 0.00015552913198573127,
"loss": 2.6102,
"mean_token_accuracy": 0.48762883245944977,
"num_tokens": 210713.0,
"step": 193
},
{
"entropy": 1.5845709443092346,
"epoch": 1.3778966131907309,
"grad_norm": 0.0,
"learning_rate": 0.00015529131985731273,
"loss": 2.6816,
"mean_token_accuracy": 0.48001401126384735,
"num_tokens": 211837.0,
"step": 194
},
{
"entropy": 1.6651998162269592,
"epoch": 1.3850267379679144,
"grad_norm": 0.0,
"learning_rate": 0.0001550535077288942,
"loss": 2.6997,
"mean_token_accuracy": 0.4684208855032921,
"num_tokens": 212932.0,
"step": 195
},
{
"entropy": 1.6087098717689514,
"epoch": 1.392156862745098,
"grad_norm": 0.0,
"learning_rate": 0.00015481569560047564,
"loss": 2.6176,
"mean_token_accuracy": 0.46499625593423843,
"num_tokens": 214054.0,
"step": 196
},
{
"entropy": 1.6280483305454254,
"epoch": 1.3992869875222818,
"grad_norm": 0.0,
"learning_rate": 0.00015457788347205708,
"loss": 2.7304,
"mean_token_accuracy": 0.4452286586165428,
"num_tokens": 215200.0,
"step": 197
},
{
"entropy": 1.5492911636829376,
"epoch": 1.4064171122994653,
"grad_norm": 0.0,
"learning_rate": 0.00015434007134363853,
"loss": 2.5217,
"mean_token_accuracy": 0.49339231103658676,
"num_tokens": 216317.0,
"step": 198
},
{
"entropy": 1.5941800773143768,
"epoch": 1.4135472370766489,
"grad_norm": 0.0,
"learning_rate": 0.00015410225921522,
"loss": 2.4819,
"mean_token_accuracy": 0.4754658564925194,
"num_tokens": 217405.0,
"step": 199
},
{
"entropy": 1.5978778004646301,
"epoch": 1.4206773618538324,
"grad_norm": 0.0,
"learning_rate": 0.00015386444708680142,
"loss": 2.7044,
"mean_token_accuracy": 0.45556849241256714,
"num_tokens": 218504.0,
"step": 200
},
{
"entropy": 1.5847291052341461,
"epoch": 1.427807486631016,
"grad_norm": 0.0,
"learning_rate": 0.00015362663495838288,
"loss": 2.6487,
"mean_token_accuracy": 0.46325846016407013,
"num_tokens": 219640.0,
"step": 201
},
{
"entropy": 1.5988908410072327,
"epoch": 1.4349376114081998,
"grad_norm": 0.0,
"learning_rate": 0.00015338882282996434,
"loss": 2.664,
"mean_token_accuracy": 0.46224743872880936,
"num_tokens": 220642.0,
"step": 202
},
{
"entropy": 1.5120367109775543,
"epoch": 1.4420677361853833,
"grad_norm": 0.0,
"learning_rate": 0.0001531510107015458,
"loss": 2.5895,
"mean_token_accuracy": 0.48113133013248444,
"num_tokens": 221735.0,
"step": 203
},
{
"entropy": 1.5507851243019104,
"epoch": 1.4491978609625669,
"grad_norm": 0.0,
"learning_rate": 0.00015291319857312725,
"loss": 2.5664,
"mean_token_accuracy": 0.48080600798130035,
"num_tokens": 222814.0,
"step": 204
},
{
"entropy": 1.6826366484165192,
"epoch": 1.4563279857397504,
"grad_norm": 0.0,
"learning_rate": 0.00015267538644470868,
"loss": 2.7612,
"mean_token_accuracy": 0.4607429876923561,
"num_tokens": 223905.0,
"step": 205
},
{
"entropy": 1.6085248589515686,
"epoch": 1.463458110516934,
"grad_norm": 0.0,
"learning_rate": 0.00015243757431629014,
"loss": 2.6815,
"mean_token_accuracy": 0.45429935306310654,
"num_tokens": 224999.0,
"step": 206
},
{
"entropy": 1.6905607879161835,
"epoch": 1.4705882352941178,
"grad_norm": 0.0,
"learning_rate": 0.0001521997621878716,
"loss": 2.7331,
"mean_token_accuracy": 0.4660051390528679,
"num_tokens": 226068.0,
"step": 207
},
{
"entropy": 1.6017423570156097,
"epoch": 1.4777183600713013,
"grad_norm": 0.0,
"learning_rate": 0.00015196195005945303,
"loss": 2.6021,
"mean_token_accuracy": 0.46242382377386093,
"num_tokens": 227126.0,
"step": 208
},
{
"entropy": 1.493167519569397,
"epoch": 1.4848484848484849,
"grad_norm": 0.0,
"learning_rate": 0.00015172413793103449,
"loss": 2.5246,
"mean_token_accuracy": 0.48408984392881393,
"num_tokens": 228245.0,
"step": 209
},
{
"entropy": 1.5619983673095703,
"epoch": 1.4919786096256684,
"grad_norm": 0.0,
"learning_rate": 0.00015148632580261594,
"loss": 2.6306,
"mean_token_accuracy": 0.4775094836950302,
"num_tokens": 229336.0,
"step": 210
},
{
"entropy": 1.587768405675888,
"epoch": 1.499108734402852,
"grad_norm": 0.0,
"learning_rate": 0.0001512485136741974,
"loss": 2.6375,
"mean_token_accuracy": 0.47264014929533005,
"num_tokens": 230431.0,
"step": 211
},
{
"entropy": 1.6371296346187592,
"epoch": 1.5062388591800357,
"grad_norm": 0.0,
"learning_rate": 0.00015101070154577883,
"loss": 2.6987,
"mean_token_accuracy": 0.45142069458961487,
"num_tokens": 231538.0,
"step": 212
},
{
"entropy": 1.615380883216858,
"epoch": 1.5133689839572193,
"grad_norm": 0.0,
"learning_rate": 0.0001507728894173603,
"loss": 2.5931,
"mean_token_accuracy": 0.46151647716760635,
"num_tokens": 232692.0,
"step": 213
},
{
"entropy": 1.6657347679138184,
"epoch": 1.5204991087344029,
"grad_norm": 0.0,
"learning_rate": 0.00015053507728894175,
"loss": 2.767,
"mean_token_accuracy": 0.45710520446300507,
"num_tokens": 233752.0,
"step": 214
},
{
"entropy": 1.5707046389579773,
"epoch": 1.5276292335115864,
"grad_norm": 0.0,
"learning_rate": 0.0001502972651605232,
"loss": 2.575,
"mean_token_accuracy": 0.47789450734853745,
"num_tokens": 234860.0,
"step": 215
},
{
"entropy": 1.6575550436973572,
"epoch": 1.53475935828877,
"grad_norm": 0.0,
"learning_rate": 0.00015005945303210464,
"loss": 2.6208,
"mean_token_accuracy": 0.46910084784030914,
"num_tokens": 235953.0,
"step": 216
},
{
"entropy": 1.5557922422885895,
"epoch": 1.5418894830659537,
"grad_norm": 0.0,
"learning_rate": 0.0001498216409036861,
"loss": 2.6316,
"mean_token_accuracy": 0.47147487103939056,
"num_tokens": 237113.0,
"step": 217
},
{
"entropy": 1.5967816710472107,
"epoch": 1.5490196078431373,
"grad_norm": 0.0,
"learning_rate": 0.00014958382877526755,
"loss": 2.6435,
"mean_token_accuracy": 0.4659326821565628,
"num_tokens": 238183.0,
"step": 218
},
{
"entropy": 1.5619607269763947,
"epoch": 1.5561497326203209,
"grad_norm": 0.0,
"learning_rate": 0.000149346016646849,
"loss": 2.6573,
"mean_token_accuracy": 0.4648478552699089,
"num_tokens": 239250.0,
"step": 219
},
{
"entropy": 1.5970399975776672,
"epoch": 1.5632798573975044,
"grad_norm": 0.0,
"learning_rate": 0.00014910820451843044,
"loss": 2.5817,
"mean_token_accuracy": 0.47652987390756607,
"num_tokens": 240379.0,
"step": 220
},
{
"entropy": 1.6112593710422516,
"epoch": 1.570409982174688,
"grad_norm": 0.0,
"learning_rate": 0.0001488703923900119,
"loss": 2.6912,
"mean_token_accuracy": 0.46139268577098846,
"num_tokens": 241489.0,
"step": 221
},
{
"entropy": 1.6720410883426666,
"epoch": 1.5775401069518717,
"grad_norm": 0.0,
"learning_rate": 0.00014863258026159336,
"loss": 2.6818,
"mean_token_accuracy": 0.438846156001091,
"num_tokens": 242546.0,
"step": 222
},
{
"entropy": 1.6308137476444244,
"epoch": 1.5846702317290553,
"grad_norm": 0.0,
"learning_rate": 0.00014839476813317479,
"loss": 2.7512,
"mean_token_accuracy": 0.4592900201678276,
"num_tokens": 243638.0,
"step": 223
},
{
"entropy": 1.5938476026058197,
"epoch": 1.5918003565062389,
"grad_norm": 0.0,
"learning_rate": 0.00014815695600475624,
"loss": 2.5778,
"mean_token_accuracy": 0.47352661937475204,
"num_tokens": 244751.0,
"step": 224
},
{
"entropy": 1.5609805583953857,
"epoch": 1.5989304812834224,
"grad_norm": 0.0,
"learning_rate": 0.0001479191438763377,
"loss": 2.6435,
"mean_token_accuracy": 0.4681161344051361,
"num_tokens": 245826.0,
"step": 225
},
{
"entropy": 1.6306316256523132,
"epoch": 1.606060606060606,
"grad_norm": 0.0,
"learning_rate": 0.00014768133174791916,
"loss": 2.6791,
"mean_token_accuracy": 0.4553253725171089,
"num_tokens": 246871.0,
"step": 226
},
{
"entropy": 1.5593467950820923,
"epoch": 1.6131907308377897,
"grad_norm": 0.0,
"learning_rate": 0.00014744351961950062,
"loss": 2.5499,
"mean_token_accuracy": 0.47761671245098114,
"num_tokens": 247955.0,
"step": 227
},
{
"entropy": 1.55764502286911,
"epoch": 1.6203208556149733,
"grad_norm": 0.0,
"learning_rate": 0.00014720570749108205,
"loss": 2.5377,
"mean_token_accuracy": 0.49300002306699753,
"num_tokens": 249061.0,
"step": 228
},
{
"entropy": 1.5418426096439362,
"epoch": 1.6274509803921569,
"grad_norm": 0.0,
"learning_rate": 0.0001469678953626635,
"loss": 2.6276,
"mean_token_accuracy": 0.4840138778090477,
"num_tokens": 250157.0,
"step": 229
},
{
"entropy": 1.6196385324001312,
"epoch": 1.6345811051693404,
"grad_norm": 0.0,
"learning_rate": 0.00014673008323424496,
"loss": 2.5876,
"mean_token_accuracy": 0.440780833363533,
"num_tokens": 251267.0,
"step": 230
},
{
"entropy": 1.4990095794200897,
"epoch": 1.641711229946524,
"grad_norm": 0.0,
"learning_rate": 0.0001464922711058264,
"loss": 2.528,
"mean_token_accuracy": 0.49401114135980606,
"num_tokens": 252371.0,
"step": 231
},
{
"entropy": 1.5892696976661682,
"epoch": 1.6488413547237077,
"grad_norm": 0.0,
"learning_rate": 0.00014625445897740785,
"loss": 2.5374,
"mean_token_accuracy": 0.4945196509361267,
"num_tokens": 253491.0,
"step": 232
},
{
"entropy": 1.6785107553005219,
"epoch": 1.6559714795008913,
"grad_norm": 0.0,
"learning_rate": 0.0001460166468489893,
"loss": 2.6625,
"mean_token_accuracy": 0.4550144597887993,
"num_tokens": 254598.0,
"step": 233
},
{
"entropy": 1.5320258140563965,
"epoch": 1.6631016042780749,
"grad_norm": 0.0,
"learning_rate": 0.00014577883472057077,
"loss": 2.6136,
"mean_token_accuracy": 0.48827097564935684,
"num_tokens": 255724.0,
"step": 234
},
{
"entropy": 1.5822700262069702,
"epoch": 1.6702317290552586,
"grad_norm": 0.0,
"learning_rate": 0.0001455410225921522,
"loss": 2.5751,
"mean_token_accuracy": 0.4698340594768524,
"num_tokens": 256830.0,
"step": 235
},
{
"entropy": 1.6102330684661865,
"epoch": 1.677361853832442,
"grad_norm": 0.0,
"learning_rate": 0.00014530321046373365,
"loss": 2.6332,
"mean_token_accuracy": 0.47240811586380005,
"num_tokens": 257916.0,
"step": 236
},
{
"entropy": 1.5286598205566406,
"epoch": 1.6844919786096257,
"grad_norm": 0.0,
"learning_rate": 0.0001450653983353151,
"loss": 2.5149,
"mean_token_accuracy": 0.48652924597263336,
"num_tokens": 259019.0,
"step": 237
},
{
"entropy": 1.535717934370041,
"epoch": 1.6916221033868093,
"grad_norm": 0.0,
"learning_rate": 0.00014482758620689657,
"loss": 2.5691,
"mean_token_accuracy": 0.4858531951904297,
"num_tokens": 260128.0,
"step": 238
},
{
"entropy": 1.6363422572612762,
"epoch": 1.6987522281639929,
"grad_norm": 0.0,
"learning_rate": 0.000144589774078478,
"loss": 2.6366,
"mean_token_accuracy": 0.4610147476196289,
"num_tokens": 261228.0,
"step": 239
},
{
"entropy": 1.6290847063064575,
"epoch": 1.7058823529411766,
"grad_norm": 0.0,
"learning_rate": 0.00014435196195005946,
"loss": 2.7225,
"mean_token_accuracy": 0.47234660387039185,
"num_tokens": 262270.0,
"step": 240
},
{
"entropy": 1.6157021522521973,
"epoch": 1.71301247771836,
"grad_norm": 0.0,
"learning_rate": 0.00014411414982164092,
"loss": 2.6575,
"mean_token_accuracy": 0.45069990307092667,
"num_tokens": 263361.0,
"step": 241
},
{
"entropy": 1.6559931337833405,
"epoch": 1.7201426024955437,
"grad_norm": 0.0,
"learning_rate": 0.00014387633769322235,
"loss": 2.5864,
"mean_token_accuracy": 0.4479330778121948,
"num_tokens": 264391.0,
"step": 242
},
{
"entropy": 1.6161220967769623,
"epoch": 1.7272727272727273,
"grad_norm": 0.0,
"learning_rate": 0.0001436385255648038,
"loss": 2.6456,
"mean_token_accuracy": 0.47140543162822723,
"num_tokens": 265517.0,
"step": 243
},
{
"entropy": 1.6491332352161407,
"epoch": 1.7344028520499108,
"grad_norm": 0.0,
"learning_rate": 0.00014340071343638526,
"loss": 2.7443,
"mean_token_accuracy": 0.4643326923251152,
"num_tokens": 266569.0,
"step": 244
},
{
"entropy": 1.6256919205188751,
"epoch": 1.7415329768270946,
"grad_norm": 0.0,
"learning_rate": 0.00014316290130796672,
"loss": 2.6762,
"mean_token_accuracy": 0.4532029777765274,
"num_tokens": 267611.0,
"step": 245
},
{
"entropy": 1.619517207145691,
"epoch": 1.748663101604278,
"grad_norm": 0.0,
"learning_rate": 0.00014292508917954818,
"loss": 2.7279,
"mean_token_accuracy": 0.45651988685131073,
"num_tokens": 268692.0,
"step": 246
},
{
"entropy": 1.5403787195682526,
"epoch": 1.7557932263814617,
"grad_norm": 0.0,
"learning_rate": 0.0001426872770511296,
"loss": 2.5525,
"mean_token_accuracy": 0.4840865433216095,
"num_tokens": 269788.0,
"step": 247
},
{
"entropy": 1.6027557253837585,
"epoch": 1.7629233511586453,
"grad_norm": 0.0,
"learning_rate": 0.00014244946492271107,
"loss": 2.574,
"mean_token_accuracy": 0.48409949243068695,
"num_tokens": 270843.0,
"step": 248
},
{
"entropy": 1.5946480631828308,
"epoch": 1.7700534759358288,
"grad_norm": 0.0,
"learning_rate": 0.00014221165279429252,
"loss": 2.5633,
"mean_token_accuracy": 0.4869764968752861,
"num_tokens": 271964.0,
"step": 249
},
{
"entropy": 1.6484409868717194,
"epoch": 1.7771836007130126,
"grad_norm": 0.0,
"learning_rate": 0.00014197384066587395,
"loss": 2.6964,
"mean_token_accuracy": 0.4450046420097351,
"num_tokens": 273054.0,
"step": 250
},
{
"entropy": 1.6375533640384674,
"epoch": 1.784313725490196,
"grad_norm": 0.0,
"learning_rate": 0.0001417360285374554,
"loss": 2.6718,
"mean_token_accuracy": 0.4587417542934418,
"num_tokens": 274133.0,
"step": 251
},
{
"entropy": 1.5944361984729767,
"epoch": 1.7914438502673797,
"grad_norm": 0.0,
"learning_rate": 0.00014149821640903687,
"loss": 2.5917,
"mean_token_accuracy": 0.4714890420436859,
"num_tokens": 275243.0,
"step": 252
},
{
"entropy": 1.6059032678604126,
"epoch": 1.7985739750445633,
"grad_norm": 0.0,
"learning_rate": 0.0001412604042806183,
"loss": 2.6893,
"mean_token_accuracy": 0.45041677355766296,
"num_tokens": 276280.0,
"step": 253
},
{
"entropy": 1.6716269254684448,
"epoch": 1.8057040998217468,
"grad_norm": 0.0,
"learning_rate": 0.00014102259215219978,
"loss": 2.7229,
"mean_token_accuracy": 0.45646892488002777,
"num_tokens": 277383.0,
"step": 254
},
{
"entropy": 1.6591968834400177,
"epoch": 1.8128342245989306,
"grad_norm": 0.0,
"learning_rate": 0.00014078478002378122,
"loss": 2.6642,
"mean_token_accuracy": 0.457190565764904,
"num_tokens": 278468.0,
"step": 255
},
{
"entropy": 1.6405368149280548,
"epoch": 1.819964349376114,
"grad_norm": 0.0,
"learning_rate": 0.00014054696789536267,
"loss": 2.6468,
"mean_token_accuracy": 0.4580274894833565,
"num_tokens": 279544.0,
"step": 256
},
{
"entropy": 1.6274996399879456,
"epoch": 1.8270944741532977,
"grad_norm": 0.0,
"learning_rate": 0.00014030915576694413,
"loss": 2.6112,
"mean_token_accuracy": 0.4569789469242096,
"num_tokens": 280607.0,
"step": 257
},
{
"entropy": 1.5651440620422363,
"epoch": 1.8342245989304813,
"grad_norm": 0.0,
"learning_rate": 0.00014007134363852556,
"loss": 2.6034,
"mean_token_accuracy": 0.4507257267832756,
"num_tokens": 281691.0,
"step": 258
},
{
"entropy": 1.5355416238307953,
"epoch": 1.8413547237076648,
"grad_norm": 0.0,
"learning_rate": 0.00013983353151010702,
"loss": 2.5293,
"mean_token_accuracy": 0.4817895218729973,
"num_tokens": 282815.0,
"step": 259
},
{
"entropy": 1.592032641172409,
"epoch": 1.8484848484848486,
"grad_norm": 0.0,
"learning_rate": 0.00013959571938168848,
"loss": 2.6128,
"mean_token_accuracy": 0.4846939668059349,
"num_tokens": 283931.0,
"step": 260
},
{
"entropy": 1.6933747231960297,
"epoch": 1.855614973262032,
"grad_norm": 0.0,
"learning_rate": 0.0001393579072532699,
"loss": 2.8414,
"mean_token_accuracy": 0.42922718822956085,
"num_tokens": 284997.0,
"step": 261
},
{
"entropy": 1.549606055021286,
"epoch": 1.8627450980392157,
"grad_norm": 0.0,
"learning_rate": 0.0001391200951248514,
"loss": 2.6134,
"mean_token_accuracy": 0.47253087162971497,
"num_tokens": 286090.0,
"step": 262
},
{
"entropy": 1.6243177652359009,
"epoch": 1.8698752228163993,
"grad_norm": 0.0,
"learning_rate": 0.00013888228299643282,
"loss": 2.7223,
"mean_token_accuracy": 0.45962000638246536,
"num_tokens": 287183.0,
"step": 263
},
{
"entropy": 1.5972241163253784,
"epoch": 1.8770053475935828,
"grad_norm": 0.0,
"learning_rate": 0.00013864447086801425,
"loss": 2.6094,
"mean_token_accuracy": 0.47941046208143234,
"num_tokens": 288231.0,
"step": 264
},
{
"entropy": 1.6008118391036987,
"epoch": 1.8841354723707666,
"grad_norm": 0.0,
"learning_rate": 0.00013840665873959574,
"loss": 2.6731,
"mean_token_accuracy": 0.4635191634297371,
"num_tokens": 289284.0,
"step": 265
},
{
"entropy": 1.5784848630428314,
"epoch": 1.89126559714795,
"grad_norm": 0.0,
"learning_rate": 0.00013816884661117717,
"loss": 2.6694,
"mean_token_accuracy": 0.47262611985206604,
"num_tokens": 290386.0,
"step": 266
},
{
"entropy": 1.6606462597846985,
"epoch": 1.8983957219251337,
"grad_norm": 0.0,
"learning_rate": 0.00013793103448275863,
"loss": 2.7134,
"mean_token_accuracy": 0.44980067759752274,
"num_tokens": 291460.0,
"step": 267
},
{
"entropy": 1.6004058420658112,
"epoch": 1.9055258467023173,
"grad_norm": 0.0,
"learning_rate": 0.00013769322235434008,
"loss": 2.5894,
"mean_token_accuracy": 0.4649396017193794,
"num_tokens": 292530.0,
"step": 268
},
{
"entropy": 1.6282309591770172,
"epoch": 1.9126559714795008,
"grad_norm": 0.0,
"learning_rate": 0.00013745541022592151,
"loss": 2.6314,
"mean_token_accuracy": 0.4572133645415306,
"num_tokens": 293655.0,
"step": 269
},
{
"entropy": 1.5889360904693604,
"epoch": 1.9197860962566846,
"grad_norm": 0.0,
"learning_rate": 0.000137217598097503,
"loss": 2.6898,
"mean_token_accuracy": 0.4732029587030411,
"num_tokens": 294730.0,
"step": 270
},
{
"entropy": 1.6428880095481873,
"epoch": 1.926916221033868,
"grad_norm": 0.0,
"learning_rate": 0.00013697978596908443,
"loss": 2.6273,
"mean_token_accuracy": 0.4647170379757881,
"num_tokens": 295818.0,
"step": 271
},
{
"entropy": 1.5940674245357513,
"epoch": 1.9340463458110517,
"grad_norm": 0.0,
"learning_rate": 0.00013674197384066586,
"loss": 2.5628,
"mean_token_accuracy": 0.47556307166814804,
"num_tokens": 296920.0,
"step": 272
},
{
"entropy": 1.5898773968219757,
"epoch": 1.9411764705882353,
"grad_norm": 0.0,
"learning_rate": 0.00013650416171224735,
"loss": 2.6949,
"mean_token_accuracy": 0.47456144541502,
"num_tokens": 298028.0,
"step": 273
},
{
"entropy": 1.651709407567978,
"epoch": 1.9483065953654188,
"grad_norm": 0.0,
"learning_rate": 0.00013626634958382878,
"loss": 2.6833,
"mean_token_accuracy": 0.45697759836912155,
"num_tokens": 299111.0,
"step": 274
},
{
"entropy": 1.6460674107074738,
"epoch": 1.9554367201426026,
"grad_norm": 0.0,
"learning_rate": 0.00013602853745541023,
"loss": 2.7098,
"mean_token_accuracy": 0.45604951679706573,
"num_tokens": 300236.0,
"step": 275
},
{
"entropy": 1.645053118467331,
"epoch": 1.962566844919786,
"grad_norm": 0.0,
"learning_rate": 0.0001357907253269917,
"loss": 2.5929,
"mean_token_accuracy": 0.467997670173645,
"num_tokens": 301315.0,
"step": 276
},
{
"entropy": 1.6668168604373932,
"epoch": 1.9696969696969697,
"grad_norm": 0.0,
"learning_rate": 0.00013555291319857312,
"loss": 2.711,
"mean_token_accuracy": 0.4511336088180542,
"num_tokens": 302441.0,
"step": 277
},
{
"entropy": 1.6053387224674225,
"epoch": 1.9768270944741533,
"grad_norm": 0.0,
"learning_rate": 0.00013531510107015458,
"loss": 2.5949,
"mean_token_accuracy": 0.4798056557774544,
"num_tokens": 303551.0,
"step": 278
},
{
"entropy": 1.6160295009613037,
"epoch": 1.9839572192513368,
"grad_norm": 0.0,
"learning_rate": 0.00013507728894173604,
"loss": 2.7294,
"mean_token_accuracy": 0.45531073212623596,
"num_tokens": 304641.0,
"step": 279
},
{
"entropy": 1.6171720921993256,
"epoch": 1.9910873440285206,
"grad_norm": 0.0,
"learning_rate": 0.00013483947681331747,
"loss": 2.5741,
"mean_token_accuracy": 0.4656490460038185,
"num_tokens": 305726.0,
"step": 280
},
{
"entropy": 1.6310756206512451,
"epoch": 1.998217468805704,
"grad_norm": 0.0,
"learning_rate": 0.00013460166468489895,
"loss": 2.59,
"mean_token_accuracy": 0.45554178953170776,
"num_tokens": 306833.0,
"step": 281
},
{
"entropy": 1.6477800607681274,
"epoch": 2.0,
"grad_norm": 0.0,
"learning_rate": 0.00013436385255648038,
"loss": 2.7133,
"mean_token_accuracy": 0.4486691951751709,
"num_tokens": 307100.0,
"step": 282
}
],
"logging_steps": 1,
"max_steps": 846,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.450573084483584e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}