{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.79247152055473, "eval_steps": 500.0, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00039623576027736503, "grad_norm": 23.5, "learning_rate": 2.6315789473684213e-07, "loss": 1.1837007999420166, "step": 1, "token_acc": 0.8159329621764334 }, { "epoch": 0.00396235760277365, "grad_norm": 21.375, "learning_rate": 2.631578947368421e-06, "loss": 1.123257319132487, "step": 10, "token_acc": 0.8210087927828165 }, { "epoch": 0.0079247152055473, "grad_norm": 7.09375, "learning_rate": 5.263157894736842e-06, "loss": 0.9101140975952149, "step": 20, "token_acc": 0.8285831313786395 }, { "epoch": 0.01188707280832095, "grad_norm": 2.8125, "learning_rate": 7.894736842105265e-06, "loss": 0.5795128822326661, "step": 30, "token_acc": 0.8505745886743207 }, { "epoch": 0.0158494304110946, "grad_norm": 1.7578125, "learning_rate": 1.0526315789473684e-05, "loss": 0.42301692962646487, "step": 40, "token_acc": 0.8767900103104593 }, { "epoch": 0.01981178801386825, "grad_norm": 1.4921875, "learning_rate": 1.3157894736842108e-05, "loss": 0.35302650928497314, "step": 50, "token_acc": 0.8897820845537251 }, { "epoch": 0.0237741456166419, "grad_norm": 1.375, "learning_rate": 1.578947368421053e-05, "loss": 0.33079302310943604, "step": 60, "token_acc": 0.8967102736745091 }, { "epoch": 0.02773650321941555, "grad_norm": 1.25, "learning_rate": 1.8421052631578947e-05, "loss": 0.31292335987091063, "step": 70, "token_acc": 0.9010626512129326 }, { "epoch": 0.0316988608221892, "grad_norm": 1.109375, "learning_rate": 1.999986824534997e-05, "loss": 0.3141467094421387, "step": 80, "token_acc": 0.9005368650633087 }, { "epoch": 0.03566121842496285, "grad_norm": 1.296875, "learning_rate": 1.9998386045408938e-05, "loss": 0.29496400356292723, "step": 90, "token_acc": 0.9055861965123218 }, { "epoch": 0.0396235760277365, "grad_norm": 1.28125, "learning_rate": 1.999525719713366e-05, "loss": 0.2913074970245361, "step": 100, "token_acc": 0.9076149509114921 }, { "epoch": 0.04358593363051015, "grad_norm": 1.6015625, "learning_rate": 1.999048221581858e-05, "loss": 0.2880474805831909, "step": 110, "token_acc": 0.9073922051522615 }, { "epoch": 0.0475482912332838, "grad_norm": 1.390625, "learning_rate": 1.9984061887862118e-05, "loss": 0.27746291160583497, "step": 120, "token_acc": 0.9101783276777932 }, { "epoch": 0.05151064883605745, "grad_norm": 1.171875, "learning_rate": 1.9975997270637172e-05, "loss": 0.273817777633667, "step": 130, "token_acc": 0.909736600422787 }, { "epoch": 0.0554730064388311, "grad_norm": 1.2578125, "learning_rate": 1.9966289692316944e-05, "loss": 0.2767889976501465, "step": 140, "token_acc": 0.9082912026144594 }, { "epoch": 0.05943536404160475, "grad_norm": 1.4921875, "learning_rate": 1.9954940751656245e-05, "loss": 0.27089781761169435, "step": 150, "token_acc": 0.9099060425408418 }, { "epoch": 0.0633977216443784, "grad_norm": 1.171875, "learning_rate": 1.994195231772815e-05, "loss": 0.25421991348266604, "step": 160, "token_acc": 0.9162766481231006 }, { "epoch": 0.06736007924715205, "grad_norm": 1.359375, "learning_rate": 1.9927326529616203e-05, "loss": 0.2611961841583252, "step": 170, "token_acc": 0.9147679722152482 }, { "epoch": 0.0713224368499257, "grad_norm": 1.4296875, "learning_rate": 1.9911065796062137e-05, "loss": 0.264358377456665, "step": 180, "token_acc": 0.9137104702605277 }, { "epoch": 0.07528479445269935, "grad_norm": 1.2265625, "learning_rate": 1.9893172795069144e-05, "loss": 0.27645695209503174, "step": 190, "token_acc": 0.9085774438661551 }, { "epoch": 0.079247152055473, "grad_norm": 1.3125, "learning_rate": 1.9873650473460862e-05, "loss": 0.2564415693283081, "step": 200, "token_acc": 0.9148068228524455 }, { "epoch": 0.08320950965824665, "grad_norm": 1.2265625, "learning_rate": 1.9852502046396035e-05, "loss": 0.2584503650665283, "step": 210, "token_acc": 0.9148747112137906 }, { "epoch": 0.0871718672610203, "grad_norm": 1.1171875, "learning_rate": 1.982973099683902e-05, "loss": 0.25623598098754885, "step": 220, "token_acc": 0.916684382955295 }, { "epoch": 0.09113422486379395, "grad_norm": 1.4296875, "learning_rate": 1.980534107498616e-05, "loss": 0.2456662178039551, "step": 230, "token_acc": 0.9188118082346068 }, { "epoch": 0.0950965824665676, "grad_norm": 1.4609375, "learning_rate": 1.977933629764817e-05, "loss": 0.2530802249908447, "step": 240, "token_acc": 0.9152495545682131 }, { "epoch": 0.09905894006934125, "grad_norm": 1.3203125, "learning_rate": 1.9751720947588603e-05, "loss": 0.24223690032958983, "step": 250, "token_acc": 0.9186887231706855 }, { "epoch": 0.1030212976721149, "grad_norm": 1.21875, "learning_rate": 1.9722499572818496e-05, "loss": 0.23485193252563477, "step": 260, "token_acc": 0.9216265054055996 }, { "epoch": 0.10698365527488855, "grad_norm": 1.5, "learning_rate": 1.969167698584738e-05, "loss": 0.24744803905487062, "step": 270, "token_acc": 0.9177383756974582 }, { "epoch": 0.1109460128776622, "grad_norm": 1.234375, "learning_rate": 1.9659258262890683e-05, "loss": 0.25014376640319824, "step": 280, "token_acc": 0.9170167948905685 }, { "epoch": 0.11490837048043585, "grad_norm": 1.5078125, "learning_rate": 1.9625248743033725e-05, "loss": 0.23340215682983398, "step": 290, "token_acc": 0.9214856049225197 }, { "epoch": 0.1188707280832095, "grad_norm": 1.171875, "learning_rate": 1.9589654027352412e-05, "loss": 0.24289028644561766, "step": 300, "token_acc": 0.9185406963850078 }, { "epoch": 0.12283308568598315, "grad_norm": 1.0625, "learning_rate": 1.9552479977990802e-05, "loss": 0.24520406723022461, "step": 310, "token_acc": 0.9184403422069023 }, { "epoch": 0.1267954432887568, "grad_norm": 1.359375, "learning_rate": 1.9513732717195638e-05, "loss": 0.2427917242050171, "step": 320, "token_acc": 0.9178514285714285 }, { "epoch": 0.13075780089153047, "grad_norm": 0.9609375, "learning_rate": 1.9473418626308086e-05, "loss": 0.21972455978393554, "step": 330, "token_acc": 0.9259012550960103 }, { "epoch": 0.1347201584943041, "grad_norm": 1.3984375, "learning_rate": 1.9431544344712776e-05, "loss": 0.2463603973388672, "step": 340, "token_acc": 0.9171354320865818 }, { "epoch": 0.13868251609707777, "grad_norm": 1.25, "learning_rate": 1.9388116768744344e-05, "loss": 0.23121447563171388, "step": 350, "token_acc": 0.9208610209876757 }, { "epoch": 0.1426448736998514, "grad_norm": 1.0546875, "learning_rate": 1.9343143050551684e-05, "loss": 0.2372572898864746, "step": 360, "token_acc": 0.9205740491816241 }, { "epoch": 0.14660723130262507, "grad_norm": 1.6328125, "learning_rate": 1.929663059692002e-05, "loss": 0.23370888233184814, "step": 370, "token_acc": 0.9218769547078884 }, { "epoch": 0.1505695889053987, "grad_norm": 1.3515625, "learning_rate": 1.924858706805112e-05, "loss": 0.22563014030456544, "step": 380, "token_acc": 0.9239206109486627 }, { "epoch": 0.15453194650817237, "grad_norm": 1.2890625, "learning_rate": 1.9199020376301666e-05, "loss": 0.22754812240600586, "step": 390, "token_acc": 0.923770752222635 }, { "epoch": 0.158494304110946, "grad_norm": 1.2109375, "learning_rate": 1.9147938684880213e-05, "loss": 0.233451247215271, "step": 400, "token_acc": 0.9208578517882449 }, { "epoch": 0.16245666171371967, "grad_norm": 1.359375, "learning_rate": 1.9095350406502736e-05, "loss": 0.22117164134979247, "step": 410, "token_acc": 0.9251948698253339 }, { "epoch": 0.1664190193164933, "grad_norm": 1.265625, "learning_rate": 1.9041264202007158e-05, "loss": 0.23051214218139648, "step": 420, "token_acc": 0.9227009356565836 }, { "epoch": 0.17038137691926697, "grad_norm": 1.296875, "learning_rate": 1.8985688978926972e-05, "loss": 0.22384767532348632, "step": 430, "token_acc": 0.9254292644524351 }, { "epoch": 0.1743437345220406, "grad_norm": 0.96484375, "learning_rate": 1.892863389002424e-05, "loss": 0.22796776294708251, "step": 440, "token_acc": 0.9236655948553054 }, { "epoch": 0.17830609212481427, "grad_norm": 1.1015625, "learning_rate": 1.887010833178222e-05, "loss": 0.2255650520324707, "step": 450, "token_acc": 0.9233627684120709 }, { "epoch": 0.1822684497275879, "grad_norm": 1.3828125, "learning_rate": 1.8810121942857848e-05, "loss": 0.21253745555877684, "step": 460, "token_acc": 0.9272634714542769 }, { "epoch": 0.18623080733036157, "grad_norm": 1.171875, "learning_rate": 1.8748684602494327e-05, "loss": 0.22184033393859864, "step": 470, "token_acc": 0.9256473357586134 }, { "epoch": 0.1901931649331352, "grad_norm": 1.046875, "learning_rate": 1.8685806428894113e-05, "loss": 0.2163544178009033, "step": 480, "token_acc": 0.92641120988206 }, { "epoch": 0.19415552253590887, "grad_norm": 1.234375, "learning_rate": 1.8621497777552508e-05, "loss": 0.2326265335083008, "step": 490, "token_acc": 0.9219484631704639 }, { "epoch": 0.1981178801386825, "grad_norm": 1.1328125, "learning_rate": 1.8555769239552232e-05, "loss": 0.21914072036743165, "step": 500, "token_acc": 0.9266210447862321 }, { "epoch": 0.20208023774145617, "grad_norm": 1.546875, "learning_rate": 1.848863163981914e-05, "loss": 0.22959327697753906, "step": 510, "token_acc": 0.9215090641842234 }, { "epoch": 0.2060425953442298, "grad_norm": 1.109375, "learning_rate": 1.8420096035339454e-05, "loss": 0.21052975654602052, "step": 520, "token_acc": 0.9286930380232219 }, { "epoch": 0.21000495294700347, "grad_norm": 1.0703125, "learning_rate": 1.8350173713338777e-05, "loss": 0.22955830097198487, "step": 530, "token_acc": 0.9225931053342221 }, { "epoch": 0.2139673105497771, "grad_norm": 1.2890625, "learning_rate": 1.827887618942318e-05, "loss": 0.21942346096038817, "step": 540, "token_acc": 0.9257000477242205 }, { "epoch": 0.21792966815255077, "grad_norm": 1.234375, "learning_rate": 1.8206215205682683e-05, "loss": 0.21607930660247804, "step": 550, "token_acc": 0.9265396164644921 }, { "epoch": 0.2218920257553244, "grad_norm": 1.359375, "learning_rate": 1.8132202728757428e-05, "loss": 0.21843266487121582, "step": 560, "token_acc": 0.9258849850056328 }, { "epoch": 0.22585438335809807, "grad_norm": 1.3671875, "learning_rate": 1.805685094786689e-05, "loss": 0.21874871253967285, "step": 570, "token_acc": 0.9250736338016231 }, { "epoch": 0.2298167409608717, "grad_norm": 1.1640625, "learning_rate": 1.7980172272802398e-05, "loss": 0.22817540168762207, "step": 580, "token_acc": 0.9221536778365731 }, { "epoch": 0.23377909856364537, "grad_norm": 1.3671875, "learning_rate": 1.790217933188336e-05, "loss": 0.20628876686096193, "step": 590, "token_acc": 0.9291559217209775 }, { "epoch": 0.237741456166419, "grad_norm": 1.2265625, "learning_rate": 1.7822884969877493e-05, "loss": 0.22458946704864502, "step": 600, "token_acc": 0.9231406464867372 }, { "epoch": 0.24170381376919267, "grad_norm": 1.3046875, "learning_rate": 1.7742302245885384e-05, "loss": 0.20527830123901367, "step": 610, "token_acc": 0.9306424304540271 }, { "epoch": 0.2456661713719663, "grad_norm": 1.1171875, "learning_rate": 1.766044443118978e-05, "loss": 0.2055346965789795, "step": 620, "token_acc": 0.9294153185205075 }, { "epoch": 0.24962852897473997, "grad_norm": 1.1640625, "learning_rate": 1.7577325007069927e-05, "loss": 0.21000022888183595, "step": 630, "token_acc": 0.9276756514760238 }, { "epoch": 0.2535908865775136, "grad_norm": 1.2265625, "learning_rate": 1.7492957662581297e-05, "loss": 0.20726590156555175, "step": 640, "token_acc": 0.9288681287625508 }, { "epoch": 0.25755324418028724, "grad_norm": 1.1328125, "learning_rate": 1.7407356292301134e-05, "loss": 0.20893335342407227, "step": 650, "token_acc": 0.9287459199802928 }, { "epoch": 0.26151560178306094, "grad_norm": 1.46875, "learning_rate": 1.7320534994040148e-05, "loss": 0.2122333526611328, "step": 660, "token_acc": 0.9268251113697004 }, { "epoch": 0.26547795938583457, "grad_norm": 2.21875, "learning_rate": 1.7232508066520702e-05, "loss": 0.2119227170944214, "step": 670, "token_acc": 0.9272324174995067 }, { "epoch": 0.2694403169886082, "grad_norm": 1.2890625, "learning_rate": 1.7143290007021942e-05, "loss": 0.2144456148147583, "step": 680, "token_acc": 0.9266572858854115 }, { "epoch": 0.27340267459138184, "grad_norm": 1.3515625, "learning_rate": 1.7052895508992236e-05, "loss": 0.20908637046813966, "step": 690, "token_acc": 0.9279253384640653 }, { "epoch": 0.27736503219415554, "grad_norm": 1.2734375, "learning_rate": 1.696133945962927e-05, "loss": 0.21407780647277833, "step": 700, "token_acc": 0.9275297697109584 }, { "epoch": 0.2813273897969292, "grad_norm": 1.3984375, "learning_rate": 1.6868636937428254e-05, "loss": 0.20272161960601806, "step": 710, "token_acc": 0.9313989228518674 }, { "epoch": 0.2852897473997028, "grad_norm": 1.359375, "learning_rate": 1.677480320969865e-05, "loss": 0.20830063819885253, "step": 720, "token_acc": 0.9284670505715276 }, { "epoch": 0.2892521050024765, "grad_norm": 1.3046875, "learning_rate": 1.6679853730049743e-05, "loss": 0.20571448802947997, "step": 730, "token_acc": 0.9288137503522119 }, { "epoch": 0.29321446260525014, "grad_norm": 1.4375, "learning_rate": 1.6583804135845582e-05, "loss": 0.207275128364563, "step": 740, "token_acc": 0.9295052506473598 }, { "epoch": 0.2971768202080238, "grad_norm": 1.1640625, "learning_rate": 1.648667024562963e-05, "loss": 0.2059840202331543, "step": 750, "token_acc": 0.9303702716282313 }, { "epoch": 0.3011391778107974, "grad_norm": 1.4296875, "learning_rate": 1.638846805651961e-05, "loss": 0.20929555892944335, "step": 760, "token_acc": 0.9285013576720667 }, { "epoch": 0.3051015354135711, "grad_norm": 1.2265625, "learning_rate": 1.62892137415729e-05, "loss": 0.2164773464202881, "step": 770, "token_acc": 0.9268445872201972 }, { "epoch": 0.30906389301634474, "grad_norm": 1.078125, "learning_rate": 1.6188923647122946e-05, "loss": 0.20146725177764893, "step": 780, "token_acc": 0.9308608962964089 }, { "epoch": 0.3130262506191184, "grad_norm": 1.1796875, "learning_rate": 1.608761429008721e-05, "loss": 0.19116392135620117, "step": 790, "token_acc": 0.9360810066351728 }, { "epoch": 0.316988608221892, "grad_norm": 1.40625, "learning_rate": 1.5985302355246932e-05, "loss": 0.19471538066864014, "step": 800, "token_acc": 0.9334035945789697 }, { "epoch": 0.3209509658246657, "grad_norm": 1.3984375, "learning_rate": 1.5882004692499324e-05, "loss": 0.20449495315551758, "step": 810, "token_acc": 0.9296946281131374 }, { "epoch": 0.32491332342743934, "grad_norm": 1.4140625, "learning_rate": 1.5777738314082514e-05, "loss": 0.2058267116546631, "step": 820, "token_acc": 0.930226312581988 }, { "epoch": 0.328875681030213, "grad_norm": 1.4296875, "learning_rate": 1.567252039177378e-05, "loss": 0.19794673919677735, "step": 830, "token_acc": 0.931884692988862 }, { "epoch": 0.3328380386329866, "grad_norm": 1.390625, "learning_rate": 1.5566368254061505e-05, "loss": 0.20482149124145507, "step": 840, "token_acc": 0.9305290785274152 }, { "epoch": 0.3368003962357603, "grad_norm": 1.53125, "learning_rate": 1.5459299383291347e-05, "loss": 0.19639644622802735, "step": 850, "token_acc": 0.9322417158382036 }, { "epoch": 0.34076275383853394, "grad_norm": 1.4375, "learning_rate": 1.5351331412787004e-05, "loss": 0.2021495819091797, "step": 860, "token_acc": 0.9298179216523921 }, { "epoch": 0.3447251114413076, "grad_norm": 1.1875, "learning_rate": 1.52424821239462e-05, "loss": 0.20063307285308837, "step": 870, "token_acc": 0.9313979538110527 }, { "epoch": 0.3486874690440812, "grad_norm": 1.6328125, "learning_rate": 1.5132769443312207e-05, "loss": 0.20427477359771729, "step": 880, "token_acc": 0.9299313715863092 }, { "epoch": 0.3526498266468549, "grad_norm": 1.4609375, "learning_rate": 1.5022211439621521e-05, "loss": 0.20063276290893556, "step": 890, "token_acc": 0.9309864789183134 }, { "epoch": 0.35661218424962854, "grad_norm": 1.328125, "learning_rate": 1.4910826320828085e-05, "loss": 0.19403212070465087, "step": 900, "token_acc": 0.9340383217142124 }, { "epoch": 0.3605745418524022, "grad_norm": 1.21875, "learning_rate": 1.4798632431104591e-05, "loss": 0.1897117853164673, "step": 910, "token_acc": 0.9360307874252368 }, { "epoch": 0.3645368994551758, "grad_norm": 1.4296875, "learning_rate": 1.4685648247821376e-05, "loss": 0.19313969612121581, "step": 920, "token_acc": 0.9329953036961753 }, { "epoch": 0.3684992570579495, "grad_norm": 1.0859375, "learning_rate": 1.457189237850332e-05, "loss": 0.203882098197937, "step": 930, "token_acc": 0.9312272344443193 }, { "epoch": 0.37246161466072314, "grad_norm": 0.875, "learning_rate": 1.4457383557765385e-05, "loss": 0.1886841893196106, "step": 940, "token_acc": 0.9355444372139664 }, { "epoch": 0.3764239722634968, "grad_norm": 1.359375, "learning_rate": 1.4342140644227151e-05, "loss": 0.1905367612838745, "step": 950, "token_acc": 0.9352085303078055 }, { "epoch": 0.3803863298662704, "grad_norm": 1.140625, "learning_rate": 1.4226182617406996e-05, "loss": 0.19780998229980468, "step": 960, "token_acc": 0.9324879595849204 }, { "epoch": 0.3843486874690441, "grad_norm": 1.3203125, "learning_rate": 1.41095285745963e-05, "loss": 0.19177125692367553, "step": 970, "token_acc": 0.9343932834841926 }, { "epoch": 0.38831104507181774, "grad_norm": 1.7578125, "learning_rate": 1.399219772771431e-05, "loss": 0.1960275650024414, "step": 980, "token_acc": 0.9329073312723757 }, { "epoch": 0.3922734026745914, "grad_norm": 1.34375, "learning_rate": 1.3874209400144092e-05, "loss": 0.18507509231567382, "step": 990, "token_acc": 0.9359859759133133 }, { "epoch": 0.396235760277365, "grad_norm": 1.2734375, "learning_rate": 1.3755583023550128e-05, "loss": 0.1876603364944458, "step": 1000, "token_acc": 0.9350970511384845 }, { "epoch": 0.4001981178801387, "grad_norm": 1.28125, "learning_rate": 1.3636338134678104e-05, "loss": 0.17850277423858643, "step": 1010, "token_acc": 0.9377877376733048 }, { "epoch": 0.40416047548291234, "grad_norm": 1.234375, "learning_rate": 1.3516494372137368e-05, "loss": 0.1958215355873108, "step": 1020, "token_acc": 0.9318651647470785 }, { "epoch": 0.408122833085686, "grad_norm": 1.6640625, "learning_rate": 1.3396071473166614e-05, "loss": 0.18602523803710938, "step": 1030, "token_acc": 0.9359838557500786 }, { "epoch": 0.4120851906884596, "grad_norm": 1.2109375, "learning_rate": 1.327508927038334e-05, "loss": 0.18929693698883057, "step": 1040, "token_acc": 0.9350099237438629 }, { "epoch": 0.4160475482912333, "grad_norm": 1.03125, "learning_rate": 1.3153567688517567e-05, "loss": 0.18981436491012574, "step": 1050, "token_acc": 0.934143741104814 }, { "epoch": 0.42000990589400694, "grad_norm": 1.3359375, "learning_rate": 1.3031526741130435e-05, "loss": 0.1816575288772583, "step": 1060, "token_acc": 0.9370538611291369 }, { "epoch": 0.4239722634967806, "grad_norm": 1.3203125, "learning_rate": 1.2908986527318121e-05, "loss": 0.19676063060760499, "step": 1070, "token_acc": 0.932801285003426 }, { "epoch": 0.4279346210995542, "grad_norm": 1.2734375, "learning_rate": 1.2785967228401688e-05, "loss": 0.19254275560379028, "step": 1080, "token_acc": 0.9333315147712704 }, { "epoch": 0.4318969787023279, "grad_norm": 1.5078125, "learning_rate": 1.266248910460341e-05, "loss": 0.18717528581619264, "step": 1090, "token_acc": 0.9360305301291446 }, { "epoch": 0.43585933630510154, "grad_norm": 1.734375, "learning_rate": 1.2538572491710079e-05, "loss": 0.1824967622756958, "step": 1100, "token_acc": 0.9372006812944594 }, { "epoch": 0.4398216939078752, "grad_norm": 1.0078125, "learning_rate": 1.2414237797723876e-05, "loss": 0.17919249534606935, "step": 1110, "token_acc": 0.9387596071733562 }, { "epoch": 0.4437840515106488, "grad_norm": 1.296875, "learning_rate": 1.2289505499501341e-05, "loss": 0.18926095962524414, "step": 1120, "token_acc": 0.9342525248667318 }, { "epoch": 0.4477464091134225, "grad_norm": 1.28125, "learning_rate": 1.2164396139381029e-05, "loss": 0.20064361095428468, "step": 1130, "token_acc": 0.9315847075431296 }, { "epoch": 0.45170876671619614, "grad_norm": 1.484375, "learning_rate": 1.2038930321800346e-05, "loss": 0.1895804524421692, "step": 1140, "token_acc": 0.9349271790531848 }, { "epoch": 0.4556711243189698, "grad_norm": 1.5703125, "learning_rate": 1.1913128709902182e-05, "loss": 0.1807018995285034, "step": 1150, "token_acc": 0.9369057628872647 }, { "epoch": 0.4596334819217434, "grad_norm": 1.4140625, "learning_rate": 1.1787012022131863e-05, "loss": 0.1842559814453125, "step": 1160, "token_acc": 0.9362108645620739 }, { "epoch": 0.4635958395245171, "grad_norm": 1.3203125, "learning_rate": 1.1660601028825013e-05, "loss": 0.19840482473373414, "step": 1170, "token_acc": 0.9314812356169233 }, { "epoch": 0.46755819712729074, "grad_norm": 1.046875, "learning_rate": 1.1533916548786856e-05, "loss": 0.1772662878036499, "step": 1180, "token_acc": 0.9394712189028833 }, { "epoch": 0.4715205547300644, "grad_norm": 1.2734375, "learning_rate": 1.1406979445863515e-05, "loss": 0.18831554651260377, "step": 1190, "token_acc": 0.935608596292791 }, { "epoch": 0.475482912332838, "grad_norm": 1.34375, "learning_rate": 1.127981062550595e-05, "loss": 0.18489151000976561, "step": 1200, "token_acc": 0.9360608419277421 }, { "epoch": 0.4794452699356117, "grad_norm": 1.3984375, "learning_rate": 1.1152431031326978e-05, "loss": 0.17761152982711792, "step": 1210, "token_acc": 0.9386175400572799 }, { "epoch": 0.48340762753838534, "grad_norm": 3.109375, "learning_rate": 1.102486164165207e-05, "loss": 0.18663549423217773, "step": 1220, "token_acc": 0.9355476517845982 }, { "epoch": 0.487369985141159, "grad_norm": 1.25, "learning_rate": 1.0897123466064376e-05, "loss": 0.18886669874191284, "step": 1230, "token_acc": 0.9356319723508901 }, { "epoch": 0.4913323427439326, "grad_norm": 1.34375, "learning_rate": 1.0769237541944639e-05, "loss": 0.18777060508728027, "step": 1240, "token_acc": 0.9354588236528564 }, { "epoch": 0.4952947003467063, "grad_norm": 1.5, "learning_rate": 1.0641224931006518e-05, "loss": 0.17902556657791138, "step": 1250, "token_acc": 0.9375767442118891 }, { "epoch": 0.49925705794947994, "grad_norm": 1.15625, "learning_rate": 1.0513106715827897e-05, "loss": 0.18400684595108033, "step": 1260, "token_acc": 0.9370039916704695 }, { "epoch": 0.5032194155522536, "grad_norm": 1.40625, "learning_rate": 1.0384903996378784e-05, "loss": 0.17728078365325928, "step": 1270, "token_acc": 0.9389623546976645 }, { "epoch": 0.5071817731550272, "grad_norm": 1.21875, "learning_rate": 1.02566378865463e-05, "loss": 0.18042536973953247, "step": 1280, "token_acc": 0.9374939011828994 }, { "epoch": 0.5111441307578009, "grad_norm": 1.4609375, "learning_rate": 1.0128329510657426e-05, "loss": 0.18618935346603394, "step": 1290, "token_acc": 0.9355284924654325 }, { "epoch": 0.5151064883605745, "grad_norm": 1.453125, "learning_rate": 1e-05, "loss": 0.19038233757019044, "step": 1300, "token_acc": 0.9348978046934141 }, { "epoch": 0.5190688459633482, "grad_norm": 1.921875, "learning_rate": 9.871670489342577e-06, "loss": 0.18166159391403197, "step": 1310, "token_acc": 0.9376118246059261 }, { "epoch": 0.5230312035661219, "grad_norm": 1.3828125, "learning_rate": 9.743362113453705e-06, "loss": 0.18087191581726075, "step": 1320, "token_acc": 0.9368352123903884 }, { "epoch": 0.5269935611688955, "grad_norm": 1.7890625, "learning_rate": 9.615096003621221e-06, "loss": 0.17757006883621215, "step": 1330, "token_acc": 0.9385874468359324 }, { "epoch": 0.5309559187716691, "grad_norm": 1.5, "learning_rate": 9.486893284172103e-06, "loss": 0.1725843906402588, "step": 1340, "token_acc": 0.9396233946138856 }, { "epoch": 0.5349182763744428, "grad_norm": 1.1953125, "learning_rate": 9.358775068993484e-06, "loss": 0.17776031494140626, "step": 1350, "token_acc": 0.9395069576186172 }, { "epoch": 0.5388806339772164, "grad_norm": 1.2890625, "learning_rate": 9.230762458055363e-06, "loss": 0.18048588037490845, "step": 1360, "token_acc": 0.9376439779197635 }, { "epoch": 0.54284299157999, "grad_norm": 1.34375, "learning_rate": 9.102876533935626e-06, "loss": 0.1871953248977661, "step": 1370, "token_acc": 0.9352319496539852 }, { "epoch": 0.5468053491827637, "grad_norm": 1.8203125, "learning_rate": 8.975138358347931e-06, "loss": 0.17401375770568847, "step": 1380, "token_acc": 0.9395100063574127 }, { "epoch": 0.5507677067855374, "grad_norm": 1.5625, "learning_rate": 8.847568968673025e-06, "loss": 0.1821776032447815, "step": 1390, "token_acc": 0.9382589568204417 }, { "epoch": 0.5547300643883111, "grad_norm": 1.3203125, "learning_rate": 8.720189374494055e-06, "loss": 0.18482091426849365, "step": 1400, "token_acc": 0.9366791672453971 }, { "epoch": 0.5586924219910847, "grad_norm": 1.140625, "learning_rate": 8.593020554136491e-06, "loss": 0.17976686954498292, "step": 1410, "token_acc": 0.938686745381246 }, { "epoch": 0.5626547795938583, "grad_norm": 1.7734375, "learning_rate": 8.466083451213145e-06, "loss": 0.16887048482894898, "step": 1420, "token_acc": 0.9413505379807353 }, { "epoch": 0.566617137196632, "grad_norm": 1.53125, "learning_rate": 8.339398971174987e-06, "loss": 0.181710684299469, "step": 1430, "token_acc": 0.9381945571057755 }, { "epoch": 0.5705794947994056, "grad_norm": 1.421875, "learning_rate": 8.212987977868138e-06, "loss": 0.192651104927063, "step": 1440, "token_acc": 0.9346234811416059 }, { "epoch": 0.5745418524021793, "grad_norm": 1.4375, "learning_rate": 8.086871290097822e-06, "loss": 0.1725835084915161, "step": 1450, "token_acc": 0.9401547502340085 }, { "epoch": 0.578504210004953, "grad_norm": 1.390625, "learning_rate": 7.961069678199658e-06, "loss": 0.18463332653045655, "step": 1460, "token_acc": 0.9356090428523226 }, { "epoch": 0.5824665676077266, "grad_norm": 1.390625, "learning_rate": 7.835603860618973e-06, "loss": 0.18219418525695802, "step": 1470, "token_acc": 0.9381983863723681 }, { "epoch": 0.5864289252105003, "grad_norm": 1.3125, "learning_rate": 7.710494500498662e-06, "loss": 0.17673687934875487, "step": 1480, "token_acc": 0.9390315988583202 }, { "epoch": 0.5903912828132739, "grad_norm": 1.6796875, "learning_rate": 7.585762202276129e-06, "loss": 0.1698865532875061, "step": 1490, "token_acc": 0.9415788913714225 }, { "epoch": 0.5943536404160475, "grad_norm": 1.53125, "learning_rate": 7.461427508289922e-06, "loss": 0.17974636554718018, "step": 1500, "token_acc": 0.9385133263736498 }, { "epoch": 0.5983159980188212, "grad_norm": 1.0859375, "learning_rate": 7.337510895396591e-06, "loss": 0.1787565231323242, "step": 1510, "token_acc": 0.9384560906515581 }, { "epoch": 0.6022783556215948, "grad_norm": 1.4609375, "learning_rate": 7.214032771598316e-06, "loss": 0.1744428515434265, "step": 1520, "token_acc": 0.9401470564435646 }, { "epoch": 0.6062407132243685, "grad_norm": 1.265625, "learning_rate": 7.091013472681883e-06, "loss": 0.17123017311096192, "step": 1530, "token_acc": 0.9405837916975914 }, { "epoch": 0.6102030708271422, "grad_norm": 1.5234375, "learning_rate": 6.968473258869566e-06, "loss": 0.1690650463104248, "step": 1540, "token_acc": 0.941058213231226 }, { "epoch": 0.6141654284299158, "grad_norm": 1.453125, "learning_rate": 6.846432311482436e-06, "loss": 0.18313372135162354, "step": 1550, "token_acc": 0.9371285854342504 }, { "epoch": 0.6181277860326895, "grad_norm": 1.7890625, "learning_rate": 6.724910729616665e-06, "loss": 0.17572647333145142, "step": 1560, "token_acc": 0.939426531245842 }, { "epoch": 0.6220901436354631, "grad_norm": 1.25, "learning_rate": 6.603928526833386e-06, "loss": 0.16190264225006104, "step": 1570, "token_acc": 0.9443632366772048 }, { "epoch": 0.6260525012382367, "grad_norm": 1.265625, "learning_rate": 6.483505627862632e-06, "loss": 0.1694807767868042, "step": 1580, "token_acc": 0.9416789717779672 }, { "epoch": 0.6300148588410104, "grad_norm": 1.5, "learning_rate": 6.363661865321898e-06, "loss": 0.17748751640319824, "step": 1590, "token_acc": 0.9385861686705892 }, { "epoch": 0.633977216443784, "grad_norm": 1.3984375, "learning_rate": 6.244416976449875e-06, "loss": 0.17347029447555543, "step": 1600, "token_acc": 0.9403739289918152 }, { "epoch": 0.6379395740465577, "grad_norm": 1.171875, "learning_rate": 6.125790599855912e-06, "loss": 0.1826688289642334, "step": 1610, "token_acc": 0.9372320591550186 }, { "epoch": 0.6419019316493314, "grad_norm": 1.2890625, "learning_rate": 6.007802272285693e-06, "loss": 0.17403693199157716, "step": 1620, "token_acc": 0.9401551062440614 }, { "epoch": 0.645864289252105, "grad_norm": 1.625, "learning_rate": 5.890471425403703e-06, "loss": 0.18286362886428834, "step": 1630, "token_acc": 0.9368950000596794 }, { "epoch": 0.6498266468548787, "grad_norm": 1.4375, "learning_rate": 5.773817382593008e-06, "loss": 0.1804821014404297, "step": 1640, "token_acc": 0.9376396973396319 }, { "epoch": 0.6537890044576523, "grad_norm": 1.375, "learning_rate": 5.65785935577285e-06, "loss": 0.17369402647018434, "step": 1650, "token_acc": 0.9392859770259903 }, { "epoch": 0.657751362060426, "grad_norm": 1.3515625, "learning_rate": 5.542616442234618e-06, "loss": 0.1656261920928955, "step": 1660, "token_acc": 0.943150599230765 }, { "epoch": 0.6617137196631996, "grad_norm": 1.2890625, "learning_rate": 5.428107621496681e-06, "loss": 0.17441051006317138, "step": 1670, "token_acc": 0.9392566132136696 }, { "epoch": 0.6656760772659732, "grad_norm": 1.3125, "learning_rate": 5.3143517521786255e-06, "loss": 0.17141460180282592, "step": 1680, "token_acc": 0.9404770520787022 }, { "epoch": 0.6696384348687469, "grad_norm": 1.4921875, "learning_rate": 5.201367568895408e-06, "loss": 0.1779789924621582, "step": 1690, "token_acc": 0.9389050144048604 }, { "epoch": 0.6736007924715206, "grad_norm": 1.2890625, "learning_rate": 5.089173679171922e-06, "loss": 0.1696174383163452, "step": 1700, "token_acc": 0.9415787866940171 }, { "epoch": 0.6775631500742942, "grad_norm": 1.265625, "learning_rate": 4.977788560378484e-06, "loss": 0.17647080421447753, "step": 1710, "token_acc": 0.9402322070530992 }, { "epoch": 0.6815255076770679, "grad_norm": 1.3203125, "learning_rate": 4.867230556687797e-06, "loss": 0.17825334072113036, "step": 1720, "token_acc": 0.9382623548644003 }, { "epoch": 0.6854878652798415, "grad_norm": 1.390625, "learning_rate": 4.7575178760538e-06, "loss": 0.1728861927986145, "step": 1730, "token_acc": 0.939594911427579 }, { "epoch": 0.6894502228826151, "grad_norm": 1.5234375, "learning_rate": 4.648668587212998e-06, "loss": 0.179952073097229, "step": 1740, "token_acc": 0.9381945052060547 }, { "epoch": 0.6934125804853888, "grad_norm": 1.28125, "learning_rate": 4.5407006167086575e-06, "loss": 0.17567566633224488, "step": 1750, "token_acc": 0.9399701307689505 }, { "epoch": 0.6973749380881624, "grad_norm": 1.328125, "learning_rate": 4.433631745938497e-06, "loss": 0.17287354469299315, "step": 1760, "token_acc": 0.9405146011104378 }, { "epoch": 0.7013372956909361, "grad_norm": 1.5859375, "learning_rate": 4.327479608226226e-06, "loss": 0.17426562309265137, "step": 1770, "token_acc": 0.9401683220236025 }, { "epoch": 0.7052996532937098, "grad_norm": 1.375, "learning_rate": 4.222261685917489e-06, "loss": 0.1734224557876587, "step": 1780, "token_acc": 0.9401309334234104 }, { "epoch": 0.7092620108964834, "grad_norm": 1.2578125, "learning_rate": 4.117995307500677e-06, "loss": 0.17531417608261107, "step": 1790, "token_acc": 0.9409358352138655 }, { "epoch": 0.7132243684992571, "grad_norm": 1.1171875, "learning_rate": 4.014697644753069e-06, "loss": 0.17481131553649903, "step": 1800, "token_acc": 0.9396957170350632 }, { "epoch": 0.7171867261020307, "grad_norm": 1.3046875, "learning_rate": 3.912385709912794e-06, "loss": 0.17085225582122804, "step": 1810, "token_acc": 0.9413413462722593 }, { "epoch": 0.7211490837048044, "grad_norm": 1.765625, "learning_rate": 3.8110763528770543e-06, "loss": 0.18243337869644166, "step": 1820, "token_acc": 0.9380230355884426 }, { "epoch": 0.725111441307578, "grad_norm": 1.2109375, "learning_rate": 3.7107862584271016e-06, "loss": 0.16808085441589354, "step": 1830, "token_acc": 0.9410576758514462 }, { "epoch": 0.7290737989103516, "grad_norm": 1.28125, "learning_rate": 3.6115319434803897e-06, "loss": 0.16966335773468016, "step": 1840, "token_acc": 0.9421915175440875 }, { "epoch": 0.7330361565131253, "grad_norm": 1.484375, "learning_rate": 3.5133297543703724e-06, "loss": 0.18466969728469848, "step": 1850, "token_acc": 0.9361129341986922 }, { "epoch": 0.736998514115899, "grad_norm": 1.34375, "learning_rate": 3.416195864154426e-06, "loss": 0.17389074563980103, "step": 1860, "token_acc": 0.9401176608095999 }, { "epoch": 0.7409608717186726, "grad_norm": 1.8046875, "learning_rate": 3.3201462699502606e-06, "loss": 0.18031821250915528, "step": 1870, "token_acc": 0.9371710958652052 }, { "epoch": 0.7449232293214463, "grad_norm": 1.3828125, "learning_rate": 3.2251967903013515e-06, "loss": 0.16321947574615478, "step": 1880, "token_acc": 0.9434022207870669 }, { "epoch": 0.7488855869242199, "grad_norm": 1.40625, "learning_rate": 3.1313630625717462e-06, "loss": 0.165952730178833, "step": 1890, "token_acc": 0.9433088620870477 }, { "epoch": 0.7528479445269936, "grad_norm": 1.6953125, "learning_rate": 3.0386605403707347e-06, "loss": 0.17759935855865477, "step": 1900, "token_acc": 0.9391445944776249 }, { "epoch": 0.7568103021297672, "grad_norm": 1.5625, "learning_rate": 2.947104491007766e-06, "loss": 0.17080872058868407, "step": 1910, "token_acc": 0.9417546272928465 }, { "epoch": 0.7607726597325408, "grad_norm": 1.296875, "learning_rate": 2.8567099929780596e-06, "loss": 0.17588542699813842, "step": 1920, "token_acc": 0.9384443609064742 }, { "epoch": 0.7647350173353145, "grad_norm": 1.671875, "learning_rate": 2.767491933479304e-06, "loss": 0.17596354484558105, "step": 1930, "token_acc": 0.9390227163544026 }, { "epoch": 0.7686973749380882, "grad_norm": 1.4921875, "learning_rate": 2.679465005959856e-06, "loss": 0.1740294098854065, "step": 1940, "token_acc": 0.9401780685412244 }, { "epoch": 0.7726597325408618, "grad_norm": 1.359375, "learning_rate": 2.5926437076988685e-06, "loss": 0.16495332717895508, "step": 1950, "token_acc": 0.9424596695186467 }, { "epoch": 0.7766220901436355, "grad_norm": 1.359375, "learning_rate": 2.507042337418707e-06, "loss": 0.17244219779968262, "step": 1960, "token_acc": 0.940576109936575 }, { "epoch": 0.7805844477464091, "grad_norm": 1.7109375, "learning_rate": 2.4226749929300774e-06, "loss": 0.17762508392333984, "step": 1970, "token_acc": 0.9379922601444852 }, { "epoch": 0.7845468053491828, "grad_norm": 1.5078125, "learning_rate": 2.339555568810221e-06, "loss": 0.16768510341644288, "step": 1980, "token_acc": 0.9424666806336723 }, { "epoch": 0.7885091629519564, "grad_norm": 1.2734375, "learning_rate": 2.2576977541146193e-06, "loss": 0.1687544584274292, "step": 1990, "token_acc": 0.941843418155467 }, { "epoch": 0.79247152055473, "grad_norm": 1.3984375, "learning_rate": 2.1771150301225097e-06, "loss": 0.17961428165435792, "step": 2000, "token_acc": 0.9382880764646055 } ], "logging_steps": 10, "max_steps": 2524, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3391808311045652e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }