{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 2682, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.93066748380661, "epoch": 0.0037309952429810654, "grad_norm": 14.3125, "learning_rate": 9.876543209876544e-07, "loss": 2.437, "mean_token_accuracy": 0.4887664679437876, "num_tokens": 529463.0, "step": 5 }, { "entropy": 1.9279134914278984, "epoch": 0.007461990485962131, "grad_norm": 15.9375, "learning_rate": 2.222222222222222e-06, "loss": 2.421, "mean_token_accuracy": 0.4953322228044271, "num_tokens": 1062929.0, "step": 10 }, { "entropy": 1.9278883010149002, "epoch": 0.011192985728943195, "grad_norm": 15.4375, "learning_rate": 3.4567901234567904e-06, "loss": 2.4307, "mean_token_accuracy": 0.49091244526207445, "num_tokens": 1593737.0, "step": 15 }, { "entropy": 1.9264666512608528, "epoch": 0.014923980971924261, "grad_norm": 14.3125, "learning_rate": 4.691358024691358e-06, "loss": 2.3686, "mean_token_accuracy": 0.5026748184114694, "num_tokens": 2125554.0, "step": 20 }, { "entropy": 1.915960419178009, "epoch": 0.018654976214905326, "grad_norm": 9.5625, "learning_rate": 5.925925925925926e-06, "loss": 2.347, "mean_token_accuracy": 0.49995064437389375, "num_tokens": 2657455.0, "step": 25 }, { "entropy": 1.9118589267134667, "epoch": 0.02238597145788639, "grad_norm": 10.0625, "learning_rate": 7.160493827160494e-06, "loss": 2.2538, "mean_token_accuracy": 0.5263953868299722, "num_tokens": 3187189.0, "step": 30 }, { "entropy": 1.899963890016079, "epoch": 0.026116966700867456, "grad_norm": 9.3125, "learning_rate": 8.395061728395062e-06, "loss": 2.1564, "mean_token_accuracy": 0.5407103352248669, "num_tokens": 3718429.0, "step": 35 }, { "entropy": 1.911753496527672, "epoch": 0.029847961943848523, "grad_norm": 7.9375, "learning_rate": 9.62962962962963e-06, "loss": 2.095, "mean_token_accuracy": 0.5577171023935079, "num_tokens": 4251155.0, "step": 40 }, { "entropy": 1.9223718717694283, "epoch": 0.033578957186829586, "grad_norm": 6.96875, "learning_rate": 1.0864197530864198e-05, "loss": 2.0702, "mean_token_accuracy": 0.5552546553313732, "num_tokens": 4781447.0, "step": 45 }, { "entropy": 1.9131259977817536, "epoch": 0.03730995242981065, "grad_norm": 6.1875, "learning_rate": 1.2098765432098767e-05, "loss": 1.934, "mean_token_accuracy": 0.5727813556790352, "num_tokens": 5314277.0, "step": 50 }, { "entropy": 1.9110908284783363, "epoch": 0.04104094767279172, "grad_norm": 5.40625, "learning_rate": 1.3333333333333333e-05, "loss": 1.8792, "mean_token_accuracy": 0.594604616984725, "num_tokens": 5845509.0, "step": 55 }, { "entropy": 1.9029613688588143, "epoch": 0.04477194291577278, "grad_norm": 5.21875, "learning_rate": 1.4567901234567903e-05, "loss": 1.7805, "mean_token_accuracy": 0.630406080186367, "num_tokens": 6376657.0, "step": 60 }, { "entropy": 1.8792628690600395, "epoch": 0.048502938158753846, "grad_norm": 3.84375, "learning_rate": 1.580246913580247e-05, "loss": 1.6705, "mean_token_accuracy": 0.6583383537828922, "num_tokens": 6909470.0, "step": 65 }, { "entropy": 1.8848115488886834, "epoch": 0.05223393340173491, "grad_norm": 4.6875, "learning_rate": 1.7037037037037038e-05, "loss": 1.5614, "mean_token_accuracy": 0.6829224102199077, "num_tokens": 7440943.0, "step": 70 }, { "entropy": 1.9064736932516098, "epoch": 0.05596492864471598, "grad_norm": 3.8125, "learning_rate": 1.8271604938271607e-05, "loss": 1.4162, "mean_token_accuracy": 0.7170818209648132, "num_tokens": 7973046.0, "step": 75 }, { "entropy": 1.9023971110582352, "epoch": 0.059695923887697046, "grad_norm": 3.203125, "learning_rate": 1.9506172839506175e-05, "loss": 1.4074, "mean_token_accuracy": 0.7140415258705616, "num_tokens": 8504722.0, "step": 80 }, { "entropy": 1.889440931379795, "epoch": 0.0634269191306781, "grad_norm": 3.109375, "learning_rate": 1.999994091549669e-05, "loss": 1.4883, "mean_token_accuracy": 0.7016305826604367, "num_tokens": 9033389.0, "step": 85 }, { "entropy": 1.871435184776783, "epoch": 0.06715791437365917, "grad_norm": 3.5625, "learning_rate": 1.999957984634139e-05, "loss": 1.3443, "mean_token_accuracy": 0.7220834337174893, "num_tokens": 9564491.0, "step": 90 }, { "entropy": 1.8715314790606499, "epoch": 0.07088890961664024, "grad_norm": 2.984375, "learning_rate": 1.9998890545907742e-05, "loss": 1.3409, "mean_token_accuracy": 0.7232061706483364, "num_tokens": 10094169.0, "step": 95 }, { "entropy": 1.8687299489974976, "epoch": 0.0746199048596213, "grad_norm": 3.125, "learning_rate": 1.9997873039335807e-05, "loss": 1.3168, "mean_token_accuracy": 0.721094774454832, "num_tokens": 10626035.0, "step": 100 }, { "epoch": 0.0746199048596213, "eval_entropy": 1.869174944090953, "eval_loss": 1.0890228748321533, "eval_mean_token_accuracy": 0.7311080878231383, "eval_num_tokens": 10626035.0, "eval_runtime": 416.9338, "eval_samples_per_second": 2.079, "eval_steps_per_second": 0.52, "step": 100 }, { "entropy": 1.8742459282279014, "epoch": 0.07835090010260237, "grad_norm": 2.625, "learning_rate": 1.9996527363735932e-05, "loss": 1.332, "mean_token_accuracy": 0.7202699460089207, "num_tokens": 11157038.0, "step": 105 }, { "entropy": 1.876672126352787, "epoch": 0.08208189534558344, "grad_norm": 3.1875, "learning_rate": 1.999485356818738e-05, "loss": 1.2506, "mean_token_accuracy": 0.7332465507090091, "num_tokens": 11689056.0, "step": 110 }, { "entropy": 1.8850012466311454, "epoch": 0.0858128905885645, "grad_norm": 2.890625, "learning_rate": 1.9992851713736567e-05, "loss": 1.2925, "mean_token_accuracy": 0.7265870451927186, "num_tokens": 12219351.0, "step": 115 }, { "entropy": 1.8786048024892807, "epoch": 0.08954388583154556, "grad_norm": 2.65625, "learning_rate": 1.999052187339482e-05, "loss": 1.3057, "mean_token_accuracy": 0.7244717724621296, "num_tokens": 12753651.0, "step": 120 }, { "entropy": 1.885955972969532, "epoch": 0.09327488107452662, "grad_norm": 3.28125, "learning_rate": 1.9987864132135707e-05, "loss": 1.2657, "mean_token_accuracy": 0.7337329223752022, "num_tokens": 13285085.0, "step": 125 }, { "entropy": 1.8783888518810272, "epoch": 0.09700587631750769, "grad_norm": 3.390625, "learning_rate": 1.998487858689196e-05, "loss": 1.2291, "mean_token_accuracy": 0.7372383318841458, "num_tokens": 13814797.0, "step": 130 }, { "entropy": 1.8861249819397927, "epoch": 0.10073687156048876, "grad_norm": 2.4375, "learning_rate": 1.998156534655192e-05, "loss": 1.2551, "mean_token_accuracy": 0.735904411226511, "num_tokens": 14345582.0, "step": 135 }, { "entropy": 1.8703672036528587, "epoch": 0.10446786680346982, "grad_norm": 3.109375, "learning_rate": 1.9977924531955573e-05, "loss": 1.2374, "mean_token_accuracy": 0.7376377083361149, "num_tokens": 14877910.0, "step": 140 }, { "entropy": 1.882800543308258, "epoch": 0.10819886204645089, "grad_norm": 2.71875, "learning_rate": 1.9973956275890142e-05, "loss": 1.207, "mean_token_accuracy": 0.7382527463138103, "num_tokens": 15408599.0, "step": 145 }, { "entropy": 1.8789533048868179, "epoch": 0.11192985728943196, "grad_norm": 2.609375, "learning_rate": 1.996966072308525e-05, "loss": 1.3252, "mean_token_accuracy": 0.7186619438230991, "num_tokens": 15940048.0, "step": 150 }, { "entropy": 1.8727819621562958, "epoch": 0.11566085253241302, "grad_norm": 2.78125, "learning_rate": 1.996503803020763e-05, "loss": 1.1866, "mean_token_accuracy": 0.7416010297834873, "num_tokens": 16472753.0, "step": 155 }, { "entropy": 1.869297370314598, "epoch": 0.11939184777539409, "grad_norm": 2.84375, "learning_rate": 1.996008836585542e-05, "loss": 1.1632, "mean_token_accuracy": 0.7470499858260155, "num_tokens": 17000919.0, "step": 160 }, { "entropy": 1.8754076406359672, "epoch": 0.12312284301837516, "grad_norm": 2.890625, "learning_rate": 1.9954811910552006e-05, "loss": 1.1094, "mean_token_accuracy": 0.7585403077304363, "num_tokens": 17530757.0, "step": 165 }, { "entropy": 1.8743575155735015, "epoch": 0.1268538382613562, "grad_norm": 2.921875, "learning_rate": 1.994920885673945e-05, "loss": 1.1752, "mean_token_accuracy": 0.7442750185728073, "num_tokens": 18063683.0, "step": 170 }, { "entropy": 1.8884308516979218, "epoch": 0.13058483350433728, "grad_norm": 2.96875, "learning_rate": 1.9943279408771458e-05, "loss": 1.1346, "mean_token_accuracy": 0.7534236386418343, "num_tokens": 18594235.0, "step": 175 }, { "entropy": 1.885941931605339, "epoch": 0.13431582874731834, "grad_norm": 2.765625, "learning_rate": 1.9937023782905938e-05, "loss": 1.1172, "mean_token_accuracy": 0.7521387666463852, "num_tokens": 19125078.0, "step": 180 }, { "entropy": 1.8702793419361115, "epoch": 0.1380468239902994, "grad_norm": 2.84375, "learning_rate": 1.9930442207297097e-05, "loss": 1.1865, "mean_token_accuracy": 0.7442984923720359, "num_tokens": 19657833.0, "step": 185 }, { "entropy": 1.8780140653252602, "epoch": 0.14177781923328048, "grad_norm": 3.0625, "learning_rate": 1.992353492198715e-05, "loss": 1.098, "mean_token_accuracy": 0.7611210979521275, "num_tokens": 20190362.0, "step": 190 }, { "entropy": 1.8812837466597556, "epoch": 0.14550881447626154, "grad_norm": 2.640625, "learning_rate": 1.9916302178897526e-05, "loss": 1.152, "mean_token_accuracy": 0.751427735388279, "num_tokens": 20718896.0, "step": 195 }, { "entropy": 1.881430271267891, "epoch": 0.1492398097192426, "grad_norm": 2.65625, "learning_rate": 1.9908744241819712e-05, "loss": 1.1011, "mean_token_accuracy": 0.7548866882920265, "num_tokens": 21252633.0, "step": 200 }, { "epoch": 0.1492398097192426, "eval_entropy": 1.8965141059066843, "eval_loss": 0.9376018047332764, "eval_mean_token_accuracy": 0.7604197722426208, "eval_num_tokens": 21252633.0, "eval_runtime": 416.2999, "eval_samples_per_second": 2.083, "eval_steps_per_second": 0.521, "step": 200 }, { "entropy": 1.901954147219658, "epoch": 0.15297080496222368, "grad_norm": 2.59375, "learning_rate": 1.990086138640561e-05, "loss": 1.1625, "mean_token_accuracy": 0.7511769436299801, "num_tokens": 21780658.0, "step": 205 }, { "entropy": 1.8937502846121788, "epoch": 0.15670180020520474, "grad_norm": 2.3125, "learning_rate": 1.98926539001575e-05, "loss": 1.1358, "mean_token_accuracy": 0.750651653856039, "num_tokens": 22311279.0, "step": 210 }, { "entropy": 1.900780713558197, "epoch": 0.1604327954481858, "grad_norm": 2.484375, "learning_rate": 1.988412208241755e-05, "loss": 1.1354, "mean_token_accuracy": 0.753037316352129, "num_tokens": 22842732.0, "step": 215 }, { "entropy": 1.900828868150711, "epoch": 0.16416379069116688, "grad_norm": 2.65625, "learning_rate": 1.987526624435689e-05, "loss": 1.1153, "mean_token_accuracy": 0.7550405651330948, "num_tokens": 23370062.0, "step": 220 }, { "entropy": 1.8861738339066505, "epoch": 0.16789478593414794, "grad_norm": 2.71875, "learning_rate": 1.9866086708964264e-05, "loss": 1.0807, "mean_token_accuracy": 0.7639111615717411, "num_tokens": 23899529.0, "step": 225 }, { "entropy": 1.8919328197836875, "epoch": 0.171625781177129, "grad_norm": 2.640625, "learning_rate": 1.985658381103427e-05, "loss": 1.1309, "mean_token_accuracy": 0.7507242120802402, "num_tokens": 24432664.0, "step": 230 }, { "entropy": 1.8881874978542328, "epoch": 0.17535677642011008, "grad_norm": 2.6875, "learning_rate": 1.984675789715512e-05, "loss": 1.0927, "mean_token_accuracy": 0.7648542866110801, "num_tokens": 24962958.0, "step": 235 }, { "entropy": 1.888606432080269, "epoch": 0.17908777166309112, "grad_norm": 2.265625, "learning_rate": 1.9836609325696017e-05, "loss": 1.2134, "mean_token_accuracy": 0.7300407648086548, "num_tokens": 25496026.0, "step": 240 }, { "entropy": 1.8738144993782044, "epoch": 0.18281876690607218, "grad_norm": 2.75, "learning_rate": 1.9826138466794088e-05, "loss": 1.1858, "mean_token_accuracy": 0.7387278139591217, "num_tokens": 26029309.0, "step": 245 }, { "entropy": 1.8657475382089614, "epoch": 0.18654976214905325, "grad_norm": 2.3125, "learning_rate": 1.981534570234087e-05, "loss": 1.1096, "mean_token_accuracy": 0.7601700283586978, "num_tokens": 26561603.0, "step": 250 }, { "entropy": 1.8530463799834251, "epoch": 0.19028075739203432, "grad_norm": 2.390625, "learning_rate": 1.9804231425968394e-05, "loss": 1.0604, "mean_token_accuracy": 0.7672378472983837, "num_tokens": 27095577.0, "step": 255 }, { "entropy": 1.8480310454964637, "epoch": 0.19401175263501538, "grad_norm": 2.96875, "learning_rate": 1.9792796043034817e-05, "loss": 1.0706, "mean_token_accuracy": 0.7706682577729225, "num_tokens": 27626494.0, "step": 260 }, { "entropy": 1.8671077743172646, "epoch": 0.19774274787799645, "grad_norm": 2.53125, "learning_rate": 1.9781039970609656e-05, "loss": 1.0765, "mean_token_accuracy": 0.7629323773086071, "num_tokens": 28157811.0, "step": 265 }, { "entropy": 1.853178422152996, "epoch": 0.20147374312097752, "grad_norm": 2.46875, "learning_rate": 1.9768963637458553e-05, "loss": 1.1053, "mean_token_accuracy": 0.7563259281218052, "num_tokens": 28689739.0, "step": 270 }, { "entropy": 1.8592572018504143, "epoch": 0.20520473836395858, "grad_norm": 2.53125, "learning_rate": 1.9756567484027657e-05, "loss": 1.064, "mean_token_accuracy": 0.7610228173434734, "num_tokens": 29220279.0, "step": 275 }, { "entropy": 1.868624246120453, "epoch": 0.20893573360693965, "grad_norm": 2.5, "learning_rate": 1.9743851962427547e-05, "loss": 1.1369, "mean_token_accuracy": 0.7518735721707344, "num_tokens": 29750745.0, "step": 280 }, { "entropy": 1.8660186305642128, "epoch": 0.21266672884992072, "grad_norm": 2.5625, "learning_rate": 1.9730817536416757e-05, "loss": 1.0856, "mean_token_accuracy": 0.7596553325653076, "num_tokens": 30282712.0, "step": 285 }, { "entropy": 1.8520298555493355, "epoch": 0.21639772409290178, "grad_norm": 2.625, "learning_rate": 1.9717464681384847e-05, "loss": 1.03, "mean_token_accuracy": 0.7732533916831017, "num_tokens": 30814113.0, "step": 290 }, { "entropy": 1.848675537109375, "epoch": 0.22012871933588285, "grad_norm": 2.765625, "learning_rate": 1.970379388433507e-05, "loss": 1.1311, "mean_token_accuracy": 0.7620892561972141, "num_tokens": 31344383.0, "step": 295 }, { "entropy": 1.8459958881139755, "epoch": 0.22385971457886392, "grad_norm": 2.625, "learning_rate": 1.9689805643866612e-05, "loss": 1.1939, "mean_token_accuracy": 0.738617904484272, "num_tokens": 31876803.0, "step": 300 }, { "epoch": 0.22385971457886392, "eval_entropy": 1.8536106128297094, "eval_loss": 0.8944675922393799, "eval_mean_token_accuracy": 0.7675316457374854, "eval_num_tokens": 31876803.0, "eval_runtime": 419.8489, "eval_samples_per_second": 2.065, "eval_steps_per_second": 0.517, "step": 300 }, { "entropy": 1.8487590283155442, "epoch": 0.22759070982184498, "grad_norm": 2.65625, "learning_rate": 1.9675500470156413e-05, "loss": 1.0223, "mean_token_accuracy": 0.7746541768312454, "num_tokens": 32407779.0, "step": 305 }, { "entropy": 1.8505715727806091, "epoch": 0.23132170506482605, "grad_norm": 2.609375, "learning_rate": 1.9660878884940538e-05, "loss": 1.064, "mean_token_accuracy": 0.7622771561145782, "num_tokens": 32939220.0, "step": 310 }, { "entropy": 1.8618406757712365, "epoch": 0.23505270030780712, "grad_norm": 2.875, "learning_rate": 1.9645941421495175e-05, "loss": 1.0937, "mean_token_accuracy": 0.7551429606974125, "num_tokens": 33470031.0, "step": 315 }, { "entropy": 1.8671145975589751, "epoch": 0.23878369555078818, "grad_norm": 2.59375, "learning_rate": 1.9630688624617175e-05, "loss": 1.0394, "mean_token_accuracy": 0.7662747830152512, "num_tokens": 34002483.0, "step": 320 }, { "entropy": 1.8697764590382575, "epoch": 0.24251469079376925, "grad_norm": 2.609375, "learning_rate": 1.9615121050604166e-05, "loss": 1.0503, "mean_token_accuracy": 0.7624305739998818, "num_tokens": 34534879.0, "step": 325 }, { "entropy": 1.8799255907535553, "epoch": 0.24624568603675032, "grad_norm": 2.390625, "learning_rate": 1.9599239267234304e-05, "loss": 1.028, "mean_token_accuracy": 0.7683257654309272, "num_tokens": 35065564.0, "step": 330 }, { "entropy": 1.873243047297001, "epoch": 0.24997668127973136, "grad_norm": 2.484375, "learning_rate": 1.9583043853745514e-05, "loss": 1.1061, "mean_token_accuracy": 0.7559119880199432, "num_tokens": 35596284.0, "step": 335 }, { "entropy": 1.8775247171521188, "epoch": 0.2537076765227124, "grad_norm": 2.53125, "learning_rate": 1.9566535400814407e-05, "loss": 1.1048, "mean_token_accuracy": 0.7629238598048687, "num_tokens": 36130255.0, "step": 340 }, { "entropy": 1.860211183130741, "epoch": 0.2574386717656935, "grad_norm": 3.15625, "learning_rate": 1.954971451053471e-05, "loss": 1.0581, "mean_token_accuracy": 0.7671318039298057, "num_tokens": 36662579.0, "step": 345 }, { "entropy": 1.8578041836619377, "epoch": 0.26116966700867456, "grad_norm": 2.421875, "learning_rate": 1.9532581796395327e-05, "loss": 1.1024, "mean_token_accuracy": 0.7555016644299031, "num_tokens": 37194467.0, "step": 350 }, { "entropy": 1.866262961924076, "epoch": 0.26490066225165565, "grad_norm": 2.53125, "learning_rate": 1.951513788325794e-05, "loss": 1.0253, "mean_token_accuracy": 0.7781079560518265, "num_tokens": 37723064.0, "step": 355 }, { "entropy": 1.856664003431797, "epoch": 0.2686316574946367, "grad_norm": 2.890625, "learning_rate": 1.9497383407334245e-05, "loss": 1.0596, "mean_token_accuracy": 0.7623144149780273, "num_tokens": 38256284.0, "step": 360 }, { "entropy": 1.8678738117218017, "epoch": 0.2723626527376178, "grad_norm": 2.40625, "learning_rate": 1.9479319016162727e-05, "loss": 1.0514, "mean_token_accuracy": 0.764145913720131, "num_tokens": 38787832.0, "step": 365 }, { "entropy": 1.8648770898580551, "epoch": 0.2760936479805988, "grad_norm": 2.5, "learning_rate": 1.9460945368585057e-05, "loss": 1.0654, "mean_token_accuracy": 0.7684322811663151, "num_tokens": 39319841.0, "step": 370 }, { "entropy": 1.8789657562971116, "epoch": 0.2798246432235799, "grad_norm": 2.53125, "learning_rate": 1.9442263134722055e-05, "loss": 1.0287, "mean_token_accuracy": 0.7699924275279045, "num_tokens": 39849403.0, "step": 375 }, { "entropy": 1.8746845886111259, "epoch": 0.28355563846656096, "grad_norm": 2.765625, "learning_rate": 1.9423272995949248e-05, "loss": 1.0038, "mean_token_accuracy": 0.7791954837739468, "num_tokens": 40380746.0, "step": 380 }, { "entropy": 1.8678623661398888, "epoch": 0.287286633709542, "grad_norm": 2.578125, "learning_rate": 1.9403975644872035e-05, "loss": 1.0835, "mean_token_accuracy": 0.759261854737997, "num_tokens": 40911733.0, "step": 385 }, { "entropy": 1.8729774489998818, "epoch": 0.2910176289525231, "grad_norm": 2.390625, "learning_rate": 1.9384371785300408e-05, "loss": 1.0718, "mean_token_accuracy": 0.7628073863685131, "num_tokens": 41443082.0, "step": 390 }, { "entropy": 1.8807472854852676, "epoch": 0.2947486241955041, "grad_norm": 2.609375, "learning_rate": 1.936446213222328e-05, "loss": 1.1063, "mean_token_accuracy": 0.7536667741835117, "num_tokens": 41975804.0, "step": 395 }, { "entropy": 1.8795248806476592, "epoch": 0.2984796194384852, "grad_norm": 2.703125, "learning_rate": 1.934424741178243e-05, "loss": 1.0782, "mean_token_accuracy": 0.7614248670637608, "num_tokens": 42506840.0, "step": 400 }, { "epoch": 0.2984796194384852, "eval_entropy": 1.8717721261187084, "eval_loss": 0.8698498010635376, "eval_mean_token_accuracy": 0.7723283369420311, "eval_num_tokens": 42506840.0, "eval_runtime": 421.506, "eval_samples_per_second": 2.057, "eval_steps_per_second": 0.515, "step": 400 }, { "entropy": 1.8597126081585884, "epoch": 0.30221061468146626, "grad_norm": 2.515625, "learning_rate": 1.932372836124601e-05, "loss": 1.0929, "mean_token_accuracy": 0.7585588760674, "num_tokens": 43037896.0, "step": 405 }, { "entropy": 1.8776621356606484, "epoch": 0.30594160992444736, "grad_norm": 2.515625, "learning_rate": 1.9302905728981628e-05, "loss": 1.0856, "mean_token_accuracy": 0.7586130477488041, "num_tokens": 43568194.0, "step": 410 }, { "entropy": 1.8922527134418488, "epoch": 0.3096726051674284, "grad_norm": 2.546875, "learning_rate": 1.9281780274429108e-05, "loss": 1.1148, "mean_token_accuracy": 0.7570016115903855, "num_tokens": 44097543.0, "step": 415 }, { "entropy": 1.8982042223215103, "epoch": 0.3134036004104095, "grad_norm": 2.640625, "learning_rate": 1.9260352768072737e-05, "loss": 1.0039, "mean_token_accuracy": 0.7770169205963612, "num_tokens": 44628497.0, "step": 420 }, { "entropy": 1.8883474573493004, "epoch": 0.3171345956533905, "grad_norm": 2.671875, "learning_rate": 1.9238623991413212e-05, "loss": 1.0516, "mean_token_accuracy": 0.7661412447690964, "num_tokens": 45161594.0, "step": 425 }, { "entropy": 1.8868870452046393, "epoch": 0.3208655908963716, "grad_norm": 2.578125, "learning_rate": 1.9216594736939086e-05, "loss": 1.0684, "mean_token_accuracy": 0.7590530589222908, "num_tokens": 45691957.0, "step": 430 }, { "entropy": 1.8828503325581551, "epoch": 0.32459658613935266, "grad_norm": 2.828125, "learning_rate": 1.9194265808097913e-05, "loss": 1.044, "mean_token_accuracy": 0.7744015090167522, "num_tokens": 46222540.0, "step": 435 }, { "entropy": 1.8643267348408699, "epoch": 0.32832758138233376, "grad_norm": 2.671875, "learning_rate": 1.9171638019266915e-05, "loss": 1.0212, "mean_token_accuracy": 0.768348614871502, "num_tokens": 46756535.0, "step": 440 }, { "entropy": 1.8655900582671165, "epoch": 0.3320585766253148, "grad_norm": 2.265625, "learning_rate": 1.91487121957233e-05, "loss": 1.0475, "mean_token_accuracy": 0.772677880525589, "num_tokens": 47288448.0, "step": 445 }, { "entropy": 1.8565448239445685, "epoch": 0.3357895718682959, "grad_norm": 2.25, "learning_rate": 1.9125489173614132e-05, "loss": 1.0702, "mean_token_accuracy": 0.7584976136684418, "num_tokens": 47818809.0, "step": 450 }, { "entropy": 1.8645773649215698, "epoch": 0.3395205671112769, "grad_norm": 3.21875, "learning_rate": 1.9101969799925877e-05, "loss": 0.9895, "mean_token_accuracy": 0.7730739787220955, "num_tokens": 48349840.0, "step": 455 }, { "entropy": 1.8840496107935905, "epoch": 0.343251562354258, "grad_norm": 2.65625, "learning_rate": 1.907815493245347e-05, "loss": 1.026, "mean_token_accuracy": 0.7796110861003399, "num_tokens": 48880975.0, "step": 460 }, { "entropy": 1.8931686580181122, "epoch": 0.34698255759723906, "grad_norm": 2.53125, "learning_rate": 1.905404543976906e-05, "loss": 1.0454, "mean_token_accuracy": 0.7702999524772167, "num_tokens": 49411651.0, "step": 465 }, { "entropy": 1.8839797154068947, "epoch": 0.35071355284022016, "grad_norm": 2.546875, "learning_rate": 1.9029642201190328e-05, "loss": 1.0322, "mean_token_accuracy": 0.7673676989972591, "num_tokens": 49942631.0, "step": 470 }, { "entropy": 1.8662499740719796, "epoch": 0.3544445480832012, "grad_norm": 2.625, "learning_rate": 1.9004946106748392e-05, "loss": 1.0455, "mean_token_accuracy": 0.7651143468916416, "num_tokens": 50475396.0, "step": 475 }, { "entropy": 1.861226101219654, "epoch": 0.35817554332618223, "grad_norm": 2.578125, "learning_rate": 1.8979958057155377e-05, "loss": 0.9745, "mean_token_accuracy": 0.777230667322874, "num_tokens": 51008321.0, "step": 480 }, { "entropy": 1.876523844897747, "epoch": 0.3619065385691633, "grad_norm": 2.359375, "learning_rate": 1.8954678963771543e-05, "loss": 1.0391, "mean_token_accuracy": 0.7686795577406883, "num_tokens": 51540064.0, "step": 485 }, { "entropy": 1.8753577783703803, "epoch": 0.36563753381214437, "grad_norm": 2.4375, "learning_rate": 1.892910974857206e-05, "loss": 0.9281, "mean_token_accuracy": 0.787179034948349, "num_tokens": 52071265.0, "step": 490 }, { "entropy": 1.8801261857151985, "epoch": 0.36936852905512546, "grad_norm": 2.75, "learning_rate": 1.8903251344113378e-05, "loss": 1.0486, "mean_token_accuracy": 0.7643107511103153, "num_tokens": 52602478.0, "step": 495 }, { "entropy": 1.8888003394007682, "epoch": 0.3730995242981065, "grad_norm": 2.46875, "learning_rate": 1.8877104693499196e-05, "loss": 1.0622, "mean_token_accuracy": 0.7684491701424122, "num_tokens": 53133417.0, "step": 500 }, { "epoch": 0.3730995242981065, "eval_entropy": 1.895705726838881, "eval_loss": 0.8520886301994324, "eval_mean_token_accuracy": 0.77683642795009, "eval_num_tokens": 53133417.0, "eval_runtime": 430.7614, "eval_samples_per_second": 2.013, "eval_steps_per_second": 0.504, "step": 500 }, { "entropy": 1.8893942952156066, "epoch": 0.3768305195410876, "grad_norm": 2.640625, "learning_rate": 1.885067075034611e-05, "loss": 1.0565, "mean_token_accuracy": 0.7684147022664547, "num_tokens": 53664950.0, "step": 505 }, { "entropy": 1.879076398909092, "epoch": 0.38056151478406863, "grad_norm": 2.671875, "learning_rate": 1.8823950478748776e-05, "loss": 0.9903, "mean_token_accuracy": 0.7752767533063889, "num_tokens": 54193587.0, "step": 510 }, { "entropy": 1.8704274088144301, "epoch": 0.3842925100270497, "grad_norm": 2.859375, "learning_rate": 1.8796944853244792e-05, "loss": 1.0437, "mean_token_accuracy": 0.7737653762102127, "num_tokens": 54725214.0, "step": 515 }, { "entropy": 1.8798809126019478, "epoch": 0.38802350527003077, "grad_norm": 2.796875, "learning_rate": 1.876965485877914e-05, "loss": 1.0818, "mean_token_accuracy": 0.7597776927053929, "num_tokens": 55257440.0, "step": 520 }, { "entropy": 1.8531851500272751, "epoch": 0.39175450051301186, "grad_norm": 2.90625, "learning_rate": 1.8742081490668262e-05, "loss": 0.9992, "mean_token_accuracy": 0.770975799113512, "num_tokens": 55789441.0, "step": 525 }, { "entropy": 1.8578070297837257, "epoch": 0.3954854957559929, "grad_norm": 2.703125, "learning_rate": 1.871422575456375e-05, "loss": 1.0776, "mean_token_accuracy": 0.7619852371513843, "num_tokens": 56320165.0, "step": 530 }, { "entropy": 1.8590588822960854, "epoch": 0.399216490998974, "grad_norm": 2.890625, "learning_rate": 1.8686088666415688e-05, "loss": 1.0425, "mean_token_accuracy": 0.7635523222386837, "num_tokens": 56853151.0, "step": 535 }, { "entropy": 1.853328062593937, "epoch": 0.40294748624195503, "grad_norm": 2.734375, "learning_rate": 1.8657671252435588e-05, "loss": 0.9841, "mean_token_accuracy": 0.7831168882548809, "num_tokens": 57384404.0, "step": 540 }, { "entropy": 1.8588544443249702, "epoch": 0.4066784814849361, "grad_norm": 3.890625, "learning_rate": 1.8628974549058953e-05, "loss": 0.968, "mean_token_accuracy": 0.7785750322043896, "num_tokens": 57918551.0, "step": 545 }, { "entropy": 1.849987468123436, "epoch": 0.41040947672791717, "grad_norm": 2.734375, "learning_rate": 1.8599999602907493e-05, "loss": 1.0544, "mean_token_accuracy": 0.7708779990673065, "num_tokens": 58451041.0, "step": 550 }, { "entropy": 1.8577365413308145, "epoch": 0.41414047197089826, "grad_norm": 2.859375, "learning_rate": 1.857074747075095e-05, "loss": 0.9909, "mean_token_accuracy": 0.7778203032910824, "num_tokens": 58985420.0, "step": 555 }, { "entropy": 1.8802942395210267, "epoch": 0.4178714672138793, "grad_norm": 2.859375, "learning_rate": 1.8541219219468534e-05, "loss": 1.0193, "mean_token_accuracy": 0.7772314816713333, "num_tokens": 59516362.0, "step": 560 }, { "entropy": 1.8853446871042252, "epoch": 0.4216024624568604, "grad_norm": 2.375, "learning_rate": 1.8511415926010043e-05, "loss": 1.0619, "mean_token_accuracy": 0.7719809092581272, "num_tokens": 60046588.0, "step": 565 }, { "entropy": 1.8890699461102485, "epoch": 0.42533345769984143, "grad_norm": 2.421875, "learning_rate": 1.8481338677356574e-05, "loss": 1.0658, "mean_token_accuracy": 0.7626231409609318, "num_tokens": 60577014.0, "step": 570 }, { "entropy": 1.8884864136576653, "epoch": 0.42906445294282247, "grad_norm": 2.40625, "learning_rate": 1.8450988570480865e-05, "loss": 1.0268, "mean_token_accuracy": 0.7732196651399136, "num_tokens": 61109729.0, "step": 575 }, { "entropy": 1.8984817937016487, "epoch": 0.43279544818580357, "grad_norm": 2.5625, "learning_rate": 1.8420366712307293e-05, "loss": 1.0837, "mean_token_accuracy": 0.760554713755846, "num_tokens": 61640531.0, "step": 580 }, { "entropy": 1.8876586690545083, "epoch": 0.4365264434287846, "grad_norm": 2.671875, "learning_rate": 1.838947421967152e-05, "loss": 1.0442, "mean_token_accuracy": 0.763022082298994, "num_tokens": 62169871.0, "step": 585 }, { "entropy": 1.8796870589256287, "epoch": 0.4402574386717657, "grad_norm": 2.453125, "learning_rate": 1.8358312219279733e-05, "loss": 0.9773, "mean_token_accuracy": 0.7882885307073593, "num_tokens": 62700745.0, "step": 590 }, { "entropy": 1.8949595779180526, "epoch": 0.44398843391474674, "grad_norm": 2.65625, "learning_rate": 1.832688184766758e-05, "loss": 1.0091, "mean_token_accuracy": 0.7738702580332756, "num_tokens": 63230735.0, "step": 595 }, { "entropy": 1.8843660712242127, "epoch": 0.44771942915772783, "grad_norm": 2.640625, "learning_rate": 1.8295184251158683e-05, "loss": 1.0339, "mean_token_accuracy": 0.7698764786124229, "num_tokens": 63762011.0, "step": 600 }, { "epoch": 0.44771942915772783, "eval_entropy": 1.8978905815133302, "eval_loss": 0.8385350704193115, "eval_mean_token_accuracy": 0.7798161514893106, "eval_num_tokens": 63762011.0, "eval_runtime": 421.7508, "eval_samples_per_second": 2.056, "eval_steps_per_second": 0.515, "step": 600 }, { "entropy": 1.8987897396087647, "epoch": 0.45145042440070887, "grad_norm": 2.390625, "learning_rate": 1.826322058582287e-05, "loss": 0.9574, "mean_token_accuracy": 0.7808470018208027, "num_tokens": 64293628.0, "step": 605 }, { "entropy": 1.9083756417036057, "epoch": 0.45518141964368997, "grad_norm": 2.765625, "learning_rate": 1.8230992017433964e-05, "loss": 1.0223, "mean_token_accuracy": 0.7760558076202869, "num_tokens": 64823314.0, "step": 610 }, { "entropy": 1.8789043992757797, "epoch": 0.458912414886671, "grad_norm": 2.84375, "learning_rate": 1.8198499721427315e-05, "loss": 1.0547, "mean_token_accuracy": 0.7682580567896367, "num_tokens": 65354447.0, "step": 615 }, { "entropy": 1.88258585780859, "epoch": 0.4626434101296521, "grad_norm": 3.046875, "learning_rate": 1.8165744882856896e-05, "loss": 0.9368, "mean_token_accuracy": 0.7864787146449089, "num_tokens": 65883446.0, "step": 620 }, { "entropy": 1.8828800201416016, "epoch": 0.46637440537263314, "grad_norm": 2.59375, "learning_rate": 1.813272869635209e-05, "loss": 0.9939, "mean_token_accuracy": 0.7787854075431824, "num_tokens": 66413154.0, "step": 625 }, { "entropy": 1.8896025449037552, "epoch": 0.47010540061561423, "grad_norm": 2.453125, "learning_rate": 1.8099452366074125e-05, "loss": 1.0421, "mean_token_accuracy": 0.7667336985468864, "num_tokens": 66944797.0, "step": 630 }, { "entropy": 1.9044037982821465, "epoch": 0.47383639585859527, "grad_norm": 2.609375, "learning_rate": 1.8065917105672147e-05, "loss": 1.03, "mean_token_accuracy": 0.7720849968492984, "num_tokens": 67475154.0, "step": 635 }, { "entropy": 1.9032243177294732, "epoch": 0.47756739110157637, "grad_norm": 2.53125, "learning_rate": 1.8032124138238964e-05, "loss": 1.0904, "mean_token_accuracy": 0.7631759561598301, "num_tokens": 68005497.0, "step": 640 }, { "entropy": 1.9050288692116737, "epoch": 0.4812983863445574, "grad_norm": 2.5625, "learning_rate": 1.7998074696266438e-05, "loss": 1.0385, "mean_token_accuracy": 0.7717444844543934, "num_tokens": 68538652.0, "step": 645 }, { "entropy": 1.9110868632793427, "epoch": 0.4850293815875385, "grad_norm": 2.515625, "learning_rate": 1.7963770021600533e-05, "loss": 1.0514, "mean_token_accuracy": 0.7663141049444675, "num_tokens": 69067407.0, "step": 650 }, { "entropy": 1.9152159363031387, "epoch": 0.48876037683051954, "grad_norm": 2.75, "learning_rate": 1.7929211365396005e-05, "loss": 1.0354, "mean_token_accuracy": 0.7692722842097283, "num_tokens": 69595794.0, "step": 655 }, { "entropy": 1.9117783293128014, "epoch": 0.49249137207350063, "grad_norm": 2.640625, "learning_rate": 1.7894399988070804e-05, "loss": 0.9893, "mean_token_accuracy": 0.7810171820223332, "num_tokens": 70124120.0, "step": 660 }, { "entropy": 1.898228144645691, "epoch": 0.49622236731648167, "grad_norm": 2.453125, "learning_rate": 1.785933715926007e-05, "loss": 1.0349, "mean_token_accuracy": 0.7741277053952217, "num_tokens": 70658383.0, "step": 665 }, { "entropy": 1.892415849864483, "epoch": 0.4999533625594627, "grad_norm": 2.640625, "learning_rate": 1.7824024157769857e-05, "loss": 1.0535, "mean_token_accuracy": 0.7646197609603405, "num_tokens": 71190508.0, "step": 670 }, { "entropy": 1.8964731127023697, "epoch": 0.5036843578024438, "grad_norm": 2.53125, "learning_rate": 1.7788462271530465e-05, "loss": 0.9729, "mean_token_accuracy": 0.7833174630999565, "num_tokens": 71723784.0, "step": 675 }, { "entropy": 1.8879738718271255, "epoch": 0.5074153530454248, "grad_norm": 2.515625, "learning_rate": 1.7752652797549488e-05, "loss": 1.0651, "mean_token_accuracy": 0.7697323687374592, "num_tokens": 72256599.0, "step": 680 }, { "entropy": 1.880824202299118, "epoch": 0.5111463482884059, "grad_norm": 2.4375, "learning_rate": 1.77165970418645e-05, "loss": 1.013, "mean_token_accuracy": 0.7752033665776252, "num_tokens": 72791530.0, "step": 685 }, { "entropy": 1.9036763146519662, "epoch": 0.514877343531387, "grad_norm": 2.9375, "learning_rate": 1.7680296319495417e-05, "loss": 1.0175, "mean_token_accuracy": 0.768329168856144, "num_tokens": 73321750.0, "step": 690 }, { "entropy": 1.8978674352169036, "epoch": 0.5186083387743681, "grad_norm": 2.71875, "learning_rate": 1.764375195439655e-05, "loss": 0.9838, "mean_token_accuracy": 0.776180149614811, "num_tokens": 73851824.0, "step": 695 }, { "entropy": 1.8836245954036712, "epoch": 0.5223393340173491, "grad_norm": 3.03125, "learning_rate": 1.7606965279408306e-05, "loss": 0.9327, "mean_token_accuracy": 0.7869227968156338, "num_tokens": 74383284.0, "step": 700 }, { "epoch": 0.5223393340173491, "eval_entropy": 1.8904020984051964, "eval_loss": 0.8297026753425598, "eval_mean_token_accuracy": 0.7804744441388389, "eval_num_tokens": 74383284.0, "eval_runtime": 413.2232, "eval_samples_per_second": 2.098, "eval_steps_per_second": 0.525, "step": 700 }, { "entropy": 1.8916564375162124, "epoch": 0.5260703292603301, "grad_norm": 2.90625, "learning_rate": 1.756993763620859e-05, "loss": 1.0128, "mean_token_accuracy": 0.7728722743690014, "num_tokens": 74914750.0, "step": 705 }, { "entropy": 1.8978112503886222, "epoch": 0.5298013245033113, "grad_norm": 2.84375, "learning_rate": 1.7532670375263836e-05, "loss": 1.0299, "mean_token_accuracy": 0.7684225983917713, "num_tokens": 75446535.0, "step": 710 }, { "entropy": 1.8988500460982323, "epoch": 0.5335323197462923, "grad_norm": 2.75, "learning_rate": 1.7495164855779812e-05, "loss": 0.9841, "mean_token_accuracy": 0.7884247310459613, "num_tokens": 75978225.0, "step": 715 }, { "entropy": 1.903371374309063, "epoch": 0.5372633149892734, "grad_norm": 2.390625, "learning_rate": 1.7457422445651996e-05, "loss": 1.0304, "mean_token_accuracy": 0.7681498162448406, "num_tokens": 76514443.0, "step": 720 }, { "entropy": 1.9176162853837013, "epoch": 0.5409943102322544, "grad_norm": 2.53125, "learning_rate": 1.7419444521415706e-05, "loss": 0.9887, "mean_token_accuracy": 0.778039886802435, "num_tokens": 77045887.0, "step": 725 }, { "entropy": 1.9141801953315736, "epoch": 0.5447253054752356, "grad_norm": 2.703125, "learning_rate": 1.7381232468195896e-05, "loss": 0.953, "mean_token_accuracy": 0.7912342786788941, "num_tokens": 77579089.0, "step": 730 }, { "entropy": 1.9069402724504472, "epoch": 0.5484563007182166, "grad_norm": 2.734375, "learning_rate": 1.7342787679656635e-05, "loss": 1.0026, "mean_token_accuracy": 0.7827796794474124, "num_tokens": 78108383.0, "step": 735 }, { "entropy": 1.9042499542236329, "epoch": 0.5521872959611976, "grad_norm": 2.953125, "learning_rate": 1.730411155795028e-05, "loss": 1.0295, "mean_token_accuracy": 0.7759525053203106, "num_tokens": 78638979.0, "step": 740 }, { "entropy": 1.9021551787853241, "epoch": 0.5559182912041787, "grad_norm": 2.4375, "learning_rate": 1.726520551366634e-05, "loss": 1.013, "mean_token_accuracy": 0.7834214746952057, "num_tokens": 79170107.0, "step": 745 }, { "entropy": 1.886176513135433, "epoch": 0.5596492864471598, "grad_norm": 2.90625, "learning_rate": 1.7226070965780016e-05, "loss": 0.9796, "mean_token_accuracy": 0.7753789328038693, "num_tokens": 79700908.0, "step": 750 }, { "entropy": 1.9083970338106155, "epoch": 0.5633802816901409, "grad_norm": 2.9375, "learning_rate": 1.7186709341600453e-05, "loss": 0.9348, "mean_token_accuracy": 0.7793697625398636, "num_tokens": 80230760.0, "step": 755 }, { "entropy": 1.8939674898982048, "epoch": 0.5671112769331219, "grad_norm": 2.71875, "learning_rate": 1.7147122076718702e-05, "loss": 0.9813, "mean_token_accuracy": 0.7803831771016121, "num_tokens": 80760531.0, "step": 760 }, { "entropy": 1.9014861181378364, "epoch": 0.570842272176103, "grad_norm": 2.59375, "learning_rate": 1.710731061495534e-05, "loss": 1.0185, "mean_token_accuracy": 0.7759978130459786, "num_tokens": 81291358.0, "step": 765 }, { "entropy": 1.89882051050663, "epoch": 0.574573267419084, "grad_norm": 2.9375, "learning_rate": 1.7067276408307813e-05, "loss": 0.9795, "mean_token_accuracy": 0.7780521295964717, "num_tokens": 81822399.0, "step": 770 }, { "entropy": 1.9006829962134362, "epoch": 0.5783042626620651, "grad_norm": 2.8125, "learning_rate": 1.702702091689749e-05, "loss": 0.9759, "mean_token_accuracy": 0.7813475333154202, "num_tokens": 82353944.0, "step": 775 }, { "entropy": 1.8907926201820373, "epoch": 0.5820352579050462, "grad_norm": 2.46875, "learning_rate": 1.6986545608916398e-05, "loss": 1.0128, "mean_token_accuracy": 0.7728120058774948, "num_tokens": 82884998.0, "step": 780 }, { "entropy": 1.9046567052602768, "epoch": 0.5857662531480272, "grad_norm": 3.609375, "learning_rate": 1.6945851960573688e-05, "loss": 0.9869, "mean_token_accuracy": 0.7850996717810631, "num_tokens": 83413705.0, "step": 785 }, { "entropy": 1.8946405827999115, "epoch": 0.5894972483910083, "grad_norm": 2.5, "learning_rate": 1.690494145604178e-05, "loss": 1.0068, "mean_token_accuracy": 0.7735021203756333, "num_tokens": 83945754.0, "step": 790 }, { "entropy": 1.8882904008030892, "epoch": 0.5932282436339894, "grad_norm": 2.703125, "learning_rate": 1.6863815587402238e-05, "loss": 0.9559, "mean_token_accuracy": 0.7860287889838219, "num_tokens": 84476584.0, "step": 795 }, { "entropy": 1.8921405717730522, "epoch": 0.5969592388769704, "grad_norm": 2.65625, "learning_rate": 1.682247585459136e-05, "loss": 0.9545, "mean_token_accuracy": 0.7818309612572193, "num_tokens": 85009333.0, "step": 800 }, { "epoch": 0.5969592388769704, "eval_entropy": 1.9031854873428697, "eval_loss": 0.8225738406181335, "eval_mean_token_accuracy": 0.7820164143764479, "eval_num_tokens": 85009333.0, "eval_runtime": 414.4529, "eval_samples_per_second": 2.092, "eval_steps_per_second": 0.524, "step": 800 }, { "entropy": 1.8995822116732597, "epoch": 0.6006902341199515, "grad_norm": 2.890625, "learning_rate": 1.6780923765345456e-05, "loss": 0.9868, "mean_token_accuracy": 0.7774978131055832, "num_tokens": 85539990.0, "step": 805 }, { "entropy": 1.8972021773457528, "epoch": 0.6044212293629325, "grad_norm": 2.734375, "learning_rate": 1.6739160835145874e-05, "loss": 1.0258, "mean_token_accuracy": 0.7669338978826999, "num_tokens": 86071665.0, "step": 810 }, { "entropy": 1.8983050301671027, "epoch": 0.6081522246059137, "grad_norm": 2.6875, "learning_rate": 1.669718858716372e-05, "loss": 0.9648, "mean_token_accuracy": 0.7823536217212677, "num_tokens": 86603171.0, "step": 815 }, { "entropy": 1.8986048236489297, "epoch": 0.6118832198488947, "grad_norm": 2.5, "learning_rate": 1.6655008552204296e-05, "loss": 1.0247, "mean_token_accuracy": 0.7682311937212944, "num_tokens": 87133722.0, "step": 820 }, { "entropy": 1.9042039334774017, "epoch": 0.6156142150918757, "grad_norm": 2.6875, "learning_rate": 1.661262226865129e-05, "loss": 0.9211, "mean_token_accuracy": 0.7911783196032047, "num_tokens": 87664262.0, "step": 825 }, { "entropy": 1.896624006330967, "epoch": 0.6193452103348568, "grad_norm": 2.75, "learning_rate": 1.657003128241065e-05, "loss": 1.0694, "mean_token_accuracy": 0.7654512405395508, "num_tokens": 88197100.0, "step": 830 }, { "entropy": 1.9086493596434593, "epoch": 0.6230762055778379, "grad_norm": 2.5, "learning_rate": 1.6527237146854216e-05, "loss": 1.0434, "mean_token_accuracy": 0.7728351354598999, "num_tokens": 88729818.0, "step": 835 }, { "entropy": 1.9073491543531418, "epoch": 0.626807200820819, "grad_norm": 3.015625, "learning_rate": 1.648424142276305e-05, "loss": 0.9657, "mean_token_accuracy": 0.7809720672667027, "num_tokens": 89261237.0, "step": 840 }, { "entropy": 1.9095076739788055, "epoch": 0.6305381960638, "grad_norm": 2.703125, "learning_rate": 1.6441045678270523e-05, "loss": 0.9939, "mean_token_accuracy": 0.7778474785387516, "num_tokens": 89792709.0, "step": 845 }, { "entropy": 1.9190806448459625, "epoch": 0.634269191306781, "grad_norm": 2.703125, "learning_rate": 1.6397651488805123e-05, "loss": 1.0188, "mean_token_accuracy": 0.7764581590890884, "num_tokens": 90325632.0, "step": 850 }, { "entropy": 1.9133808985352516, "epoch": 0.6380001865497621, "grad_norm": 2.578125, "learning_rate": 1.635406043703298e-05, "loss": 1.0084, "mean_token_accuracy": 0.7716027073562145, "num_tokens": 90856786.0, "step": 855 }, { "entropy": 1.9075268417596818, "epoch": 0.6417311817927432, "grad_norm": 2.59375, "learning_rate": 1.6310274112800168e-05, "loss": 0.9919, "mean_token_accuracy": 0.7748941496014595, "num_tokens": 91388721.0, "step": 860 }, { "entropy": 1.918405619263649, "epoch": 0.6454621770357243, "grad_norm": 2.609375, "learning_rate": 1.6266294113074705e-05, "loss": 1.0701, "mean_token_accuracy": 0.7638019427657128, "num_tokens": 91920341.0, "step": 865 }, { "entropy": 1.9207670271396637, "epoch": 0.6491931722787053, "grad_norm": 2.84375, "learning_rate": 1.622212204188831e-05, "loss": 1.0503, "mean_token_accuracy": 0.7648871757090092, "num_tokens": 92451835.0, "step": 870 }, { "entropy": 1.907970777153969, "epoch": 0.6529241675216864, "grad_norm": 2.734375, "learning_rate": 1.6177759510277896e-05, "loss": 1.0486, "mean_token_accuracy": 0.7644262924790383, "num_tokens": 92983993.0, "step": 875 }, { "entropy": 1.906702682375908, "epoch": 0.6566551627646675, "grad_norm": 2.625, "learning_rate": 1.6133208136226837e-05, "loss": 0.9787, "mean_token_accuracy": 0.7803527511656284, "num_tokens": 93515391.0, "step": 880 }, { "entropy": 1.9120250821113587, "epoch": 0.6603861580076485, "grad_norm": 2.765625, "learning_rate": 1.6088469544605917e-05, "loss": 1.0555, "mean_token_accuracy": 0.7651013791561126, "num_tokens": 94046135.0, "step": 885 }, { "entropy": 1.903009794652462, "epoch": 0.6641171532506296, "grad_norm": 3.84375, "learning_rate": 1.6043545367114108e-05, "loss": 1.0049, "mean_token_accuracy": 0.7814605854451656, "num_tokens": 94579367.0, "step": 890 }, { "entropy": 1.9165362551808358, "epoch": 0.6678481484936106, "grad_norm": 2.515625, "learning_rate": 1.599843724221903e-05, "loss": 1.0443, "mean_token_accuracy": 0.7678561337292195, "num_tokens": 95108280.0, "step": 895 }, { "entropy": 1.8937674418091774, "epoch": 0.6715791437365918, "grad_norm": 3.296875, "learning_rate": 1.595314681509721e-05, "loss": 0.9987, "mean_token_accuracy": 0.780899266153574, "num_tokens": 95640164.0, "step": 900 }, { "epoch": 0.6715791437365918, "eval_entropy": 1.911442369359979, "eval_loss": 0.8153104186058044, "eval_mean_token_accuracy": 0.7842850034138025, "eval_num_tokens": 95640164.0, "eval_runtime": 407.5269, "eval_samples_per_second": 2.127, "eval_steps_per_second": 0.532, "step": 900 }, { "entropy": 1.9134518131613731, "epoch": 0.6753101389795728, "grad_norm": 2.671875, "learning_rate": 1.590767573757407e-05, "loss": 1.0539, "mean_token_accuracy": 0.7672203324735165, "num_tokens": 96174673.0, "step": 905 }, { "entropy": 1.9068228647112846, "epoch": 0.6790411342225539, "grad_norm": 2.96875, "learning_rate": 1.5862025668063684e-05, "loss": 0.9293, "mean_token_accuracy": 0.797333613038063, "num_tokens": 96705226.0, "step": 910 }, { "entropy": 1.9047024309635163, "epoch": 0.6827721294655349, "grad_norm": 2.734375, "learning_rate": 1.5816198271508304e-05, "loss": 0.9766, "mean_token_accuracy": 0.7800018131732941, "num_tokens": 97237322.0, "step": 915 }, { "entropy": 1.8980093270540237, "epoch": 0.686503124708516, "grad_norm": 2.515625, "learning_rate": 1.5770195219317616e-05, "loss": 1.0123, "mean_token_accuracy": 0.7715090371668338, "num_tokens": 97769225.0, "step": 920 }, { "entropy": 1.8994540840387344, "epoch": 0.6902341199514971, "grad_norm": 3.046875, "learning_rate": 1.5724018189307794e-05, "loss": 0.9984, "mean_token_accuracy": 0.7745929844677448, "num_tokens": 98303973.0, "step": 925 }, { "entropy": 1.8997361719608308, "epoch": 0.6939651151944781, "grad_norm": 2.796875, "learning_rate": 1.5677668865640303e-05, "loss": 1.0065, "mean_token_accuracy": 0.7724752642214299, "num_tokens": 98834383.0, "step": 930 }, { "entropy": 1.906589935719967, "epoch": 0.6976961104374592, "grad_norm": 2.625, "learning_rate": 1.5631148938760478e-05, "loss": 0.9321, "mean_token_accuracy": 0.7906432278454304, "num_tokens": 99366644.0, "step": 935 }, { "entropy": 1.9116594985127449, "epoch": 0.7014271056804403, "grad_norm": 2.765625, "learning_rate": 1.558446010533587e-05, "loss": 0.9932, "mean_token_accuracy": 0.7812504962086677, "num_tokens": 99899469.0, "step": 940 }, { "entropy": 1.908557726442814, "epoch": 0.7051581009234213, "grad_norm": 2.9375, "learning_rate": 1.553760406819435e-05, "loss": 0.9354, "mean_token_accuracy": 0.7859705902636052, "num_tokens": 100432079.0, "step": 945 }, { "entropy": 1.911416806280613, "epoch": 0.7088890961664024, "grad_norm": 2.59375, "learning_rate": 1.549058253626204e-05, "loss": 0.9158, "mean_token_accuracy": 0.7985282003879547, "num_tokens": 100964903.0, "step": 950 }, { "entropy": 1.9102476298809052, "epoch": 0.7126200914093834, "grad_norm": 2.59375, "learning_rate": 1.544339722450094e-05, "loss": 0.9396, "mean_token_accuracy": 0.7870368644595146, "num_tokens": 101495644.0, "step": 955 }, { "entropy": 1.9011426746845246, "epoch": 0.7163510866523645, "grad_norm": 2.578125, "learning_rate": 1.5396049853846416e-05, "loss": 0.9782, "mean_token_accuracy": 0.776444636285305, "num_tokens": 102027056.0, "step": 960 }, { "entropy": 1.9028504237532615, "epoch": 0.7200820818953456, "grad_norm": 2.78125, "learning_rate": 1.5348542151144412e-05, "loss": 1.0195, "mean_token_accuracy": 0.7756047353148461, "num_tokens": 102558960.0, "step": 965 }, { "entropy": 1.913777655363083, "epoch": 0.7238130771383267, "grad_norm": 2.578125, "learning_rate": 1.530087584908849e-05, "loss": 0.9679, "mean_token_accuracy": 0.783194737881422, "num_tokens": 103090509.0, "step": 970 }, { "entropy": 1.9065766379237175, "epoch": 0.7275440723813077, "grad_norm": 2.953125, "learning_rate": 1.5253052686156606e-05, "loss": 0.9767, "mean_token_accuracy": 0.7749697960913181, "num_tokens": 103623149.0, "step": 975 }, { "entropy": 1.9180918678641319, "epoch": 0.7312750676242887, "grad_norm": 2.640625, "learning_rate": 1.5205074406547737e-05, "loss": 0.9984, "mean_token_accuracy": 0.7796571411192417, "num_tokens": 104151367.0, "step": 980 }, { "entropy": 1.9228741899132729, "epoch": 0.7350060628672699, "grad_norm": 2.703125, "learning_rate": 1.5156942760118243e-05, "loss": 0.9168, "mean_token_accuracy": 0.7995883099734783, "num_tokens": 104680461.0, "step": 985 }, { "entropy": 1.923213917016983, "epoch": 0.7387370581102509, "grad_norm": 2.4375, "learning_rate": 1.5108659502318057e-05, "loss": 0.983, "mean_token_accuracy": 0.7768499352037906, "num_tokens": 105211200.0, "step": 990 }, { "entropy": 1.9263088315725327, "epoch": 0.742468053353232, "grad_norm": 2.703125, "learning_rate": 1.506022639412666e-05, "loss": 0.9758, "mean_token_accuracy": 0.7826492115855217, "num_tokens": 105743445.0, "step": 995 }, { "entropy": 1.927260972559452, "epoch": 0.746199048596213, "grad_norm": 3.171875, "learning_rate": 1.5011645201988847e-05, "loss": 0.9638, "mean_token_accuracy": 0.7851476401090622, "num_tokens": 106276369.0, "step": 1000 }, { "epoch": 0.746199048596213, "eval_entropy": 1.93777990505992, "eval_loss": 0.810206949710846, "eval_mean_token_accuracy": 0.7849089860366786, "eval_num_tokens": 106276369.0, "eval_runtime": 413.4859, "eval_samples_per_second": 2.097, "eval_steps_per_second": 0.525, "step": 1000 }, { "entropy": 1.924168910086155, "epoch": 0.7499300438391941, "grad_norm": 2.703125, "learning_rate": 1.4962917697750312e-05, "loss": 1.0092, "mean_token_accuracy": 0.7759475007653236, "num_tokens": 106806838.0, "step": 1005 }, { "entropy": 1.926104436814785, "epoch": 0.7536610390821752, "grad_norm": 2.75, "learning_rate": 1.4914045658593029e-05, "loss": 1.008, "mean_token_accuracy": 0.7736393176019192, "num_tokens": 107339166.0, "step": 1010 }, { "entropy": 1.927248127758503, "epoch": 0.7573920343251562, "grad_norm": 2.59375, "learning_rate": 1.4865030866970408e-05, "loss": 0.9093, "mean_token_accuracy": 0.7949215397238731, "num_tokens": 107870267.0, "step": 1015 }, { "entropy": 1.915828388929367, "epoch": 0.7611230295681373, "grad_norm": 2.640625, "learning_rate": 1.4815875110542326e-05, "loss": 0.9008, "mean_token_accuracy": 0.7937774039804936, "num_tokens": 108400513.0, "step": 1020 }, { "entropy": 1.9184807062149047, "epoch": 0.7648540248111184, "grad_norm": 2.640625, "learning_rate": 1.4766580182109889e-05, "loss": 1.0036, "mean_token_accuracy": 0.7738492339849472, "num_tokens": 108932594.0, "step": 1025 }, { "entropy": 1.9205460771918297, "epoch": 0.7685850200540995, "grad_norm": 2.671875, "learning_rate": 1.4717147879550078e-05, "loss": 0.9702, "mean_token_accuracy": 0.7899442717432976, "num_tokens": 109462921.0, "step": 1030 }, { "entropy": 1.9184999212622642, "epoch": 0.7723160152970805, "grad_norm": 2.890625, "learning_rate": 1.4667580005750146e-05, "loss": 0.9383, "mean_token_accuracy": 0.7896254241466523, "num_tokens": 109994997.0, "step": 1035 }, { "entropy": 1.9122421011328696, "epoch": 0.7760470105400615, "grad_norm": 2.65625, "learning_rate": 1.4617878368541895e-05, "loss": 0.9512, "mean_token_accuracy": 0.7840572439134121, "num_tokens": 110525919.0, "step": 1040 }, { "entropy": 1.913463969528675, "epoch": 0.7797780057830426, "grad_norm": 2.828125, "learning_rate": 1.456804478063571e-05, "loss": 1.0129, "mean_token_accuracy": 0.7674046225845814, "num_tokens": 111055974.0, "step": 1045 }, { "entropy": 1.9107275918126105, "epoch": 0.7835090010260237, "grad_norm": 2.515625, "learning_rate": 1.4518081059554477e-05, "loss": 0.9937, "mean_token_accuracy": 0.777343125641346, "num_tokens": 111585284.0, "step": 1050 }, { "entropy": 1.917552863061428, "epoch": 0.7872399962690048, "grad_norm": 2.75, "learning_rate": 1.4467989027567263e-05, "loss": 0.9839, "mean_token_accuracy": 0.7785539641976357, "num_tokens": 112116183.0, "step": 1055 }, { "entropy": 1.9054254487156868, "epoch": 0.7909709915119858, "grad_norm": 2.734375, "learning_rate": 1.4417770511622887e-05, "loss": 0.9861, "mean_token_accuracy": 0.7839046396315098, "num_tokens": 112646135.0, "step": 1060 }, { "entropy": 1.8971596196293832, "epoch": 0.7947019867549668, "grad_norm": 2.765625, "learning_rate": 1.4367427343283254e-05, "loss": 0.9637, "mean_token_accuracy": 0.7812280923128128, "num_tokens": 113176362.0, "step": 1065 }, { "entropy": 1.9003028556704522, "epoch": 0.798432981997948, "grad_norm": 2.859375, "learning_rate": 1.4316961358656584e-05, "loss": 0.9873, "mean_token_accuracy": 0.7795728340744972, "num_tokens": 113708268.0, "step": 1070 }, { "entropy": 1.896422977745533, "epoch": 0.802163977240929, "grad_norm": 2.75, "learning_rate": 1.4266374398330433e-05, "loss": 1.0349, "mean_token_accuracy": 0.7646013095974922, "num_tokens": 114240150.0, "step": 1075 }, { "entropy": 1.899100808799267, "epoch": 0.8058949724839101, "grad_norm": 2.765625, "learning_rate": 1.4215668307304557e-05, "loss": 0.9489, "mean_token_accuracy": 0.7867104850709439, "num_tokens": 114771681.0, "step": 1080 }, { "entropy": 1.9089683890342712, "epoch": 0.8096259677268911, "grad_norm": 2.59375, "learning_rate": 1.4164844934923631e-05, "loss": 1.0208, "mean_token_accuracy": 0.7843053504824639, "num_tokens": 115303073.0, "step": 1085 }, { "entropy": 1.9021274089813232, "epoch": 0.8133569629698723, "grad_norm": 2.875, "learning_rate": 1.4113906134809805e-05, "loss": 0.9929, "mean_token_accuracy": 0.7804038859903812, "num_tokens": 115837335.0, "step": 1090 }, { "entropy": 1.9116863787174225, "epoch": 0.8170879582128533, "grad_norm": 2.59375, "learning_rate": 1.406285376479508e-05, "loss": 0.9766, "mean_token_accuracy": 0.7845917344093323, "num_tokens": 116369639.0, "step": 1095 }, { "entropy": 1.8996223762631417, "epoch": 0.8208189534558343, "grad_norm": 2.6875, "learning_rate": 1.4011689686853562e-05, "loss": 1.065, "mean_token_accuracy": 0.7678873062133789, "num_tokens": 116901018.0, "step": 1100 }, { "epoch": 0.8208189534558343, "eval_entropy": 1.9164034864320183, "eval_loss": 0.8076829314231873, "eval_mean_token_accuracy": 0.7854106917908664, "eval_num_tokens": 116901018.0, "eval_runtime": 410.9839, "eval_samples_per_second": 2.11, "eval_steps_per_second": 0.528, "step": 1100 }, { "entropy": 1.912019345164299, "epoch": 0.8245499486988154, "grad_norm": 2.703125, "learning_rate": 1.3960415767033566e-05, "loss": 1.0343, "mean_token_accuracy": 0.7703364498913288, "num_tokens": 117434104.0, "step": 1105 }, { "entropy": 1.9091193810105325, "epoch": 0.8282809439417965, "grad_norm": 2.703125, "learning_rate": 1.3909033875389522e-05, "loss": 0.9879, "mean_token_accuracy": 0.7769205167889595, "num_tokens": 117964339.0, "step": 1110 }, { "entropy": 1.9093813717365264, "epoch": 0.8320119391847776, "grad_norm": 2.578125, "learning_rate": 1.3857545885913806e-05, "loss": 1.0359, "mean_token_accuracy": 0.7665309607982635, "num_tokens": 118493981.0, "step": 1115 }, { "entropy": 1.9185933470726013, "epoch": 0.8357429344277586, "grad_norm": 2.984375, "learning_rate": 1.3805953676468372e-05, "loss": 1.0506, "mean_token_accuracy": 0.762969171255827, "num_tokens": 119026536.0, "step": 1120 }, { "entropy": 1.9164486616849898, "epoch": 0.8394739296707396, "grad_norm": 3.0625, "learning_rate": 1.3754259128716266e-05, "loss": 0.9963, "mean_token_accuracy": 0.7848824128508568, "num_tokens": 119557686.0, "step": 1125 }, { "entropy": 1.9078744739294051, "epoch": 0.8432049249137208, "grad_norm": 2.625, "learning_rate": 1.370246412805301e-05, "loss": 0.9685, "mean_token_accuracy": 0.7828864552080631, "num_tokens": 120087050.0, "step": 1130 }, { "entropy": 1.9092570304870606, "epoch": 0.8469359201567018, "grad_norm": 2.65625, "learning_rate": 1.3650570563537824e-05, "loss": 0.9468, "mean_token_accuracy": 0.7840012639760972, "num_tokens": 120618701.0, "step": 1135 }, { "entropy": 1.90948646068573, "epoch": 0.8506669153996829, "grad_norm": 2.78125, "learning_rate": 1.3598580327824726e-05, "loss": 0.974, "mean_token_accuracy": 0.7765664204955101, "num_tokens": 121152315.0, "step": 1140 }, { "entropy": 1.913860061764717, "epoch": 0.8543979106426639, "grad_norm": 2.515625, "learning_rate": 1.3546495317093527e-05, "loss": 0.9608, "mean_token_accuracy": 0.784039318561554, "num_tokens": 121684014.0, "step": 1145 }, { "entropy": 1.908640170097351, "epoch": 0.8581289058856449, "grad_norm": 2.5, "learning_rate": 1.3494317430980642e-05, "loss": 0.9537, "mean_token_accuracy": 0.7842854559421539, "num_tokens": 122217932.0, "step": 1150 }, { "entropy": 1.9161628365516663, "epoch": 0.8618599011286261, "grad_norm": 2.609375, "learning_rate": 1.3442048572509827e-05, "loss": 0.9725, "mean_token_accuracy": 0.7805037409067154, "num_tokens": 122749728.0, "step": 1155 }, { "entropy": 1.9111833572387695, "epoch": 0.8655908963716071, "grad_norm": 2.6875, "learning_rate": 1.3389690648022777e-05, "loss": 0.8939, "mean_token_accuracy": 0.7963828206062317, "num_tokens": 123281364.0, "step": 1160 }, { "entropy": 1.9157856091856957, "epoch": 0.8693218916145882, "grad_norm": 2.640625, "learning_rate": 1.3337245567109578e-05, "loss": 1.006, "mean_token_accuracy": 0.7796331122517586, "num_tokens": 123812659.0, "step": 1165 }, { "entropy": 1.92343310713768, "epoch": 0.8730528868575692, "grad_norm": 2.8125, "learning_rate": 1.3284715242539073e-05, "loss": 0.8891, "mean_token_accuracy": 0.789270393550396, "num_tokens": 124341405.0, "step": 1170 }, { "entropy": 1.9144797176122665, "epoch": 0.8767838821005504, "grad_norm": 2.828125, "learning_rate": 1.3232101590189104e-05, "loss": 0.9934, "mean_token_accuracy": 0.7757502853870392, "num_tokens": 124872130.0, "step": 1175 }, { "entropy": 1.921220065653324, "epoch": 0.8805148773435314, "grad_norm": 2.578125, "learning_rate": 1.317940652897663e-05, "loss": 0.9992, "mean_token_accuracy": 0.7749721556901932, "num_tokens": 125402195.0, "step": 1180 }, { "entropy": 1.9140778228640556, "epoch": 0.8842458725865124, "grad_norm": 2.609375, "learning_rate": 1.3126631980787729e-05, "loss": 0.9669, "mean_token_accuracy": 0.781208099424839, "num_tokens": 125933557.0, "step": 1185 }, { "entropy": 1.9249017596244813, "epoch": 0.8879768678294935, "grad_norm": 2.546875, "learning_rate": 1.3073779870407539e-05, "loss": 0.9694, "mean_token_accuracy": 0.7858038902282715, "num_tokens": 126465874.0, "step": 1190 }, { "entropy": 1.9239431992173195, "epoch": 0.8917078630724746, "grad_norm": 2.625, "learning_rate": 1.3020852125450009e-05, "loss": 0.9928, "mean_token_accuracy": 0.7798442251980304, "num_tokens": 126995887.0, "step": 1195 }, { "entropy": 1.9185133665800094, "epoch": 0.8954388583154557, "grad_norm": 2.6875, "learning_rate": 1.296785067628764e-05, "loss": 0.9198, "mean_token_accuracy": 0.7875108510255814, "num_tokens": 127526712.0, "step": 1200 }, { "epoch": 0.8954388583154557, "eval_entropy": 1.9207860181957894, "eval_loss": 0.8042152523994446, "eval_mean_token_accuracy": 0.7863888339512909, "eval_num_tokens": 127526712.0, "eval_runtime": 409.4078, "eval_samples_per_second": 2.118, "eval_steps_per_second": 0.53, "step": 1200 }, { "entropy": 1.9089737579226493, "epoch": 0.8991698535584367, "grad_norm": 2.609375, "learning_rate": 1.2914777455981055e-05, "loss": 0.9846, "mean_token_accuracy": 0.7735326617956162, "num_tokens": 128061650.0, "step": 1205 }, { "entropy": 1.9178216248750686, "epoch": 0.9029008488014177, "grad_norm": 2.953125, "learning_rate": 1.2861634400208494e-05, "loss": 0.9333, "mean_token_accuracy": 0.7947493709623814, "num_tokens": 128594317.0, "step": 1210 }, { "entropy": 1.9146702647209168, "epoch": 0.9066318440443989, "grad_norm": 2.8125, "learning_rate": 1.2808423447195238e-05, "loss": 1.021, "mean_token_accuracy": 0.7814876094460488, "num_tokens": 129127989.0, "step": 1215 }, { "entropy": 1.917625893652439, "epoch": 0.9103628392873799, "grad_norm": 2.734375, "learning_rate": 1.27551465376429e-05, "loss": 0.9356, "mean_token_accuracy": 0.7922505825757981, "num_tokens": 129658129.0, "step": 1220 }, { "entropy": 1.9145172625780105, "epoch": 0.914093834530361, "grad_norm": 2.875, "learning_rate": 1.2701805614658646e-05, "loss": 0.9872, "mean_token_accuracy": 0.775139132887125, "num_tokens": 130190875.0, "step": 1225 }, { "entropy": 1.9233573004603386, "epoch": 0.917824829773342, "grad_norm": 2.984375, "learning_rate": 1.264840262368434e-05, "loss": 1.0117, "mean_token_accuracy": 0.7777254186570645, "num_tokens": 130723320.0, "step": 1230 }, { "entropy": 1.9207363352179527, "epoch": 0.921555825016323, "grad_norm": 2.734375, "learning_rate": 1.2594939512425575e-05, "loss": 0.9916, "mean_token_accuracy": 0.7813470110297203, "num_tokens": 131254800.0, "step": 1235 }, { "entropy": 1.9260804876685143, "epoch": 0.9252868202593042, "grad_norm": 2.96875, "learning_rate": 1.2541418230780636e-05, "loss": 0.9056, "mean_token_accuracy": 0.790333466976881, "num_tokens": 131784122.0, "step": 1240 }, { "entropy": 1.9206727132201196, "epoch": 0.9290178155022852, "grad_norm": 2.875, "learning_rate": 1.2487840730769393e-05, "loss": 1.001, "mean_token_accuracy": 0.7754542909562587, "num_tokens": 132315594.0, "step": 1245 }, { "entropy": 1.931351412832737, "epoch": 0.9327488107452663, "grad_norm": 2.96875, "learning_rate": 1.2434208966462113e-05, "loss": 0.9194, "mean_token_accuracy": 0.7928143329918385, "num_tokens": 132845920.0, "step": 1250 }, { "entropy": 1.9268579319119454, "epoch": 0.9364798059882473, "grad_norm": 3.171875, "learning_rate": 1.238052489390817e-05, "loss": 1.0312, "mean_token_accuracy": 0.7688205942511559, "num_tokens": 133376732.0, "step": 1255 }, { "entropy": 1.9306054621934892, "epoch": 0.9402108012312285, "grad_norm": 2.796875, "learning_rate": 1.2326790471064715e-05, "loss": 0.984, "mean_token_accuracy": 0.7766520656645298, "num_tokens": 133907577.0, "step": 1260 }, { "entropy": 1.9358166873455047, "epoch": 0.9439417964742095, "grad_norm": 2.859375, "learning_rate": 1.2273007657725278e-05, "loss": 0.9288, "mean_token_accuracy": 0.7995492741465569, "num_tokens": 134439785.0, "step": 1265 }, { "entropy": 1.9333247378468514, "epoch": 0.9476727917171905, "grad_norm": 2.6875, "learning_rate": 1.2219178415448271e-05, "loss": 0.9917, "mean_token_accuracy": 0.7727632015943527, "num_tokens": 134969134.0, "step": 1270 }, { "entropy": 1.9407745644450187, "epoch": 0.9514037869601716, "grad_norm": 2.71875, "learning_rate": 1.216530470748546e-05, "loss": 0.9204, "mean_token_accuracy": 0.7908377006649971, "num_tokens": 135497613.0, "step": 1275 }, { "entropy": 1.9259536176919938, "epoch": 0.9551347822031527, "grad_norm": 2.9375, "learning_rate": 1.2111388498710347e-05, "loss": 0.9738, "mean_token_accuracy": 0.7784877195954323, "num_tokens": 136028881.0, "step": 1280 }, { "entropy": 1.925316996872425, "epoch": 0.9588657774461338, "grad_norm": 2.8125, "learning_rate": 1.2057431755546535e-05, "loss": 1.01, "mean_token_accuracy": 0.7734282180666924, "num_tokens": 136560459.0, "step": 1285 }, { "entropy": 1.9157800003886223, "epoch": 0.9625967726891148, "grad_norm": 2.546875, "learning_rate": 1.2003436445895973e-05, "loss": 0.9531, "mean_token_accuracy": 0.7834272600710392, "num_tokens": 137092130.0, "step": 1290 }, { "entropy": 1.90481239259243, "epoch": 0.9663277679320958, "grad_norm": 2.421875, "learning_rate": 1.194940453906721e-05, "loss": 0.9573, "mean_token_accuracy": 0.786161408573389, "num_tokens": 137623136.0, "step": 1295 }, { "entropy": 1.8986893266439437, "epoch": 0.970058763175077, "grad_norm": 2.765625, "learning_rate": 1.189533800570356e-05, "loss": 1.0726, "mean_token_accuracy": 0.764766127616167, "num_tokens": 138155488.0, "step": 1300 }, { "epoch": 0.970058763175077, "eval_entropy": 1.9125681104747931, "eval_loss": 0.8023692965507507, "eval_mean_token_accuracy": 0.7871058926604311, "eval_num_tokens": 138155488.0, "eval_runtime": 411.7048, "eval_samples_per_second": 2.106, "eval_steps_per_second": 0.527, "step": 1300 }, { "entropy": 1.909130072593689, "epoch": 0.973789758418058, "grad_norm": 2.84375, "learning_rate": 1.1841238817711226e-05, "loss": 1.0111, "mean_token_accuracy": 0.7737887717783452, "num_tokens": 138685815.0, "step": 1305 }, { "entropy": 1.914137528836727, "epoch": 0.9775207536610391, "grad_norm": 2.84375, "learning_rate": 1.1787108948187395e-05, "loss": 0.9504, "mean_token_accuracy": 0.7876806251704693, "num_tokens": 139214572.0, "step": 1310 }, { "entropy": 1.923145028948784, "epoch": 0.9812517489040201, "grad_norm": 2.546875, "learning_rate": 1.1732950371348262e-05, "loss": 0.9302, "mean_token_accuracy": 0.7920562744140625, "num_tokens": 139743508.0, "step": 1315 }, { "entropy": 1.916971817612648, "epoch": 0.9849827441470013, "grad_norm": 2.703125, "learning_rate": 1.1678765062457023e-05, "loss": 0.9254, "mean_token_accuracy": 0.7934870257973671, "num_tokens": 140274750.0, "step": 1320 }, { "entropy": 1.9185368612408638, "epoch": 0.9887137393899823, "grad_norm": 2.703125, "learning_rate": 1.1624554997751853e-05, "loss": 0.8702, "mean_token_accuracy": 0.7973764210939407, "num_tokens": 140807008.0, "step": 1325 }, { "entropy": 1.9259465247392655, "epoch": 0.9924447346329633, "grad_norm": 2.78125, "learning_rate": 1.157032215437381e-05, "loss": 0.9814, "mean_token_accuracy": 0.7817091770470143, "num_tokens": 141336497.0, "step": 1330 }, { "entropy": 1.9247869119048118, "epoch": 0.9961757298759444, "grad_norm": 2.96875, "learning_rate": 1.1516068510294735e-05, "loss": 0.9902, "mean_token_accuracy": 0.7780245266854763, "num_tokens": 141867864.0, "step": 1335 }, { "entropy": 1.9211131125688552, "epoch": 0.9999067251189254, "grad_norm": 2.875, "learning_rate": 1.1461796044245108e-05, "loss": 0.9372, "mean_token_accuracy": 0.7970803648233413, "num_tokens": 142401439.0, "step": 1340 }, { "entropy": 1.9021222645586187, "epoch": 1.0029847961943847, "grad_norm": 2.765625, "learning_rate": 1.1407506735641882e-05, "loss": 1.0408, "mean_token_accuracy": 0.7564558558391802, "num_tokens": 142842106.0, "step": 1345 }, { "entropy": 1.91049023270607, "epoch": 1.006715791437366, "grad_norm": 2.453125, "learning_rate": 1.1353202564516287e-05, "loss": 0.9824, "mean_token_accuracy": 0.7773703202605248, "num_tokens": 143375469.0, "step": 1350 }, { "entropy": 1.9135325998067856, "epoch": 1.010446786680347, "grad_norm": 3.578125, "learning_rate": 1.1298885511441618e-05, "loss": 0.9687, "mean_token_accuracy": 0.7757138065993786, "num_tokens": 143908345.0, "step": 1355 }, { "entropy": 1.9061910092830658, "epoch": 1.014177781923328, "grad_norm": 2.765625, "learning_rate": 1.1244557557461002e-05, "loss": 0.9514, "mean_token_accuracy": 0.7835961289703846, "num_tokens": 144437756.0, "step": 1360 }, { "entropy": 1.9110153183341025, "epoch": 1.0179087771663091, "grad_norm": 2.796875, "learning_rate": 1.1190220684015129e-05, "loss": 0.9301, "mean_token_accuracy": 0.7900028511881828, "num_tokens": 144966417.0, "step": 1365 }, { "entropy": 1.9046073108911514, "epoch": 1.0216397724092903, "grad_norm": 2.71875, "learning_rate": 1.1135876872870018e-05, "loss": 0.9883, "mean_token_accuracy": 0.7769135765731334, "num_tokens": 145499906.0, "step": 1370 }, { "entropy": 1.9097348660230637, "epoch": 1.0253707676522712, "grad_norm": 2.609375, "learning_rate": 1.1081528106044704e-05, "loss": 1.0131, "mean_token_accuracy": 0.7666771307587623, "num_tokens": 146031293.0, "step": 1375 }, { "entropy": 1.9049879655241966, "epoch": 1.0291017628952523, "grad_norm": 2.8125, "learning_rate": 1.1027176365738965e-05, "loss": 1.025, "mean_token_accuracy": 0.7679546378552914, "num_tokens": 146564883.0, "step": 1380 }, { "entropy": 1.9138263523578645, "epoch": 1.0328327581382333, "grad_norm": 2.53125, "learning_rate": 1.0972823634261038e-05, "loss": 0.9745, "mean_token_accuracy": 0.784792798012495, "num_tokens": 147097765.0, "step": 1385 }, { "entropy": 1.9188172191381454, "epoch": 1.0365637533812144, "grad_norm": 2.875, "learning_rate": 1.0918471893955303e-05, "loss": 1.0081, "mean_token_accuracy": 0.7770697064697742, "num_tokens": 147628650.0, "step": 1390 }, { "entropy": 1.9182799458503723, "epoch": 1.0402947486241956, "grad_norm": 2.515625, "learning_rate": 1.0864123127129987e-05, "loss": 0.9117, "mean_token_accuracy": 0.8060022041201591, "num_tokens": 148159888.0, "step": 1395 }, { "entropy": 1.9043408751487731, "epoch": 1.0440257438671765, "grad_norm": 2.859375, "learning_rate": 1.0809779315984874e-05, "loss": 0.9451, "mean_token_accuracy": 0.7813556417822838, "num_tokens": 148691293.0, "step": 1400 }, { "epoch": 1.0440257438671765, "eval_entropy": 1.914900781372176, "eval_loss": 0.8001424670219421, "eval_mean_token_accuracy": 0.7873817116434124, "eval_num_tokens": 148691293.0, "eval_runtime": 408.5753, "eval_samples_per_second": 2.122, "eval_steps_per_second": 0.531, "step": 1400 }, { "entropy": 1.906543317437172, "epoch": 1.0477567391101577, "grad_norm": 2.828125, "learning_rate": 1.0755442442539004e-05, "loss": 0.9118, "mean_token_accuracy": 0.7968203231692315, "num_tokens": 149221200.0, "step": 1405 }, { "entropy": 1.8938933908939362, "epoch": 1.0514877343531388, "grad_norm": 2.75, "learning_rate": 1.0701114488558387e-05, "loss": 0.9224, "mean_token_accuracy": 0.7925568759441376, "num_tokens": 149754700.0, "step": 1410 }, { "entropy": 1.9065657809376717, "epoch": 1.0552187295961197, "grad_norm": 2.609375, "learning_rate": 1.0646797435483716e-05, "loss": 0.9389, "mean_token_accuracy": 0.7918999843299389, "num_tokens": 150284180.0, "step": 1415 }, { "entropy": 1.9041003629565239, "epoch": 1.0589497248391009, "grad_norm": 3.09375, "learning_rate": 1.0592493264358122e-05, "loss": 0.9762, "mean_token_accuracy": 0.7818893454968929, "num_tokens": 150815166.0, "step": 1420 }, { "entropy": 1.8972408846020699, "epoch": 1.0626807200820818, "grad_norm": 2.8125, "learning_rate": 1.0538203955754893e-05, "loss": 1.0264, "mean_token_accuracy": 0.7636294946074486, "num_tokens": 151348074.0, "step": 1425 }, { "entropy": 1.9111024513840675, "epoch": 1.066411715325063, "grad_norm": 2.75, "learning_rate": 1.0483931489705267e-05, "loss": 0.9958, "mean_token_accuracy": 0.7807611756026744, "num_tokens": 151878240.0, "step": 1430 }, { "entropy": 1.8972911700606345, "epoch": 1.070142710568044, "grad_norm": 2.765625, "learning_rate": 1.0429677845626191e-05, "loss": 0.92, "mean_token_accuracy": 0.7862489655613899, "num_tokens": 152411551.0, "step": 1435 }, { "entropy": 1.8995465740561486, "epoch": 1.073873705811025, "grad_norm": 2.609375, "learning_rate": 1.0375445002248153e-05, "loss": 0.9829, "mean_token_accuracy": 0.779705585539341, "num_tokens": 152944517.0, "step": 1440 }, { "entropy": 1.9031965538859368, "epoch": 1.0776047010540062, "grad_norm": 2.578125, "learning_rate": 1.0321234937542982e-05, "loss": 0.9502, "mean_token_accuracy": 0.7840150997042656, "num_tokens": 153477408.0, "step": 1445 }, { "entropy": 1.900654748082161, "epoch": 1.0813356962969873, "grad_norm": 2.71875, "learning_rate": 1.0267049628651744e-05, "loss": 0.9186, "mean_token_accuracy": 0.7939566880464554, "num_tokens": 154009646.0, "step": 1450 }, { "entropy": 1.9173641785979272, "epoch": 1.0850666915399683, "grad_norm": 2.921875, "learning_rate": 1.0212891051812606e-05, "loss": 0.9827, "mean_token_accuracy": 0.7764195807278156, "num_tokens": 154538093.0, "step": 1455 }, { "entropy": 1.9076469540596008, "epoch": 1.0887976867829494, "grad_norm": 2.71875, "learning_rate": 1.0158761182288776e-05, "loss": 0.9337, "mean_token_accuracy": 0.7856325849890708, "num_tokens": 155069159.0, "step": 1460 }, { "entropy": 1.9166174724698066, "epoch": 1.0925286820259303, "grad_norm": 2.671875, "learning_rate": 1.0104661994296445e-05, "loss": 0.9603, "mean_token_accuracy": 0.7911513254046441, "num_tokens": 155599420.0, "step": 1465 }, { "entropy": 1.9228487834334373, "epoch": 1.0962596772689115, "grad_norm": 2.75, "learning_rate": 1.0050595460932795e-05, "loss": 0.9645, "mean_token_accuracy": 0.7795494057238102, "num_tokens": 156130263.0, "step": 1470 }, { "entropy": 1.8965369552373885, "epoch": 1.0999906725118926, "grad_norm": 2.546875, "learning_rate": 9.99656355410403e-06, "loss": 1.0223, "mean_token_accuracy": 0.7734660752117634, "num_tokens": 156663734.0, "step": 1475 }, { "entropy": 1.9180607333779336, "epoch": 1.1037216677548736, "grad_norm": 2.75, "learning_rate": 9.942568244453469e-06, "loss": 0.9277, "mean_token_accuracy": 0.7880247853696346, "num_tokens": 157194630.0, "step": 1480 }, { "entropy": 1.91579008102417, "epoch": 1.1074526629978547, "grad_norm": 2.46875, "learning_rate": 9.88861150128965e-06, "loss": 0.9902, "mean_token_accuracy": 0.7840256102383136, "num_tokens": 157725220.0, "step": 1485 }, { "entropy": 1.918996299803257, "epoch": 1.1111836582408356, "grad_norm": 2.765625, "learning_rate": 9.834695292514545e-06, "loss": 0.8728, "mean_token_accuracy": 0.802612192928791, "num_tokens": 158254991.0, "step": 1490 }, { "entropy": 1.921280962228775, "epoch": 1.1149146534838168, "grad_norm": 2.734375, "learning_rate": 9.78082158455173e-06, "loss": 0.9266, "mean_token_accuracy": 0.7892040610313416, "num_tokens": 158785345.0, "step": 1495 }, { "entropy": 1.9033719599246979, "epoch": 1.118645648726798, "grad_norm": 2.71875, "learning_rate": 9.726992342274726e-06, "loss": 1.0086, "mean_token_accuracy": 0.7704753667116165, "num_tokens": 159318008.0, "step": 1500 }, { "epoch": 1.118645648726798, "eval_entropy": 1.9108136875838178, "eval_loss": 0.7981786131858826, "eval_mean_token_accuracy": 0.7881955545618787, "eval_num_tokens": 159318008.0, "eval_runtime": 421.7688, "eval_samples_per_second": 2.056, "eval_steps_per_second": 0.514, "step": 1500 }, { "entropy": 1.9149455368518828, "epoch": 1.1223766439697789, "grad_norm": 2.921875, "learning_rate": 9.673209528935288e-06, "loss": 0.9201, "mean_token_accuracy": 0.7925434753298759, "num_tokens": 159850478.0, "step": 1505 }, { "entropy": 1.8980061173439027, "epoch": 1.12610763921276, "grad_norm": 2.84375, "learning_rate": 9.619475106091832e-06, "loss": 0.9597, "mean_token_accuracy": 0.7821764424443245, "num_tokens": 160381164.0, "step": 1510 }, { "entropy": 1.901640383899212, "epoch": 1.1298386344557412, "grad_norm": 3.359375, "learning_rate": 9.565791033537888e-06, "loss": 0.9118, "mean_token_accuracy": 0.7857741206884384, "num_tokens": 160914862.0, "step": 1515 }, { "entropy": 1.895643164217472, "epoch": 1.133569629698722, "grad_norm": 2.90625, "learning_rate": 9.512159269230608e-06, "loss": 0.9746, "mean_token_accuracy": 0.7788461379706859, "num_tokens": 161447626.0, "step": 1520 }, { "entropy": 1.907567872107029, "epoch": 1.1373006249417033, "grad_norm": 2.765625, "learning_rate": 9.458581769219369e-06, "loss": 0.9623, "mean_token_accuracy": 0.7834442459046841, "num_tokens": 161979709.0, "step": 1525 }, { "entropy": 1.913025863468647, "epoch": 1.1410316201846842, "grad_norm": 2.765625, "learning_rate": 9.40506048757443e-06, "loss": 0.9361, "mean_token_accuracy": 0.7849182806909084, "num_tokens": 162509479.0, "step": 1530 }, { "entropy": 1.9022073447704315, "epoch": 1.1447626154276653, "grad_norm": 2.734375, "learning_rate": 9.351597376315661e-06, "loss": 0.963, "mean_token_accuracy": 0.7830614671111107, "num_tokens": 163041559.0, "step": 1535 }, { "entropy": 1.9128715977072717, "epoch": 1.1484936106706465, "grad_norm": 3.171875, "learning_rate": 9.298194385341355e-06, "loss": 0.9564, "mean_token_accuracy": 0.7814685918390751, "num_tokens": 163572658.0, "step": 1540 }, { "entropy": 1.914643220603466, "epoch": 1.1522246059136274, "grad_norm": 3.296875, "learning_rate": 9.244853462357103e-06, "loss": 0.9746, "mean_token_accuracy": 0.7881543360650539, "num_tokens": 164105639.0, "step": 1545 }, { "entropy": 1.9207996159791947, "epoch": 1.1559556011566086, "grad_norm": 2.875, "learning_rate": 9.191576552804767e-06, "loss": 0.9381, "mean_token_accuracy": 0.7910158529877662, "num_tokens": 164638056.0, "step": 1550 }, { "entropy": 1.9149420797824859, "epoch": 1.1596865963995895, "grad_norm": 2.53125, "learning_rate": 9.13836559979151e-06, "loss": 0.9308, "mean_token_accuracy": 0.7858420059084892, "num_tokens": 165170709.0, "step": 1555 }, { "entropy": 1.9160060003399848, "epoch": 1.1634175916425706, "grad_norm": 2.671875, "learning_rate": 9.08522254401895e-06, "loss": 0.9486, "mean_token_accuracy": 0.7823890775442124, "num_tokens": 165701451.0, "step": 1560 }, { "entropy": 1.916386367380619, "epoch": 1.1671485868855518, "grad_norm": 3.1875, "learning_rate": 9.032149323712359e-06, "loss": 1.0188, "mean_token_accuracy": 0.7758285783231258, "num_tokens": 166233264.0, "step": 1565 }, { "entropy": 1.9224628746509551, "epoch": 1.1708795821285327, "grad_norm": 2.859375, "learning_rate": 8.979147874549989e-06, "loss": 1.0125, "mean_token_accuracy": 0.7762011967599391, "num_tokens": 166764218.0, "step": 1570 }, { "entropy": 1.9250923693180084, "epoch": 1.1746105773715139, "grad_norm": 2.734375, "learning_rate": 8.926220129592464e-06, "loss": 0.9431, "mean_token_accuracy": 0.7869702532887459, "num_tokens": 167294808.0, "step": 1575 }, { "entropy": 1.9111168920993804, "epoch": 1.178341572614495, "grad_norm": 2.75, "learning_rate": 8.873368019212274e-06, "loss": 0.9434, "mean_token_accuracy": 0.789091457426548, "num_tokens": 167826649.0, "step": 1580 }, { "entropy": 1.9143835335969925, "epoch": 1.182072567857476, "grad_norm": 2.859375, "learning_rate": 8.820593471023376e-06, "loss": 0.8946, "mean_token_accuracy": 0.7969313278794289, "num_tokens": 168358668.0, "step": 1585 }, { "entropy": 1.9125552743673324, "epoch": 1.185803563100457, "grad_norm": 2.75, "learning_rate": 8.7678984098109e-06, "loss": 0.9983, "mean_token_accuracy": 0.769627034664154, "num_tokens": 168891441.0, "step": 1590 }, { "entropy": 1.9116299599409103, "epoch": 1.189534558343438, "grad_norm": 2.96875, "learning_rate": 8.715284757460931e-06, "loss": 0.9524, "mean_token_accuracy": 0.7888155646622181, "num_tokens": 169421986.0, "step": 1595 }, { "entropy": 1.919787773489952, "epoch": 1.1932655535864192, "grad_norm": 2.734375, "learning_rate": 8.662754432890425e-06, "loss": 0.9749, "mean_token_accuracy": 0.7735044836997986, "num_tokens": 169954027.0, "step": 1600 }, { "epoch": 1.1932655535864192, "eval_entropy": 1.9173187561298846, "eval_loss": 0.7971454858779907, "eval_mean_token_accuracy": 0.7884338443180383, "eval_num_tokens": 169954027.0, "eval_runtime": 410.1401, "eval_samples_per_second": 2.114, "eval_steps_per_second": 0.529, "step": 1600 }, { "entropy": 1.9170388147234916, "epoch": 1.1969965488294003, "grad_norm": 2.796875, "learning_rate": 8.610309351977229e-06, "loss": 0.9239, "mean_token_accuracy": 0.7988493584096432, "num_tokens": 170485056.0, "step": 1605 }, { "entropy": 1.9125929340720176, "epoch": 1.2007275440723812, "grad_norm": 2.515625, "learning_rate": 8.557951427490179e-06, "loss": 0.9515, "mean_token_accuracy": 0.783885195851326, "num_tokens": 171014926.0, "step": 1610 }, { "entropy": 1.9132960721850396, "epoch": 1.2044585393153624, "grad_norm": 2.71875, "learning_rate": 8.505682569019364e-06, "loss": 0.9995, "mean_token_accuracy": 0.773869439214468, "num_tokens": 171545131.0, "step": 1615 }, { "entropy": 1.9141040802001954, "epoch": 1.2081895345583433, "grad_norm": 2.75, "learning_rate": 8.453504682906475e-06, "loss": 0.9394, "mean_token_accuracy": 0.7839238695800305, "num_tokens": 172077112.0, "step": 1620 }, { "entropy": 1.9142849922180176, "epoch": 1.2119205298013245, "grad_norm": 2.84375, "learning_rate": 8.401419672175275e-06, "loss": 0.9279, "mean_token_accuracy": 0.7974015466868878, "num_tokens": 172608007.0, "step": 1625 }, { "entropy": 1.9144698426127433, "epoch": 1.2156515250443056, "grad_norm": 2.734375, "learning_rate": 8.349429436462177e-06, "loss": 0.9012, "mean_token_accuracy": 0.7962669752538204, "num_tokens": 173140132.0, "step": 1630 }, { "entropy": 1.9135999783873558, "epoch": 1.2193825202872866, "grad_norm": 2.921875, "learning_rate": 8.29753587194699e-06, "loss": 1.0005, "mean_token_accuracy": 0.7792818926274776, "num_tokens": 173671827.0, "step": 1635 }, { "entropy": 1.9121680706739426, "epoch": 1.2231135155302677, "grad_norm": 2.9375, "learning_rate": 8.245740871283738e-06, "loss": 0.9935, "mean_token_accuracy": 0.777762345224619, "num_tokens": 174204179.0, "step": 1640 }, { "entropy": 1.9009287789463998, "epoch": 1.2268445107732489, "grad_norm": 2.53125, "learning_rate": 8.194046323531635e-06, "loss": 1.035, "mean_token_accuracy": 0.7625086784362793, "num_tokens": 174736876.0, "step": 1645 }, { "entropy": 1.9046483889222146, "epoch": 1.2305755060162298, "grad_norm": 2.796875, "learning_rate": 8.142454114086198e-06, "loss": 0.975, "mean_token_accuracy": 0.7746729388833046, "num_tokens": 175268815.0, "step": 1650 }, { "entropy": 1.9050630331039429, "epoch": 1.234306501259211, "grad_norm": 2.5625, "learning_rate": 8.090966124610479e-06, "loss": 0.9344, "mean_token_accuracy": 0.7914486967027188, "num_tokens": 175801245.0, "step": 1655 }, { "entropy": 1.9112103179097175, "epoch": 1.238037496502192, "grad_norm": 2.78125, "learning_rate": 8.039584232966437e-06, "loss": 0.98, "mean_token_accuracy": 0.7776268184185028, "num_tokens": 176331996.0, "step": 1660 }, { "entropy": 1.9143457114696503, "epoch": 1.241768491745173, "grad_norm": 3.0, "learning_rate": 7.988310313146439e-06, "loss": 0.936, "mean_token_accuracy": 0.7907832115888596, "num_tokens": 176865230.0, "step": 1665 }, { "entropy": 1.9180011659860612, "epoch": 1.2454994869881542, "grad_norm": 2.453125, "learning_rate": 7.937146235204927e-06, "loss": 1.0015, "mean_token_accuracy": 0.7779019854962825, "num_tokens": 177393698.0, "step": 1670 }, { "entropy": 1.9097063705325126, "epoch": 1.249230482231135, "grad_norm": 2.484375, "learning_rate": 7.886093865190201e-06, "loss": 1.0247, "mean_token_accuracy": 0.7732160426676273, "num_tokens": 177925791.0, "step": 1675 }, { "entropy": 1.907798358798027, "epoch": 1.2529614774741162, "grad_norm": 2.625, "learning_rate": 7.83515506507637e-06, "loss": 1.018, "mean_token_accuracy": 0.7696801349520683, "num_tokens": 178457035.0, "step": 1680 }, { "entropy": 1.9117656916379928, "epoch": 1.2566924727170972, "grad_norm": 2.921875, "learning_rate": 7.784331692695447e-06, "loss": 0.9456, "mean_token_accuracy": 0.7896642990410327, "num_tokens": 178987947.0, "step": 1685 }, { "entropy": 1.9108289450407028, "epoch": 1.2604234679600783, "grad_norm": 2.84375, "learning_rate": 7.73362560166957e-06, "loss": 0.9616, "mean_token_accuracy": 0.7905923403799534, "num_tokens": 179520270.0, "step": 1690 }, { "entropy": 1.9133642435073852, "epoch": 1.2641544632030595, "grad_norm": 2.75, "learning_rate": 7.683038641343418e-06, "loss": 0.975, "mean_token_accuracy": 0.7816100649535656, "num_tokens": 180050074.0, "step": 1695 }, { "entropy": 1.918484927713871, "epoch": 1.2678854584460404, "grad_norm": 3.109375, "learning_rate": 7.63257265671675e-06, "loss": 0.9223, "mean_token_accuracy": 0.8018887743353844, "num_tokens": 180583004.0, "step": 1700 }, { "epoch": 1.2678854584460404, "eval_entropy": 1.9196278774243896, "eval_loss": 0.7961597442626953, "eval_mean_token_accuracy": 0.78862205683361, "eval_num_tokens": 180583004.0, "eval_runtime": 410.4519, "eval_samples_per_second": 2.112, "eval_steps_per_second": 0.529, "step": 1700 }, { "entropy": 1.9090944886207581, "epoch": 1.2716164536890215, "grad_norm": 2.96875, "learning_rate": 7.582229488377117e-06, "loss": 0.9635, "mean_token_accuracy": 0.7776286259293557, "num_tokens": 181116350.0, "step": 1705 }, { "entropy": 1.915034532546997, "epoch": 1.2753474489320027, "grad_norm": 2.625, "learning_rate": 7.5320109724327394e-06, "loss": 0.9893, "mean_token_accuracy": 0.7796139396727085, "num_tokens": 181647859.0, "step": 1710 }, { "entropy": 1.9097247406840325, "epoch": 1.2790784441749836, "grad_norm": 2.671875, "learning_rate": 7.4819189404455275e-06, "loss": 0.9614, "mean_token_accuracy": 0.778455737233162, "num_tokens": 182176933.0, "step": 1715 }, { "entropy": 1.9054157629609108, "epoch": 1.2828094394179648, "grad_norm": 2.796875, "learning_rate": 7.431955219364293e-06, "loss": 0.9687, "mean_token_accuracy": 0.7882591933012009, "num_tokens": 182706541.0, "step": 1720 }, { "entropy": 1.9075705125927924, "epoch": 1.286540434660946, "grad_norm": 2.734375, "learning_rate": 7.38212163145811e-06, "loss": 0.9315, "mean_token_accuracy": 0.7873170509934425, "num_tokens": 183239225.0, "step": 1725 }, { "entropy": 1.9072406858205795, "epoch": 1.2902714299039268, "grad_norm": 2.75, "learning_rate": 7.33241999424986e-06, "loss": 0.8958, "mean_token_accuracy": 0.7974990576505661, "num_tokens": 183770965.0, "step": 1730 }, { "entropy": 1.9208739027380943, "epoch": 1.294002425146908, "grad_norm": 2.828125, "learning_rate": 7.282852120449926e-06, "loss": 0.9611, "mean_token_accuracy": 0.7872603751718998, "num_tokens": 184304098.0, "step": 1735 }, { "entropy": 1.912934736907482, "epoch": 1.297733420389889, "grad_norm": 2.90625, "learning_rate": 7.2334198178901125e-06, "loss": 1.0037, "mean_token_accuracy": 0.7736862108111382, "num_tokens": 184835401.0, "step": 1740 }, { "entropy": 1.9067255914211274, "epoch": 1.30146441563287, "grad_norm": 2.984375, "learning_rate": 7.184124889457678e-06, "loss": 0.9615, "mean_token_accuracy": 0.7750180624425411, "num_tokens": 185366030.0, "step": 1745 }, { "entropy": 1.915813611447811, "epoch": 1.305195410875851, "grad_norm": 2.84375, "learning_rate": 7.134969133029597e-06, "loss": 0.9655, "mean_token_accuracy": 0.7763230137526989, "num_tokens": 185898027.0, "step": 1750 }, { "entropy": 1.9158352181315421, "epoch": 1.3089264061188322, "grad_norm": 2.9375, "learning_rate": 7.085954341406977e-06, "loss": 0.9449, "mean_token_accuracy": 0.7908552013337612, "num_tokens": 186433530.0, "step": 1755 }, { "entropy": 1.9180777758359908, "epoch": 1.3126574013618133, "grad_norm": 3.390625, "learning_rate": 7.0370823022496895e-06, "loss": 0.9632, "mean_token_accuracy": 0.7842883773148059, "num_tokens": 186965777.0, "step": 1760 }, { "entropy": 1.9151813015341759, "epoch": 1.3163883966047942, "grad_norm": 3.4375, "learning_rate": 6.988354798011157e-06, "loss": 1.0085, "mean_token_accuracy": 0.7796649239957333, "num_tokens": 187496044.0, "step": 1765 }, { "entropy": 1.9258656322956085, "epoch": 1.3201193918477754, "grad_norm": 2.625, "learning_rate": 6.939773605873343e-06, "loss": 0.9668, "mean_token_accuracy": 0.7782830119132995, "num_tokens": 188027696.0, "step": 1770 }, { "entropy": 1.9207917422056198, "epoch": 1.3238503870907565, "grad_norm": 2.703125, "learning_rate": 6.891340497681943e-06, "loss": 0.9789, "mean_token_accuracy": 0.7820106342434883, "num_tokens": 188557431.0, "step": 1775 }, { "entropy": 1.9169728681445122, "epoch": 1.3275813823337375, "grad_norm": 3.03125, "learning_rate": 6.843057239881761e-06, "loss": 0.9628, "mean_token_accuracy": 0.7850354023277759, "num_tokens": 189086807.0, "step": 1780 }, { "entropy": 1.9095087736845016, "epoch": 1.3313123775767186, "grad_norm": 2.53125, "learning_rate": 6.794925593452266e-06, "loss": 0.9283, "mean_token_accuracy": 0.7953442484140396, "num_tokens": 189619674.0, "step": 1785 }, { "entropy": 1.911048673093319, "epoch": 1.3350433728196998, "grad_norm": 2.90625, "learning_rate": 6.746947313843397e-06, "loss": 0.8736, "mean_token_accuracy": 0.798565112799406, "num_tokens": 190149129.0, "step": 1790 }, { "entropy": 1.9093924537301064, "epoch": 1.3387743680626807, "grad_norm": 2.796875, "learning_rate": 6.699124150911515e-06, "loss": 1.0138, "mean_token_accuracy": 0.7696616865694523, "num_tokens": 190679470.0, "step": 1795 }, { "entropy": 1.9110808402299881, "epoch": 1.3425053633056618, "grad_norm": 2.84375, "learning_rate": 6.651457848855589e-06, "loss": 0.9741, "mean_token_accuracy": 0.7712423533201218, "num_tokens": 191209208.0, "step": 1800 }, { "epoch": 1.3425053633056618, "eval_entropy": 1.9213085690951017, "eval_loss": 0.7952644228935242, "eval_mean_token_accuracy": 0.7886214846839553, "eval_num_tokens": 191209208.0, "eval_runtime": 417.0124, "eval_samples_per_second": 2.079, "eval_steps_per_second": 0.52, "step": 1800 }, { "entropy": 1.9260811403393745, "epoch": 1.346236358548643, "grad_norm": 2.625, "learning_rate": 6.603950146153588e-06, "loss": 0.9596, "mean_token_accuracy": 0.7831857018172741, "num_tokens": 191739157.0, "step": 1805 }, { "entropy": 1.9144031763076783, "epoch": 1.349967353791624, "grad_norm": 2.890625, "learning_rate": 6.5566027754990655e-06, "loss": 0.9274, "mean_token_accuracy": 0.7880234897136689, "num_tokens": 192268705.0, "step": 1810 }, { "entropy": 1.909570923447609, "epoch": 1.353698349034605, "grad_norm": 2.546875, "learning_rate": 6.509417463737966e-06, "loss": 0.8996, "mean_token_accuracy": 0.790591161698103, "num_tokens": 192801572.0, "step": 1815 }, { "entropy": 1.9133508145809173, "epoch": 1.357429344277586, "grad_norm": 2.84375, "learning_rate": 6.462395931805653e-06, "loss": 0.9941, "mean_token_accuracy": 0.7782829597592353, "num_tokens": 193334058.0, "step": 1820 }, { "entropy": 1.9052420556545258, "epoch": 1.3611603395205671, "grad_norm": 2.9375, "learning_rate": 6.4155398946641356e-06, "loss": 1.0017, "mean_token_accuracy": 0.7746684297919273, "num_tokens": 193865850.0, "step": 1825 }, { "entropy": 1.913660430908203, "epoch": 1.364891334763548, "grad_norm": 2.9375, "learning_rate": 6.368851061239525e-06, "loss": 1.0345, "mean_token_accuracy": 0.7757304526865483, "num_tokens": 194395523.0, "step": 1830 }, { "entropy": 1.9195617333054542, "epoch": 1.3686223300065292, "grad_norm": 3.109375, "learning_rate": 6.3223311343596986e-06, "loss": 0.9157, "mean_token_accuracy": 0.7950341895222663, "num_tokens": 194927027.0, "step": 1835 }, { "entropy": 1.9108835265040398, "epoch": 1.3723533252495104, "grad_norm": 2.890625, "learning_rate": 6.275981810692211e-06, "loss": 0.962, "mean_token_accuracy": 0.7879531472921372, "num_tokens": 195460614.0, "step": 1840 }, { "entropy": 1.9083492949604988, "epoch": 1.3760843204924913, "grad_norm": 2.890625, "learning_rate": 6.229804780682385e-06, "loss": 0.9744, "mean_token_accuracy": 0.7809002794325351, "num_tokens": 195993086.0, "step": 1845 }, { "entropy": 1.9126764357089996, "epoch": 1.3798153157354724, "grad_norm": 2.78125, "learning_rate": 6.183801728491699e-06, "loss": 0.9072, "mean_token_accuracy": 0.7959237761795521, "num_tokens": 196523248.0, "step": 1850 }, { "entropy": 1.9173283711075784, "epoch": 1.3835463109784536, "grad_norm": 2.921875, "learning_rate": 6.137974331936318e-06, "loss": 0.9121, "mean_token_accuracy": 0.7949255533516407, "num_tokens": 197055644.0, "step": 1855 }, { "entropy": 1.9184476286172867, "epoch": 1.3872773062214345, "grad_norm": 2.796875, "learning_rate": 6.092324262425934e-06, "loss": 0.9758, "mean_token_accuracy": 0.787192802876234, "num_tokens": 197586729.0, "step": 1860 }, { "entropy": 1.9162007600069046, "epoch": 1.3910083014644157, "grad_norm": 2.84375, "learning_rate": 6.046853184902793e-06, "loss": 0.9327, "mean_token_accuracy": 0.7840571783483028, "num_tokens": 198120218.0, "step": 1865 }, { "entropy": 1.9201767519116402, "epoch": 1.3947392967073968, "grad_norm": 2.703125, "learning_rate": 6.001562757780975e-06, "loss": 0.9963, "mean_token_accuracy": 0.7816694289445877, "num_tokens": 198653192.0, "step": 1870 }, { "entropy": 1.91514031291008, "epoch": 1.3984702919503778, "grad_norm": 2.765625, "learning_rate": 5.956454632885894e-06, "loss": 0.9816, "mean_token_accuracy": 0.7758061110973358, "num_tokens": 199183589.0, "step": 1875 }, { "entropy": 1.913975764811039, "epoch": 1.402201287193359, "grad_norm": 2.765625, "learning_rate": 5.911530455394086e-06, "loss": 1.0108, "mean_token_accuracy": 0.7735319837927819, "num_tokens": 199713315.0, "step": 1880 }, { "entropy": 1.9288661509752274, "epoch": 1.4059322824363398, "grad_norm": 2.703125, "learning_rate": 5.866791863773168e-06, "loss": 0.8622, "mean_token_accuracy": 0.8103111371397972, "num_tokens": 200243645.0, "step": 1885 }, { "entropy": 1.926280289888382, "epoch": 1.409663277679321, "grad_norm": 3.0625, "learning_rate": 5.822240489722104e-06, "loss": 0.8932, "mean_token_accuracy": 0.8012527912855149, "num_tokens": 200773847.0, "step": 1890 }, { "entropy": 1.9145182326436043, "epoch": 1.413394272922302, "grad_norm": 2.84375, "learning_rate": 5.777877958111694e-06, "loss": 0.9799, "mean_token_accuracy": 0.778388562053442, "num_tokens": 201308141.0, "step": 1895 }, { "entropy": 1.9145977690815925, "epoch": 1.417125268165283, "grad_norm": 2.6875, "learning_rate": 5.733705886925296e-06, "loss": 0.9105, "mean_token_accuracy": 0.7903211638331413, "num_tokens": 201839018.0, "step": 1900 }, { "epoch": 1.417125268165283, "eval_entropy": 1.9226924159010435, "eval_loss": 0.7947914004325867, "eval_mean_token_accuracy": 0.7886583176076687, "eval_num_tokens": 201839018.0, "eval_runtime": 415.0444, "eval_samples_per_second": 2.089, "eval_steps_per_second": 0.523, "step": 1900 }, { "entropy": 1.9117732971906662, "epoch": 1.4208562634082642, "grad_norm": 2.453125, "learning_rate": 5.689725887199833e-06, "loss": 0.9656, "mean_token_accuracy": 0.787474986165762, "num_tokens": 202368336.0, "step": 1905 }, { "entropy": 1.919792464375496, "epoch": 1.4245872586512451, "grad_norm": 2.859375, "learning_rate": 5.645939562967026e-06, "loss": 0.9176, "mean_token_accuracy": 0.7953712411224843, "num_tokens": 202899920.0, "step": 1910 }, { "entropy": 1.9164147719740867, "epoch": 1.4283182538942263, "grad_norm": 2.984375, "learning_rate": 5.602348511194882e-06, "loss": 0.9731, "mean_token_accuracy": 0.783729862421751, "num_tokens": 203431781.0, "step": 1915 }, { "entropy": 1.9064750477671624, "epoch": 1.4320492491372074, "grad_norm": 2.953125, "learning_rate": 5.558954321729477e-06, "loss": 0.9373, "mean_token_accuracy": 0.7890445157885552, "num_tokens": 203963211.0, "step": 1920 }, { "entropy": 1.9172628924250603, "epoch": 1.4357802443801884, "grad_norm": 3.015625, "learning_rate": 5.515758577236951e-06, "loss": 0.9597, "mean_token_accuracy": 0.7818316780030727, "num_tokens": 204493102.0, "step": 1925 }, { "entropy": 1.9186809375882148, "epoch": 1.4395112396231695, "grad_norm": 3.296875, "learning_rate": 5.472762853145785e-06, "loss": 0.983, "mean_token_accuracy": 0.77845314219594, "num_tokens": 205022866.0, "step": 1930 }, { "entropy": 1.907158075273037, "epoch": 1.4432422348661507, "grad_norm": 2.84375, "learning_rate": 5.429968717589349e-06, "loss": 1.0143, "mean_token_accuracy": 0.7621226042509079, "num_tokens": 205555650.0, "step": 1935 }, { "entropy": 1.9154852628707886, "epoch": 1.4469732301091316, "grad_norm": 2.9375, "learning_rate": 5.387377731348715e-06, "loss": 0.9273, "mean_token_accuracy": 0.7915920428931713, "num_tokens": 206089656.0, "step": 1940 }, { "entropy": 1.9256163939833641, "epoch": 1.4507042253521127, "grad_norm": 2.796875, "learning_rate": 5.344991447795709e-06, "loss": 0.9501, "mean_token_accuracy": 0.7945499166846275, "num_tokens": 206622120.0, "step": 1945 }, { "entropy": 1.9029963329434394, "epoch": 1.4544352205950937, "grad_norm": 2.53125, "learning_rate": 5.302811412836285e-06, "loss": 1.0196, "mean_token_accuracy": 0.7709416255354882, "num_tokens": 207155611.0, "step": 1950 }, { "entropy": 1.917876623570919, "epoch": 1.4581662158380748, "grad_norm": 2.578125, "learning_rate": 5.2608391648541276e-06, "loss": 0.9417, "mean_token_accuracy": 0.7857843562960625, "num_tokens": 207686473.0, "step": 1955 }, { "entropy": 1.9248770505189896, "epoch": 1.4618972110810557, "grad_norm": 3.140625, "learning_rate": 5.219076234654545e-06, "loss": 0.9953, "mean_token_accuracy": 0.7756866917014122, "num_tokens": 208217822.0, "step": 1960 }, { "entropy": 1.9229801774024964, "epoch": 1.465628206324037, "grad_norm": 2.984375, "learning_rate": 5.177524145408642e-06, "loss": 0.9483, "mean_token_accuracy": 0.7806045912206173, "num_tokens": 208747962.0, "step": 1965 }, { "entropy": 1.9025382086634637, "epoch": 1.469359201567018, "grad_norm": 2.8125, "learning_rate": 5.136184412597767e-06, "loss": 0.9272, "mean_token_accuracy": 0.7886285282671451, "num_tokens": 209278371.0, "step": 1970 }, { "entropy": 1.915241926908493, "epoch": 1.473090196809999, "grad_norm": 2.90625, "learning_rate": 5.0950585439582244e-06, "loss": 0.9676, "mean_token_accuracy": 0.7825336590409279, "num_tokens": 209810564.0, "step": 1975 }, { "entropy": 1.9186094522476196, "epoch": 1.4768211920529801, "grad_norm": 2.78125, "learning_rate": 5.054148039426314e-06, "loss": 0.979, "mean_token_accuracy": 0.7762361623346805, "num_tokens": 210340000.0, "step": 1980 }, { "entropy": 1.917215932905674, "epoch": 1.4805521872959613, "grad_norm": 2.734375, "learning_rate": 5.013454391083603e-06, "loss": 1.0244, "mean_token_accuracy": 0.76511832177639, "num_tokens": 210871442.0, "step": 1985 }, { "entropy": 1.9191341713070869, "epoch": 1.4842831825389422, "grad_norm": 2.890625, "learning_rate": 4.972979083102512e-06, "loss": 0.9806, "mean_token_accuracy": 0.780594726651907, "num_tokens": 211401718.0, "step": 1990 }, { "entropy": 1.9123217433691024, "epoch": 1.4880141777819234, "grad_norm": 3.265625, "learning_rate": 4.9327235916921885e-06, "loss": 1.0062, "mean_token_accuracy": 0.776950541883707, "num_tokens": 211933026.0, "step": 1995 }, { "entropy": 1.9251987099647523, "epoch": 1.4917451730249045, "grad_norm": 2.71875, "learning_rate": 4.892689385044664e-06, "loss": 0.958, "mean_token_accuracy": 0.7888118654489518, "num_tokens": 212463969.0, "step": 2000 }, { "epoch": 1.4917451730249045, "eval_entropy": 1.923713410504952, "eval_loss": 0.7941039800643921, "eval_mean_token_accuracy": 0.7887901699488065, "eval_num_tokens": 212463969.0, "eval_runtime": 423.4613, "eval_samples_per_second": 2.047, "eval_steps_per_second": 0.512, "step": 2000 }, { "entropy": 1.9207520872354507, "epoch": 1.4954761682678854, "grad_norm": 2.90625, "learning_rate": 4.852877923281301e-06, "loss": 0.8589, "mean_token_accuracy": 0.7933267027139663, "num_tokens": 212994560.0, "step": 2005 }, { "entropy": 1.9220597296953201, "epoch": 1.4992071635108666, "grad_norm": 3.828125, "learning_rate": 4.81329065839955e-06, "loss": 0.9453, "mean_token_accuracy": 0.7886260092258454, "num_tokens": 213525059.0, "step": 2010 }, { "entropy": 1.926292322576046, "epoch": 1.5029381587538477, "grad_norm": 2.9375, "learning_rate": 4.773929034219989e-06, "loss": 0.924, "mean_token_accuracy": 0.7947903670370579, "num_tokens": 214056456.0, "step": 2015 }, { "entropy": 1.9207703724503518, "epoch": 1.5066691539968287, "grad_norm": 3.0, "learning_rate": 4.734794486333661e-06, "loss": 0.996, "mean_token_accuracy": 0.7710816271603107, "num_tokens": 214589110.0, "step": 2020 }, { "entropy": 1.9281767308712006, "epoch": 1.5104001492398096, "grad_norm": 3.015625, "learning_rate": 4.6958884420497196e-06, "loss": 0.936, "mean_token_accuracy": 0.7833106748759746, "num_tokens": 215118032.0, "step": 2025 }, { "entropy": 1.92151879966259, "epoch": 1.5141311444827907, "grad_norm": 2.96875, "learning_rate": 4.657212320343369e-06, "loss": 0.9275, "mean_token_accuracy": 0.7913325764238834, "num_tokens": 215650747.0, "step": 2030 }, { "entropy": 1.9185984879732132, "epoch": 1.5178621397257719, "grad_norm": 2.890625, "learning_rate": 4.6187675318041085e-06, "loss": 1.03, "mean_token_accuracy": 0.7779677748680115, "num_tokens": 216181946.0, "step": 2035 }, { "entropy": 1.9180883452296258, "epoch": 1.5215931349687528, "grad_norm": 3.15625, "learning_rate": 4.580555478584295e-06, "loss": 0.959, "mean_token_accuracy": 0.7815264448523521, "num_tokens": 216712813.0, "step": 2040 }, { "entropy": 1.9216745108366013, "epoch": 1.525324130211734, "grad_norm": 2.84375, "learning_rate": 4.542577554348007e-06, "loss": 0.9013, "mean_token_accuracy": 0.7971072472631932, "num_tokens": 217242416.0, "step": 2045 }, { "entropy": 1.9199370354413987, "epoch": 1.5290551254547151, "grad_norm": 2.625, "learning_rate": 4.50483514422019e-06, "loss": 0.9607, "mean_token_accuracy": 0.7842399850487709, "num_tokens": 217774340.0, "step": 2050 }, { "entropy": 1.9149819746613503, "epoch": 1.532786120697696, "grad_norm": 2.65625, "learning_rate": 4.467329624736165e-06, "loss": 0.9555, "mean_token_accuracy": 0.7854515843093395, "num_tokens": 218305918.0, "step": 2055 }, { "entropy": 1.9093442946672439, "epoch": 1.5365171159406772, "grad_norm": 2.84375, "learning_rate": 4.4300623637914186e-06, "loss": 0.9692, "mean_token_accuracy": 0.7805149331688881, "num_tokens": 218838626.0, "step": 2060 }, { "entropy": 1.9209594249725341, "epoch": 1.5402481111836583, "grad_norm": 2.828125, "learning_rate": 4.393034720591696e-06, "loss": 1.005, "mean_token_accuracy": 0.7798107601702213, "num_tokens": 219368785.0, "step": 2065 }, { "entropy": 1.919579316675663, "epoch": 1.5439791064266393, "grad_norm": 2.671875, "learning_rate": 4.356248045603453e-06, "loss": 0.9529, "mean_token_accuracy": 0.781716576218605, "num_tokens": 219899756.0, "step": 2070 }, { "entropy": 1.922955933213234, "epoch": 1.5477101016696204, "grad_norm": 2.53125, "learning_rate": 4.319703680504589e-06, "loss": 0.9333, "mean_token_accuracy": 0.7829143904149533, "num_tokens": 220433320.0, "step": 2075 }, { "entropy": 1.9164094612002374, "epoch": 1.5514410969126016, "grad_norm": 2.46875, "learning_rate": 4.283402958135506e-06, "loss": 1.0076, "mean_token_accuracy": 0.7693854115903378, "num_tokens": 220963973.0, "step": 2080 }, { "entropy": 1.9213256657123565, "epoch": 1.5551720921555825, "grad_norm": 2.9375, "learning_rate": 4.247347202450514e-06, "loss": 0.9796, "mean_token_accuracy": 0.7753906600177288, "num_tokens": 221493799.0, "step": 2085 }, { "entropy": 1.914214849472046, "epoch": 1.5589030873985634, "grad_norm": 2.625, "learning_rate": 4.211537728469538e-06, "loss": 0.923, "mean_token_accuracy": 0.7858444310724735, "num_tokens": 222025516.0, "step": 2090 }, { "entropy": 1.9236473813652992, "epoch": 1.5626340826415448, "grad_norm": 2.765625, "learning_rate": 4.175975842230144e-06, "loss": 0.9964, "mean_token_accuracy": 0.7722836993634701, "num_tokens": 222555997.0, "step": 2095 }, { "entropy": 1.9193670630455018, "epoch": 1.5663650778845257, "grad_norm": 2.875, "learning_rate": 4.140662840739929e-06, "loss": 1.0129, "mean_token_accuracy": 0.777188754081726, "num_tokens": 223086481.0, "step": 2100 }, { "epoch": 1.5663650778845257, "eval_entropy": 1.9263520954940725, "eval_loss": 0.7939472198486328, "eval_mean_token_accuracy": 0.789093039002836, "eval_num_tokens": 223086481.0, "eval_runtime": 411.5243, "eval_samples_per_second": 2.107, "eval_steps_per_second": 0.527, "step": 2100 }, { "entropy": 1.9265794411301613, "epoch": 1.5700960731275067, "grad_norm": 2.484375, "learning_rate": 4.1056000119292e-06, "loss": 0.9907, "mean_token_accuracy": 0.7721577517688274, "num_tokens": 223616881.0, "step": 2105 }, { "entropy": 1.9221562936902046, "epoch": 1.5738270683704878, "grad_norm": 2.84375, "learning_rate": 4.070788634603996e-06, "loss": 0.9787, "mean_token_accuracy": 0.7774898521602154, "num_tokens": 224149559.0, "step": 2110 }, { "entropy": 1.9217603102326393, "epoch": 1.577558063613469, "grad_norm": 2.75, "learning_rate": 4.036229978399471e-06, "loss": 0.9185, "mean_token_accuracy": 0.7944407314062119, "num_tokens": 224678853.0, "step": 2115 }, { "entropy": 1.9181690692901612, "epoch": 1.5812890588564499, "grad_norm": 3.203125, "learning_rate": 4.001925303733564e-06, "loss": 0.9609, "mean_token_accuracy": 0.7815622113645077, "num_tokens": 225211439.0, "step": 2120 }, { "entropy": 1.923632562160492, "epoch": 1.585020054099431, "grad_norm": 2.9375, "learning_rate": 3.967875861761037e-06, "loss": 0.9365, "mean_token_accuracy": 0.7866649009287358, "num_tokens": 225742793.0, "step": 2125 }, { "entropy": 1.919877180457115, "epoch": 1.5887510493424122, "grad_norm": 2.890625, "learning_rate": 3.934082894327855e-06, "loss": 0.9416, "mean_token_accuracy": 0.7854848735034465, "num_tokens": 226273252.0, "step": 2130 }, { "entropy": 1.9240162417292594, "epoch": 1.592482044585393, "grad_norm": 2.53125, "learning_rate": 3.9005476339258795e-06, "loss": 0.9857, "mean_token_accuracy": 0.7773564159870148, "num_tokens": 226806159.0, "step": 2135 }, { "entropy": 1.9207270756363868, "epoch": 1.5962130398283743, "grad_norm": 3.0, "learning_rate": 3.867271303647912e-06, "loss": 0.9065, "mean_token_accuracy": 0.7894360311329365, "num_tokens": 227335162.0, "step": 2140 }, { "entropy": 1.925444918870926, "epoch": 1.5999440350713554, "grad_norm": 2.90625, "learning_rate": 3.834255117143105e-06, "loss": 0.9146, "mean_token_accuracy": 0.7939443275332451, "num_tokens": 227866960.0, "step": 2145 }, { "entropy": 1.920570534467697, "epoch": 1.6036750303143363, "grad_norm": 3.140625, "learning_rate": 3.8015002785726852e-06, "loss": 0.9451, "mean_token_accuracy": 0.7859753273427487, "num_tokens": 228397933.0, "step": 2150 }, { "entropy": 1.9137139230966569, "epoch": 1.6074060255573173, "grad_norm": 2.84375, "learning_rate": 3.7690079825660367e-06, "loss": 0.9853, "mean_token_accuracy": 0.7792695969343185, "num_tokens": 228932759.0, "step": 2155 }, { "entropy": 1.9213504776358605, "epoch": 1.6111370208002986, "grad_norm": 2.734375, "learning_rate": 3.736779414177133e-06, "loss": 0.8977, "mean_token_accuracy": 0.7924081102013588, "num_tokens": 229464578.0, "step": 2160 }, { "entropy": 1.9174690261483192, "epoch": 1.6148680160432796, "grad_norm": 2.671875, "learning_rate": 3.7048157488413185e-06, "loss": 1.0169, "mean_token_accuracy": 0.7713649921119213, "num_tokens": 229995198.0, "step": 2165 }, { "entropy": 1.9215390980243683, "epoch": 1.6185990112862605, "grad_norm": 3.1875, "learning_rate": 3.6731181523324234e-06, "loss": 0.85, "mean_token_accuracy": 0.8087228052318096, "num_tokens": 230526460.0, "step": 2170 }, { "entropy": 1.924245499074459, "epoch": 1.6223300065292416, "grad_norm": 2.75, "learning_rate": 3.6416877807202676e-06, "loss": 0.9276, "mean_token_accuracy": 0.788924976438284, "num_tokens": 231057436.0, "step": 2175 }, { "entropy": 1.9228363931179047, "epoch": 1.6260610017722228, "grad_norm": 3.171875, "learning_rate": 3.6105257803284844e-06, "loss": 0.9772, "mean_token_accuracy": 0.7773720294237136, "num_tokens": 231589551.0, "step": 2180 }, { "entropy": 1.9220960274338723, "epoch": 1.6297919970152037, "grad_norm": 2.59375, "learning_rate": 3.579633287692711e-06, "loss": 0.947, "mean_token_accuracy": 0.7894782021641731, "num_tokens": 232121607.0, "step": 2185 }, { "entropy": 1.9228823781013489, "epoch": 1.6335229922581849, "grad_norm": 2.890625, "learning_rate": 3.549011429519139e-06, "loss": 0.8958, "mean_token_accuracy": 0.7975842222571373, "num_tokens": 232653590.0, "step": 2190 }, { "entropy": 1.9289898470044136, "epoch": 1.637253987501166, "grad_norm": 2.890625, "learning_rate": 3.5186613226434286e-06, "loss": 0.9415, "mean_token_accuracy": 0.7874727994203568, "num_tokens": 233184465.0, "step": 2195 }, { "entropy": 1.9162527233362199, "epoch": 1.640984982744147, "grad_norm": 2.6875, "learning_rate": 3.488584073989958e-06, "loss": 0.981, "mean_token_accuracy": 0.7749945864081382, "num_tokens": 233716486.0, "step": 2200 }, { "epoch": 1.640984982744147, "eval_entropy": 1.922694647916451, "eval_loss": 0.7940070033073425, "eval_mean_token_accuracy": 0.7886112252688078, "eval_num_tokens": 233716486.0, "eval_runtime": 452.226, "eval_samples_per_second": 1.917, "eval_steps_per_second": 0.48, "step": 2200 }, { "entropy": 1.9192370921373367, "epoch": 1.644715977987128, "grad_norm": 2.84375, "learning_rate": 3.4587807805314695e-06, "loss": 0.9459, "mean_token_accuracy": 0.7845867499709129, "num_tokens": 234245589.0, "step": 2205 }, { "entropy": 1.9262465044856072, "epoch": 1.6484469732301092, "grad_norm": 2.875, "learning_rate": 3.4292525292490544e-06, "loss": 0.9529, "mean_token_accuracy": 0.7888288274407387, "num_tokens": 234776020.0, "step": 2210 }, { "entropy": 1.9103346571326256, "epoch": 1.6521779684730902, "grad_norm": 2.671875, "learning_rate": 3.4000003970925076e-06, "loss": 0.9747, "mean_token_accuracy": 0.7737406581640244, "num_tokens": 235307381.0, "step": 2215 }, { "entropy": 1.9194400265812874, "epoch": 1.655908963716071, "grad_norm": 3.0625, "learning_rate": 3.371025450941049e-06, "loss": 0.985, "mean_token_accuracy": 0.773327323794365, "num_tokens": 235835901.0, "step": 2220 }, { "entropy": 1.9171280801296233, "epoch": 1.6596399589590525, "grad_norm": 2.6875, "learning_rate": 3.342328747564416e-06, "loss": 0.9619, "mean_token_accuracy": 0.790045228600502, "num_tokens": 236366996.0, "step": 2225 }, { "entropy": 1.9132253170013427, "epoch": 1.6633709542020334, "grad_norm": 2.6875, "learning_rate": 3.313911333584313e-06, "loss": 1.0102, "mean_token_accuracy": 0.7748994067311287, "num_tokens": 236899548.0, "step": 2230 }, { "entropy": 1.918226708471775, "epoch": 1.6671019494450143, "grad_norm": 2.890625, "learning_rate": 3.2857742454362516e-06, "loss": 0.9767, "mean_token_accuracy": 0.7792360559105873, "num_tokens": 237431409.0, "step": 2235 }, { "entropy": 1.9195396587252618, "epoch": 1.6708329446879955, "grad_norm": 2.78125, "learning_rate": 3.257918509331741e-06, "loss": 0.9301, "mean_token_accuracy": 0.7851594485342502, "num_tokens": 237963649.0, "step": 2240 }, { "entropy": 1.912094420194626, "epoch": 1.6745639399309766, "grad_norm": 2.953125, "learning_rate": 3.2303451412208604e-06, "loss": 0.9567, "mean_token_accuracy": 0.7778088934719563, "num_tokens": 238496392.0, "step": 2245 }, { "entropy": 1.9135541930794715, "epoch": 1.6782949351739576, "grad_norm": 2.546875, "learning_rate": 3.2030551467552087e-06, "loss": 1.0216, "mean_token_accuracy": 0.764666422456503, "num_tokens": 239028705.0, "step": 2250 }, { "entropy": 1.9229572668671608, "epoch": 1.6820259304169387, "grad_norm": 2.65625, "learning_rate": 3.176049521251229e-06, "loss": 1.0217, "mean_token_accuracy": 0.7704686284065246, "num_tokens": 239559619.0, "step": 2255 }, { "entropy": 1.9300857797265052, "epoch": 1.6857569256599199, "grad_norm": 2.78125, "learning_rate": 3.149329249653895e-06, "loss": 0.9225, "mean_token_accuracy": 0.8011655747890473, "num_tokens": 240092059.0, "step": 2260 }, { "entropy": 1.9296444818377494, "epoch": 1.6894879209029008, "grad_norm": 2.609375, "learning_rate": 3.1228953065008035e-06, "loss": 0.9052, "mean_token_accuracy": 0.7951169416308403, "num_tokens": 240622703.0, "step": 2265 }, { "entropy": 1.9241193488240242, "epoch": 1.693218916145882, "grad_norm": 2.75, "learning_rate": 3.0967486558866257e-06, "loss": 0.8544, "mean_token_accuracy": 0.8072010114789009, "num_tokens": 241152633.0, "step": 2270 }, { "entropy": 1.9230607852339745, "epoch": 1.696949911388863, "grad_norm": 3.59375, "learning_rate": 3.0708902514279405e-06, "loss": 0.9734, "mean_token_accuracy": 0.7778616219758987, "num_tokens": 241684053.0, "step": 2275 }, { "entropy": 1.9290795788168906, "epoch": 1.700680906631844, "grad_norm": 2.6875, "learning_rate": 3.045321036228458e-06, "loss": 0.8386, "mean_token_accuracy": 0.8009327456355095, "num_tokens": 242214854.0, "step": 2280 }, { "entropy": 1.9164558365941047, "epoch": 1.7044119018748252, "grad_norm": 2.65625, "learning_rate": 3.0200419428446276e-06, "loss": 0.9529, "mean_token_accuracy": 0.7778381988406181, "num_tokens": 242746380.0, "step": 2285 }, { "entropy": 1.9153956204652787, "epoch": 1.7081428971178063, "grad_norm": 2.8125, "learning_rate": 2.9950538932516093e-06, "loss": 0.9705, "mean_token_accuracy": 0.7820813551545143, "num_tokens": 243277935.0, "step": 2290 }, { "entropy": 1.9179982468485832, "epoch": 1.7118738923607872, "grad_norm": 2.84375, "learning_rate": 2.9703577988096754e-06, "loss": 0.9666, "mean_token_accuracy": 0.7841740600764752, "num_tokens": 243809259.0, "step": 2295 }, { "entropy": 1.9282338470220566, "epoch": 1.7156048876037682, "grad_norm": 2.953125, "learning_rate": 2.9459545602309403e-06, "loss": 0.9468, "mean_token_accuracy": 0.7934271112084389, "num_tokens": 244338575.0, "step": 2300 }, { "epoch": 1.7156048876037682, "eval_entropy": 1.9274940507203204, "eval_loss": 0.7935892343521118, "eval_mean_token_accuracy": 0.7888673678521188, "eval_num_tokens": 244338575.0, "eval_runtime": 408.8193, "eval_samples_per_second": 2.121, "eval_steps_per_second": 0.531, "step": 2300 }, { "entropy": 1.9255469009280204, "epoch": 1.7193358828467493, "grad_norm": 2.796875, "learning_rate": 2.9218450675465326e-06, "loss": 0.9493, "mean_token_accuracy": 0.7921744890511035, "num_tokens": 244868407.0, "step": 2305 }, { "entropy": 1.9082586839795113, "epoch": 1.7230668780897305, "grad_norm": 2.640625, "learning_rate": 2.8980302000741267e-06, "loss": 0.9779, "mean_token_accuracy": 0.7783657617866993, "num_tokens": 245398671.0, "step": 2310 }, { "entropy": 1.9125275507569313, "epoch": 1.7267978733327114, "grad_norm": 2.828125, "learning_rate": 2.87451082638587e-06, "loss": 0.9251, "mean_token_accuracy": 0.7951716847717762, "num_tokens": 245930302.0, "step": 2315 }, { "entropy": 1.921560037136078, "epoch": 1.7305288685756925, "grad_norm": 3.09375, "learning_rate": 2.8512878042767027e-06, "loss": 0.9052, "mean_token_accuracy": 0.7911626607179642, "num_tokens": 246462728.0, "step": 2320 }, { "entropy": 1.926774376630783, "epoch": 1.7342598638186737, "grad_norm": 2.5625, "learning_rate": 2.8283619807330855e-06, "loss": 0.8988, "mean_token_accuracy": 0.7861256495118141, "num_tokens": 246992816.0, "step": 2325 }, { "entropy": 1.9165738061070443, "epoch": 1.7379908590616546, "grad_norm": 3.0625, "learning_rate": 2.8057341919020906e-06, "loss": 0.9964, "mean_token_accuracy": 0.7701143123209476, "num_tokens": 247526253.0, "step": 2330 }, { "entropy": 1.913632895052433, "epoch": 1.7417218543046358, "grad_norm": 2.703125, "learning_rate": 2.783405263060918e-06, "loss": 1.0172, "mean_token_accuracy": 0.7685214690864086, "num_tokens": 248057413.0, "step": 2335 }, { "entropy": 1.9237369418144226, "epoch": 1.745452849547617, "grad_norm": 3.09375, "learning_rate": 2.7613760085867924e-06, "loss": 0.9882, "mean_token_accuracy": 0.7725242026150226, "num_tokens": 248587926.0, "step": 2340 }, { "entropy": 1.9254556432366372, "epoch": 1.7491838447905979, "grad_norm": 2.84375, "learning_rate": 2.739647231927264e-06, "loss": 0.9002, "mean_token_accuracy": 0.7950693652033806, "num_tokens": 249118942.0, "step": 2345 }, { "entropy": 1.915513914823532, "epoch": 1.752914840033579, "grad_norm": 3.15625, "learning_rate": 2.7182197255708942e-06, "loss": 0.976, "mean_token_accuracy": 0.7857939094305039, "num_tokens": 249649163.0, "step": 2350 }, { "entropy": 1.9156290039420127, "epoch": 1.7566458352765602, "grad_norm": 2.921875, "learning_rate": 2.697094271018374e-06, "loss": 0.9748, "mean_token_accuracy": 0.7830391086637973, "num_tokens": 250181840.0, "step": 2355 }, { "entropy": 1.9220938324928283, "epoch": 1.760376830519541, "grad_norm": 2.71875, "learning_rate": 2.6762716387539956e-06, "loss": 0.9367, "mean_token_accuracy": 0.7896655574440956, "num_tokens": 250712561.0, "step": 2360 }, { "entropy": 1.9115706831216812, "epoch": 1.764107825762522, "grad_norm": 2.609375, "learning_rate": 2.65575258821757e-06, "loss": 0.9094, "mean_token_accuracy": 0.7902532637119293, "num_tokens": 251244090.0, "step": 2365 }, { "entropy": 1.9223439544439316, "epoch": 1.7678388210055034, "grad_norm": 3.1875, "learning_rate": 2.6355378677767223e-06, "loss": 0.9001, "mean_token_accuracy": 0.7949155509471894, "num_tokens": 251775524.0, "step": 2370 }, { "entropy": 1.9269421100616455, "epoch": 1.7715698162484843, "grad_norm": 2.65625, "learning_rate": 2.6156282146995955e-06, "loss": 0.9287, "mean_token_accuracy": 0.795907324552536, "num_tokens": 252304609.0, "step": 2375 }, { "entropy": 1.9298863634467125, "epoch": 1.7753008114914652, "grad_norm": 3.1875, "learning_rate": 2.5960243551279652e-06, "loss": 0.884, "mean_token_accuracy": 0.7974181033670902, "num_tokens": 252835385.0, "step": 2380 }, { "entropy": 1.9196127310395241, "epoch": 1.7790318067344464, "grad_norm": 2.734375, "learning_rate": 2.576727004050754e-06, "loss": 0.8886, "mean_token_accuracy": 0.7969051510095596, "num_tokens": 253367813.0, "step": 2385 }, { "entropy": 1.9148397564888, "epoch": 1.7827628019774275, "grad_norm": 3.4375, "learning_rate": 2.5577368652779495e-06, "loss": 0.9919, "mean_token_accuracy": 0.7807718679308892, "num_tokens": 253899925.0, "step": 2390 }, { "entropy": 1.92100038677454, "epoch": 1.7864937972204085, "grad_norm": 2.90625, "learning_rate": 2.539054631414946e-06, "loss": 0.9312, "mean_token_accuracy": 0.7850274242460727, "num_tokens": 254428726.0, "step": 2395 }, { "entropy": 1.9288868740200997, "epoch": 1.7902247924633896, "grad_norm": 3.0, "learning_rate": 2.5206809838372747e-06, "loss": 0.954, "mean_token_accuracy": 0.7834418945014476, "num_tokens": 254958402.0, "step": 2400 }, { "epoch": 1.7902247924633896, "eval_entropy": 1.9260941276901877, "eval_loss": 0.7932396531105042, "eval_mean_token_accuracy": 0.7890994557037881, "eval_num_tokens": 254958402.0, "eval_runtime": 416.7399, "eval_samples_per_second": 2.08, "eval_steps_per_second": 0.521, "step": 2400 }, { "entropy": 1.9224698156118394, "epoch": 1.7939557877063708, "grad_norm": 2.8125, "learning_rate": 2.502616592665756e-06, "loss": 1.0307, "mean_token_accuracy": 0.7735580526292324, "num_tokens": 255491551.0, "step": 2405 }, { "entropy": 1.9186719000339507, "epoch": 1.7976867829493517, "grad_norm": 2.90625, "learning_rate": 2.48486211674206e-06, "loss": 0.9552, "mean_token_accuracy": 0.7844504326581955, "num_tokens": 256022334.0, "step": 2410 }, { "entropy": 1.928916445374489, "epoch": 1.8014177781923328, "grad_norm": 2.78125, "learning_rate": 2.4674182036046756e-06, "loss": 0.856, "mean_token_accuracy": 0.8019163817167282, "num_tokens": 256554715.0, "step": 2415 }, { "entropy": 1.920437340438366, "epoch": 1.805148773435314, "grad_norm": 2.875, "learning_rate": 2.45028548946529e-06, "loss": 0.9485, "mean_token_accuracy": 0.7907397754490375, "num_tokens": 257087212.0, "step": 2420 }, { "entropy": 1.9137002557516098, "epoch": 1.808879768678295, "grad_norm": 2.796875, "learning_rate": 2.4334645991855948e-06, "loss": 0.9036, "mean_token_accuracy": 0.7897277273237705, "num_tokens": 257619612.0, "step": 2425 }, { "entropy": 1.9229405611753463, "epoch": 1.8126107639212758, "grad_norm": 2.9375, "learning_rate": 2.4169561462544875e-06, "loss": 0.9088, "mean_token_accuracy": 0.7894883915781975, "num_tokens": 258151703.0, "step": 2430 }, { "entropy": 1.921665945649147, "epoch": 1.8163417591642572, "grad_norm": 3.109375, "learning_rate": 2.400760732765699e-06, "loss": 0.9314, "mean_token_accuracy": 0.7824103131890296, "num_tokens": 258683838.0, "step": 2435 }, { "entropy": 1.9171794831752778, "epoch": 1.8200727544072381, "grad_norm": 2.796875, "learning_rate": 2.384878949395834e-06, "loss": 1.0209, "mean_token_accuracy": 0.7705119363963604, "num_tokens": 259215447.0, "step": 2440 }, { "entropy": 1.9318658068776131, "epoch": 1.823803749650219, "grad_norm": 2.859375, "learning_rate": 2.3693113753828303e-06, "loss": 0.9432, "mean_token_accuracy": 0.790088453143835, "num_tokens": 259746239.0, "step": 2445 }, { "entropy": 1.9272696867585182, "epoch": 1.8275347448932002, "grad_norm": 2.78125, "learning_rate": 2.354058578504828e-06, "loss": 0.9821, "mean_token_accuracy": 0.7820963867008686, "num_tokens": 260276474.0, "step": 2450 }, { "entropy": 1.9249881833791733, "epoch": 1.8312657401361814, "grad_norm": 2.625, "learning_rate": 2.3391211150594658e-06, "loss": 0.9845, "mean_token_accuracy": 0.7766280405223369, "num_tokens": 260807881.0, "step": 2455 }, { "entropy": 1.9197756513953208, "epoch": 1.8349967353791623, "grad_norm": 2.875, "learning_rate": 2.324499529843591e-06, "loss": 1.0046, "mean_token_accuracy": 0.7696422345936298, "num_tokens": 261338350.0, "step": 2460 }, { "entropy": 1.9283653423190117, "epoch": 1.8387277306221435, "grad_norm": 3.078125, "learning_rate": 2.310194356133389e-06, "loss": 0.9167, "mean_token_accuracy": 0.7916576892137528, "num_tokens": 261871446.0, "step": 2465 }, { "entropy": 1.924713484942913, "epoch": 1.8424587258651246, "grad_norm": 2.9375, "learning_rate": 2.296206115664932e-06, "loss": 0.9699, "mean_token_accuracy": 0.780182758718729, "num_tokens": 262404727.0, "step": 2470 }, { "entropy": 1.9232066422700882, "epoch": 1.8461897211081055, "grad_norm": 2.796875, "learning_rate": 2.282535318615156e-06, "loss": 0.959, "mean_token_accuracy": 0.7938723795115947, "num_tokens": 262937258.0, "step": 2475 }, { "entropy": 1.9326208502054214, "epoch": 1.8499207163510867, "grad_norm": 2.75, "learning_rate": 2.2691824635832447e-06, "loss": 0.9705, "mean_token_accuracy": 0.7897952258586883, "num_tokens": 263467509.0, "step": 2480 }, { "entropy": 1.9184764593839645, "epoch": 1.8536517115940678, "grad_norm": 2.765625, "learning_rate": 2.2561480375724545e-06, "loss": 0.981, "mean_token_accuracy": 0.7857754305005074, "num_tokens": 263998964.0, "step": 2485 }, { "entropy": 1.922873741388321, "epoch": 1.8573827068370488, "grad_norm": 2.625, "learning_rate": 2.2434325159723475e-06, "loss": 0.9843, "mean_token_accuracy": 0.7813891164958477, "num_tokens": 264528676.0, "step": 2490 }, { "entropy": 1.9227689027786254, "epoch": 1.8611137020800297, "grad_norm": 2.6875, "learning_rate": 2.231036362541451e-06, "loss": 1.0119, "mean_token_accuracy": 0.7707612752914429, "num_tokens": 265058995.0, "step": 2495 }, { "entropy": 1.9281268939375877, "epoch": 1.864844697323011, "grad_norm": 2.796875, "learning_rate": 2.218960029390347e-06, "loss": 0.8987, "mean_token_accuracy": 0.7969741009175777, "num_tokens": 265589875.0, "step": 2500 }, { "epoch": 1.864844697323011, "eval_entropy": 1.9268543950973018, "eval_loss": 0.7931507229804993, "eval_mean_token_accuracy": 0.7893393185281534, "eval_num_tokens": 265589875.0, "eval_runtime": 405.8579, "eval_samples_per_second": 2.136, "eval_steps_per_second": 0.535, "step": 2500 }, { "entropy": 1.9152730286121369, "epoch": 1.868575692565992, "grad_norm": 2.6875, "learning_rate": 2.2072039569651845e-06, "loss": 0.968, "mean_token_accuracy": 0.7776608653366566, "num_tokens": 266121755.0, "step": 2505 }, { "entropy": 1.9201575055718423, "epoch": 1.872306687808973, "grad_norm": 3.0625, "learning_rate": 2.1957685740316082e-06, "loss": 0.965, "mean_token_accuracy": 0.7823125168681144, "num_tokens": 266652176.0, "step": 2510 }, { "entropy": 1.9179353535175323, "epoch": 1.876037683051954, "grad_norm": 2.96875, "learning_rate": 2.1846542976591307e-06, "loss": 0.9728, "mean_token_accuracy": 0.7809740528464317, "num_tokens": 267184426.0, "step": 2515 }, { "entropy": 1.925567814707756, "epoch": 1.8797686782949352, "grad_norm": 3.015625, "learning_rate": 2.1738615332059143e-06, "loss": 0.97, "mean_token_accuracy": 0.7776124000549316, "num_tokens": 267714012.0, "step": 2520 }, { "entropy": 1.9293931931257249, "epoch": 1.8834996735379161, "grad_norm": 3.15625, "learning_rate": 2.163390674303985e-06, "loss": 0.9549, "mean_token_accuracy": 0.7903631627559662, "num_tokens": 268243632.0, "step": 2525 }, { "entropy": 1.917325484752655, "epoch": 1.8872306687808973, "grad_norm": 2.75, "learning_rate": 2.153242102844884e-06, "loss": 1.0178, "mean_token_accuracy": 0.7697039887309074, "num_tokens": 268774277.0, "step": 2530 }, { "entropy": 1.9345811545848846, "epoch": 1.8909616640238784, "grad_norm": 2.859375, "learning_rate": 2.1434161889657344e-06, "loss": 0.9233, "mean_token_accuracy": 0.7909011483192444, "num_tokens": 269303608.0, "step": 2535 }, { "entropy": 1.9219995021820069, "epoch": 1.8946926592668594, "grad_norm": 2.640625, "learning_rate": 2.133913291035738e-06, "loss": 0.9395, "mean_token_accuracy": 0.7876539684832096, "num_tokens": 269836167.0, "step": 2540 }, { "entropy": 1.9185631155967713, "epoch": 1.8984236545098405, "grad_norm": 2.78125, "learning_rate": 2.1247337556431135e-06, "loss": 0.9905, "mean_token_accuracy": 0.7754731863737107, "num_tokens": 270367576.0, "step": 2545 }, { "entropy": 1.9295933306217194, "epoch": 1.9021546497528217, "grad_norm": 2.703125, "learning_rate": 2.1158779175824527e-06, "loss": 0.9171, "mean_token_accuracy": 0.7968859523534775, "num_tokens": 270898012.0, "step": 2550 }, { "entropy": 1.917301008105278, "epoch": 1.9058856449958026, "grad_norm": 2.734375, "learning_rate": 2.107346099842501e-06, "loss": 0.965, "mean_token_accuracy": 0.7783635087311268, "num_tokens": 271427122.0, "step": 2555 }, { "entropy": 1.9307295560836792, "epoch": 1.9096166402387837, "grad_norm": 2.875, "learning_rate": 2.099138613594394e-06, "loss": 0.9469, "mean_token_accuracy": 0.7890574164688587, "num_tokens": 271956636.0, "step": 2560 }, { "entropy": 1.9290086671710014, "epoch": 1.913347635481765, "grad_norm": 2.78125, "learning_rate": 2.091255758180293e-06, "loss": 0.9692, "mean_token_accuracy": 0.7894337013363838, "num_tokens": 272486655.0, "step": 2565 }, { "entropy": 1.9271051108837127, "epoch": 1.9170786307247458, "grad_norm": 2.984375, "learning_rate": 2.0836978211024754e-06, "loss": 0.9693, "mean_token_accuracy": 0.7833365269005299, "num_tokens": 273018975.0, "step": 2570 }, { "entropy": 1.9174389943480492, "epoch": 1.9208096259677268, "grad_norm": 2.8125, "learning_rate": 2.0764650780128517e-06, "loss": 0.9338, "mean_token_accuracy": 0.7890181943774224, "num_tokens": 273549264.0, "step": 2575 }, { "entropy": 1.9197008699178695, "epoch": 1.924540621210708, "grad_norm": 3.078125, "learning_rate": 2.069557792702903e-06, "loss": 0.9289, "mean_token_accuracy": 0.791838351637125, "num_tokens": 274080352.0, "step": 2580 }, { "entropy": 1.9107312351465224, "epoch": 1.928271616453689, "grad_norm": 2.5625, "learning_rate": 2.0629762170940656e-06, "loss": 1.0158, "mean_token_accuracy": 0.7829626590013504, "num_tokens": 274611815.0, "step": 2585 }, { "entropy": 1.932517835497856, "epoch": 1.93200261169667, "grad_norm": 2.765625, "learning_rate": 2.056720591228543e-06, "loss": 0.956, "mean_token_accuracy": 0.7831753470003605, "num_tokens": 275141325.0, "step": 2590 }, { "entropy": 1.9160398855805396, "epoch": 1.9357336069396511, "grad_norm": 2.765625, "learning_rate": 2.0507911432605516e-06, "loss": 0.9787, "mean_token_accuracy": 0.778945604711771, "num_tokens": 275673810.0, "step": 2595 }, { "entropy": 1.9250822380185126, "epoch": 1.9394646021826323, "grad_norm": 2.96875, "learning_rate": 2.045188089447994e-06, "loss": 0.9509, "mean_token_accuracy": 0.782188569009304, "num_tokens": 276205721.0, "step": 2600 }, { "epoch": 1.9394646021826323, "eval_entropy": 1.9263563881271994, "eval_loss": 0.7933353781700134, "eval_mean_token_accuracy": 0.7889978152266296, "eval_num_tokens": 276205721.0, "eval_runtime": 416.0362, "eval_samples_per_second": 2.084, "eval_steps_per_second": 0.522, "step": 2600 }, { "entropy": 1.9174866080284119, "epoch": 1.9431955974256132, "grad_norm": 2.796875, "learning_rate": 2.0399116341445807e-06, "loss": 1.006, "mean_token_accuracy": 0.775168327987194, "num_tokens": 276736589.0, "step": 2605 }, { "entropy": 1.9143609598278999, "epoch": 1.9469265926685944, "grad_norm": 2.671875, "learning_rate": 2.0349619697923697e-06, "loss": 1.0056, "mean_token_accuracy": 0.7784153208136558, "num_tokens": 277267929.0, "step": 2610 }, { "entropy": 1.9196015775203705, "epoch": 1.9506575879115755, "grad_norm": 2.703125, "learning_rate": 2.0303392769147507e-06, "loss": 0.9649, "mean_token_accuracy": 0.7793896451592446, "num_tokens": 277800473.0, "step": 2615 }, { "entropy": 1.9250148937106133, "epoch": 1.9543885831545564, "grad_norm": 2.671875, "learning_rate": 2.0260437241098584e-06, "loss": 0.9311, "mean_token_accuracy": 0.7856952168047429, "num_tokens": 278332246.0, "step": 2620 }, { "entropy": 1.9160726517438889, "epoch": 1.9581195783975376, "grad_norm": 2.640625, "learning_rate": 2.0220754680444303e-06, "loss": 1.0071, "mean_token_accuracy": 0.7723523326218128, "num_tokens": 278865988.0, "step": 2625 }, { "entropy": 1.9222462639212607, "epoch": 1.9618505736405187, "grad_norm": 2.90625, "learning_rate": 2.0184346534480832e-06, "loss": 0.9767, "mean_token_accuracy": 0.7792594291269779, "num_tokens": 279397039.0, "step": 2630 }, { "entropy": 1.9240744143724442, "epoch": 1.9655815688834997, "grad_norm": 2.734375, "learning_rate": 2.015121413108043e-06, "loss": 0.9444, "mean_token_accuracy": 0.789904459565878, "num_tokens": 279929289.0, "step": 2635 }, { "entropy": 1.9292493999004363, "epoch": 1.9693125641264806, "grad_norm": 2.890625, "learning_rate": 2.012135867864295e-06, "loss": 0.9963, "mean_token_accuracy": 0.7859659023582936, "num_tokens": 280459438.0, "step": 2640 }, { "entropy": 1.9246867805719376, "epoch": 1.973043559369462, "grad_norm": 2.546875, "learning_rate": 2.009478126605184e-06, "loss": 1.0206, "mean_token_accuracy": 0.7676692776381969, "num_tokens": 280990158.0, "step": 2645 }, { "entropy": 1.917986649274826, "epoch": 1.976774554612443, "grad_norm": 2.921875, "learning_rate": 2.0071482862634343e-06, "loss": 0.8863, "mean_token_accuracy": 0.7945875555276871, "num_tokens": 281521698.0, "step": 2650 }, { "entropy": 1.9185520693659783, "epoch": 1.9805055498554238, "grad_norm": 2.78125, "learning_rate": 2.0051464318126223e-06, "loss": 1.0, "mean_token_accuracy": 0.7771862842142582, "num_tokens": 282052844.0, "step": 2655 }, { "entropy": 1.9054419621825218, "epoch": 1.984236545098405, "grad_norm": 2.984375, "learning_rate": 2.0034726362640712e-06, "loss": 0.9117, "mean_token_accuracy": 0.7958167672157288, "num_tokens": 282582899.0, "step": 2660 }, { "entropy": 1.9172074422240257, "epoch": 1.9879675403413861, "grad_norm": 2.96875, "learning_rate": 2.002126960664194e-06, "loss": 0.9155, "mean_token_accuracy": 0.7893202118575573, "num_tokens": 283116002.0, "step": 2665 }, { "entropy": 1.9196871653199197, "epoch": 1.991698535584367, "grad_norm": 2.859375, "learning_rate": 2.0011094540922605e-06, "loss": 0.9836, "mean_token_accuracy": 0.7831322841346264, "num_tokens": 283646490.0, "step": 2670 }, { "entropy": 1.924867632985115, "epoch": 1.9954295308273482, "grad_norm": 2.75, "learning_rate": 2.000420153658614e-06, "loss": 0.9986, "mean_token_accuracy": 0.7747132256627083, "num_tokens": 284178743.0, "step": 2675 }, { "entropy": 1.9152036324143409, "epoch": 1.9991605260703293, "grad_norm": 2.640625, "learning_rate": 2.000059084503312e-06, "loss": 1.0037, "mean_token_accuracy": 0.7744490541517735, "num_tokens": 284710078.0, "step": 2680 }, { "entropy": 1.9348924954732258, "epoch": 2.0, "mean_token_accuracy": 0.8092464572853513, "num_tokens": 284829214.0, "step": 2682, "total_flos": 4.071862729311437e+17, "train_loss": 1.0256049086316854, "train_runtime": 118640.8071, "train_samples_per_second": 1.446, "train_steps_per_second": 0.023 } ], "logging_steps": 5, "max_steps": 2682, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.071862729311437e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }