anish12's picture
Upload LoRA fine-tuned model after 2 epochs
988c870 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.3211100099108029,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 6.753322017192841,
"epoch": 0.006607201850016518,
"grad_norm": 0.0,
"learning_rate": 9e-07,
"loss": 220.5158,
"mean_token_accuracy": 0.021751992404460907,
"num_tokens": 40960.0,
"step": 10
},
{
"entropy": 6.814661657810211,
"epoch": 0.013214403700033036,
"grad_norm": 0.0,
"learning_rate": 1.9e-06,
"loss": 38.3501,
"mean_token_accuracy": 0.02807121314108372,
"num_tokens": 81920.0,
"step": 20
},
{
"entropy": 6.945201528072357,
"epoch": 0.019821605550049554,
"grad_norm": 0.0,
"learning_rate": 2.9e-06,
"loss": 62.9007,
"mean_token_accuracy": 0.01135626919567585,
"num_tokens": 122880.0,
"step": 30
},
{
"entropy": 7.206499898433686,
"epoch": 0.026428807400066073,
"grad_norm": 0.0,
"learning_rate": 3.9e-06,
"loss": 49.7069,
"mean_token_accuracy": 0.023292785882949828,
"num_tokens": 163840.0,
"step": 40
},
{
"entropy": 2.605212825164199,
"epoch": 0.03303600925008259,
"grad_norm": 0.0,
"learning_rate": 4.9000000000000005e-06,
"loss": 214.3199,
"mean_token_accuracy": 0.021621854975819588,
"num_tokens": 204800.0,
"step": 50
},
{
"entropy": 0.06497852916363626,
"epoch": 0.03964321110009911,
"grad_norm": 0.0,
"learning_rate": 5.9e-06,
"loss": 11.6628,
"mean_token_accuracy": 0.043425589054822925,
"num_tokens": 245760.0,
"step": 60
},
{
"entropy": 0.021584587066899984,
"epoch": 0.04625041295011562,
"grad_norm": 19.475257873535156,
"learning_rate": 6.900000000000001e-06,
"loss": 4.9491,
"mean_token_accuracy": 0.039490992575883864,
"num_tokens": 286720.0,
"step": 70
},
{
"entropy": 0.03706419242080301,
"epoch": 0.052857614800132145,
"grad_norm": 65.02709197998047,
"learning_rate": 7.9e-06,
"loss": 11.027,
"mean_token_accuracy": 0.07533193230628968,
"num_tokens": 327680.0,
"step": 80
},
{
"entropy": 0.05241385023109615,
"epoch": 0.05946481665014866,
"grad_norm": 0.0,
"learning_rate": 8.9e-06,
"loss": 21.8544,
"mean_token_accuracy": 0.012833333015441895,
"num_tokens": 368640.0,
"step": 90
},
{
"entropy": 0.4075972191989422,
"epoch": 0.06607201850016518,
"grad_norm": 0.0,
"learning_rate": 9.900000000000002e-06,
"loss": 35.157,
"mean_token_accuracy": 0.024832390993833543,
"num_tokens": 409600.0,
"step": 100
},
{
"entropy": 1.835695518553257,
"epoch": 0.07267922035018169,
"grad_norm": 0.0,
"learning_rate": 1.09e-05,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 450560.0,
"step": 110
},
{
"entropy": 0.9127246515825391,
"epoch": 0.07928642220019821,
"grad_norm": 0.0,
"learning_rate": 1.19e-05,
"loss": 134.9978,
"mean_token_accuracy": 0.07018110677599906,
"num_tokens": 491520.0,
"step": 120
},
{
"entropy": 0.09644881915301085,
"epoch": 0.08589362405021474,
"grad_norm": 18.287517547607422,
"learning_rate": 1.29e-05,
"loss": 17.8291,
"mean_token_accuracy": 0.11415426433086395,
"num_tokens": 532480.0,
"step": 130
},
{
"entropy": 1.061981100589037,
"epoch": 0.09250082590023125,
"grad_norm": 0.0,
"learning_rate": 1.3900000000000002e-05,
"loss": 73.0922,
"mean_token_accuracy": 0.0463849127292633,
"num_tokens": 573440.0,
"step": 140
},
{
"entropy": 1.7951227528974414,
"epoch": 0.09910802775024777,
"grad_norm": 0.0,
"learning_rate": 1.49e-05,
"loss": 26.185,
"mean_token_accuracy": 0.043432857096195224,
"num_tokens": 614400.0,
"step": 150
},
{
"entropy": 0.100301666976884,
"epoch": 0.10571522960026429,
"grad_norm": 0.0,
"learning_rate": 1.59e-05,
"loss": 45.3161,
"mean_token_accuracy": 0.08393882662057876,
"num_tokens": 655360.0,
"step": 160
},
{
"entropy": 0.7640071153640747,
"epoch": 0.11232243145028081,
"grad_norm": 0.0,
"learning_rate": 1.69e-05,
"loss": 61.7114,
"mean_token_accuracy": 0.04098484814167023,
"num_tokens": 696320.0,
"step": 170
},
{
"entropy": 0.3028068076006093,
"epoch": 0.11892963330029732,
"grad_norm": 0.0,
"learning_rate": 1.79e-05,
"loss": 57.9743,
"mean_token_accuracy": 0.07434001564979553,
"num_tokens": 737280.0,
"step": 180
},
{
"entropy": 0.02740152989870239,
"epoch": 0.12553683515031383,
"grad_norm": 30.224332809448242,
"learning_rate": 1.8900000000000002e-05,
"loss": 43.8902,
"mean_token_accuracy": 0.06686745882034302,
"num_tokens": 778240.0,
"step": 190
},
{
"entropy": 0.00011176439667224259,
"epoch": 0.13214403700033037,
"grad_norm": 31.883712768554688,
"learning_rate": 1.9900000000000003e-05,
"loss": 0.0793,
"mean_token_accuracy": 0.02380952388048172,
"num_tokens": 819200.0,
"step": 200
},
{
"entropy": 0.005680033719772837,
"epoch": 0.13875123885034688,
"grad_norm": 33.85360336303711,
"learning_rate": 2.09e-05,
"loss": 0.2071,
"mean_token_accuracy": 0.015425531566143036,
"num_tokens": 860160.0,
"step": 210
},
{
"entropy": 0.0075609647078181295,
"epoch": 0.14535844070036338,
"grad_norm": 19.486099243164062,
"learning_rate": 2.19e-05,
"loss": 1.201,
"mean_token_accuracy": 0.07949066758155823,
"num_tokens": 901120.0,
"step": 220
},
{
"entropy": 0.011990599616638065,
"epoch": 0.15196564255037992,
"grad_norm": 0.0,
"learning_rate": 2.29e-05,
"loss": 2.0033,
"mean_token_accuracy": 0.02794431447982788,
"num_tokens": 942080.0,
"step": 230
},
{
"entropy": 0.0031902942908345723,
"epoch": 0.15857284440039643,
"grad_norm": 0.0,
"learning_rate": 2.39e-05,
"loss": 1.1575,
"mean_token_accuracy": 0.017129629850387573,
"num_tokens": 983040.0,
"step": 240
},
{
"entropy": 0.14381448374479078,
"epoch": 0.16518004625041294,
"grad_norm": 0.0,
"learning_rate": 2.4900000000000002e-05,
"loss": 0.7004,
"mean_token_accuracy": 0.021666666865348815,
"num_tokens": 1024000.0,
"step": 250
},
{
"entropy": 3.897232323416836,
"epoch": 0.17178724810042947,
"grad_norm": 0.0,
"learning_rate": 2.5900000000000003e-05,
"loss": 0.812,
"mean_token_accuracy": 0.039259061217308044,
"num_tokens": 1064960.0,
"step": 260
},
{
"entropy": 0.019721664518976723,
"epoch": 0.17839444995044598,
"grad_norm": 0.0,
"learning_rate": 2.6900000000000003e-05,
"loss": 2.7005,
"mean_token_accuracy": 0.05297277569770813,
"num_tokens": 1105920.0,
"step": 270
},
{
"entropy": 0.02900617156851304,
"epoch": 0.1850016518004625,
"grad_norm": 18.309810638427734,
"learning_rate": 2.7900000000000004e-05,
"loss": 0.6091,
"mean_token_accuracy": 0.07364537045359612,
"num_tokens": 1146880.0,
"step": 280
},
{
"entropy": 0.03415291602525096,
"epoch": 0.19160885365047903,
"grad_norm": 0.0,
"learning_rate": 2.8899999999999998e-05,
"loss": 19.6735,
"mean_token_accuracy": 0.10901817381381988,
"num_tokens": 1187840.0,
"step": 290
},
{
"entropy": 0.03262247524494468,
"epoch": 0.19821605550049554,
"grad_norm": 0.0,
"learning_rate": 2.9900000000000002e-05,
"loss": 10.0336,
"mean_token_accuracy": 0.06002416908740997,
"num_tokens": 1228800.0,
"step": 300
},
{
"entropy": 0.003781934377029783,
"epoch": 0.20482325735051204,
"grad_norm": 89.78754425048828,
"learning_rate": 3.09e-05,
"loss": 7.6655,
"mean_token_accuracy": 0.07553683370351791,
"num_tokens": 1269760.0,
"step": 310
},
{
"entropy": 0.02375032416614431,
"epoch": 0.21143045920052858,
"grad_norm": 0.0,
"learning_rate": 3.19e-05,
"loss": 55.6063,
"mean_token_accuracy": 0.08025674372911454,
"num_tokens": 1310720.0,
"step": 320
},
{
"entropy": 0.0065040366395965775,
"epoch": 0.2180376610505451,
"grad_norm": 0.0,
"learning_rate": 3.29e-05,
"loss": 10.669,
"mean_token_accuracy": 0.01593567281961441,
"num_tokens": 1351680.0,
"step": 330
},
{
"entropy": 0.003754374942396499,
"epoch": 0.22464486290056163,
"grad_norm": 0.0,
"learning_rate": 3.3900000000000004e-05,
"loss": 0.3125,
"mean_token_accuracy": 0.042081044614315034,
"num_tokens": 1392640.0,
"step": 340
},
{
"entropy": 0.0002896140339728959,
"epoch": 0.23125206475057813,
"grad_norm": 16.06511878967285,
"learning_rate": 3.49e-05,
"loss": 0.0652,
"mean_token_accuracy": 0.02388888895511627,
"num_tokens": 1433600.0,
"step": 350
},
{
"entropy": 0.0001928646220630714,
"epoch": 0.23785926660059464,
"grad_norm": 0.0,
"learning_rate": 3.59e-05,
"loss": 1.1458,
"mean_token_accuracy": 0.02368421107530594,
"num_tokens": 1474560.0,
"step": 360
},
{
"entropy": 0.0003226411399264606,
"epoch": 0.24446646845061118,
"grad_norm": 0.0,
"learning_rate": 3.69e-05,
"loss": 4.4569,
"mean_token_accuracy": 0.04751228392124176,
"num_tokens": 1515520.0,
"step": 370
},
{
"entropy": 0.030229031606691593,
"epoch": 0.25107367030062766,
"grad_norm": 0.0,
"learning_rate": 3.79e-05,
"loss": 4.4461,
"mean_token_accuracy": 0.06874987185001373,
"num_tokens": 1556480.0,
"step": 380
},
{
"entropy": 0.014421097982523178,
"epoch": 0.2576808721506442,
"grad_norm": 0.0,
"learning_rate": 3.8900000000000004e-05,
"loss": 89.155,
"mean_token_accuracy": 0.07968828082084656,
"num_tokens": 1597440.0,
"step": 390
},
{
"entropy": 0.018559856562546172,
"epoch": 0.26428807400066073,
"grad_norm": 0.0,
"learning_rate": 3.99e-05,
"loss": 1.1971,
"mean_token_accuracy": 0.04879816025495529,
"num_tokens": 1638400.0,
"step": 400
},
{
"entropy": 0.0025903429913086027,
"epoch": 0.27089527585067724,
"grad_norm": 0.0,
"learning_rate": 4.09e-05,
"loss": 46.7922,
"mean_token_accuracy": 0.06377801150083542,
"num_tokens": 1679360.0,
"step": 410
},
{
"entropy": 0.0014362634548334085,
"epoch": 0.27750247770069375,
"grad_norm": 0.0,
"learning_rate": 4.19e-05,
"loss": 0.2403,
"mean_token_accuracy": 0.019583334028720856,
"num_tokens": 1720320.0,
"step": 420
},
{
"entropy": 0.01779812537132557,
"epoch": 0.28410967955071026,
"grad_norm": 0.0,
"learning_rate": 4.29e-05,
"loss": 3.4174,
"mean_token_accuracy": 0.09800765216350556,
"num_tokens": 1761280.0,
"step": 430
},
{
"entropy": 0.00543891376372585,
"epoch": 0.29071688140072677,
"grad_norm": 0.0,
"learning_rate": 4.39e-05,
"loss": 1.7927,
"mean_token_accuracy": 0.10844782143831252,
"num_tokens": 1802240.0,
"step": 440
},
{
"entropy": 0.008688661311566648,
"epoch": 0.29732408325074333,
"grad_norm": 11.81262493133545,
"learning_rate": 4.49e-05,
"loss": 19.4753,
"mean_token_accuracy": 0.03172447681427002,
"num_tokens": 1843200.0,
"step": 450
},
{
"entropy": 0.03670089549409568,
"epoch": 0.30393128510075984,
"grad_norm": 0.0,
"learning_rate": 4.5900000000000004e-05,
"loss": 9.5863,
"mean_token_accuracy": 0.09990399181842805,
"num_tokens": 1884160.0,
"step": 460
},
{
"entropy": 0.0009635578760935459,
"epoch": 0.31053848695077635,
"grad_norm": 0.0,
"learning_rate": 4.69e-05,
"loss": 0.1027,
"mean_token_accuracy": 0.047025862336158755,
"num_tokens": 1925120.0,
"step": 470
},
{
"entropy": 0.03435224668937735,
"epoch": 0.31714568880079286,
"grad_norm": 0.0,
"learning_rate": 4.79e-05,
"loss": 9.3884,
"mean_token_accuracy": 0.12481869906187057,
"num_tokens": 1966080.0,
"step": 480
},
{
"entropy": 0.02036606671208574,
"epoch": 0.32375289065080937,
"grad_norm": 12.76618766784668,
"learning_rate": 4.89e-05,
"loss": 8.3534,
"mean_token_accuracy": 0.025242918729782106,
"num_tokens": 2007040.0,
"step": 490
},
{
"entropy": 0.005103755064374127,
"epoch": 0.3303600925008259,
"grad_norm": 10.067124366760254,
"learning_rate": 4.99e-05,
"loss": 0.996,
"mean_token_accuracy": 0.03732142895460129,
"num_tokens": 2048000.0,
"step": 500
},
{
"entropy": 0.02930727633283823,
"epoch": 0.33696729435084244,
"grad_norm": 13.147978782653809,
"learning_rate": 4.999843636237961e-05,
"loss": 2.6708,
"mean_token_accuracy": 0.16797256022691726,
"num_tokens": 2088960.0,
"step": 510
},
{
"entropy": 0.01542719653371023,
"epoch": 0.34357449620085895,
"grad_norm": 0.0,
"learning_rate": 4.999303144641334e-05,
"loss": 0.7947,
"mean_token_accuracy": 0.029625000059604646,
"num_tokens": 2129920.0,
"step": 520
},
{
"entropy": 0.011237168186926282,
"epoch": 0.35018169805087546,
"grad_norm": 0.0,
"learning_rate": 4.9983766782431473e-05,
"loss": 1.8663,
"mean_token_accuracy": 0.015829145908355713,
"num_tokens": 2170880.0,
"step": 530
},
{
"entropy": 0.021366176847368478,
"epoch": 0.35678889990089196,
"grad_norm": 0.0,
"learning_rate": 4.997064380120359e-05,
"loss": 0.7902,
"mean_token_accuracy": 0.034427966177463534,
"num_tokens": 2211840.0,
"step": 540
},
{
"entropy": 0.013086076875333674,
"epoch": 0.3633961017509085,
"grad_norm": 0.0,
"learning_rate": 4.995366452935061e-05,
"loss": 4.0451,
"mean_token_accuracy": 0.03658333420753479,
"num_tokens": 2252800.0,
"step": 550
},
{
"entropy": 0.15343078810255975,
"epoch": 0.370003303600925,
"grad_norm": 21.48503303527832,
"learning_rate": 4.993283158903187e-05,
"loss": 108.0904,
"mean_token_accuracy": 0.03694373369216919,
"num_tokens": 2293760.0,
"step": 560
},
{
"entropy": 0.08353904361720196,
"epoch": 0.37661050545094155,
"grad_norm": 0.0,
"learning_rate": 4.9908148197540174e-05,
"loss": 3.3117,
"mean_token_accuracy": 0.05191558599472046,
"num_tokens": 2334720.0,
"step": 570
},
{
"entropy": 0.0030798995423538143,
"epoch": 0.38321770730095805,
"grad_norm": 15.358031272888184,
"learning_rate": 4.987961816680492e-05,
"loss": 107.2357,
"mean_token_accuracy": 0.040966108441352844,
"num_tokens": 2375680.0,
"step": 580
},
{
"entropy": 0.0050722503918223085,
"epoch": 0.38982490915097456,
"grad_norm": 0.0,
"learning_rate": 4.984724590280343e-05,
"loss": 1.1264,
"mean_token_accuracy": 0.01942567527294159,
"num_tokens": 2416640.0,
"step": 590
},
{
"entropy": 0.025612023619032698,
"epoch": 0.39643211100099107,
"grad_norm": 0.0,
"learning_rate": 4.981103640488048e-05,
"loss": 1.0595,
"mean_token_accuracy": 0.05857066810131073,
"num_tokens": 2457600.0,
"step": 600
},
{
"entropy": 0.02604469430661993,
"epoch": 0.4030393128510076,
"grad_norm": 0.0,
"learning_rate": 4.977099526497631e-05,
"loss": 2.9458,
"mean_token_accuracy": 0.11472017914056779,
"num_tokens": 2498560.0,
"step": 610
},
{
"entropy": 1.34335226020159,
"epoch": 0.4096465147010241,
"grad_norm": 0.0,
"learning_rate": 4.972712866676297e-05,
"loss": 0.8969,
"mean_token_accuracy": 0.0374501496553421,
"num_tokens": 2539520.0,
"step": 620
},
{
"entropy": 0.017200125091973145,
"epoch": 0.41625371655104065,
"grad_norm": 6.167868614196777,
"learning_rate": 4.96794433846894e-05,
"loss": 0.7561,
"mean_token_accuracy": 0.04967625737190247,
"num_tokens": 2580480.0,
"step": 630
},
{
"entropy": 0.013742945558520164,
"epoch": 0.42286091840105716,
"grad_norm": 4.658291816711426,
"learning_rate": 4.962794678293523e-05,
"loss": 3.9976,
"mean_token_accuracy": 0.09124306589365005,
"num_tokens": 2621440.0,
"step": 640
},
{
"entropy": 0.018405587278946686,
"epoch": 0.42946812025107367,
"grad_norm": 0.0,
"learning_rate": 4.957264681427348e-05,
"loss": 1.2244,
"mean_token_accuracy": 0.030132341384887695,
"num_tokens": 2662400.0,
"step": 650
},
{
"entropy": 0.007059359534223398,
"epoch": 0.4360753221010902,
"grad_norm": 41.432003021240234,
"learning_rate": 4.9513552018842426e-05,
"loss": 5.2841,
"mean_token_accuracy": 0.10671066045761109,
"num_tokens": 2703360.0,
"step": 660
},
{
"entropy": 0.004513938081906041,
"epoch": 0.4426825239511067,
"grad_norm": 11.688268661499023,
"learning_rate": 4.9450671522826694e-05,
"loss": 0.4656,
"mean_token_accuracy": 0.0375,
"num_tokens": 2744320.0,
"step": 670
},
{
"entropy": 6.520482699785834e-12,
"epoch": 0.44928972580112325,
"grad_norm": 0.0,
"learning_rate": 4.938401503704789e-05,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 2785280.0,
"step": 680
},
{
"entropy": 0.019155432335637948,
"epoch": 0.45589692765113976,
"grad_norm": 7.324256420135498,
"learning_rate": 4.9313592855464916e-05,
"loss": 8.6945,
"mean_token_accuracy": 0.07001543641090394,
"num_tokens": 2826240.0,
"step": 690
},
{
"entropy": 0.0031058518021977906,
"epoch": 0.46250412950115627,
"grad_norm": 0.0,
"learning_rate": 4.923941585358426e-05,
"loss": 24.0465,
"mean_token_accuracy": 0.039618226885795596,
"num_tokens": 2867200.0,
"step": 700
},
{
"entropy": 0.011669352487973095,
"epoch": 0.4691113313511728,
"grad_norm": 0.0,
"learning_rate": 4.9161495486780456e-05,
"loss": 3.7727,
"mean_token_accuracy": 0.14685782939195632,
"num_tokens": 2908160.0,
"step": 710
},
{
"entropy": 0.011068030085762338,
"epoch": 0.4757185332011893,
"grad_norm": 0.0,
"learning_rate": 4.907984378852699e-05,
"loss": 4.0753,
"mean_token_accuracy": 0.05300231724977493,
"num_tokens": 2949120.0,
"step": 720
},
{
"entropy": 0.019895143689083482,
"epoch": 0.4823257350512058,
"grad_norm": 0.0,
"learning_rate": 4.8994473368537916e-05,
"loss": 3.4842,
"mean_token_accuracy": 0.06887777000665665,
"num_tokens": 2990080.0,
"step": 730
},
{
"entropy": 0.00012087312168218567,
"epoch": 0.48893293690122236,
"grad_norm": 0.0,
"learning_rate": 4.8905397410820554e-05,
"loss": 0.4551,
"mean_token_accuracy": 0.02083333283662796,
"num_tokens": 3031040.0,
"step": 740
},
{
"entropy": 0.011406292311664856,
"epoch": 0.49554013875123887,
"grad_norm": 0.0,
"learning_rate": 4.881262967163939e-05,
"loss": 25.7439,
"mean_token_accuracy": 0.08733686208724975,
"num_tokens": 3072000.0,
"step": 750
},
{
"entropy": 0.007139437921159697,
"epoch": 0.5021473406012553,
"grad_norm": 0.0,
"learning_rate": 4.8716184477391675e-05,
"loss": 3.6028,
"mean_token_accuracy": 0.03767942488193512,
"num_tokens": 3112960.0,
"step": 760
},
{
"entropy": 0.011418092579742734,
"epoch": 0.5087545424512719,
"grad_norm": 0.0,
"learning_rate": 4.8616076722394974e-05,
"loss": 3.0005,
"mean_token_accuracy": 0.07940062433481217,
"num_tokens": 3153920.0,
"step": 770
},
{
"entropy": 0.006054546175801079,
"epoch": 0.5153617443012884,
"grad_norm": 17.855485916137695,
"learning_rate": 4.8512321866586985e-05,
"loss": 0.2468,
"mean_token_accuracy": 0.03400537669658661,
"num_tokens": 3194880.0,
"step": 780
},
{
"entropy": 0.02499405129653951,
"epoch": 0.521968946151305,
"grad_norm": 0.0,
"learning_rate": 4.840493593313798e-05,
"loss": 7.3568,
"mean_token_accuracy": 0.06639731675386429,
"num_tokens": 3235840.0,
"step": 790
},
{
"entropy": 0.030509774422012016,
"epoch": 0.5285761480013215,
"grad_norm": 8.936971664428711,
"learning_rate": 4.829393550597633e-05,
"loss": 2.2173,
"mean_token_accuracy": 0.0885118618607521,
"num_tokens": 3276800.0,
"step": 800
},
{
"entropy": 0.008201998914637442,
"epoch": 0.535183349851338,
"grad_norm": 0.0,
"learning_rate": 4.8179337727227424e-05,
"loss": 10.5952,
"mean_token_accuracy": 0.03566812574863434,
"num_tokens": 3317760.0,
"step": 810
},
{
"entropy": 0.0163260028579316,
"epoch": 0.5417905517013545,
"grad_norm": 8.472152709960938,
"learning_rate": 4.806116029456631e-05,
"loss": 9.1567,
"mean_token_accuracy": 0.02788277491927147,
"num_tokens": 3358720.0,
"step": 820
},
{
"entropy": 0.014577616745373233,
"epoch": 0.548397753551371,
"grad_norm": 0.0,
"learning_rate": 4.7939421458484604e-05,
"loss": 14.4947,
"mean_token_accuracy": 0.038144654035568236,
"num_tokens": 3399680.0,
"step": 830
},
{
"entropy": 0.060555978957563636,
"epoch": 0.5550049554013875,
"grad_norm": 5.755632400512695,
"learning_rate": 4.781414001947205e-05,
"loss": 17.9944,
"mean_token_accuracy": 0.08551393002271652,
"num_tokens": 3440640.0,
"step": 840
},
{
"entropy": 0.01132769331743475,
"epoch": 0.561612157251404,
"grad_norm": 0.0,
"learning_rate": 4.768533532511306e-05,
"loss": 54.7683,
"mean_token_accuracy": 0.034085105359554294,
"num_tokens": 3481600.0,
"step": 850
},
{
"entropy": 0.03579027488522115,
"epoch": 0.5682193591014205,
"grad_norm": 1.5527058839797974,
"learning_rate": 4.755302726709882e-05,
"loss": 48.52,
"mean_token_accuracy": 0.06784629076719284,
"num_tokens": 3522560.0,
"step": 860
},
{
"entropy": 0.025357943994458766,
"epoch": 0.574826560951437,
"grad_norm": 0.0,
"learning_rate": 4.741723627815536e-05,
"loss": 8.7313,
"mean_token_accuracy": 0.06684041172266006,
"num_tokens": 3563520.0,
"step": 870
},
{
"entropy": 0.003138951886649011,
"epoch": 0.5814337628014535,
"grad_norm": 0.7629010677337646,
"learning_rate": 4.7277983328888084e-05,
"loss": 4.1401,
"mean_token_accuracy": 0.0589192196726799,
"num_tokens": 3604480.0,
"step": 880
},
{
"entropy": 0.008384830836621403,
"epoch": 0.5880409646514702,
"grad_norm": 0.0,
"learning_rate": 4.7135289924543197e-05,
"loss": 4.0731,
"mean_token_accuracy": 0.032692307233810426,
"num_tokens": 3645440.0,
"step": 890
},
{
"entropy": 0.006141025204078688,
"epoch": 0.5946481665014867,
"grad_norm": 7.293692588806152,
"learning_rate": 4.6989178101686584e-05,
"loss": 0.9345,
"mean_token_accuracy": 0.06051216870546341,
"num_tokens": 3686400.0,
"step": 900
},
{
"entropy": 0.057043678060654204,
"epoch": 0.6012553683515032,
"grad_norm": 4.391047954559326,
"learning_rate": 4.683967042480064e-05,
"loss": 3.9633,
"mean_token_accuracy": 0.12203554809093475,
"num_tokens": 3727360.0,
"step": 910
},
{
"entropy": 0.012308508742717095,
"epoch": 0.6078625702015197,
"grad_norm": 2.2332639694213867,
"learning_rate": 4.668678998279961e-05,
"loss": 0.6798,
"mean_token_accuracy": 0.10872009098529815,
"num_tokens": 3768320.0,
"step": 920
},
{
"entropy": 0.06679710658499971,
"epoch": 0.6144697720515362,
"grad_norm": 0.0,
"learning_rate": 4.653056038546381e-05,
"loss": 1.5921,
"mean_token_accuracy": 0.020000000298023225,
"num_tokens": 3809280.0,
"step": 930
},
{
"entropy": 0.41163066651206465,
"epoch": 0.6210769739015527,
"grad_norm": 0.0,
"learning_rate": 4.637100575979361e-05,
"loss": 1.009,
"mean_token_accuracy": 0.029701614379882814,
"num_tokens": 3850240.0,
"step": 940
},
{
"entropy": 0.005564717383629158,
"epoch": 0.6276841757515692,
"grad_norm": 8.103035926818848,
"learning_rate": 4.620815074628333e-05,
"loss": 0.4482,
"mean_token_accuracy": 0.060583002865314484,
"num_tokens": 3891200.0,
"step": 950
},
{
"entropy": 0.006938851803111312,
"epoch": 0.6342913776015857,
"grad_norm": 0.0,
"learning_rate": 4.6042020495116005e-05,
"loss": 12.1827,
"mean_token_accuracy": 0.03860159516334534,
"num_tokens": 3932160.0,
"step": 960
},
{
"entropy": 0.025441451920087132,
"epoch": 0.6408985794516022,
"grad_norm": 0.0,
"learning_rate": 4.587264066227933e-05,
"loss": 4.7399,
"mean_token_accuracy": 0.045529181510210036,
"num_tokens": 3973120.0,
"step": 970
},
{
"entropy": 0.0037336295128966414,
"epoch": 0.6475057813016187,
"grad_norm": 0.0,
"learning_rate": 4.570003740560352e-05,
"loss": 9.9644,
"mean_token_accuracy": 0.032598039507865904,
"num_tokens": 4014080.0,
"step": 980
},
{
"entropy": 0.02236287235959935,
"epoch": 0.6541129831516352,
"grad_norm": 0.0,
"learning_rate": 4.5524237380721755e-05,
"loss": 25.0121,
"mean_token_accuracy": 0.09488918632268906,
"num_tokens": 4055040.0,
"step": 990
},
{
"entropy": 0.004253433111853155,
"epoch": 0.6607201850016517,
"grad_norm": 0.0,
"learning_rate": 4.5345267736953566e-05,
"loss": 2.2839,
"mean_token_accuracy": 0.061769942939281466,
"num_tokens": 4096000.0,
"step": 1000
},
{
"entropy": 0.004871105551137589,
"epoch": 0.6673273868516684,
"grad_norm": 0.0,
"learning_rate": 4.516315611311215e-05,
"loss": 0.4531,
"mean_token_accuracy": 0.03642857223749161,
"num_tokens": 4136960.0,
"step": 1010
},
{
"entropy": 0.025206556524062763,
"epoch": 0.6739345887016849,
"grad_norm": 0.0,
"learning_rate": 4.4977930633236e-05,
"loss": 1.3895,
"mean_token_accuracy": 0.0927792876958847,
"num_tokens": 4177920.0,
"step": 1020
},
{
"entropy": 0.006291561650141375,
"epoch": 0.6805417905517014,
"grad_norm": 0.0,
"learning_rate": 4.478961990224567e-05,
"loss": 3.6759,
"mean_token_accuracy": 0.06052185446023941,
"num_tokens": 4218880.0,
"step": 1030
},
{
"entropy": 0.0068054500185098735,
"epoch": 0.6871489924017179,
"grad_norm": 7.908809185028076,
"learning_rate": 4.4598253001526165e-05,
"loss": 0.8344,
"mean_token_accuracy": 0.04019886404275894,
"num_tokens": 4259840.0,
"step": 1040
},
{
"entropy": 0.02280626448646217,
"epoch": 0.6937561942517344,
"grad_norm": 0.0,
"learning_rate": 4.440385948443586e-05,
"loss": 56.0997,
"mean_token_accuracy": 0.05091123506426811,
"num_tokens": 4300800.0,
"step": 1050
},
{
"entropy": 0.009011537862716069,
"epoch": 0.7003633961017509,
"grad_norm": 0.0,
"learning_rate": 4.420646937174249e-05,
"loss": 4.2313,
"mean_token_accuracy": 0.10524300783872605,
"num_tokens": 4341760.0,
"step": 1060
},
{
"entropy": 0.0018482632265659049,
"epoch": 0.7069705979517674,
"grad_norm": 20.32362937927246,
"learning_rate": 4.400611314698693e-05,
"loss": 0.0597,
"mean_token_accuracy": 0.04590909034013748,
"num_tokens": 4382720.0,
"step": 1070
},
{
"entropy": 0.017142248089658096,
"epoch": 0.7135777998017839,
"grad_norm": 0.0,
"learning_rate": 4.3802821751775545e-05,
"loss": 3.2115,
"mean_token_accuracy": 0.03341048508882523,
"num_tokens": 4423680.0,
"step": 1080
},
{
"entropy": 0.009434876564773731,
"epoch": 0.7201850016518004,
"grad_norm": 0.0,
"learning_rate": 4.35966265810018e-05,
"loss": 0.5134,
"mean_token_accuracy": 0.05730164796113968,
"num_tokens": 4464640.0,
"step": 1090
},
{
"entropy": 0.01804768085712567,
"epoch": 0.726792203501817,
"grad_norm": 6.497892379760742,
"learning_rate": 4.338755947799779e-05,
"loss": 0.3714,
"mean_token_accuracy": 0.030211375653743745,
"num_tokens": 4505600.0,
"step": 1100
},
{
"entropy": 0.010639433527830987,
"epoch": 0.7333994053518335,
"grad_norm": 0.0,
"learning_rate": 4.317565272961668e-05,
"loss": 5.4826,
"mean_token_accuracy": 0.03939753919839859,
"num_tokens": 4546560.0,
"step": 1110
},
{
"entropy": 0.009574767109006644,
"epoch": 0.74000660720185,
"grad_norm": 1.7740790843963623,
"learning_rate": 4.296093906124648e-05,
"loss": 0.0149,
"mean_token_accuracy": 0.024193547666072845,
"num_tokens": 4587520.0,
"step": 1120
},
{
"entropy": 0.016046867379918693,
"epoch": 0.7466138090518666,
"grad_norm": 5.654629230499268,
"learning_rate": 4.274345163175617e-05,
"loss": 0.1119,
"mean_token_accuracy": 0.020338982343673706,
"num_tokens": 4628480.0,
"step": 1130
},
{
"entropy": 0.011864034581230953,
"epoch": 0.7532210109018831,
"grad_norm": 0.0,
"learning_rate": 4.252322402837491e-05,
"loss": 2.6892,
"mean_token_accuracy": 0.06476814448833465,
"num_tokens": 4669440.0,
"step": 1140
},
{
"entropy": 0.016540668293600902,
"epoch": 0.7598282127518996,
"grad_norm": 16.53849220275879,
"learning_rate": 4.2300290261505036e-05,
"loss": 30.6023,
"mean_token_accuracy": 0.0714763343334198,
"num_tokens": 4710400.0,
"step": 1150
},
{
"entropy": 0.0230761248501949,
"epoch": 0.7664354146019161,
"grad_norm": 0.0,
"learning_rate": 4.2074684759469746e-05,
"loss": 2.7597,
"mean_token_accuracy": 0.03069286197423935,
"num_tokens": 4751360.0,
"step": 1160
},
{
"entropy": 0.011810722067821189,
"epoch": 0.7730426164519326,
"grad_norm": 0.0,
"learning_rate": 4.1846442363196216e-05,
"loss": 0.9111,
"mean_token_accuracy": 0.10935835093259812,
"num_tokens": 4792320.0,
"step": 1170
},
{
"entropy": 0.014862862655547815,
"epoch": 0.7796498183019491,
"grad_norm": 0.0,
"learning_rate": 4.161559832083505e-05,
"loss": 6.2322,
"mean_token_accuracy": 0.07727646380662918,
"num_tokens": 4833280.0,
"step": 1180
},
{
"entropy": 0.03382376935035154,
"epoch": 0.7862570201519656,
"grad_norm": 0.0,
"learning_rate": 4.138218828231674e-05,
"loss": 6.8809,
"mean_token_accuracy": 0.08848222196102143,
"num_tokens": 4874240.0,
"step": 1190
},
{
"entropy": 0.01027671142185227,
"epoch": 0.7928642220019821,
"grad_norm": 0.0,
"learning_rate": 4.1146248293846226e-05,
"loss": 19.8159,
"mean_token_accuracy": 0.05517474114894867,
"num_tokens": 4915200.0,
"step": 1200
},
{
"entropy": 0.0004816283415379985,
"epoch": 0.7994714238519987,
"grad_norm": 20.06401252746582,
"learning_rate": 4.0907814792336086e-05,
"loss": 0.1573,
"mean_token_accuracy": 0.04738095104694366,
"num_tokens": 4956160.0,
"step": 1210
},
{
"entropy": 0.0002817700983996474,
"epoch": 0.8060786257020152,
"grad_norm": 0.0,
"learning_rate": 4.0666924599779545e-05,
"loss": 1.3697,
"mean_token_accuracy": 0.07327586263418198,
"num_tokens": 4997120.0,
"step": 1220
},
{
"entropy": 0.022998044469534306,
"epoch": 0.8126858275520317,
"grad_norm": 0.0,
"learning_rate": 4.042361491756389e-05,
"loss": 75.5602,
"mean_token_accuracy": 0.03188432902097702,
"num_tokens": 5038080.0,
"step": 1230
},
{
"entropy": 0.006426294211951245,
"epoch": 0.8192930294020482,
"grad_norm": 0.0,
"learning_rate": 4.017792332072541e-05,
"loss": 43.3363,
"mean_token_accuracy": 0.060617709159851076,
"num_tokens": 5079040.0,
"step": 1240
},
{
"entropy": 0.04888666102579009,
"epoch": 0.8259002312520648,
"grad_norm": 5.245207786560059,
"learning_rate": 3.992988775214651e-05,
"loss": 10.3096,
"mean_token_accuracy": 0.09515323489904404,
"num_tokens": 5120000.0,
"step": 1250
},
{
"entropy": 0.0168402054007629,
"epoch": 0.8325074331020813,
"grad_norm": 0.0,
"learning_rate": 3.967954651669612e-05,
"loss": 22.3533,
"mean_token_accuracy": 0.054832766950130465,
"num_tokens": 5160960.0,
"step": 1260
},
{
"entropy": 1.2130512709518371e-05,
"epoch": 0.8391146349520978,
"grad_norm": 0.0,
"learning_rate": 3.942693827531413e-05,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 5201920.0,
"step": 1270
},
{
"entropy": 0.024002419360476777,
"epoch": 0.8457218368021143,
"grad_norm": 0.0,
"learning_rate": 3.917210203904092e-05,
"loss": 0.537,
"mean_token_accuracy": 0.03225543200969696,
"num_tokens": 5242880.0,
"step": 1280
},
{
"entropy": 0.007740778655079339,
"epoch": 0.8523290386521308,
"grad_norm": 0.0,
"learning_rate": 3.891507716299268e-05,
"loss": 66.1835,
"mean_token_accuracy": 0.03432405143976212,
"num_tokens": 5283840.0,
"step": 1290
},
{
"entropy": 0.01155800115434431,
"epoch": 0.8589362405021473,
"grad_norm": 4.934366703033447,
"learning_rate": 3.865590334028378e-05,
"loss": 0.4728,
"mean_token_accuracy": 0.03674568980932236,
"num_tokens": 5324800.0,
"step": 1300
},
{
"entropy": 0.027338066224683644,
"epoch": 0.8655434423521638,
"grad_norm": 0.0,
"learning_rate": 3.83946205958968e-05,
"loss": 6.3222,
"mean_token_accuracy": 0.09498430639505387,
"num_tokens": 5365760.0,
"step": 1310
},
{
"entropy": 0.020768357973520325,
"epoch": 0.8721506442021804,
"grad_norm": 0.0,
"learning_rate": 3.813126928050132e-05,
"loss": 16.5894,
"mean_token_accuracy": 0.07061131447553634,
"num_tokens": 5406720.0,
"step": 1320
},
{
"entropy": 0.008218960013255127,
"epoch": 0.8787578460521969,
"grad_norm": 7.263195991516113,
"learning_rate": 3.78658900642225e-05,
"loss": 0.1677,
"mean_token_accuracy": 0.03645932972431183,
"num_tokens": 5447680.0,
"step": 1330
},
{
"entropy": 0.06189728657391243,
"epoch": 0.8853650479022134,
"grad_norm": 0.0,
"learning_rate": 3.759852393036025e-05,
"loss": 10.2275,
"mean_token_accuracy": 0.14302214086055756,
"num_tokens": 5488640.0,
"step": 1340
},
{
"entropy": 0.03971352147527796,
"epoch": 0.8919722497522299,
"grad_norm": 0.0,
"learning_rate": 3.732921216906006e-05,
"loss": 5.071,
"mean_token_accuracy": 0.028831233829259874,
"num_tokens": 5529600.0,
"step": 1350
},
{
"entropy": 0.005447403571633913,
"epoch": 0.8985794516022465,
"grad_norm": 0.0,
"learning_rate": 3.705799637093644e-05,
"loss": 1.2257,
"mean_token_accuracy": 0.06499999910593032,
"num_tokens": 5570560.0,
"step": 1360
},
{
"entropy": 0.0004757829759910237,
"epoch": 0.905186653452263,
"grad_norm": 0.0,
"learning_rate": 3.678491842064995e-05,
"loss": 0.1014,
"mean_token_accuracy": 0.0236486479640007,
"num_tokens": 5611520.0,
"step": 1370
},
{
"entropy": 0.0005358504138712305,
"epoch": 0.9117938553022795,
"grad_norm": 0.0,
"learning_rate": 3.651002049043883e-05,
"loss": 1.0571,
"mean_token_accuracy": 0.05,
"num_tokens": 5652480.0,
"step": 1380
},
{
"entropy": 0.013598478100175272,
"epoch": 0.918401057152296,
"grad_norm": 4.50636625289917,
"learning_rate": 3.623334503360625e-05,
"loss": 0.2652,
"mean_token_accuracy": 0.036414363980293275,
"num_tokens": 5693440.0,
"step": 1390
},
{
"entropy": 0.00263449905833113,
"epoch": 0.9250082590023125,
"grad_norm": 0.0,
"learning_rate": 3.595493477796405e-05,
"loss": 32.6138,
"mean_token_accuracy": 0.01770833283662796,
"num_tokens": 5734400.0,
"step": 1400
},
{
"entropy": 0.0006027125244145281,
"epoch": 0.931615460852329,
"grad_norm": 0.0,
"learning_rate": 3.5674832719234236e-05,
"loss": 0.0309,
"mean_token_accuracy": 0.025,
"num_tokens": 5775360.0,
"step": 1410
},
{
"entropy": 0.04056153310375521,
"epoch": 0.9382226627023456,
"grad_norm": 4.126308441162109,
"learning_rate": 3.539308211440896e-05,
"loss": 37.0153,
"mean_token_accuracy": 0.1092478021979332,
"num_tokens": 5816320.0,
"step": 1420
},
{
"entropy": 0.025153067862265743,
"epoch": 0.9448298645523621,
"grad_norm": 4.144694805145264,
"learning_rate": 3.510972647507024e-05,
"loss": 0.7825,
"mean_token_accuracy": 0.03231704980134964,
"num_tokens": 5857280.0,
"step": 1430
},
{
"entropy": 0.013085571637930116,
"epoch": 0.9514370664023786,
"grad_norm": 0.0,
"learning_rate": 3.482480956067036e-05,
"loss": 3.2684,
"mean_token_accuracy": 0.030653293430805206,
"num_tokens": 5898240.0,
"step": 1440
},
{
"entropy": 0.03377640448525199,
"epoch": 0.9580442682523951,
"grad_norm": 4.88741397857666,
"learning_rate": 3.453837537177392e-05,
"loss": 8.4465,
"mean_token_accuracy": 0.10374114364385605,
"num_tokens": 5939200.0,
"step": 1450
},
{
"entropy": 0.0001492878371209372,
"epoch": 0.9646514701024116,
"grad_norm": 0.0,
"learning_rate": 3.425046814326275e-05,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 5980160.0,
"step": 1460
},
{
"entropy": 0.0095839648241963,
"epoch": 0.9712586719524281,
"grad_norm": 0.0,
"learning_rate": 3.396113233750452e-05,
"loss": 0.738,
"mean_token_accuracy": 0.06367539763450622,
"num_tokens": 6021120.0,
"step": 1470
},
{
"entropy": 0.008002570147255028,
"epoch": 0.9778658738024447,
"grad_norm": 0.0,
"learning_rate": 3.3670412637486356e-05,
"loss": 50.6298,
"mean_token_accuracy": 0.03787661790847778,
"num_tokens": 6062080.0,
"step": 1480
},
{
"entropy": 0.025678539301952696,
"epoch": 0.9844730756524612,
"grad_norm": 0.0,
"learning_rate": 3.3378353939914274e-05,
"loss": 58.3281,
"mean_token_accuracy": 0.04199896454811096,
"num_tokens": 6103040.0,
"step": 1490
},
{
"entropy": 0.009659759866462992,
"epoch": 0.9910802775024777,
"grad_norm": 0.0,
"learning_rate": 3.3085001348279655e-05,
"loss": 20.3392,
"mean_token_accuracy": 0.05786270499229431,
"num_tokens": 6144000.0,
"step": 1500
},
{
"entropy": 0.02696728161745341,
"epoch": 0.9976874793524942,
"grad_norm": 4.9037766456604,
"learning_rate": 3.2790400165893765e-05,
"loss": 24.4541,
"mean_token_accuracy": 0.05242506861686706,
"num_tokens": 6184960.0,
"step": 1510
},
{
"entropy": 0.03248862427522239,
"epoch": 1.00396432111001,
"grad_norm": 5.108238697052002,
"learning_rate": 3.249459588889148e-05,
"loss": 29.0479,
"mean_token_accuracy": 0.07015540411597804,
"num_tokens": 6223872.0,
"step": 1520
},
{
"entropy": 0.009673559664588539,
"epoch": 1.0105715229600265,
"grad_norm": 0.0,
"learning_rate": 3.21976341992051e-05,
"loss": 1.6678,
"mean_token_accuracy": 0.06724415719509125,
"num_tokens": 6264832.0,
"step": 1530
},
{
"entropy": 0.006153753565013176,
"epoch": 1.017178724810043,
"grad_norm": 0.0,
"learning_rate": 3.189956095750964e-05,
"loss": 13.6964,
"mean_token_accuracy": 0.06525651663541794,
"num_tokens": 6305792.0,
"step": 1540
},
{
"entropy": 0.01643671256279049,
"epoch": 1.0237859266600595,
"grad_norm": 0.0,
"learning_rate": 3.160042219614039e-05,
"loss": 0.2836,
"mean_token_accuracy": 0.065067720413208,
"num_tokens": 6346752.0,
"step": 1550
},
{
"entropy": 0.012418469145814015,
"epoch": 1.030393128510076,
"grad_norm": 0.0,
"learning_rate": 3.130026411198397e-05,
"loss": 3.5205,
"mean_token_accuracy": 0.1062299519777298,
"num_tokens": 6387712.0,
"step": 1560
},
{
"entropy": 0.00280168341632816,
"epoch": 1.0370003303600925,
"grad_norm": 0.0,
"learning_rate": 3.099913305934407e-05,
"loss": 9.8431,
"mean_token_accuracy": 0.06536642909049988,
"num_tokens": 6428672.0,
"step": 1570
},
{
"entropy": 0.0040143982156223505,
"epoch": 1.043607532210109,
"grad_norm": 0.0,
"learning_rate": 3.0697075542782805e-05,
"loss": 2.5969,
"mean_token_accuracy": 0.04253978878259659,
"num_tokens": 6469632.0,
"step": 1580
},
{
"entropy": 0.014350750502853771,
"epoch": 1.0502147340601256,
"grad_norm": 0.0,
"learning_rate": 3.039413820993881e-05,
"loss": 0.6221,
"mean_token_accuracy": 0.05783200562000275,
"num_tokens": 6510592.0,
"step": 1590
},
{
"entropy": 0.009398723715912638,
"epoch": 1.056821935910142,
"grad_norm": 0.0,
"learning_rate": 3.0090367844323427e-05,
"loss": 1.078,
"mean_token_accuracy": 0.1340048998594284,
"num_tokens": 6551552.0,
"step": 1600
},
{
"entropy": 0.00624132590037334,
"epoch": 1.0634291377601586,
"grad_norm": 0.0,
"learning_rate": 2.9785811358095666e-05,
"loss": 7.9215,
"mean_token_accuracy": 0.06667087972164154,
"num_tokens": 6592512.0,
"step": 1610
},
{
"entropy": 0.0015867714646446984,
"epoch": 1.070036339610175,
"grad_norm": 3.8178083896636963,
"learning_rate": 2.9480515784817497e-05,
"loss": 0.0385,
"mean_token_accuracy": 0.0234375,
"num_tokens": 6633472.0,
"step": 1620
},
{
"entropy": 0.026325100680696777,
"epoch": 1.0766435414601916,
"grad_norm": 0.0,
"learning_rate": 2.9174528272190317e-05,
"loss": 7.6648,
"mean_token_accuracy": 0.1373343974351883,
"num_tokens": 6674432.0,
"step": 1630
},
{
"entropy": 0.01830651660930016,
"epoch": 1.083250743310208,
"grad_norm": 0.0,
"learning_rate": 2.8867896074773715e-05,
"loss": 9.6631,
"mean_token_accuracy": 0.10328344106674195,
"num_tokens": 6715392.0,
"step": 1640
},
{
"entropy": 0.04101564450556907,
"epoch": 1.0898579451602246,
"grad_norm": 0.0,
"learning_rate": 2.8560666546687932e-05,
"loss": 0.5364,
"mean_token_accuracy": 0.07200934141874313,
"num_tokens": 6756352.0,
"step": 1650
},
{
"entropy": 0.030506200826857822,
"epoch": 1.0964651470102411,
"grad_norm": 0.0,
"learning_rate": 2.8252887134300705e-05,
"loss": 7.2533,
"mean_token_accuracy": 0.12010089755058288,
"num_tokens": 6797312.0,
"step": 1660
},
{
"entropy": 0.011131914072029759,
"epoch": 1.1030723488602576,
"grad_norm": 0.0,
"learning_rate": 2.794460536890006e-05,
"loss": 2.1866,
"mean_token_accuracy": 0.03891718536615372,
"num_tokens": 6838272.0,
"step": 1670
},
{
"entropy": 0.0067470017631421795,
"epoch": 1.1096795507102741,
"grad_norm": 0.0,
"learning_rate": 2.7635868859353892e-05,
"loss": 2.1451,
"mean_token_accuracy": 0.08797054588794709,
"num_tokens": 6879232.0,
"step": 1680
},
{
"entropy": 0.0010968264658004045,
"epoch": 1.1162867525602906,
"grad_norm": 0.0,
"learning_rate": 2.732672528475756e-05,
"loss": 4.1927,
"mean_token_accuracy": 0.04846225529909134,
"num_tokens": 6920192.0,
"step": 1690
},
{
"entropy": 0.014475939798285254,
"epoch": 1.1228939544103071,
"grad_norm": 3.6743106842041016,
"learning_rate": 2.701722238707073e-05,
"loss": 1.7459,
"mean_token_accuracy": 0.08139636963605881,
"num_tokens": 6961152.0,
"step": 1700
},
{
"entropy": 0.012516654322098475,
"epoch": 1.1295011562603237,
"grad_norm": 0.0,
"learning_rate": 2.670740796374434e-05,
"loss": 27.5715,
"mean_token_accuracy": 0.08141163587570191,
"num_tokens": 7002112.0,
"step": 1710
},
{
"entropy": 0.01645941206952557,
"epoch": 1.1361083581103402,
"grad_norm": 0.0,
"learning_rate": 2.6397329860339216e-05,
"loss": 0.5778,
"mean_token_accuracy": 0.06432082802057267,
"num_tokens": 7043072.0,
"step": 1720
},
{
"entropy": 0.0060044553698389794,
"epoch": 1.1427155599603567,
"grad_norm": 3.3929810523986816,
"learning_rate": 2.608703596313704e-05,
"loss": 0.2285,
"mean_token_accuracy": 0.06034291386604309,
"num_tokens": 7084032.0,
"step": 1730
},
{
"entropy": 0.006904965872308821,
"epoch": 1.1493227618103732,
"grad_norm": 0.0,
"learning_rate": 2.5776574191745185e-05,
"loss": 0.7553,
"mean_token_accuracy": 0.08508117645978927,
"num_tokens": 7124992.0,
"step": 1740
},
{
"entropy": 0.0003588718103856081,
"epoch": 1.15592996366039,
"grad_norm": 0.0,
"learning_rate": 2.5465992491696346e-05,
"loss": 0.0268,
"mean_token_accuracy": 0.02421875,
"num_tokens": 7165952.0,
"step": 1750
},
{
"entropy": 0.002262050682293193,
"epoch": 1.1625371655104064,
"grad_norm": 0.09779301285743713,
"learning_rate": 2.5155338827044135e-05,
"loss": 10.7155,
"mean_token_accuracy": 0.06871064454317093,
"num_tokens": 7206912.0,
"step": 1760
},
{
"entropy": 0.004658589258269785,
"epoch": 1.169144367360423,
"grad_norm": 0.0,
"learning_rate": 2.484466117295588e-05,
"loss": 0.5489,
"mean_token_accuracy": 0.09281249940395356,
"num_tokens": 7247872.0,
"step": 1770
},
{
"entropy": 0.0026989880995643032,
"epoch": 1.1757515692104394,
"grad_norm": 0.0,
"learning_rate": 2.4534007508303664e-05,
"loss": 0.2234,
"mean_token_accuracy": 0.06663182377815247,
"num_tokens": 7288832.0,
"step": 1780
},
{
"entropy": 0.0004317208723023214,
"epoch": 1.182358771060456,
"grad_norm": 0.0,
"learning_rate": 2.4223425808254818e-05,
"loss": 0.0728,
"mean_token_accuracy": 0.045624999701976775,
"num_tokens": 7329792.0,
"step": 1790
},
{
"entropy": 0.009598436806300015,
"epoch": 1.1889659729104725,
"grad_norm": 0.0,
"learning_rate": 2.3912964036862963e-05,
"loss": 16.6883,
"mean_token_accuracy": 0.03912037014961243,
"num_tokens": 7370752.0,
"step": 1800
},
{
"entropy": 0.010217096674750792,
"epoch": 1.195573174760489,
"grad_norm": 15.332958221435547,
"learning_rate": 2.3602670139660786e-05,
"loss": 7.1663,
"mean_token_accuracy": 0.08217611610889435,
"num_tokens": 7411712.0,
"step": 1810
},
{
"entropy": 0.009031679033341788,
"epoch": 1.2021803766105055,
"grad_norm": 0.0,
"learning_rate": 2.329259203625567e-05,
"loss": 1.0604,
"mean_token_accuracy": 0.01841755360364914,
"num_tokens": 7452672.0,
"step": 1820
},
{
"entropy": 0.006855891485770371,
"epoch": 1.208787578460522,
"grad_norm": 5.940762042999268,
"learning_rate": 2.2982777612929283e-05,
"loss": 2.9393,
"mean_token_accuracy": 0.08348637223243713,
"num_tokens": 7493632.0,
"step": 1830
},
{
"entropy": 0.00012862358862548718,
"epoch": 1.2153947803105385,
"grad_norm": 0.0,
"learning_rate": 2.267327471524244e-05,
"loss": 1.4553,
"mean_token_accuracy": 0.024074074625968934,
"num_tokens": 7534592.0,
"step": 1840
},
{
"entropy": 0.0011855133230255887,
"epoch": 1.222001982160555,
"grad_norm": 0.0,
"learning_rate": 2.2364131140646114e-05,
"loss": 0.1606,
"mean_token_accuracy": 0.04429824501276016,
"num_tokens": 7575552.0,
"step": 1850
},
{
"entropy": 0.0026850333628772203,
"epoch": 1.2286091840105715,
"grad_norm": 0.2782774865627289,
"learning_rate": 2.2055394631099942e-05,
"loss": 0.5211,
"mean_token_accuracy": 0.09299242496490479,
"num_tokens": 7616512.0,
"step": 1860
},
{
"entropy": 0.007756018704094458,
"epoch": 1.235216385860588,
"grad_norm": 0.0,
"learning_rate": 2.1747112865699297e-05,
"loss": 0.9564,
"mean_token_accuracy": 0.06141776293516159,
"num_tokens": 7657472.0,
"step": 1870
},
{
"entropy": 0.010400129779191048,
"epoch": 1.2418235877106045,
"grad_norm": 0.0,
"learning_rate": 2.1439333453312084e-05,
"loss": 3.3547,
"mean_token_accuracy": 0.08667385131120682,
"num_tokens": 7698432.0,
"step": 1880
},
{
"entropy": 0.015071574994362891,
"epoch": 1.248430789560621,
"grad_norm": 0.0,
"learning_rate": 2.1132103925226294e-05,
"loss": 2.6196,
"mean_token_accuracy": 0.06390394121408463,
"num_tokens": 7739392.0,
"step": 1890
},
{
"entropy": 0.0022723632777342574,
"epoch": 1.2550379914106375,
"grad_norm": 4.928098201751709,
"learning_rate": 2.0825471727809692e-05,
"loss": 0.4508,
"mean_token_accuracy": 0.06901711821556092,
"num_tokens": 7780352.0,
"step": 1900
},
{
"entropy": 0.004141029497259296,
"epoch": 1.261645193260654,
"grad_norm": 10.553857803344727,
"learning_rate": 2.0519484215182498e-05,
"loss": 26.983,
"mean_token_accuracy": 0.04182367026805878,
"num_tokens": 7821312.0,
"step": 1910
},
{
"entropy": 0.014739693689989508,
"epoch": 1.2682523951106706,
"grad_norm": 0.0,
"learning_rate": 2.0214188641904337e-05,
"loss": 5.3302,
"mean_token_accuracy": 0.06254241913557053,
"num_tokens": 7862272.0,
"step": 1920
},
{
"entropy": 0.004958675562374992,
"epoch": 1.274859596960687,
"grad_norm": 4.394237995147705,
"learning_rate": 1.9909632155676583e-05,
"loss": 1.385,
"mean_token_accuracy": 0.0656544640660286,
"num_tokens": 7903232.0,
"step": 1930
},
{
"entropy": 0.010434124947641977,
"epoch": 1.2814667988107038,
"grad_norm": 0.0,
"learning_rate": 1.9605861790061194e-05,
"loss": 0.8501,
"mean_token_accuracy": 0.03979341834783554,
"num_tokens": 7944192.0,
"step": 1940
},
{
"entropy": 0.018370524288911838,
"epoch": 1.2880740006607203,
"grad_norm": 0.0,
"learning_rate": 1.9302924457217204e-05,
"loss": 1.6869,
"mean_token_accuracy": 0.07690805941820145,
"num_tokens": 7985152.0,
"step": 1950
},
{
"entropy": 0.01128047517995583,
"epoch": 1.2946812025107368,
"grad_norm": 0.0,
"learning_rate": 1.900086694065593e-05,
"loss": 38.9167,
"mean_token_accuracy": 0.05990941822528839,
"num_tokens": 8026112.0,
"step": 1960
},
{
"entropy": 0.005127273782272823,
"epoch": 1.3012884043607533,
"grad_norm": 0.0,
"learning_rate": 1.8699735888016036e-05,
"loss": 1.9819,
"mean_token_accuracy": 0.019821429252624513,
"num_tokens": 8067072.0,
"step": 1970
},
{
"entropy": 0.0013086928360280582,
"epoch": 1.3078956062107698,
"grad_norm": 0.0,
"learning_rate": 1.8399577803859623e-05,
"loss": 1.2477,
"mean_token_accuracy": 0.05,
"num_tokens": 8108032.0,
"step": 1980
},
{
"entropy": 0.029465652829094323,
"epoch": 1.3145028080607863,
"grad_norm": 0.0,
"learning_rate": 1.8100439042490368e-05,
"loss": 10.0606,
"mean_token_accuracy": 0.10168795138597489,
"num_tokens": 8148992.0,
"step": 1990
},
{
"entropy": 0.0015846440888708456,
"epoch": 1.3211100099108029,
"grad_norm": 0.0,
"learning_rate": 1.780236580079491e-05,
"loss": 0.1299,
"mean_token_accuracy": 0.048466117680072786,
"num_tokens": 8189952.0,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 3028,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.012440723577897e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}