{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.3211100099108029,
  "eval_steps": 500,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 6.753322017192841,
      "epoch": 0.006607201850016518,
      "grad_norm": 0.0,
      "learning_rate": 9e-07,
      "loss": 220.5158,
      "mean_token_accuracy": 0.021751992404460907,
      "num_tokens": 40960.0,
      "step": 10
    },
    {
      "entropy": 6.814661657810211,
      "epoch": 0.013214403700033036,
      "grad_norm": 0.0,
      "learning_rate": 1.9e-06,
      "loss": 38.3501,
      "mean_token_accuracy": 0.02807121314108372,
      "num_tokens": 81920.0,
      "step": 20
    },
    {
      "entropy": 6.945201528072357,
      "epoch": 0.019821605550049554,
      "grad_norm": 0.0,
      "learning_rate": 2.9e-06,
      "loss": 62.9007,
      "mean_token_accuracy": 0.01135626919567585,
      "num_tokens": 122880.0,
      "step": 30
    },
    {
      "entropy": 7.206499898433686,
      "epoch": 0.026428807400066073,
      "grad_norm": 0.0,
      "learning_rate": 3.9e-06,
      "loss": 49.7069,
      "mean_token_accuracy": 0.023292785882949828,
      "num_tokens": 163840.0,
      "step": 40
    },
    {
      "entropy": 2.605212825164199,
      "epoch": 0.03303600925008259,
      "grad_norm": 0.0,
      "learning_rate": 4.9000000000000005e-06,
      "loss": 214.3199,
      "mean_token_accuracy": 0.021621854975819588,
      "num_tokens": 204800.0,
      "step": 50
    },
    {
      "entropy": 0.06497852916363626,
      "epoch": 0.03964321110009911,
      "grad_norm": 0.0,
      "learning_rate": 5.9e-06,
      "loss": 11.6628,
      "mean_token_accuracy": 0.043425589054822925,
      "num_tokens": 245760.0,
      "step": 60
    },
    {
      "entropy": 0.021584587066899984,
      "epoch": 0.04625041295011562,
      "grad_norm": 19.475257873535156,
      "learning_rate": 6.900000000000001e-06,
      "loss": 4.9491,
      "mean_token_accuracy": 0.039490992575883864,
      "num_tokens": 286720.0,
      "step": 70
    },
    {
      "entropy": 0.03706419242080301,
      "epoch": 0.052857614800132145,
      "grad_norm": 65.02709197998047,
      "learning_rate": 7.9e-06,
      "loss": 11.027,
      "mean_token_accuracy": 0.07533193230628968,
      "num_tokens": 327680.0,
      "step": 80
    },
    {
      "entropy": 0.05241385023109615,
      "epoch": 0.05946481665014866,
      "grad_norm": 0.0,
      "learning_rate": 8.9e-06,
      "loss": 21.8544,
      "mean_token_accuracy": 0.012833333015441895,
      "num_tokens": 368640.0,
      "step": 90
    },
    {
      "entropy": 0.4075972191989422,
      "epoch": 0.06607201850016518,
      "grad_norm": 0.0,
      "learning_rate": 9.900000000000002e-06,
      "loss": 35.157,
      "mean_token_accuracy": 0.024832390993833543,
      "num_tokens": 409600.0,
      "step": 100
    },
    {
      "entropy": 1.835695518553257,
      "epoch": 0.07267922035018169,
      "grad_norm": 0.0,
      "learning_rate": 1.09e-05,
      "loss": 0.0,
      "mean_token_accuracy": 0.0,
      "num_tokens": 450560.0,
      "step": 110
    },
    {
      "entropy": 0.9127246515825391,
      "epoch": 0.07928642220019821,
      "grad_norm": 0.0,
      "learning_rate": 1.19e-05,
      "loss": 134.9978,
      "mean_token_accuracy": 0.07018110677599906,
      "num_tokens": 491520.0,
      "step": 120
    },
    {
      "entropy": 0.09644881915301085,
      "epoch": 0.08589362405021474,
      "grad_norm": 18.287517547607422,
      "learning_rate": 1.29e-05,
      "loss": 17.8291,
      "mean_token_accuracy": 0.11415426433086395,
      "num_tokens": 532480.0,
      "step": 130
    },
    {
      "entropy": 1.061981100589037,
      "epoch": 0.09250082590023125,
      "grad_norm": 0.0,
      "learning_rate": 1.3900000000000002e-05,
      "loss": 73.0922,
      "mean_token_accuracy": 0.0463849127292633,
      "num_tokens": 573440.0,
      "step": 140
    },
    {
      "entropy": 1.7951227528974414,
      "epoch": 0.09910802775024777,
      "grad_norm": 0.0,
      "learning_rate": 1.49e-05,
      "loss": 26.185,
      "mean_token_accuracy": 0.043432857096195224,
      "num_tokens": 614400.0,
      "step": 150
    },
    {
      "entropy": 0.100301666976884,
      "epoch": 0.10571522960026429,
      "grad_norm": 0.0,
      "learning_rate": 1.59e-05,
      "loss": 45.3161,
      "mean_token_accuracy": 0.08393882662057876,
      "num_tokens": 655360.0,
      "step": 160
    },
    {
      "entropy": 0.7640071153640747,
      "epoch": 0.11232243145028081,
      "grad_norm": 0.0,
      "learning_rate": 1.69e-05,
      "loss": 61.7114,
      "mean_token_accuracy": 0.04098484814167023,
      "num_tokens": 696320.0,
      "step": 170
    },
    {
      "entropy": 0.3028068076006093,
      "epoch": 0.11892963330029732,
      "grad_norm": 0.0,
      "learning_rate": 1.79e-05,
      "loss": 57.9743,
      "mean_token_accuracy": 0.07434001564979553,
      "num_tokens": 737280.0,
      "step": 180
    },
    {
      "entropy": 0.02740152989870239,
      "epoch": 0.12553683515031383,
      "grad_norm": 30.224332809448242,
      "learning_rate": 1.8900000000000002e-05,
      "loss": 43.8902,
      "mean_token_accuracy": 0.06686745882034302,
      "num_tokens": 778240.0,
      "step": 190
    },
    {
      "entropy": 0.00011176439667224259,
      "epoch": 0.13214403700033037,
      "grad_norm": 31.883712768554688,
      "learning_rate": 1.9900000000000003e-05,
      "loss": 0.0793,
      "mean_token_accuracy": 0.02380952388048172,
      "num_tokens": 819200.0,
      "step": 200
    },
    {
      "entropy": 0.005680033719772837,
      "epoch": 0.13875123885034688,
      "grad_norm": 33.85360336303711,
      "learning_rate": 2.09e-05,
      "loss": 0.2071,
      "mean_token_accuracy": 0.015425531566143036,
      "num_tokens": 860160.0,
      "step": 210
    },
    {
      "entropy": 0.0075609647078181295,
      "epoch": 0.14535844070036338,
      "grad_norm": 19.486099243164062,
      "learning_rate": 2.19e-05,
      "loss": 1.201,
      "mean_token_accuracy": 0.07949066758155823,
      "num_tokens": 901120.0,
      "step": 220
    },
    {
      "entropy": 0.011990599616638065,
      "epoch": 0.15196564255037992,
      "grad_norm": 0.0,
      "learning_rate": 2.29e-05,
      "loss": 2.0033,
      "mean_token_accuracy": 0.02794431447982788,
      "num_tokens": 942080.0,
      "step": 230
    },
    {
      "entropy": 0.0031902942908345723,
      "epoch": 0.15857284440039643,
      "grad_norm": 0.0,
      "learning_rate": 2.39e-05,
      "loss": 1.1575,
      "mean_token_accuracy": 0.017129629850387573,
      "num_tokens": 983040.0,
      "step": 240
    },
    {
      "entropy": 0.14381448374479078,
      "epoch": 0.16518004625041294,
      "grad_norm": 0.0,
      "learning_rate": 2.4900000000000002e-05,
      "loss": 0.7004,
      "mean_token_accuracy": 0.021666666865348815,
      "num_tokens": 1024000.0,
      "step": 250
    },
    {
      "entropy": 3.897232323416836,
      "epoch": 0.17178724810042947,
      "grad_norm": 0.0,
      "learning_rate": 2.5900000000000003e-05,
      "loss": 0.812,
      "mean_token_accuracy": 0.039259061217308044,
      "num_tokens": 1064960.0,
      "step": 260
    },
    {
      "entropy": 0.019721664518976723,
      "epoch": 0.17839444995044598,
      "grad_norm": 0.0,
      "learning_rate": 2.6900000000000003e-05,
      "loss": 2.7005,
      "mean_token_accuracy": 0.05297277569770813,
      "num_tokens": 1105920.0,
      "step": 270
    },
    {
      "entropy": 0.02900617156851304,
      "epoch": 0.1850016518004625,
      "grad_norm": 18.309810638427734,
      "learning_rate": 2.7900000000000004e-05,
      "loss": 0.6091,
      "mean_token_accuracy": 0.07364537045359612,
      "num_tokens": 1146880.0,
      "step": 280
    },
    {
      "entropy": 0.03415291602525096,
      "epoch": 0.19160885365047903,
      "grad_norm": 0.0,
      "learning_rate": 2.8899999999999998e-05,
      "loss": 19.6735,
      "mean_token_accuracy": 0.10901817381381988,
      "num_tokens": 1187840.0,
      "step": 290
    },
    {
      "entropy": 0.03262247524494468,
      "epoch": 0.19821605550049554,
      "grad_norm": 0.0,
      "learning_rate": 2.9900000000000002e-05,
      "loss": 10.0336,
      "mean_token_accuracy": 0.06002416908740997,
      "num_tokens": 1228800.0,
      "step": 300
    },
    {
      "entropy": 0.003781934377029783,
      "epoch": 0.20482325735051204,
      "grad_norm": 89.78754425048828,
      "learning_rate": 3.09e-05,
      "loss": 7.6655,
      "mean_token_accuracy": 0.07553683370351791,
      "num_tokens": 1269760.0,
      "step": 310
    },
    {
      "entropy": 0.02375032416614431,
      "epoch": 0.21143045920052858,
      "grad_norm": 0.0,
      "learning_rate": 3.19e-05,
      "loss": 55.6063,
      "mean_token_accuracy": 0.08025674372911454,
      "num_tokens": 1310720.0,
      "step": 320
    },
    {
      "entropy": 0.0065040366395965775,
      "epoch": 0.2180376610505451,
      "grad_norm": 0.0,
      "learning_rate": 3.29e-05,
      "loss": 10.669,
      "mean_token_accuracy": 0.01593567281961441,
      "num_tokens": 1351680.0,
      "step": 330
    },
    {
      "entropy": 0.003754374942396499,
      "epoch": 0.22464486290056163,
      "grad_norm": 0.0,
      "learning_rate": 3.3900000000000004e-05,
      "loss": 0.3125,
      "mean_token_accuracy": 0.042081044614315034,
      "num_tokens": 1392640.0,
      "step": 340
    },
    {
      "entropy": 0.0002896140339728959,
      "epoch": 0.23125206475057813,
      "grad_norm": 16.06511878967285,
      "learning_rate": 3.49e-05,
      "loss": 0.0652,
      "mean_token_accuracy": 0.02388888895511627,
      "num_tokens": 1433600.0,
      "step": 350
    },
    {
      "entropy": 0.0001928646220630714,
      "epoch": 0.23785926660059464,
      "grad_norm": 0.0,
      "learning_rate": 3.59e-05,
      "loss": 1.1458,
      "mean_token_accuracy": 0.02368421107530594,
      "num_tokens": 1474560.0,
      "step": 360
    },
    {
      "entropy": 0.0003226411399264606,
      "epoch": 0.24446646845061118,
      "grad_norm": 0.0,
      "learning_rate": 3.69e-05,
      "loss": 4.4569,
      "mean_token_accuracy": 0.04751228392124176,
      "num_tokens": 1515520.0,
      "step": 370
    },
    {
      "entropy": 0.030229031606691593,
      "epoch": 0.25107367030062766,
      "grad_norm": 0.0,
      "learning_rate": 3.79e-05,
      "loss": 4.4461,
      "mean_token_accuracy": 0.06874987185001373,
      "num_tokens": 1556480.0,
      "step": 380
    },
    {
      "entropy": 0.014421097982523178,
      "epoch": 0.2576808721506442,
      "grad_norm": 0.0,
      "learning_rate": 3.8900000000000004e-05,
      "loss": 89.155,
      "mean_token_accuracy": 0.07968828082084656,
      "num_tokens": 1597440.0,
      "step": 390
    },
    {
      "entropy": 0.018559856562546172,
      "epoch": 0.26428807400066073,
      "grad_norm": 0.0,
      "learning_rate": 3.99e-05,
      "loss": 1.1971,
      "mean_token_accuracy": 0.04879816025495529,
      "num_tokens": 1638400.0,
      "step": 400
    },
    {
      "entropy": 0.0025903429913086027,
      "epoch": 0.27089527585067724,
      "grad_norm": 0.0,
      "learning_rate": 4.09e-05,
      "loss": 46.7922,
      "mean_token_accuracy": 0.06377801150083542,
      "num_tokens": 1679360.0,
      "step": 410
    },
    {
      "entropy": 0.0014362634548334085,
      "epoch": 0.27750247770069375,
      "grad_norm": 0.0,
      "learning_rate": 4.19e-05,
      "loss": 0.2403,
      "mean_token_accuracy": 0.019583334028720856,
      "num_tokens": 1720320.0,
      "step": 420
    },
    {
      "entropy": 0.01779812537132557,
      "epoch": 0.28410967955071026,
      "grad_norm": 0.0,
      "learning_rate": 4.29e-05,
      "loss": 3.4174,
      "mean_token_accuracy": 0.09800765216350556,
      "num_tokens": 1761280.0,
      "step": 430
    },
    {
      "entropy": 0.00543891376372585,
      "epoch": 0.29071688140072677,
      "grad_norm": 0.0,
      "learning_rate": 4.39e-05,
      "loss": 1.7927,
      "mean_token_accuracy": 0.10844782143831252,
      "num_tokens": 1802240.0,
      "step": 440
    },
    {
      "entropy": 0.008688661311566648,
      "epoch": 0.29732408325074333,
      "grad_norm": 11.81262493133545,
      "learning_rate": 4.49e-05,
      "loss": 19.4753,
      "mean_token_accuracy": 0.03172447681427002,
      "num_tokens": 1843200.0,
      "step": 450
    },
    {
      "entropy": 0.03670089549409568,
      "epoch": 0.30393128510075984,
      "grad_norm": 0.0,
      "learning_rate": 4.5900000000000004e-05,
      "loss": 9.5863,
      "mean_token_accuracy": 0.09990399181842805,
      "num_tokens": 1884160.0,
      "step": 460
    },
    {
      "entropy": 0.0009635578760935459,
      "epoch": 0.31053848695077635,
      "grad_norm": 0.0,
      "learning_rate": 4.69e-05,
      "loss": 0.1027,
      "mean_token_accuracy": 0.047025862336158755,
      "num_tokens": 1925120.0,
      "step": 470
    },
    {
      "entropy": 0.03435224668937735,
      "epoch": 0.31714568880079286,
      "grad_norm": 0.0,
      "learning_rate": 4.79e-05,
      "loss": 9.3884,
      "mean_token_accuracy": 0.12481869906187057,
      "num_tokens": 1966080.0,
      "step": 480
    },
    {
      "entropy": 0.02036606671208574,
      "epoch": 0.32375289065080937,
      "grad_norm": 12.76618766784668,
      "learning_rate": 4.89e-05,
      "loss": 8.3534,
      "mean_token_accuracy": 0.025242918729782106,
      "num_tokens": 2007040.0,
      "step": 490
    },
    {
      "entropy": 0.005103755064374127,
      "epoch": 0.3303600925008259,
      "grad_norm": 10.067124366760254,
      "learning_rate": 4.99e-05,
      "loss": 0.996,
      "mean_token_accuracy": 0.03732142895460129,
      "num_tokens": 2048000.0,
      "step": 500
    },
    {
      "entropy": 0.02930727633283823,
      "epoch": 0.33696729435084244,
      "grad_norm": 13.147978782653809,
      "learning_rate": 4.999843636237961e-05,
      "loss": 2.6708,
      "mean_token_accuracy": 0.16797256022691726,
      "num_tokens": 2088960.0,
      "step": 510
    },
    {
      "entropy": 0.01542719653371023,
      "epoch": 0.34357449620085895,
      "grad_norm": 0.0,
      "learning_rate": 4.999303144641334e-05,
      "loss": 0.7947,
      "mean_token_accuracy": 0.029625000059604646,
      "num_tokens": 2129920.0,
      "step": 520
    },
    {
      "entropy": 0.011237168186926282,
      "epoch": 0.35018169805087546,
      "grad_norm": 0.0,
      "learning_rate": 4.9983766782431473e-05,
      "loss": 1.8663,
      "mean_token_accuracy": 0.015829145908355713,
      "num_tokens": 2170880.0,
      "step": 530
    },
    {
      "entropy": 0.021366176847368478,
      "epoch": 0.35678889990089196,
      "grad_norm": 0.0,
      "learning_rate": 4.997064380120359e-05,
      "loss": 0.7902,
      "mean_token_accuracy": 0.034427966177463534,
      "num_tokens": 2211840.0,
      "step": 540
    },
    {
      "entropy": 0.013086076875333674,
      "epoch": 0.3633961017509085,
      "grad_norm": 0.0,
      "learning_rate": 4.995366452935061e-05,
      "loss": 4.0451,
      "mean_token_accuracy": 0.03658333420753479,
      "num_tokens": 2252800.0,
      "step": 550
    },
    {
      "entropy": 0.15343078810255975,
      "epoch": 0.370003303600925,
      "grad_norm": 21.48503303527832,
      "learning_rate": 4.993283158903187e-05,
      "loss": 108.0904,
      "mean_token_accuracy": 0.03694373369216919,
      "num_tokens": 2293760.0,
      "step": 560
    },
    {
      "entropy": 0.08353904361720196,
      "epoch": 0.37661050545094155,
      "grad_norm": 0.0,
      "learning_rate": 4.9908148197540174e-05,
      "loss": 3.3117,
      "mean_token_accuracy": 0.05191558599472046,
      "num_tokens": 2334720.0,
      "step": 570
    },
    {
      "entropy": 0.0030798995423538143,
      "epoch": 0.38321770730095805,
      "grad_norm": 15.358031272888184,
      "learning_rate": 4.987961816680492e-05,
      "loss": 107.2357,
      "mean_token_accuracy": 0.040966108441352844,
      "num_tokens": 2375680.0,
      "step": 580
    },
    {
      "entropy": 0.0050722503918223085,
      "epoch": 0.38982490915097456,
      "grad_norm": 0.0,
      "learning_rate": 4.984724590280343e-05,
      "loss": 1.1264,
      "mean_token_accuracy": 0.01942567527294159,
      "num_tokens": 2416640.0,
      "step": 590
    },
    {
      "entropy": 0.025612023619032698,
      "epoch": 0.39643211100099107,
      "grad_norm": 0.0,
      "learning_rate": 4.981103640488048e-05,
      "loss": 1.0595,
      "mean_token_accuracy": 0.05857066810131073,
      "num_tokens": 2457600.0,
      "step": 600
    },
    {
      "entropy": 0.02604469430661993,
      "epoch": 0.4030393128510076,
      "grad_norm": 0.0,
      "learning_rate": 4.977099526497631e-05,
      "loss": 2.9458,
      "mean_token_accuracy": 0.11472017914056779,
      "num_tokens": 2498560.0,
      "step": 610
    },
    {
      "entropy": 1.34335226020159,
      "epoch": 0.4096465147010241,
      "grad_norm": 0.0,
      "learning_rate": 4.972712866676297e-05,
      "loss": 0.8969,
      "mean_token_accuracy": 0.0374501496553421,
      "num_tokens": 2539520.0,
      "step": 620
    },
    {
      "entropy": 0.017200125091973145,
      "epoch": 0.41625371655104065,
      "grad_norm": 6.167868614196777,
      "learning_rate": 4.96794433846894e-05,
      "loss": 0.7561,
      "mean_token_accuracy": 0.04967625737190247,
      "num_tokens": 2580480.0,
      "step": 630
    },
    {
      "entropy": 0.013742945558520164,
      "epoch": 0.42286091840105716,
      "grad_norm": 4.658291816711426,
      "learning_rate": 4.962794678293523e-05,
      "loss": 3.9976,
      "mean_token_accuracy": 0.09124306589365005,
      "num_tokens": 2621440.0,
      "step": 640
    },
    {
      "entropy": 0.018405587278946686,
      "epoch": 0.42946812025107367,
      "grad_norm": 0.0,
      "learning_rate": 4.957264681427348e-05,
      "loss": 1.2244,
      "mean_token_accuracy": 0.030132341384887695,
      "num_tokens": 2662400.0,
      "step": 650
    },
    {
      "entropy": 0.007059359534223398,
      "epoch": 0.4360753221010902,
      "grad_norm": 41.432003021240234,
      "learning_rate": 4.9513552018842426e-05,
      "loss": 5.2841,
      "mean_token_accuracy": 0.10671066045761109,
      "num_tokens": 2703360.0,
      "step": 660
    },
    {
      "entropy": 0.004513938081906041,
      "epoch": 0.4426825239511067,
      "grad_norm": 11.688268661499023,
      "learning_rate": 4.9450671522826694e-05,
      "loss": 0.4656,
      "mean_token_accuracy": 0.0375,
      "num_tokens": 2744320.0,
      "step": 670
    },
    {
      "entropy": 6.520482699785834e-12,
      "epoch": 0.44928972580112325,
      "grad_norm": 0.0,
      "learning_rate": 4.938401503704789e-05,
      "loss": 0.0,
      "mean_token_accuracy": 0.0,
      "num_tokens": 2785280.0,
      "step": 680
    },
    {
      "entropy": 0.019155432335637948,
      "epoch": 0.45589692765113976,
      "grad_norm": 7.324256420135498,
      "learning_rate": 4.9313592855464916e-05,
      "loss": 8.6945,
      "mean_token_accuracy": 0.07001543641090394,
      "num_tokens": 2826240.0,
      "step": 690
    },
    {
      "entropy": 0.0031058518021977906,
      "epoch": 0.46250412950115627,
      "grad_norm": 0.0,
      "learning_rate": 4.923941585358426e-05,
      "loss": 24.0465,
      "mean_token_accuracy": 0.039618226885795596,
      "num_tokens": 2867200.0,
      "step": 700
    },
    {
      "entropy": 0.011669352487973095,
      "epoch": 0.4691113313511728,
      "grad_norm": 0.0,
      "learning_rate": 4.9161495486780456e-05,
      "loss": 3.7727,
      "mean_token_accuracy": 0.14685782939195632,
      "num_tokens": 2908160.0,
      "step": 710
    },
    {
      "entropy": 0.011068030085762338,
      "epoch": 0.4757185332011893,
      "grad_norm": 0.0,
      "learning_rate": 4.907984378852699e-05,
      "loss": 4.0753,
      "mean_token_accuracy": 0.05300231724977493,
      "num_tokens": 2949120.0,
      "step": 720
    },
    {
      "entropy": 0.019895143689083482,
      "epoch": 0.4823257350512058,
      "grad_norm": 0.0,
      "learning_rate": 4.8994473368537916e-05,
      "loss": 3.4842,
      "mean_token_accuracy": 0.06887777000665665,
      "num_tokens": 2990080.0,
      "step": 730
    },
    {
      "entropy": 0.00012087312168218567,
      "epoch": 0.48893293690122236,
      "grad_norm": 0.0,
      "learning_rate": 4.8905397410820554e-05,
      "loss": 0.4551,
      "mean_token_accuracy": 0.02083333283662796,
      "num_tokens": 3031040.0,
      "step": 740
    },
    {
      "entropy": 0.011406292311664856,
      "epoch": 0.49554013875123887,
      "grad_norm": 0.0,
      "learning_rate": 4.881262967163939e-05,
      "loss": 25.7439,
      "mean_token_accuracy": 0.08733686208724975,
      "num_tokens": 3072000.0,
      "step": 750
    },
    {
      "entropy": 0.007139437921159697,
      "epoch": 0.5021473406012553,
      "grad_norm": 0.0,
      "learning_rate": 4.8716184477391675e-05,
      "loss": 3.6028,
      "mean_token_accuracy": 0.03767942488193512,
      "num_tokens": 3112960.0,
      "step": 760
    },
    {
      "entropy": 0.011418092579742734,
      "epoch": 0.5087545424512719,
      "grad_norm": 0.0,
      "learning_rate": 4.8616076722394974e-05,
      "loss": 3.0005,
      "mean_token_accuracy": 0.07940062433481217,
      "num_tokens": 3153920.0,
      "step": 770
    },
    {
      "entropy": 0.006054546175801079,
      "epoch": 0.5153617443012884,
      "grad_norm": 17.855485916137695,
      "learning_rate": 4.8512321866586985e-05,
      "loss": 0.2468,
      "mean_token_accuracy": 0.03400537669658661,
      "num_tokens": 3194880.0,
      "step": 780
    },
    {
      "entropy": 0.02499405129653951,
      "epoch": 0.521968946151305,
      "grad_norm": 0.0,
      "learning_rate": 4.840493593313798e-05,
      "loss": 7.3568,
      "mean_token_accuracy": 0.06639731675386429,
      "num_tokens": 3235840.0,
      "step": 790
    },
    {
      "entropy": 0.030509774422012016,
      "epoch": 0.5285761480013215,
      "grad_norm": 8.936971664428711,
      "learning_rate": 4.829393550597633e-05,
      "loss": 2.2173,
      "mean_token_accuracy": 0.0885118618607521,
      "num_tokens": 3276800.0,
      "step": 800
    },
    {
      "entropy": 0.008201998914637442,
      "epoch": 0.535183349851338,
      "grad_norm": 0.0,
      "learning_rate": 4.8179337727227424e-05,
      "loss": 10.5952,
      "mean_token_accuracy": 0.03566812574863434,
      "num_tokens": 3317760.0,
      "step": 810
    },
    {
      "entropy": 0.0163260028579316,
      "epoch": 0.5417905517013545,
      "grad_norm": 8.472152709960938,
      "learning_rate": 4.806116029456631e-05,
      "loss": 9.1567,
      "mean_token_accuracy": 0.02788277491927147,
      "num_tokens": 3358720.0,
      "step": 820
    },
    {
      "entropy": 0.014577616745373233,
      "epoch": 0.548397753551371,
      "grad_norm": 0.0,
      "learning_rate": 4.7939421458484604e-05,
      "loss": 14.4947,
      "mean_token_accuracy": 0.038144654035568236,
      "num_tokens": 3399680.0,
      "step": 830
    },
    {
      "entropy": 0.060555978957563636,
      "epoch": 0.5550049554013875,
      "grad_norm": 5.755632400512695,
      "learning_rate": 4.781414001947205e-05,
      "loss": 17.9944,
      "mean_token_accuracy": 0.08551393002271652,
      "num_tokens": 3440640.0,
      "step": 840
    },
    {
      "entropy": 0.01132769331743475,
      "epoch": 0.561612157251404,
      "grad_norm": 0.0,
      "learning_rate": 4.768533532511306e-05,
      "loss": 54.7683,
      "mean_token_accuracy": 0.034085105359554294,
      "num_tokens": 3481600.0,
      "step": 850
    },
    {
      "entropy": 0.03579027488522115,
      "epoch": 0.5682193591014205,
      "grad_norm": 1.5527058839797974,
      "learning_rate": 4.755302726709882e-05,
      "loss": 48.52,
      "mean_token_accuracy": 0.06784629076719284,
      "num_tokens": 3522560.0,
      "step": 860
    },
    {
      "entropy": 0.025357943994458766,
      "epoch": 0.574826560951437,
      "grad_norm": 0.0,
      "learning_rate": 4.741723627815536e-05,
      "loss": 8.7313,
      "mean_token_accuracy": 0.06684041172266006,
      "num_tokens": 3563520.0,
      "step": 870
    },
    {
      "entropy": 0.003138951886649011,
      "epoch": 0.5814337628014535,
      "grad_norm": 0.7629010677337646,
      "learning_rate": 4.7277983328888084e-05,
      "loss": 4.1401,
      "mean_token_accuracy": 0.0589192196726799,
      "num_tokens": 3604480.0,
      "step": 880
    },
    {
      "entropy": 0.008384830836621403,
      "epoch": 0.5880409646514702,
      "grad_norm": 0.0,
      "learning_rate": 4.7135289924543197e-05,
      "loss": 4.0731,
      "mean_token_accuracy": 0.032692307233810426,
      "num_tokens": 3645440.0,
      "step": 890
    },
    {
      "entropy": 0.006141025204078688,
      "epoch": 0.5946481665014867,
      "grad_norm": 7.293692588806152,
      "learning_rate": 4.6989178101686584e-05,
      "loss": 0.9345,
      "mean_token_accuracy": 0.06051216870546341,
      "num_tokens": 3686400.0,
      "step": 900
    },
    {
      "entropy": 0.057043678060654204,
      "epoch": 0.6012553683515032,
      "grad_norm": 4.391047954559326,
      "learning_rate": 4.683967042480064e-05,
      "loss": 3.9633,
      "mean_token_accuracy": 0.12203554809093475,
      "num_tokens": 3727360.0,
      "step": 910
    },
    {
      "entropy": 0.012308508742717095,
      "epoch": 0.6078625702015197,
      "grad_norm": 2.2332639694213867,
      "learning_rate": 4.668678998279961e-05,
      "loss": 0.6798,
      "mean_token_accuracy": 0.10872009098529815,
      "num_tokens": 3768320.0,
      "step": 920
    },
    {
      "entropy": 0.06679710658499971,
      "epoch": 0.6144697720515362,
      "grad_norm": 0.0,
      "learning_rate": 4.653056038546381e-05,
      "loss": 1.5921,
      "mean_token_accuracy": 0.020000000298023225,
      "num_tokens": 3809280.0,
      "step": 930
    },
    {
      "entropy": 0.41163066651206465,
      "epoch": 0.6210769739015527,
      "grad_norm": 0.0,
      "learning_rate": 4.637100575979361e-05,
      "loss": 1.009,
      "mean_token_accuracy": 0.029701614379882814,
      "num_tokens": 3850240.0,
      "step": 940
    },
    {
      "entropy": 0.005564717383629158,
      "epoch": 0.6276841757515692,
      "grad_norm": 8.103035926818848,
      "learning_rate": 4.620815074628333e-05,
      "loss": 0.4482,
      "mean_token_accuracy": 0.060583002865314484,
      "num_tokens": 3891200.0,
      "step": 950
    },
    {
      "entropy": 0.006938851803111312,
      "epoch": 0.6342913776015857,
      "grad_norm": 0.0,
      "learning_rate": 4.6042020495116005e-05,
      "loss": 12.1827,
      "mean_token_accuracy": 0.03860159516334534,
      "num_tokens": 3932160.0,
      "step": 960
    },
    {
      "entropy": 0.025441451920087132,
      "epoch": 0.6408985794516022,
      "grad_norm": 0.0,
      "learning_rate": 4.587264066227933e-05,
      "loss": 4.7399,
      "mean_token_accuracy": 0.045529181510210036,
      "num_tokens": 3973120.0,
      "step": 970
    },
    {
      "entropy": 0.0037336295128966414,
      "epoch": 0.6475057813016187,
      "grad_norm": 0.0,
      "learning_rate": 4.570003740560352e-05,
      "loss": 9.9644,
      "mean_token_accuracy": 0.032598039507865904,
      "num_tokens": 4014080.0,
      "step": 980
    },
    {
      "entropy": 0.02236287235959935,
      "epoch": 0.6541129831516352,
      "grad_norm": 0.0,
      "learning_rate": 4.5524237380721755e-05,
      "loss": 25.0121,
      "mean_token_accuracy": 0.09488918632268906,
      "num_tokens": 4055040.0,
      "step": 990
    },
    {
      "entropy": 0.004253433111853155,
      "epoch": 0.6607201850016517,
      "grad_norm": 0.0,
      "learning_rate": 4.5345267736953566e-05,
      "loss": 2.2839,
      "mean_token_accuracy": 0.061769942939281466,
      "num_tokens": 4096000.0,
      "step": 1000
    },
    {
      "entropy": 0.004871105551137589,
      "epoch": 0.6673273868516684,
      "grad_norm": 0.0,
      "learning_rate": 4.516315611311215e-05,
      "loss": 0.4531,
      "mean_token_accuracy": 0.03642857223749161,
      "num_tokens": 4136960.0,
      "step": 1010
    },
    {
      "entropy": 0.025206556524062763,
      "epoch": 0.6739345887016849,
      "grad_norm": 0.0,
      "learning_rate": 4.4977930633236e-05,
      "loss": 1.3895,
      "mean_token_accuracy": 0.0927792876958847,
      "num_tokens": 4177920.0,
      "step": 1020
    },
    {
      "entropy": 0.006291561650141375,
      "epoch": 0.6805417905517014,
      "grad_norm": 0.0,
      "learning_rate": 4.478961990224567e-05,
      "loss": 3.6759,
      "mean_token_accuracy": 0.06052185446023941,
      "num_tokens": 4218880.0,
      "step": 1030
    },
    {
      "entropy": 0.0068054500185098735,
      "epoch": 0.6871489924017179,
      "grad_norm": 7.908809185028076,
      "learning_rate": 4.4598253001526165e-05,
      "loss": 0.8344,
      "mean_token_accuracy": 0.04019886404275894,
      "num_tokens": 4259840.0,
      "step": 1040
    },
    {
      "entropy": 0.02280626448646217,
      "epoch": 0.6937561942517344,
      "grad_norm": 0.0,
      "learning_rate": 4.440385948443586e-05,
      "loss": 56.0997,
      "mean_token_accuracy": 0.05091123506426811,
      "num_tokens": 4300800.0,
      "step": 1050
    },
    {
      "entropy": 0.009011537862716069,
      "epoch": 0.7003633961017509,
      "grad_norm": 0.0,
      "learning_rate": 4.420646937174249e-05,
      "loss": 4.2313,
      "mean_token_accuracy": 0.10524300783872605,
      "num_tokens": 4341760.0,
      "step": 1060
    },
    {
      "entropy": 0.0018482632265659049,
      "epoch": 0.7069705979517674,
      "grad_norm": 20.32362937927246,
      "learning_rate": 4.400611314698693e-05,
      "loss": 0.0597,
      "mean_token_accuracy": 0.04590909034013748,
      "num_tokens": 4382720.0,
      "step": 1070
    },
    {
      "entropy": 0.017142248089658096,
      "epoch": 0.7135777998017839,
      "grad_norm": 0.0,
      "learning_rate": 4.3802821751775545e-05,
      "loss": 3.2115,
      "mean_token_accuracy": 0.03341048508882523,
      "num_tokens": 4423680.0,
      "step": 1080
    },
    {
      "entropy": 0.009434876564773731,
      "epoch": 0.7201850016518004,
      "grad_norm": 0.0,
      "learning_rate": 4.35966265810018e-05,
      "loss": 0.5134,
      "mean_token_accuracy": 0.05730164796113968,
      "num_tokens": 4464640.0,
      "step": 1090
    },
    {
      "entropy": 0.01804768085712567,
      "epoch": 0.726792203501817,
      "grad_norm": 6.497892379760742,
      "learning_rate": 4.338755947799779e-05,
      "loss": 0.3714,
      "mean_token_accuracy": 0.030211375653743745,
      "num_tokens": 4505600.0,
      "step": 1100
    },
    {
      "entropy": 0.010639433527830987,
      "epoch": 0.7333994053518335,
      "grad_norm": 0.0,
      "learning_rate": 4.317565272961668e-05,
      "loss": 5.4826,
      "mean_token_accuracy": 0.03939753919839859,
      "num_tokens": 4546560.0,
      "step": 1110
    },
    {
      "entropy": 0.009574767109006644,
      "epoch": 0.74000660720185,
      "grad_norm": 1.7740790843963623,
      "learning_rate": 4.296093906124648e-05,
      "loss": 0.0149,
      "mean_token_accuracy": 0.024193547666072845,
      "num_tokens": 4587520.0,
      "step": 1120
    },
    {
      "entropy": 0.016046867379918693,
      "epoch": 0.7466138090518666,
      "grad_norm": 5.654629230499268,
      "learning_rate": 4.274345163175617e-05,
      "loss": 0.1119,
      "mean_token_accuracy": 0.020338982343673706,
      "num_tokens": 4628480.0,
      "step": 1130
    },
    {
      "entropy": 0.011864034581230953,
      "epoch": 0.7532210109018831,
      "grad_norm": 0.0,
      "learning_rate": 4.252322402837491e-05,
      "loss": 2.6892,
      "mean_token_accuracy": 0.06476814448833465,
      "num_tokens": 4669440.0,
      "step": 1140
    },
    {
      "entropy": 0.016540668293600902,
      "epoch": 0.7598282127518996,
      "grad_norm": 16.53849220275879,
      "learning_rate": 4.2300290261505036e-05,
      "loss": 30.6023,
      "mean_token_accuracy": 0.0714763343334198,
      "num_tokens": 4710400.0,
      "step": 1150
    },
    {
      "entropy": 0.0230761248501949,
      "epoch": 0.7664354146019161,
      "grad_norm": 0.0,
      "learning_rate": 4.2074684759469746e-05,
      "loss": 2.7597,
      "mean_token_accuracy": 0.03069286197423935,
      "num_tokens": 4751360.0,
      "step": 1160
    },
    {
      "entropy": 0.011810722067821189,
      "epoch": 0.7730426164519326,
      "grad_norm": 0.0,
      "learning_rate": 4.1846442363196216e-05,
      "loss": 0.9111,
      "mean_token_accuracy": 0.10935835093259812,
      "num_tokens": 4792320.0,
      "step": 1170
    },
    {
      "entropy": 0.014862862655547815,
      "epoch": 0.7796498183019491,
      "grad_norm": 0.0,
      "learning_rate": 4.161559832083505e-05,
      "loss": 6.2322,
      "mean_token_accuracy": 0.07727646380662918,
      "num_tokens": 4833280.0,
      "step": 1180
    },
    {
      "entropy": 0.03382376935035154,
      "epoch": 0.7862570201519656,
      "grad_norm": 0.0,
      "learning_rate": 4.138218828231674e-05,
      "loss": 6.8809,
      "mean_token_accuracy": 0.08848222196102143,
      "num_tokens": 4874240.0,
      "step": 1190
    },
    {
      "entropy": 0.01027671142185227,
      "epoch": 0.7928642220019821,
      "grad_norm": 0.0,
      "learning_rate": 4.1146248293846226e-05,
      "loss": 19.8159,
      "mean_token_accuracy": 0.05517474114894867,
      "num_tokens": 4915200.0,
      "step": 1200
    },
    {
      "entropy": 0.0004816283415379985,
      "epoch": 0.7994714238519987,
      "grad_norm": 20.06401252746582,
      "learning_rate": 4.0907814792336086e-05,
      "loss": 0.1573,
      "mean_token_accuracy": 0.04738095104694366,
      "num_tokens": 4956160.0,
      "step": 1210
    },
    {
      "entropy": 0.0002817700983996474,
      "epoch": 0.8060786257020152,
      "grad_norm": 0.0,
      "learning_rate": 4.0666924599779545e-05,
      "loss": 1.3697,
      "mean_token_accuracy": 0.07327586263418198,
      "num_tokens": 4997120.0,
      "step": 1220
    },
    {
      "entropy": 0.022998044469534306,
      "epoch": 0.8126858275520317,
      "grad_norm": 0.0,
      "learning_rate": 4.042361491756389e-05,
      "loss": 75.5602,
      "mean_token_accuracy": 0.03188432902097702,
      "num_tokens": 5038080.0,
      "step": 1230
    },
    {
      "entropy": 0.006426294211951245,
      "epoch": 0.8192930294020482,
      "grad_norm": 0.0,
      "learning_rate": 4.017792332072541e-05,
      "loss": 43.3363,
      "mean_token_accuracy": 0.060617709159851076,
      "num_tokens": 5079040.0,
      "step": 1240
    },
    {
      "entropy": 0.04888666102579009,
      "epoch": 0.8259002312520648,
      "grad_norm": 5.245207786560059,
      "learning_rate": 3.992988775214651e-05,
      "loss": 10.3096,
      "mean_token_accuracy": 0.09515323489904404,
      "num_tokens": 5120000.0,
      "step": 1250
    },
    {
      "entropy": 0.0168402054007629,
      "epoch": 0.8325074331020813,
      "grad_norm": 0.0,
      "learning_rate": 3.967954651669612e-05,
      "loss": 22.3533,
      "mean_token_accuracy": 0.054832766950130465,
      "num_tokens": 5160960.0,
      "step": 1260
    },
    {
      "entropy": 1.2130512709518371e-05,
      "epoch": 0.8391146349520978,
      "grad_norm": 0.0,
      "learning_rate": 3.942693827531413e-05,
      "loss": 0.0,
      "mean_token_accuracy": 0.0,
      "num_tokens": 5201920.0,
      "step": 1270
    },
    {
      "entropy": 0.024002419360476777,
      "epoch": 0.8457218368021143,
      "grad_norm": 0.0,
      "learning_rate": 3.917210203904092e-05,
      "loss": 0.537,
      "mean_token_accuracy": 0.03225543200969696,
      "num_tokens": 5242880.0,
      "step": 1280
    },
    {
      "entropy": 0.007740778655079339,
      "epoch": 0.8523290386521308,
      "grad_norm": 0.0,
      "learning_rate": 3.891507716299268e-05,
      "loss": 66.1835,
      "mean_token_accuracy": 0.03432405143976212,
      "num_tokens": 5283840.0,
      "step": 1290
    },
    {
      "entropy": 0.01155800115434431,
      "epoch": 0.8589362405021473,
      "grad_norm": 4.934366703033447,
      "learning_rate": 3.865590334028378e-05,
      "loss": 0.4728,
      "mean_token_accuracy": 0.03674568980932236,
      "num_tokens": 5324800.0,
      "step": 1300
    },
    {
      "entropy": 0.027338066224683644,
      "epoch": 0.8655434423521638,
      "grad_norm": 0.0,
      "learning_rate": 3.83946205958968e-05,
      "loss": 6.3222,
      "mean_token_accuracy": 0.09498430639505387,
      "num_tokens": 5365760.0,
      "step": 1310
    },
    {
      "entropy": 0.020768357973520325,
      "epoch": 0.8721506442021804,
      "grad_norm": 0.0,
      "learning_rate": 3.813126928050132e-05,
      "loss": 16.5894,
      "mean_token_accuracy": 0.07061131447553634,
      "num_tokens": 5406720.0,
      "step": 1320
    },
    {
      "entropy": 0.008218960013255127,
      "epoch": 0.8787578460521969,
      "grad_norm": 7.263195991516113,
      "learning_rate": 3.78658900642225e-05,
      "loss": 0.1677,
      "mean_token_accuracy": 0.03645932972431183,
      "num_tokens": 5447680.0,
      "step": 1330
    },
    {
      "entropy": 0.06189728657391243,
      "epoch": 0.8853650479022134,
      "grad_norm": 0.0,
      "learning_rate": 3.759852393036025e-05,
      "loss": 10.2275,
      "mean_token_accuracy": 0.14302214086055756,
      "num_tokens": 5488640.0,
      "step": 1340
    },
    {
      "entropy": 0.03971352147527796,
      "epoch": 0.8919722497522299,
      "grad_norm": 0.0,
      "learning_rate": 3.732921216906006e-05,
      "loss": 5.071,
      "mean_token_accuracy": 0.028831233829259874,
      "num_tokens": 5529600.0,
      "step": 1350
    },
    {
      "entropy": 0.005447403571633913,
      "epoch": 0.8985794516022465,
      "grad_norm": 0.0,
      "learning_rate": 3.705799637093644e-05,
      "loss": 1.2257,
      "mean_token_accuracy": 0.06499999910593032,
      "num_tokens": 5570560.0,
      "step": 1360
    },
    {
      "entropy": 0.0004757829759910237,
      "epoch": 0.905186653452263,
      "grad_norm": 0.0,
      "learning_rate": 3.678491842064995e-05,
      "loss": 0.1014,
      "mean_token_accuracy": 0.0236486479640007,
      "num_tokens": 5611520.0,
      "step": 1370
    },
    {
      "entropy": 0.0005358504138712305,
      "epoch": 0.9117938553022795,
      "grad_norm": 0.0,
      "learning_rate": 3.651002049043883e-05,
      "loss": 1.0571,
      "mean_token_accuracy": 0.05,
      "num_tokens": 5652480.0,
      "step": 1380
    },
    {
      "entropy": 0.013598478100175272,
      "epoch": 0.918401057152296,
      "grad_norm": 4.50636625289917,
      "learning_rate": 3.623334503360625e-05,
      "loss": 0.2652,
      "mean_token_accuracy": 0.036414363980293275,
      "num_tokens": 5693440.0,
      "step": 1390
    },
    {
      "entropy": 0.00263449905833113,
      "epoch": 0.9250082590023125,
      "grad_norm": 0.0,
      "learning_rate": 3.595493477796405e-05,
      "loss": 32.6138,
      "mean_token_accuracy": 0.01770833283662796,
      "num_tokens": 5734400.0,
      "step": 1400
    },
    {
      "entropy": 0.0006027125244145281,
      "epoch": 0.931615460852329,
      "grad_norm": 0.0,
      "learning_rate": 3.5674832719234236e-05,
      "loss": 0.0309,
      "mean_token_accuracy": 0.025,
      "num_tokens": 5775360.0,
      "step": 1410
    },
    {
      "entropy": 0.04056153310375521,
      "epoch": 0.9382226627023456,
      "grad_norm": 4.126308441162109,
      "learning_rate": 3.539308211440896e-05,
      "loss": 37.0153,
      "mean_token_accuracy": 0.1092478021979332,
      "num_tokens": 5816320.0,
      "step": 1420
    },
    {
      "entropy": 0.025153067862265743,
      "epoch": 0.9448298645523621,
      "grad_norm": 4.144694805145264,
      "learning_rate": 3.510972647507024e-05,
      "loss": 0.7825,
      "mean_token_accuracy": 0.03231704980134964,
      "num_tokens": 5857280.0,
      "step": 1430
    },
    {
      "entropy": 0.013085571637930116,
      "epoch": 0.9514370664023786,
      "grad_norm": 0.0,
      "learning_rate": 3.482480956067036e-05,
      "loss": 3.2684,
      "mean_token_accuracy": 0.030653293430805206,
      "num_tokens": 5898240.0,
      "step": 1440
    },
    {
      "entropy": 0.03377640448525199,
      "epoch": 0.9580442682523951,
      "grad_norm": 4.88741397857666,
      "learning_rate": 3.453837537177392e-05,
      "loss": 8.4465,
      "mean_token_accuracy": 0.10374114364385605,
      "num_tokens": 5939200.0,
      "step": 1450
    },
    {
      "entropy": 0.0001492878371209372,
      "epoch": 0.9646514701024116,
      "grad_norm": 0.0,
      "learning_rate": 3.425046814326275e-05,
      "loss": 0.0,
      "mean_token_accuracy": 0.0,
      "num_tokens": 5980160.0,
      "step": 1460
    },
    {
      "entropy": 0.0095839648241963,
      "epoch": 0.9712586719524281,
      "grad_norm": 0.0,
      "learning_rate": 3.396113233750452e-05,
      "loss": 0.738,
      "mean_token_accuracy": 0.06367539763450622,
      "num_tokens": 6021120.0,
      "step": 1470
    },
    {
      "entropy": 0.008002570147255028,
      "epoch": 0.9778658738024447,
      "grad_norm": 0.0,
      "learning_rate": 3.3670412637486356e-05,
      "loss": 50.6298,
      "mean_token_accuracy": 0.03787661790847778,
      "num_tokens": 6062080.0,
      "step": 1480
    },
    {
      "entropy": 0.025678539301952696,
      "epoch": 0.9844730756524612,
      "grad_norm": 0.0,
      "learning_rate": 3.3378353939914274e-05,
      "loss": 58.3281,
      "mean_token_accuracy": 0.04199896454811096,
      "num_tokens": 6103040.0,
      "step": 1490
    },
    {
      "entropy": 0.009659759866462992,
      "epoch": 0.9910802775024777,
      "grad_norm": 0.0,
      "learning_rate": 3.3085001348279655e-05,
      "loss": 20.3392,
      "mean_token_accuracy": 0.05786270499229431,
      "num_tokens": 6144000.0,
      "step": 1500
    },
    {
      "entropy": 0.02696728161745341,
      "epoch": 0.9976874793524942,
      "grad_norm": 4.9037766456604,
      "learning_rate": 3.2790400165893765e-05,
      "loss": 24.4541,
      "mean_token_accuracy": 0.05242506861686706,
      "num_tokens": 6184960.0,
      "step": 1510
    },
    {
      "entropy": 0.03248862427522239,
      "epoch": 1.00396432111001,
      "grad_norm": 5.108238697052002,
      "learning_rate": 3.249459588889148e-05,
      "loss": 29.0479,
      "mean_token_accuracy": 0.07015540411597804,
      "num_tokens": 6223872.0,
      "step": 1520
    },
    {
      "entropy": 0.009673559664588539,
      "epoch": 1.0105715229600265,
      "grad_norm": 0.0,
      "learning_rate": 3.21976341992051e-05,
      "loss": 1.6678,
      "mean_token_accuracy": 0.06724415719509125,
      "num_tokens": 6264832.0,
      "step": 1530
    },
    {
      "entropy": 0.006153753565013176,
      "epoch": 1.017178724810043,
      "grad_norm": 0.0,
      "learning_rate": 3.189956095750964e-05,
      "loss": 13.6964,
      "mean_token_accuracy": 0.06525651663541794,
      "num_tokens": 6305792.0,
      "step": 1540
    },
    {
      "entropy": 0.01643671256279049,
      "epoch": 1.0237859266600595,
      "grad_norm": 0.0,
      "learning_rate": 3.160042219614039e-05,
      "loss": 0.2836,
      "mean_token_accuracy": 0.065067720413208,
      "num_tokens": 6346752.0,
      "step": 1550
    },
    {
      "entropy": 0.012418469145814015,
      "epoch": 1.030393128510076,
      "grad_norm": 0.0,
      "learning_rate": 3.130026411198397e-05,
      "loss": 3.5205,
      "mean_token_accuracy": 0.1062299519777298,
      "num_tokens": 6387712.0,
      "step": 1560
    },
    {
      "entropy": 0.00280168341632816,
      "epoch": 1.0370003303600925,
      "grad_norm": 0.0,
      "learning_rate": 3.099913305934407e-05,
      "loss": 9.8431,
      "mean_token_accuracy": 0.06536642909049988,
      "num_tokens": 6428672.0,
      "step": 1570
    },
    {
      "entropy": 0.0040143982156223505,
      "epoch": 1.043607532210109,
      "grad_norm": 0.0,
      "learning_rate": 3.0697075542782805e-05,
      "loss": 2.5969,
      "mean_token_accuracy": 0.04253978878259659,
      "num_tokens": 6469632.0,
      "step": 1580
    },
    {
      "entropy": 0.014350750502853771,
      "epoch": 1.0502147340601256,
      "grad_norm": 0.0,
      "learning_rate": 3.039413820993881e-05,
      "loss": 0.6221,
      "mean_token_accuracy": 0.05783200562000275,
      "num_tokens": 6510592.0,
      "step": 1590
    },
    {
      "entropy": 0.009398723715912638,
      "epoch": 1.056821935910142,
      "grad_norm": 0.0,
      "learning_rate": 3.0090367844323427e-05,
      "loss": 1.078,
      "mean_token_accuracy": 0.1340048998594284,
      "num_tokens": 6551552.0,
      "step": 1600
    },
    {
      "entropy": 0.00624132590037334,
      "epoch": 1.0634291377601586,
      "grad_norm": 0.0,
      "learning_rate": 2.9785811358095666e-05,
      "loss": 7.9215,
      "mean_token_accuracy": 0.06667087972164154,
      "num_tokens": 6592512.0,
      "step": 1610
    },
    {
      "entropy": 0.0015867714646446984,
      "epoch": 1.070036339610175,
      "grad_norm": 3.8178083896636963,
      "learning_rate": 2.9480515784817497e-05,
      "loss": 0.0385,
      "mean_token_accuracy": 0.0234375,
      "num_tokens": 6633472.0,
      "step": 1620
    },
    {
      "entropy": 0.026325100680696777,
      "epoch": 1.0766435414601916,
      "grad_norm": 0.0,
      "learning_rate": 2.9174528272190317e-05,
      "loss": 7.6648,
      "mean_token_accuracy": 0.1373343974351883,
      "num_tokens": 6674432.0,
      "step": 1630
    },
    {
      "entropy": 0.01830651660930016,
      "epoch": 1.083250743310208,
      "grad_norm": 0.0,
      "learning_rate": 2.8867896074773715e-05,
      "loss": 9.6631,
      "mean_token_accuracy": 0.10328344106674195,
      "num_tokens": 6715392.0,
      "step": 1640
    },
    {
      "entropy": 0.04101564450556907,
      "epoch": 1.0898579451602246,
      "grad_norm": 0.0,
      "learning_rate": 2.8560666546687932e-05,
      "loss": 0.5364,
      "mean_token_accuracy": 0.07200934141874313,
      "num_tokens": 6756352.0,
      "step": 1650
    },
    {
      "entropy": 0.030506200826857822,
      "epoch": 1.0964651470102411,
      "grad_norm": 0.0,
      "learning_rate": 2.8252887134300705e-05,
      "loss": 7.2533,
      "mean_token_accuracy": 0.12010089755058288,
      "num_tokens": 6797312.0,
      "step": 1660
    },
    {
      "entropy": 0.011131914072029759,
      "epoch": 1.1030723488602576,
      "grad_norm": 0.0,
      "learning_rate": 2.794460536890006e-05,
      "loss": 2.1866,
      "mean_token_accuracy": 0.03891718536615372,
      "num_tokens": 6838272.0,
      "step": 1670
    },
    {
      "entropy": 0.0067470017631421795,
      "epoch": 1.1096795507102741,
      "grad_norm": 0.0,
      "learning_rate": 2.7635868859353892e-05,
      "loss": 2.1451,
      "mean_token_accuracy": 0.08797054588794709,
      "num_tokens": 6879232.0,
      "step": 1680
    },
    {
      "entropy": 0.0010968264658004045,
      "epoch": 1.1162867525602906,
      "grad_norm": 0.0,
      "learning_rate": 2.732672528475756e-05,
      "loss": 4.1927,
      "mean_token_accuracy": 0.04846225529909134,
      "num_tokens": 6920192.0,
      "step": 1690
    },
    {
      "entropy": 0.014475939798285254,
      "epoch": 1.1228939544103071,
      "grad_norm": 3.6743106842041016,
      "learning_rate": 2.701722238707073e-05,
      "loss": 1.7459,
      "mean_token_accuracy": 0.08139636963605881,
      "num_tokens": 6961152.0,
      "step": 1700
    },
    {
      "entropy": 0.012516654322098475,
      "epoch": 1.1295011562603237,
      "grad_norm": 0.0,
      "learning_rate": 2.670740796374434e-05,
      "loss": 27.5715,
      "mean_token_accuracy": 0.08141163587570191,
      "num_tokens": 7002112.0,
      "step": 1710
    },
    {
      "entropy": 0.01645941206952557,
      "epoch": 1.1361083581103402,
      "grad_norm": 0.0,
      "learning_rate": 2.6397329860339216e-05,
      "loss": 0.5778,
      "mean_token_accuracy": 0.06432082802057267,
      "num_tokens": 7043072.0,
      "step": 1720
    },
    {
      "entropy": 0.0060044553698389794,
      "epoch": 1.1427155599603567,
      "grad_norm": 3.3929810523986816,
      "learning_rate": 2.608703596313704e-05,
      "loss": 0.2285,
      "mean_token_accuracy": 0.06034291386604309,
      "num_tokens": 7084032.0,
      "step": 1730
    },
    {
      "entropy": 0.006904965872308821,
      "epoch": 1.1493227618103732,
      "grad_norm": 0.0,
      "learning_rate": 2.5776574191745185e-05,
      "loss": 0.7553,
      "mean_token_accuracy": 0.08508117645978927,
      "num_tokens": 7124992.0,
      "step": 1740
    },
    {
      "entropy": 0.0003588718103856081,
      "epoch": 1.15592996366039,
      "grad_norm": 0.0,
      "learning_rate": 2.5465992491696346e-05,
      "loss": 0.0268,
      "mean_token_accuracy": 0.02421875,
      "num_tokens": 7165952.0,
      "step": 1750
    },
    {
      "entropy": 0.002262050682293193,
      "epoch": 1.1625371655104064,
      "grad_norm": 0.09779301285743713,
      "learning_rate": 2.5155338827044135e-05,
      "loss": 10.7155,
      "mean_token_accuracy": 0.06871064454317093,
      "num_tokens": 7206912.0,
      "step": 1760
    },
    {
      "entropy": 0.004658589258269785,
      "epoch": 1.169144367360423,
      "grad_norm": 0.0,
      "learning_rate": 2.484466117295588e-05,
      "loss": 0.5489,
      "mean_token_accuracy": 0.09281249940395356,
      "num_tokens": 7247872.0,
      "step": 1770
    },
    {
      "entropy": 0.0026989880995643032,
      "epoch": 1.1757515692104394,
      "grad_norm": 0.0,
      "learning_rate": 2.4534007508303664e-05,
      "loss": 0.2234,
      "mean_token_accuracy": 0.06663182377815247,
      "num_tokens": 7288832.0,
      "step": 1780
    },
    {
      "entropy": 0.0004317208723023214,
      "epoch": 1.182358771060456,
      "grad_norm": 0.0,
      "learning_rate": 2.4223425808254818e-05,
      "loss": 0.0728,
      "mean_token_accuracy": 0.045624999701976775,
      "num_tokens": 7329792.0,
      "step": 1790
    },
    {
      "entropy": 0.009598436806300015,
      "epoch": 1.1889659729104725,
      "grad_norm": 0.0,
      "learning_rate": 2.3912964036862963e-05,
      "loss": 16.6883,
      "mean_token_accuracy": 0.03912037014961243,
      "num_tokens": 7370752.0,
      "step": 1800
    },
    {
      "entropy": 0.010217096674750792,
      "epoch": 1.195573174760489,
      "grad_norm": 15.332958221435547,
      "learning_rate": 2.3602670139660786e-05,
      "loss": 7.1663,
      "mean_token_accuracy": 0.08217611610889435,
      "num_tokens": 7411712.0,
      "step": 1810
    },
    {
      "entropy": 0.009031679033341788,
      "epoch": 1.2021803766105055,
      "grad_norm": 0.0,
      "learning_rate": 2.329259203625567e-05,
      "loss": 1.0604,
      "mean_token_accuracy": 0.01841755360364914,
      "num_tokens": 7452672.0,
      "step": 1820
    },
    {
      "entropy": 0.006855891485770371,
      "epoch": 1.208787578460522,
      "grad_norm": 5.940762042999268,
      "learning_rate": 2.2982777612929283e-05,
      "loss": 2.9393,
      "mean_token_accuracy": 0.08348637223243713,
      "num_tokens": 7493632.0,
      "step": 1830
    },
    {
      "entropy": 0.00012862358862548718,
      "epoch": 1.2153947803105385,
      "grad_norm": 0.0,
      "learning_rate": 2.267327471524244e-05,
      "loss": 1.4553,
      "mean_token_accuracy": 0.024074074625968934,
      "num_tokens": 7534592.0,
      "step": 1840
    },
    {
      "entropy": 0.0011855133230255887,
      "epoch": 1.222001982160555,
      "grad_norm": 0.0,
      "learning_rate": 2.2364131140646114e-05,
      "loss": 0.1606,
      "mean_token_accuracy": 0.04429824501276016,
      "num_tokens": 7575552.0,
      "step": 1850
    },
    {
      "entropy": 0.0026850333628772203,
      "epoch": 1.2286091840105715,
      "grad_norm": 0.2782774865627289,
      "learning_rate": 2.2055394631099942e-05,
      "loss": 0.5211,
      "mean_token_accuracy": 0.09299242496490479,
      "num_tokens": 7616512.0,
      "step": 1860
    },
    {
      "entropy": 0.007756018704094458,
      "epoch": 1.235216385860588,
      "grad_norm": 0.0,
      "learning_rate": 2.1747112865699297e-05,
      "loss": 0.9564,
      "mean_token_accuracy": 0.06141776293516159,
      "num_tokens": 7657472.0,
      "step": 1870
    },
    {
      "entropy": 0.010400129779191048,
      "epoch": 1.2418235877106045,
      "grad_norm": 0.0,
      "learning_rate": 2.1439333453312084e-05,
      "loss": 3.3547,
      "mean_token_accuracy": 0.08667385131120682,
      "num_tokens": 7698432.0,
      "step": 1880
    },
    {
      "entropy": 0.015071574994362891,
      "epoch": 1.248430789560621,
      "grad_norm": 0.0,
      "learning_rate": 2.1132103925226294e-05,
      "loss": 2.6196,
      "mean_token_accuracy": 0.06390394121408463,
      "num_tokens": 7739392.0,
      "step": 1890
    },
    {
      "entropy": 0.0022723632777342574,
      "epoch": 1.2550379914106375,
      "grad_norm": 4.928098201751709,
      "learning_rate": 2.0825471727809692e-05,
      "loss": 0.4508,
      "mean_token_accuracy": 0.06901711821556092,
      "num_tokens": 7780352.0,
      "step": 1900
    },
    {
      "entropy": 0.004141029497259296,
      "epoch": 1.261645193260654,
      "grad_norm": 10.553857803344727,
      "learning_rate": 2.0519484215182498e-05,
      "loss": 26.983,
      "mean_token_accuracy": 0.04182367026805878,
      "num_tokens": 7821312.0,
      "step": 1910
    },
    {
      "entropy": 0.014739693689989508,
      "epoch": 1.2682523951106706,
      "grad_norm": 0.0,
      "learning_rate": 2.0214188641904337e-05,
      "loss": 5.3302,
      "mean_token_accuracy": 0.06254241913557053,
      "num_tokens": 7862272.0,
      "step": 1920
    },
    {
      "entropy": 0.004958675562374992,
      "epoch": 1.274859596960687,
      "grad_norm": 4.394237995147705,
      "learning_rate": 1.9909632155676583e-05,
      "loss": 1.385,
      "mean_token_accuracy": 0.0656544640660286,
      "num_tokens": 7903232.0,
      "step": 1930
    },
    {
      "entropy": 0.010434124947641977,
      "epoch": 1.2814667988107038,
      "grad_norm": 0.0,
      "learning_rate": 1.9605861790061194e-05,
      "loss": 0.8501,
      "mean_token_accuracy": 0.03979341834783554,
      "num_tokens": 7944192.0,
      "step": 1940
    },
    {
      "entropy": 0.018370524288911838,
      "epoch": 1.2880740006607203,
      "grad_norm": 0.0,
      "learning_rate": 1.9302924457217204e-05,
      "loss": 1.6869,
      "mean_token_accuracy": 0.07690805941820145,
      "num_tokens": 7985152.0,
      "step": 1950
    },
    {
      "entropy": 0.01128047517995583,
      "epoch": 1.2946812025107368,
      "grad_norm": 0.0,
      "learning_rate": 1.900086694065593e-05,
      "loss": 38.9167,
      "mean_token_accuracy": 0.05990941822528839,
      "num_tokens": 8026112.0,
      "step": 1960
    },
    {
      "entropy": 0.005127273782272823,
      "epoch": 1.3012884043607533,
      "grad_norm": 0.0,
      "learning_rate": 1.8699735888016036e-05,
      "loss": 1.9819,
      "mean_token_accuracy": 0.019821429252624513,
      "num_tokens": 8067072.0,
      "step": 1970
    },
    {
      "entropy": 0.0013086928360280582,
      "epoch": 1.3078956062107698,
      "grad_norm": 0.0,
      "learning_rate": 1.8399577803859623e-05,
      "loss": 1.2477,
      "mean_token_accuracy": 0.05,
      "num_tokens": 8108032.0,
      "step": 1980
    },
    {
      "entropy": 0.029465652829094323,
      "epoch": 1.3145028080607863,
      "grad_norm": 0.0,
      "learning_rate": 1.8100439042490368e-05,
      "loss": 10.0606,
      "mean_token_accuracy": 0.10168795138597489,
      "num_tokens": 8148992.0,
      "step": 1990
    },
    {
      "entropy": 0.0015846440888708456,
      "epoch": 1.3211100099108029,
      "grad_norm": 0.0,
      "learning_rate": 1.780236580079491e-05,
      "loss": 0.1299,
      "mean_token_accuracy": 0.048466117680072786,
      "num_tokens": 8189952.0,
      "step": 2000
    }
  ],
  "logging_steps": 10,
  "max_steps": 3028,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.012440723577897e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}