diff --git "a/checkpoint-1020/trainer_state.json" "b/checkpoint-1020/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1020/trainer_state.json" @@ -0,0 +1,8193 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 16.73104880581516, + "eval_steps": 500, + "global_step": 1020, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016614745586708203, + "grad_norm": 0.050998032093048096, + "learning_rate": 4.999991432639962e-05, + "loss": 0.5487, + "num_input_tokens_seen": 70408, + "step": 1 + }, + { + "epoch": 0.033229491173416406, + "grad_norm": 0.049370743334293365, + "learning_rate": 4.999965730618567e-05, + "loss": 0.4981, + "num_input_tokens_seen": 139640, + "step": 2 + }, + { + "epoch": 0.04984423676012461, + "grad_norm": 0.05077400803565979, + "learning_rate": 4.9999228941119745e-05, + "loss": 0.5505, + "num_input_tokens_seen": 223656, + "step": 3 + }, + { + "epoch": 0.06645898234683281, + "grad_norm": 0.04397282376885414, + "learning_rate": 4.999862923413781e-05, + "loss": 0.504, + "num_input_tokens_seen": 300688, + "step": 4 + }, + { + "epoch": 0.08307372793354102, + "grad_norm": 0.05225864797830582, + "learning_rate": 4.999785818935018e-05, + "loss": 0.4925, + "num_input_tokens_seen": 366368, + "step": 5 + }, + { + "epoch": 0.09968847352024922, + "grad_norm": 0.049482282251119614, + "learning_rate": 4.999691581204152e-05, + "loss": 0.4771, + "num_input_tokens_seen": 445808, + "step": 6 + }, + { + "epoch": 0.11630321910695743, + "grad_norm": 0.05594080314040184, + "learning_rate": 4.9995802108670775e-05, + "loss": 0.4986, + "num_input_tokens_seen": 522800, + "step": 7 + }, + { + "epoch": 0.13291796469366562, + "grad_norm": 0.051852282136678696, + "learning_rate": 4.999451708687114e-05, + "loss": 0.5171, + "num_input_tokens_seen": 599608, + "step": 8 + }, + { + "epoch": 0.14953271028037382, + "grad_norm": 0.045517683029174805, + "learning_rate": 4.9993060755450015e-05, + "loss": 0.5669, + "num_input_tokens_seen": 681424, + "step": 9 + }, + { + "epoch": 0.16614745586708204, + "grad_norm": 0.044325754046440125, + "learning_rate": 4.999143312438893e-05, + "loss": 0.4218, + "num_input_tokens_seen": 756744, + "step": 10 + }, + { + "epoch": 0.18276220145379024, + "grad_norm": 0.04328459873795509, + "learning_rate": 4.998963420484349e-05, + "loss": 0.434, + "num_input_tokens_seen": 842576, + "step": 11 + }, + { + "epoch": 0.19937694704049844, + "grad_norm": 0.04725787043571472, + "learning_rate": 4.998766400914329e-05, + "loss": 0.4287, + "num_input_tokens_seen": 917232, + "step": 12 + }, + { + "epoch": 0.21599169262720663, + "grad_norm": 0.03806879371404648, + "learning_rate": 4.9985522550791825e-05, + "loss": 0.3454, + "num_input_tokens_seen": 1006800, + "step": 13 + }, + { + "epoch": 0.23260643821391486, + "grad_norm": 0.05201176926493645, + "learning_rate": 4.998320984446641e-05, + "loss": 0.436, + "num_input_tokens_seen": 1085824, + "step": 14 + }, + { + "epoch": 0.24922118380062305, + "grad_norm": 0.047955628484487534, + "learning_rate": 4.9980725906018074e-05, + "loss": 0.4625, + "num_input_tokens_seen": 1164160, + "step": 15 + }, + { + "epoch": 0.26583592938733125, + "grad_norm": 0.05529098957777023, + "learning_rate": 4.997807075247146e-05, + "loss": 0.5035, + "num_input_tokens_seen": 1242264, + "step": 16 + }, + { + "epoch": 0.2824506749740395, + "grad_norm": 0.04751162976026535, + "learning_rate": 4.997524440202469e-05, + "loss": 0.4354, + "num_input_tokens_seen": 1325904, + "step": 17 + }, + { + "epoch": 0.29906542056074764, + "grad_norm": 0.06726882606744766, + "learning_rate": 4.9972246874049254e-05, + "loss": 0.5439, + "num_input_tokens_seen": 1385632, + "step": 18 + }, + { + "epoch": 0.31568016614745587, + "grad_norm": 0.05245920270681381, + "learning_rate": 4.996907818908987e-05, + "loss": 0.3727, + "num_input_tokens_seen": 1470632, + "step": 19 + }, + { + "epoch": 0.3322949117341641, + "grad_norm": 0.05745376646518707, + "learning_rate": 4.996573836886435e-05, + "loss": 0.4894, + "num_input_tokens_seen": 1547536, + "step": 20 + }, + { + "epoch": 0.34890965732087226, + "grad_norm": 0.056607529520988464, + "learning_rate": 4.9962227436263453e-05, + "loss": 0.3846, + "num_input_tokens_seen": 1615528, + "step": 21 + }, + { + "epoch": 0.3655244029075805, + "grad_norm": 0.06150667741894722, + "learning_rate": 4.995854541535071e-05, + "loss": 0.4362, + "num_input_tokens_seen": 1694352, + "step": 22 + }, + { + "epoch": 0.3821391484942887, + "grad_norm": 0.056484442204236984, + "learning_rate": 4.9954692331362294e-05, + "loss": 0.4438, + "num_input_tokens_seen": 1753776, + "step": 23 + }, + { + "epoch": 0.3987538940809969, + "grad_norm": 0.0704159140586853, + "learning_rate": 4.995066821070679e-05, + "loss": 0.4496, + "num_input_tokens_seen": 1809048, + "step": 24 + }, + { + "epoch": 0.4153686396677051, + "grad_norm": 0.06202029809355736, + "learning_rate": 4.994647308096509e-05, + "loss": 0.5096, + "num_input_tokens_seen": 1884264, + "step": 25 + }, + { + "epoch": 0.43198338525441327, + "grad_norm": 0.04237145930528641, + "learning_rate": 4.994210697089014e-05, + "loss": 0.3722, + "num_input_tokens_seen": 1981704, + "step": 26 + }, + { + "epoch": 0.4485981308411215, + "grad_norm": 0.06920398026704788, + "learning_rate": 4.9937569910406756e-05, + "loss": 0.4103, + "num_input_tokens_seen": 2044144, + "step": 27 + }, + { + "epoch": 0.4652128764278297, + "grad_norm": 0.062432270497083664, + "learning_rate": 4.9932861930611454e-05, + "loss": 0.357, + "num_input_tokens_seen": 2107584, + "step": 28 + }, + { + "epoch": 0.4818276220145379, + "grad_norm": 0.06791180372238159, + "learning_rate": 4.9927983063772196e-05, + "loss": 0.3889, + "num_input_tokens_seen": 2169248, + "step": 29 + }, + { + "epoch": 0.4984423676012461, + "grad_norm": 0.07219590991735458, + "learning_rate": 4.99229333433282e-05, + "loss": 0.3543, + "num_input_tokens_seen": 2230344, + "step": 30 + }, + { + "epoch": 0.5150571131879543, + "grad_norm": 0.0647474005818367, + "learning_rate": 4.9917712803889674e-05, + "loss": 0.3453, + "num_input_tokens_seen": 2302368, + "step": 31 + }, + { + "epoch": 0.5316718587746625, + "grad_norm": 0.07434642314910889, + "learning_rate": 4.991232148123761e-05, + "loss": 0.435, + "num_input_tokens_seen": 2369984, + "step": 32 + }, + { + "epoch": 0.5482866043613707, + "grad_norm": 0.05302443355321884, + "learning_rate": 4.990675941232353e-05, + "loss": 0.3981, + "num_input_tokens_seen": 2453032, + "step": 33 + }, + { + "epoch": 0.564901349948079, + "grad_norm": 0.053745292127132416, + "learning_rate": 4.990102663526924e-05, + "loss": 0.3755, + "num_input_tokens_seen": 2527464, + "step": 34 + }, + { + "epoch": 0.5815160955347871, + "grad_norm": 0.06717613339424133, + "learning_rate": 4.989512318936655e-05, + "loss": 0.3699, + "num_input_tokens_seen": 2597032, + "step": 35 + }, + { + "epoch": 0.5981308411214953, + "grad_norm": 0.071847103536129, + "learning_rate": 4.9889049115077005e-05, + "loss": 0.3705, + "num_input_tokens_seen": 2671704, + "step": 36 + }, + { + "epoch": 0.6147455867082036, + "grad_norm": 0.0460306741297245, + "learning_rate": 4.988280445403164e-05, + "loss": 0.3797, + "num_input_tokens_seen": 2767640, + "step": 37 + }, + { + "epoch": 0.6313603322949117, + "grad_norm": 0.053273387253284454, + "learning_rate": 4.987638924903067e-05, + "loss": 0.3799, + "num_input_tokens_seen": 2843720, + "step": 38 + }, + { + "epoch": 0.6479750778816199, + "grad_norm": 0.05600422993302345, + "learning_rate": 4.9869803544043166e-05, + "loss": 0.2866, + "num_input_tokens_seen": 2921472, + "step": 39 + }, + { + "epoch": 0.6645898234683282, + "grad_norm": 0.06414052098989487, + "learning_rate": 4.9863047384206835e-05, + "loss": 0.4115, + "num_input_tokens_seen": 2998400, + "step": 40 + }, + { + "epoch": 0.6812045690550363, + "grad_norm": 0.09214208275079727, + "learning_rate": 4.985612081582764e-05, + "loss": 0.3804, + "num_input_tokens_seen": 3059648, + "step": 41 + }, + { + "epoch": 0.6978193146417445, + "grad_norm": 0.0555964931845665, + "learning_rate": 4.98490238863795e-05, + "loss": 0.3121, + "num_input_tokens_seen": 3140184, + "step": 42 + }, + { + "epoch": 0.7144340602284528, + "grad_norm": 0.06256969273090363, + "learning_rate": 4.984175664450397e-05, + "loss": 0.3271, + "num_input_tokens_seen": 3207184, + "step": 43 + }, + { + "epoch": 0.731048805815161, + "grad_norm": 0.0543232187628746, + "learning_rate": 4.983431914000991e-05, + "loss": 0.364, + "num_input_tokens_seen": 3292344, + "step": 44 + }, + { + "epoch": 0.7476635514018691, + "grad_norm": 0.06077824532985687, + "learning_rate": 4.982671142387316e-05, + "loss": 0.3894, + "num_input_tokens_seen": 3365384, + "step": 45 + }, + { + "epoch": 0.7642782969885774, + "grad_norm": 0.06091070920228958, + "learning_rate": 4.981893354823614e-05, + "loss": 0.3354, + "num_input_tokens_seen": 3440720, + "step": 46 + }, + { + "epoch": 0.7808930425752856, + "grad_norm": 0.054153311997652054, + "learning_rate": 4.9810985566407544e-05, + "loss": 0.3058, + "num_input_tokens_seen": 3533576, + "step": 47 + }, + { + "epoch": 0.7975077881619937, + "grad_norm": 0.06662417948246002, + "learning_rate": 4.980286753286195e-05, + "loss": 0.4658, + "num_input_tokens_seen": 3599744, + "step": 48 + }, + { + "epoch": 0.814122533748702, + "grad_norm": 0.05790851265192032, + "learning_rate": 4.979457950323945e-05, + "loss": 0.3647, + "num_input_tokens_seen": 3689520, + "step": 49 + }, + { + "epoch": 0.8307372793354102, + "grad_norm": 0.10742159187793732, + "learning_rate": 4.9786121534345265e-05, + "loss": 0.343, + "num_input_tokens_seen": 3751808, + "step": 50 + }, + { + "epoch": 0.8473520249221184, + "grad_norm": 0.05565556138753891, + "learning_rate": 4.9777493684149375e-05, + "loss": 0.3317, + "num_input_tokens_seen": 3839096, + "step": 51 + }, + { + "epoch": 0.8639667705088265, + "grad_norm": 0.05752381682395935, + "learning_rate": 4.976869601178609e-05, + "loss": 0.38, + "num_input_tokens_seen": 3919824, + "step": 52 + }, + { + "epoch": 0.8805815160955348, + "grad_norm": 0.06406434625387192, + "learning_rate": 4.975972857755369e-05, + "loss": 0.2676, + "num_input_tokens_seen": 3989312, + "step": 53 + }, + { + "epoch": 0.897196261682243, + "grad_norm": 0.0653691440820694, + "learning_rate": 4.975059144291394e-05, + "loss": 0.3516, + "num_input_tokens_seen": 4060528, + "step": 54 + }, + { + "epoch": 0.9138110072689511, + "grad_norm": 0.06272953748703003, + "learning_rate": 4.974128467049176e-05, + "loss": 0.3004, + "num_input_tokens_seen": 4129368, + "step": 55 + }, + { + "epoch": 0.9304257528556594, + "grad_norm": 0.08054930716753006, + "learning_rate": 4.9731808324074717e-05, + "loss": 0.3009, + "num_input_tokens_seen": 4175208, + "step": 56 + }, + { + "epoch": 0.9470404984423676, + "grad_norm": 0.07523038238286972, + "learning_rate": 4.972216246861262e-05, + "loss": 0.2814, + "num_input_tokens_seen": 4218096, + "step": 57 + }, + { + "epoch": 0.9636552440290758, + "grad_norm": 0.07347433269023895, + "learning_rate": 4.971234717021709e-05, + "loss": 0.3321, + "num_input_tokens_seen": 4275968, + "step": 58 + }, + { + "epoch": 0.980269989615784, + "grad_norm": 0.05830248445272446, + "learning_rate": 4.9702362496161085e-05, + "loss": 0.2881, + "num_input_tokens_seen": 4346616, + "step": 59 + }, + { + "epoch": 0.9968847352024922, + "grad_norm": 0.061629410833120346, + "learning_rate": 4.9692208514878444e-05, + "loss": 0.2993, + "num_input_tokens_seen": 4425064, + "step": 60 + }, + { + "epoch": 1.0, + "grad_norm": 0.13380740582942963, + "learning_rate": 4.968188529596342e-05, + "loss": 0.2511, + "num_input_tokens_seen": 4435328, + "step": 61 + }, + { + "epoch": 1.0166147455867083, + "grad_norm": 0.0726238414645195, + "learning_rate": 4.9671392910170185e-05, + "loss": 0.3127, + "num_input_tokens_seen": 4500104, + "step": 62 + }, + { + "epoch": 1.0332294911734163, + "grad_norm": 0.05980083718895912, + "learning_rate": 4.966073142941239e-05, + "loss": 0.3601, + "num_input_tokens_seen": 4581976, + "step": 63 + }, + { + "epoch": 1.0498442367601246, + "grad_norm": 0.06445376574993134, + "learning_rate": 4.964990092676263e-05, + "loss": 0.3049, + "num_input_tokens_seen": 4652160, + "step": 64 + }, + { + "epoch": 1.066458982346833, + "grad_norm": 0.07824505120515823, + "learning_rate": 4.9638901476451946e-05, + "loss": 0.3099, + "num_input_tokens_seen": 4709368, + "step": 65 + }, + { + "epoch": 1.083073727933541, + "grad_norm": 0.058268457651138306, + "learning_rate": 4.962773315386935e-05, + "loss": 0.3273, + "num_input_tokens_seen": 4798256, + "step": 66 + }, + { + "epoch": 1.0996884735202492, + "grad_norm": 0.07069691270589828, + "learning_rate": 4.961639603556127e-05, + "loss": 0.282, + "num_input_tokens_seen": 4859200, + "step": 67 + }, + { + "epoch": 1.1163032191069575, + "grad_norm": 0.0775996670126915, + "learning_rate": 4.960489019923105e-05, + "loss": 0.3642, + "num_input_tokens_seen": 4925992, + "step": 68 + }, + { + "epoch": 1.1329179646936656, + "grad_norm": 0.07044171541929245, + "learning_rate": 4.9593215723738404e-05, + "loss": 0.2896, + "num_input_tokens_seen": 4998808, + "step": 69 + }, + { + "epoch": 1.1495327102803738, + "grad_norm": 0.05971802771091461, + "learning_rate": 4.958137268909887e-05, + "loss": 0.2578, + "num_input_tokens_seen": 5089672, + "step": 70 + }, + { + "epoch": 1.1661474558670821, + "grad_norm": 0.07145556062459946, + "learning_rate": 4.9569361176483286e-05, + "loss": 0.3243, + "num_input_tokens_seen": 5166744, + "step": 71 + }, + { + "epoch": 1.1827622014537902, + "grad_norm": 0.07455787807703018, + "learning_rate": 4.9557181268217227e-05, + "loss": 0.3949, + "num_input_tokens_seen": 5228264, + "step": 72 + }, + { + "epoch": 1.1993769470404985, + "grad_norm": 0.055582575500011444, + "learning_rate": 4.9544833047780394e-05, + "loss": 0.2877, + "num_input_tokens_seen": 5338224, + "step": 73 + }, + { + "epoch": 1.2159916926272065, + "grad_norm": 0.07675391435623169, + "learning_rate": 4.9532316599806124e-05, + "loss": 0.3152, + "num_input_tokens_seen": 5399848, + "step": 74 + }, + { + "epoch": 1.2326064382139148, + "grad_norm": 0.08048644661903381, + "learning_rate": 4.951963201008076e-05, + "loss": 0.2976, + "num_input_tokens_seen": 5468624, + "step": 75 + }, + { + "epoch": 1.249221183800623, + "grad_norm": 0.07579060643911362, + "learning_rate": 4.9506779365543046e-05, + "loss": 0.2982, + "num_input_tokens_seen": 5536776, + "step": 76 + }, + { + "epoch": 1.2658359293873311, + "grad_norm": 0.07828006893396378, + "learning_rate": 4.949375875428357e-05, + "loss": 0.3272, + "num_input_tokens_seen": 5609296, + "step": 77 + }, + { + "epoch": 1.2824506749740394, + "grad_norm": 0.08079098165035248, + "learning_rate": 4.9480570265544144e-05, + "loss": 0.2768, + "num_input_tokens_seen": 5663824, + "step": 78 + }, + { + "epoch": 1.2990654205607477, + "grad_norm": 0.07579358667135239, + "learning_rate": 4.94672139897172e-05, + "loss": 0.318, + "num_input_tokens_seen": 5742032, + "step": 79 + }, + { + "epoch": 1.3156801661474558, + "grad_norm": 0.07588379085063934, + "learning_rate": 4.9453690018345144e-05, + "loss": 0.3007, + "num_input_tokens_seen": 5816864, + "step": 80 + }, + { + "epoch": 1.332294911734164, + "grad_norm": 0.08709035068750381, + "learning_rate": 4.943999844411977e-05, + "loss": 0.2797, + "num_input_tokens_seen": 5881624, + "step": 81 + }, + { + "epoch": 1.3489096573208723, + "grad_norm": 0.05975884944200516, + "learning_rate": 4.94261393608816e-05, + "loss": 0.2591, + "num_input_tokens_seen": 5970272, + "step": 82 + }, + { + "epoch": 1.3655244029075804, + "grad_norm": 0.07372818142175674, + "learning_rate": 4.941211286361922e-05, + "loss": 0.2687, + "num_input_tokens_seen": 6058752, + "step": 83 + }, + { + "epoch": 1.3821391484942886, + "grad_norm": 0.09071576595306396, + "learning_rate": 4.939791904846869e-05, + "loss": 0.2979, + "num_input_tokens_seen": 6120064, + "step": 84 + }, + { + "epoch": 1.398753894080997, + "grad_norm": 0.0849960595369339, + "learning_rate": 4.938355801271282e-05, + "loss": 0.2927, + "num_input_tokens_seen": 6182072, + "step": 85 + }, + { + "epoch": 1.415368639667705, + "grad_norm": 0.08258760720491409, + "learning_rate": 4.936902985478055e-05, + "loss": 0.295, + "num_input_tokens_seen": 6269680, + "step": 86 + }, + { + "epoch": 1.4319833852544133, + "grad_norm": 0.0851503536105156, + "learning_rate": 4.935433467424624e-05, + "loss": 0.2925, + "num_input_tokens_seen": 6347424, + "step": 87 + }, + { + "epoch": 1.4485981308411215, + "grad_norm": 0.08852345496416092, + "learning_rate": 4.933947257182901e-05, + "loss": 0.3153, + "num_input_tokens_seen": 6412584, + "step": 88 + }, + { + "epoch": 1.4652128764278296, + "grad_norm": 0.08184897154569626, + "learning_rate": 4.932444364939205e-05, + "loss": 0.292, + "num_input_tokens_seen": 6482728, + "step": 89 + }, + { + "epoch": 1.4818276220145379, + "grad_norm": 0.08270515501499176, + "learning_rate": 4.9309248009941914e-05, + "loss": 0.3472, + "num_input_tokens_seen": 6562104, + "step": 90 + }, + { + "epoch": 1.4984423676012462, + "grad_norm": 0.07407747954130173, + "learning_rate": 4.929388575762782e-05, + "loss": 0.2995, + "num_input_tokens_seen": 6656552, + "step": 91 + }, + { + "epoch": 1.5150571131879542, + "grad_norm": 0.08710360527038574, + "learning_rate": 4.9278356997740904e-05, + "loss": 0.2549, + "num_input_tokens_seen": 6714184, + "step": 92 + }, + { + "epoch": 1.5316718587746625, + "grad_norm": 0.0773790255188942, + "learning_rate": 4.9262661836713564e-05, + "loss": 0.2814, + "num_input_tokens_seen": 6793552, + "step": 93 + }, + { + "epoch": 1.5482866043613708, + "grad_norm": 0.1002134457230568, + "learning_rate": 4.924680038211867e-05, + "loss": 0.2876, + "num_input_tokens_seen": 6865256, + "step": 94 + }, + { + "epoch": 1.5649013499480788, + "grad_norm": 0.09670394659042358, + "learning_rate": 4.9230772742668866e-05, + "loss": 0.2846, + "num_input_tokens_seen": 6931152, + "step": 95 + }, + { + "epoch": 1.5815160955347871, + "grad_norm": 0.08910100907087326, + "learning_rate": 4.9214579028215776e-05, + "loss": 0.2944, + "num_input_tokens_seen": 6998408, + "step": 96 + }, + { + "epoch": 1.5981308411214954, + "grad_norm": 0.09202459454536438, + "learning_rate": 4.919821934974933e-05, + "loss": 0.251, + "num_input_tokens_seen": 7053008, + "step": 97 + }, + { + "epoch": 1.6147455867082035, + "grad_norm": 0.10218881815671921, + "learning_rate": 4.918169381939692e-05, + "loss": 0.2851, + "num_input_tokens_seen": 7106440, + "step": 98 + }, + { + "epoch": 1.6313603322949117, + "grad_norm": 0.09290914982557297, + "learning_rate": 4.916500255042268e-05, + "loss": 0.2959, + "num_input_tokens_seen": 7167032, + "step": 99 + }, + { + "epoch": 1.64797507788162, + "grad_norm": 0.07791033387184143, + "learning_rate": 4.914814565722671e-05, + "loss": 0.2481, + "num_input_tokens_seen": 7245720, + "step": 100 + }, + { + "epoch": 1.664589823468328, + "grad_norm": 0.08885534107685089, + "learning_rate": 4.913112325534426e-05, + "loss": 0.3168, + "num_input_tokens_seen": 7326320, + "step": 101 + }, + { + "epoch": 1.6812045690550363, + "grad_norm": 0.08569750934839249, + "learning_rate": 4.9113935461444955e-05, + "loss": 0.2805, + "num_input_tokens_seen": 7442232, + "step": 102 + }, + { + "epoch": 1.6978193146417446, + "grad_norm": 0.1112508773803711, + "learning_rate": 4.9096582393332025e-05, + "loss": 0.2675, + "num_input_tokens_seen": 7502496, + "step": 103 + }, + { + "epoch": 1.7144340602284527, + "grad_norm": 0.09654372185468674, + "learning_rate": 4.907906416994146e-05, + "loss": 0.3038, + "num_input_tokens_seen": 7566496, + "step": 104 + }, + { + "epoch": 1.731048805815161, + "grad_norm": 0.10022995620965958, + "learning_rate": 4.906138091134118e-05, + "loss": 0.3639, + "num_input_tokens_seen": 7629056, + "step": 105 + }, + { + "epoch": 1.7476635514018692, + "grad_norm": 0.08336564153432846, + "learning_rate": 4.9043532738730284e-05, + "loss": 0.2944, + "num_input_tokens_seen": 7706096, + "step": 106 + }, + { + "epoch": 1.7642782969885773, + "grad_norm": 0.08539658784866333, + "learning_rate": 4.9025519774438136e-05, + "loss": 0.2392, + "num_input_tokens_seen": 7780072, + "step": 107 + }, + { + "epoch": 1.7808930425752856, + "grad_norm": 0.09139693528413773, + "learning_rate": 4.900734214192358e-05, + "loss": 0.2685, + "num_input_tokens_seen": 7857712, + "step": 108 + }, + { + "epoch": 1.7975077881619939, + "grad_norm": 0.1043916717171669, + "learning_rate": 4.898899996577407e-05, + "loss": 0.2513, + "num_input_tokens_seen": 7916832, + "step": 109 + }, + { + "epoch": 1.814122533748702, + "grad_norm": 0.09203662723302841, + "learning_rate": 4.8970493371704826e-05, + "loss": 0.2974, + "num_input_tokens_seen": 7993056, + "step": 110 + }, + { + "epoch": 1.8307372793354102, + "grad_norm": 0.09319474548101425, + "learning_rate": 4.8951822486557986e-05, + "loss": 0.3096, + "num_input_tokens_seen": 8090056, + "step": 111 + }, + { + "epoch": 1.8473520249221185, + "grad_norm": 0.10193445533514023, + "learning_rate": 4.893298743830168e-05, + "loss": 0.2633, + "num_input_tokens_seen": 8164808, + "step": 112 + }, + { + "epoch": 1.8639667705088265, + "grad_norm": 0.11407948285341263, + "learning_rate": 4.891398835602925e-05, + "loss": 0.2584, + "num_input_tokens_seen": 8223568, + "step": 113 + }, + { + "epoch": 1.8805815160955348, + "grad_norm": 0.11977085471153259, + "learning_rate": 4.8894825369958255e-05, + "loss": 0.2619, + "num_input_tokens_seen": 8276160, + "step": 114 + }, + { + "epoch": 1.897196261682243, + "grad_norm": 0.10925433784723282, + "learning_rate": 4.8875498611429674e-05, + "loss": 0.2762, + "num_input_tokens_seen": 8354904, + "step": 115 + }, + { + "epoch": 1.9138110072689511, + "grad_norm": 0.09673939645290375, + "learning_rate": 4.8856008212906925e-05, + "loss": 0.3152, + "num_input_tokens_seen": 8442584, + "step": 116 + }, + { + "epoch": 1.9304257528556594, + "grad_norm": 0.10827789455652237, + "learning_rate": 4.8836354307975026e-05, + "loss": 0.2759, + "num_input_tokens_seen": 8506688, + "step": 117 + }, + { + "epoch": 1.9470404984423677, + "grad_norm": 0.08390220254659653, + "learning_rate": 4.881653703133966e-05, + "loss": 0.2192, + "num_input_tokens_seen": 8610712, + "step": 118 + }, + { + "epoch": 1.9636552440290758, + "grad_norm": 0.09252211451530457, + "learning_rate": 4.87965565188262e-05, + "loss": 0.2618, + "num_input_tokens_seen": 8692624, + "step": 119 + }, + { + "epoch": 1.980269989615784, + "grad_norm": 0.1107102632522583, + "learning_rate": 4.877641290737884e-05, + "loss": 0.2666, + "num_input_tokens_seen": 8772208, + "step": 120 + }, + { + "epoch": 1.9968847352024923, + "grad_norm": 0.0917077362537384, + "learning_rate": 4.8756106335059646e-05, + "loss": 0.253, + "num_input_tokens_seen": 8854904, + "step": 121 + }, + { + "epoch": 2.0, + "grad_norm": 0.2606711685657501, + "learning_rate": 4.87356369410476e-05, + "loss": 0.235, + "num_input_tokens_seen": 8872656, + "step": 122 + }, + { + "epoch": 2.016614745586708, + "grad_norm": 0.10363993793725967, + "learning_rate": 4.8715004865637614e-05, + "loss": 0.266, + "num_input_tokens_seen": 8946480, + "step": 123 + }, + { + "epoch": 2.0332294911734166, + "grad_norm": 0.09997844696044922, + "learning_rate": 4.869421025023965e-05, + "loss": 0.2696, + "num_input_tokens_seen": 9023328, + "step": 124 + }, + { + "epoch": 2.0498442367601246, + "grad_norm": 0.13349319994449615, + "learning_rate": 4.867325323737765e-05, + "loss": 0.2552, + "num_input_tokens_seen": 9074320, + "step": 125 + }, + { + "epoch": 2.0664589823468327, + "grad_norm": 0.11201464384794235, + "learning_rate": 4.8652133970688636e-05, + "loss": 0.2486, + "num_input_tokens_seen": 9148784, + "step": 126 + }, + { + "epoch": 2.083073727933541, + "grad_norm": 0.10193142294883728, + "learning_rate": 4.8630852594921706e-05, + "loss": 0.2814, + "num_input_tokens_seen": 9246624, + "step": 127 + }, + { + "epoch": 2.0996884735202492, + "grad_norm": 0.1305130422115326, + "learning_rate": 4.860940925593703e-05, + "loss": 0.304, + "num_input_tokens_seen": 9328176, + "step": 128 + }, + { + "epoch": 2.1163032191069573, + "grad_norm": 0.1137692779302597, + "learning_rate": 4.8587804100704845e-05, + "loss": 0.2427, + "num_input_tokens_seen": 9388936, + "step": 129 + }, + { + "epoch": 2.132917964693666, + "grad_norm": 0.12126237154006958, + "learning_rate": 4.856603727730447e-05, + "loss": 0.2485, + "num_input_tokens_seen": 9461664, + "step": 130 + }, + { + "epoch": 2.149532710280374, + "grad_norm": 0.11567176878452301, + "learning_rate": 4.854410893492326e-05, + "loss": 0.2628, + "num_input_tokens_seen": 9535000, + "step": 131 + }, + { + "epoch": 2.166147455867082, + "grad_norm": 0.1399552971124649, + "learning_rate": 4.852201922385564e-05, + "loss": 0.2518, + "num_input_tokens_seen": 9600296, + "step": 132 + }, + { + "epoch": 2.1827622014537904, + "grad_norm": 0.13912151753902435, + "learning_rate": 4.8499768295502004e-05, + "loss": 0.2429, + "num_input_tokens_seen": 9686784, + "step": 133 + }, + { + "epoch": 2.1993769470404985, + "grad_norm": 0.11130474507808685, + "learning_rate": 4.847735630236773e-05, + "loss": 0.2775, + "num_input_tokens_seen": 9781112, + "step": 134 + }, + { + "epoch": 2.2159916926272065, + "grad_norm": 0.12169156968593597, + "learning_rate": 4.8454783398062106e-05, + "loss": 0.2439, + "num_input_tokens_seen": 9849528, + "step": 135 + }, + { + "epoch": 2.232606438213915, + "grad_norm": 0.11766713112592697, + "learning_rate": 4.843204973729729e-05, + "loss": 0.2538, + "num_input_tokens_seen": 9931080, + "step": 136 + }, + { + "epoch": 2.249221183800623, + "grad_norm": 0.11854218691587448, + "learning_rate": 4.840915547588725e-05, + "loss": 0.2782, + "num_input_tokens_seen": 10011176, + "step": 137 + }, + { + "epoch": 2.265835929387331, + "grad_norm": 0.1340581178665161, + "learning_rate": 4.838610077074669e-05, + "loss": 0.248, + "num_input_tokens_seen": 10084128, + "step": 138 + }, + { + "epoch": 2.2824506749740396, + "grad_norm": 0.12075436115264893, + "learning_rate": 4.836288577988996e-05, + "loss": 0.2582, + "num_input_tokens_seen": 10155536, + "step": 139 + }, + { + "epoch": 2.2990654205607477, + "grad_norm": 0.10599923878908157, + "learning_rate": 4.8339510662430046e-05, + "loss": 0.2199, + "num_input_tokens_seen": 10251160, + "step": 140 + }, + { + "epoch": 2.3156801661474558, + "grad_norm": 0.1117846742272377, + "learning_rate": 4.8315975578577355e-05, + "loss": 0.2324, + "num_input_tokens_seen": 10345864, + "step": 141 + }, + { + "epoch": 2.3322949117341643, + "grad_norm": 0.13972057402133942, + "learning_rate": 4.8292280689638725e-05, + "loss": 0.4072, + "num_input_tokens_seen": 10417616, + "step": 142 + }, + { + "epoch": 2.3489096573208723, + "grad_norm": 0.13837860524654388, + "learning_rate": 4.826842615801628e-05, + "loss": 0.2607, + "num_input_tokens_seen": 10481816, + "step": 143 + }, + { + "epoch": 2.3655244029075804, + "grad_norm": 0.14040137827396393, + "learning_rate": 4.8244412147206284e-05, + "loss": 0.3094, + "num_input_tokens_seen": 10562056, + "step": 144 + }, + { + "epoch": 2.382139148494289, + "grad_norm": 0.1393299251794815, + "learning_rate": 4.822023882179811e-05, + "loss": 0.2407, + "num_input_tokens_seen": 10612808, + "step": 145 + }, + { + "epoch": 2.398753894080997, + "grad_norm": 0.13878698647022247, + "learning_rate": 4.8195906347473e-05, + "loss": 0.2481, + "num_input_tokens_seen": 10682328, + "step": 146 + }, + { + "epoch": 2.415368639667705, + "grad_norm": 0.10430227965116501, + "learning_rate": 4.817141489100302e-05, + "loss": 0.2528, + "num_input_tokens_seen": 10771912, + "step": 147 + }, + { + "epoch": 2.431983385254413, + "grad_norm": 0.12963703274726868, + "learning_rate": 4.814676462024988e-05, + "loss": 0.2739, + "num_input_tokens_seen": 10842232, + "step": 148 + }, + { + "epoch": 2.4485981308411215, + "grad_norm": 0.13274963200092316, + "learning_rate": 4.8121955704163745e-05, + "loss": 0.2407, + "num_input_tokens_seen": 10902264, + "step": 149 + }, + { + "epoch": 2.4652128764278296, + "grad_norm": 0.11079717427492142, + "learning_rate": 4.8096988312782174e-05, + "loss": 0.2142, + "num_input_tokens_seen": 10992744, + "step": 150 + }, + { + "epoch": 2.4818276220145377, + "grad_norm": 0.08429212868213654, + "learning_rate": 4.8071862617228855e-05, + "loss": 0.1428, + "num_input_tokens_seen": 11090064, + "step": 151 + }, + { + "epoch": 2.498442367601246, + "grad_norm": 0.12903761863708496, + "learning_rate": 4.8046578789712515e-05, + "loss": 0.2268, + "num_input_tokens_seen": 11162864, + "step": 152 + }, + { + "epoch": 2.515057113187954, + "grad_norm": 0.14638672769069672, + "learning_rate": 4.8021137003525664e-05, + "loss": 0.2388, + "num_input_tokens_seen": 11224368, + "step": 153 + }, + { + "epoch": 2.5316718587746623, + "grad_norm": 0.1372838169336319, + "learning_rate": 4.7995537433043446e-05, + "loss": 0.2588, + "num_input_tokens_seen": 11291056, + "step": 154 + }, + { + "epoch": 2.5482866043613708, + "grad_norm": 0.15665481984615326, + "learning_rate": 4.796978025372246e-05, + "loss": 0.2225, + "num_input_tokens_seen": 11345464, + "step": 155 + }, + { + "epoch": 2.564901349948079, + "grad_norm": 0.13234855234622955, + "learning_rate": 4.794386564209953e-05, + "loss": 0.275, + "num_input_tokens_seen": 11418912, + "step": 156 + }, + { + "epoch": 2.581516095534787, + "grad_norm": 0.13585953414440155, + "learning_rate": 4.79177937757905e-05, + "loss": 0.2407, + "num_input_tokens_seen": 11491216, + "step": 157 + }, + { + "epoch": 2.5981308411214954, + "grad_norm": 0.1423913538455963, + "learning_rate": 4.7891564833489035e-05, + "loss": 0.1971, + "num_input_tokens_seen": 11558016, + "step": 158 + }, + { + "epoch": 2.6147455867082035, + "grad_norm": 0.13013511896133423, + "learning_rate": 4.7865178994965344e-05, + "loss": 0.2362, + "num_input_tokens_seen": 11630432, + "step": 159 + }, + { + "epoch": 2.6313603322949115, + "grad_norm": 0.1587141752243042, + "learning_rate": 4.783863644106502e-05, + "loss": 0.2252, + "num_input_tokens_seen": 11684624, + "step": 160 + }, + { + "epoch": 2.64797507788162, + "grad_norm": 0.12592960894107819, + "learning_rate": 4.781193735370777e-05, + "loss": 0.2506, + "num_input_tokens_seen": 11770232, + "step": 161 + }, + { + "epoch": 2.664589823468328, + "grad_norm": 0.1583249419927597, + "learning_rate": 4.7785081915886134e-05, + "loss": 0.2352, + "num_input_tokens_seen": 11828360, + "step": 162 + }, + { + "epoch": 2.681204569055036, + "grad_norm": 0.14881783723831177, + "learning_rate": 4.775807031166428e-05, + "loss": 0.2308, + "num_input_tokens_seen": 11915944, + "step": 163 + }, + { + "epoch": 2.6978193146417446, + "grad_norm": 0.1607823222875595, + "learning_rate": 4.773090272617672e-05, + "loss": 0.2238, + "num_input_tokens_seen": 11981792, + "step": 164 + }, + { + "epoch": 2.7144340602284527, + "grad_norm": 0.13583113253116608, + "learning_rate": 4.7703579345627035e-05, + "loss": 0.3196, + "num_input_tokens_seen": 12044024, + "step": 165 + }, + { + "epoch": 2.7310488058151607, + "grad_norm": 0.19167298078536987, + "learning_rate": 4.7676100357286624e-05, + "loss": 0.2745, + "num_input_tokens_seen": 12093424, + "step": 166 + }, + { + "epoch": 2.7476635514018692, + "grad_norm": 0.130703404545784, + "learning_rate": 4.76484659494934e-05, + "loss": 0.2285, + "num_input_tokens_seen": 12167792, + "step": 167 + }, + { + "epoch": 2.7642782969885773, + "grad_norm": 0.14331185817718506, + "learning_rate": 4.762067631165049e-05, + "loss": 0.2506, + "num_input_tokens_seen": 12233712, + "step": 168 + }, + { + "epoch": 2.7808930425752854, + "grad_norm": 0.12700341641902924, + "learning_rate": 4.7592731634224966e-05, + "loss": 0.2052, + "num_input_tokens_seen": 12310544, + "step": 169 + }, + { + "epoch": 2.797507788161994, + "grad_norm": 0.15118420124053955, + "learning_rate": 4.756463210874652e-05, + "loss": 0.2309, + "num_input_tokens_seen": 12400160, + "step": 170 + }, + { + "epoch": 2.814122533748702, + "grad_norm": 0.14001020789146423, + "learning_rate": 4.753637792780614e-05, + "loss": 0.2544, + "num_input_tokens_seen": 12480432, + "step": 171 + }, + { + "epoch": 2.83073727933541, + "grad_norm": 0.12076311558485031, + "learning_rate": 4.7507969285054845e-05, + "loss": 0.2434, + "num_input_tokens_seen": 12568064, + "step": 172 + }, + { + "epoch": 2.8473520249221185, + "grad_norm": 0.16462342441082, + "learning_rate": 4.7479406375202264e-05, + "loss": 0.2417, + "num_input_tokens_seen": 12647400, + "step": 173 + }, + { + "epoch": 2.8639667705088265, + "grad_norm": 0.17294971644878387, + "learning_rate": 4.745068939401539e-05, + "loss": 0.2121, + "num_input_tokens_seen": 12698208, + "step": 174 + }, + { + "epoch": 2.8805815160955346, + "grad_norm": 0.16743803024291992, + "learning_rate": 4.742181853831721e-05, + "loss": 0.2238, + "num_input_tokens_seen": 12758528, + "step": 175 + }, + { + "epoch": 2.897196261682243, + "grad_norm": 0.14583320915699005, + "learning_rate": 4.7392794005985326e-05, + "loss": 0.2333, + "num_input_tokens_seen": 12837264, + "step": 176 + }, + { + "epoch": 2.913811007268951, + "grad_norm": 0.1509270817041397, + "learning_rate": 4.7363615995950626e-05, + "loss": 0.2179, + "num_input_tokens_seen": 12902368, + "step": 177 + }, + { + "epoch": 2.930425752855659, + "grad_norm": 0.12910738587379456, + "learning_rate": 4.733428470819594e-05, + "loss": 0.2144, + "num_input_tokens_seen": 12974296, + "step": 178 + }, + { + "epoch": 2.9470404984423677, + "grad_norm": 0.142000213265419, + "learning_rate": 4.730480034375462e-05, + "loss": 0.2413, + "num_input_tokens_seen": 13057280, + "step": 179 + }, + { + "epoch": 2.9636552440290758, + "grad_norm": 0.131468266248703, + "learning_rate": 4.72751631047092e-05, + "loss": 0.294, + "num_input_tokens_seen": 13158232, + "step": 180 + }, + { + "epoch": 2.980269989615784, + "grad_norm": 0.1529342085123062, + "learning_rate": 4.7245373194189994e-05, + "loss": 0.216, + "num_input_tokens_seen": 13229840, + "step": 181 + }, + { + "epoch": 2.9968847352024923, + "grad_norm": 0.1573815941810608, + "learning_rate": 4.7215430816373726e-05, + "loss": 0.2384, + "num_input_tokens_seen": 13296520, + "step": 182 + }, + { + "epoch": 3.0, + "grad_norm": 0.2532118558883667, + "learning_rate": 4.718533617648209e-05, + "loss": 0.1459, + "num_input_tokens_seen": 13309672, + "step": 183 + }, + { + "epoch": 3.016614745586708, + "grad_norm": 0.16963432729244232, + "learning_rate": 4.715508948078037e-05, + "loss": 0.1985, + "num_input_tokens_seen": 13371544, + "step": 184 + }, + { + "epoch": 3.0332294911734166, + "grad_norm": 0.18877384066581726, + "learning_rate": 4.712469093657605e-05, + "loss": 0.1856, + "num_input_tokens_seen": 13432984, + "step": 185 + }, + { + "epoch": 3.0498442367601246, + "grad_norm": 0.14922884106636047, + "learning_rate": 4.709414075221734e-05, + "loss": 0.2385, + "num_input_tokens_seen": 13500016, + "step": 186 + }, + { + "epoch": 3.0664589823468327, + "grad_norm": 0.2028326541185379, + "learning_rate": 4.706343913709178e-05, + "loss": 0.2227, + "num_input_tokens_seen": 13579672, + "step": 187 + }, + { + "epoch": 3.083073727933541, + "grad_norm": 0.19964616000652313, + "learning_rate": 4.70325863016248e-05, + "loss": 0.2045, + "num_input_tokens_seen": 13630704, + "step": 188 + }, + { + "epoch": 3.0996884735202492, + "grad_norm": 0.1594657599925995, + "learning_rate": 4.7001582457278304e-05, + "loss": 0.2648, + "num_input_tokens_seen": 13695472, + "step": 189 + }, + { + "epoch": 3.1163032191069573, + "grad_norm": 0.16952532529830933, + "learning_rate": 4.697042781654913e-05, + "loss": 0.22, + "num_input_tokens_seen": 13767792, + "step": 190 + }, + { + "epoch": 3.132917964693666, + "grad_norm": 0.16775831580162048, + "learning_rate": 4.693912259296773e-05, + "loss": 0.2667, + "num_input_tokens_seen": 13857352, + "step": 191 + }, + { + "epoch": 3.149532710280374, + "grad_norm": 0.15529580414295197, + "learning_rate": 4.690766700109659e-05, + "loss": 0.2154, + "num_input_tokens_seen": 13939928, + "step": 192 + }, + { + "epoch": 3.166147455867082, + "grad_norm": 0.1619848757982254, + "learning_rate": 4.687606125652882e-05, + "loss": 0.1963, + "num_input_tokens_seen": 14017936, + "step": 193 + }, + { + "epoch": 3.1827622014537904, + "grad_norm": 0.18066684901714325, + "learning_rate": 4.684430557588664e-05, + "loss": 0.1862, + "num_input_tokens_seen": 14074176, + "step": 194 + }, + { + "epoch": 3.1993769470404985, + "grad_norm": 0.16520777344703674, + "learning_rate": 4.681240017681993e-05, + "loss": 0.2576, + "num_input_tokens_seen": 14167656, + "step": 195 + }, + { + "epoch": 3.2159916926272065, + "grad_norm": 0.15385325253009796, + "learning_rate": 4.678034527800474e-05, + "loss": 0.1813, + "num_input_tokens_seen": 14235800, + "step": 196 + }, + { + "epoch": 3.232606438213915, + "grad_norm": 0.16897696256637573, + "learning_rate": 4.674814109914174e-05, + "loss": 0.1741, + "num_input_tokens_seen": 14301272, + "step": 197 + }, + { + "epoch": 3.249221183800623, + "grad_norm": 0.19556447863578796, + "learning_rate": 4.671578786095478e-05, + "loss": 0.2186, + "num_input_tokens_seen": 14347352, + "step": 198 + }, + { + "epoch": 3.265835929387331, + "grad_norm": 0.17333142459392548, + "learning_rate": 4.668328578518933e-05, + "loss": 0.2892, + "num_input_tokens_seen": 14434600, + "step": 199 + }, + { + "epoch": 3.2824506749740396, + "grad_norm": 0.20295488834381104, + "learning_rate": 4.665063509461097e-05, + "loss": 0.2014, + "num_input_tokens_seen": 14484104, + "step": 200 + }, + { + "epoch": 3.2990654205607477, + "grad_norm": 0.1597638726234436, + "learning_rate": 4.661783601300388e-05, + "loss": 0.2158, + "num_input_tokens_seen": 14567152, + "step": 201 + }, + { + "epoch": 3.3156801661474558, + "grad_norm": 0.19849488139152527, + "learning_rate": 4.6584888765169296e-05, + "loss": 0.2578, + "num_input_tokens_seen": 14647040, + "step": 202 + }, + { + "epoch": 3.3322949117341643, + "grad_norm": 0.1508200466632843, + "learning_rate": 4.6551793576923964e-05, + "loss": 0.2213, + "num_input_tokens_seen": 14738216, + "step": 203 + }, + { + "epoch": 3.3489096573208723, + "grad_norm": 0.1687687337398529, + "learning_rate": 4.65185506750986e-05, + "loss": 0.1828, + "num_input_tokens_seen": 14811216, + "step": 204 + }, + { + "epoch": 3.3655244029075804, + "grad_norm": 0.16587376594543457, + "learning_rate": 4.648516028753632e-05, + "loss": 0.1619, + "num_input_tokens_seen": 14885992, + "step": 205 + }, + { + "epoch": 3.382139148494289, + "grad_norm": 0.16600169241428375, + "learning_rate": 4.645162264309112e-05, + "loss": 0.2438, + "num_input_tokens_seen": 14961984, + "step": 206 + }, + { + "epoch": 3.398753894080997, + "grad_norm": 0.1877703070640564, + "learning_rate": 4.6417937971626245e-05, + "loss": 0.1771, + "num_input_tokens_seen": 15021240, + "step": 207 + }, + { + "epoch": 3.415368639667705, + "grad_norm": 0.20105206966400146, + "learning_rate": 4.638410650401267e-05, + "loss": 0.1742, + "num_input_tokens_seen": 15092016, + "step": 208 + }, + { + "epoch": 3.431983385254413, + "grad_norm": 0.12934140861034393, + "learning_rate": 4.635012847212748e-05, + "loss": 0.1725, + "num_input_tokens_seen": 15198192, + "step": 209 + }, + { + "epoch": 3.4485981308411215, + "grad_norm": 0.18388882279396057, + "learning_rate": 4.6316004108852305e-05, + "loss": 0.186, + "num_input_tokens_seen": 15258432, + "step": 210 + }, + { + "epoch": 3.4652128764278296, + "grad_norm": 0.1766858547925949, + "learning_rate": 4.628173364807171e-05, + "loss": 0.2166, + "num_input_tokens_seen": 15329600, + "step": 211 + }, + { + "epoch": 3.4818276220145377, + "grad_norm": 0.16214998066425323, + "learning_rate": 4.6247317324671605e-05, + "loss": 0.2038, + "num_input_tokens_seen": 15407920, + "step": 212 + }, + { + "epoch": 3.498442367601246, + "grad_norm": 0.16933797299861908, + "learning_rate": 4.6212755374537596e-05, + "loss": 0.2017, + "num_input_tokens_seen": 15479640, + "step": 213 + }, + { + "epoch": 3.515057113187954, + "grad_norm": 0.19472749531269073, + "learning_rate": 4.617804803455344e-05, + "loss": 0.2048, + "num_input_tokens_seen": 15561960, + "step": 214 + }, + { + "epoch": 3.5316718587746623, + "grad_norm": 0.33335182070732117, + "learning_rate": 4.614319554259934e-05, + "loss": 0.2358, + "num_input_tokens_seen": 15641440, + "step": 215 + }, + { + "epoch": 3.5482866043613708, + "grad_norm": 0.19587557017803192, + "learning_rate": 4.610819813755038e-05, + "loss": 0.2374, + "num_input_tokens_seen": 15728872, + "step": 216 + }, + { + "epoch": 3.564901349948079, + "grad_norm": 0.19063518941402435, + "learning_rate": 4.607305605927487e-05, + "loss": 0.1919, + "num_input_tokens_seen": 15798112, + "step": 217 + }, + { + "epoch": 3.581516095534787, + "grad_norm": 0.19598323106765747, + "learning_rate": 4.6037769548632656e-05, + "loss": 0.2583, + "num_input_tokens_seen": 15865936, + "step": 218 + }, + { + "epoch": 3.5981308411214954, + "grad_norm": 0.18066690862178802, + "learning_rate": 4.600233884747355e-05, + "loss": 0.2337, + "num_input_tokens_seen": 15941368, + "step": 219 + }, + { + "epoch": 3.6147455867082035, + "grad_norm": 0.16981899738311768, + "learning_rate": 4.5966764198635606e-05, + "loss": 0.1818, + "num_input_tokens_seen": 16028208, + "step": 220 + }, + { + "epoch": 3.6313603322949115, + "grad_norm": 0.180410236120224, + "learning_rate": 4.5931045845943474e-05, + "loss": 0.1646, + "num_input_tokens_seen": 16104408, + "step": 221 + }, + { + "epoch": 3.64797507788162, + "grad_norm": 0.19180680811405182, + "learning_rate": 4.5895184034206765e-05, + "loss": 0.3263, + "num_input_tokens_seen": 16156800, + "step": 222 + }, + { + "epoch": 3.664589823468328, + "grad_norm": 0.16119280457496643, + "learning_rate": 4.585917900921829e-05, + "loss": 0.2636, + "num_input_tokens_seen": 16256712, + "step": 223 + }, + { + "epoch": 3.681204569055036, + "grad_norm": 0.18559172749519348, + "learning_rate": 4.5823031017752485e-05, + "loss": 0.1759, + "num_input_tokens_seen": 16330344, + "step": 224 + }, + { + "epoch": 3.6978193146417446, + "grad_norm": 0.17767880856990814, + "learning_rate": 4.5786740307563636e-05, + "loss": 0.196, + "num_input_tokens_seen": 16399792, + "step": 225 + }, + { + "epoch": 3.7144340602284527, + "grad_norm": 0.17806987464427948, + "learning_rate": 4.575030712738419e-05, + "loss": 0.186, + "num_input_tokens_seen": 16466368, + "step": 226 + }, + { + "epoch": 3.7310488058151607, + "grad_norm": 0.1952792853116989, + "learning_rate": 4.571373172692309e-05, + "loss": 0.1789, + "num_input_tokens_seen": 16530976, + "step": 227 + }, + { + "epoch": 3.7476635514018692, + "grad_norm": 0.1774374544620514, + "learning_rate": 4.567701435686404e-05, + "loss": 0.1929, + "num_input_tokens_seen": 16600216, + "step": 228 + }, + { + "epoch": 3.7642782969885773, + "grad_norm": 0.18798600137233734, + "learning_rate": 4.5640155268863796e-05, + "loss": 0.2268, + "num_input_tokens_seen": 16673192, + "step": 229 + }, + { + "epoch": 3.7808930425752854, + "grad_norm": 0.2022520750761032, + "learning_rate": 4.5603154715550386e-05, + "loss": 0.1716, + "num_input_tokens_seen": 16739912, + "step": 230 + }, + { + "epoch": 3.797507788161994, + "grad_norm": 0.15170948207378387, + "learning_rate": 4.55660129505215e-05, + "loss": 0.1844, + "num_input_tokens_seen": 16834632, + "step": 231 + }, + { + "epoch": 3.814122533748702, + "grad_norm": 0.16655084490776062, + "learning_rate": 4.5528730228342605e-05, + "loss": 0.1899, + "num_input_tokens_seen": 16914728, + "step": 232 + }, + { + "epoch": 3.83073727933541, + "grad_norm": 0.19025221467018127, + "learning_rate": 4.549130680454532e-05, + "loss": 0.2214, + "num_input_tokens_seen": 17014304, + "step": 233 + }, + { + "epoch": 3.8473520249221185, + "grad_norm": 0.17126557230949402, + "learning_rate": 4.545374293562559e-05, + "loss": 0.2062, + "num_input_tokens_seen": 17106664, + "step": 234 + }, + { + "epoch": 3.8639667705088265, + "grad_norm": 0.16162410378456116, + "learning_rate": 4.541603887904198e-05, + "loss": 0.2016, + "num_input_tokens_seen": 17193744, + "step": 235 + }, + { + "epoch": 3.8805815160955346, + "grad_norm": 0.2067136913537979, + "learning_rate": 4.537819489321386e-05, + "loss": 0.1992, + "num_input_tokens_seen": 17254656, + "step": 236 + }, + { + "epoch": 3.897196261682243, + "grad_norm": 0.200433611869812, + "learning_rate": 4.534021123751968e-05, + "loss": 0.1961, + "num_input_tokens_seen": 17325896, + "step": 237 + }, + { + "epoch": 3.913811007268951, + "grad_norm": 0.2062034010887146, + "learning_rate": 4.5302088172295156e-05, + "loss": 0.2302, + "num_input_tokens_seen": 17394424, + "step": 238 + }, + { + "epoch": 3.930425752855659, + "grad_norm": 0.1928798407316208, + "learning_rate": 4.526382595883152e-05, + "loss": 0.1846, + "num_input_tokens_seen": 17456352, + "step": 239 + }, + { + "epoch": 3.9470404984423677, + "grad_norm": 0.2011859118938446, + "learning_rate": 4.522542485937369e-05, + "loss": 0.1879, + "num_input_tokens_seen": 17519168, + "step": 240 + }, + { + "epoch": 3.9636552440290758, + "grad_norm": 0.20441657304763794, + "learning_rate": 4.51868851371185e-05, + "loss": 0.206, + "num_input_tokens_seen": 17585144, + "step": 241 + }, + { + "epoch": 3.980269989615784, + "grad_norm": 0.18314018845558167, + "learning_rate": 4.5148207056212896e-05, + "loss": 0.1676, + "num_input_tokens_seen": 17662024, + "step": 242 + }, + { + "epoch": 3.9968847352024923, + "grad_norm": 0.21530692279338837, + "learning_rate": 4.5109390881752114e-05, + "loss": 0.1961, + "num_input_tokens_seen": 17724360, + "step": 243 + }, + { + "epoch": 4.0, + "grad_norm": 0.38912200927734375, + "learning_rate": 4.5070436879777865e-05, + "loss": 0.185, + "num_input_tokens_seen": 17746200, + "step": 244 + }, + { + "epoch": 4.0166147455867085, + "grad_norm": 0.15166164934635162, + "learning_rate": 4.503134531727652e-05, + "loss": 0.1674, + "num_input_tokens_seen": 17830760, + "step": 245 + }, + { + "epoch": 4.033229491173416, + "grad_norm": 0.1999833583831787, + "learning_rate": 4.499211646217727e-05, + "loss": 0.1739, + "num_input_tokens_seen": 17903840, + "step": 246 + }, + { + "epoch": 4.049844236760125, + "grad_norm": 0.2024000585079193, + "learning_rate": 4.495275058335029e-05, + "loss": 0.1753, + "num_input_tokens_seen": 17990448, + "step": 247 + }, + { + "epoch": 4.066458982346833, + "grad_norm": 0.22637376189231873, + "learning_rate": 4.491324795060491e-05, + "loss": 0.1896, + "num_input_tokens_seen": 18069520, + "step": 248 + }, + { + "epoch": 4.083073727933541, + "grad_norm": 0.24361123144626617, + "learning_rate": 4.487360883468775e-05, + "loss": 0.1688, + "num_input_tokens_seen": 18129128, + "step": 249 + }, + { + "epoch": 4.099688473520249, + "grad_norm": 0.21949416399002075, + "learning_rate": 4.4833833507280884e-05, + "loss": 0.1928, + "num_input_tokens_seen": 18202472, + "step": 250 + }, + { + "epoch": 4.116303219106958, + "grad_norm": 0.22039519250392914, + "learning_rate": 4.4793922240999933e-05, + "loss": 0.1737, + "num_input_tokens_seen": 18267232, + "step": 251 + }, + { + "epoch": 4.132917964693665, + "grad_norm": 0.23173294961452484, + "learning_rate": 4.4753875309392266e-05, + "loss": 0.1883, + "num_input_tokens_seen": 18325216, + "step": 252 + }, + { + "epoch": 4.149532710280374, + "grad_norm": 0.24100351333618164, + "learning_rate": 4.471369298693505e-05, + "loss": 0.2042, + "num_input_tokens_seen": 18406184, + "step": 253 + }, + { + "epoch": 4.166147455867082, + "grad_norm": 0.1888919323682785, + "learning_rate": 4.467337554903344e-05, + "loss": 0.1656, + "num_input_tokens_seen": 18481056, + "step": 254 + }, + { + "epoch": 4.18276220145379, + "grad_norm": 0.17849119007587433, + "learning_rate": 4.463292327201862e-05, + "loss": 0.1454, + "num_input_tokens_seen": 18554864, + "step": 255 + }, + { + "epoch": 4.1993769470404985, + "grad_norm": 0.24600732326507568, + "learning_rate": 4.4592336433146e-05, + "loss": 0.2039, + "num_input_tokens_seen": 18612120, + "step": 256 + }, + { + "epoch": 4.215991692627207, + "grad_norm": 0.23695628345012665, + "learning_rate": 4.4551615310593195e-05, + "loss": 0.2112, + "num_input_tokens_seen": 18710408, + "step": 257 + }, + { + "epoch": 4.232606438213915, + "grad_norm": 0.2511826753616333, + "learning_rate": 4.451076018345825e-05, + "loss": 0.1831, + "num_input_tokens_seen": 18769400, + "step": 258 + }, + { + "epoch": 4.249221183800623, + "grad_norm": 0.1971820890903473, + "learning_rate": 4.4469771331757604e-05, + "loss": 0.1722, + "num_input_tokens_seen": 18849704, + "step": 259 + }, + { + "epoch": 4.265835929387332, + "grad_norm": 0.23203876614570618, + "learning_rate": 4.442864903642428e-05, + "loss": 0.1981, + "num_input_tokens_seen": 18943328, + "step": 260 + }, + { + "epoch": 4.282450674974039, + "grad_norm": 0.23434185981750488, + "learning_rate": 4.4387393579305865e-05, + "loss": 0.2014, + "num_input_tokens_seen": 19022536, + "step": 261 + }, + { + "epoch": 4.299065420560748, + "grad_norm": 0.2373885214328766, + "learning_rate": 4.434600524316266e-05, + "loss": 0.167, + "num_input_tokens_seen": 19089200, + "step": 262 + }, + { + "epoch": 4.315680166147456, + "grad_norm": 0.20130722224712372, + "learning_rate": 4.430448431166567e-05, + "loss": 0.2747, + "num_input_tokens_seen": 19171216, + "step": 263 + }, + { + "epoch": 4.332294911734164, + "grad_norm": 0.17947593331336975, + "learning_rate": 4.426283106939474e-05, + "loss": 0.1508, + "num_input_tokens_seen": 19271872, + "step": 264 + }, + { + "epoch": 4.348909657320872, + "grad_norm": 0.23321041464805603, + "learning_rate": 4.4221045801836494e-05, + "loss": 0.2585, + "num_input_tokens_seen": 19342984, + "step": 265 + }, + { + "epoch": 4.365524402907581, + "grad_norm": 0.26581740379333496, + "learning_rate": 4.41791287953825e-05, + "loss": 0.1789, + "num_input_tokens_seen": 19391640, + "step": 266 + }, + { + "epoch": 4.382139148494288, + "grad_norm": 0.20715415477752686, + "learning_rate": 4.4137080337327205e-05, + "loss": 0.1953, + "num_input_tokens_seen": 19463232, + "step": 267 + }, + { + "epoch": 4.398753894080997, + "grad_norm": 0.20641866326332092, + "learning_rate": 4.4094900715866064e-05, + "loss": 0.1752, + "num_input_tokens_seen": 19523728, + "step": 268 + }, + { + "epoch": 4.415368639667705, + "grad_norm": 0.23343385756015778, + "learning_rate": 4.4052590220093446e-05, + "loss": 0.1904, + "num_input_tokens_seen": 19598960, + "step": 269 + }, + { + "epoch": 4.431983385254413, + "grad_norm": 0.20117436349391937, + "learning_rate": 4.401014914000078e-05, + "loss": 0.1801, + "num_input_tokens_seen": 19666136, + "step": 270 + }, + { + "epoch": 4.4485981308411215, + "grad_norm": 0.24009813368320465, + "learning_rate": 4.3967577766474455e-05, + "loss": 0.1798, + "num_input_tokens_seen": 19728600, + "step": 271 + }, + { + "epoch": 4.46521287642783, + "grad_norm": 0.2242031991481781, + "learning_rate": 4.3924876391293915e-05, + "loss": 0.2221, + "num_input_tokens_seen": 19801032, + "step": 272 + }, + { + "epoch": 4.481827622014538, + "grad_norm": 0.22890391945838928, + "learning_rate": 4.3882045307129594e-05, + "loss": 0.1906, + "num_input_tokens_seen": 19885496, + "step": 273 + }, + { + "epoch": 4.498442367601246, + "grad_norm": 0.21996937692165375, + "learning_rate": 4.383908480754095e-05, + "loss": 0.1775, + "num_input_tokens_seen": 19952072, + "step": 274 + }, + { + "epoch": 4.515057113187955, + "grad_norm": 0.1860388070344925, + "learning_rate": 4.379599518697444e-05, + "loss": 0.1593, + "num_input_tokens_seen": 20026536, + "step": 275 + }, + { + "epoch": 4.531671858774662, + "grad_norm": 0.20987707376480103, + "learning_rate": 4.375277674076149e-05, + "loss": 0.1409, + "num_input_tokens_seen": 20079112, + "step": 276 + }, + { + "epoch": 4.548286604361371, + "grad_norm": 0.21347324550151825, + "learning_rate": 4.3709429765116504e-05, + "loss": 0.2701, + "num_input_tokens_seen": 20144264, + "step": 277 + }, + { + "epoch": 4.564901349948079, + "grad_norm": 0.27563896775245667, + "learning_rate": 4.366595455713479e-05, + "loss": 0.1856, + "num_input_tokens_seen": 20207568, + "step": 278 + }, + { + "epoch": 4.581516095534787, + "grad_norm": 0.21850791573524475, + "learning_rate": 4.3622351414790554e-05, + "loss": 0.2204, + "num_input_tokens_seen": 20292376, + "step": 279 + }, + { + "epoch": 4.598130841121495, + "grad_norm": 0.19672711193561554, + "learning_rate": 4.357862063693486e-05, + "loss": 0.1397, + "num_input_tokens_seen": 20383048, + "step": 280 + }, + { + "epoch": 4.614745586708204, + "grad_norm": 0.23507343232631683, + "learning_rate": 4.353476252329356e-05, + "loss": 0.1655, + "num_input_tokens_seen": 20463376, + "step": 281 + }, + { + "epoch": 4.6313603322949115, + "grad_norm": 0.23900464177131653, + "learning_rate": 4.349077737446525e-05, + "loss": 0.1511, + "num_input_tokens_seen": 20537808, + "step": 282 + }, + { + "epoch": 4.64797507788162, + "grad_norm": 0.2231033891439438, + "learning_rate": 4.344666549191921e-05, + "loss": 0.1783, + "num_input_tokens_seen": 20605496, + "step": 283 + }, + { + "epoch": 4.6645898234683285, + "grad_norm": 0.20772908627986908, + "learning_rate": 4.3402427177993366e-05, + "loss": 0.2135, + "num_input_tokens_seen": 20692096, + "step": 284 + }, + { + "epoch": 4.681204569055036, + "grad_norm": 0.24927115440368652, + "learning_rate": 4.335806273589214e-05, + "loss": 0.1919, + "num_input_tokens_seen": 20762800, + "step": 285 + }, + { + "epoch": 4.697819314641745, + "grad_norm": 0.2112305462360382, + "learning_rate": 4.3313572469684474e-05, + "loss": 0.1546, + "num_input_tokens_seen": 20831584, + "step": 286 + }, + { + "epoch": 4.714434060228453, + "grad_norm": 0.22639551758766174, + "learning_rate": 4.326895668430166e-05, + "loss": 0.124, + "num_input_tokens_seen": 20897320, + "step": 287 + }, + { + "epoch": 4.731048805815161, + "grad_norm": 0.2295934110879898, + "learning_rate": 4.3224215685535294e-05, + "loss": 0.1639, + "num_input_tokens_seen": 20966136, + "step": 288 + }, + { + "epoch": 4.747663551401869, + "grad_norm": 0.2341577112674713, + "learning_rate": 4.317934978003517e-05, + "loss": 0.1584, + "num_input_tokens_seen": 21034800, + "step": 289 + }, + { + "epoch": 4.764278296988578, + "grad_norm": 0.2542404234409332, + "learning_rate": 4.313435927530719e-05, + "loss": 0.1918, + "num_input_tokens_seen": 21098672, + "step": 290 + }, + { + "epoch": 4.780893042575285, + "grad_norm": 0.23311223089694977, + "learning_rate": 4.3089244479711236e-05, + "loss": 0.1597, + "num_input_tokens_seen": 21177632, + "step": 291 + }, + { + "epoch": 4.797507788161994, + "grad_norm": 0.2642923593521118, + "learning_rate": 4.304400570245906e-05, + "loss": 0.1847, + "num_input_tokens_seen": 21240896, + "step": 292 + }, + { + "epoch": 4.814122533748702, + "grad_norm": 0.18841278553009033, + "learning_rate": 4.299864325361217e-05, + "loss": 0.1472, + "num_input_tokens_seen": 21322984, + "step": 293 + }, + { + "epoch": 4.83073727933541, + "grad_norm": 0.22440434992313385, + "learning_rate": 4.295315744407972e-05, + "loss": 0.1607, + "num_input_tokens_seen": 21389128, + "step": 294 + }, + { + "epoch": 4.8473520249221185, + "grad_norm": 0.22145289182662964, + "learning_rate": 4.290754858561637e-05, + "loss": 0.1851, + "num_input_tokens_seen": 21469912, + "step": 295 + }, + { + "epoch": 4.863966770508826, + "grad_norm": 0.22817087173461914, + "learning_rate": 4.2861816990820084e-05, + "loss": 0.1531, + "num_input_tokens_seen": 21540320, + "step": 296 + }, + { + "epoch": 4.880581516095535, + "grad_norm": 0.22014038264751434, + "learning_rate": 4.281596297313013e-05, + "loss": 0.1815, + "num_input_tokens_seen": 21626312, + "step": 297 + }, + { + "epoch": 4.897196261682243, + "grad_norm": 0.2234148383140564, + "learning_rate": 4.2769986846824815e-05, + "loss": 0.1667, + "num_input_tokens_seen": 21702792, + "step": 298 + }, + { + "epoch": 4.913811007268951, + "grad_norm": 0.2851375341415405, + "learning_rate": 4.272388892701934e-05, + "loss": 0.1805, + "num_input_tokens_seen": 21771880, + "step": 299 + }, + { + "epoch": 4.930425752855659, + "grad_norm": 0.2221265286207199, + "learning_rate": 4.267766952966369e-05, + "loss": 0.1653, + "num_input_tokens_seen": 21844024, + "step": 300 + }, + { + "epoch": 4.947040498442368, + "grad_norm": 0.20688939094543457, + "learning_rate": 4.2631328971540444e-05, + "loss": 0.1654, + "num_input_tokens_seen": 21925632, + "step": 301 + }, + { + "epoch": 4.963655244029075, + "grad_norm": 0.2270977795124054, + "learning_rate": 4.2584867570262597e-05, + "loss": 0.1774, + "num_input_tokens_seen": 21981952, + "step": 302 + }, + { + "epoch": 4.980269989615784, + "grad_norm": 0.18652501702308655, + "learning_rate": 4.25382856442714e-05, + "loss": 0.1452, + "num_input_tokens_seen": 22070440, + "step": 303 + }, + { + "epoch": 4.996884735202492, + "grad_norm": 0.19792407751083374, + "learning_rate": 4.249158351283414e-05, + "loss": 0.1806, + "num_input_tokens_seen": 22170184, + "step": 304 + }, + { + "epoch": 5.0, + "grad_norm": 0.5933757424354553, + "learning_rate": 4.244476149604201e-05, + "loss": 0.2115, + "num_input_tokens_seen": 22181856, + "step": 305 + }, + { + "epoch": 5.0166147455867085, + "grad_norm": 0.22399979829788208, + "learning_rate": 4.2397819914807856e-05, + "loss": 0.1614, + "num_input_tokens_seen": 22256808, + "step": 306 + }, + { + "epoch": 5.033229491173416, + "grad_norm": 0.2595834732055664, + "learning_rate": 4.2350759090864046e-05, + "loss": 0.1838, + "num_input_tokens_seen": 22325224, + "step": 307 + }, + { + "epoch": 5.049844236760125, + "grad_norm": 0.188430517911911, + "learning_rate": 4.230357934676017e-05, + "loss": 0.1421, + "num_input_tokens_seen": 22389624, + "step": 308 + }, + { + "epoch": 5.066458982346833, + "grad_norm": 0.325431764125824, + "learning_rate": 4.225628100586093e-05, + "loss": 0.1852, + "num_input_tokens_seen": 22463872, + "step": 309 + }, + { + "epoch": 5.083073727933541, + "grad_norm": 0.27097174525260925, + "learning_rate": 4.220886439234385e-05, + "loss": 0.1526, + "num_input_tokens_seen": 22515824, + "step": 310 + }, + { + "epoch": 5.099688473520249, + "grad_norm": 0.21379193663597107, + "learning_rate": 4.2161329831197095e-05, + "loss": 0.151, + "num_input_tokens_seen": 22602336, + "step": 311 + }, + { + "epoch": 5.116303219106958, + "grad_norm": 0.24827998876571655, + "learning_rate": 4.211367764821722e-05, + "loss": 0.142, + "num_input_tokens_seen": 22655176, + "step": 312 + }, + { + "epoch": 5.132917964693665, + "grad_norm": 0.26246964931488037, + "learning_rate": 4.2065908170006955e-05, + "loss": 0.1589, + "num_input_tokens_seen": 22728680, + "step": 313 + }, + { + "epoch": 5.149532710280374, + "grad_norm": 0.24459198117256165, + "learning_rate": 4.201802172397295e-05, + "loss": 0.1435, + "num_input_tokens_seen": 22806784, + "step": 314 + }, + { + "epoch": 5.166147455867082, + "grad_norm": 0.26540517807006836, + "learning_rate": 4.197001863832355e-05, + "loss": 0.1447, + "num_input_tokens_seen": 22880648, + "step": 315 + }, + { + "epoch": 5.18276220145379, + "grad_norm": 0.25646644830703735, + "learning_rate": 4.192189924206652e-05, + "loss": 0.1418, + "num_input_tokens_seen": 22953184, + "step": 316 + }, + { + "epoch": 5.1993769470404985, + "grad_norm": 0.2358384132385254, + "learning_rate": 4.187366386500683e-05, + "loss": 0.1845, + "num_input_tokens_seen": 23037392, + "step": 317 + }, + { + "epoch": 5.215991692627207, + "grad_norm": 0.2270258218050003, + "learning_rate": 4.182531283774434e-05, + "loss": 0.2668, + "num_input_tokens_seen": 23086552, + "step": 318 + }, + { + "epoch": 5.232606438213915, + "grad_norm": 0.24396558105945587, + "learning_rate": 4.177684649167158e-05, + "loss": 0.1567, + "num_input_tokens_seen": 23153152, + "step": 319 + }, + { + "epoch": 5.249221183800623, + "grad_norm": 0.2542375922203064, + "learning_rate": 4.172826515897146e-05, + "loss": 0.1617, + "num_input_tokens_seen": 23240928, + "step": 320 + }, + { + "epoch": 5.265835929387332, + "grad_norm": 0.2268146276473999, + "learning_rate": 4.1679569172614996e-05, + "loss": 0.1573, + "num_input_tokens_seen": 23325912, + "step": 321 + }, + { + "epoch": 5.282450674974039, + "grad_norm": 0.26405712962150574, + "learning_rate": 4.163075886635902e-05, + "loss": 0.1738, + "num_input_tokens_seen": 23401952, + "step": 322 + }, + { + "epoch": 5.299065420560748, + "grad_norm": 0.2599943280220032, + "learning_rate": 4.1581834574743915e-05, + "loss": 0.1433, + "num_input_tokens_seen": 23463760, + "step": 323 + }, + { + "epoch": 5.315680166147456, + "grad_norm": 0.24650180339813232, + "learning_rate": 4.1532796633091296e-05, + "loss": 0.1511, + "num_input_tokens_seen": 23535272, + "step": 324 + }, + { + "epoch": 5.332294911734164, + "grad_norm": 0.22445832192897797, + "learning_rate": 4.148364537750172e-05, + "loss": 0.1296, + "num_input_tokens_seen": 23607752, + "step": 325 + }, + { + "epoch": 5.348909657320872, + "grad_norm": 0.20715995132923126, + "learning_rate": 4.14343811448524e-05, + "loss": 0.142, + "num_input_tokens_seen": 23674872, + "step": 326 + }, + { + "epoch": 5.365524402907581, + "grad_norm": 0.2707969546318054, + "learning_rate": 4.138500427279485e-05, + "loss": 0.1679, + "num_input_tokens_seen": 23736384, + "step": 327 + }, + { + "epoch": 5.382139148494288, + "grad_norm": 0.2017841637134552, + "learning_rate": 4.133551509975264e-05, + "loss": 0.1346, + "num_input_tokens_seen": 23835000, + "step": 328 + }, + { + "epoch": 5.398753894080997, + "grad_norm": 0.21116195619106293, + "learning_rate": 4.128591396491901e-05, + "loss": 0.1364, + "num_input_tokens_seen": 23912552, + "step": 329 + }, + { + "epoch": 5.415368639667705, + "grad_norm": 0.2331131547689438, + "learning_rate": 4.123620120825459e-05, + "loss": 0.1719, + "num_input_tokens_seen": 23987368, + "step": 330 + }, + { + "epoch": 5.431983385254413, + "grad_norm": 0.27115845680236816, + "learning_rate": 4.118637717048506e-05, + "loss": 0.1468, + "num_input_tokens_seen": 24050848, + "step": 331 + }, + { + "epoch": 5.4485981308411215, + "grad_norm": 0.21654783189296722, + "learning_rate": 4.113644219309877e-05, + "loss": 0.1418, + "num_input_tokens_seen": 24146104, + "step": 332 + }, + { + "epoch": 5.46521287642783, + "grad_norm": 0.2783348560333252, + "learning_rate": 4.1086396618344476e-05, + "loss": 0.1502, + "num_input_tokens_seen": 24194184, + "step": 333 + }, + { + "epoch": 5.481827622014538, + "grad_norm": 0.23255467414855957, + "learning_rate": 4.1036240789228954e-05, + "loss": 0.1571, + "num_input_tokens_seen": 24275368, + "step": 334 + }, + { + "epoch": 5.498442367601246, + "grad_norm": 0.2655453681945801, + "learning_rate": 4.098597504951462e-05, + "loss": 0.1607, + "num_input_tokens_seen": 24329192, + "step": 335 + }, + { + "epoch": 5.515057113187955, + "grad_norm": 0.23245719075202942, + "learning_rate": 4.093559974371725e-05, + "loss": 0.1453, + "num_input_tokens_seen": 24426696, + "step": 336 + }, + { + "epoch": 5.531671858774662, + "grad_norm": 0.20036327838897705, + "learning_rate": 4.088511521710352e-05, + "loss": 0.1411, + "num_input_tokens_seen": 24514344, + "step": 337 + }, + { + "epoch": 5.548286604361371, + "grad_norm": 0.32294195890426636, + "learning_rate": 4.083452181568875e-05, + "loss": 0.1467, + "num_input_tokens_seen": 24584464, + "step": 338 + }, + { + "epoch": 5.564901349948079, + "grad_norm": 0.23969624936580658, + "learning_rate": 4.0783819886234445e-05, + "loss": 0.1304, + "num_input_tokens_seen": 24660600, + "step": 339 + }, + { + "epoch": 5.581516095534787, + "grad_norm": 0.2539989948272705, + "learning_rate": 4.073300977624594e-05, + "loss": 0.1374, + "num_input_tokens_seen": 24717088, + "step": 340 + }, + { + "epoch": 5.598130841121495, + "grad_norm": 0.26608580350875854, + "learning_rate": 4.068209183397004e-05, + "loss": 0.1519, + "num_input_tokens_seen": 24775352, + "step": 341 + }, + { + "epoch": 5.614745586708204, + "grad_norm": 0.2161550372838974, + "learning_rate": 4.063106640839264e-05, + "loss": 0.1409, + "num_input_tokens_seen": 24860072, + "step": 342 + }, + { + "epoch": 5.6313603322949115, + "grad_norm": 0.22482600808143616, + "learning_rate": 4.057993384923626e-05, + "loss": 0.1393, + "num_input_tokens_seen": 24947856, + "step": 343 + }, + { + "epoch": 5.64797507788162, + "grad_norm": 0.2367829829454422, + "learning_rate": 4.052869450695776e-05, + "loss": 0.1539, + "num_input_tokens_seen": 25024992, + "step": 344 + }, + { + "epoch": 5.6645898234683285, + "grad_norm": 0.29229775071144104, + "learning_rate": 4.047734873274586e-05, + "loss": 0.1522, + "num_input_tokens_seen": 25092248, + "step": 345 + }, + { + "epoch": 5.681204569055036, + "grad_norm": 0.2589828670024872, + "learning_rate": 4.042589687851872e-05, + "loss": 0.1493, + "num_input_tokens_seen": 25170496, + "step": 346 + }, + { + "epoch": 5.697819314641745, + "grad_norm": 0.23003339767456055, + "learning_rate": 4.037433929692161e-05, + "loss": 0.1529, + "num_input_tokens_seen": 25268720, + "step": 347 + }, + { + "epoch": 5.714434060228453, + "grad_norm": 0.26932114362716675, + "learning_rate": 4.0322676341324415e-05, + "loss": 0.1499, + "num_input_tokens_seen": 25332688, + "step": 348 + }, + { + "epoch": 5.731048805815161, + "grad_norm": 0.27059391140937805, + "learning_rate": 4.027090836581925e-05, + "loss": 0.173, + "num_input_tokens_seen": 25413904, + "step": 349 + }, + { + "epoch": 5.747663551401869, + "grad_norm": 0.24265804886817932, + "learning_rate": 4.021903572521802e-05, + "loss": 0.1531, + "num_input_tokens_seen": 25503720, + "step": 350 + }, + { + "epoch": 5.764278296988578, + "grad_norm": 0.28688696026802063, + "learning_rate": 4.0167058775049996e-05, + "loss": 0.1615, + "num_input_tokens_seen": 25568560, + "step": 351 + }, + { + "epoch": 5.780893042575285, + "grad_norm": 0.26201075315475464, + "learning_rate": 4.011497787155938e-05, + "loss": 0.1452, + "num_input_tokens_seen": 25635184, + "step": 352 + }, + { + "epoch": 5.797507788161994, + "grad_norm": 0.22841767966747284, + "learning_rate": 4.006279337170283e-05, + "loss": 0.148, + "num_input_tokens_seen": 25719768, + "step": 353 + }, + { + "epoch": 5.814122533748702, + "grad_norm": 0.28246188163757324, + "learning_rate": 4.0010505633147106e-05, + "loss": 0.1446, + "num_input_tokens_seen": 25795016, + "step": 354 + }, + { + "epoch": 5.83073727933541, + "grad_norm": 0.2533949315547943, + "learning_rate": 3.995811501426648e-05, + "loss": 0.1297, + "num_input_tokens_seen": 25863184, + "step": 355 + }, + { + "epoch": 5.8473520249221185, + "grad_norm": 0.25117793679237366, + "learning_rate": 3.99056218741404e-05, + "loss": 0.1479, + "num_input_tokens_seen": 25935752, + "step": 356 + }, + { + "epoch": 5.863966770508826, + "grad_norm": 0.2759612202644348, + "learning_rate": 3.985302657255097e-05, + "loss": 0.1466, + "num_input_tokens_seen": 25995760, + "step": 357 + }, + { + "epoch": 5.880581516095535, + "grad_norm": 0.24218028783798218, + "learning_rate": 3.980032946998049e-05, + "loss": 0.1297, + "num_input_tokens_seen": 26061240, + "step": 358 + }, + { + "epoch": 5.897196261682243, + "grad_norm": 0.2628185749053955, + "learning_rate": 3.974753092760901e-05, + "loss": 0.1421, + "num_input_tokens_seen": 26131024, + "step": 359 + }, + { + "epoch": 5.913811007268951, + "grad_norm": 0.18742328882217407, + "learning_rate": 3.969463130731183e-05, + "loss": 0.1737, + "num_input_tokens_seen": 26233672, + "step": 360 + }, + { + "epoch": 5.930425752855659, + "grad_norm": 0.22187161445617676, + "learning_rate": 3.964163097165702e-05, + "loss": 0.1153, + "num_input_tokens_seen": 26303488, + "step": 361 + }, + { + "epoch": 5.947040498442368, + "grad_norm": 0.25965312123298645, + "learning_rate": 3.958853028390294e-05, + "loss": 0.1347, + "num_input_tokens_seen": 26377768, + "step": 362 + }, + { + "epoch": 5.963655244029075, + "grad_norm": 0.2957039475440979, + "learning_rate": 3.953532960799577e-05, + "loss": 0.2736, + "num_input_tokens_seen": 26435984, + "step": 363 + }, + { + "epoch": 5.980269989615784, + "grad_norm": 0.3063158392906189, + "learning_rate": 3.948202930856697e-05, + "loss": 0.1563, + "num_input_tokens_seen": 26513960, + "step": 364 + }, + { + "epoch": 5.996884735202492, + "grad_norm": 0.21576139330863953, + "learning_rate": 3.942862975093085e-05, + "loss": 0.1361, + "num_input_tokens_seen": 26599064, + "step": 365 + }, + { + "epoch": 6.0, + "grad_norm": 0.49579551815986633, + "learning_rate": 3.937513130108197e-05, + "loss": 0.1849, + "num_input_tokens_seen": 26617264, + "step": 366 + }, + { + "epoch": 6.0166147455867085, + "grad_norm": 0.25051239132881165, + "learning_rate": 3.9321534325692726e-05, + "loss": 0.1452, + "num_input_tokens_seen": 26717024, + "step": 367 + }, + { + "epoch": 6.033229491173416, + "grad_norm": 0.2842366099357605, + "learning_rate": 3.92678391921108e-05, + "loss": 0.1361, + "num_input_tokens_seen": 26788120, + "step": 368 + }, + { + "epoch": 6.049844236760125, + "grad_norm": 0.32458341121673584, + "learning_rate": 3.92140462683566e-05, + "loss": 0.1537, + "num_input_tokens_seen": 26853760, + "step": 369 + }, + { + "epoch": 6.066458982346833, + "grad_norm": 0.27452871203422546, + "learning_rate": 3.916015592312082e-05, + "loss": 0.1354, + "num_input_tokens_seen": 26923848, + "step": 370 + }, + { + "epoch": 6.083073727933541, + "grad_norm": 0.2542634904384613, + "learning_rate": 3.9106168525761855e-05, + "loss": 0.1233, + "num_input_tokens_seen": 26976184, + "step": 371 + }, + { + "epoch": 6.099688473520249, + "grad_norm": 0.2786627411842346, + "learning_rate": 3.905208444630327e-05, + "loss": 0.125, + "num_input_tokens_seen": 27065712, + "step": 372 + }, + { + "epoch": 6.116303219106958, + "grad_norm": 0.19119420647621155, + "learning_rate": 3.899790405543129e-05, + "loss": 0.1475, + "num_input_tokens_seen": 27145472, + "step": 373 + }, + { + "epoch": 6.132917964693665, + "grad_norm": 0.25163477659225464, + "learning_rate": 3.894362772449226e-05, + "loss": 0.1235, + "num_input_tokens_seen": 27233904, + "step": 374 + }, + { + "epoch": 6.149532710280374, + "grad_norm": 0.2418321669101715, + "learning_rate": 3.888925582549006e-05, + "loss": 0.1117, + "num_input_tokens_seen": 27311512, + "step": 375 + }, + { + "epoch": 6.166147455867082, + "grad_norm": 0.26851242780685425, + "learning_rate": 3.883478873108361e-05, + "loss": 0.1226, + "num_input_tokens_seen": 27387400, + "step": 376 + }, + { + "epoch": 6.18276220145379, + "grad_norm": 0.280127614736557, + "learning_rate": 3.878022681458426e-05, + "loss": 0.1295, + "num_input_tokens_seen": 27461280, + "step": 377 + }, + { + "epoch": 6.1993769470404985, + "grad_norm": 0.255588173866272, + "learning_rate": 3.87255704499533e-05, + "loss": 0.1197, + "num_input_tokens_seen": 27556400, + "step": 378 + }, + { + "epoch": 6.215991692627207, + "grad_norm": 0.2888232171535492, + "learning_rate": 3.8670820011799315e-05, + "loss": 0.1417, + "num_input_tokens_seen": 27613664, + "step": 379 + }, + { + "epoch": 6.232606438213915, + "grad_norm": 0.28014957904815674, + "learning_rate": 3.861597587537568e-05, + "loss": 0.1565, + "num_input_tokens_seen": 27681024, + "step": 380 + }, + { + "epoch": 6.249221183800623, + "grad_norm": 0.2911868095397949, + "learning_rate": 3.856103841657797e-05, + "loss": 0.1293, + "num_input_tokens_seen": 27759536, + "step": 381 + }, + { + "epoch": 6.265835929387332, + "grad_norm": 0.19534167647361755, + "learning_rate": 3.850600801194138e-05, + "loss": 0.0967, + "num_input_tokens_seen": 27857288, + "step": 382 + }, + { + "epoch": 6.282450674974039, + "grad_norm": 0.2868412733078003, + "learning_rate": 3.8450885038638127e-05, + "loss": 0.1395, + "num_input_tokens_seen": 27940528, + "step": 383 + }, + { + "epoch": 6.299065420560748, + "grad_norm": 0.3421030342578888, + "learning_rate": 3.8395669874474915e-05, + "loss": 0.1402, + "num_input_tokens_seen": 28033824, + "step": 384 + }, + { + "epoch": 6.315680166147456, + "grad_norm": 0.2819141447544098, + "learning_rate": 3.834036289789029e-05, + "loss": 0.1137, + "num_input_tokens_seen": 28096192, + "step": 385 + }, + { + "epoch": 6.332294911734164, + "grad_norm": 0.22595854103565216, + "learning_rate": 3.828496448795207e-05, + "loss": 0.1143, + "num_input_tokens_seen": 28181256, + "step": 386 + }, + { + "epoch": 6.348909657320872, + "grad_norm": 0.2450927495956421, + "learning_rate": 3.822947502435477e-05, + "loss": 0.121, + "num_input_tokens_seen": 28245480, + "step": 387 + }, + { + "epoch": 6.365524402907581, + "grad_norm": 0.2552040219306946, + "learning_rate": 3.8173894887416945e-05, + "loss": 0.1183, + "num_input_tokens_seen": 28307200, + "step": 388 + }, + { + "epoch": 6.382139148494288, + "grad_norm": 0.3974437415599823, + "learning_rate": 3.811822445807863e-05, + "loss": 0.1423, + "num_input_tokens_seen": 28384640, + "step": 389 + }, + { + "epoch": 6.398753894080997, + "grad_norm": 0.2750483453273773, + "learning_rate": 3.8062464117898724e-05, + "loss": 0.1068, + "num_input_tokens_seen": 28447992, + "step": 390 + }, + { + "epoch": 6.415368639667705, + "grad_norm": 0.3021928071975708, + "learning_rate": 3.800661424905235e-05, + "loss": 0.1233, + "num_input_tokens_seen": 28513856, + "step": 391 + }, + { + "epoch": 6.431983385254413, + "grad_norm": 0.4087376296520233, + "learning_rate": 3.795067523432826e-05, + "loss": 0.1354, + "num_input_tokens_seen": 28596584, + "step": 392 + }, + { + "epoch": 6.4485981308411215, + "grad_norm": 0.32668039202690125, + "learning_rate": 3.789464745712619e-05, + "loss": 0.1483, + "num_input_tokens_seen": 28664560, + "step": 393 + }, + { + "epoch": 6.46521287642783, + "grad_norm": 0.24187126755714417, + "learning_rate": 3.7838531301454254e-05, + "loss": 0.1225, + "num_input_tokens_seen": 28739512, + "step": 394 + }, + { + "epoch": 6.481827622014538, + "grad_norm": 0.2861412465572357, + "learning_rate": 3.77823271519263e-05, + "loss": 0.1318, + "num_input_tokens_seen": 28831848, + "step": 395 + }, + { + "epoch": 6.498442367601246, + "grad_norm": 0.30458521842956543, + "learning_rate": 3.7726035393759285e-05, + "loss": 0.1337, + "num_input_tokens_seen": 28898408, + "step": 396 + }, + { + "epoch": 6.515057113187955, + "grad_norm": 0.2644833028316498, + "learning_rate": 3.76696564127706e-05, + "loss": 0.1283, + "num_input_tokens_seen": 28960224, + "step": 397 + }, + { + "epoch": 6.531671858774662, + "grad_norm": 0.3309299349784851, + "learning_rate": 3.761319059537548e-05, + "loss": 0.1548, + "num_input_tokens_seen": 29020568, + "step": 398 + }, + { + "epoch": 6.548286604361371, + "grad_norm": 0.2788560092449188, + "learning_rate": 3.755663832858432e-05, + "loss": 0.1154, + "num_input_tokens_seen": 29095448, + "step": 399 + }, + { + "epoch": 6.564901349948079, + "grad_norm": 0.26450732350349426, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.1276, + "num_input_tokens_seen": 29186600, + "step": 400 + }, + { + "epoch": 6.581516095534787, + "grad_norm": 0.2650800049304962, + "learning_rate": 3.744327599781531e-05, + "loss": 0.2184, + "num_input_tokens_seen": 29258552, + "step": 401 + }, + { + "epoch": 6.598130841121495, + "grad_norm": 0.2814129590988159, + "learning_rate": 3.7386466710810194e-05, + "loss": 0.1508, + "num_input_tokens_seen": 29344848, + "step": 402 + }, + { + "epoch": 6.614745586708204, + "grad_norm": 0.2940891981124878, + "learning_rate": 3.7329572528349146e-05, + "loss": 0.2035, + "num_input_tokens_seen": 29410184, + "step": 403 + }, + { + "epoch": 6.6313603322949115, + "grad_norm": 0.25937849283218384, + "learning_rate": 3.727259384037852e-05, + "loss": 0.1308, + "num_input_tokens_seen": 29484928, + "step": 404 + }, + { + "epoch": 6.64797507788162, + "grad_norm": 0.26631370186805725, + "learning_rate": 3.721553103742388e-05, + "loss": 0.1349, + "num_input_tokens_seen": 29566432, + "step": 405 + }, + { + "epoch": 6.6645898234683285, + "grad_norm": 0.2721930146217346, + "learning_rate": 3.715838451058726e-05, + "loss": 0.1246, + "num_input_tokens_seen": 29634032, + "step": 406 + }, + { + "epoch": 6.681204569055036, + "grad_norm": 0.3131358325481415, + "learning_rate": 3.7101154651544584e-05, + "loss": 0.142, + "num_input_tokens_seen": 29681424, + "step": 407 + }, + { + "epoch": 6.697819314641745, + "grad_norm": 0.266184002161026, + "learning_rate": 3.704384185254288e-05, + "loss": 0.118, + "num_input_tokens_seen": 29762208, + "step": 408 + }, + { + "epoch": 6.714434060228453, + "grad_norm": 0.29790574312210083, + "learning_rate": 3.6986446506397666e-05, + "loss": 0.1286, + "num_input_tokens_seen": 29816280, + "step": 409 + }, + { + "epoch": 6.731048805815161, + "grad_norm": 0.2710191309452057, + "learning_rate": 3.692896900649021e-05, + "loss": 0.1211, + "num_input_tokens_seen": 29893040, + "step": 410 + }, + { + "epoch": 6.747663551401869, + "grad_norm": 0.46238377690315247, + "learning_rate": 3.6871409746764865e-05, + "loss": 0.1324, + "num_input_tokens_seen": 29971688, + "step": 411 + }, + { + "epoch": 6.764278296988578, + "grad_norm": 0.24678252637386322, + "learning_rate": 3.681376912172636e-05, + "loss": 0.1175, + "num_input_tokens_seen": 30051784, + "step": 412 + }, + { + "epoch": 6.780893042575285, + "grad_norm": 0.2699359655380249, + "learning_rate": 3.675604752643706e-05, + "loss": 0.1279, + "num_input_tokens_seen": 30146048, + "step": 413 + }, + { + "epoch": 6.797507788161994, + "grad_norm": 0.3923390805721283, + "learning_rate": 3.6698245356514335e-05, + "loss": 0.1204, + "num_input_tokens_seen": 30221296, + "step": 414 + }, + { + "epoch": 6.814122533748702, + "grad_norm": 0.26230940222740173, + "learning_rate": 3.6640363008127784e-05, + "loss": 0.1311, + "num_input_tokens_seen": 30287664, + "step": 415 + }, + { + "epoch": 6.83073727933541, + "grad_norm": 0.31548890471458435, + "learning_rate": 3.6582400877996546e-05, + "loss": 0.1417, + "num_input_tokens_seen": 30352816, + "step": 416 + }, + { + "epoch": 6.8473520249221185, + "grad_norm": 0.25795799493789673, + "learning_rate": 3.652435936338656e-05, + "loss": 0.1316, + "num_input_tokens_seen": 30439688, + "step": 417 + }, + { + "epoch": 6.863966770508826, + "grad_norm": 0.3696773648262024, + "learning_rate": 3.646623886210788e-05, + "loss": 0.1422, + "num_input_tokens_seen": 30506856, + "step": 418 + }, + { + "epoch": 6.880581516095535, + "grad_norm": 0.28359490633010864, + "learning_rate": 3.64080397725119e-05, + "loss": 0.1148, + "num_input_tokens_seen": 30565848, + "step": 419 + }, + { + "epoch": 6.897196261682243, + "grad_norm": 0.3696165978908539, + "learning_rate": 3.634976249348867e-05, + "loss": 0.1472, + "num_input_tokens_seen": 30633944, + "step": 420 + }, + { + "epoch": 6.913811007268951, + "grad_norm": 0.2775883078575134, + "learning_rate": 3.629140742446414e-05, + "loss": 0.1281, + "num_input_tokens_seen": 30704760, + "step": 421 + }, + { + "epoch": 6.930425752855659, + "grad_norm": 0.3016800582408905, + "learning_rate": 3.623297496539741e-05, + "loss": 0.1207, + "num_input_tokens_seen": 30773792, + "step": 422 + }, + { + "epoch": 6.947040498442368, + "grad_norm": 0.2855510711669922, + "learning_rate": 3.6174465516778035e-05, + "loss": 0.1274, + "num_input_tokens_seen": 30848672, + "step": 423 + }, + { + "epoch": 6.963655244029075, + "grad_norm": 0.27131325006484985, + "learning_rate": 3.611587947962319e-05, + "loss": 0.1234, + "num_input_tokens_seen": 30906064, + "step": 424 + }, + { + "epoch": 6.980269989615784, + "grad_norm": 0.35402408242225647, + "learning_rate": 3.6057217255475034e-05, + "loss": 0.1465, + "num_input_tokens_seen": 30964720, + "step": 425 + }, + { + "epoch": 6.996884735202492, + "grad_norm": 0.282308965921402, + "learning_rate": 3.599847924639788e-05, + "loss": 0.133, + "num_input_tokens_seen": 31043152, + "step": 426 + }, + { + "epoch": 7.0, + "grad_norm": 0.5601735711097717, + "learning_rate": 3.593966585497547e-05, + "loss": 0.106, + "num_input_tokens_seen": 31056056, + "step": 427 + }, + { + "epoch": 7.0166147455867085, + "grad_norm": 0.26449844241142273, + "learning_rate": 3.588077748430819e-05, + "loss": 0.1082, + "num_input_tokens_seen": 31135304, + "step": 428 + }, + { + "epoch": 7.033229491173416, + "grad_norm": 0.2989669442176819, + "learning_rate": 3.582181453801036e-05, + "loss": 0.1217, + "num_input_tokens_seen": 31185600, + "step": 429 + }, + { + "epoch": 7.049844236760125, + "grad_norm": 0.3569459021091461, + "learning_rate": 3.576277742020738e-05, + "loss": 0.119, + "num_input_tokens_seen": 31254312, + "step": 430 + }, + { + "epoch": 7.066458982346833, + "grad_norm": 0.3184427320957184, + "learning_rate": 3.570366653553307e-05, + "loss": 0.1131, + "num_input_tokens_seen": 31339112, + "step": 431 + }, + { + "epoch": 7.083073727933541, + "grad_norm": 0.2780788540840149, + "learning_rate": 3.564448228912682e-05, + "loss": 0.0967, + "num_input_tokens_seen": 31424024, + "step": 432 + }, + { + "epoch": 7.099688473520249, + "grad_norm": 0.29954010248184204, + "learning_rate": 3.558522508663081e-05, + "loss": 0.124, + "num_input_tokens_seen": 31494656, + "step": 433 + }, + { + "epoch": 7.116303219106958, + "grad_norm": 0.235326886177063, + "learning_rate": 3.552589533418728e-05, + "loss": 0.1102, + "num_input_tokens_seen": 31588536, + "step": 434 + }, + { + "epoch": 7.132917964693665, + "grad_norm": 0.31165215373039246, + "learning_rate": 3.54664934384357e-05, + "loss": 0.2184, + "num_input_tokens_seen": 31657560, + "step": 435 + }, + { + "epoch": 7.149532710280374, + "grad_norm": 0.30129772424697876, + "learning_rate": 3.540701980651003e-05, + "loss": 0.1208, + "num_input_tokens_seen": 31743992, + "step": 436 + }, + { + "epoch": 7.166147455867082, + "grad_norm": 0.2609243094921112, + "learning_rate": 3.534747484603587e-05, + "loss": 0.108, + "num_input_tokens_seen": 31806520, + "step": 437 + }, + { + "epoch": 7.18276220145379, + "grad_norm": 0.3314545452594757, + "learning_rate": 3.528785896512772e-05, + "loss": 0.1154, + "num_input_tokens_seen": 31860464, + "step": 438 + }, + { + "epoch": 7.1993769470404985, + "grad_norm": 0.29827994108200073, + "learning_rate": 3.5228172572386146e-05, + "loss": 0.2637, + "num_input_tokens_seen": 31921424, + "step": 439 + }, + { + "epoch": 7.215991692627207, + "grad_norm": 0.3889339566230774, + "learning_rate": 3.516841607689501e-05, + "loss": 0.1164, + "num_input_tokens_seen": 31981064, + "step": 440 + }, + { + "epoch": 7.232606438213915, + "grad_norm": 0.3092401325702667, + "learning_rate": 3.510858988821863e-05, + "loss": 0.1085, + "num_input_tokens_seen": 32050648, + "step": 441 + }, + { + "epoch": 7.249221183800623, + "grad_norm": 0.28353771567344666, + "learning_rate": 3.504869441639901e-05, + "loss": 0.1026, + "num_input_tokens_seen": 32118584, + "step": 442 + }, + { + "epoch": 7.265835929387332, + "grad_norm": 0.26136070489883423, + "learning_rate": 3.4988730071953004e-05, + "loss": 0.1035, + "num_input_tokens_seen": 32206384, + "step": 443 + }, + { + "epoch": 7.282450674974039, + "grad_norm": 0.26213783025741577, + "learning_rate": 3.4928697265869515e-05, + "loss": 0.0904, + "num_input_tokens_seen": 32299040, + "step": 444 + }, + { + "epoch": 7.299065420560748, + "grad_norm": 0.2811000347137451, + "learning_rate": 3.486859640960668e-05, + "loss": 0.1053, + "num_input_tokens_seen": 32355624, + "step": 445 + }, + { + "epoch": 7.315680166147456, + "grad_norm": 0.280838280916214, + "learning_rate": 3.480842791508904e-05, + "loss": 0.1227, + "num_input_tokens_seen": 32427792, + "step": 446 + }, + { + "epoch": 7.332294911734164, + "grad_norm": 0.3092939555644989, + "learning_rate": 3.474819219470471e-05, + "loss": 0.1104, + "num_input_tokens_seen": 32508696, + "step": 447 + }, + { + "epoch": 7.348909657320872, + "grad_norm": 0.25746166706085205, + "learning_rate": 3.4687889661302576e-05, + "loss": 0.1099, + "num_input_tokens_seen": 32601312, + "step": 448 + }, + { + "epoch": 7.365524402907581, + "grad_norm": 0.2721652090549469, + "learning_rate": 3.4627520728189456e-05, + "loss": 0.0966, + "num_input_tokens_seen": 32680256, + "step": 449 + }, + { + "epoch": 7.382139148494288, + "grad_norm": 0.3046250641345978, + "learning_rate": 3.456708580912725e-05, + "loss": 0.1138, + "num_input_tokens_seen": 32738816, + "step": 450 + }, + { + "epoch": 7.398753894080997, + "grad_norm": 0.28120365738868713, + "learning_rate": 3.4506585318330125e-05, + "loss": 0.1026, + "num_input_tokens_seen": 32813240, + "step": 451 + }, + { + "epoch": 7.415368639667705, + "grad_norm": 0.29472339153289795, + "learning_rate": 3.444601967046168e-05, + "loss": 0.1183, + "num_input_tokens_seen": 32889680, + "step": 452 + }, + { + "epoch": 7.431983385254413, + "grad_norm": 0.32324308156967163, + "learning_rate": 3.438538928063208e-05, + "loss": 0.1218, + "num_input_tokens_seen": 32964760, + "step": 453 + }, + { + "epoch": 7.4485981308411215, + "grad_norm": 0.2858980894088745, + "learning_rate": 3.432469456439523e-05, + "loss": 0.1226, + "num_input_tokens_seen": 33048992, + "step": 454 + }, + { + "epoch": 7.46521287642783, + "grad_norm": 0.30722907185554504, + "learning_rate": 3.426393593774591e-05, + "loss": 0.1127, + "num_input_tokens_seen": 33130200, + "step": 455 + }, + { + "epoch": 7.481827622014538, + "grad_norm": 0.2962627112865448, + "learning_rate": 3.4203113817116957e-05, + "loss": 0.1057, + "num_input_tokens_seen": 33223024, + "step": 456 + }, + { + "epoch": 7.498442367601246, + "grad_norm": 0.29225122928619385, + "learning_rate": 3.414222861937636e-05, + "loss": 0.1127, + "num_input_tokens_seen": 33303120, + "step": 457 + }, + { + "epoch": 7.515057113187955, + "grad_norm": 0.2806551456451416, + "learning_rate": 3.408128076182446e-05, + "loss": 0.1201, + "num_input_tokens_seen": 33364984, + "step": 458 + }, + { + "epoch": 7.531671858774662, + "grad_norm": 0.30782198905944824, + "learning_rate": 3.402027066219105e-05, + "loss": 0.1318, + "num_input_tokens_seen": 33427352, + "step": 459 + }, + { + "epoch": 7.548286604361371, + "grad_norm": 0.27443554997444153, + "learning_rate": 3.39591987386325e-05, + "loss": 0.1045, + "num_input_tokens_seen": 33481272, + "step": 460 + }, + { + "epoch": 7.564901349948079, + "grad_norm": 0.29877278208732605, + "learning_rate": 3.389806540972898e-05, + "loss": 0.1, + "num_input_tokens_seen": 33538904, + "step": 461 + }, + { + "epoch": 7.581516095534787, + "grad_norm": 0.24680256843566895, + "learning_rate": 3.383687109448143e-05, + "loss": 0.1111, + "num_input_tokens_seen": 33635976, + "step": 462 + }, + { + "epoch": 7.598130841121495, + "grad_norm": 0.2937908470630646, + "learning_rate": 3.377561621230887e-05, + "loss": 0.1123, + "num_input_tokens_seen": 33711184, + "step": 463 + }, + { + "epoch": 7.614745586708204, + "grad_norm": 0.2607901096343994, + "learning_rate": 3.3714301183045385e-05, + "loss": 0.0937, + "num_input_tokens_seen": 33778848, + "step": 464 + }, + { + "epoch": 7.6313603322949115, + "grad_norm": 0.28291332721710205, + "learning_rate": 3.365292642693732e-05, + "loss": 0.1023, + "num_input_tokens_seen": 33866024, + "step": 465 + }, + { + "epoch": 7.64797507788162, + "grad_norm": 0.2455737590789795, + "learning_rate": 3.359149236464041e-05, + "loss": 0.1179, + "num_input_tokens_seen": 33978144, + "step": 466 + }, + { + "epoch": 7.6645898234683285, + "grad_norm": 0.32317623496055603, + "learning_rate": 3.35299994172168e-05, + "loss": 0.1326, + "num_input_tokens_seen": 34047480, + "step": 467 + }, + { + "epoch": 7.681204569055036, + "grad_norm": 0.23927746713161469, + "learning_rate": 3.346844800613229e-05, + "loss": 0.1058, + "num_input_tokens_seen": 34134480, + "step": 468 + }, + { + "epoch": 7.697819314641745, + "grad_norm": 0.28762567043304443, + "learning_rate": 3.340683855325335e-05, + "loss": 0.1077, + "num_input_tokens_seen": 34190176, + "step": 469 + }, + { + "epoch": 7.714434060228453, + "grad_norm": 0.32403019070625305, + "learning_rate": 3.3345171480844275e-05, + "loss": 0.1224, + "num_input_tokens_seen": 34267336, + "step": 470 + }, + { + "epoch": 7.731048805815161, + "grad_norm": 0.2981953024864197, + "learning_rate": 3.3283447211564276e-05, + "loss": 0.116, + "num_input_tokens_seen": 34333616, + "step": 471 + }, + { + "epoch": 7.747663551401869, + "grad_norm": 0.24373090267181396, + "learning_rate": 3.322166616846458e-05, + "loss": 0.1153, + "num_input_tokens_seen": 34404000, + "step": 472 + }, + { + "epoch": 7.764278296988578, + "grad_norm": 0.3025456666946411, + "learning_rate": 3.315982877498555e-05, + "loss": 0.111, + "num_input_tokens_seen": 34466048, + "step": 473 + }, + { + "epoch": 7.780893042575285, + "grad_norm": 0.2972700893878937, + "learning_rate": 3.309793545495374e-05, + "loss": 0.1099, + "num_input_tokens_seen": 34547312, + "step": 474 + }, + { + "epoch": 7.797507788161994, + "grad_norm": 0.32208284735679626, + "learning_rate": 3.303598663257904e-05, + "loss": 0.1029, + "num_input_tokens_seen": 34600544, + "step": 475 + }, + { + "epoch": 7.814122533748702, + "grad_norm": 0.2841314971446991, + "learning_rate": 3.2973982732451755e-05, + "loss": 0.1111, + "num_input_tokens_seen": 34660792, + "step": 476 + }, + { + "epoch": 7.83073727933541, + "grad_norm": 0.25448325276374817, + "learning_rate": 3.2911924179539656e-05, + "loss": 0.162, + "num_input_tokens_seen": 34778440, + "step": 477 + }, + { + "epoch": 7.8473520249221185, + "grad_norm": 0.30900058150291443, + "learning_rate": 3.284981139918513e-05, + "loss": 0.1161, + "num_input_tokens_seen": 34849760, + "step": 478 + }, + { + "epoch": 7.863966770508826, + "grad_norm": 0.26905959844589233, + "learning_rate": 3.278764481710221e-05, + "loss": 0.0973, + "num_input_tokens_seen": 34940776, + "step": 479 + }, + { + "epoch": 7.880581516095535, + "grad_norm": 0.2753785252571106, + "learning_rate": 3.272542485937369e-05, + "loss": 0.1131, + "num_input_tokens_seen": 35018104, + "step": 480 + }, + { + "epoch": 7.897196261682243, + "grad_norm": 0.27028700709342957, + "learning_rate": 3.26631519524482e-05, + "loss": 0.1012, + "num_input_tokens_seen": 35079744, + "step": 481 + }, + { + "epoch": 7.913811007268951, + "grad_norm": 0.2808922231197357, + "learning_rate": 3.260082652313726e-05, + "loss": 0.1, + "num_input_tokens_seen": 35132808, + "step": 482 + }, + { + "epoch": 7.930425752855659, + "grad_norm": 0.2821935713291168, + "learning_rate": 3.253844899861239e-05, + "loss": 0.1005, + "num_input_tokens_seen": 35197816, + "step": 483 + }, + { + "epoch": 7.947040498442368, + "grad_norm": 0.2640717029571533, + "learning_rate": 3.247601980640217e-05, + "loss": 0.0914, + "num_input_tokens_seen": 35275528, + "step": 484 + }, + { + "epoch": 7.963655244029075, + "grad_norm": 0.31811895966529846, + "learning_rate": 3.241353937438927e-05, + "loss": 0.119, + "num_input_tokens_seen": 35333280, + "step": 485 + }, + { + "epoch": 7.980269989615784, + "grad_norm": 0.30607783794403076, + "learning_rate": 3.23510081308076e-05, + "loss": 0.1156, + "num_input_tokens_seen": 35412944, + "step": 486 + }, + { + "epoch": 7.996884735202492, + "grad_norm": 0.29681020975112915, + "learning_rate": 3.228842650423929e-05, + "loss": 0.1217, + "num_input_tokens_seen": 35485056, + "step": 487 + }, + { + "epoch": 8.0, + "grad_norm": 0.6129417419433594, + "learning_rate": 3.222579492361179e-05, + "loss": 0.1188, + "num_input_tokens_seen": 35494824, + "step": 488 + }, + { + "epoch": 8.016614745586708, + "grad_norm": 0.26822784543037415, + "learning_rate": 3.2163113818194964e-05, + "loss": 0.0967, + "num_input_tokens_seen": 35557768, + "step": 489 + }, + { + "epoch": 8.033229491173417, + "grad_norm": 0.261436402797699, + "learning_rate": 3.210038361759807e-05, + "loss": 0.0989, + "num_input_tokens_seen": 35613120, + "step": 490 + }, + { + "epoch": 8.049844236760125, + "grad_norm": 0.26140278577804565, + "learning_rate": 3.2037604751766885e-05, + "loss": 0.096, + "num_input_tokens_seen": 35674176, + "step": 491 + }, + { + "epoch": 8.066458982346832, + "grad_norm": 0.285133957862854, + "learning_rate": 3.1974777650980735e-05, + "loss": 0.1049, + "num_input_tokens_seen": 35786664, + "step": 492 + }, + { + "epoch": 8.083073727933542, + "grad_norm": 0.34326228499412537, + "learning_rate": 3.191190274584952e-05, + "loss": 0.1102, + "num_input_tokens_seen": 35840720, + "step": 493 + }, + { + "epoch": 8.09968847352025, + "grad_norm": 0.23448359966278076, + "learning_rate": 3.184898046731082e-05, + "loss": 0.0864, + "num_input_tokens_seen": 35936736, + "step": 494 + }, + { + "epoch": 8.116303219106957, + "grad_norm": 0.2525218427181244, + "learning_rate": 3.178601124662686e-05, + "loss": 0.1, + "num_input_tokens_seen": 36013800, + "step": 495 + }, + { + "epoch": 8.132917964693666, + "grad_norm": 0.27955323457717896, + "learning_rate": 3.172299551538164e-05, + "loss": 0.0957, + "num_input_tokens_seen": 36097904, + "step": 496 + }, + { + "epoch": 8.149532710280374, + "grad_norm": 0.2300575226545334, + "learning_rate": 3.165993370547794e-05, + "loss": 0.077, + "num_input_tokens_seen": 36195544, + "step": 497 + }, + { + "epoch": 8.166147455867081, + "grad_norm": 0.3093201816082001, + "learning_rate": 3.1596826249134324e-05, + "loss": 0.1195, + "num_input_tokens_seen": 36261256, + "step": 498 + }, + { + "epoch": 8.18276220145379, + "grad_norm": 0.36045631766319275, + "learning_rate": 3.153367357888224e-05, + "loss": 0.1199, + "num_input_tokens_seen": 36325024, + "step": 499 + }, + { + "epoch": 8.199376947040498, + "grad_norm": 0.28502553701400757, + "learning_rate": 3.147047612756302e-05, + "loss": 0.1066, + "num_input_tokens_seen": 36377368, + "step": 500 + }, + { + "epoch": 8.215991692627206, + "grad_norm": 0.2474561631679535, + "learning_rate": 3.140723432832492e-05, + "loss": 0.0908, + "num_input_tokens_seen": 36459240, + "step": 501 + }, + { + "epoch": 8.232606438213915, + "grad_norm": 0.2631191313266754, + "learning_rate": 3.1343948614620145e-05, + "loss": 0.1022, + "num_input_tokens_seen": 36553088, + "step": 502 + }, + { + "epoch": 8.249221183800623, + "grad_norm": 0.34360605478286743, + "learning_rate": 3.128061942020189e-05, + "loss": 0.2542, + "num_input_tokens_seen": 36611464, + "step": 503 + }, + { + "epoch": 8.26583592938733, + "grad_norm": 0.21943052113056183, + "learning_rate": 3.121724717912138e-05, + "loss": 0.0817, + "num_input_tokens_seen": 36705696, + "step": 504 + }, + { + "epoch": 8.28245067497404, + "grad_norm": 0.27961695194244385, + "learning_rate": 3.115383232572483e-05, + "loss": 0.0879, + "num_input_tokens_seen": 36762744, + "step": 505 + }, + { + "epoch": 8.299065420560748, + "grad_norm": 0.3083556294441223, + "learning_rate": 3.109037529465056e-05, + "loss": 0.1057, + "num_input_tokens_seen": 36827816, + "step": 506 + }, + { + "epoch": 8.315680166147455, + "grad_norm": 0.24268627166748047, + "learning_rate": 3.102687652082597e-05, + "loss": 0.0924, + "num_input_tokens_seen": 36931424, + "step": 507 + }, + { + "epoch": 8.332294911734165, + "grad_norm": 0.4235898554325104, + "learning_rate": 3.0963336439464526e-05, + "loss": 0.093, + "num_input_tokens_seen": 36991464, + "step": 508 + }, + { + "epoch": 8.348909657320872, + "grad_norm": 0.2394886463880539, + "learning_rate": 3.089975548606283e-05, + "loss": 0.0784, + "num_input_tokens_seen": 37092928, + "step": 509 + }, + { + "epoch": 8.36552440290758, + "grad_norm": 0.2834029495716095, + "learning_rate": 3.083613409639764e-05, + "loss": 0.0953, + "num_input_tokens_seen": 37168792, + "step": 510 + }, + { + "epoch": 8.38213914849429, + "grad_norm": 0.3311532735824585, + "learning_rate": 3.0772472706522806e-05, + "loss": 0.0964, + "num_input_tokens_seen": 37258864, + "step": 511 + }, + { + "epoch": 8.398753894080997, + "grad_norm": 0.28146132826805115, + "learning_rate": 3.0708771752766394e-05, + "loss": 0.1084, + "num_input_tokens_seen": 37343224, + "step": 512 + }, + { + "epoch": 8.415368639667705, + "grad_norm": 0.35688018798828125, + "learning_rate": 3.06450316717276e-05, + "loss": 0.1068, + "num_input_tokens_seen": 37395488, + "step": 513 + }, + { + "epoch": 8.431983385254414, + "grad_norm": 0.24343422055244446, + "learning_rate": 3.0581252900273786e-05, + "loss": 0.0849, + "num_input_tokens_seen": 37473248, + "step": 514 + }, + { + "epoch": 8.448598130841122, + "grad_norm": 0.30105647444725037, + "learning_rate": 3.0517435875537536e-05, + "loss": 0.0883, + "num_input_tokens_seen": 37532096, + "step": 515 + }, + { + "epoch": 8.46521287642783, + "grad_norm": 0.24181726574897766, + "learning_rate": 3.045358103491357e-05, + "loss": 0.0839, + "num_input_tokens_seen": 37622328, + "step": 516 + }, + { + "epoch": 8.481827622014539, + "grad_norm": 0.3248032331466675, + "learning_rate": 3.038968881605583e-05, + "loss": 0.0999, + "num_input_tokens_seen": 37686304, + "step": 517 + }, + { + "epoch": 8.498442367601246, + "grad_norm": 0.28919899463653564, + "learning_rate": 3.0325759656874418e-05, + "loss": 0.1023, + "num_input_tokens_seen": 37770856, + "step": 518 + }, + { + "epoch": 8.515057113187954, + "grad_norm": 0.32156485319137573, + "learning_rate": 3.026179399553264e-05, + "loss": 0.0907, + "num_input_tokens_seen": 37834072, + "step": 519 + }, + { + "epoch": 8.531671858774663, + "grad_norm": 0.36424997448921204, + "learning_rate": 3.0197792270443982e-05, + "loss": 0.0899, + "num_input_tokens_seen": 37889928, + "step": 520 + }, + { + "epoch": 8.54828660436137, + "grad_norm": 0.3248436450958252, + "learning_rate": 3.0133754920269103e-05, + "loss": 0.2102, + "num_input_tokens_seen": 37971296, + "step": 521 + }, + { + "epoch": 8.564901349948078, + "grad_norm": 0.3297094702720642, + "learning_rate": 3.0069682383912813e-05, + "loss": 0.0978, + "num_input_tokens_seen": 38049288, + "step": 522 + }, + { + "epoch": 8.581516095534788, + "grad_norm": 0.3013320565223694, + "learning_rate": 3.0005575100521118e-05, + "loss": 0.1069, + "num_input_tokens_seen": 38123392, + "step": 523 + }, + { + "epoch": 8.598130841121495, + "grad_norm": 0.2589408755302429, + "learning_rate": 2.9941433509478156e-05, + "loss": 0.0973, + "num_input_tokens_seen": 38208264, + "step": 524 + }, + { + "epoch": 8.614745586708203, + "grad_norm": 0.30129629373550415, + "learning_rate": 2.9877258050403212e-05, + "loss": 0.1045, + "num_input_tokens_seen": 38258192, + "step": 525 + }, + { + "epoch": 8.631360332294912, + "grad_norm": 0.28428325057029724, + "learning_rate": 2.9813049163147688e-05, + "loss": 0.1014, + "num_input_tokens_seen": 38332408, + "step": 526 + }, + { + "epoch": 8.64797507788162, + "grad_norm": 0.23376743495464325, + "learning_rate": 2.974880728779212e-05, + "loss": 0.0816, + "num_input_tokens_seen": 38404960, + "step": 527 + }, + { + "epoch": 8.664589823468328, + "grad_norm": 0.26018524169921875, + "learning_rate": 2.9684532864643122e-05, + "loss": 0.1103, + "num_input_tokens_seen": 38481704, + "step": 528 + }, + { + "epoch": 8.681204569055037, + "grad_norm": 0.3135533332824707, + "learning_rate": 2.9620226334230388e-05, + "loss": 0.0833, + "num_input_tokens_seen": 38546304, + "step": 529 + }, + { + "epoch": 8.697819314641745, + "grad_norm": 0.318968266248703, + "learning_rate": 2.9555888137303695e-05, + "loss": 0.1225, + "num_input_tokens_seen": 38621024, + "step": 530 + }, + { + "epoch": 8.714434060228452, + "grad_norm": 0.2814541161060333, + "learning_rate": 2.949151871482982e-05, + "loss": 0.0874, + "num_input_tokens_seen": 38679368, + "step": 531 + }, + { + "epoch": 8.731048805815162, + "grad_norm": 0.304311603307724, + "learning_rate": 2.9427118507989586e-05, + "loss": 0.1045, + "num_input_tokens_seen": 38753984, + "step": 532 + }, + { + "epoch": 8.74766355140187, + "grad_norm": 0.28034818172454834, + "learning_rate": 2.93626879581748e-05, + "loss": 0.0939, + "num_input_tokens_seen": 38808336, + "step": 533 + }, + { + "epoch": 8.764278296988577, + "grad_norm": 0.29433172941207886, + "learning_rate": 2.929822750698524e-05, + "loss": 0.2031, + "num_input_tokens_seen": 38876624, + "step": 534 + }, + { + "epoch": 8.780893042575286, + "grad_norm": 0.2822960913181305, + "learning_rate": 2.9233737596225613e-05, + "loss": 0.0906, + "num_input_tokens_seen": 38933576, + "step": 535 + }, + { + "epoch": 8.797507788161994, + "grad_norm": 0.26343122124671936, + "learning_rate": 2.916921866790256e-05, + "loss": 0.0876, + "num_input_tokens_seen": 39050816, + "step": 536 + }, + { + "epoch": 8.814122533748701, + "grad_norm": 0.2758285403251648, + "learning_rate": 2.9104671164221576e-05, + "loss": 0.0899, + "num_input_tokens_seen": 39101856, + "step": 537 + }, + { + "epoch": 8.83073727933541, + "grad_norm": 0.2846848964691162, + "learning_rate": 2.9040095527584032e-05, + "loss": 0.094, + "num_input_tokens_seen": 39161928, + "step": 538 + }, + { + "epoch": 8.847352024922118, + "grad_norm": 0.32669830322265625, + "learning_rate": 2.897549220058411e-05, + "loss": 0.1077, + "num_input_tokens_seen": 39216048, + "step": 539 + }, + { + "epoch": 8.863966770508826, + "grad_norm": 0.28703227639198303, + "learning_rate": 2.8910861626005776e-05, + "loss": 0.0857, + "num_input_tokens_seen": 39317320, + "step": 540 + }, + { + "epoch": 8.880581516095535, + "grad_norm": 0.32840797305107117, + "learning_rate": 2.884620424681976e-05, + "loss": 0.0981, + "num_input_tokens_seen": 39383120, + "step": 541 + }, + { + "epoch": 8.897196261682243, + "grad_norm": 0.2913094758987427, + "learning_rate": 2.8781520506180486e-05, + "loss": 0.0969, + "num_input_tokens_seen": 39458584, + "step": 542 + }, + { + "epoch": 8.91381100726895, + "grad_norm": 0.23351797461509705, + "learning_rate": 2.871681084742308e-05, + "loss": 0.0809, + "num_input_tokens_seen": 39535152, + "step": 543 + }, + { + "epoch": 8.93042575285566, + "grad_norm": 0.32754406332969666, + "learning_rate": 2.8652075714060295e-05, + "loss": 0.1064, + "num_input_tokens_seen": 39590360, + "step": 544 + }, + { + "epoch": 8.947040498442368, + "grad_norm": 0.28002288937568665, + "learning_rate": 2.858731554977948e-05, + "loss": 0.0897, + "num_input_tokens_seen": 39669984, + "step": 545 + }, + { + "epoch": 8.963655244029075, + "grad_norm": 0.2599540054798126, + "learning_rate": 2.8522530798439567e-05, + "loss": 0.0883, + "num_input_tokens_seen": 39757392, + "step": 546 + }, + { + "epoch": 8.980269989615785, + "grad_norm": 0.29728278517723083, + "learning_rate": 2.845772190406798e-05, + "loss": 0.1024, + "num_input_tokens_seen": 39848064, + "step": 547 + }, + { + "epoch": 8.996884735202492, + "grad_norm": 0.27901116013526917, + "learning_rate": 2.8392889310857612e-05, + "loss": 0.093, + "num_input_tokens_seen": 39922288, + "step": 548 + }, + { + "epoch": 9.0, + "grad_norm": 0.7538627982139587, + "learning_rate": 2.832803346316381e-05, + "loss": 0.0997, + "num_input_tokens_seen": 39932640, + "step": 549 + }, + { + "epoch": 9.016614745586708, + "grad_norm": 0.259941965341568, + "learning_rate": 2.8263154805501297e-05, + "loss": 0.0756, + "num_input_tokens_seen": 40005688, + "step": 550 + }, + { + "epoch": 9.033229491173417, + "grad_norm": 0.27237728238105774, + "learning_rate": 2.819825378254111e-05, + "loss": 0.0851, + "num_input_tokens_seen": 40057120, + "step": 551 + }, + { + "epoch": 9.049844236760125, + "grad_norm": 0.28233349323272705, + "learning_rate": 2.8133330839107608e-05, + "loss": 0.0841, + "num_input_tokens_seen": 40135992, + "step": 552 + }, + { + "epoch": 9.066458982346832, + "grad_norm": 0.278773695230484, + "learning_rate": 2.8068386420175375e-05, + "loss": 0.1069, + "num_input_tokens_seen": 40196928, + "step": 553 + }, + { + "epoch": 9.083073727933542, + "grad_norm": 0.28890740871429443, + "learning_rate": 2.8003420970866177e-05, + "loss": 0.0851, + "num_input_tokens_seen": 40269392, + "step": 554 + }, + { + "epoch": 9.09968847352025, + "grad_norm": 0.33677053451538086, + "learning_rate": 2.7938434936445945e-05, + "loss": 0.0899, + "num_input_tokens_seen": 40350080, + "step": 555 + }, + { + "epoch": 9.116303219106957, + "grad_norm": 0.33169543743133545, + "learning_rate": 2.787342876232167e-05, + "loss": 0.0991, + "num_input_tokens_seen": 40416360, + "step": 556 + }, + { + "epoch": 9.132917964693666, + "grad_norm": 0.2758564054965973, + "learning_rate": 2.780840289403839e-05, + "loss": 0.0733, + "num_input_tokens_seen": 40490432, + "step": 557 + }, + { + "epoch": 9.149532710280374, + "grad_norm": 0.29123055934906006, + "learning_rate": 2.774335777727613e-05, + "loss": 0.0927, + "num_input_tokens_seen": 40561784, + "step": 558 + }, + { + "epoch": 9.166147455867081, + "grad_norm": 0.26718494296073914, + "learning_rate": 2.7678293857846844e-05, + "loss": 0.083, + "num_input_tokens_seen": 40641728, + "step": 559 + }, + { + "epoch": 9.18276220145379, + "grad_norm": 0.30949828028678894, + "learning_rate": 2.761321158169134e-05, + "loss": 0.0975, + "num_input_tokens_seen": 40700744, + "step": 560 + }, + { + "epoch": 9.199376947040498, + "grad_norm": 0.3187688887119293, + "learning_rate": 2.754811139487625e-05, + "loss": 0.0995, + "num_input_tokens_seen": 40748048, + "step": 561 + }, + { + "epoch": 9.215991692627206, + "grad_norm": 0.2677261531352997, + "learning_rate": 2.7482993743590978e-05, + "loss": 0.073, + "num_input_tokens_seen": 40810104, + "step": 562 + }, + { + "epoch": 9.232606438213915, + "grad_norm": 0.3078905940055847, + "learning_rate": 2.7417859074144604e-05, + "loss": 0.0952, + "num_input_tokens_seen": 40899480, + "step": 563 + }, + { + "epoch": 9.249221183800623, + "grad_norm": 0.25825080275535583, + "learning_rate": 2.7352707832962865e-05, + "loss": 0.0808, + "num_input_tokens_seen": 40993536, + "step": 564 + }, + { + "epoch": 9.26583592938733, + "grad_norm": 0.23915870487689972, + "learning_rate": 2.7287540466585065e-05, + "loss": 0.0698, + "num_input_tokens_seen": 41060848, + "step": 565 + }, + { + "epoch": 9.28245067497404, + "grad_norm": 0.28298041224479675, + "learning_rate": 2.7222357421661042e-05, + "loss": 0.0931, + "num_input_tokens_seen": 41138352, + "step": 566 + }, + { + "epoch": 9.299065420560748, + "grad_norm": 0.3111128509044647, + "learning_rate": 2.7157159144948092e-05, + "loss": 0.0996, + "num_input_tokens_seen": 41212624, + "step": 567 + }, + { + "epoch": 9.315680166147455, + "grad_norm": 0.2831686735153198, + "learning_rate": 2.7091946083307896e-05, + "loss": 0.0764, + "num_input_tokens_seen": 41279472, + "step": 568 + }, + { + "epoch": 9.332294911734165, + "grad_norm": 0.27329790592193604, + "learning_rate": 2.7026718683703473e-05, + "loss": 0.0942, + "num_input_tokens_seen": 41353544, + "step": 569 + }, + { + "epoch": 9.348909657320872, + "grad_norm": 0.23970037698745728, + "learning_rate": 2.6961477393196126e-05, + "loss": 0.0718, + "num_input_tokens_seen": 41444896, + "step": 570 + }, + { + "epoch": 9.36552440290758, + "grad_norm": 0.2980060279369354, + "learning_rate": 2.6896222658942348e-05, + "loss": 0.0906, + "num_input_tokens_seen": 41505152, + "step": 571 + }, + { + "epoch": 9.38213914849429, + "grad_norm": 0.32479342818260193, + "learning_rate": 2.6830954928190794e-05, + "loss": 0.1081, + "num_input_tokens_seen": 41566696, + "step": 572 + }, + { + "epoch": 9.398753894080997, + "grad_norm": 0.2624221444129944, + "learning_rate": 2.6765674648279172e-05, + "loss": 0.0922, + "num_input_tokens_seen": 41641736, + "step": 573 + }, + { + "epoch": 9.415368639667705, + "grad_norm": 0.22994311153888702, + "learning_rate": 2.6700382266631206e-05, + "loss": 0.0748, + "num_input_tokens_seen": 41740008, + "step": 574 + }, + { + "epoch": 9.431983385254414, + "grad_norm": 0.264123797416687, + "learning_rate": 2.663507823075358e-05, + "loss": 0.0813, + "num_input_tokens_seen": 41842808, + "step": 575 + }, + { + "epoch": 9.448598130841122, + "grad_norm": 0.2524157464504242, + "learning_rate": 2.656976298823284e-05, + "loss": 0.0779, + "num_input_tokens_seen": 41917128, + "step": 576 + }, + { + "epoch": 9.46521287642783, + "grad_norm": 0.2805820405483246, + "learning_rate": 2.6504436986732338e-05, + "loss": 0.0882, + "num_input_tokens_seen": 41984232, + "step": 577 + }, + { + "epoch": 9.481827622014539, + "grad_norm": 0.2692512273788452, + "learning_rate": 2.6439100673989187e-05, + "loss": 0.0905, + "num_input_tokens_seen": 42047216, + "step": 578 + }, + { + "epoch": 9.498442367601246, + "grad_norm": 0.26741349697113037, + "learning_rate": 2.637375449781115e-05, + "loss": 0.081, + "num_input_tokens_seen": 42122072, + "step": 579 + }, + { + "epoch": 9.515057113187954, + "grad_norm": 0.24825426936149597, + "learning_rate": 2.63083989060736e-05, + "loss": 0.0777, + "num_input_tokens_seen": 42198480, + "step": 580 + }, + { + "epoch": 9.531671858774663, + "grad_norm": 0.26209551095962524, + "learning_rate": 2.624303434671645e-05, + "loss": 0.0817, + "num_input_tokens_seen": 42289336, + "step": 581 + }, + { + "epoch": 9.54828660436137, + "grad_norm": 0.3296469748020172, + "learning_rate": 2.6177661267741065e-05, + "loss": 0.0931, + "num_input_tokens_seen": 42352288, + "step": 582 + }, + { + "epoch": 9.564901349948078, + "grad_norm": 0.29804670810699463, + "learning_rate": 2.611228011720722e-05, + "loss": 0.0852, + "num_input_tokens_seen": 42453832, + "step": 583 + }, + { + "epoch": 9.581516095534788, + "grad_norm": 0.30008113384246826, + "learning_rate": 2.604689134322999e-05, + "loss": 0.0917, + "num_input_tokens_seen": 42514704, + "step": 584 + }, + { + "epoch": 9.598130841121495, + "grad_norm": 0.3668535649776459, + "learning_rate": 2.598149539397672e-05, + "loss": 0.184, + "num_input_tokens_seen": 42576344, + "step": 585 + }, + { + "epoch": 9.614745586708203, + "grad_norm": 0.26044410467147827, + "learning_rate": 2.591609271766391e-05, + "loss": 0.0799, + "num_input_tokens_seen": 42662824, + "step": 586 + }, + { + "epoch": 9.631360332294912, + "grad_norm": 0.23652903735637665, + "learning_rate": 2.5850683762554184e-05, + "loss": 0.0681, + "num_input_tokens_seen": 42752496, + "step": 587 + }, + { + "epoch": 9.64797507788162, + "grad_norm": 0.2591301202774048, + "learning_rate": 2.578526897695321e-05, + "loss": 0.07, + "num_input_tokens_seen": 42826064, + "step": 588 + }, + { + "epoch": 9.664589823468328, + "grad_norm": 0.27966633439064026, + "learning_rate": 2.5719848809206586e-05, + "loss": 0.1774, + "num_input_tokens_seen": 42895808, + "step": 589 + }, + { + "epoch": 9.681204569055037, + "grad_norm": 0.2826574146747589, + "learning_rate": 2.5654423707696833e-05, + "loss": 0.0925, + "num_input_tokens_seen": 42952408, + "step": 590 + }, + { + "epoch": 9.697819314641745, + "grad_norm": 0.29687875509262085, + "learning_rate": 2.558899412084026e-05, + "loss": 0.0824, + "num_input_tokens_seen": 43025536, + "step": 591 + }, + { + "epoch": 9.714434060228452, + "grad_norm": 0.2201530486345291, + "learning_rate": 2.5523560497083926e-05, + "loss": 0.0634, + "num_input_tokens_seen": 43118024, + "step": 592 + }, + { + "epoch": 9.731048805815162, + "grad_norm": 0.25466641783714294, + "learning_rate": 2.5458123284902573e-05, + "loss": 0.0796, + "num_input_tokens_seen": 43198360, + "step": 593 + }, + { + "epoch": 9.74766355140187, + "grad_norm": 0.2543363571166992, + "learning_rate": 2.539268293279552e-05, + "loss": 0.0681, + "num_input_tokens_seen": 43264072, + "step": 594 + }, + { + "epoch": 9.764278296988577, + "grad_norm": 0.35776007175445557, + "learning_rate": 2.5327239889283612e-05, + "loss": 0.098, + "num_input_tokens_seen": 43339600, + "step": 595 + }, + { + "epoch": 9.780893042575286, + "grad_norm": 0.2951027452945709, + "learning_rate": 2.5261794602906145e-05, + "loss": 0.0857, + "num_input_tokens_seen": 43401136, + "step": 596 + }, + { + "epoch": 9.797507788161994, + "grad_norm": 0.2961483299732208, + "learning_rate": 2.5196347522217784e-05, + "loss": 0.0853, + "num_input_tokens_seen": 43477528, + "step": 597 + }, + { + "epoch": 9.814122533748701, + "grad_norm": 0.23898787796497345, + "learning_rate": 2.513089909578549e-05, + "loss": 0.0825, + "num_input_tokens_seen": 43557352, + "step": 598 + }, + { + "epoch": 9.83073727933541, + "grad_norm": 0.3080408275127411, + "learning_rate": 2.5065449772185456e-05, + "loss": 0.0985, + "num_input_tokens_seen": 43643104, + "step": 599 + }, + { + "epoch": 9.847352024922118, + "grad_norm": 0.3034234941005707, + "learning_rate": 2.5e-05, + "loss": 0.0859, + "num_input_tokens_seen": 43721408, + "step": 600 + }, + { + "epoch": 9.863966770508826, + "grad_norm": 0.23991604149341583, + "learning_rate": 2.4934550227814553e-05, + "loss": 0.0745, + "num_input_tokens_seen": 43802136, + "step": 601 + }, + { + "epoch": 9.880581516095535, + "grad_norm": 0.30657002329826355, + "learning_rate": 2.486910090421451e-05, + "loss": 0.0909, + "num_input_tokens_seen": 43862904, + "step": 602 + }, + { + "epoch": 9.897196261682243, + "grad_norm": 0.27239277958869934, + "learning_rate": 2.480365247778223e-05, + "loss": 0.0946, + "num_input_tokens_seen": 43942056, + "step": 603 + }, + { + "epoch": 9.91381100726895, + "grad_norm": 0.2195497751235962, + "learning_rate": 2.4738205397093864e-05, + "loss": 0.163, + "num_input_tokens_seen": 44016096, + "step": 604 + }, + { + "epoch": 9.93042575285566, + "grad_norm": 0.33310699462890625, + "learning_rate": 2.4672760110716394e-05, + "loss": 0.1059, + "num_input_tokens_seen": 44065504, + "step": 605 + }, + { + "epoch": 9.947040498442368, + "grad_norm": 0.2776658833026886, + "learning_rate": 2.460731706720449e-05, + "loss": 0.0852, + "num_input_tokens_seen": 44114776, + "step": 606 + }, + { + "epoch": 9.963655244029075, + "grad_norm": 0.32274240255355835, + "learning_rate": 2.4541876715097432e-05, + "loss": 0.1065, + "num_input_tokens_seen": 44175184, + "step": 607 + }, + { + "epoch": 9.980269989615785, + "grad_norm": 0.2639608085155487, + "learning_rate": 2.447643950291608e-05, + "loss": 0.0708, + "num_input_tokens_seen": 44265024, + "step": 608 + }, + { + "epoch": 9.996884735202492, + "grad_norm": 0.26623424887657166, + "learning_rate": 2.4411005879159753e-05, + "loss": 0.0803, + "num_input_tokens_seen": 44355400, + "step": 609 + }, + { + "epoch": 10.0, + "grad_norm": 0.6041810512542725, + "learning_rate": 2.4345576292303176e-05, + "loss": 0.0686, + "num_input_tokens_seen": 44370360, + "step": 610 + }, + { + "epoch": 10.016614745586708, + "grad_norm": 0.2492121458053589, + "learning_rate": 2.4280151190793417e-05, + "loss": 0.0685, + "num_input_tokens_seen": 44446816, + "step": 611 + }, + { + "epoch": 10.033229491173417, + "grad_norm": 0.307436466217041, + "learning_rate": 2.4214731023046793e-05, + "loss": 0.0814, + "num_input_tokens_seen": 44503632, + "step": 612 + }, + { + "epoch": 10.049844236760125, + "grad_norm": 0.2768141031265259, + "learning_rate": 2.4149316237445812e-05, + "loss": 0.0782, + "num_input_tokens_seen": 44590320, + "step": 613 + }, + { + "epoch": 10.066458982346832, + "grad_norm": 0.29155638813972473, + "learning_rate": 2.408390728233609e-05, + "loss": 0.0793, + "num_input_tokens_seen": 44655224, + "step": 614 + }, + { + "epoch": 10.083073727933542, + "grad_norm": 0.24246808886528015, + "learning_rate": 2.4018504606023293e-05, + "loss": 0.1559, + "num_input_tokens_seen": 44736200, + "step": 615 + }, + { + "epoch": 10.09968847352025, + "grad_norm": 0.2551959455013275, + "learning_rate": 2.3953108656770016e-05, + "loss": 0.069, + "num_input_tokens_seen": 44804416, + "step": 616 + }, + { + "epoch": 10.116303219106957, + "grad_norm": 0.28595054149627686, + "learning_rate": 2.3887719882792785e-05, + "loss": 0.0694, + "num_input_tokens_seen": 44873864, + "step": 617 + }, + { + "epoch": 10.132917964693666, + "grad_norm": 0.2920657992362976, + "learning_rate": 2.3822338732258937e-05, + "loss": 0.081, + "num_input_tokens_seen": 44936736, + "step": 618 + }, + { + "epoch": 10.149532710280374, + "grad_norm": 0.23846709728240967, + "learning_rate": 2.3756965653283557e-05, + "loss": 0.0612, + "num_input_tokens_seen": 45026952, + "step": 619 + }, + { + "epoch": 10.166147455867081, + "grad_norm": 0.258646160364151, + "learning_rate": 2.3691601093926404e-05, + "loss": 0.0776, + "num_input_tokens_seen": 45104816, + "step": 620 + }, + { + "epoch": 10.18276220145379, + "grad_norm": 0.2821391522884369, + "learning_rate": 2.3626245502188864e-05, + "loss": 0.0707, + "num_input_tokens_seen": 45177392, + "step": 621 + }, + { + "epoch": 10.199376947040498, + "grad_norm": 0.34291672706604004, + "learning_rate": 2.3560899326010822e-05, + "loss": 0.0849, + "num_input_tokens_seen": 45237200, + "step": 622 + }, + { + "epoch": 10.215991692627206, + "grad_norm": 0.32930925488471985, + "learning_rate": 2.3495563013267664e-05, + "loss": 0.0856, + "num_input_tokens_seen": 45293376, + "step": 623 + }, + { + "epoch": 10.232606438213915, + "grad_norm": 0.2715514004230499, + "learning_rate": 2.3430237011767167e-05, + "loss": 0.081, + "num_input_tokens_seen": 45369376, + "step": 624 + }, + { + "epoch": 10.249221183800623, + "grad_norm": 0.3008752167224884, + "learning_rate": 2.3364921769246423e-05, + "loss": 0.0707, + "num_input_tokens_seen": 45439920, + "step": 625 + }, + { + "epoch": 10.26583592938733, + "grad_norm": 0.3019203841686249, + "learning_rate": 2.3299617733368806e-05, + "loss": 0.0791, + "num_input_tokens_seen": 45497992, + "step": 626 + }, + { + "epoch": 10.28245067497404, + "grad_norm": 0.2555345892906189, + "learning_rate": 2.323432535172084e-05, + "loss": 0.0649, + "num_input_tokens_seen": 45562832, + "step": 627 + }, + { + "epoch": 10.299065420560748, + "grad_norm": 0.2813156843185425, + "learning_rate": 2.3169045071809215e-05, + "loss": 0.0835, + "num_input_tokens_seen": 45638712, + "step": 628 + }, + { + "epoch": 10.315680166147455, + "grad_norm": 0.30063870549201965, + "learning_rate": 2.3103777341057655e-05, + "loss": 0.0797, + "num_input_tokens_seen": 45697336, + "step": 629 + }, + { + "epoch": 10.332294911734165, + "grad_norm": 0.26722076535224915, + "learning_rate": 2.303852260680388e-05, + "loss": 0.0892, + "num_input_tokens_seen": 45765680, + "step": 630 + }, + { + "epoch": 10.348909657320872, + "grad_norm": 0.28536635637283325, + "learning_rate": 2.2973281316296533e-05, + "loss": 0.081, + "num_input_tokens_seen": 45829592, + "step": 631 + }, + { + "epoch": 10.36552440290758, + "grad_norm": 0.2656691074371338, + "learning_rate": 2.2908053916692117e-05, + "loss": 0.0816, + "num_input_tokens_seen": 45903304, + "step": 632 + }, + { + "epoch": 10.38213914849429, + "grad_norm": 0.23176686465740204, + "learning_rate": 2.284284085505192e-05, + "loss": 0.0773, + "num_input_tokens_seen": 45999840, + "step": 633 + }, + { + "epoch": 10.398753894080997, + "grad_norm": 0.32047316431999207, + "learning_rate": 2.2777642578338963e-05, + "loss": 0.0773, + "num_input_tokens_seen": 46107264, + "step": 634 + }, + { + "epoch": 10.415368639667705, + "grad_norm": 0.2911181151866913, + "learning_rate": 2.2712459533414944e-05, + "loss": 0.0696, + "num_input_tokens_seen": 46174720, + "step": 635 + }, + { + "epoch": 10.431983385254414, + "grad_norm": 0.22214826941490173, + "learning_rate": 2.2647292167037144e-05, + "loss": 0.063, + "num_input_tokens_seen": 46269984, + "step": 636 + }, + { + "epoch": 10.448598130841122, + "grad_norm": 0.2614048719406128, + "learning_rate": 2.2582140925855395e-05, + "loss": 0.0672, + "num_input_tokens_seen": 46354792, + "step": 637 + }, + { + "epoch": 10.46521287642783, + "grad_norm": 0.3499446213245392, + "learning_rate": 2.251700625640903e-05, + "loss": 0.0755, + "num_input_tokens_seen": 46420480, + "step": 638 + }, + { + "epoch": 10.481827622014539, + "grad_norm": 0.30158787965774536, + "learning_rate": 2.2451888605123754e-05, + "loss": 0.0773, + "num_input_tokens_seen": 46487288, + "step": 639 + }, + { + "epoch": 10.498442367601246, + "grad_norm": 0.2722439169883728, + "learning_rate": 2.238678841830867e-05, + "loss": 0.0769, + "num_input_tokens_seen": 46560016, + "step": 640 + }, + { + "epoch": 10.515057113187954, + "grad_norm": 0.27054470777511597, + "learning_rate": 2.2321706142153162e-05, + "loss": 0.079, + "num_input_tokens_seen": 46640864, + "step": 641 + }, + { + "epoch": 10.531671858774663, + "grad_norm": 0.3000243306159973, + "learning_rate": 2.225664222272387e-05, + "loss": 0.0798, + "num_input_tokens_seen": 46712072, + "step": 642 + }, + { + "epoch": 10.54828660436137, + "grad_norm": 0.37168607115745544, + "learning_rate": 2.219159710596161e-05, + "loss": 0.075, + "num_input_tokens_seen": 46787552, + "step": 643 + }, + { + "epoch": 10.564901349948078, + "grad_norm": 0.28578755259513855, + "learning_rate": 2.212657123767834e-05, + "loss": 0.0724, + "num_input_tokens_seen": 46862880, + "step": 644 + }, + { + "epoch": 10.581516095534788, + "grad_norm": 0.24598972499370575, + "learning_rate": 2.2061565063554064e-05, + "loss": 0.0711, + "num_input_tokens_seen": 46944632, + "step": 645 + }, + { + "epoch": 10.598130841121495, + "grad_norm": 0.24780628085136414, + "learning_rate": 2.1996579029133825e-05, + "loss": 0.0699, + "num_input_tokens_seen": 47029152, + "step": 646 + }, + { + "epoch": 10.614745586708203, + "grad_norm": 0.2675541341304779, + "learning_rate": 2.1931613579824628e-05, + "loss": 0.0717, + "num_input_tokens_seen": 47133448, + "step": 647 + }, + { + "epoch": 10.631360332294912, + "grad_norm": 0.28133469820022583, + "learning_rate": 2.186666916089239e-05, + "loss": 0.0698, + "num_input_tokens_seen": 47219888, + "step": 648 + }, + { + "epoch": 10.64797507788162, + "grad_norm": 0.2919703722000122, + "learning_rate": 2.180174621745889e-05, + "loss": 0.07, + "num_input_tokens_seen": 47286344, + "step": 649 + }, + { + "epoch": 10.664589823468328, + "grad_norm": 0.26447877287864685, + "learning_rate": 2.173684519449872e-05, + "loss": 0.0619, + "num_input_tokens_seen": 47392832, + "step": 650 + }, + { + "epoch": 10.681204569055037, + "grad_norm": 0.32841357588768005, + "learning_rate": 2.1671966536836196e-05, + "loss": 0.0796, + "num_input_tokens_seen": 47456448, + "step": 651 + }, + { + "epoch": 10.697819314641745, + "grad_norm": 0.29451075196266174, + "learning_rate": 2.1607110689142393e-05, + "loss": 0.0696, + "num_input_tokens_seen": 47544608, + "step": 652 + }, + { + "epoch": 10.714434060228452, + "grad_norm": 0.2618974447250366, + "learning_rate": 2.154227809593203e-05, + "loss": 0.0804, + "num_input_tokens_seen": 47605504, + "step": 653 + }, + { + "epoch": 10.731048805815162, + "grad_norm": 0.2926179766654968, + "learning_rate": 2.1477469201560435e-05, + "loss": 0.0675, + "num_input_tokens_seen": 47668856, + "step": 654 + }, + { + "epoch": 10.74766355140187, + "grad_norm": 0.26044753193855286, + "learning_rate": 2.141268445022052e-05, + "loss": 0.0779, + "num_input_tokens_seen": 47732072, + "step": 655 + }, + { + "epoch": 10.764278296988577, + "grad_norm": 0.26107361912727356, + "learning_rate": 2.1347924285939714e-05, + "loss": 0.0645, + "num_input_tokens_seen": 47821128, + "step": 656 + }, + { + "epoch": 10.780893042575286, + "grad_norm": 0.334241658449173, + "learning_rate": 2.1283189152576925e-05, + "loss": 0.0975, + "num_input_tokens_seen": 47888432, + "step": 657 + }, + { + "epoch": 10.797507788161994, + "grad_norm": 0.2968047857284546, + "learning_rate": 2.121847949381952e-05, + "loss": 0.0789, + "num_input_tokens_seen": 47961872, + "step": 658 + }, + { + "epoch": 10.814122533748701, + "grad_norm": 0.2523380219936371, + "learning_rate": 2.1153795753180247e-05, + "loss": 0.0607, + "num_input_tokens_seen": 48049624, + "step": 659 + }, + { + "epoch": 10.83073727933541, + "grad_norm": 0.32714295387268066, + "learning_rate": 2.1089138373994223e-05, + "loss": 0.0868, + "num_input_tokens_seen": 48119288, + "step": 660 + }, + { + "epoch": 10.847352024922118, + "grad_norm": 0.3273003101348877, + "learning_rate": 2.1024507799415903e-05, + "loss": 0.0888, + "num_input_tokens_seen": 48176432, + "step": 661 + }, + { + "epoch": 10.863966770508826, + "grad_norm": 0.27770254015922546, + "learning_rate": 2.0959904472415977e-05, + "loss": 0.1672, + "num_input_tokens_seen": 48252000, + "step": 662 + }, + { + "epoch": 10.880581516095535, + "grad_norm": 0.2864398956298828, + "learning_rate": 2.089532883577843e-05, + "loss": 0.0815, + "num_input_tokens_seen": 48317320, + "step": 663 + }, + { + "epoch": 10.897196261682243, + "grad_norm": 0.30251842737197876, + "learning_rate": 2.0830781332097446e-05, + "loss": 0.0764, + "num_input_tokens_seen": 48384728, + "step": 664 + }, + { + "epoch": 10.91381100726895, + "grad_norm": 0.29925107955932617, + "learning_rate": 2.0766262403774386e-05, + "loss": 0.181, + "num_input_tokens_seen": 48451128, + "step": 665 + }, + { + "epoch": 10.93042575285566, + "grad_norm": 0.24659354984760284, + "learning_rate": 2.070177249301476e-05, + "loss": 0.0658, + "num_input_tokens_seen": 48535696, + "step": 666 + }, + { + "epoch": 10.947040498442368, + "grad_norm": 0.27392905950546265, + "learning_rate": 2.0637312041825205e-05, + "loss": 0.0701, + "num_input_tokens_seen": 48606496, + "step": 667 + }, + { + "epoch": 10.963655244029075, + "grad_norm": 0.33744704723358154, + "learning_rate": 2.057288149201042e-05, + "loss": 0.091, + "num_input_tokens_seen": 48671728, + "step": 668 + }, + { + "epoch": 10.980269989615785, + "grad_norm": 0.28141456842422485, + "learning_rate": 2.0508481285170186e-05, + "loss": 0.0652, + "num_input_tokens_seen": 48735408, + "step": 669 + }, + { + "epoch": 10.996884735202492, + "grad_norm": 0.35630133748054504, + "learning_rate": 2.0444111862696314e-05, + "loss": 0.0916, + "num_input_tokens_seen": 48794360, + "step": 670 + }, + { + "epoch": 11.0, + "grad_norm": 0.9144778847694397, + "learning_rate": 2.037977366576961e-05, + "loss": 0.1137, + "num_input_tokens_seen": 48806360, + "step": 671 + }, + { + "epoch": 11.016614745586708, + "grad_norm": 0.2536274790763855, + "learning_rate": 2.031546713535688e-05, + "loss": 0.0732, + "num_input_tokens_seen": 48873568, + "step": 672 + }, + { + "epoch": 11.033229491173417, + "grad_norm": 0.22841258347034454, + "learning_rate": 2.025119271220789e-05, + "loss": 0.0561, + "num_input_tokens_seen": 48959880, + "step": 673 + }, + { + "epoch": 11.049844236760125, + "grad_norm": 0.30224987864494324, + "learning_rate": 2.018695083685232e-05, + "loss": 0.0598, + "num_input_tokens_seen": 49048936, + "step": 674 + }, + { + "epoch": 11.066458982346832, + "grad_norm": 0.24764464795589447, + "learning_rate": 2.0122741949596797e-05, + "loss": 0.0586, + "num_input_tokens_seen": 49119216, + "step": 675 + }, + { + "epoch": 11.083073727933542, + "grad_norm": 0.30028969049453735, + "learning_rate": 2.0058566490521847e-05, + "loss": 0.0718, + "num_input_tokens_seen": 49183408, + "step": 676 + }, + { + "epoch": 11.09968847352025, + "grad_norm": 0.2609259784221649, + "learning_rate": 1.9994424899478885e-05, + "loss": 0.062, + "num_input_tokens_seen": 49275352, + "step": 677 + }, + { + "epoch": 11.116303219106957, + "grad_norm": 0.34472572803497314, + "learning_rate": 1.9930317616087196e-05, + "loss": 0.0794, + "num_input_tokens_seen": 49326536, + "step": 678 + }, + { + "epoch": 11.132917964693666, + "grad_norm": 0.28830835223197937, + "learning_rate": 1.986624507973091e-05, + "loss": 0.066, + "num_input_tokens_seen": 49390840, + "step": 679 + }, + { + "epoch": 11.149532710280374, + "grad_norm": 0.38434937596321106, + "learning_rate": 1.980220772955602e-05, + "loss": 0.0612, + "num_input_tokens_seen": 49474128, + "step": 680 + }, + { + "epoch": 11.166147455867081, + "grad_norm": 0.31176745891571045, + "learning_rate": 1.9738206004467363e-05, + "loss": 0.0715, + "num_input_tokens_seen": 49534248, + "step": 681 + }, + { + "epoch": 11.18276220145379, + "grad_norm": 0.23110291361808777, + "learning_rate": 1.9674240343125588e-05, + "loss": 0.0535, + "num_input_tokens_seen": 49621320, + "step": 682 + }, + { + "epoch": 11.199376947040498, + "grad_norm": 0.3060101270675659, + "learning_rate": 1.961031118394418e-05, + "loss": 0.0729, + "num_input_tokens_seen": 49689760, + "step": 683 + }, + { + "epoch": 11.215991692627206, + "grad_norm": 0.2873547077178955, + "learning_rate": 1.9546418965086442e-05, + "loss": 0.0635, + "num_input_tokens_seen": 49771640, + "step": 684 + }, + { + "epoch": 11.232606438213915, + "grad_norm": 0.2780902087688446, + "learning_rate": 1.9482564124462476e-05, + "loss": 0.0644, + "num_input_tokens_seen": 49838736, + "step": 685 + }, + { + "epoch": 11.249221183800623, + "grad_norm": 0.3456045091152191, + "learning_rate": 1.941874709972622e-05, + "loss": 0.079, + "num_input_tokens_seen": 49908376, + "step": 686 + }, + { + "epoch": 11.26583592938733, + "grad_norm": 0.22289884090423584, + "learning_rate": 1.935496832827241e-05, + "loss": 0.0518, + "num_input_tokens_seen": 50006464, + "step": 687 + }, + { + "epoch": 11.28245067497404, + "grad_norm": 0.330370157957077, + "learning_rate": 1.9291228247233605e-05, + "loss": 0.075, + "num_input_tokens_seen": 50081920, + "step": 688 + }, + { + "epoch": 11.299065420560748, + "grad_norm": 0.2642322778701782, + "learning_rate": 1.9227527293477186e-05, + "loss": 0.1651, + "num_input_tokens_seen": 50145832, + "step": 689 + }, + { + "epoch": 11.315680166147455, + "grad_norm": 0.32244759798049927, + "learning_rate": 1.9163865903602374e-05, + "loss": 0.0777, + "num_input_tokens_seen": 50193752, + "step": 690 + }, + { + "epoch": 11.332294911734165, + "grad_norm": 0.2895197570323944, + "learning_rate": 1.9100244513937174e-05, + "loss": 0.0765, + "num_input_tokens_seen": 50252792, + "step": 691 + }, + { + "epoch": 11.348909657320872, + "grad_norm": 0.31584399938583374, + "learning_rate": 1.9036663560535483e-05, + "loss": 0.0693, + "num_input_tokens_seen": 50315264, + "step": 692 + }, + { + "epoch": 11.36552440290758, + "grad_norm": 0.2692103087902069, + "learning_rate": 1.897312347917404e-05, + "loss": 0.0658, + "num_input_tokens_seen": 50379280, + "step": 693 + }, + { + "epoch": 11.38213914849429, + "grad_norm": 0.2717333436012268, + "learning_rate": 1.890962470534944e-05, + "loss": 0.0709, + "num_input_tokens_seen": 50459248, + "step": 694 + }, + { + "epoch": 11.398753894080997, + "grad_norm": 0.23477673530578613, + "learning_rate": 1.8846167674275176e-05, + "loss": 0.0649, + "num_input_tokens_seen": 50552672, + "step": 695 + }, + { + "epoch": 11.415368639667705, + "grad_norm": 0.30369827151298523, + "learning_rate": 1.8782752820878634e-05, + "loss": 0.062, + "num_input_tokens_seen": 50613744, + "step": 696 + }, + { + "epoch": 11.431983385254414, + "grad_norm": 0.2870631814002991, + "learning_rate": 1.8719380579798112e-05, + "loss": 0.0667, + "num_input_tokens_seen": 50690248, + "step": 697 + }, + { + "epoch": 11.448598130841122, + "grad_norm": 0.2412702739238739, + "learning_rate": 1.865605138537986e-05, + "loss": 0.0638, + "num_input_tokens_seen": 50782136, + "step": 698 + }, + { + "epoch": 11.46521287642783, + "grad_norm": 0.3324171304702759, + "learning_rate": 1.8592765671675084e-05, + "loss": 0.0806, + "num_input_tokens_seen": 50843280, + "step": 699 + }, + { + "epoch": 11.481827622014539, + "grad_norm": 0.25983986258506775, + "learning_rate": 1.852952387243698e-05, + "loss": 0.0738, + "num_input_tokens_seen": 50909176, + "step": 700 + }, + { + "epoch": 11.498442367601246, + "grad_norm": 0.2748391628265381, + "learning_rate": 1.846632642111777e-05, + "loss": 0.0823, + "num_input_tokens_seen": 50979512, + "step": 701 + }, + { + "epoch": 11.515057113187954, + "grad_norm": 0.2953915297985077, + "learning_rate": 1.8403173750865685e-05, + "loss": 0.0726, + "num_input_tokens_seen": 51043064, + "step": 702 + }, + { + "epoch": 11.531671858774663, + "grad_norm": 0.27582335472106934, + "learning_rate": 1.8340066294522068e-05, + "loss": 0.0812, + "num_input_tokens_seen": 51117784, + "step": 703 + }, + { + "epoch": 11.54828660436137, + "grad_norm": 0.27300143241882324, + "learning_rate": 1.827700448461836e-05, + "loss": 0.0661, + "num_input_tokens_seen": 51190232, + "step": 704 + }, + { + "epoch": 11.564901349948078, + "grad_norm": 0.27855339646339417, + "learning_rate": 1.8213988753373146e-05, + "loss": 0.0714, + "num_input_tokens_seen": 51261080, + "step": 705 + }, + { + "epoch": 11.581516095534788, + "grad_norm": 0.21793155372142792, + "learning_rate": 1.815101953268919e-05, + "loss": 0.0573, + "num_input_tokens_seen": 51341592, + "step": 706 + }, + { + "epoch": 11.598130841121495, + "grad_norm": 0.29607975482940674, + "learning_rate": 1.8088097254150486e-05, + "loss": 0.0625, + "num_input_tokens_seen": 51411168, + "step": 707 + }, + { + "epoch": 11.614745586708203, + "grad_norm": 0.3065016269683838, + "learning_rate": 1.802522234901927e-05, + "loss": 0.0729, + "num_input_tokens_seen": 51467184, + "step": 708 + }, + { + "epoch": 11.631360332294912, + "grad_norm": 0.2603945732116699, + "learning_rate": 1.7962395248233114e-05, + "loss": 0.1445, + "num_input_tokens_seen": 51567896, + "step": 709 + }, + { + "epoch": 11.64797507788162, + "grad_norm": 0.298199862241745, + "learning_rate": 1.7899616382401936e-05, + "loss": 0.0814, + "num_input_tokens_seen": 51629240, + "step": 710 + }, + { + "epoch": 11.664589823468328, + "grad_norm": 0.33891791105270386, + "learning_rate": 1.783688618180504e-05, + "loss": 0.087, + "num_input_tokens_seen": 51687552, + "step": 711 + }, + { + "epoch": 11.681204569055037, + "grad_norm": 0.2577643394470215, + "learning_rate": 1.7774205076388206e-05, + "loss": 0.0658, + "num_input_tokens_seen": 51784232, + "step": 712 + }, + { + "epoch": 11.697819314641745, + "grad_norm": 0.27919164299964905, + "learning_rate": 1.7711573495760725e-05, + "loss": 0.0708, + "num_input_tokens_seen": 51852288, + "step": 713 + }, + { + "epoch": 11.714434060228452, + "grad_norm": 0.22918739914894104, + "learning_rate": 1.7648991869192405e-05, + "loss": 0.0526, + "num_input_tokens_seen": 51948408, + "step": 714 + }, + { + "epoch": 11.731048805815162, + "grad_norm": 0.2642481327056885, + "learning_rate": 1.7586460625610728e-05, + "loss": 0.0648, + "num_input_tokens_seen": 52029512, + "step": 715 + }, + { + "epoch": 11.74766355140187, + "grad_norm": 0.2955224812030792, + "learning_rate": 1.7523980193597836e-05, + "loss": 0.073, + "num_input_tokens_seen": 52095864, + "step": 716 + }, + { + "epoch": 11.764278296988577, + "grad_norm": 0.2940084934234619, + "learning_rate": 1.746155100138761e-05, + "loss": 0.0764, + "num_input_tokens_seen": 52154880, + "step": 717 + }, + { + "epoch": 11.780893042575286, + "grad_norm": 0.3002694547176361, + "learning_rate": 1.739917347686274e-05, + "loss": 0.0807, + "num_input_tokens_seen": 52209696, + "step": 718 + }, + { + "epoch": 11.797507788161994, + "grad_norm": 0.22952137887477875, + "learning_rate": 1.7336848047551814e-05, + "loss": 0.0552, + "num_input_tokens_seen": 52290608, + "step": 719 + }, + { + "epoch": 11.814122533748701, + "grad_norm": 0.256875216960907, + "learning_rate": 1.7274575140626318e-05, + "loss": 0.0628, + "num_input_tokens_seen": 52364616, + "step": 720 + }, + { + "epoch": 11.83073727933541, + "grad_norm": 0.4035111367702484, + "learning_rate": 1.72123551828978e-05, + "loss": 0.0687, + "num_input_tokens_seen": 52446432, + "step": 721 + }, + { + "epoch": 11.847352024922118, + "grad_norm": 0.34936726093292236, + "learning_rate": 1.7150188600814877e-05, + "loss": 0.1985, + "num_input_tokens_seen": 52500320, + "step": 722 + }, + { + "epoch": 11.863966770508826, + "grad_norm": 0.2872942090034485, + "learning_rate": 1.7088075820460346e-05, + "loss": 0.0639, + "num_input_tokens_seen": 52572888, + "step": 723 + }, + { + "epoch": 11.880581516095535, + "grad_norm": 0.2703973650932312, + "learning_rate": 1.702601726754825e-05, + "loss": 0.0715, + "num_input_tokens_seen": 52649240, + "step": 724 + }, + { + "epoch": 11.897196261682243, + "grad_norm": 0.2533208727836609, + "learning_rate": 1.6964013367420966e-05, + "loss": 0.0638, + "num_input_tokens_seen": 52731312, + "step": 725 + }, + { + "epoch": 11.91381100726895, + "grad_norm": 0.24604761600494385, + "learning_rate": 1.690206454504627e-05, + "loss": 0.0668, + "num_input_tokens_seen": 52829968, + "step": 726 + }, + { + "epoch": 11.93042575285566, + "grad_norm": 0.21317115426063538, + "learning_rate": 1.6840171225014457e-05, + "loss": 0.0567, + "num_input_tokens_seen": 52938304, + "step": 727 + }, + { + "epoch": 11.947040498442368, + "grad_norm": 0.3644583821296692, + "learning_rate": 1.677833383153542e-05, + "loss": 0.0798, + "num_input_tokens_seen": 53003736, + "step": 728 + }, + { + "epoch": 11.963655244029075, + "grad_norm": 0.293142169713974, + "learning_rate": 1.6716552788435724e-05, + "loss": 0.0621, + "num_input_tokens_seen": 53071872, + "step": 729 + }, + { + "epoch": 11.980269989615785, + "grad_norm": 0.2816755473613739, + "learning_rate": 1.665482851915573e-05, + "loss": 0.06, + "num_input_tokens_seen": 53145192, + "step": 730 + }, + { + "epoch": 11.996884735202492, + "grad_norm": 0.21882176399230957, + "learning_rate": 1.659316144674666e-05, + "loss": 0.0551, + "num_input_tokens_seen": 53229240, + "step": 731 + }, + { + "epoch": 12.0, + "grad_norm": 0.5393900275230408, + "learning_rate": 1.6531551993867717e-05, + "loss": 0.0551, + "num_input_tokens_seen": 53245592, + "step": 732 + }, + { + "epoch": 12.016614745586708, + "grad_norm": 0.2326558381319046, + "learning_rate": 1.6470000582783204e-05, + "loss": 0.0617, + "num_input_tokens_seen": 53327016, + "step": 733 + }, + { + "epoch": 12.033229491173417, + "grad_norm": 0.24347102642059326, + "learning_rate": 1.64085076353596e-05, + "loss": 0.062, + "num_input_tokens_seen": 53396464, + "step": 734 + }, + { + "epoch": 12.049844236760125, + "grad_norm": 0.2822597622871399, + "learning_rate": 1.6347073573062672e-05, + "loss": 0.0568, + "num_input_tokens_seen": 53485056, + "step": 735 + }, + { + "epoch": 12.066458982346832, + "grad_norm": 0.27358195185661316, + "learning_rate": 1.6285698816954624e-05, + "loss": 0.0666, + "num_input_tokens_seen": 53553376, + "step": 736 + }, + { + "epoch": 12.083073727933542, + "grad_norm": 0.2802799344062805, + "learning_rate": 1.622438378769114e-05, + "loss": 0.0675, + "num_input_tokens_seen": 53615616, + "step": 737 + }, + { + "epoch": 12.09968847352025, + "grad_norm": 0.26887986063957214, + "learning_rate": 1.6163128905518578e-05, + "loss": 0.0631, + "num_input_tokens_seen": 53689512, + "step": 738 + }, + { + "epoch": 12.116303219106957, + "grad_norm": 0.33475375175476074, + "learning_rate": 1.6101934590271032e-05, + "loss": 0.0775, + "num_input_tokens_seen": 53743080, + "step": 739 + }, + { + "epoch": 12.132917964693666, + "grad_norm": 0.28297415375709534, + "learning_rate": 1.6040801261367493e-05, + "loss": 0.0568, + "num_input_tokens_seen": 53830432, + "step": 740 + }, + { + "epoch": 12.149532710280374, + "grad_norm": 0.2604190707206726, + "learning_rate": 1.5979729337808955e-05, + "loss": 0.0545, + "num_input_tokens_seen": 53914536, + "step": 741 + }, + { + "epoch": 12.166147455867081, + "grad_norm": 0.36683493852615356, + "learning_rate": 1.5918719238175544e-05, + "loss": 0.0553, + "num_input_tokens_seen": 53977072, + "step": 742 + }, + { + "epoch": 12.18276220145379, + "grad_norm": 0.27039340138435364, + "learning_rate": 1.5857771380623643e-05, + "loss": 0.0609, + "num_input_tokens_seen": 54051576, + "step": 743 + }, + { + "epoch": 12.199376947040498, + "grad_norm": 0.2632916271686554, + "learning_rate": 1.5796886182883053e-05, + "loss": 0.0629, + "num_input_tokens_seen": 54126832, + "step": 744 + }, + { + "epoch": 12.215991692627206, + "grad_norm": 0.2716232240200043, + "learning_rate": 1.5736064062254094e-05, + "loss": 0.0579, + "num_input_tokens_seen": 54203832, + "step": 745 + }, + { + "epoch": 12.232606438213915, + "grad_norm": 0.29032984375953674, + "learning_rate": 1.5675305435604775e-05, + "loss": 0.0652, + "num_input_tokens_seen": 54268712, + "step": 746 + }, + { + "epoch": 12.249221183800623, + "grad_norm": 0.2442651391029358, + "learning_rate": 1.561461071936792e-05, + "loss": 0.1332, + "num_input_tokens_seen": 54342024, + "step": 747 + }, + { + "epoch": 12.26583592938733, + "grad_norm": 0.25280654430389404, + "learning_rate": 1.5553980329538326e-05, + "loss": 0.0549, + "num_input_tokens_seen": 54407352, + "step": 748 + }, + { + "epoch": 12.28245067497404, + "grad_norm": 0.3102169930934906, + "learning_rate": 1.549341468166988e-05, + "loss": 0.0639, + "num_input_tokens_seen": 54466560, + "step": 749 + }, + { + "epoch": 12.299065420560748, + "grad_norm": 0.2568191885948181, + "learning_rate": 1.5432914190872757e-05, + "loss": 0.0611, + "num_input_tokens_seen": 54559704, + "step": 750 + }, + { + "epoch": 12.315680166147455, + "grad_norm": 0.3201739192008972, + "learning_rate": 1.537247927181055e-05, + "loss": 0.0735, + "num_input_tokens_seen": 54623736, + "step": 751 + }, + { + "epoch": 12.332294911734165, + "grad_norm": 0.25409963726997375, + "learning_rate": 1.5312110338697426e-05, + "loss": 0.0655, + "num_input_tokens_seen": 54699928, + "step": 752 + }, + { + "epoch": 12.348909657320872, + "grad_norm": 0.2382766753435135, + "learning_rate": 1.5251807805295302e-05, + "loss": 0.0529, + "num_input_tokens_seen": 54782808, + "step": 753 + }, + { + "epoch": 12.36552440290758, + "grad_norm": 0.2533799111843109, + "learning_rate": 1.519157208491097e-05, + "loss": 0.0607, + "num_input_tokens_seen": 54868720, + "step": 754 + }, + { + "epoch": 12.38213914849429, + "grad_norm": 0.19882524013519287, + "learning_rate": 1.5131403590393323e-05, + "loss": 0.0496, + "num_input_tokens_seen": 54960384, + "step": 755 + }, + { + "epoch": 12.398753894080997, + "grad_norm": 0.2437567114830017, + "learning_rate": 1.5071302734130489e-05, + "loss": 0.0553, + "num_input_tokens_seen": 55044232, + "step": 756 + }, + { + "epoch": 12.415368639667705, + "grad_norm": 0.24854236841201782, + "learning_rate": 1.5011269928047003e-05, + "loss": 0.1655, + "num_input_tokens_seen": 55125632, + "step": 757 + }, + { + "epoch": 12.431983385254414, + "grad_norm": 0.2629762887954712, + "learning_rate": 1.4951305583601e-05, + "loss": 0.0588, + "num_input_tokens_seen": 55201720, + "step": 758 + }, + { + "epoch": 12.448598130841122, + "grad_norm": 0.4379749000072479, + "learning_rate": 1.4891410111781378e-05, + "loss": 0.0522, + "num_input_tokens_seen": 55265832, + "step": 759 + }, + { + "epoch": 12.46521287642783, + "grad_norm": 0.3276868462562561, + "learning_rate": 1.4831583923104999e-05, + "loss": 0.0776, + "num_input_tokens_seen": 55316928, + "step": 760 + }, + { + "epoch": 12.481827622014539, + "grad_norm": 0.34918782114982605, + "learning_rate": 1.4771827427613855e-05, + "loss": 0.059, + "num_input_tokens_seen": 55391232, + "step": 761 + }, + { + "epoch": 12.498442367601246, + "grad_norm": 0.28458765149116516, + "learning_rate": 1.4712141034872282e-05, + "loss": 0.0604, + "num_input_tokens_seen": 55460984, + "step": 762 + }, + { + "epoch": 12.515057113187954, + "grad_norm": 0.27223336696624756, + "learning_rate": 1.4652525153964131e-05, + "loss": 0.0556, + "num_input_tokens_seen": 55537056, + "step": 763 + }, + { + "epoch": 12.531671858774663, + "grad_norm": 0.27431365847587585, + "learning_rate": 1.4592980193489975e-05, + "loss": 0.0598, + "num_input_tokens_seen": 55625272, + "step": 764 + }, + { + "epoch": 12.54828660436137, + "grad_norm": 0.2332375943660736, + "learning_rate": 1.4533506561564306e-05, + "loss": 0.05, + "num_input_tokens_seen": 55701992, + "step": 765 + }, + { + "epoch": 12.564901349948078, + "grad_norm": 0.3134738802909851, + "learning_rate": 1.4474104665812727e-05, + "loss": 0.066, + "num_input_tokens_seen": 55759312, + "step": 766 + }, + { + "epoch": 12.581516095534788, + "grad_norm": 0.26774415373802185, + "learning_rate": 1.44147749133692e-05, + "loss": 0.0541, + "num_input_tokens_seen": 55840400, + "step": 767 + }, + { + "epoch": 12.598130841121495, + "grad_norm": 0.28324708342552185, + "learning_rate": 1.4355517710873184e-05, + "loss": 0.0623, + "num_input_tokens_seen": 55912176, + "step": 768 + }, + { + "epoch": 12.614745586708203, + "grad_norm": 0.22127535939216614, + "learning_rate": 1.429633346446693e-05, + "loss": 0.0542, + "num_input_tokens_seen": 55989272, + "step": 769 + }, + { + "epoch": 12.631360332294912, + "grad_norm": 0.30930501222610474, + "learning_rate": 1.4237222579792618e-05, + "loss": 0.0719, + "num_input_tokens_seen": 56046720, + "step": 770 + }, + { + "epoch": 12.64797507788162, + "grad_norm": 0.27857473492622375, + "learning_rate": 1.4178185461989662e-05, + "loss": 0.0572, + "num_input_tokens_seen": 56132720, + "step": 771 + }, + { + "epoch": 12.664589823468328, + "grad_norm": 0.30680912733078003, + "learning_rate": 1.4119222515691816e-05, + "loss": 0.0648, + "num_input_tokens_seen": 56187824, + "step": 772 + }, + { + "epoch": 12.681204569055037, + "grad_norm": 0.2681683301925659, + "learning_rate": 1.4060334145024542e-05, + "loss": 0.0576, + "num_input_tokens_seen": 56259248, + "step": 773 + }, + { + "epoch": 12.697819314641745, + "grad_norm": 0.2639155983924866, + "learning_rate": 1.4001520753602121e-05, + "loss": 0.0637, + "num_input_tokens_seen": 56347584, + "step": 774 + }, + { + "epoch": 12.714434060228452, + "grad_norm": 0.25560155510902405, + "learning_rate": 1.3942782744524973e-05, + "loss": 0.0584, + "num_input_tokens_seen": 56433184, + "step": 775 + }, + { + "epoch": 12.731048805815162, + "grad_norm": 0.2899349629878998, + "learning_rate": 1.388412052037682e-05, + "loss": 0.0866, + "num_input_tokens_seen": 56491176, + "step": 776 + }, + { + "epoch": 12.74766355140187, + "grad_norm": 0.26568692922592163, + "learning_rate": 1.3825534483221974e-05, + "loss": 0.0536, + "num_input_tokens_seen": 56560952, + "step": 777 + }, + { + "epoch": 12.764278296988577, + "grad_norm": 0.27220121026039124, + "learning_rate": 1.376702503460259e-05, + "loss": 0.0638, + "num_input_tokens_seen": 56622384, + "step": 778 + }, + { + "epoch": 12.780893042575286, + "grad_norm": 0.24911001324653625, + "learning_rate": 1.3708592575535858e-05, + "loss": 0.0606, + "num_input_tokens_seen": 56709344, + "step": 779 + }, + { + "epoch": 12.797507788161994, + "grad_norm": 0.3394135534763336, + "learning_rate": 1.3650237506511331e-05, + "loss": 0.0752, + "num_input_tokens_seen": 56780664, + "step": 780 + }, + { + "epoch": 12.814122533748701, + "grad_norm": 0.26311007142066956, + "learning_rate": 1.3591960227488098e-05, + "loss": 0.0537, + "num_input_tokens_seen": 56862528, + "step": 781 + }, + { + "epoch": 12.83073727933541, + "grad_norm": 0.2802240550518036, + "learning_rate": 1.3533761137892136e-05, + "loss": 0.0575, + "num_input_tokens_seen": 56933384, + "step": 782 + }, + { + "epoch": 12.847352024922118, + "grad_norm": 0.2789711058139801, + "learning_rate": 1.3475640636613446e-05, + "loss": 0.0555, + "num_input_tokens_seen": 57023008, + "step": 783 + }, + { + "epoch": 12.863966770508826, + "grad_norm": 0.30197107791900635, + "learning_rate": 1.3417599122003464e-05, + "loss": 0.0743, + "num_input_tokens_seen": 57095032, + "step": 784 + }, + { + "epoch": 12.880581516095535, + "grad_norm": 0.2779630422592163, + "learning_rate": 1.3359636991872215e-05, + "loss": 0.1914, + "num_input_tokens_seen": 57162840, + "step": 785 + }, + { + "epoch": 12.897196261682243, + "grad_norm": 0.21620029211044312, + "learning_rate": 1.330175464348567e-05, + "loss": 0.0483, + "num_input_tokens_seen": 57247992, + "step": 786 + }, + { + "epoch": 12.91381100726895, + "grad_norm": 0.28678637742996216, + "learning_rate": 1.3243952473562942e-05, + "loss": 0.0709, + "num_input_tokens_seen": 57312216, + "step": 787 + }, + { + "epoch": 12.93042575285566, + "grad_norm": 0.3376156687736511, + "learning_rate": 1.3186230878273653e-05, + "loss": 0.0697, + "num_input_tokens_seen": 57367104, + "step": 788 + }, + { + "epoch": 12.947040498442368, + "grad_norm": 0.2537081837654114, + "learning_rate": 1.312859025323514e-05, + "loss": 0.0562, + "num_input_tokens_seen": 57437176, + "step": 789 + }, + { + "epoch": 12.963655244029075, + "grad_norm": 0.3236291706562042, + "learning_rate": 1.3071030993509788e-05, + "loss": 0.0653, + "num_input_tokens_seen": 57495304, + "step": 790 + }, + { + "epoch": 12.980269989615785, + "grad_norm": 0.2506027817726135, + "learning_rate": 1.3013553493602338e-05, + "loss": 0.0566, + "num_input_tokens_seen": 57582400, + "step": 791 + }, + { + "epoch": 12.996884735202492, + "grad_norm": 0.24630357325077057, + "learning_rate": 1.2956158147457115e-05, + "loss": 0.0577, + "num_input_tokens_seen": 57666136, + "step": 792 + }, + { + "epoch": 13.0, + "grad_norm": 0.40804192423820496, + "learning_rate": 1.2898845348455418e-05, + "loss": 0.0425, + "num_input_tokens_seen": 57682352, + "step": 793 + }, + { + "epoch": 13.016614745586708, + "grad_norm": 0.24909114837646484, + "learning_rate": 1.2841615489412739e-05, + "loss": 0.0538, + "num_input_tokens_seen": 57745080, + "step": 794 + }, + { + "epoch": 13.033229491173417, + "grad_norm": 0.26190048456192017, + "learning_rate": 1.2784468962576136e-05, + "loss": 0.0569, + "num_input_tokens_seen": 57808680, + "step": 795 + }, + { + "epoch": 13.049844236760125, + "grad_norm": 0.2653025686740875, + "learning_rate": 1.272740615962148e-05, + "loss": 0.067, + "num_input_tokens_seen": 57878016, + "step": 796 + }, + { + "epoch": 13.066458982346832, + "grad_norm": 0.2509848177433014, + "learning_rate": 1.2670427471650864e-05, + "loss": 0.0593, + "num_input_tokens_seen": 57937640, + "step": 797 + }, + { + "epoch": 13.083073727933542, + "grad_norm": 0.2545263171195984, + "learning_rate": 1.261353328918981e-05, + "loss": 0.0554, + "num_input_tokens_seen": 58009448, + "step": 798 + }, + { + "epoch": 13.09968847352025, + "grad_norm": 0.2856726348400116, + "learning_rate": 1.2556724002184696e-05, + "loss": 0.2005, + "num_input_tokens_seen": 58068936, + "step": 799 + }, + { + "epoch": 13.116303219106957, + "grad_norm": 0.2398708462715149, + "learning_rate": 1.2500000000000006e-05, + "loss": 0.0501, + "num_input_tokens_seen": 58148824, + "step": 800 + }, + { + "epoch": 13.132917964693666, + "grad_norm": 0.24167364835739136, + "learning_rate": 1.2443361671415687e-05, + "loss": 0.048, + "num_input_tokens_seen": 58220720, + "step": 801 + }, + { + "epoch": 13.149532710280374, + "grad_norm": 0.2955470681190491, + "learning_rate": 1.2386809404624521e-05, + "loss": 0.05, + "num_input_tokens_seen": 58290944, + "step": 802 + }, + { + "epoch": 13.166147455867081, + "grad_norm": 0.24610291421413422, + "learning_rate": 1.2330343587229396e-05, + "loss": 0.0517, + "num_input_tokens_seen": 58382480, + "step": 803 + }, + { + "epoch": 13.18276220145379, + "grad_norm": 0.31220707297325134, + "learning_rate": 1.2273964606240718e-05, + "loss": 0.0558, + "num_input_tokens_seen": 58443424, + "step": 804 + }, + { + "epoch": 13.199376947040498, + "grad_norm": 0.27322131395339966, + "learning_rate": 1.2217672848073702e-05, + "loss": 0.0602, + "num_input_tokens_seen": 58534424, + "step": 805 + }, + { + "epoch": 13.215991692627206, + "grad_norm": 0.29824891686439514, + "learning_rate": 1.2161468698545755e-05, + "loss": 0.0585, + "num_input_tokens_seen": 58597336, + "step": 806 + }, + { + "epoch": 13.232606438213915, + "grad_norm": 0.3074946701526642, + "learning_rate": 1.2105352542873815e-05, + "loss": 0.0642, + "num_input_tokens_seen": 58647872, + "step": 807 + }, + { + "epoch": 13.249221183800623, + "grad_norm": 0.27672338485717773, + "learning_rate": 1.2049324765671749e-05, + "loss": 0.0544, + "num_input_tokens_seen": 58731544, + "step": 808 + }, + { + "epoch": 13.26583592938733, + "grad_norm": 0.2907312512397766, + "learning_rate": 1.199338575094765e-05, + "loss": 0.0639, + "num_input_tokens_seen": 58788488, + "step": 809 + }, + { + "epoch": 13.28245067497404, + "grad_norm": 0.17112018167972565, + "learning_rate": 1.1937535882101281e-05, + "loss": 0.0392, + "num_input_tokens_seen": 58886640, + "step": 810 + }, + { + "epoch": 13.299065420560748, + "grad_norm": 0.3354332149028778, + "learning_rate": 1.1881775541921378e-05, + "loss": 0.0644, + "num_input_tokens_seen": 58943304, + "step": 811 + }, + { + "epoch": 13.315680166147455, + "grad_norm": 0.25639235973358154, + "learning_rate": 1.1826105112583061e-05, + "loss": 0.0632, + "num_input_tokens_seen": 59020992, + "step": 812 + }, + { + "epoch": 13.332294911734165, + "grad_norm": 0.21590979397296906, + "learning_rate": 1.1770524975645238e-05, + "loss": 0.0424, + "num_input_tokens_seen": 59111368, + "step": 813 + }, + { + "epoch": 13.348909657320872, + "grad_norm": 0.28309866786003113, + "learning_rate": 1.1715035512047925e-05, + "loss": 0.0574, + "num_input_tokens_seen": 59194880, + "step": 814 + }, + { + "epoch": 13.36552440290758, + "grad_norm": 0.2805883586406708, + "learning_rate": 1.1659637102109714e-05, + "loss": 0.0521, + "num_input_tokens_seen": 59250648, + "step": 815 + }, + { + "epoch": 13.38213914849429, + "grad_norm": 0.25733205676078796, + "learning_rate": 1.1604330125525079e-05, + "loss": 0.0515, + "num_input_tokens_seen": 59324888, + "step": 816 + }, + { + "epoch": 13.398753894080997, + "grad_norm": 0.28011178970336914, + "learning_rate": 1.154911496136188e-05, + "loss": 0.0585, + "num_input_tokens_seen": 59395120, + "step": 817 + }, + { + "epoch": 13.415368639667705, + "grad_norm": 0.2550284266471863, + "learning_rate": 1.1493991988058625e-05, + "loss": 0.0526, + "num_input_tokens_seen": 59475152, + "step": 818 + }, + { + "epoch": 13.431983385254414, + "grad_norm": 0.2467280924320221, + "learning_rate": 1.1438961583422037e-05, + "loss": 0.0484, + "num_input_tokens_seen": 59538328, + "step": 819 + }, + { + "epoch": 13.448598130841122, + "grad_norm": 0.31466010212898254, + "learning_rate": 1.1384024124624324e-05, + "loss": 0.0599, + "num_input_tokens_seen": 59606280, + "step": 820 + }, + { + "epoch": 13.46521287642783, + "grad_norm": 0.2625578045845032, + "learning_rate": 1.1329179988200694e-05, + "loss": 0.0585, + "num_input_tokens_seen": 59694032, + "step": 821 + }, + { + "epoch": 13.481827622014539, + "grad_norm": 0.24019013345241547, + "learning_rate": 1.1274429550046704e-05, + "loss": 0.0534, + "num_input_tokens_seen": 59764904, + "step": 822 + }, + { + "epoch": 13.498442367601246, + "grad_norm": 0.23081430792808533, + "learning_rate": 1.121977318541575e-05, + "loss": 0.0486, + "num_input_tokens_seen": 59839232, + "step": 823 + }, + { + "epoch": 13.515057113187954, + "grad_norm": 0.2902800738811493, + "learning_rate": 1.11652112689164e-05, + "loss": 0.0608, + "num_input_tokens_seen": 59907248, + "step": 824 + }, + { + "epoch": 13.531671858774663, + "grad_norm": 0.27917391061782837, + "learning_rate": 1.1110744174509952e-05, + "loss": 0.062, + "num_input_tokens_seen": 59971912, + "step": 825 + }, + { + "epoch": 13.54828660436137, + "grad_norm": 0.2244357317686081, + "learning_rate": 1.1056372275507749e-05, + "loss": 0.0495, + "num_input_tokens_seen": 60054728, + "step": 826 + }, + { + "epoch": 13.564901349948078, + "grad_norm": 0.24689778685569763, + "learning_rate": 1.1002095944568707e-05, + "loss": 0.0506, + "num_input_tokens_seen": 60133824, + "step": 827 + }, + { + "epoch": 13.581516095534788, + "grad_norm": 0.2756742537021637, + "learning_rate": 1.0947915553696742e-05, + "loss": 0.058, + "num_input_tokens_seen": 60194832, + "step": 828 + }, + { + "epoch": 13.598130841121495, + "grad_norm": 0.2537204623222351, + "learning_rate": 1.089383147423815e-05, + "loss": 0.0592, + "num_input_tokens_seen": 60280224, + "step": 829 + }, + { + "epoch": 13.614745586708203, + "grad_norm": 0.3149183392524719, + "learning_rate": 1.0839844076879185e-05, + "loss": 0.0658, + "num_input_tokens_seen": 60341144, + "step": 830 + }, + { + "epoch": 13.631360332294912, + "grad_norm": 0.26036155223846436, + "learning_rate": 1.07859537316434e-05, + "loss": 0.0504, + "num_input_tokens_seen": 60425128, + "step": 831 + }, + { + "epoch": 13.64797507788162, + "grad_norm": 0.20459908246994019, + "learning_rate": 1.0732160807889211e-05, + "loss": 0.0523, + "num_input_tokens_seen": 60520456, + "step": 832 + }, + { + "epoch": 13.664589823468328, + "grad_norm": 0.24871429800987244, + "learning_rate": 1.0678465674307273e-05, + "loss": 0.0495, + "num_input_tokens_seen": 60617640, + "step": 833 + }, + { + "epoch": 13.681204569055037, + "grad_norm": 0.30655166506767273, + "learning_rate": 1.0624868698918045e-05, + "loss": 0.0632, + "num_input_tokens_seen": 60676584, + "step": 834 + }, + { + "epoch": 13.697819314641745, + "grad_norm": 0.25585272908210754, + "learning_rate": 1.0571370249069162e-05, + "loss": 0.1454, + "num_input_tokens_seen": 60750760, + "step": 835 + }, + { + "epoch": 13.714434060228452, + "grad_norm": 0.3102145791053772, + "learning_rate": 1.0517970691433035e-05, + "loss": 0.0637, + "num_input_tokens_seen": 60823520, + "step": 836 + }, + { + "epoch": 13.731048805815162, + "grad_norm": 0.21035313606262207, + "learning_rate": 1.0464670392004235e-05, + "loss": 0.0587, + "num_input_tokens_seen": 60917712, + "step": 837 + }, + { + "epoch": 13.74766355140187, + "grad_norm": 0.2799111306667328, + "learning_rate": 1.0411469716097067e-05, + "loss": 0.0646, + "num_input_tokens_seen": 60979928, + "step": 838 + }, + { + "epoch": 13.764278296988577, + "grad_norm": 0.2967042624950409, + "learning_rate": 1.0358369028342985e-05, + "loss": 0.0609, + "num_input_tokens_seen": 61050592, + "step": 839 + }, + { + "epoch": 13.780893042575286, + "grad_norm": 0.24081780016422272, + "learning_rate": 1.0305368692688174e-05, + "loss": 0.0457, + "num_input_tokens_seen": 61126704, + "step": 840 + }, + { + "epoch": 13.797507788161994, + "grad_norm": 0.28168758749961853, + "learning_rate": 1.0252469072390994e-05, + "loss": 0.0592, + "num_input_tokens_seen": 61192016, + "step": 841 + }, + { + "epoch": 13.814122533748701, + "grad_norm": 0.3008675277233124, + "learning_rate": 1.0199670530019511e-05, + "loss": 0.0687, + "num_input_tokens_seen": 61264256, + "step": 842 + }, + { + "epoch": 13.83073727933541, + "grad_norm": 0.28137174248695374, + "learning_rate": 1.0146973427449038e-05, + "loss": 0.0559, + "num_input_tokens_seen": 61330712, + "step": 843 + }, + { + "epoch": 13.847352024922118, + "grad_norm": 0.29582953453063965, + "learning_rate": 1.0094378125859602e-05, + "loss": 0.0539, + "num_input_tokens_seen": 61403992, + "step": 844 + }, + { + "epoch": 13.863966770508826, + "grad_norm": 0.2721082866191864, + "learning_rate": 1.0041884985733524e-05, + "loss": 0.0551, + "num_input_tokens_seen": 61505072, + "step": 845 + }, + { + "epoch": 13.880581516095535, + "grad_norm": 0.29855334758758545, + "learning_rate": 9.989494366852904e-06, + "loss": 0.058, + "num_input_tokens_seen": 61574704, + "step": 846 + }, + { + "epoch": 13.897196261682243, + "grad_norm": 0.2921924591064453, + "learning_rate": 9.937206628297172e-06, + "loss": 0.0584, + "num_input_tokens_seen": 61647640, + "step": 847 + }, + { + "epoch": 13.91381100726895, + "grad_norm": 0.32202112674713135, + "learning_rate": 9.88502212844063e-06, + "loss": 0.0682, + "num_input_tokens_seen": 61708696, + "step": 848 + }, + { + "epoch": 13.93042575285566, + "grad_norm": 0.2174832671880722, + "learning_rate": 9.832941224950012e-06, + "loss": 0.0436, + "num_input_tokens_seen": 61801464, + "step": 849 + }, + { + "epoch": 13.947040498442368, + "grad_norm": 0.255197674036026, + "learning_rate": 9.780964274781984e-06, + "loss": 0.0593, + "num_input_tokens_seen": 61876376, + "step": 850 + }, + { + "epoch": 13.963655244029075, + "grad_norm": 0.25164854526519775, + "learning_rate": 9.729091634180756e-06, + "loss": 0.0562, + "num_input_tokens_seen": 61947232, + "step": 851 + }, + { + "epoch": 13.980269989615785, + "grad_norm": 0.2508520781993866, + "learning_rate": 9.677323658675594e-06, + "loss": 0.0485, + "num_input_tokens_seen": 62026520, + "step": 852 + }, + { + "epoch": 13.996884735202492, + "grad_norm": 0.24607762694358826, + "learning_rate": 9.625660703078392e-06, + "loss": 0.1437, + "num_input_tokens_seen": 62107152, + "step": 853 + }, + { + "epoch": 14.0, + "grad_norm": 0.6298788189888, + "learning_rate": 9.574103121481288e-06, + "loss": 0.054, + "num_input_tokens_seen": 62117944, + "step": 854 + }, + { + "epoch": 14.016614745586708, + "grad_norm": 0.2503069341182709, + "learning_rate": 9.522651267254149e-06, + "loss": 0.0527, + "num_input_tokens_seen": 62192136, + "step": 855 + }, + { + "epoch": 14.033229491173417, + "grad_norm": 0.22820697724819183, + "learning_rate": 9.471305493042243e-06, + "loss": 0.0479, + "num_input_tokens_seen": 62290128, + "step": 856 + }, + { + "epoch": 14.049844236760125, + "grad_norm": 0.2834872901439667, + "learning_rate": 9.420066150763748e-06, + "loss": 0.0542, + "num_input_tokens_seen": 62346640, + "step": 857 + }, + { + "epoch": 14.066458982346832, + "grad_norm": 0.21925635635852814, + "learning_rate": 9.368933591607378e-06, + "loss": 0.0412, + "num_input_tokens_seen": 62430464, + "step": 858 + }, + { + "epoch": 14.083073727933542, + "grad_norm": 0.27315613627433777, + "learning_rate": 9.317908166029962e-06, + "loss": 0.055, + "num_input_tokens_seen": 62487224, + "step": 859 + }, + { + "epoch": 14.09968847352025, + "grad_norm": 0.2407708764076233, + "learning_rate": 9.266990223754069e-06, + "loss": 0.0644, + "num_input_tokens_seen": 62570352, + "step": 860 + }, + { + "epoch": 14.116303219106957, + "grad_norm": 0.23930609226226807, + "learning_rate": 9.216180113765558e-06, + "loss": 0.0509, + "num_input_tokens_seen": 62677432, + "step": 861 + }, + { + "epoch": 14.132917964693666, + "grad_norm": 0.2706838548183441, + "learning_rate": 9.16547818431125e-06, + "loss": 0.0538, + "num_input_tokens_seen": 62750304, + "step": 862 + }, + { + "epoch": 14.149532710280374, + "grad_norm": 0.2537670433521271, + "learning_rate": 9.114884782896483e-06, + "loss": 0.0495, + "num_input_tokens_seen": 62817736, + "step": 863 + }, + { + "epoch": 14.166147455867081, + "grad_norm": 0.2521739602088928, + "learning_rate": 9.064400256282757e-06, + "loss": 0.0521, + "num_input_tokens_seen": 62896016, + "step": 864 + }, + { + "epoch": 14.18276220145379, + "grad_norm": 0.2947329580783844, + "learning_rate": 9.014024950485383e-06, + "loss": 0.0483, + "num_input_tokens_seen": 62957488, + "step": 865 + }, + { + "epoch": 14.199376947040498, + "grad_norm": 0.2857937514781952, + "learning_rate": 8.963759210771052e-06, + "loss": 0.0526, + "num_input_tokens_seen": 63013344, + "step": 866 + }, + { + "epoch": 14.215991692627206, + "grad_norm": 0.27009105682373047, + "learning_rate": 8.913603381655528e-06, + "loss": 0.0543, + "num_input_tokens_seen": 63073504, + "step": 867 + }, + { + "epoch": 14.232606438213915, + "grad_norm": 0.24146534502506256, + "learning_rate": 8.863557806901233e-06, + "loss": 0.0438, + "num_input_tokens_seen": 63143344, + "step": 868 + }, + { + "epoch": 14.249221183800623, + "grad_norm": 0.19567841291427612, + "learning_rate": 8.813622829514956e-06, + "loss": 0.041, + "num_input_tokens_seen": 63245488, + "step": 869 + }, + { + "epoch": 14.26583592938733, + "grad_norm": 0.2820407748222351, + "learning_rate": 8.763798791745411e-06, + "loss": 0.0523, + "num_input_tokens_seen": 63297208, + "step": 870 + }, + { + "epoch": 14.28245067497404, + "grad_norm": 0.28351321816444397, + "learning_rate": 8.714086035080996e-06, + "loss": 0.055, + "num_input_tokens_seen": 63357392, + "step": 871 + }, + { + "epoch": 14.299065420560748, + "grad_norm": 0.23567432165145874, + "learning_rate": 8.664484900247363e-06, + "loss": 0.0499, + "num_input_tokens_seen": 63429064, + "step": 872 + }, + { + "epoch": 14.315680166147455, + "grad_norm": 0.32714203000068665, + "learning_rate": 8.614995727205156e-06, + "loss": 0.0698, + "num_input_tokens_seen": 63478112, + "step": 873 + }, + { + "epoch": 14.332294911734165, + "grad_norm": 0.2806774377822876, + "learning_rate": 8.565618855147603e-06, + "loss": 0.0464, + "num_input_tokens_seen": 63558472, + "step": 874 + }, + { + "epoch": 14.348909657320872, + "grad_norm": 0.22814752161502838, + "learning_rate": 8.51635462249828e-06, + "loss": 0.0474, + "num_input_tokens_seen": 63641776, + "step": 875 + }, + { + "epoch": 14.36552440290758, + "grad_norm": 0.20048512518405914, + "learning_rate": 8.467203366908707e-06, + "loss": 0.0458, + "num_input_tokens_seen": 63746080, + "step": 876 + }, + { + "epoch": 14.38213914849429, + "grad_norm": 0.17419511079788208, + "learning_rate": 8.41816542525608e-06, + "loss": 0.1133, + "num_input_tokens_seen": 63846320, + "step": 877 + }, + { + "epoch": 14.398753894080997, + "grad_norm": 0.26787328720092773, + "learning_rate": 8.369241133640982e-06, + "loss": 0.0462, + "num_input_tokens_seen": 63917736, + "step": 878 + }, + { + "epoch": 14.415368639667705, + "grad_norm": 0.2565169334411621, + "learning_rate": 8.320430827385003e-06, + "loss": 0.133, + "num_input_tokens_seen": 63981576, + "step": 879 + }, + { + "epoch": 14.431983385254414, + "grad_norm": 0.31951773166656494, + "learning_rate": 8.271734841028553e-06, + "loss": 0.0649, + "num_input_tokens_seen": 64032848, + "step": 880 + }, + { + "epoch": 14.448598130841122, + "grad_norm": 0.261004775762558, + "learning_rate": 8.22315350832843e-06, + "loss": 0.0533, + "num_input_tokens_seen": 64098072, + "step": 881 + }, + { + "epoch": 14.46521287642783, + "grad_norm": 0.2179432213306427, + "learning_rate": 8.174687162255672e-06, + "loss": 0.0484, + "num_input_tokens_seen": 64186088, + "step": 882 + }, + { + "epoch": 14.481827622014539, + "grad_norm": 0.31436631083488464, + "learning_rate": 8.126336134993176e-06, + "loss": 0.05, + "num_input_tokens_seen": 64277520, + "step": 883 + }, + { + "epoch": 14.498442367601246, + "grad_norm": 0.3128837049007416, + "learning_rate": 8.078100757933485e-06, + "loss": 0.0603, + "num_input_tokens_seen": 64340728, + "step": 884 + }, + { + "epoch": 14.515057113187954, + "grad_norm": 0.2509966492652893, + "learning_rate": 8.029981361676456e-06, + "loss": 0.0497, + "num_input_tokens_seen": 64417080, + "step": 885 + }, + { + "epoch": 14.531671858774663, + "grad_norm": 0.2274176925420761, + "learning_rate": 7.981978276027055e-06, + "loss": 0.0396, + "num_input_tokens_seen": 64503560, + "step": 886 + }, + { + "epoch": 14.54828660436137, + "grad_norm": 0.23473070561885834, + "learning_rate": 7.934091829993055e-06, + "loss": 0.0505, + "num_input_tokens_seen": 64575432, + "step": 887 + }, + { + "epoch": 14.564901349948078, + "grad_norm": 0.2230038344860077, + "learning_rate": 7.886322351782783e-06, + "loss": 0.0455, + "num_input_tokens_seen": 64669216, + "step": 888 + }, + { + "epoch": 14.581516095534788, + "grad_norm": 0.27560481429100037, + "learning_rate": 7.838670168802909e-06, + "loss": 0.0633, + "num_input_tokens_seen": 64728640, + "step": 889 + }, + { + "epoch": 14.598130841121495, + "grad_norm": 0.30317774415016174, + "learning_rate": 7.791135607656147e-06, + "loss": 0.0598, + "num_input_tokens_seen": 64780032, + "step": 890 + }, + { + "epoch": 14.614745586708203, + "grad_norm": 0.2672622799873352, + "learning_rate": 7.743718994139071e-06, + "loss": 0.0536, + "num_input_tokens_seen": 64863760, + "step": 891 + }, + { + "epoch": 14.631360332294912, + "grad_norm": 0.29224610328674316, + "learning_rate": 7.696420653239833e-06, + "loss": 0.0515, + "num_input_tokens_seen": 64928808, + "step": 892 + }, + { + "epoch": 14.64797507788162, + "grad_norm": 0.2772779166698456, + "learning_rate": 7.649240909135965e-06, + "loss": 0.0579, + "num_input_tokens_seen": 64999256, + "step": 893 + }, + { + "epoch": 14.664589823468328, + "grad_norm": 0.22769714891910553, + "learning_rate": 7.602180085192143e-06, + "loss": 0.0462, + "num_input_tokens_seen": 65081544, + "step": 894 + }, + { + "epoch": 14.681204569055037, + "grad_norm": 0.28400924801826477, + "learning_rate": 7.555238503958001e-06, + "loss": 0.0545, + "num_input_tokens_seen": 65145096, + "step": 895 + }, + { + "epoch": 14.697819314641745, + "grad_norm": 0.2113635390996933, + "learning_rate": 7.508416487165862e-06, + "loss": 0.0446, + "num_input_tokens_seen": 65229504, + "step": 896 + }, + { + "epoch": 14.714434060228452, + "grad_norm": 0.2663005590438843, + "learning_rate": 7.461714355728608e-06, + "loss": 0.0511, + "num_input_tokens_seen": 65299408, + "step": 897 + }, + { + "epoch": 14.731048805815162, + "grad_norm": 0.2649020850658417, + "learning_rate": 7.415132429737407e-06, + "loss": 0.061, + "num_input_tokens_seen": 65376216, + "step": 898 + }, + { + "epoch": 14.74766355140187, + "grad_norm": 0.22136181592941284, + "learning_rate": 7.368671028459564e-06, + "loss": 0.0453, + "num_input_tokens_seen": 65457240, + "step": 899 + }, + { + "epoch": 14.764278296988577, + "grad_norm": 0.2594226896762848, + "learning_rate": 7.3223304703363135e-06, + "loss": 0.0585, + "num_input_tokens_seen": 65531064, + "step": 900 + }, + { + "epoch": 14.780893042575286, + "grad_norm": 0.24974443018436432, + "learning_rate": 7.276111072980663e-06, + "loss": 0.1328, + "num_input_tokens_seen": 65600688, + "step": 901 + }, + { + "epoch": 14.797507788161994, + "grad_norm": 0.2837437689304352, + "learning_rate": 7.230013153175188e-06, + "loss": 0.0515, + "num_input_tokens_seen": 65678640, + "step": 902 + }, + { + "epoch": 14.814122533748701, + "grad_norm": 0.25543513894081116, + "learning_rate": 7.184037026869867e-06, + "loss": 0.0496, + "num_input_tokens_seen": 65755816, + "step": 903 + }, + { + "epoch": 14.83073727933541, + "grad_norm": 0.24205391108989716, + "learning_rate": 7.138183009179922e-06, + "loss": 0.0499, + "num_input_tokens_seen": 65837696, + "step": 904 + }, + { + "epoch": 14.847352024922118, + "grad_norm": 0.325641393661499, + "learning_rate": 7.092451414383644e-06, + "loss": 0.0545, + "num_input_tokens_seen": 65901792, + "step": 905 + }, + { + "epoch": 14.863966770508826, + "grad_norm": 0.28845149278640747, + "learning_rate": 7.046842555920283e-06, + "loss": 0.0614, + "num_input_tokens_seen": 65969960, + "step": 906 + }, + { + "epoch": 14.880581516095535, + "grad_norm": 0.2403915971517563, + "learning_rate": 7.00135674638783e-06, + "loss": 0.053, + "num_input_tokens_seen": 66043752, + "step": 907 + }, + { + "epoch": 14.897196261682243, + "grad_norm": 0.35168591141700745, + "learning_rate": 6.9559942975409465e-06, + "loss": 0.0562, + "num_input_tokens_seen": 66100456, + "step": 908 + }, + { + "epoch": 14.91381100726895, + "grad_norm": 0.2577241361141205, + "learning_rate": 6.91075552028877e-06, + "loss": 0.0582, + "num_input_tokens_seen": 66177320, + "step": 909 + }, + { + "epoch": 14.93042575285566, + "grad_norm": 0.28989753127098083, + "learning_rate": 6.865640724692815e-06, + "loss": 0.0582, + "num_input_tokens_seen": 66231656, + "step": 910 + }, + { + "epoch": 14.947040498442368, + "grad_norm": 0.25177815556526184, + "learning_rate": 6.820650219964833e-06, + "loss": 0.0477, + "num_input_tokens_seen": 66315768, + "step": 911 + }, + { + "epoch": 14.963655244029075, + "grad_norm": 0.2509918808937073, + "learning_rate": 6.775784314464717e-06, + "loss": 0.0522, + "num_input_tokens_seen": 66391256, + "step": 912 + }, + { + "epoch": 14.980269989615785, + "grad_norm": 0.30490386486053467, + "learning_rate": 6.731043315698346e-06, + "loss": 0.0617, + "num_input_tokens_seen": 66469184, + "step": 913 + }, + { + "epoch": 14.996884735202492, + "grad_norm": 0.2597614526748657, + "learning_rate": 6.686427530315534e-06, + "loss": 0.0553, + "num_input_tokens_seen": 66540392, + "step": 914 + }, + { + "epoch": 15.0, + "grad_norm": 0.3879867196083069, + "learning_rate": 6.641937264107867e-06, + "loss": 0.038, + "num_input_tokens_seen": 66561792, + "step": 915 + }, + { + "epoch": 15.016614745586708, + "grad_norm": 0.26023438572883606, + "learning_rate": 6.5975728220066425e-06, + "loss": 0.0557, + "num_input_tokens_seen": 66635072, + "step": 916 + }, + { + "epoch": 15.033229491173417, + "grad_norm": 0.22681663930416107, + "learning_rate": 6.553334508080794e-06, + "loss": 0.0475, + "num_input_tokens_seen": 66708200, + "step": 917 + }, + { + "epoch": 15.049844236760125, + "grad_norm": 0.19712482392787933, + "learning_rate": 6.509222625534755e-06, + "loss": 0.0401, + "num_input_tokens_seen": 66796600, + "step": 918 + }, + { + "epoch": 15.066458982346832, + "grad_norm": 0.2641635835170746, + "learning_rate": 6.465237476706449e-06, + "loss": 0.0624, + "num_input_tokens_seen": 66878560, + "step": 919 + }, + { + "epoch": 15.083073727933542, + "grad_norm": 0.24981576204299927, + "learning_rate": 6.421379363065142e-06, + "loss": 0.0502, + "num_input_tokens_seen": 66944600, + "step": 920 + }, + { + "epoch": 15.09968847352025, + "grad_norm": 0.19411137700080872, + "learning_rate": 6.377648585209456e-06, + "loss": 0.0384, + "num_input_tokens_seen": 67026488, + "step": 921 + }, + { + "epoch": 15.116303219106957, + "grad_norm": 0.2732497751712799, + "learning_rate": 6.334045442865219e-06, + "loss": 0.0525, + "num_input_tokens_seen": 67095344, + "step": 922 + }, + { + "epoch": 15.132917964693666, + "grad_norm": 0.26258111000061035, + "learning_rate": 6.290570234883505e-06, + "loss": 0.0567, + "num_input_tokens_seen": 67171352, + "step": 923 + }, + { + "epoch": 15.149532710280374, + "grad_norm": 0.3083919286727905, + "learning_rate": 6.247223259238511e-06, + "loss": 0.0611, + "num_input_tokens_seen": 67232288, + "step": 924 + }, + { + "epoch": 15.166147455867081, + "grad_norm": 0.3361482322216034, + "learning_rate": 6.204004813025568e-06, + "loss": 0.0583, + "num_input_tokens_seen": 67287128, + "step": 925 + }, + { + "epoch": 15.18276220145379, + "grad_norm": 0.21773198246955872, + "learning_rate": 6.160915192459058e-06, + "loss": 0.0419, + "num_input_tokens_seen": 67362248, + "step": 926 + }, + { + "epoch": 15.199376947040498, + "grad_norm": 0.27043187618255615, + "learning_rate": 6.117954692870412e-06, + "loss": 0.0484, + "num_input_tokens_seen": 67427448, + "step": 927 + }, + { + "epoch": 15.215991692627206, + "grad_norm": 0.22789040207862854, + "learning_rate": 6.075123608706093e-06, + "loss": 0.0477, + "num_input_tokens_seen": 67506648, + "step": 928 + }, + { + "epoch": 15.232606438213915, + "grad_norm": 0.21521471440792084, + "learning_rate": 6.032422233525545e-06, + "loss": 0.0411, + "num_input_tokens_seen": 67589024, + "step": 929 + }, + { + "epoch": 15.249221183800623, + "grad_norm": 0.2916419804096222, + "learning_rate": 5.989850859999227e-06, + "loss": 0.0511, + "num_input_tokens_seen": 67649592, + "step": 930 + }, + { + "epoch": 15.26583592938733, + "grad_norm": 0.2598723769187927, + "learning_rate": 5.947409779906554e-06, + "loss": 0.048, + "num_input_tokens_seen": 67717080, + "step": 931 + }, + { + "epoch": 15.28245067497404, + "grad_norm": 0.22704635560512543, + "learning_rate": 5.905099284133952e-06, + "loss": 0.0511, + "num_input_tokens_seen": 67791560, + "step": 932 + }, + { + "epoch": 15.299065420560748, + "grad_norm": 0.23701129853725433, + "learning_rate": 5.8629196626728e-06, + "loss": 0.0452, + "num_input_tokens_seen": 67852376, + "step": 933 + }, + { + "epoch": 15.315680166147455, + "grad_norm": 0.18171566724777222, + "learning_rate": 5.820871204617515e-06, + "loss": 0.0384, + "num_input_tokens_seen": 67946192, + "step": 934 + }, + { + "epoch": 15.332294911734165, + "grad_norm": 0.21201346814632416, + "learning_rate": 5.778954198163514e-06, + "loss": 0.0404, + "num_input_tokens_seen": 68025248, + "step": 935 + }, + { + "epoch": 15.348909657320872, + "grad_norm": 0.24997848272323608, + "learning_rate": 5.737168930605272e-06, + "loss": 0.0517, + "num_input_tokens_seen": 68109776, + "step": 936 + }, + { + "epoch": 15.36552440290758, + "grad_norm": 0.22962713241577148, + "learning_rate": 5.6955156883343265e-06, + "loss": 0.0481, + "num_input_tokens_seen": 68187496, + "step": 937 + }, + { + "epoch": 15.38213914849429, + "grad_norm": 0.21411846578121185, + "learning_rate": 5.653994756837347e-06, + "loss": 0.0429, + "num_input_tokens_seen": 68268560, + "step": 938 + }, + { + "epoch": 15.398753894080997, + "grad_norm": 0.25815847516059875, + "learning_rate": 5.612606420694141e-06, + "loss": 0.058, + "num_input_tokens_seen": 68331208, + "step": 939 + }, + { + "epoch": 15.415368639667705, + "grad_norm": 0.25215816497802734, + "learning_rate": 5.571350963575728e-06, + "loss": 0.0561, + "num_input_tokens_seen": 68401712, + "step": 940 + }, + { + "epoch": 15.431983385254414, + "grad_norm": 0.19544276595115662, + "learning_rate": 5.530228668242402e-06, + "loss": 0.0411, + "num_input_tokens_seen": 68487824, + "step": 941 + }, + { + "epoch": 15.448598130841122, + "grad_norm": 0.2418648898601532, + "learning_rate": 5.489239816541755e-06, + "loss": 0.05, + "num_input_tokens_seen": 68558256, + "step": 942 + }, + { + "epoch": 15.46521287642783, + "grad_norm": 0.27279072999954224, + "learning_rate": 5.4483846894068046e-06, + "loss": 0.0493, + "num_input_tokens_seen": 68652768, + "step": 943 + }, + { + "epoch": 15.481827622014539, + "grad_norm": 0.21187296509742737, + "learning_rate": 5.4076635668540075e-06, + "loss": 0.0433, + "num_input_tokens_seen": 68745352, + "step": 944 + }, + { + "epoch": 15.498442367601246, + "grad_norm": 0.20128142833709717, + "learning_rate": 5.367076727981382e-06, + "loss": 0.0411, + "num_input_tokens_seen": 68830520, + "step": 945 + }, + { + "epoch": 15.515057113187954, + "grad_norm": 0.30053532123565674, + "learning_rate": 5.326624450966569e-06, + "loss": 0.2515, + "num_input_tokens_seen": 68894064, + "step": 946 + }, + { + "epoch": 15.531671858774663, + "grad_norm": 0.23420806229114532, + "learning_rate": 5.286307013064956e-06, + "loss": 0.0441, + "num_input_tokens_seen": 68963888, + "step": 947 + }, + { + "epoch": 15.54828660436137, + "grad_norm": 0.2615622282028198, + "learning_rate": 5.24612469060774e-06, + "loss": 0.048, + "num_input_tokens_seen": 69025032, + "step": 948 + }, + { + "epoch": 15.564901349948078, + "grad_norm": 0.2581160366535187, + "learning_rate": 5.206077759000069e-06, + "loss": 0.0477, + "num_input_tokens_seen": 69087736, + "step": 949 + }, + { + "epoch": 15.581516095534788, + "grad_norm": 0.26595795154571533, + "learning_rate": 5.166166492719124e-06, + "loss": 0.0533, + "num_input_tokens_seen": 69168720, + "step": 950 + }, + { + "epoch": 15.598130841121495, + "grad_norm": 0.30843034386634827, + "learning_rate": 5.12639116531225e-06, + "loss": 0.0589, + "num_input_tokens_seen": 69234496, + "step": 951 + }, + { + "epoch": 15.614745586708203, + "grad_norm": 0.2536829113960266, + "learning_rate": 5.086752049395094e-06, + "loss": 0.0476, + "num_input_tokens_seen": 69303200, + "step": 952 + }, + { + "epoch": 15.631360332294912, + "grad_norm": 0.2525365650653839, + "learning_rate": 5.0472494166497135e-06, + "loss": 0.0486, + "num_input_tokens_seen": 69387968, + "step": 953 + }, + { + "epoch": 15.64797507788162, + "grad_norm": 0.21798045933246613, + "learning_rate": 5.007883537822736e-06, + "loss": 0.0428, + "num_input_tokens_seen": 69468496, + "step": 954 + }, + { + "epoch": 15.664589823468328, + "grad_norm": 0.25125470757484436, + "learning_rate": 4.9686546827234865e-06, + "loss": 0.0485, + "num_input_tokens_seen": 69545408, + "step": 955 + }, + { + "epoch": 15.681204569055037, + "grad_norm": 0.24528349936008453, + "learning_rate": 4.929563120222141e-06, + "loss": 0.0468, + "num_input_tokens_seen": 69604664, + "step": 956 + }, + { + "epoch": 15.697819314641745, + "grad_norm": 0.3065119683742523, + "learning_rate": 4.890609118247888e-06, + "loss": 0.061, + "num_input_tokens_seen": 69663456, + "step": 957 + }, + { + "epoch": 15.714434060228452, + "grad_norm": 0.32543885707855225, + "learning_rate": 4.851792943787109e-06, + "loss": 0.0588, + "num_input_tokens_seen": 69726392, + "step": 958 + }, + { + "epoch": 15.731048805815162, + "grad_norm": 0.2984660565853119, + "learning_rate": 4.813114862881502e-06, + "loss": 0.0527, + "num_input_tokens_seen": 69774792, + "step": 959 + }, + { + "epoch": 15.74766355140187, + "grad_norm": 0.21434898674488068, + "learning_rate": 4.7745751406263165e-06, + "loss": 0.0377, + "num_input_tokens_seen": 69864152, + "step": 960 + }, + { + "epoch": 15.764278296988577, + "grad_norm": 0.2650623321533203, + "learning_rate": 4.7361740411684865e-06, + "loss": 0.0498, + "num_input_tokens_seen": 69932112, + "step": 961 + }, + { + "epoch": 15.780893042575286, + "grad_norm": 0.24544315040111542, + "learning_rate": 4.6979118277048426e-06, + "loss": 0.0467, + "num_input_tokens_seen": 69997512, + "step": 962 + }, + { + "epoch": 15.797507788161994, + "grad_norm": 0.2633085250854492, + "learning_rate": 4.659788762480327e-06, + "loss": 0.047, + "num_input_tokens_seen": 70063808, + "step": 963 + }, + { + "epoch": 15.814122533748701, + "grad_norm": 0.24816827476024628, + "learning_rate": 4.621805106786142e-06, + "loss": 0.0486, + "num_input_tokens_seen": 70140760, + "step": 964 + }, + { + "epoch": 15.83073727933541, + "grad_norm": 0.24450010061264038, + "learning_rate": 4.583961120958027e-06, + "loss": 0.0468, + "num_input_tokens_seen": 70218960, + "step": 965 + }, + { + "epoch": 15.847352024922118, + "grad_norm": 0.22891941666603088, + "learning_rate": 4.54625706437441e-06, + "loss": 0.0467, + "num_input_tokens_seen": 70293824, + "step": 966 + }, + { + "epoch": 15.863966770508826, + "grad_norm": 0.2394842803478241, + "learning_rate": 4.508693195454694e-06, + "loss": 0.0515, + "num_input_tokens_seen": 70384464, + "step": 967 + }, + { + "epoch": 15.880581516095535, + "grad_norm": 0.23781423270702362, + "learning_rate": 4.4712697716574e-06, + "loss": 0.042, + "num_input_tokens_seen": 70455440, + "step": 968 + }, + { + "epoch": 15.897196261682243, + "grad_norm": 0.2540624737739563, + "learning_rate": 4.433987049478508e-06, + "loss": 0.0487, + "num_input_tokens_seen": 70513984, + "step": 969 + }, + { + "epoch": 15.91381100726895, + "grad_norm": 0.2405335158109665, + "learning_rate": 4.396845284449608e-06, + "loss": 0.0435, + "num_input_tokens_seen": 70588896, + "step": 970 + }, + { + "epoch": 15.93042575285566, + "grad_norm": 0.2510586977005005, + "learning_rate": 4.359844731136209e-06, + "loss": 0.1434, + "num_input_tokens_seen": 70673296, + "step": 971 + }, + { + "epoch": 15.947040498442368, + "grad_norm": 0.2762887179851532, + "learning_rate": 4.322985643135952e-06, + "loss": 0.0477, + "num_input_tokens_seen": 70742760, + "step": 972 + }, + { + "epoch": 15.963655244029075, + "grad_norm": 0.2639402449131012, + "learning_rate": 4.286268273076915e-06, + "loss": 0.0543, + "num_input_tokens_seen": 70806728, + "step": 973 + }, + { + "epoch": 15.980269989615785, + "grad_norm": 0.24937745928764343, + "learning_rate": 4.2496928726158155e-06, + "loss": 0.0496, + "num_input_tokens_seen": 70909320, + "step": 974 + }, + { + "epoch": 15.996884735202492, + "grad_norm": 0.2835952937602997, + "learning_rate": 4.213259692436367e-06, + "loss": 0.0566, + "num_input_tokens_seen": 70989640, + "step": 975 + }, + { + "epoch": 16.0, + "grad_norm": 0.7339544296264648, + "learning_rate": 4.176968982247514e-06, + "loss": 0.049, + "num_input_tokens_seen": 70997848, + "step": 976 + }, + { + "epoch": 16.016614745586708, + "grad_norm": 0.28989917039871216, + "learning_rate": 4.140820990781705e-06, + "loss": 0.0534, + "num_input_tokens_seen": 71051960, + "step": 977 + }, + { + "epoch": 16.033229491173415, + "grad_norm": 0.22893474996089935, + "learning_rate": 4.104815965793249e-06, + "loss": 0.0468, + "num_input_tokens_seen": 71132536, + "step": 978 + }, + { + "epoch": 16.049844236760123, + "grad_norm": 0.23161743581295013, + "learning_rate": 4.068954154056528e-06, + "loss": 0.0448, + "num_input_tokens_seen": 71202432, + "step": 979 + }, + { + "epoch": 16.066458982346834, + "grad_norm": 0.2329898178577423, + "learning_rate": 4.0332358013644016e-06, + "loss": 0.0415, + "num_input_tokens_seen": 71287576, + "step": 980 + }, + { + "epoch": 16.08307372793354, + "grad_norm": 0.2544691860675812, + "learning_rate": 3.9976611525264525e-06, + "loss": 0.1111, + "num_input_tokens_seen": 71367248, + "step": 981 + }, + { + "epoch": 16.09968847352025, + "grad_norm": 0.20354589819908142, + "learning_rate": 3.962230451367349e-06, + "loss": 0.0432, + "num_input_tokens_seen": 71447928, + "step": 982 + }, + { + "epoch": 16.116303219106957, + "grad_norm": 0.28865981101989746, + "learning_rate": 3.926943940725137e-06, + "loss": 0.0499, + "num_input_tokens_seen": 71501472, + "step": 983 + }, + { + "epoch": 16.132917964693664, + "grad_norm": 0.30155521631240845, + "learning_rate": 3.891801862449629e-06, + "loss": 0.0511, + "num_input_tokens_seen": 71568048, + "step": 984 + }, + { + "epoch": 16.149532710280372, + "grad_norm": 0.21292831003665924, + "learning_rate": 3.85680445740067e-06, + "loss": 0.0476, + "num_input_tokens_seen": 71641112, + "step": 985 + }, + { + "epoch": 16.166147455867083, + "grad_norm": 0.22878766059875488, + "learning_rate": 3.821951965446577e-06, + "loss": 0.0456, + "num_input_tokens_seen": 71716800, + "step": 986 + }, + { + "epoch": 16.18276220145379, + "grad_norm": 0.18548932671546936, + "learning_rate": 3.7872446254624104e-06, + "loss": 0.0341, + "num_input_tokens_seen": 71799616, + "step": 987 + }, + { + "epoch": 16.1993769470405, + "grad_norm": 0.26984840631484985, + "learning_rate": 3.752682675328406e-06, + "loss": 0.0496, + "num_input_tokens_seen": 71863392, + "step": 988 + }, + { + "epoch": 16.215991692627206, + "grad_norm": 0.28214019536972046, + "learning_rate": 3.718266351928287e-06, + "loss": 0.0458, + "num_input_tokens_seen": 71927704, + "step": 989 + }, + { + "epoch": 16.232606438213914, + "grad_norm": 0.2738666534423828, + "learning_rate": 3.6839958911476957e-06, + "loss": 0.0473, + "num_input_tokens_seen": 71984208, + "step": 990 + }, + { + "epoch": 16.24922118380062, + "grad_norm": 0.22236013412475586, + "learning_rate": 3.6498715278725254e-06, + "loss": 0.0378, + "num_input_tokens_seen": 72053448, + "step": 991 + }, + { + "epoch": 16.265835929387332, + "grad_norm": 0.2531529664993286, + "learning_rate": 3.6158934959873353e-06, + "loss": 0.0422, + "num_input_tokens_seen": 72123464, + "step": 992 + }, + { + "epoch": 16.28245067497404, + "grad_norm": 0.2313385158777237, + "learning_rate": 3.5820620283737616e-06, + "loss": 0.1494, + "num_input_tokens_seen": 72197920, + "step": 993 + }, + { + "epoch": 16.299065420560748, + "grad_norm": 0.2416224330663681, + "learning_rate": 3.5483773569088856e-06, + "loss": 0.044, + "num_input_tokens_seen": 72295824, + "step": 994 + }, + { + "epoch": 16.315680166147455, + "grad_norm": 0.2035658210515976, + "learning_rate": 3.5148397124636826e-06, + "loss": 0.0428, + "num_input_tokens_seen": 72389296, + "step": 995 + }, + { + "epoch": 16.332294911734163, + "grad_norm": 0.29375237226486206, + "learning_rate": 3.4814493249014116e-06, + "loss": 0.0546, + "num_input_tokens_seen": 72451080, + "step": 996 + }, + { + "epoch": 16.34890965732087, + "grad_norm": 0.2521064579486847, + "learning_rate": 3.4482064230760474e-06, + "loss": 0.0457, + "num_input_tokens_seen": 72523168, + "step": 997 + }, + { + "epoch": 16.36552440290758, + "grad_norm": 0.32425233721733093, + "learning_rate": 3.415111234830709e-06, + "loss": 0.0573, + "num_input_tokens_seen": 72585384, + "step": 998 + }, + { + "epoch": 16.38213914849429, + "grad_norm": 0.27826961874961853, + "learning_rate": 3.382163986996126e-06, + "loss": 0.0484, + "num_input_tokens_seen": 72645544, + "step": 999 + }, + { + "epoch": 16.398753894080997, + "grad_norm": 0.223323792219162, + "learning_rate": 3.3493649053890326e-06, + "loss": 0.0412, + "num_input_tokens_seen": 72722408, + "step": 1000 + }, + { + "epoch": 16.415368639667705, + "grad_norm": 0.28299546241760254, + "learning_rate": 3.3167142148106762e-06, + "loss": 0.0498, + "num_input_tokens_seen": 72798192, + "step": 1001 + }, + { + "epoch": 16.431983385254412, + "grad_norm": 0.26504456996917725, + "learning_rate": 3.284212139045223e-06, + "loss": 0.0492, + "num_input_tokens_seen": 72862440, + "step": 1002 + }, + { + "epoch": 16.44859813084112, + "grad_norm": 0.2836116552352905, + "learning_rate": 3.2518589008582597e-06, + "loss": 0.0507, + "num_input_tokens_seen": 72911448, + "step": 1003 + }, + { + "epoch": 16.46521287642783, + "grad_norm": 0.25678548216819763, + "learning_rate": 3.219654721995266e-06, + "loss": 0.0501, + "num_input_tokens_seen": 72980368, + "step": 1004 + }, + { + "epoch": 16.48182762201454, + "grad_norm": 0.29687047004699707, + "learning_rate": 3.187599823180071e-06, + "loss": 0.0548, + "num_input_tokens_seen": 73044888, + "step": 1005 + }, + { + "epoch": 16.498442367601246, + "grad_norm": 0.23092421889305115, + "learning_rate": 3.15569442411337e-06, + "loss": 0.0427, + "num_input_tokens_seen": 73119968, + "step": 1006 + }, + { + "epoch": 16.515057113187954, + "grad_norm": 0.2751709222793579, + "learning_rate": 3.1239387434711916e-06, + "loss": 0.0475, + "num_input_tokens_seen": 73184104, + "step": 1007 + }, + { + "epoch": 16.53167185877466, + "grad_norm": 0.29244229197502136, + "learning_rate": 3.092332998903416e-06, + "loss": 0.0517, + "num_input_tokens_seen": 73239464, + "step": 1008 + }, + { + "epoch": 16.54828660436137, + "grad_norm": 0.25361648201942444, + "learning_rate": 3.06087740703227e-06, + "loss": 0.0548, + "num_input_tokens_seen": 73312808, + "step": 1009 + }, + { + "epoch": 16.56490134994808, + "grad_norm": 0.23709113895893097, + "learning_rate": 3.029572183450868e-06, + "loss": 0.1552, + "num_input_tokens_seen": 73381528, + "step": 1010 + }, + { + "epoch": 16.581516095534788, + "grad_norm": 0.2809208035469055, + "learning_rate": 2.9984175427217016e-06, + "loss": 0.0461, + "num_input_tokens_seen": 73455200, + "step": 1011 + }, + { + "epoch": 16.598130841121495, + "grad_norm": 0.28413939476013184, + "learning_rate": 2.967413698375196e-06, + "loss": 0.0539, + "num_input_tokens_seen": 73527448, + "step": 1012 + }, + { + "epoch": 16.614745586708203, + "grad_norm": 0.23016762733459473, + "learning_rate": 2.936560862908225e-06, + "loss": 0.0475, + "num_input_tokens_seen": 73615192, + "step": 1013 + }, + { + "epoch": 16.63136033229491, + "grad_norm": 0.2616693675518036, + "learning_rate": 2.9058592477826636e-06, + "loss": 0.0471, + "num_input_tokens_seen": 73692976, + "step": 1014 + }, + { + "epoch": 16.64797507788162, + "grad_norm": 0.25557368993759155, + "learning_rate": 2.875309063423956e-06, + "loss": 0.0443, + "num_input_tokens_seen": 73753968, + "step": 1015 + }, + { + "epoch": 16.66458982346833, + "grad_norm": 0.2636294960975647, + "learning_rate": 2.8449105192196316e-06, + "loss": 0.0624, + "num_input_tokens_seen": 73816936, + "step": 1016 + }, + { + "epoch": 16.681204569055037, + "grad_norm": 0.25584691762924194, + "learning_rate": 2.8146638235179213e-06, + "loss": 0.0516, + "num_input_tokens_seen": 73895184, + "step": 1017 + }, + { + "epoch": 16.697819314641745, + "grad_norm": 0.23321789503097534, + "learning_rate": 2.784569183626276e-06, + "loss": 0.0471, + "num_input_tokens_seen": 74005280, + "step": 1018 + }, + { + "epoch": 16.714434060228452, + "grad_norm": 0.2339867651462555, + "learning_rate": 2.7546268058100096e-06, + "loss": 0.046, + "num_input_tokens_seen": 74078960, + "step": 1019 + }, + { + "epoch": 16.73104880581516, + "grad_norm": 0.24815727770328522, + "learning_rate": 2.7248368952908053e-06, + "loss": 0.0442, + "num_input_tokens_seen": 74143592, + "step": 1020 + } + ], + "logging_steps": 1.0, + "max_steps": 1200, + "num_input_tokens_seen": 74143592, + "num_train_epochs": 20, + "save_steps": 60, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.254877437100818e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}