diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,33440 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 4771, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00020959966464053657, + "grad_norm": 0.22152887284755707, + "learning_rate": 0.0, + "loss": 0.1548, + "step": 1 + }, + { + "epoch": 0.00041919932928107315, + "grad_norm": 0.1861487776041031, + "learning_rate": 6.944444444444444e-08, + "loss": 0.1418, + "step": 2 + }, + { + "epoch": 0.0006287989939216097, + "grad_norm": 0.20890986919403076, + "learning_rate": 1.3888888888888888e-07, + "loss": 0.1602, + "step": 3 + }, + { + "epoch": 0.0008383986585621463, + "grad_norm": 0.20199677348136902, + "learning_rate": 2.0833333333333333e-07, + "loss": 0.1467, + "step": 4 + }, + { + "epoch": 0.0010479983232026828, + "grad_norm": 0.18860988318920135, + "learning_rate": 2.7777777777777776e-07, + "loss": 0.1357, + "step": 5 + }, + { + "epoch": 0.0012575979878432194, + "grad_norm": 0.1798183023929596, + "learning_rate": 3.472222222222223e-07, + "loss": 0.1343, + "step": 6 + }, + { + "epoch": 0.001467197652483756, + "grad_norm": 0.12569160759449005, + "learning_rate": 4.1666666666666667e-07, + "loss": 0.149, + "step": 7 + }, + { + "epoch": 0.0016767973171242926, + "grad_norm": 0.12502586841583252, + "learning_rate": 4.861111111111112e-07, + "loss": 0.1465, + "step": 8 + }, + { + "epoch": 0.0018863969817648292, + "grad_norm": 0.09551655501127243, + "learning_rate": 5.555555555555555e-07, + "loss": 0.1407, + "step": 9 + }, + { + "epoch": 0.0020959966464053656, + "grad_norm": 0.27150362730026245, + "learning_rate": 6.25e-07, + "loss": 0.141, + "step": 10 + }, + { + "epoch": 0.002305596311045902, + "grad_norm": 0.2170093059539795, + "learning_rate": 6.944444444444446e-07, + "loss": 0.1529, + "step": 11 + }, + { + "epoch": 0.0025151959756864388, + "grad_norm": 0.1994446963071823, + "learning_rate": 7.63888888888889e-07, + "loss": 0.1387, + "step": 12 + }, + { + "epoch": 0.0027247956403269754, + "grad_norm": 0.16523687541484833, + "learning_rate": 8.333333333333333e-07, + "loss": 0.1401, + "step": 13 + }, + { + "epoch": 0.002934395304967512, + "grad_norm": 0.22484736144542694, + "learning_rate": 9.027777777777779e-07, + "loss": 0.1438, + "step": 14 + }, + { + "epoch": 0.0031439949696080486, + "grad_norm": 0.14437100291252136, + "learning_rate": 9.722222222222224e-07, + "loss": 0.1485, + "step": 15 + }, + { + "epoch": 0.003353594634248585, + "grad_norm": 0.12035862356424332, + "learning_rate": 1.0416666666666667e-06, + "loss": 0.1463, + "step": 16 + }, + { + "epoch": 0.0035631942988891218, + "grad_norm": 0.19523362815380096, + "learning_rate": 1.111111111111111e-06, + "loss": 0.1401, + "step": 17 + }, + { + "epoch": 0.0037727939635296584, + "grad_norm": 0.2469567507505417, + "learning_rate": 1.1805555555555556e-06, + "loss": 0.1442, + "step": 18 + }, + { + "epoch": 0.0039823936281701946, + "grad_norm": 0.21258766949176788, + "learning_rate": 1.25e-06, + "loss": 0.1348, + "step": 19 + }, + { + "epoch": 0.004191993292810731, + "grad_norm": 0.13423168659210205, + "learning_rate": 1.3194444444444446e-06, + "loss": 0.1419, + "step": 20 + }, + { + "epoch": 0.004401592957451268, + "grad_norm": 0.15541696548461914, + "learning_rate": 1.3888888888888892e-06, + "loss": 0.1387, + "step": 21 + }, + { + "epoch": 0.004611192622091804, + "grad_norm": 0.1762888878583908, + "learning_rate": 1.4583333333333335e-06, + "loss": 0.1372, + "step": 22 + }, + { + "epoch": 0.004820792286732341, + "grad_norm": 0.1527898907661438, + "learning_rate": 1.527777777777778e-06, + "loss": 0.1325, + "step": 23 + }, + { + "epoch": 0.0050303919513728776, + "grad_norm": 0.11752472072839737, + "learning_rate": 1.5972222222222221e-06, + "loss": 0.1354, + "step": 24 + }, + { + "epoch": 0.005239991616013414, + "grad_norm": 0.1533581018447876, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.1342, + "step": 25 + }, + { + "epoch": 0.005449591280653951, + "grad_norm": 0.14854849874973297, + "learning_rate": 1.7361111111111112e-06, + "loss": 0.1397, + "step": 26 + }, + { + "epoch": 0.005659190945294487, + "grad_norm": 0.1156805008649826, + "learning_rate": 1.8055555555555557e-06, + "loss": 0.1357, + "step": 27 + }, + { + "epoch": 0.005868790609935024, + "grad_norm": 0.11853114515542984, + "learning_rate": 1.8750000000000003e-06, + "loss": 0.1258, + "step": 28 + }, + { + "epoch": 0.0060783902745755606, + "grad_norm": 0.14961375296115875, + "learning_rate": 1.944444444444445e-06, + "loss": 0.1292, + "step": 29 + }, + { + "epoch": 0.006287989939216097, + "grad_norm": 0.10973469167947769, + "learning_rate": 2.0138888888888893e-06, + "loss": 0.1403, + "step": 30 + }, + { + "epoch": 0.006497589603856634, + "grad_norm": 0.10710575431585312, + "learning_rate": 2.0833333333333334e-06, + "loss": 0.1435, + "step": 31 + }, + { + "epoch": 0.00670718926849717, + "grad_norm": 0.12166962772607803, + "learning_rate": 2.152777777777778e-06, + "loss": 0.1351, + "step": 32 + }, + { + "epoch": 0.006916788933137707, + "grad_norm": 0.10747282952070236, + "learning_rate": 2.222222222222222e-06, + "loss": 0.1308, + "step": 33 + }, + { + "epoch": 0.0071263885977782436, + "grad_norm": 0.10978581756353378, + "learning_rate": 2.2916666666666666e-06, + "loss": 0.1303, + "step": 34 + }, + { + "epoch": 0.00733598826241878, + "grad_norm": 0.1297036111354828, + "learning_rate": 2.361111111111111e-06, + "loss": 0.1341, + "step": 35 + }, + { + "epoch": 0.007545587927059317, + "grad_norm": 0.10622533410787582, + "learning_rate": 2.4305555555555557e-06, + "loss": 0.1305, + "step": 36 + }, + { + "epoch": 0.007755187591699853, + "grad_norm": 0.12965598702430725, + "learning_rate": 2.5e-06, + "loss": 0.1179, + "step": 37 + }, + { + "epoch": 0.007964787256340389, + "grad_norm": 0.08929996192455292, + "learning_rate": 2.5694444444444443e-06, + "loss": 0.1246, + "step": 38 + }, + { + "epoch": 0.008174386920980926, + "grad_norm": 0.1213381290435791, + "learning_rate": 2.6388888888888893e-06, + "loss": 0.135, + "step": 39 + }, + { + "epoch": 0.008383986585621462, + "grad_norm": 0.13696981966495514, + "learning_rate": 2.7083333333333334e-06, + "loss": 0.1359, + "step": 40 + }, + { + "epoch": 0.008593586250261999, + "grad_norm": 0.10106691718101501, + "learning_rate": 2.7777777777777783e-06, + "loss": 0.1308, + "step": 41 + }, + { + "epoch": 0.008803185914902536, + "grad_norm": 0.16903014481067657, + "learning_rate": 2.8472222222222224e-06, + "loss": 0.1243, + "step": 42 + }, + { + "epoch": 0.009012785579543072, + "grad_norm": 0.26333266496658325, + "learning_rate": 2.916666666666667e-06, + "loss": 0.1269, + "step": 43 + }, + { + "epoch": 0.009222385244183609, + "grad_norm": 0.5060687065124512, + "learning_rate": 2.986111111111111e-06, + "loss": 0.1267, + "step": 44 + }, + { + "epoch": 0.009431984908824145, + "grad_norm": 0.37180081009864807, + "learning_rate": 3.055555555555556e-06, + "loss": 0.1359, + "step": 45 + }, + { + "epoch": 0.009641584573464682, + "grad_norm": 0.37329721450805664, + "learning_rate": 3.125e-06, + "loss": 0.1266, + "step": 46 + }, + { + "epoch": 0.009851184238105219, + "grad_norm": 0.9444931745529175, + "learning_rate": 3.1944444444444443e-06, + "loss": 0.1265, + "step": 47 + }, + { + "epoch": 0.010060783902745755, + "grad_norm": 0.3645467460155487, + "learning_rate": 3.2638888888888892e-06, + "loss": 0.1252, + "step": 48 + }, + { + "epoch": 0.010270383567386292, + "grad_norm": 0.35541367530822754, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.1315, + "step": 49 + }, + { + "epoch": 0.010479983232026828, + "grad_norm": 0.26033562421798706, + "learning_rate": 3.4027777777777783e-06, + "loss": 0.1319, + "step": 50 + }, + { + "epoch": 0.010689582896667365, + "grad_norm": 0.43073058128356934, + "learning_rate": 3.4722222222222224e-06, + "loss": 0.1278, + "step": 51 + }, + { + "epoch": 0.010899182561307902, + "grad_norm": 0.3041709363460541, + "learning_rate": 3.5416666666666673e-06, + "loss": 0.1283, + "step": 52 + }, + { + "epoch": 0.011108782225948438, + "grad_norm": 0.3577912747859955, + "learning_rate": 3.6111111111111115e-06, + "loss": 0.1314, + "step": 53 + }, + { + "epoch": 0.011318381890588975, + "grad_norm": 0.25406020879745483, + "learning_rate": 3.680555555555556e-06, + "loss": 0.1303, + "step": 54 + }, + { + "epoch": 0.011527981555229511, + "grad_norm": 0.32610198855400085, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.1254, + "step": 55 + }, + { + "epoch": 0.011737581219870048, + "grad_norm": 0.2049138844013214, + "learning_rate": 3.819444444444444e-06, + "loss": 0.1319, + "step": 56 + }, + { + "epoch": 0.011947180884510585, + "grad_norm": 0.29326915740966797, + "learning_rate": 3.88888888888889e-06, + "loss": 0.1192, + "step": 57 + }, + { + "epoch": 0.012156780549151121, + "grad_norm": 0.2198316603899002, + "learning_rate": 3.958333333333333e-06, + "loss": 0.1177, + "step": 58 + }, + { + "epoch": 0.012366380213791658, + "grad_norm": 0.20488092303276062, + "learning_rate": 4.027777777777779e-06, + "loss": 0.1266, + "step": 59 + }, + { + "epoch": 0.012575979878432194, + "grad_norm": 0.22538422048091888, + "learning_rate": 4.097222222222222e-06, + "loss": 0.1168, + "step": 60 + }, + { + "epoch": 0.012785579543072731, + "grad_norm": 0.2462712526321411, + "learning_rate": 4.166666666666667e-06, + "loss": 0.1159, + "step": 61 + }, + { + "epoch": 0.012995179207713268, + "grad_norm": 0.17663376033306122, + "learning_rate": 4.236111111111111e-06, + "loss": 0.1273, + "step": 62 + }, + { + "epoch": 0.013204778872353804, + "grad_norm": 0.16204069554805756, + "learning_rate": 4.305555555555556e-06, + "loss": 0.1134, + "step": 63 + }, + { + "epoch": 0.01341437853699434, + "grad_norm": 0.2351599931716919, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.1272, + "step": 64 + }, + { + "epoch": 0.013623978201634877, + "grad_norm": 0.22874589264392853, + "learning_rate": 4.444444444444444e-06, + "loss": 0.1286, + "step": 65 + }, + { + "epoch": 0.013833577866275414, + "grad_norm": 0.25400885939598083, + "learning_rate": 4.5138888888888895e-06, + "loss": 0.1205, + "step": 66 + }, + { + "epoch": 0.01404317753091595, + "grad_norm": 0.07518032938241959, + "learning_rate": 4.583333333333333e-06, + "loss": 0.1247, + "step": 67 + }, + { + "epoch": 0.014252777195556487, + "grad_norm": 0.2462567836046219, + "learning_rate": 4.652777777777779e-06, + "loss": 0.1212, + "step": 68 + }, + { + "epoch": 0.014462376860197024, + "grad_norm": 0.3761221468448639, + "learning_rate": 4.722222222222222e-06, + "loss": 0.1161, + "step": 69 + }, + { + "epoch": 0.01467197652483756, + "grad_norm": 0.3552791476249695, + "learning_rate": 4.791666666666668e-06, + "loss": 0.1275, + "step": 70 + }, + { + "epoch": 0.014881576189478097, + "grad_norm": 0.2262704074382782, + "learning_rate": 4.861111111111111e-06, + "loss": 0.1217, + "step": 71 + }, + { + "epoch": 0.015091175854118634, + "grad_norm": 1.2746901512145996, + "learning_rate": 4.930555555555556e-06, + "loss": 0.1272, + "step": 72 + }, + { + "epoch": 0.01530077551875917, + "grad_norm": 0.47996070981025696, + "learning_rate": 5e-06, + "loss": 0.1326, + "step": 73 + }, + { + "epoch": 0.015510375183399707, + "grad_norm": 0.591677725315094, + "learning_rate": 5.069444444444445e-06, + "loss": 0.1359, + "step": 74 + }, + { + "epoch": 0.015719974848040243, + "grad_norm": 0.4169331192970276, + "learning_rate": 5.138888888888889e-06, + "loss": 0.1353, + "step": 75 + }, + { + "epoch": 0.015929574512680778, + "grad_norm": 0.2262561172246933, + "learning_rate": 5.208333333333334e-06, + "loss": 0.1318, + "step": 76 + }, + { + "epoch": 0.016139174177321317, + "grad_norm": 0.42291682958602905, + "learning_rate": 5.2777777777777785e-06, + "loss": 0.1287, + "step": 77 + }, + { + "epoch": 0.01634877384196185, + "grad_norm": 0.36955782771110535, + "learning_rate": 5.347222222222222e-06, + "loss": 0.1348, + "step": 78 + }, + { + "epoch": 0.01655837350660239, + "grad_norm": 0.286143034696579, + "learning_rate": 5.416666666666667e-06, + "loss": 0.1284, + "step": 79 + }, + { + "epoch": 0.016767973171242925, + "grad_norm": 0.3029952347278595, + "learning_rate": 5.486111111111112e-06, + "loss": 0.1357, + "step": 80 + }, + { + "epoch": 0.016977572835883463, + "grad_norm": 0.24525821208953857, + "learning_rate": 5.555555555555557e-06, + "loss": 0.1263, + "step": 81 + }, + { + "epoch": 0.017187172500523998, + "grad_norm": 0.3474200665950775, + "learning_rate": 5.625e-06, + "loss": 0.133, + "step": 82 + }, + { + "epoch": 0.017396772165164536, + "grad_norm": 0.2455555945634842, + "learning_rate": 5.694444444444445e-06, + "loss": 0.1292, + "step": 83 + }, + { + "epoch": 0.01760637182980507, + "grad_norm": 0.26876527070999146, + "learning_rate": 5.7638888888888886e-06, + "loss": 0.1327, + "step": 84 + }, + { + "epoch": 0.01781597149444561, + "grad_norm": 0.27033689618110657, + "learning_rate": 5.833333333333334e-06, + "loss": 0.1282, + "step": 85 + }, + { + "epoch": 0.018025571159086144, + "grad_norm": 0.19568723440170288, + "learning_rate": 5.9027777777777785e-06, + "loss": 0.1258, + "step": 86 + }, + { + "epoch": 0.018235170823726683, + "grad_norm": 0.30962151288986206, + "learning_rate": 5.972222222222222e-06, + "loss": 0.1259, + "step": 87 + }, + { + "epoch": 0.018444770488367217, + "grad_norm": 0.22522631287574768, + "learning_rate": 6.041666666666667e-06, + "loss": 0.1231, + "step": 88 + }, + { + "epoch": 0.018654370153007756, + "grad_norm": 0.3179832398891449, + "learning_rate": 6.111111111111112e-06, + "loss": 0.118, + "step": 89 + }, + { + "epoch": 0.01886396981764829, + "grad_norm": 0.34084218740463257, + "learning_rate": 6.180555555555557e-06, + "loss": 0.1219, + "step": 90 + }, + { + "epoch": 0.01907356948228883, + "grad_norm": 0.32963189482688904, + "learning_rate": 6.25e-06, + "loss": 0.1245, + "step": 91 + }, + { + "epoch": 0.019283169146929364, + "grad_norm": 0.45660048723220825, + "learning_rate": 6.319444444444445e-06, + "loss": 0.1265, + "step": 92 + }, + { + "epoch": 0.019492768811569902, + "grad_norm": 0.6942883133888245, + "learning_rate": 6.3888888888888885e-06, + "loss": 0.1288, + "step": 93 + }, + { + "epoch": 0.019702368476210437, + "grad_norm": 0.7902718782424927, + "learning_rate": 6.458333333333334e-06, + "loss": 0.129, + "step": 94 + }, + { + "epoch": 0.019911968140850975, + "grad_norm": 0.4735928773880005, + "learning_rate": 6.5277777777777784e-06, + "loss": 0.1187, + "step": 95 + }, + { + "epoch": 0.02012156780549151, + "grad_norm": 0.22121404111385345, + "learning_rate": 6.597222222222223e-06, + "loss": 0.1236, + "step": 96 + }, + { + "epoch": 0.02033116747013205, + "grad_norm": 0.515511691570282, + "learning_rate": 6.666666666666667e-06, + "loss": 0.1235, + "step": 97 + }, + { + "epoch": 0.020540767134772583, + "grad_norm": 0.4277538061141968, + "learning_rate": 6.736111111111112e-06, + "loss": 0.1242, + "step": 98 + }, + { + "epoch": 0.020750366799413122, + "grad_norm": 0.17368359863758087, + "learning_rate": 6.8055555555555566e-06, + "loss": 0.1263, + "step": 99 + }, + { + "epoch": 0.020959966464053657, + "grad_norm": 0.37945282459259033, + "learning_rate": 6.875e-06, + "loss": 0.1212, + "step": 100 + }, + { + "epoch": 0.021169566128694195, + "grad_norm": 0.366905152797699, + "learning_rate": 6.944444444444445e-06, + "loss": 0.1252, + "step": 101 + }, + { + "epoch": 0.02137916579333473, + "grad_norm": 0.17307527363300323, + "learning_rate": 7.013888888888889e-06, + "loss": 0.1171, + "step": 102 + }, + { + "epoch": 0.021588765457975268, + "grad_norm": 0.374055415391922, + "learning_rate": 7.083333333333335e-06, + "loss": 0.1113, + "step": 103 + }, + { + "epoch": 0.021798365122615803, + "grad_norm": 0.3751891255378723, + "learning_rate": 7.152777777777778e-06, + "loss": 0.1286, + "step": 104 + }, + { + "epoch": 0.02200796478725634, + "grad_norm": 0.20103880763053894, + "learning_rate": 7.222222222222223e-06, + "loss": 0.1132, + "step": 105 + }, + { + "epoch": 0.022217564451896876, + "grad_norm": 0.20718024671077728, + "learning_rate": 7.291666666666667e-06, + "loss": 0.1152, + "step": 106 + }, + { + "epoch": 0.022427164116537415, + "grad_norm": 0.30478349328041077, + "learning_rate": 7.361111111111112e-06, + "loss": 0.1166, + "step": 107 + }, + { + "epoch": 0.02263676378117795, + "grad_norm": 0.2688567042350769, + "learning_rate": 7.4305555555555565e-06, + "loss": 0.115, + "step": 108 + }, + { + "epoch": 0.022846363445818488, + "grad_norm": 0.19921305775642395, + "learning_rate": 7.500000000000001e-06, + "loss": 0.1181, + "step": 109 + }, + { + "epoch": 0.023055963110459023, + "grad_norm": 0.34289443492889404, + "learning_rate": 7.569444444444445e-06, + "loss": 0.1153, + "step": 110 + }, + { + "epoch": 0.02326556277509956, + "grad_norm": 0.4600578546524048, + "learning_rate": 7.638888888888888e-06, + "loss": 0.1192, + "step": 111 + }, + { + "epoch": 0.023475162439740096, + "grad_norm": 0.29753392934799194, + "learning_rate": 7.708333333333334e-06, + "loss": 0.1153, + "step": 112 + }, + { + "epoch": 0.023684762104380634, + "grad_norm": 0.19611868262290955, + "learning_rate": 7.77777777777778e-06, + "loss": 0.1139, + "step": 113 + }, + { + "epoch": 0.02389436176902117, + "grad_norm": 0.3326936960220337, + "learning_rate": 7.847222222222223e-06, + "loss": 0.1125, + "step": 114 + }, + { + "epoch": 0.024103961433661707, + "grad_norm": 0.43471208214759827, + "learning_rate": 7.916666666666667e-06, + "loss": 0.1144, + "step": 115 + }, + { + "epoch": 0.024313561098302242, + "grad_norm": 0.419888436794281, + "learning_rate": 7.986111111111112e-06, + "loss": 0.1153, + "step": 116 + }, + { + "epoch": 0.02452316076294278, + "grad_norm": 0.1664547622203827, + "learning_rate": 8.055555555555557e-06, + "loss": 0.1098, + "step": 117 + }, + { + "epoch": 0.024732760427583315, + "grad_norm": 0.23118549585342407, + "learning_rate": 8.125000000000001e-06, + "loss": 0.1162, + "step": 118 + }, + { + "epoch": 0.024942360092223854, + "grad_norm": 0.33760204911231995, + "learning_rate": 8.194444444444445e-06, + "loss": 0.1115, + "step": 119 + }, + { + "epoch": 0.02515195975686439, + "grad_norm": 0.25965481996536255, + "learning_rate": 8.263888888888888e-06, + "loss": 0.1102, + "step": 120 + }, + { + "epoch": 0.025361559421504927, + "grad_norm": 0.2375420331954956, + "learning_rate": 8.333333333333334e-06, + "loss": 0.1186, + "step": 121 + }, + { + "epoch": 0.025571159086145462, + "grad_norm": 0.23151437938213348, + "learning_rate": 8.402777777777779e-06, + "loss": 0.1088, + "step": 122 + }, + { + "epoch": 0.025780758750786, + "grad_norm": 0.34959205985069275, + "learning_rate": 8.472222222222223e-06, + "loss": 0.1084, + "step": 123 + }, + { + "epoch": 0.025990358415426535, + "grad_norm": 0.3837231695652008, + "learning_rate": 8.541666666666666e-06, + "loss": 0.1142, + "step": 124 + }, + { + "epoch": 0.026199958080067073, + "grad_norm": 0.317340612411499, + "learning_rate": 8.611111111111112e-06, + "loss": 0.1201, + "step": 125 + }, + { + "epoch": 0.026409557744707608, + "grad_norm": 0.3754269778728485, + "learning_rate": 8.680555555555557e-06, + "loss": 0.1122, + "step": 126 + }, + { + "epoch": 0.026619157409348147, + "grad_norm": 0.39836880564689636, + "learning_rate": 8.750000000000001e-06, + "loss": 0.1105, + "step": 127 + }, + { + "epoch": 0.02682875707398868, + "grad_norm": 0.458625465631485, + "learning_rate": 8.819444444444445e-06, + "loss": 0.1169, + "step": 128 + }, + { + "epoch": 0.02703835673862922, + "grad_norm": 0.34019047021865845, + "learning_rate": 8.888888888888888e-06, + "loss": 0.1127, + "step": 129 + }, + { + "epoch": 0.027247956403269755, + "grad_norm": 0.3323642313480377, + "learning_rate": 8.958333333333334e-06, + "loss": 0.1098, + "step": 130 + }, + { + "epoch": 0.027457556067910293, + "grad_norm": 0.2268533557653427, + "learning_rate": 9.027777777777779e-06, + "loss": 0.1095, + "step": 131 + }, + { + "epoch": 0.027667155732550828, + "grad_norm": 0.4628676176071167, + "learning_rate": 9.097222222222223e-06, + "loss": 0.1077, + "step": 132 + }, + { + "epoch": 0.027876755397191366, + "grad_norm": 0.5547027587890625, + "learning_rate": 9.166666666666666e-06, + "loss": 0.1156, + "step": 133 + }, + { + "epoch": 0.0280863550618319, + "grad_norm": 0.5809503197669983, + "learning_rate": 9.236111111111112e-06, + "loss": 0.1168, + "step": 134 + }, + { + "epoch": 0.028295954726472436, + "grad_norm": 0.497976154088974, + "learning_rate": 9.305555555555557e-06, + "loss": 0.1161, + "step": 135 + }, + { + "epoch": 0.028505554391112974, + "grad_norm": 0.67780601978302, + "learning_rate": 9.375000000000001e-06, + "loss": 0.1201, + "step": 136 + }, + { + "epoch": 0.02871515405575351, + "grad_norm": 0.6441759467124939, + "learning_rate": 9.444444444444445e-06, + "loss": 0.1192, + "step": 137 + }, + { + "epoch": 0.028924753720394047, + "grad_norm": 0.3711428642272949, + "learning_rate": 9.51388888888889e-06, + "loss": 0.1195, + "step": 138 + }, + { + "epoch": 0.029134353385034582, + "grad_norm": 0.3569689393043518, + "learning_rate": 9.583333333333335e-06, + "loss": 0.1118, + "step": 139 + }, + { + "epoch": 0.02934395304967512, + "grad_norm": 0.3563655912876129, + "learning_rate": 9.652777777777779e-06, + "loss": 0.1176, + "step": 140 + }, + { + "epoch": 0.029553552714315656, + "grad_norm": 0.2493436187505722, + "learning_rate": 9.722222222222223e-06, + "loss": 0.1147, + "step": 141 + }, + { + "epoch": 0.029763152378956194, + "grad_norm": 0.29483145475387573, + "learning_rate": 9.791666666666666e-06, + "loss": 0.1153, + "step": 142 + }, + { + "epoch": 0.02997275204359673, + "grad_norm": 0.2700521647930145, + "learning_rate": 9.861111111111112e-06, + "loss": 0.1111, + "step": 143 + }, + { + "epoch": 0.030182351708237267, + "grad_norm": 0.26881495118141174, + "learning_rate": 9.930555555555557e-06, + "loss": 0.1104, + "step": 144 + }, + { + "epoch": 0.030391951372877802, + "grad_norm": 0.30803802609443665, + "learning_rate": 1e-05, + "loss": 0.1122, + "step": 145 + }, + { + "epoch": 0.03060155103751834, + "grad_norm": 0.31767815351486206, + "learning_rate": 9.99999884750052e-06, + "loss": 0.1096, + "step": 146 + }, + { + "epoch": 0.030811150702158875, + "grad_norm": 0.22586488723754883, + "learning_rate": 9.99999539000261e-06, + "loss": 0.1082, + "step": 147 + }, + { + "epoch": 0.031020750366799413, + "grad_norm": 0.2010418325662613, + "learning_rate": 9.999989627507863e-06, + "loss": 0.1123, + "step": 148 + }, + { + "epoch": 0.03123035003143995, + "grad_norm": 0.2666708528995514, + "learning_rate": 9.999981560018935e-06, + "loss": 0.1035, + "step": 149 + }, + { + "epoch": 0.03143994969608049, + "grad_norm": 0.34085071086883545, + "learning_rate": 9.999971187539547e-06, + "loss": 0.1063, + "step": 150 + }, + { + "epoch": 0.03164954936072102, + "grad_norm": 0.31965869665145874, + "learning_rate": 9.999958510074482e-06, + "loss": 0.1095, + "step": 151 + }, + { + "epoch": 0.031859149025361556, + "grad_norm": 0.26403307914733887, + "learning_rate": 9.99994352762958e-06, + "loss": 0.1017, + "step": 152 + }, + { + "epoch": 0.0320687486900021, + "grad_norm": 0.17258979380130768, + "learning_rate": 9.999926240211752e-06, + "loss": 0.1045, + "step": 153 + }, + { + "epoch": 0.03227834835464263, + "grad_norm": 0.2114904820919037, + "learning_rate": 9.999906647828966e-06, + "loss": 0.1017, + "step": 154 + }, + { + "epoch": 0.03248794801928317, + "grad_norm": 0.2613145112991333, + "learning_rate": 9.999884750490255e-06, + "loss": 0.1064, + "step": 155 + }, + { + "epoch": 0.0326975476839237, + "grad_norm": 0.23586612939834595, + "learning_rate": 9.99986054820571e-06, + "loss": 0.1097, + "step": 156 + }, + { + "epoch": 0.032907147348564245, + "grad_norm": 0.24500718712806702, + "learning_rate": 9.999834040986491e-06, + "loss": 0.102, + "step": 157 + }, + { + "epoch": 0.03311674701320478, + "grad_norm": 0.36189910769462585, + "learning_rate": 9.999805228844818e-06, + "loss": 0.1048, + "step": 158 + }, + { + "epoch": 0.033326346677845314, + "grad_norm": 0.43638429045677185, + "learning_rate": 9.999774111793974e-06, + "loss": 0.1153, + "step": 159 + }, + { + "epoch": 0.03353594634248585, + "grad_norm": 0.3592098355293274, + "learning_rate": 9.999740689848302e-06, + "loss": 0.115, + "step": 160 + }, + { + "epoch": 0.03374554600712639, + "grad_norm": 0.23697715997695923, + "learning_rate": 9.999704963023213e-06, + "loss": 0.0969, + "step": 161 + }, + { + "epoch": 0.033955145671766926, + "grad_norm": 0.3014034628868103, + "learning_rate": 9.999666931335172e-06, + "loss": 0.102, + "step": 162 + }, + { + "epoch": 0.03416474533640746, + "grad_norm": 0.35838088393211365, + "learning_rate": 9.999626594801714e-06, + "loss": 0.1007, + "step": 163 + }, + { + "epoch": 0.034374345001047996, + "grad_norm": 0.2900119125843048, + "learning_rate": 9.999583953441436e-06, + "loss": 0.0993, + "step": 164 + }, + { + "epoch": 0.03458394466568854, + "grad_norm": 0.21882066130638123, + "learning_rate": 9.999539007273993e-06, + "loss": 0.1039, + "step": 165 + }, + { + "epoch": 0.03479354433032907, + "grad_norm": 0.1922779232263565, + "learning_rate": 9.999491756320105e-06, + "loss": 0.1049, + "step": 166 + }, + { + "epoch": 0.03500314399496961, + "grad_norm": 0.2402738481760025, + "learning_rate": 9.999442200601559e-06, + "loss": 0.1003, + "step": 167 + }, + { + "epoch": 0.03521274365961014, + "grad_norm": 0.25979745388031006, + "learning_rate": 9.999390340141195e-06, + "loss": 0.1041, + "step": 168 + }, + { + "epoch": 0.035422343324250684, + "grad_norm": 0.2431039661169052, + "learning_rate": 9.999336174962922e-06, + "loss": 0.1008, + "step": 169 + }, + { + "epoch": 0.03563194298889122, + "grad_norm": 0.22206492722034454, + "learning_rate": 9.999279705091711e-06, + "loss": 0.1052, + "step": 170 + }, + { + "epoch": 0.035841542653531754, + "grad_norm": 0.2511206567287445, + "learning_rate": 9.999220930553595e-06, + "loss": 0.1107, + "step": 171 + }, + { + "epoch": 0.03605114231817229, + "grad_norm": 0.3174304664134979, + "learning_rate": 9.999159851375668e-06, + "loss": 0.1022, + "step": 172 + }, + { + "epoch": 0.03626074198281283, + "grad_norm": 0.4061136841773987, + "learning_rate": 9.99909646758609e-06, + "loss": 0.101, + "step": 173 + }, + { + "epoch": 0.036470341647453365, + "grad_norm": 0.47826582193374634, + "learning_rate": 9.999030779214076e-06, + "loss": 0.1052, + "step": 174 + }, + { + "epoch": 0.0366799413120939, + "grad_norm": 0.4733898639678955, + "learning_rate": 9.998962786289912e-06, + "loss": 0.1049, + "step": 175 + }, + { + "epoch": 0.036889540976734435, + "grad_norm": 0.3916158974170685, + "learning_rate": 9.998892488844942e-06, + "loss": 0.1058, + "step": 176 + }, + { + "epoch": 0.03709914064137498, + "grad_norm": 0.25847071409225464, + "learning_rate": 9.998819886911574e-06, + "loss": 0.0942, + "step": 177 + }, + { + "epoch": 0.03730874030601551, + "grad_norm": 0.2088022381067276, + "learning_rate": 9.998744980523276e-06, + "loss": 0.1051, + "step": 178 + }, + { + "epoch": 0.037518339970656046, + "grad_norm": 0.34360599517822266, + "learning_rate": 9.99866776971458e-06, + "loss": 0.0934, + "step": 179 + }, + { + "epoch": 0.03772793963529658, + "grad_norm": 0.4142899513244629, + "learning_rate": 9.99858825452108e-06, + "loss": 0.0996, + "step": 180 + }, + { + "epoch": 0.03793753929993712, + "grad_norm": 0.32127901911735535, + "learning_rate": 9.998506434979434e-06, + "loss": 0.0974, + "step": 181 + }, + { + "epoch": 0.03814713896457766, + "grad_norm": 0.16188944876194, + "learning_rate": 9.998422311127357e-06, + "loss": 0.0959, + "step": 182 + }, + { + "epoch": 0.03835673862921819, + "grad_norm": 0.20421625673770905, + "learning_rate": 9.998335883003636e-06, + "loss": 0.0971, + "step": 183 + }, + { + "epoch": 0.03856633829385873, + "grad_norm": 0.29221752285957336, + "learning_rate": 9.99824715064811e-06, + "loss": 0.1037, + "step": 184 + }, + { + "epoch": 0.03877593795849927, + "grad_norm": 0.3054715096950531, + "learning_rate": 9.998156114101687e-06, + "loss": 0.0977, + "step": 185 + }, + { + "epoch": 0.038985537623139804, + "grad_norm": 0.22702643275260925, + "learning_rate": 9.998062773406332e-06, + "loss": 0.0999, + "step": 186 + }, + { + "epoch": 0.03919513728778034, + "grad_norm": 0.1409185230731964, + "learning_rate": 9.997967128605078e-06, + "loss": 0.0935, + "step": 187 + }, + { + "epoch": 0.039404736952420874, + "grad_norm": 0.21621251106262207, + "learning_rate": 9.997869179742014e-06, + "loss": 0.0987, + "step": 188 + }, + { + "epoch": 0.039614336617061416, + "grad_norm": 0.3209380507469177, + "learning_rate": 9.9977689268623e-06, + "loss": 0.0975, + "step": 189 + }, + { + "epoch": 0.03982393628170195, + "grad_norm": 0.3941796123981476, + "learning_rate": 9.997666370012145e-06, + "loss": 0.0999, + "step": 190 + }, + { + "epoch": 0.040033535946342486, + "grad_norm": 0.40109798312187195, + "learning_rate": 9.997561509238833e-06, + "loss": 0.0977, + "step": 191 + }, + { + "epoch": 0.04024313561098302, + "grad_norm": 0.31651192903518677, + "learning_rate": 9.997454344590702e-06, + "loss": 0.1025, + "step": 192 + }, + { + "epoch": 0.04045273527562356, + "grad_norm": 0.20569577813148499, + "learning_rate": 9.997344876117157e-06, + "loss": 0.0967, + "step": 193 + }, + { + "epoch": 0.0406623349402641, + "grad_norm": 0.2173171490430832, + "learning_rate": 9.997233103868664e-06, + "loss": 0.098, + "step": 194 + }, + { + "epoch": 0.04087193460490463, + "grad_norm": 0.26545464992523193, + "learning_rate": 9.997119027896745e-06, + "loss": 0.0997, + "step": 195 + }, + { + "epoch": 0.04108153426954517, + "grad_norm": 0.25196799635887146, + "learning_rate": 9.997002648253994e-06, + "loss": 0.0936, + "step": 196 + }, + { + "epoch": 0.04129113393418571, + "grad_norm": 0.18679989874362946, + "learning_rate": 9.99688396499406e-06, + "loss": 0.0958, + "step": 197 + }, + { + "epoch": 0.041500733598826244, + "grad_norm": 0.16168518364429474, + "learning_rate": 9.996762978171657e-06, + "loss": 0.0906, + "step": 198 + }, + { + "epoch": 0.04171033326346678, + "grad_norm": 0.19161826372146606, + "learning_rate": 9.996639687842558e-06, + "loss": 0.0974, + "step": 199 + }, + { + "epoch": 0.04191993292810731, + "grad_norm": 0.18548256158828735, + "learning_rate": 9.9965140940636e-06, + "loss": 0.0976, + "step": 200 + }, + { + "epoch": 0.042129532592747855, + "grad_norm": 0.14844392240047455, + "learning_rate": 9.996386196892683e-06, + "loss": 0.0948, + "step": 201 + }, + { + "epoch": 0.04233913225738839, + "grad_norm": 0.10816401243209839, + "learning_rate": 9.996255996388767e-06, + "loss": 0.0927, + "step": 202 + }, + { + "epoch": 0.042548731922028925, + "grad_norm": 0.14506785571575165, + "learning_rate": 9.996123492611875e-06, + "loss": 0.1019, + "step": 203 + }, + { + "epoch": 0.04275833158666946, + "grad_norm": 0.18897125124931335, + "learning_rate": 9.99598868562309e-06, + "loss": 0.1029, + "step": 204 + }, + { + "epoch": 0.042967931251309995, + "grad_norm": 0.24414150416851044, + "learning_rate": 9.99585157548456e-06, + "loss": 0.0895, + "step": 205 + }, + { + "epoch": 0.043177530915950536, + "grad_norm": 0.34987929463386536, + "learning_rate": 9.995712162259489e-06, + "loss": 0.0933, + "step": 206 + }, + { + "epoch": 0.04338713058059107, + "grad_norm": 0.4623091518878937, + "learning_rate": 9.995570446012152e-06, + "loss": 0.106, + "step": 207 + }, + { + "epoch": 0.043596730245231606, + "grad_norm": 0.42472946643829346, + "learning_rate": 9.995426426807875e-06, + "loss": 0.1011, + "step": 208 + }, + { + "epoch": 0.04380632990987214, + "grad_norm": 0.20523960888385773, + "learning_rate": 9.995280104713055e-06, + "loss": 0.096, + "step": 209 + }, + { + "epoch": 0.04401592957451268, + "grad_norm": 0.23212192952632904, + "learning_rate": 9.995131479795142e-06, + "loss": 0.0945, + "step": 210 + }, + { + "epoch": 0.04422552923915322, + "grad_norm": 0.40809500217437744, + "learning_rate": 9.994980552122655e-06, + "loss": 0.0935, + "step": 211 + }, + { + "epoch": 0.04443512890379375, + "grad_norm": 0.3801667094230652, + "learning_rate": 9.99482732176517e-06, + "loss": 0.0985, + "step": 212 + }, + { + "epoch": 0.04464472856843429, + "grad_norm": 0.17014552652835846, + "learning_rate": 9.994671788793328e-06, + "loss": 0.0951, + "step": 213 + }, + { + "epoch": 0.04485432823307483, + "grad_norm": 0.31415608525276184, + "learning_rate": 9.99451395327883e-06, + "loss": 0.0986, + "step": 214 + }, + { + "epoch": 0.045063927897715364, + "grad_norm": 0.3655521273612976, + "learning_rate": 9.994353815294438e-06, + "loss": 0.0979, + "step": 215 + }, + { + "epoch": 0.0452735275623559, + "grad_norm": 0.22098667919635773, + "learning_rate": 9.99419137491397e-06, + "loss": 0.0965, + "step": 216 + }, + { + "epoch": 0.045483127226996434, + "grad_norm": 0.2817780673503876, + "learning_rate": 9.99402663221232e-06, + "loss": 0.0976, + "step": 217 + }, + { + "epoch": 0.045692726891636976, + "grad_norm": 0.27700185775756836, + "learning_rate": 9.993859587265429e-06, + "loss": 0.1014, + "step": 218 + }, + { + "epoch": 0.04590232655627751, + "grad_norm": 0.18445956707000732, + "learning_rate": 9.993690240150305e-06, + "loss": 0.0954, + "step": 219 + }, + { + "epoch": 0.046111926220918045, + "grad_norm": 0.2870791554450989, + "learning_rate": 9.993518590945017e-06, + "loss": 0.0995, + "step": 220 + }, + { + "epoch": 0.04632152588555858, + "grad_norm": 0.20852920413017273, + "learning_rate": 9.993344639728694e-06, + "loss": 0.0923, + "step": 221 + }, + { + "epoch": 0.04653112555019912, + "grad_norm": 0.2063411921262741, + "learning_rate": 9.993168386581533e-06, + "loss": 0.0903, + "step": 222 + }, + { + "epoch": 0.04674072521483966, + "grad_norm": 0.2189558446407318, + "learning_rate": 9.992989831584781e-06, + "loss": 0.0934, + "step": 223 + }, + { + "epoch": 0.04695032487948019, + "grad_norm": 0.2275790423154831, + "learning_rate": 9.992808974820755e-06, + "loss": 0.0944, + "step": 224 + }, + { + "epoch": 0.04715992454412073, + "grad_norm": 0.29324543476104736, + "learning_rate": 9.992625816372828e-06, + "loss": 0.0966, + "step": 225 + }, + { + "epoch": 0.04736952420876127, + "grad_norm": 0.24611949920654297, + "learning_rate": 9.992440356325437e-06, + "loss": 0.0948, + "step": 226 + }, + { + "epoch": 0.0475791238734018, + "grad_norm": 0.25625893473625183, + "learning_rate": 9.992252594764079e-06, + "loss": 0.0896, + "step": 227 + }, + { + "epoch": 0.04778872353804234, + "grad_norm": 0.22875913977622986, + "learning_rate": 9.99206253177531e-06, + "loss": 0.0887, + "step": 228 + }, + { + "epoch": 0.04799832320268287, + "grad_norm": 0.2700249254703522, + "learning_rate": 9.991870167446751e-06, + "loss": 0.0935, + "step": 229 + }, + { + "epoch": 0.048207922867323415, + "grad_norm": 0.2741994857788086, + "learning_rate": 9.991675501867083e-06, + "loss": 0.0945, + "step": 230 + }, + { + "epoch": 0.04841752253196395, + "grad_norm": 0.2627350389957428, + "learning_rate": 9.991478535126045e-06, + "loss": 0.0937, + "step": 231 + }, + { + "epoch": 0.048627122196604484, + "grad_norm": 0.2349289357662201, + "learning_rate": 9.99127926731444e-06, + "loss": 0.091, + "step": 232 + }, + { + "epoch": 0.04883672186124502, + "grad_norm": 0.19692082703113556, + "learning_rate": 9.991077698524128e-06, + "loss": 0.0925, + "step": 233 + }, + { + "epoch": 0.04904632152588556, + "grad_norm": 0.21425145864486694, + "learning_rate": 9.990873828848035e-06, + "loss": 0.092, + "step": 234 + }, + { + "epoch": 0.049255921190526096, + "grad_norm": 0.22980082035064697, + "learning_rate": 9.990667658380145e-06, + "loss": 0.0891, + "step": 235 + }, + { + "epoch": 0.04946552085516663, + "grad_norm": 0.3201009929180145, + "learning_rate": 9.990459187215498e-06, + "loss": 0.0925, + "step": 236 + }, + { + "epoch": 0.049675120519807166, + "grad_norm": 0.366510272026062, + "learning_rate": 9.990248415450204e-06, + "loss": 0.0991, + "step": 237 + }, + { + "epoch": 0.04988472018444771, + "grad_norm": 0.3486904203891754, + "learning_rate": 9.990035343181426e-06, + "loss": 0.0938, + "step": 238 + }, + { + "epoch": 0.05009431984908824, + "grad_norm": 0.2655981779098511, + "learning_rate": 9.989819970507392e-06, + "loss": 0.0972, + "step": 239 + }, + { + "epoch": 0.05030391951372878, + "grad_norm": 0.19759038090705872, + "learning_rate": 9.989602297527387e-06, + "loss": 0.0894, + "step": 240 + }, + { + "epoch": 0.05051351917836931, + "grad_norm": 0.2099776715040207, + "learning_rate": 9.98938232434176e-06, + "loss": 0.0906, + "step": 241 + }, + { + "epoch": 0.050723118843009854, + "grad_norm": 0.22497335076332092, + "learning_rate": 9.98916005105192e-06, + "loss": 0.0953, + "step": 242 + }, + { + "epoch": 0.05093271850765039, + "grad_norm": 0.19926096498966217, + "learning_rate": 9.98893547776033e-06, + "loss": 0.0873, + "step": 243 + }, + { + "epoch": 0.051142318172290924, + "grad_norm": 0.19615811109542847, + "learning_rate": 9.988708604570523e-06, + "loss": 0.0868, + "step": 244 + }, + { + "epoch": 0.05135191783693146, + "grad_norm": 0.2284836620092392, + "learning_rate": 9.988479431587085e-06, + "loss": 0.0891, + "step": 245 + }, + { + "epoch": 0.051561517501572, + "grad_norm": 0.24722371995449066, + "learning_rate": 9.988247958915665e-06, + "loss": 0.091, + "step": 246 + }, + { + "epoch": 0.051771117166212535, + "grad_norm": 0.2157488912343979, + "learning_rate": 9.988014186662971e-06, + "loss": 0.0939, + "step": 247 + }, + { + "epoch": 0.05198071683085307, + "grad_norm": 0.19561699032783508, + "learning_rate": 9.987778114936775e-06, + "loss": 0.0886, + "step": 248 + }, + { + "epoch": 0.052190316495493605, + "grad_norm": 0.20849882066249847, + "learning_rate": 9.987539743845902e-06, + "loss": 0.09, + "step": 249 + }, + { + "epoch": 0.05239991616013415, + "grad_norm": 0.23204147815704346, + "learning_rate": 9.987299073500245e-06, + "loss": 0.0882, + "step": 250 + }, + { + "epoch": 0.05260951582477468, + "grad_norm": 0.2583141028881073, + "learning_rate": 9.98705610401075e-06, + "loss": 0.0949, + "step": 251 + }, + { + "epoch": 0.052819115489415217, + "grad_norm": 0.27133336663246155, + "learning_rate": 9.986810835489426e-06, + "loss": 0.0914, + "step": 252 + }, + { + "epoch": 0.05302871515405575, + "grad_norm": 0.2617621123790741, + "learning_rate": 9.986563268049345e-06, + "loss": 0.0862, + "step": 253 + }, + { + "epoch": 0.05323831481869629, + "grad_norm": 0.2514387369155884, + "learning_rate": 9.98631340180463e-06, + "loss": 0.093, + "step": 254 + }, + { + "epoch": 0.05344791448333683, + "grad_norm": 0.22442054748535156, + "learning_rate": 9.986061236870478e-06, + "loss": 0.0963, + "step": 255 + }, + { + "epoch": 0.05365751414797736, + "grad_norm": 0.16423439979553223, + "learning_rate": 9.985806773363127e-06, + "loss": 0.0916, + "step": 256 + }, + { + "epoch": 0.0538671138126179, + "grad_norm": 0.19374483823776245, + "learning_rate": 9.985550011399889e-06, + "loss": 0.088, + "step": 257 + }, + { + "epoch": 0.05407671347725844, + "grad_norm": 0.28776028752326965, + "learning_rate": 9.985290951099134e-06, + "loss": 0.0912, + "step": 258 + }, + { + "epoch": 0.054286313141898974, + "grad_norm": 0.3693079948425293, + "learning_rate": 9.985029592580284e-06, + "loss": 0.0912, + "step": 259 + }, + { + "epoch": 0.05449591280653951, + "grad_norm": 0.4039154052734375, + "learning_rate": 9.984765935963826e-06, + "loss": 0.0935, + "step": 260 + }, + { + "epoch": 0.054705512471180044, + "grad_norm": 0.3492220640182495, + "learning_rate": 9.98449998137131e-06, + "loss": 0.0939, + "step": 261 + }, + { + "epoch": 0.054915112135820586, + "grad_norm": 0.21770621836185455, + "learning_rate": 9.984231728925338e-06, + "loss": 0.0911, + "step": 262 + }, + { + "epoch": 0.05512471180046112, + "grad_norm": 0.17481771111488342, + "learning_rate": 9.983961178749573e-06, + "loss": 0.0894, + "step": 263 + }, + { + "epoch": 0.055334311465101656, + "grad_norm": 0.23308181762695312, + "learning_rate": 9.98368833096874e-06, + "loss": 0.0929, + "step": 264 + }, + { + "epoch": 0.05554391112974219, + "grad_norm": 0.2207728773355484, + "learning_rate": 9.983413185708622e-06, + "loss": 0.0912, + "step": 265 + }, + { + "epoch": 0.05575351079438273, + "grad_norm": 0.2060740441083908, + "learning_rate": 9.98313574309606e-06, + "loss": 0.0877, + "step": 266 + }, + { + "epoch": 0.05596311045902327, + "grad_norm": 0.22956304252147675, + "learning_rate": 9.982856003258954e-06, + "loss": 0.0892, + "step": 267 + }, + { + "epoch": 0.0561727101236638, + "grad_norm": 0.22290349006652832, + "learning_rate": 9.982573966326268e-06, + "loss": 0.0882, + "step": 268 + }, + { + "epoch": 0.05638230978830434, + "grad_norm": 0.20171046257019043, + "learning_rate": 9.982289632428017e-06, + "loss": 0.0891, + "step": 269 + }, + { + "epoch": 0.05659190945294487, + "grad_norm": 0.2069445252418518, + "learning_rate": 9.982003001695282e-06, + "loss": 0.0881, + "step": 270 + }, + { + "epoch": 0.056801509117585414, + "grad_norm": 0.206475630402565, + "learning_rate": 9.981714074260196e-06, + "loss": 0.0902, + "step": 271 + }, + { + "epoch": 0.05701110878222595, + "grad_norm": 0.19828340411186218, + "learning_rate": 9.98142285025596e-06, + "loss": 0.0878, + "step": 272 + }, + { + "epoch": 0.05722070844686648, + "grad_norm": 0.18767406046390533, + "learning_rate": 9.981129329816821e-06, + "loss": 0.0905, + "step": 273 + }, + { + "epoch": 0.05743030811150702, + "grad_norm": 0.1480463147163391, + "learning_rate": 9.980833513078097e-06, + "loss": 0.089, + "step": 274 + }, + { + "epoch": 0.05763990777614756, + "grad_norm": 0.12146848440170288, + "learning_rate": 9.980535400176158e-06, + "loss": 0.0853, + "step": 275 + }, + { + "epoch": 0.057849507440788095, + "grad_norm": 0.17080923914909363, + "learning_rate": 9.980234991248434e-06, + "loss": 0.0851, + "step": 276 + }, + { + "epoch": 0.05805910710542863, + "grad_norm": 0.20135587453842163, + "learning_rate": 9.979932286433414e-06, + "loss": 0.0851, + "step": 277 + }, + { + "epoch": 0.058268706770069165, + "grad_norm": 0.193786159157753, + "learning_rate": 9.979627285870644e-06, + "loss": 0.0844, + "step": 278 + }, + { + "epoch": 0.058478306434709706, + "grad_norm": 0.17661581933498383, + "learning_rate": 9.979319989700729e-06, + "loss": 0.0899, + "step": 279 + }, + { + "epoch": 0.05868790609935024, + "grad_norm": 0.1813191920518875, + "learning_rate": 9.979010398065334e-06, + "loss": 0.0852, + "step": 280 + }, + { + "epoch": 0.058897505763990776, + "grad_norm": 0.20303796231746674, + "learning_rate": 9.97869851110718e-06, + "loss": 0.0912, + "step": 281 + }, + { + "epoch": 0.05910710542863131, + "grad_norm": 0.1993362307548523, + "learning_rate": 9.978384328970045e-06, + "loss": 0.0903, + "step": 282 + }, + { + "epoch": 0.05931670509327185, + "grad_norm": 0.18841809034347534, + "learning_rate": 9.978067851798771e-06, + "loss": 0.0918, + "step": 283 + }, + { + "epoch": 0.05952630475791239, + "grad_norm": 0.22483417391777039, + "learning_rate": 9.97774907973925e-06, + "loss": 0.0929, + "step": 284 + }, + { + "epoch": 0.05973590442255292, + "grad_norm": 0.3062545657157898, + "learning_rate": 9.977428012938437e-06, + "loss": 0.0889, + "step": 285 + }, + { + "epoch": 0.05994550408719346, + "grad_norm": 0.3955790400505066, + "learning_rate": 9.977104651544342e-06, + "loss": 0.0917, + "step": 286 + }, + { + "epoch": 0.060155103751834, + "grad_norm": 0.4466722309589386, + "learning_rate": 9.97677899570604e-06, + "loss": 0.091, + "step": 287 + }, + { + "epoch": 0.060364703416474534, + "grad_norm": 0.3777875304222107, + "learning_rate": 9.976451045573653e-06, + "loss": 0.0872, + "step": 288 + }, + { + "epoch": 0.06057430308111507, + "grad_norm": 0.21344004571437836, + "learning_rate": 9.976120801298368e-06, + "loss": 0.0874, + "step": 289 + }, + { + "epoch": 0.060783902745755604, + "grad_norm": 0.2490052580833435, + "learning_rate": 9.975788263032427e-06, + "loss": 0.0873, + "step": 290 + }, + { + "epoch": 0.060993502410396146, + "grad_norm": 0.3740801513195038, + "learning_rate": 9.97545343092913e-06, + "loss": 0.0905, + "step": 291 + }, + { + "epoch": 0.06120310207503668, + "grad_norm": 0.356458842754364, + "learning_rate": 9.975116305142836e-06, + "loss": 0.0872, + "step": 292 + }, + { + "epoch": 0.061412701739677215, + "grad_norm": 0.2755817472934723, + "learning_rate": 9.974776885828958e-06, + "loss": 0.0902, + "step": 293 + }, + { + "epoch": 0.06162230140431775, + "grad_norm": 0.222996786236763, + "learning_rate": 9.974435173143968e-06, + "loss": 0.0848, + "step": 294 + }, + { + "epoch": 0.06183190106895829, + "grad_norm": 0.24334965646266937, + "learning_rate": 9.974091167245397e-06, + "loss": 0.0915, + "step": 295 + }, + { + "epoch": 0.06204150073359883, + "grad_norm": 0.25788915157318115, + "learning_rate": 9.973744868291832e-06, + "loss": 0.0894, + "step": 296 + }, + { + "epoch": 0.06225110039823936, + "grad_norm": 0.22377200424671173, + "learning_rate": 9.973396276442917e-06, + "loss": 0.0896, + "step": 297 + }, + { + "epoch": 0.0624607000628799, + "grad_norm": 0.23889151215553284, + "learning_rate": 9.973045391859348e-06, + "loss": 0.0859, + "step": 298 + }, + { + "epoch": 0.06267029972752043, + "grad_norm": 0.18626217544078827, + "learning_rate": 9.97269221470289e-06, + "loss": 0.0922, + "step": 299 + }, + { + "epoch": 0.06287989939216097, + "grad_norm": 0.14671076834201813, + "learning_rate": 9.97233674513635e-06, + "loss": 0.0922, + "step": 300 + }, + { + "epoch": 0.06308949905680152, + "grad_norm": 0.21669505536556244, + "learning_rate": 9.971978983323606e-06, + "loss": 0.084, + "step": 301 + }, + { + "epoch": 0.06329909872144204, + "grad_norm": 0.2512351870536804, + "learning_rate": 9.971618929429584e-06, + "loss": 0.0917, + "step": 302 + }, + { + "epoch": 0.06350869838608258, + "grad_norm": 0.28498363494873047, + "learning_rate": 9.971256583620268e-06, + "loss": 0.0854, + "step": 303 + }, + { + "epoch": 0.06371829805072311, + "grad_norm": 0.2733207643032074, + "learning_rate": 9.970891946062698e-06, + "loss": 0.0867, + "step": 304 + }, + { + "epoch": 0.06392789771536365, + "grad_norm": 0.19943512976169586, + "learning_rate": 9.970525016924974e-06, + "loss": 0.0913, + "step": 305 + }, + { + "epoch": 0.0641374973800042, + "grad_norm": 0.16102051734924316, + "learning_rate": 9.97015579637625e-06, + "loss": 0.0878, + "step": 306 + }, + { + "epoch": 0.06434709704464472, + "grad_norm": 0.19645822048187256, + "learning_rate": 9.969784284586736e-06, + "loss": 0.0856, + "step": 307 + }, + { + "epoch": 0.06455669670928527, + "grad_norm": 0.2531105875968933, + "learning_rate": 9.9694104817277e-06, + "loss": 0.0897, + "step": 308 + }, + { + "epoch": 0.06476629637392581, + "grad_norm": 0.24819032847881317, + "learning_rate": 9.969034387971463e-06, + "loss": 0.0854, + "step": 309 + }, + { + "epoch": 0.06497589603856634, + "grad_norm": 0.23713769018650055, + "learning_rate": 9.968656003491407e-06, + "loss": 0.0861, + "step": 310 + }, + { + "epoch": 0.06518549570320688, + "grad_norm": 0.21117182075977325, + "learning_rate": 9.968275328461964e-06, + "loss": 0.085, + "step": 311 + }, + { + "epoch": 0.0653950953678474, + "grad_norm": 0.16627727448940277, + "learning_rate": 9.967892363058626e-06, + "loss": 0.0854, + "step": 312 + }, + { + "epoch": 0.06560469503248795, + "grad_norm": 0.22320255637168884, + "learning_rate": 9.967507107457942e-06, + "loss": 0.085, + "step": 313 + }, + { + "epoch": 0.06581429469712849, + "grad_norm": 0.30917251110076904, + "learning_rate": 9.967119561837513e-06, + "loss": 0.0909, + "step": 314 + }, + { + "epoch": 0.06602389436176902, + "grad_norm": 0.3397315740585327, + "learning_rate": 9.966729726375997e-06, + "loss": 0.0938, + "step": 315 + }, + { + "epoch": 0.06623349402640956, + "grad_norm": 0.3161088228225708, + "learning_rate": 9.96633760125311e-06, + "loss": 0.0945, + "step": 316 + }, + { + "epoch": 0.0664430936910501, + "grad_norm": 0.2754218876361847, + "learning_rate": 9.965943186649619e-06, + "loss": 0.088, + "step": 317 + }, + { + "epoch": 0.06665269335569063, + "grad_norm": 0.2540167272090912, + "learning_rate": 9.965546482747352e-06, + "loss": 0.0835, + "step": 318 + }, + { + "epoch": 0.06686229302033117, + "grad_norm": 0.24167752265930176, + "learning_rate": 9.965147489729187e-06, + "loss": 0.0872, + "step": 319 + }, + { + "epoch": 0.0670718926849717, + "grad_norm": 0.19365692138671875, + "learning_rate": 9.96474620777906e-06, + "loss": 0.0865, + "step": 320 + }, + { + "epoch": 0.06728149234961224, + "grad_norm": 0.13701936602592468, + "learning_rate": 9.964342637081962e-06, + "loss": 0.0913, + "step": 321 + }, + { + "epoch": 0.06749109201425278, + "grad_norm": 0.2027350515127182, + "learning_rate": 9.963936777823941e-06, + "loss": 0.0852, + "step": 322 + }, + { + "epoch": 0.06770069167889331, + "grad_norm": 0.26346316933631897, + "learning_rate": 9.963528630192098e-06, + "loss": 0.086, + "step": 323 + }, + { + "epoch": 0.06791029134353385, + "grad_norm": 0.24423374235630035, + "learning_rate": 9.963118194374585e-06, + "loss": 0.0885, + "step": 324 + }, + { + "epoch": 0.0681198910081744, + "grad_norm": 0.21973365545272827, + "learning_rate": 9.962705470560616e-06, + "loss": 0.0876, + "step": 325 + }, + { + "epoch": 0.06832949067281492, + "grad_norm": 0.23790043592453003, + "learning_rate": 9.962290458940456e-06, + "loss": 0.0858, + "step": 326 + }, + { + "epoch": 0.06853909033745546, + "grad_norm": 0.23737956583499908, + "learning_rate": 9.961873159705426e-06, + "loss": 0.0813, + "step": 327 + }, + { + "epoch": 0.06874869000209599, + "grad_norm": 0.1772824376821518, + "learning_rate": 9.961453573047898e-06, + "loss": 0.0837, + "step": 328 + }, + { + "epoch": 0.06895828966673653, + "grad_norm": 0.14443804323673248, + "learning_rate": 9.961031699161305e-06, + "loss": 0.0847, + "step": 329 + }, + { + "epoch": 0.06916788933137707, + "grad_norm": 0.16732539236545563, + "learning_rate": 9.960607538240129e-06, + "loss": 0.0901, + "step": 330 + }, + { + "epoch": 0.0693774889960176, + "grad_norm": 0.16969183087348938, + "learning_rate": 9.960181090479908e-06, + "loss": 0.0868, + "step": 331 + }, + { + "epoch": 0.06958708866065814, + "grad_norm": 0.17372508347034454, + "learning_rate": 9.959752356077234e-06, + "loss": 0.085, + "step": 332 + }, + { + "epoch": 0.06979668832529869, + "grad_norm": 0.18569670617580414, + "learning_rate": 9.959321335229754e-06, + "loss": 0.0894, + "step": 333 + }, + { + "epoch": 0.07000628798993921, + "grad_norm": 0.2023860216140747, + "learning_rate": 9.95888802813617e-06, + "loss": 0.0892, + "step": 334 + }, + { + "epoch": 0.07021588765457976, + "grad_norm": 0.20857451856136322, + "learning_rate": 9.958452434996235e-06, + "loss": 0.0826, + "step": 335 + }, + { + "epoch": 0.07042548731922028, + "grad_norm": 0.23407702147960663, + "learning_rate": 9.958014556010757e-06, + "loss": 0.0795, + "step": 336 + }, + { + "epoch": 0.07063508698386083, + "grad_norm": 0.3349778950214386, + "learning_rate": 9.957574391381597e-06, + "loss": 0.0886, + "step": 337 + }, + { + "epoch": 0.07084468664850137, + "grad_norm": 0.36891162395477295, + "learning_rate": 9.957131941311675e-06, + "loss": 0.0923, + "step": 338 + }, + { + "epoch": 0.0710542863131419, + "grad_norm": 0.2576613128185272, + "learning_rate": 9.956687206004955e-06, + "loss": 0.0866, + "step": 339 + }, + { + "epoch": 0.07126388597778244, + "grad_norm": 0.2293458729982376, + "learning_rate": 9.956240185666465e-06, + "loss": 0.0864, + "step": 340 + }, + { + "epoch": 0.07147348564242297, + "grad_norm": 0.266307532787323, + "learning_rate": 9.955790880502278e-06, + "loss": 0.0923, + "step": 341 + }, + { + "epoch": 0.07168308530706351, + "grad_norm": 0.22257249057292938, + "learning_rate": 9.955339290719525e-06, + "loss": 0.0858, + "step": 342 + }, + { + "epoch": 0.07189268497170405, + "grad_norm": 0.25660571455955505, + "learning_rate": 9.954885416526388e-06, + "loss": 0.09, + "step": 343 + }, + { + "epoch": 0.07210228463634458, + "grad_norm": 0.24506278336048126, + "learning_rate": 9.954429258132102e-06, + "loss": 0.0849, + "step": 344 + }, + { + "epoch": 0.07231188430098512, + "grad_norm": 0.1874029040336609, + "learning_rate": 9.953970815746958e-06, + "loss": 0.0879, + "step": 345 + }, + { + "epoch": 0.07252148396562566, + "grad_norm": 0.2142496556043625, + "learning_rate": 9.953510089582297e-06, + "loss": 0.083, + "step": 346 + }, + { + "epoch": 0.07273108363026619, + "grad_norm": 0.200823113322258, + "learning_rate": 9.953047079850514e-06, + "loss": 0.0836, + "step": 347 + }, + { + "epoch": 0.07294068329490673, + "grad_norm": 0.22651934623718262, + "learning_rate": 9.952581786765057e-06, + "loss": 0.0872, + "step": 348 + }, + { + "epoch": 0.07315028295954726, + "grad_norm": 0.2763821482658386, + "learning_rate": 9.952114210540423e-06, + "loss": 0.0924, + "step": 349 + }, + { + "epoch": 0.0733598826241878, + "grad_norm": 0.26317474246025085, + "learning_rate": 9.951644351392167e-06, + "loss": 0.0901, + "step": 350 + }, + { + "epoch": 0.07356948228882834, + "grad_norm": 0.22677338123321533, + "learning_rate": 9.951172209536895e-06, + "loss": 0.0854, + "step": 351 + }, + { + "epoch": 0.07377908195346887, + "grad_norm": 0.23431210219860077, + "learning_rate": 9.95069778519226e-06, + "loss": 0.087, + "step": 352 + }, + { + "epoch": 0.07398868161810941, + "grad_norm": 0.23959867656230927, + "learning_rate": 9.950221078576977e-06, + "loss": 0.0839, + "step": 353 + }, + { + "epoch": 0.07419828128274995, + "grad_norm": 0.21534642577171326, + "learning_rate": 9.949742089910805e-06, + "loss": 0.0893, + "step": 354 + }, + { + "epoch": 0.07440788094739048, + "grad_norm": 0.2186158299446106, + "learning_rate": 9.949260819414557e-06, + "loss": 0.0807, + "step": 355 + }, + { + "epoch": 0.07461748061203102, + "grad_norm": 0.23446103930473328, + "learning_rate": 9.948777267310099e-06, + "loss": 0.0836, + "step": 356 + }, + { + "epoch": 0.07482708027667155, + "grad_norm": 0.20559126138687134, + "learning_rate": 9.948291433820348e-06, + "loss": 0.0866, + "step": 357 + }, + { + "epoch": 0.07503667994131209, + "grad_norm": 0.1898106038570404, + "learning_rate": 9.947803319169275e-06, + "loss": 0.0869, + "step": 358 + }, + { + "epoch": 0.07524627960595263, + "grad_norm": 0.2029874622821808, + "learning_rate": 9.9473129235819e-06, + "loss": 0.0866, + "step": 359 + }, + { + "epoch": 0.07545587927059316, + "grad_norm": 0.20538607239723206, + "learning_rate": 9.946820247284295e-06, + "loss": 0.0795, + "step": 360 + }, + { + "epoch": 0.0756654789352337, + "grad_norm": 0.24543048441410065, + "learning_rate": 9.946325290503583e-06, + "loss": 0.079, + "step": 361 + }, + { + "epoch": 0.07587507859987425, + "grad_norm": 0.2604290246963501, + "learning_rate": 9.945828053467939e-06, + "loss": 0.0871, + "step": 362 + }, + { + "epoch": 0.07608467826451477, + "grad_norm": 0.249879851937294, + "learning_rate": 9.945328536406588e-06, + "loss": 0.0826, + "step": 363 + }, + { + "epoch": 0.07629427792915532, + "grad_norm": 0.23733551800251007, + "learning_rate": 9.944826739549812e-06, + "loss": 0.0852, + "step": 364 + }, + { + "epoch": 0.07650387759379584, + "grad_norm": 0.18714891374111176, + "learning_rate": 9.944322663128936e-06, + "loss": 0.0845, + "step": 365 + }, + { + "epoch": 0.07671347725843639, + "grad_norm": 0.17555269598960876, + "learning_rate": 9.943816307376337e-06, + "loss": 0.0801, + "step": 366 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 0.20943832397460938, + "learning_rate": 9.943307672525448e-06, + "loss": 0.0853, + "step": 367 + }, + { + "epoch": 0.07713267658771746, + "grad_norm": 0.21346049010753632, + "learning_rate": 9.94279675881075e-06, + "loss": 0.0841, + "step": 368 + }, + { + "epoch": 0.077342276252358, + "grad_norm": 0.2275484949350357, + "learning_rate": 9.942283566467773e-06, + "loss": 0.0872, + "step": 369 + }, + { + "epoch": 0.07755187591699854, + "grad_norm": 0.21712201833724976, + "learning_rate": 9.941768095733098e-06, + "loss": 0.0833, + "step": 370 + }, + { + "epoch": 0.07776147558163907, + "grad_norm": 0.19494830071926117, + "learning_rate": 9.941250346844358e-06, + "loss": 0.0855, + "step": 371 + }, + { + "epoch": 0.07797107524627961, + "grad_norm": 0.19557230174541473, + "learning_rate": 9.940730320040233e-06, + "loss": 0.0827, + "step": 372 + }, + { + "epoch": 0.07818067491092014, + "grad_norm": 0.1738850325345993, + "learning_rate": 9.940208015560458e-06, + "loss": 0.0811, + "step": 373 + }, + { + "epoch": 0.07839027457556068, + "grad_norm": 0.19140446186065674, + "learning_rate": 9.939683433645813e-06, + "loss": 0.0806, + "step": 374 + }, + { + "epoch": 0.07859987424020122, + "grad_norm": 0.23054589331150055, + "learning_rate": 9.939156574538131e-06, + "loss": 0.0817, + "step": 375 + }, + { + "epoch": 0.07880947390484175, + "grad_norm": 0.25932809710502625, + "learning_rate": 9.938627438480295e-06, + "loss": 0.0801, + "step": 376 + }, + { + "epoch": 0.07901907356948229, + "grad_norm": 0.24759088456630707, + "learning_rate": 9.938096025716235e-06, + "loss": 0.0894, + "step": 377 + }, + { + "epoch": 0.07922867323412283, + "grad_norm": 0.15014545619487762, + "learning_rate": 9.937562336490934e-06, + "loss": 0.0855, + "step": 378 + }, + { + "epoch": 0.07943827289876336, + "grad_norm": 0.10033906251192093, + "learning_rate": 9.93702637105042e-06, + "loss": 0.0854, + "step": 379 + }, + { + "epoch": 0.0796478725634039, + "grad_norm": 0.1658346801996231, + "learning_rate": 9.936488129641777e-06, + "loss": 0.0812, + "step": 380 + }, + { + "epoch": 0.07985747222804443, + "grad_norm": 0.2382059395313263, + "learning_rate": 9.935947612513129e-06, + "loss": 0.0819, + "step": 381 + }, + { + "epoch": 0.08006707189268497, + "grad_norm": 0.2596674859523773, + "learning_rate": 9.935404819913658e-06, + "loss": 0.0849, + "step": 382 + }, + { + "epoch": 0.08027667155732551, + "grad_norm": 0.19236969947814941, + "learning_rate": 9.93485975209359e-06, + "loss": 0.0805, + "step": 383 + }, + { + "epoch": 0.08048627122196604, + "grad_norm": 0.12010635435581207, + "learning_rate": 9.934312409304201e-06, + "loss": 0.0858, + "step": 384 + }, + { + "epoch": 0.08069587088660658, + "grad_norm": 0.1619727462530136, + "learning_rate": 9.933762791797816e-06, + "loss": 0.0829, + "step": 385 + }, + { + "epoch": 0.08090547055124712, + "grad_norm": 0.256807804107666, + "learning_rate": 9.93321089982781e-06, + "loss": 0.0805, + "step": 386 + }, + { + "epoch": 0.08111507021588765, + "grad_norm": 0.3097319006919861, + "learning_rate": 9.932656733648602e-06, + "loss": 0.0834, + "step": 387 + }, + { + "epoch": 0.0813246698805282, + "grad_norm": 0.3272354006767273, + "learning_rate": 9.932100293515667e-06, + "loss": 0.0817, + "step": 388 + }, + { + "epoch": 0.08153426954516872, + "grad_norm": 0.36321696639060974, + "learning_rate": 9.931541579685519e-06, + "loss": 0.0859, + "step": 389 + }, + { + "epoch": 0.08174386920980926, + "grad_norm": 0.40557923913002014, + "learning_rate": 9.930980592415728e-06, + "loss": 0.0852, + "step": 390 + }, + { + "epoch": 0.0819534688744498, + "grad_norm": 0.3595048785209656, + "learning_rate": 9.93041733196491e-06, + "loss": 0.0873, + "step": 391 + }, + { + "epoch": 0.08216306853909033, + "grad_norm": 0.18180204927921295, + "learning_rate": 9.929851798592723e-06, + "loss": 0.0863, + "step": 392 + }, + { + "epoch": 0.08237266820373088, + "grad_norm": 0.3407440781593323, + "learning_rate": 9.929283992559882e-06, + "loss": 0.0889, + "step": 393 + }, + { + "epoch": 0.08258226786837142, + "grad_norm": 0.39048606157302856, + "learning_rate": 9.928713914128146e-06, + "loss": 0.0907, + "step": 394 + }, + { + "epoch": 0.08279186753301195, + "grad_norm": 0.2129063457250595, + "learning_rate": 9.928141563560316e-06, + "loss": 0.0911, + "step": 395 + }, + { + "epoch": 0.08300146719765249, + "grad_norm": 0.3368994891643524, + "learning_rate": 9.92756694112025e-06, + "loss": 0.09, + "step": 396 + }, + { + "epoch": 0.08321106686229301, + "grad_norm": 0.3096039891242981, + "learning_rate": 9.926990047072849e-06, + "loss": 0.0918, + "step": 397 + }, + { + "epoch": 0.08342066652693356, + "grad_norm": 0.21943970024585724, + "learning_rate": 9.92641088168406e-06, + "loss": 0.0892, + "step": 398 + }, + { + "epoch": 0.0836302661915741, + "grad_norm": 0.23184233903884888, + "learning_rate": 9.925829445220876e-06, + "loss": 0.0848, + "step": 399 + }, + { + "epoch": 0.08383986585621463, + "grad_norm": 0.22483260929584503, + "learning_rate": 9.925245737951342e-06, + "loss": 0.0806, + "step": 400 + }, + { + "epoch": 0.08404946552085517, + "grad_norm": 0.20455621182918549, + "learning_rate": 9.924659760144546e-06, + "loss": 0.0873, + "step": 401 + }, + { + "epoch": 0.08425906518549571, + "grad_norm": 0.19038546085357666, + "learning_rate": 9.924071512070623e-06, + "loss": 0.0832, + "step": 402 + }, + { + "epoch": 0.08446866485013624, + "grad_norm": 0.1766170859336853, + "learning_rate": 9.923480994000756e-06, + "loss": 0.0841, + "step": 403 + }, + { + "epoch": 0.08467826451477678, + "grad_norm": 0.14354343712329865, + "learning_rate": 9.922888206207174e-06, + "loss": 0.0889, + "step": 404 + }, + { + "epoch": 0.08488786417941731, + "grad_norm": 0.17357495427131653, + "learning_rate": 9.922293148963152e-06, + "loss": 0.0795, + "step": 405 + }, + { + "epoch": 0.08509746384405785, + "grad_norm": 0.16406401991844177, + "learning_rate": 9.921695822543009e-06, + "loss": 0.0837, + "step": 406 + }, + { + "epoch": 0.08530706350869839, + "grad_norm": 0.16059234738349915, + "learning_rate": 9.921096227222115e-06, + "loss": 0.0823, + "step": 407 + }, + { + "epoch": 0.08551666317333892, + "grad_norm": 0.16859528422355652, + "learning_rate": 9.920494363276882e-06, + "loss": 0.0842, + "step": 408 + }, + { + "epoch": 0.08572626283797946, + "grad_norm": 0.11957108229398727, + "learning_rate": 9.91989023098477e-06, + "loss": 0.0817, + "step": 409 + }, + { + "epoch": 0.08593586250261999, + "grad_norm": 0.1288835108280182, + "learning_rate": 9.919283830624283e-06, + "loss": 0.0845, + "step": 410 + }, + { + "epoch": 0.08614546216726053, + "grad_norm": 0.14702066779136658, + "learning_rate": 9.918675162474974e-06, + "loss": 0.0813, + "step": 411 + }, + { + "epoch": 0.08635506183190107, + "grad_norm": 0.13575831055641174, + "learning_rate": 9.918064226817435e-06, + "loss": 0.0801, + "step": 412 + }, + { + "epoch": 0.0865646614965416, + "grad_norm": 0.16246341168880463, + "learning_rate": 9.91745102393331e-06, + "loss": 0.0843, + "step": 413 + }, + { + "epoch": 0.08677426116118214, + "grad_norm": 0.1807781308889389, + "learning_rate": 9.916835554105282e-06, + "loss": 0.0809, + "step": 414 + }, + { + "epoch": 0.08698386082582268, + "grad_norm": 0.18538668751716614, + "learning_rate": 9.916217817617087e-06, + "loss": 0.083, + "step": 415 + }, + { + "epoch": 0.08719346049046321, + "grad_norm": 0.19886340200901031, + "learning_rate": 9.915597814753498e-06, + "loss": 0.078, + "step": 416 + }, + { + "epoch": 0.08740306015510375, + "grad_norm": 0.20253928005695343, + "learning_rate": 9.914975545800339e-06, + "loss": 0.0817, + "step": 417 + }, + { + "epoch": 0.08761265981974428, + "grad_norm": 0.1704034060239792, + "learning_rate": 9.914351011044472e-06, + "loss": 0.0832, + "step": 418 + }, + { + "epoch": 0.08782225948438482, + "grad_norm": 0.14112436771392822, + "learning_rate": 9.913724210773812e-06, + "loss": 0.0819, + "step": 419 + }, + { + "epoch": 0.08803185914902537, + "grad_norm": 0.11341799050569534, + "learning_rate": 9.91309514527731e-06, + "loss": 0.0804, + "step": 420 + }, + { + "epoch": 0.0882414588136659, + "grad_norm": 0.12630800902843475, + "learning_rate": 9.912463814844968e-06, + "loss": 0.0773, + "step": 421 + }, + { + "epoch": 0.08845105847830644, + "grad_norm": 0.1604209989309311, + "learning_rate": 9.911830219767827e-06, + "loss": 0.0821, + "step": 422 + }, + { + "epoch": 0.08866065814294698, + "grad_norm": 0.1671822965145111, + "learning_rate": 9.911194360337975e-06, + "loss": 0.083, + "step": 423 + }, + { + "epoch": 0.0888702578075875, + "grad_norm": 0.14921946823596954, + "learning_rate": 9.910556236848543e-06, + "loss": 0.0817, + "step": 424 + }, + { + "epoch": 0.08907985747222805, + "grad_norm": 0.13964901864528656, + "learning_rate": 9.909915849593705e-06, + "loss": 0.0815, + "step": 425 + }, + { + "epoch": 0.08928945713686857, + "grad_norm": 0.1505957692861557, + "learning_rate": 9.909273198868682e-06, + "loss": 0.0837, + "step": 426 + }, + { + "epoch": 0.08949905680150912, + "grad_norm": 0.15062043070793152, + "learning_rate": 9.908628284969732e-06, + "loss": 0.08, + "step": 427 + }, + { + "epoch": 0.08970865646614966, + "grad_norm": 0.15089496970176697, + "learning_rate": 9.907981108194165e-06, + "loss": 0.0811, + "step": 428 + }, + { + "epoch": 0.08991825613079019, + "grad_norm": 0.1628749966621399, + "learning_rate": 9.907331668840325e-06, + "loss": 0.0797, + "step": 429 + }, + { + "epoch": 0.09012785579543073, + "grad_norm": 0.1800432950258255, + "learning_rate": 9.906679967207604e-06, + "loss": 0.08, + "step": 430 + }, + { + "epoch": 0.09033745546007127, + "grad_norm": 0.1852189302444458, + "learning_rate": 9.906026003596438e-06, + "loss": 0.0866, + "step": 431 + }, + { + "epoch": 0.0905470551247118, + "grad_norm": 0.16221773624420166, + "learning_rate": 9.905369778308304e-06, + "loss": 0.0785, + "step": 432 + }, + { + "epoch": 0.09075665478935234, + "grad_norm": 0.13077011704444885, + "learning_rate": 9.904711291645721e-06, + "loss": 0.0793, + "step": 433 + }, + { + "epoch": 0.09096625445399287, + "grad_norm": 0.14190125465393066, + "learning_rate": 9.904050543912252e-06, + "loss": 0.0774, + "step": 434 + }, + { + "epoch": 0.09117585411863341, + "grad_norm": 0.1918221116065979, + "learning_rate": 9.9033875354125e-06, + "loss": 0.086, + "step": 435 + }, + { + "epoch": 0.09138545378327395, + "grad_norm": 0.24989114701747894, + "learning_rate": 9.902722266452112e-06, + "loss": 0.0751, + "step": 436 + }, + { + "epoch": 0.09159505344791448, + "grad_norm": 0.2872444987297058, + "learning_rate": 9.902054737337778e-06, + "loss": 0.0794, + "step": 437 + }, + { + "epoch": 0.09180465311255502, + "grad_norm": 0.26268768310546875, + "learning_rate": 9.90138494837723e-06, + "loss": 0.0783, + "step": 438 + }, + { + "epoch": 0.09201425277719556, + "grad_norm": 0.19035717844963074, + "learning_rate": 9.900712899879237e-06, + "loss": 0.0777, + "step": 439 + }, + { + "epoch": 0.09222385244183609, + "grad_norm": 0.14101530611515045, + "learning_rate": 9.900038592153616e-06, + "loss": 0.0783, + "step": 440 + }, + { + "epoch": 0.09243345210647663, + "grad_norm": 0.21867899596691132, + "learning_rate": 9.899362025511221e-06, + "loss": 0.0799, + "step": 441 + }, + { + "epoch": 0.09264305177111716, + "grad_norm": 0.3020075857639313, + "learning_rate": 9.898683200263951e-06, + "loss": 0.0903, + "step": 442 + }, + { + "epoch": 0.0928526514357577, + "grad_norm": 0.29326069355010986, + "learning_rate": 9.898002116724743e-06, + "loss": 0.081, + "step": 443 + }, + { + "epoch": 0.09306225110039824, + "grad_norm": 0.2977989912033081, + "learning_rate": 9.897318775207576e-06, + "loss": 0.0794, + "step": 444 + }, + { + "epoch": 0.09327185076503877, + "grad_norm": 0.3779714107513428, + "learning_rate": 9.89663317602747e-06, + "loss": 0.0853, + "step": 445 + }, + { + "epoch": 0.09348145042967931, + "grad_norm": 0.3647499680519104, + "learning_rate": 9.895945319500488e-06, + "loss": 0.0878, + "step": 446 + }, + { + "epoch": 0.09369105009431986, + "grad_norm": 0.18845055997371674, + "learning_rate": 9.895255205943732e-06, + "loss": 0.0776, + "step": 447 + }, + { + "epoch": 0.09390064975896038, + "grad_norm": 0.23743781447410583, + "learning_rate": 9.894562835675343e-06, + "loss": 0.0822, + "step": 448 + }, + { + "epoch": 0.09411024942360093, + "grad_norm": 0.32447198033332825, + "learning_rate": 9.893868209014502e-06, + "loss": 0.0813, + "step": 449 + }, + { + "epoch": 0.09431984908824145, + "grad_norm": 0.22521555423736572, + "learning_rate": 9.893171326281433e-06, + "loss": 0.0779, + "step": 450 + }, + { + "epoch": 0.094529448752882, + "grad_norm": 0.2047656625509262, + "learning_rate": 9.8924721877974e-06, + "loss": 0.0787, + "step": 451 + }, + { + "epoch": 0.09473904841752254, + "grad_norm": 0.241439089179039, + "learning_rate": 9.891770793884703e-06, + "loss": 0.0838, + "step": 452 + }, + { + "epoch": 0.09494864808216306, + "grad_norm": 0.18499858677387238, + "learning_rate": 9.891067144866687e-06, + "loss": 0.0806, + "step": 453 + }, + { + "epoch": 0.0951582477468036, + "grad_norm": 0.19206255674362183, + "learning_rate": 9.890361241067734e-06, + "loss": 0.084, + "step": 454 + }, + { + "epoch": 0.09536784741144415, + "grad_norm": 0.21595366299152374, + "learning_rate": 9.889653082813264e-06, + "loss": 0.0796, + "step": 455 + }, + { + "epoch": 0.09557744707608468, + "grad_norm": 0.19207443296909332, + "learning_rate": 9.888942670429738e-06, + "loss": 0.0791, + "step": 456 + }, + { + "epoch": 0.09578704674072522, + "grad_norm": 0.16599828004837036, + "learning_rate": 9.888230004244657e-06, + "loss": 0.0806, + "step": 457 + }, + { + "epoch": 0.09599664640536575, + "grad_norm": 0.1895645707845688, + "learning_rate": 9.88751508458656e-06, + "loss": 0.0784, + "step": 458 + }, + { + "epoch": 0.09620624607000629, + "grad_norm": 0.18604148924350739, + "learning_rate": 9.886797911785023e-06, + "loss": 0.0858, + "step": 459 + }, + { + "epoch": 0.09641584573464683, + "grad_norm": 0.12998424470424652, + "learning_rate": 9.886078486170665e-06, + "loss": 0.0792, + "step": 460 + }, + { + "epoch": 0.09662544539928736, + "grad_norm": 0.16682370007038116, + "learning_rate": 9.885356808075139e-06, + "loss": 0.0798, + "step": 461 + }, + { + "epoch": 0.0968350450639279, + "grad_norm": 0.17146462202072144, + "learning_rate": 9.884632877831139e-06, + "loss": 0.0797, + "step": 462 + }, + { + "epoch": 0.09704464472856844, + "grad_norm": 0.11582599580287933, + "learning_rate": 9.883906695772399e-06, + "loss": 0.0812, + "step": 463 + }, + { + "epoch": 0.09725424439320897, + "grad_norm": 0.1586909145116806, + "learning_rate": 9.883178262233684e-06, + "loss": 0.0817, + "step": 464 + }, + { + "epoch": 0.09746384405784951, + "grad_norm": 0.1509367823600769, + "learning_rate": 9.882447577550805e-06, + "loss": 0.0845, + "step": 465 + }, + { + "epoch": 0.09767344372249004, + "grad_norm": 0.1255314201116562, + "learning_rate": 9.881714642060609e-06, + "loss": 0.0789, + "step": 466 + }, + { + "epoch": 0.09788304338713058, + "grad_norm": 0.17706221342086792, + "learning_rate": 9.880979456100974e-06, + "loss": 0.0821, + "step": 467 + }, + { + "epoch": 0.09809264305177112, + "grad_norm": 0.23085959255695343, + "learning_rate": 9.880242020010827e-06, + "loss": 0.0801, + "step": 468 + }, + { + "epoch": 0.09830224271641165, + "grad_norm": 0.354623407125473, + "learning_rate": 9.87950233413012e-06, + "loss": 0.0826, + "step": 469 + }, + { + "epoch": 0.09851184238105219, + "grad_norm": 0.4140135943889618, + "learning_rate": 9.87876039879985e-06, + "loss": 0.0901, + "step": 470 + }, + { + "epoch": 0.09872144204569272, + "grad_norm": 0.289201021194458, + "learning_rate": 9.878016214362051e-06, + "loss": 0.0836, + "step": 471 + }, + { + "epoch": 0.09893104171033326, + "grad_norm": 0.16922371089458466, + "learning_rate": 9.877269781159791e-06, + "loss": 0.0786, + "step": 472 + }, + { + "epoch": 0.0991406413749738, + "grad_norm": 0.2799324095249176, + "learning_rate": 9.876521099537173e-06, + "loss": 0.0827, + "step": 473 + }, + { + "epoch": 0.09935024103961433, + "grad_norm": 0.3039519190788269, + "learning_rate": 9.875770169839343e-06, + "loss": 0.0852, + "step": 474 + }, + { + "epoch": 0.09955984070425487, + "grad_norm": 0.19314952194690704, + "learning_rate": 9.875016992412476e-06, + "loss": 0.0777, + "step": 475 + }, + { + "epoch": 0.09976944036889542, + "grad_norm": 0.1906445324420929, + "learning_rate": 9.87426156760379e-06, + "loss": 0.0788, + "step": 476 + }, + { + "epoch": 0.09997904003353594, + "grad_norm": 0.19903278350830078, + "learning_rate": 9.87350389576153e-06, + "loss": 0.0836, + "step": 477 + }, + { + "epoch": 0.10018863969817648, + "grad_norm": 0.2028076946735382, + "learning_rate": 9.872743977234992e-06, + "loss": 0.0797, + "step": 478 + }, + { + "epoch": 0.10039823936281701, + "grad_norm": 0.1903727948665619, + "learning_rate": 9.871981812374488e-06, + "loss": 0.0791, + "step": 479 + }, + { + "epoch": 0.10060783902745755, + "grad_norm": 0.16171634197235107, + "learning_rate": 9.871217401531382e-06, + "loss": 0.0819, + "step": 480 + }, + { + "epoch": 0.1008174386920981, + "grad_norm": 0.18417572975158691, + "learning_rate": 9.870450745058066e-06, + "loss": 0.0836, + "step": 481 + }, + { + "epoch": 0.10102703835673862, + "grad_norm": 0.18208323419094086, + "learning_rate": 9.869681843307968e-06, + "loss": 0.0834, + "step": 482 + }, + { + "epoch": 0.10123663802137917, + "grad_norm": 0.17092318832874298, + "learning_rate": 9.868910696635551e-06, + "loss": 0.0791, + "step": 483 + }, + { + "epoch": 0.10144623768601971, + "grad_norm": 0.13848985731601715, + "learning_rate": 9.868137305396317e-06, + "loss": 0.0766, + "step": 484 + }, + { + "epoch": 0.10165583735066024, + "grad_norm": 0.17035254836082458, + "learning_rate": 9.867361669946793e-06, + "loss": 0.0812, + "step": 485 + }, + { + "epoch": 0.10186543701530078, + "grad_norm": 0.1731637865304947, + "learning_rate": 9.866583790644553e-06, + "loss": 0.078, + "step": 486 + }, + { + "epoch": 0.1020750366799413, + "grad_norm": 0.16337701678276062, + "learning_rate": 9.865803667848195e-06, + "loss": 0.0807, + "step": 487 + }, + { + "epoch": 0.10228463634458185, + "grad_norm": 0.16541989147663116, + "learning_rate": 9.865021301917358e-06, + "loss": 0.0792, + "step": 488 + }, + { + "epoch": 0.10249423600922239, + "grad_norm": 0.13967937231063843, + "learning_rate": 9.864236693212709e-06, + "loss": 0.0748, + "step": 489 + }, + { + "epoch": 0.10270383567386292, + "grad_norm": 0.15016354620456696, + "learning_rate": 9.863449842095955e-06, + "loss": 0.0794, + "step": 490 + }, + { + "epoch": 0.10291343533850346, + "grad_norm": 0.18421977758407593, + "learning_rate": 9.862660748929835e-06, + "loss": 0.0783, + "step": 491 + }, + { + "epoch": 0.103123035003144, + "grad_norm": 0.21068471670150757, + "learning_rate": 9.86186941407812e-06, + "loss": 0.0785, + "step": 492 + }, + { + "epoch": 0.10333263466778453, + "grad_norm": 0.23309852182865143, + "learning_rate": 9.861075837905616e-06, + "loss": 0.0783, + "step": 493 + }, + { + "epoch": 0.10354223433242507, + "grad_norm": 0.23478467762470245, + "learning_rate": 9.860280020778158e-06, + "loss": 0.0806, + "step": 494 + }, + { + "epoch": 0.1037518339970656, + "grad_norm": 0.19462421536445618, + "learning_rate": 9.859481963062623e-06, + "loss": 0.0747, + "step": 495 + }, + { + "epoch": 0.10396143366170614, + "grad_norm": 0.15075665712356567, + "learning_rate": 9.85868166512691e-06, + "loss": 0.0748, + "step": 496 + }, + { + "epoch": 0.10417103332634668, + "grad_norm": 0.13068079948425293, + "learning_rate": 9.85787912733996e-06, + "loss": 0.079, + "step": 497 + }, + { + "epoch": 0.10438063299098721, + "grad_norm": 0.14314256608486176, + "learning_rate": 9.85707435007174e-06, + "loss": 0.0805, + "step": 498 + }, + { + "epoch": 0.10459023265562775, + "grad_norm": 0.18346239626407623, + "learning_rate": 9.856267333693255e-06, + "loss": 0.0804, + "step": 499 + }, + { + "epoch": 0.1047998323202683, + "grad_norm": 0.2016962468624115, + "learning_rate": 9.855458078576537e-06, + "loss": 0.077, + "step": 500 + }, + { + "epoch": 0.10500943198490882, + "grad_norm": 0.18932455778121948, + "learning_rate": 9.854646585094654e-06, + "loss": 0.0811, + "step": 501 + }, + { + "epoch": 0.10521903164954936, + "grad_norm": 0.1751541942358017, + "learning_rate": 9.853832853621703e-06, + "loss": 0.0776, + "step": 502 + }, + { + "epoch": 0.10542863131418989, + "grad_norm": 0.18486347794532776, + "learning_rate": 9.853016884532814e-06, + "loss": 0.077, + "step": 503 + }, + { + "epoch": 0.10563823097883043, + "grad_norm": 0.20413736999034882, + "learning_rate": 9.85219867820415e-06, + "loss": 0.0793, + "step": 504 + }, + { + "epoch": 0.10584783064347097, + "grad_norm": 0.19140803813934326, + "learning_rate": 9.851378235012905e-06, + "loss": 0.0792, + "step": 505 + }, + { + "epoch": 0.1060574303081115, + "grad_norm": 0.14764432609081268, + "learning_rate": 9.8505555553373e-06, + "loss": 0.0826, + "step": 506 + }, + { + "epoch": 0.10626702997275204, + "grad_norm": 0.1288614273071289, + "learning_rate": 9.84973063955659e-06, + "loss": 0.0768, + "step": 507 + }, + { + "epoch": 0.10647662963739259, + "grad_norm": 0.14826104044914246, + "learning_rate": 9.848903488051065e-06, + "loss": 0.0772, + "step": 508 + }, + { + "epoch": 0.10668622930203311, + "grad_norm": 0.14957068860530853, + "learning_rate": 9.848074101202037e-06, + "loss": 0.0755, + "step": 509 + }, + { + "epoch": 0.10689582896667366, + "grad_norm": 0.11332403123378754, + "learning_rate": 9.847242479391857e-06, + "loss": 0.0733, + "step": 510 + }, + { + "epoch": 0.10710542863131418, + "grad_norm": 0.10584530979394913, + "learning_rate": 9.8464086230039e-06, + "loss": 0.0752, + "step": 511 + }, + { + "epoch": 0.10731502829595473, + "grad_norm": 0.1642168015241623, + "learning_rate": 9.845572532422576e-06, + "loss": 0.0752, + "step": 512 + }, + { + "epoch": 0.10752462796059527, + "grad_norm": 0.22667261958122253, + "learning_rate": 9.844734208033318e-06, + "loss": 0.0826, + "step": 513 + }, + { + "epoch": 0.1077342276252358, + "grad_norm": 0.2658557593822479, + "learning_rate": 9.843893650222599e-06, + "loss": 0.0838, + "step": 514 + }, + { + "epoch": 0.10794382728987634, + "grad_norm": 0.27352774143218994, + "learning_rate": 9.843050859377913e-06, + "loss": 0.0832, + "step": 515 + }, + { + "epoch": 0.10815342695451688, + "grad_norm": 0.24916625022888184, + "learning_rate": 9.842205835887785e-06, + "loss": 0.0777, + "step": 516 + }, + { + "epoch": 0.10836302661915741, + "grad_norm": 0.2406039983034134, + "learning_rate": 9.841358580141775e-06, + "loss": 0.0757, + "step": 517 + }, + { + "epoch": 0.10857262628379795, + "grad_norm": 0.23990845680236816, + "learning_rate": 9.840509092530465e-06, + "loss": 0.0778, + "step": 518 + }, + { + "epoch": 0.10878222594843848, + "grad_norm": 0.21896076202392578, + "learning_rate": 9.839657373445468e-06, + "loss": 0.0787, + "step": 519 + }, + { + "epoch": 0.10899182561307902, + "grad_norm": 0.2108394056558609, + "learning_rate": 9.838803423279428e-06, + "loss": 0.0782, + "step": 520 + }, + { + "epoch": 0.10920142527771956, + "grad_norm": 0.20889486372470856, + "learning_rate": 9.837947242426013e-06, + "loss": 0.078, + "step": 521 + }, + { + "epoch": 0.10941102494236009, + "grad_norm": 0.20450422167778015, + "learning_rate": 9.837088831279927e-06, + "loss": 0.0837, + "step": 522 + }, + { + "epoch": 0.10962062460700063, + "grad_norm": 0.19421492516994476, + "learning_rate": 9.836228190236892e-06, + "loss": 0.0744, + "step": 523 + }, + { + "epoch": 0.10983022427164117, + "grad_norm": 0.19942732155323029, + "learning_rate": 9.835365319693667e-06, + "loss": 0.0774, + "step": 524 + }, + { + "epoch": 0.1100398239362817, + "grad_norm": 0.2001711130142212, + "learning_rate": 9.834500220048034e-06, + "loss": 0.0787, + "step": 525 + }, + { + "epoch": 0.11024942360092224, + "grad_norm": 0.1794813573360443, + "learning_rate": 9.833632891698801e-06, + "loss": 0.0814, + "step": 526 + }, + { + "epoch": 0.11045902326556277, + "grad_norm": 0.16165196895599365, + "learning_rate": 9.832763335045812e-06, + "loss": 0.0761, + "step": 527 + }, + { + "epoch": 0.11066862293020331, + "grad_norm": 0.18171708285808563, + "learning_rate": 9.83189155048993e-06, + "loss": 0.0809, + "step": 528 + }, + { + "epoch": 0.11087822259484385, + "grad_norm": 0.1997329592704773, + "learning_rate": 9.831017538433045e-06, + "loss": 0.0764, + "step": 529 + }, + { + "epoch": 0.11108782225948438, + "grad_norm": 0.19061146676540375, + "learning_rate": 9.83014129927808e-06, + "loss": 0.0783, + "step": 530 + }, + { + "epoch": 0.11129742192412492, + "grad_norm": 0.1783018410205841, + "learning_rate": 9.829262833428978e-06, + "loss": 0.079, + "step": 531 + }, + { + "epoch": 0.11150702158876546, + "grad_norm": 0.18371321260929108, + "learning_rate": 9.828382141290713e-06, + "loss": 0.0769, + "step": 532 + }, + { + "epoch": 0.11171662125340599, + "grad_norm": 0.16765272617340088, + "learning_rate": 9.827499223269285e-06, + "loss": 0.0772, + "step": 533 + }, + { + "epoch": 0.11192622091804653, + "grad_norm": 0.12363309413194656, + "learning_rate": 9.826614079771716e-06, + "loss": 0.0765, + "step": 534 + }, + { + "epoch": 0.11213582058268706, + "grad_norm": 0.10953837633132935, + "learning_rate": 9.82572671120606e-06, + "loss": 0.0817, + "step": 535 + }, + { + "epoch": 0.1123454202473276, + "grad_norm": 0.14569856226444244, + "learning_rate": 9.824837117981392e-06, + "loss": 0.0754, + "step": 536 + }, + { + "epoch": 0.11255501991196815, + "grad_norm": 0.16665959358215332, + "learning_rate": 9.823945300507815e-06, + "loss": 0.0772, + "step": 537 + }, + { + "epoch": 0.11276461957660867, + "grad_norm": 0.167064368724823, + "learning_rate": 9.823051259196456e-06, + "loss": 0.0795, + "step": 538 + }, + { + "epoch": 0.11297421924124922, + "grad_norm": 0.1980743259191513, + "learning_rate": 9.82215499445947e-06, + "loss": 0.0751, + "step": 539 + }, + { + "epoch": 0.11318381890588974, + "grad_norm": 0.2310371845960617, + "learning_rate": 9.821256506710032e-06, + "loss": 0.0757, + "step": 540 + }, + { + "epoch": 0.11339341857053029, + "grad_norm": 0.2004477083683014, + "learning_rate": 9.820355796362346e-06, + "loss": 0.0774, + "step": 541 + }, + { + "epoch": 0.11360301823517083, + "grad_norm": 0.16151970624923706, + "learning_rate": 9.81945286383164e-06, + "loss": 0.0766, + "step": 542 + }, + { + "epoch": 0.11381261789981136, + "grad_norm": 0.2251511514186859, + "learning_rate": 9.818547709534163e-06, + "loss": 0.0783, + "step": 543 + }, + { + "epoch": 0.1140222175644519, + "grad_norm": 0.2668745219707489, + "learning_rate": 9.817640333887194e-06, + "loss": 0.0803, + "step": 544 + }, + { + "epoch": 0.11423181722909244, + "grad_norm": 0.24963510036468506, + "learning_rate": 9.816730737309032e-06, + "loss": 0.0782, + "step": 545 + }, + { + "epoch": 0.11444141689373297, + "grad_norm": 0.2355589121580124, + "learning_rate": 9.815818920219e-06, + "loss": 0.0758, + "step": 546 + }, + { + "epoch": 0.11465101655837351, + "grad_norm": 0.20662012696266174, + "learning_rate": 9.814904883037445e-06, + "loss": 0.0769, + "step": 547 + }, + { + "epoch": 0.11486061622301404, + "grad_norm": 0.15815681219100952, + "learning_rate": 9.813988626185742e-06, + "loss": 0.0782, + "step": 548 + }, + { + "epoch": 0.11507021588765458, + "grad_norm": 0.16817304491996765, + "learning_rate": 9.81307015008628e-06, + "loss": 0.0769, + "step": 549 + }, + { + "epoch": 0.11527981555229512, + "grad_norm": 0.19368018209934235, + "learning_rate": 9.81214945516248e-06, + "loss": 0.0796, + "step": 550 + }, + { + "epoch": 0.11548941521693565, + "grad_norm": 0.2023620903491974, + "learning_rate": 9.81122654183878e-06, + "loss": 0.0763, + "step": 551 + }, + { + "epoch": 0.11569901488157619, + "grad_norm": 0.2068459391593933, + "learning_rate": 9.810301410540643e-06, + "loss": 0.074, + "step": 552 + }, + { + "epoch": 0.11590861454621673, + "grad_norm": 0.1989372819662094, + "learning_rate": 9.809374061694555e-06, + "loss": 0.0758, + "step": 553 + }, + { + "epoch": 0.11611821421085726, + "grad_norm": 0.17017100751399994, + "learning_rate": 9.808444495728024e-06, + "loss": 0.0743, + "step": 554 + }, + { + "epoch": 0.1163278138754978, + "grad_norm": 0.16604629158973694, + "learning_rate": 9.80751271306958e-06, + "loss": 0.0794, + "step": 555 + }, + { + "epoch": 0.11653741354013833, + "grad_norm": 0.1727730929851532, + "learning_rate": 9.806578714148774e-06, + "loss": 0.0776, + "step": 556 + }, + { + "epoch": 0.11674701320477887, + "grad_norm": 0.18742415308952332, + "learning_rate": 9.805642499396177e-06, + "loss": 0.0755, + "step": 557 + }, + { + "epoch": 0.11695661286941941, + "grad_norm": 0.22661146521568298, + "learning_rate": 9.804704069243389e-06, + "loss": 0.0803, + "step": 558 + }, + { + "epoch": 0.11716621253405994, + "grad_norm": 0.25402432680130005, + "learning_rate": 9.80376342412302e-06, + "loss": 0.0761, + "step": 559 + }, + { + "epoch": 0.11737581219870048, + "grad_norm": 0.2397623509168625, + "learning_rate": 9.802820564468712e-06, + "loss": 0.0736, + "step": 560 + }, + { + "epoch": 0.11758541186334102, + "grad_norm": 0.20311333239078522, + "learning_rate": 9.801875490715123e-06, + "loss": 0.076, + "step": 561 + }, + { + "epoch": 0.11779501152798155, + "grad_norm": 0.16478881239891052, + "learning_rate": 9.800928203297927e-06, + "loss": 0.0755, + "step": 562 + }, + { + "epoch": 0.1180046111926221, + "grad_norm": 0.14853514730930328, + "learning_rate": 9.79997870265383e-06, + "loss": 0.0707, + "step": 563 + }, + { + "epoch": 0.11821421085726262, + "grad_norm": 0.17599347233772278, + "learning_rate": 9.799026989220543e-06, + "loss": 0.0779, + "step": 564 + }, + { + "epoch": 0.11842381052190316, + "grad_norm": 0.2129105031490326, + "learning_rate": 9.798073063436815e-06, + "loss": 0.0767, + "step": 565 + }, + { + "epoch": 0.1186334101865437, + "grad_norm": 0.23926801979541779, + "learning_rate": 9.7971169257424e-06, + "loss": 0.0771, + "step": 566 + }, + { + "epoch": 0.11884300985118423, + "grad_norm": 0.2424420565366745, + "learning_rate": 9.79615857657808e-06, + "loss": 0.0823, + "step": 567 + }, + { + "epoch": 0.11905260951582478, + "grad_norm": 0.21663707494735718, + "learning_rate": 9.795198016385651e-06, + "loss": 0.0778, + "step": 568 + }, + { + "epoch": 0.11926220918046532, + "grad_norm": 0.21969793736934662, + "learning_rate": 9.794235245607933e-06, + "loss": 0.0756, + "step": 569 + }, + { + "epoch": 0.11947180884510585, + "grad_norm": 0.26351258158683777, + "learning_rate": 9.793270264688763e-06, + "loss": 0.0776, + "step": 570 + }, + { + "epoch": 0.11968140850974639, + "grad_norm": 0.3025349974632263, + "learning_rate": 9.792303074072995e-06, + "loss": 0.0747, + "step": 571 + }, + { + "epoch": 0.11989100817438691, + "grad_norm": 0.30396711826324463, + "learning_rate": 9.791333674206507e-06, + "loss": 0.0796, + "step": 572 + }, + { + "epoch": 0.12010060783902746, + "grad_norm": 0.23939970135688782, + "learning_rate": 9.790362065536189e-06, + "loss": 0.0797, + "step": 573 + }, + { + "epoch": 0.120310207503668, + "grad_norm": 0.1715303361415863, + "learning_rate": 9.789388248509957e-06, + "loss": 0.075, + "step": 574 + }, + { + "epoch": 0.12051980716830853, + "grad_norm": 0.16927644610404968, + "learning_rate": 9.788412223576734e-06, + "loss": 0.0821, + "step": 575 + }, + { + "epoch": 0.12072940683294907, + "grad_norm": 0.18329590559005737, + "learning_rate": 9.787433991186472e-06, + "loss": 0.0732, + "step": 576 + }, + { + "epoch": 0.12093900649758961, + "grad_norm": 0.18802891671657562, + "learning_rate": 9.786453551790133e-06, + "loss": 0.0783, + "step": 577 + }, + { + "epoch": 0.12114860616223014, + "grad_norm": 0.1802418977022171, + "learning_rate": 9.785470905839703e-06, + "loss": 0.0733, + "step": 578 + }, + { + "epoch": 0.12135820582687068, + "grad_norm": 0.19172324240207672, + "learning_rate": 9.784486053788179e-06, + "loss": 0.0771, + "step": 579 + }, + { + "epoch": 0.12156780549151121, + "grad_norm": 0.1751202493906021, + "learning_rate": 9.783498996089577e-06, + "loss": 0.0773, + "step": 580 + }, + { + "epoch": 0.12177740515615175, + "grad_norm": 0.12133464962244034, + "learning_rate": 9.782509733198932e-06, + "loss": 0.0751, + "step": 581 + }, + { + "epoch": 0.12198700482079229, + "grad_norm": 0.14133097231388092, + "learning_rate": 9.781518265572294e-06, + "loss": 0.0748, + "step": 582 + }, + { + "epoch": 0.12219660448543282, + "grad_norm": 0.18668805062770844, + "learning_rate": 9.780524593666727e-06, + "loss": 0.0736, + "step": 583 + }, + { + "epoch": 0.12240620415007336, + "grad_norm": 0.18670430779457092, + "learning_rate": 9.779528717940317e-06, + "loss": 0.0739, + "step": 584 + }, + { + "epoch": 0.1226158038147139, + "grad_norm": 0.19386443495750427, + "learning_rate": 9.77853063885216e-06, + "loss": 0.079, + "step": 585 + }, + { + "epoch": 0.12282540347935443, + "grad_norm": 0.19840072095394135, + "learning_rate": 9.77753035686237e-06, + "loss": 0.0821, + "step": 586 + }, + { + "epoch": 0.12303500314399497, + "grad_norm": 0.2034991979598999, + "learning_rate": 9.77652787243208e-06, + "loss": 0.0744, + "step": 587 + }, + { + "epoch": 0.1232446028086355, + "grad_norm": 0.22970953583717346, + "learning_rate": 9.775523186023432e-06, + "loss": 0.0718, + "step": 588 + }, + { + "epoch": 0.12345420247327604, + "grad_norm": 0.20958104729652405, + "learning_rate": 9.774516298099588e-06, + "loss": 0.0769, + "step": 589 + }, + { + "epoch": 0.12366380213791658, + "grad_norm": 0.1604912430047989, + "learning_rate": 9.773507209124721e-06, + "loss": 0.0713, + "step": 590 + }, + { + "epoch": 0.12387340180255711, + "grad_norm": 0.16623252630233765, + "learning_rate": 9.772495919564022e-06, + "loss": 0.0728, + "step": 591 + }, + { + "epoch": 0.12408300146719765, + "grad_norm": 0.20733430981636047, + "learning_rate": 9.771482429883697e-06, + "loss": 0.0747, + "step": 592 + }, + { + "epoch": 0.1242926011318382, + "grad_norm": 0.2416054755449295, + "learning_rate": 9.770466740550963e-06, + "loss": 0.0795, + "step": 593 + }, + { + "epoch": 0.12450220079647872, + "grad_norm": 0.21449747681617737, + "learning_rate": 9.769448852034051e-06, + "loss": 0.0782, + "step": 594 + }, + { + "epoch": 0.12471180046111927, + "grad_norm": 0.15167087316513062, + "learning_rate": 9.768428764802209e-06, + "loss": 0.0818, + "step": 595 + }, + { + "epoch": 0.1249214001257598, + "grad_norm": 0.15075545012950897, + "learning_rate": 9.767406479325698e-06, + "loss": 0.0739, + "step": 596 + }, + { + "epoch": 0.12513099979040035, + "grad_norm": 0.1983119398355484, + "learning_rate": 9.76638199607579e-06, + "loss": 0.076, + "step": 597 + }, + { + "epoch": 0.12534059945504086, + "grad_norm": 0.18619777262210846, + "learning_rate": 9.765355315524772e-06, + "loss": 0.0787, + "step": 598 + }, + { + "epoch": 0.1255501991196814, + "grad_norm": 0.13842901587486267, + "learning_rate": 9.764326438145944e-06, + "loss": 0.078, + "step": 599 + }, + { + "epoch": 0.12575979878432195, + "grad_norm": 0.16321855783462524, + "learning_rate": 9.763295364413616e-06, + "loss": 0.0754, + "step": 600 + }, + { + "epoch": 0.1259693984489625, + "grad_norm": 0.18347276747226715, + "learning_rate": 9.762262094803115e-06, + "loss": 0.0757, + "step": 601 + }, + { + "epoch": 0.12617899811360303, + "grad_norm": 0.17401129007339478, + "learning_rate": 9.761226629790777e-06, + "loss": 0.0757, + "step": 602 + }, + { + "epoch": 0.12638859777824354, + "grad_norm": 0.19704696536064148, + "learning_rate": 9.760188969853953e-06, + "loss": 0.0786, + "step": 603 + }, + { + "epoch": 0.1265981974428841, + "grad_norm": 0.21989691257476807, + "learning_rate": 9.759149115471001e-06, + "loss": 0.0737, + "step": 604 + }, + { + "epoch": 0.12680779710752463, + "grad_norm": 0.19509382545948029, + "learning_rate": 9.758107067121298e-06, + "loss": 0.0775, + "step": 605 + }, + { + "epoch": 0.12701739677216517, + "grad_norm": 0.16452129185199738, + "learning_rate": 9.757062825285223e-06, + "loss": 0.0751, + "step": 606 + }, + { + "epoch": 0.1272269964368057, + "grad_norm": 0.16775386035442352, + "learning_rate": 9.756016390444174e-06, + "loss": 0.0785, + "step": 607 + }, + { + "epoch": 0.12743659610144623, + "grad_norm": 0.1647682636976242, + "learning_rate": 9.754967763080558e-06, + "loss": 0.0779, + "step": 608 + }, + { + "epoch": 0.12764619576608677, + "grad_norm": 0.15964777767658234, + "learning_rate": 9.75391694367779e-06, + "loss": 0.0757, + "step": 609 + }, + { + "epoch": 0.1278557954307273, + "grad_norm": 0.1809694468975067, + "learning_rate": 9.7528639327203e-06, + "loss": 0.0745, + "step": 610 + }, + { + "epoch": 0.12806539509536785, + "grad_norm": 0.21518062055110931, + "learning_rate": 9.751808730693521e-06, + "loss": 0.0721, + "step": 611 + }, + { + "epoch": 0.1282749947600084, + "grad_norm": 0.23137721419334412, + "learning_rate": 9.750751338083906e-06, + "loss": 0.0773, + "step": 612 + }, + { + "epoch": 0.1284845944246489, + "grad_norm": 0.25104039907455444, + "learning_rate": 9.749691755378912e-06, + "loss": 0.0789, + "step": 613 + }, + { + "epoch": 0.12869419408928945, + "grad_norm": 0.3227115869522095, + "learning_rate": 9.748629983067004e-06, + "loss": 0.0814, + "step": 614 + }, + { + "epoch": 0.12890379375393, + "grad_norm": 0.3414157032966614, + "learning_rate": 9.747566021637662e-06, + "loss": 0.0784, + "step": 615 + }, + { + "epoch": 0.12911339341857053, + "grad_norm": 0.2576848566532135, + "learning_rate": 9.746499871581368e-06, + "loss": 0.0821, + "step": 616 + }, + { + "epoch": 0.12932299308321107, + "grad_norm": 0.18533697724342346, + "learning_rate": 9.74543153338962e-06, + "loss": 0.0791, + "step": 617 + }, + { + "epoch": 0.12953259274785162, + "grad_norm": 0.21570216119289398, + "learning_rate": 9.744361007554922e-06, + "loss": 0.0762, + "step": 618 + }, + { + "epoch": 0.12974219241249213, + "grad_norm": 0.2508070170879364, + "learning_rate": 9.743288294570784e-06, + "loss": 0.0747, + "step": 619 + }, + { + "epoch": 0.12995179207713267, + "grad_norm": 0.23686811327934265, + "learning_rate": 9.742213394931726e-06, + "loss": 0.0766, + "step": 620 + }, + { + "epoch": 0.1301613917417732, + "grad_norm": 0.18374064564704895, + "learning_rate": 9.741136309133279e-06, + "loss": 0.0751, + "step": 621 + }, + { + "epoch": 0.13037099140641376, + "grad_norm": 0.21081578731536865, + "learning_rate": 9.74005703767198e-06, + "loss": 0.0769, + "step": 622 + }, + { + "epoch": 0.1305805910710543, + "grad_norm": 0.20336848497390747, + "learning_rate": 9.738975581045368e-06, + "loss": 0.0788, + "step": 623 + }, + { + "epoch": 0.1307901907356948, + "grad_norm": 0.1740516871213913, + "learning_rate": 9.737891939752e-06, + "loss": 0.0744, + "step": 624 + }, + { + "epoch": 0.13099979040033535, + "grad_norm": 0.19728244841098785, + "learning_rate": 9.73680611429143e-06, + "loss": 0.0737, + "step": 625 + }, + { + "epoch": 0.1312093900649759, + "grad_norm": 0.19369123876094818, + "learning_rate": 9.735718105164228e-06, + "loss": 0.0759, + "step": 626 + }, + { + "epoch": 0.13141898972961644, + "grad_norm": 0.1801711618900299, + "learning_rate": 9.734627912871962e-06, + "loss": 0.0759, + "step": 627 + }, + { + "epoch": 0.13162858939425698, + "grad_norm": 0.1812150627374649, + "learning_rate": 9.733535537917211e-06, + "loss": 0.078, + "step": 628 + }, + { + "epoch": 0.1318381890588975, + "grad_norm": 0.17281141877174377, + "learning_rate": 9.732440980803561e-06, + "loss": 0.0759, + "step": 629 + }, + { + "epoch": 0.13204778872353803, + "grad_norm": 0.18802779912948608, + "learning_rate": 9.7313442420356e-06, + "loss": 0.075, + "step": 630 + }, + { + "epoch": 0.13225738838817858, + "grad_norm": 0.17967858910560608, + "learning_rate": 9.730245322118929e-06, + "loss": 0.0737, + "step": 631 + }, + { + "epoch": 0.13246698805281912, + "grad_norm": 0.15727940201759338, + "learning_rate": 9.729144221560145e-06, + "loss": 0.0728, + "step": 632 + }, + { + "epoch": 0.13267658771745966, + "grad_norm": 0.1466912478208542, + "learning_rate": 9.72804094086686e-06, + "loss": 0.0765, + "step": 633 + }, + { + "epoch": 0.1328861873821002, + "grad_norm": 0.14428047835826874, + "learning_rate": 9.726935480547679e-06, + "loss": 0.0759, + "step": 634 + }, + { + "epoch": 0.13309578704674072, + "grad_norm": 0.14207249879837036, + "learning_rate": 9.725827841112226e-06, + "loss": 0.0763, + "step": 635 + }, + { + "epoch": 0.13330538671138126, + "grad_norm": 0.1447642594575882, + "learning_rate": 9.72471802307112e-06, + "loss": 0.0738, + "step": 636 + }, + { + "epoch": 0.1335149863760218, + "grad_norm": 0.15724752843379974, + "learning_rate": 9.723606026935986e-06, + "loss": 0.0746, + "step": 637 + }, + { + "epoch": 0.13372458604066234, + "grad_norm": 0.13662230968475342, + "learning_rate": 9.722491853219455e-06, + "loss": 0.0735, + "step": 638 + }, + { + "epoch": 0.13393418570530288, + "grad_norm": 0.14362376928329468, + "learning_rate": 9.72137550243516e-06, + "loss": 0.0738, + "step": 639 + }, + { + "epoch": 0.1341437853699434, + "grad_norm": 0.18275748193264008, + "learning_rate": 9.720256975097741e-06, + "loss": 0.0806, + "step": 640 + }, + { + "epoch": 0.13435338503458394, + "grad_norm": 0.17419646680355072, + "learning_rate": 9.719136271722835e-06, + "loss": 0.076, + "step": 641 + }, + { + "epoch": 0.13456298469922448, + "grad_norm": 0.15311293303966522, + "learning_rate": 9.718013392827087e-06, + "loss": 0.0743, + "step": 642 + }, + { + "epoch": 0.13477258436386502, + "grad_norm": 0.18459968268871307, + "learning_rate": 9.716888338928147e-06, + "loss": 0.0764, + "step": 643 + }, + { + "epoch": 0.13498218402850556, + "grad_norm": 0.2108069807291031, + "learning_rate": 9.715761110544663e-06, + "loss": 0.0752, + "step": 644 + }, + { + "epoch": 0.13519178369314608, + "grad_norm": 0.19579482078552246, + "learning_rate": 9.714631708196287e-06, + "loss": 0.0788, + "step": 645 + }, + { + "epoch": 0.13540138335778662, + "grad_norm": 0.18873129785060883, + "learning_rate": 9.71350013240367e-06, + "loss": 0.0733, + "step": 646 + }, + { + "epoch": 0.13561098302242716, + "grad_norm": 0.21143223345279694, + "learning_rate": 9.712366383688474e-06, + "loss": 0.0746, + "step": 647 + }, + { + "epoch": 0.1358205826870677, + "grad_norm": 0.23884397745132446, + "learning_rate": 9.71123046257335e-06, + "loss": 0.0744, + "step": 648 + }, + { + "epoch": 0.13603018235170825, + "grad_norm": 0.22789151966571808, + "learning_rate": 9.710092369581966e-06, + "loss": 0.0766, + "step": 649 + }, + { + "epoch": 0.1362397820163488, + "grad_norm": 0.18522526323795319, + "learning_rate": 9.708952105238975e-06, + "loss": 0.0743, + "step": 650 + }, + { + "epoch": 0.1364493816809893, + "grad_norm": 0.20992542803287506, + "learning_rate": 9.707809670070043e-06, + "loss": 0.073, + "step": 651 + }, + { + "epoch": 0.13665898134562984, + "grad_norm": 0.1938726305961609, + "learning_rate": 9.706665064601831e-06, + "loss": 0.0729, + "step": 652 + }, + { + "epoch": 0.13686858101027038, + "grad_norm": 0.12811291217803955, + "learning_rate": 9.705518289362001e-06, + "loss": 0.0764, + "step": 653 + }, + { + "epoch": 0.13707818067491093, + "grad_norm": 0.19632869958877563, + "learning_rate": 9.704369344879219e-06, + "loss": 0.0765, + "step": 654 + }, + { + "epoch": 0.13728778033955147, + "grad_norm": 0.21263957023620605, + "learning_rate": 9.703218231683143e-06, + "loss": 0.0765, + "step": 655 + }, + { + "epoch": 0.13749738000419198, + "grad_norm": 0.18144431710243225, + "learning_rate": 9.702064950304442e-06, + "loss": 0.0739, + "step": 656 + }, + { + "epoch": 0.13770697966883252, + "grad_norm": 0.21412257850170135, + "learning_rate": 9.700909501274773e-06, + "loss": 0.0771, + "step": 657 + }, + { + "epoch": 0.13791657933347307, + "grad_norm": 0.2056640088558197, + "learning_rate": 9.699751885126803e-06, + "loss": 0.0746, + "step": 658 + }, + { + "epoch": 0.1381261789981136, + "grad_norm": 0.17441654205322266, + "learning_rate": 9.698592102394188e-06, + "loss": 0.0806, + "step": 659 + }, + { + "epoch": 0.13833577866275415, + "grad_norm": 0.18476596474647522, + "learning_rate": 9.697430153611592e-06, + "loss": 0.0764, + "step": 660 + }, + { + "epoch": 0.13854537832739466, + "grad_norm": 0.15858349204063416, + "learning_rate": 9.69626603931467e-06, + "loss": 0.0748, + "step": 661 + }, + { + "epoch": 0.1387549779920352, + "grad_norm": 0.14706814289093018, + "learning_rate": 9.695099760040079e-06, + "loss": 0.0737, + "step": 662 + }, + { + "epoch": 0.13896457765667575, + "grad_norm": 0.18588264286518097, + "learning_rate": 9.693931316325473e-06, + "loss": 0.0706, + "step": 663 + }, + { + "epoch": 0.1391741773213163, + "grad_norm": 0.18025395274162292, + "learning_rate": 9.692760708709506e-06, + "loss": 0.0763, + "step": 664 + }, + { + "epoch": 0.13938377698595683, + "grad_norm": 0.18428079783916473, + "learning_rate": 9.691587937731827e-06, + "loss": 0.0772, + "step": 665 + }, + { + "epoch": 0.13959337665059737, + "grad_norm": 0.20305953919887543, + "learning_rate": 9.690413003933084e-06, + "loss": 0.0742, + "step": 666 + }, + { + "epoch": 0.1398029763152379, + "grad_norm": 0.21288353204727173, + "learning_rate": 9.68923590785492e-06, + "loss": 0.077, + "step": 667 + }, + { + "epoch": 0.14001257597987843, + "grad_norm": 0.2299814224243164, + "learning_rate": 9.688056650039976e-06, + "loss": 0.0754, + "step": 668 + }, + { + "epoch": 0.14022217564451897, + "grad_norm": 0.23318803310394287, + "learning_rate": 9.68687523103189e-06, + "loss": 0.0748, + "step": 669 + }, + { + "epoch": 0.1404317753091595, + "grad_norm": 0.21237865090370178, + "learning_rate": 9.685691651375297e-06, + "loss": 0.0719, + "step": 670 + }, + { + "epoch": 0.14064137497380005, + "grad_norm": 0.20599277317523956, + "learning_rate": 9.684505911615825e-06, + "loss": 0.0731, + "step": 671 + }, + { + "epoch": 0.14085097463844057, + "grad_norm": 0.2014642357826233, + "learning_rate": 9.683318012300103e-06, + "loss": 0.0739, + "step": 672 + }, + { + "epoch": 0.1410605743030811, + "grad_norm": 0.21525225043296814, + "learning_rate": 9.682127953975748e-06, + "loss": 0.0796, + "step": 673 + }, + { + "epoch": 0.14127017396772165, + "grad_norm": 0.20060989260673523, + "learning_rate": 9.68093573719138e-06, + "loss": 0.0717, + "step": 674 + }, + { + "epoch": 0.1414797736323622, + "grad_norm": 0.19899265468120575, + "learning_rate": 9.679741362496608e-06, + "loss": 0.0723, + "step": 675 + }, + { + "epoch": 0.14168937329700274, + "grad_norm": 0.1738089621067047, + "learning_rate": 9.678544830442041e-06, + "loss": 0.0776, + "step": 676 + }, + { + "epoch": 0.14189897296164325, + "grad_norm": 0.1539677530527115, + "learning_rate": 9.677346141579277e-06, + "loss": 0.0725, + "step": 677 + }, + { + "epoch": 0.1421085726262838, + "grad_norm": 0.16590017080307007, + "learning_rate": 9.676145296460917e-06, + "loss": 0.0733, + "step": 678 + }, + { + "epoch": 0.14231817229092433, + "grad_norm": 0.17394165694713593, + "learning_rate": 9.674942295640544e-06, + "loss": 0.0744, + "step": 679 + }, + { + "epoch": 0.14252777195556487, + "grad_norm": 0.17285530269145966, + "learning_rate": 9.673737139672746e-06, + "loss": 0.0729, + "step": 680 + }, + { + "epoch": 0.14273737162020542, + "grad_norm": 0.14739976823329926, + "learning_rate": 9.672529829113095e-06, + "loss": 0.073, + "step": 681 + }, + { + "epoch": 0.14294697128484593, + "grad_norm": 0.1775324046611786, + "learning_rate": 9.671320364518164e-06, + "loss": 0.0745, + "step": 682 + }, + { + "epoch": 0.14315657094948647, + "grad_norm": 0.1740763634443283, + "learning_rate": 9.670108746445514e-06, + "loss": 0.0734, + "step": 683 + }, + { + "epoch": 0.14336617061412701, + "grad_norm": 0.14297354221343994, + "learning_rate": 9.668894975453705e-06, + "loss": 0.0763, + "step": 684 + }, + { + "epoch": 0.14357577027876756, + "grad_norm": 0.1698678582906723, + "learning_rate": 9.66767905210228e-06, + "loss": 0.0727, + "step": 685 + }, + { + "epoch": 0.1437853699434081, + "grad_norm": 0.18393908441066742, + "learning_rate": 9.666460976951783e-06, + "loss": 0.0755, + "step": 686 + }, + { + "epoch": 0.14399496960804864, + "grad_norm": 0.16469605267047882, + "learning_rate": 9.665240750563743e-06, + "loss": 0.0718, + "step": 687 + }, + { + "epoch": 0.14420456927268915, + "grad_norm": 0.15286237001419067, + "learning_rate": 9.664018373500688e-06, + "loss": 0.0774, + "step": 688 + }, + { + "epoch": 0.1444141689373297, + "grad_norm": 0.15138964354991913, + "learning_rate": 9.662793846326131e-06, + "loss": 0.0738, + "step": 689 + }, + { + "epoch": 0.14462376860197024, + "grad_norm": 0.13426071405410767, + "learning_rate": 9.661567169604579e-06, + "loss": 0.0732, + "step": 690 + }, + { + "epoch": 0.14483336826661078, + "grad_norm": 0.14377839863300323, + "learning_rate": 9.66033834390153e-06, + "loss": 0.0743, + "step": 691 + }, + { + "epoch": 0.14504296793125132, + "grad_norm": 0.14970825612545013, + "learning_rate": 9.659107369783473e-06, + "loss": 0.0719, + "step": 692 + }, + { + "epoch": 0.14525256759589183, + "grad_norm": 0.12804554402828217, + "learning_rate": 9.657874247817886e-06, + "loss": 0.0715, + "step": 693 + }, + { + "epoch": 0.14546216726053238, + "grad_norm": 0.13127097487449646, + "learning_rate": 9.656638978573238e-06, + "loss": 0.0761, + "step": 694 + }, + { + "epoch": 0.14567176692517292, + "grad_norm": 0.16138702630996704, + "learning_rate": 9.655401562618989e-06, + "loss": 0.0727, + "step": 695 + }, + { + "epoch": 0.14588136658981346, + "grad_norm": 0.16640619933605194, + "learning_rate": 9.654162000525585e-06, + "loss": 0.0704, + "step": 696 + }, + { + "epoch": 0.146090966254454, + "grad_norm": 0.17615856230258942, + "learning_rate": 9.652920292864467e-06, + "loss": 0.0763, + "step": 697 + }, + { + "epoch": 0.14630056591909452, + "grad_norm": 0.20064422488212585, + "learning_rate": 9.65167644020806e-06, + "loss": 0.0709, + "step": 698 + }, + { + "epoch": 0.14651016558373506, + "grad_norm": 0.2409866899251938, + "learning_rate": 9.650430443129781e-06, + "loss": 0.0752, + "step": 699 + }, + { + "epoch": 0.1467197652483756, + "grad_norm": 0.28271573781967163, + "learning_rate": 9.649182302204034e-06, + "loss": 0.0732, + "step": 700 + }, + { + "epoch": 0.14692936491301614, + "grad_norm": 0.29788634181022644, + "learning_rate": 9.64793201800621e-06, + "loss": 0.0734, + "step": 701 + }, + { + "epoch": 0.14713896457765668, + "grad_norm": 0.25497421622276306, + "learning_rate": 9.64667959111269e-06, + "loss": 0.0739, + "step": 702 + }, + { + "epoch": 0.14734856424229723, + "grad_norm": 0.1792333573102951, + "learning_rate": 9.645425022100847e-06, + "loss": 0.0752, + "step": 703 + }, + { + "epoch": 0.14755816390693774, + "grad_norm": 0.16406488418579102, + "learning_rate": 9.644168311549032e-06, + "loss": 0.0736, + "step": 704 + }, + { + "epoch": 0.14776776357157828, + "grad_norm": 0.20219507813453674, + "learning_rate": 9.64290946003659e-06, + "loss": 0.0803, + "step": 705 + }, + { + "epoch": 0.14797736323621882, + "grad_norm": 0.20861254632472992, + "learning_rate": 9.641648468143852e-06, + "loss": 0.0759, + "step": 706 + }, + { + "epoch": 0.14818696290085936, + "grad_norm": 0.17586849629878998, + "learning_rate": 9.640385336452135e-06, + "loss": 0.0752, + "step": 707 + }, + { + "epoch": 0.1483965625654999, + "grad_norm": 0.16276384890079498, + "learning_rate": 9.639120065543738e-06, + "loss": 0.0714, + "step": 708 + }, + { + "epoch": 0.14860616223014042, + "grad_norm": 0.20247167348861694, + "learning_rate": 9.637852656001957e-06, + "loss": 0.0752, + "step": 709 + }, + { + "epoch": 0.14881576189478096, + "grad_norm": 0.18190069496631622, + "learning_rate": 9.636583108411066e-06, + "loss": 0.0753, + "step": 710 + }, + { + "epoch": 0.1490253615594215, + "grad_norm": 0.1566484421491623, + "learning_rate": 9.635311423356324e-06, + "loss": 0.072, + "step": 711 + }, + { + "epoch": 0.14923496122406205, + "grad_norm": 0.1933658868074417, + "learning_rate": 9.63403760142398e-06, + "loss": 0.0746, + "step": 712 + }, + { + "epoch": 0.1494445608887026, + "grad_norm": 0.1682639867067337, + "learning_rate": 9.632761643201262e-06, + "loss": 0.073, + "step": 713 + }, + { + "epoch": 0.1496541605533431, + "grad_norm": 0.13671089708805084, + "learning_rate": 9.63148354927639e-06, + "loss": 0.0719, + "step": 714 + }, + { + "epoch": 0.14986376021798364, + "grad_norm": 0.15228819847106934, + "learning_rate": 9.630203320238564e-06, + "loss": 0.0724, + "step": 715 + }, + { + "epoch": 0.15007335988262419, + "grad_norm": 0.15035517513751984, + "learning_rate": 9.628920956677969e-06, + "loss": 0.0737, + "step": 716 + }, + { + "epoch": 0.15028295954726473, + "grad_norm": 0.1775098294019699, + "learning_rate": 9.627636459185774e-06, + "loss": 0.0695, + "step": 717 + }, + { + "epoch": 0.15049255921190527, + "grad_norm": 0.20350755751132965, + "learning_rate": 9.626349828354133e-06, + "loss": 0.0755, + "step": 718 + }, + { + "epoch": 0.1507021588765458, + "grad_norm": 0.18517783284187317, + "learning_rate": 9.625061064776183e-06, + "loss": 0.0733, + "step": 719 + }, + { + "epoch": 0.15091175854118632, + "grad_norm": 0.19070328772068024, + "learning_rate": 9.623770169046042e-06, + "loss": 0.0725, + "step": 720 + }, + { + "epoch": 0.15112135820582687, + "grad_norm": 0.19574813544750214, + "learning_rate": 9.622477141758813e-06, + "loss": 0.0717, + "step": 721 + }, + { + "epoch": 0.1513309578704674, + "grad_norm": 0.18914958834648132, + "learning_rate": 9.621181983510582e-06, + "loss": 0.0761, + "step": 722 + }, + { + "epoch": 0.15154055753510795, + "grad_norm": 0.18750722706317902, + "learning_rate": 9.619884694898417e-06, + "loss": 0.0726, + "step": 723 + }, + { + "epoch": 0.1517501571997485, + "grad_norm": 0.23439928889274597, + "learning_rate": 9.618585276520367e-06, + "loss": 0.0723, + "step": 724 + }, + { + "epoch": 0.151959756864389, + "grad_norm": 0.27055707573890686, + "learning_rate": 9.617283728975464e-06, + "loss": 0.0748, + "step": 725 + }, + { + "epoch": 0.15216935652902955, + "grad_norm": 0.20538926124572754, + "learning_rate": 9.61598005286372e-06, + "loss": 0.0721, + "step": 726 + }, + { + "epoch": 0.1523789561936701, + "grad_norm": 0.15176212787628174, + "learning_rate": 9.614674248786131e-06, + "loss": 0.0724, + "step": 727 + }, + { + "epoch": 0.15258855585831063, + "grad_norm": 0.18770189583301544, + "learning_rate": 9.613366317344674e-06, + "loss": 0.0714, + "step": 728 + }, + { + "epoch": 0.15279815552295117, + "grad_norm": 0.14766095578670502, + "learning_rate": 9.6120562591423e-06, + "loss": 0.0722, + "step": 729 + }, + { + "epoch": 0.1530077551875917, + "grad_norm": 0.1400596648454666, + "learning_rate": 9.610744074782951e-06, + "loss": 0.0731, + "step": 730 + }, + { + "epoch": 0.15321735485223223, + "grad_norm": 0.16470056772232056, + "learning_rate": 9.60942976487154e-06, + "loss": 0.0709, + "step": 731 + }, + { + "epoch": 0.15342695451687277, + "grad_norm": 0.1465228945016861, + "learning_rate": 9.608113330013964e-06, + "loss": 0.0711, + "step": 732 + }, + { + "epoch": 0.1536365541815133, + "grad_norm": 0.14212250709533691, + "learning_rate": 9.606794770817102e-06, + "loss": 0.0772, + "step": 733 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 0.12148991227149963, + "learning_rate": 9.605474087888806e-06, + "loss": 0.0738, + "step": 734 + }, + { + "epoch": 0.1540557535107944, + "grad_norm": 0.1582231968641281, + "learning_rate": 9.604151281837912e-06, + "loss": 0.0723, + "step": 735 + }, + { + "epoch": 0.1542653531754349, + "grad_norm": 0.1933947652578354, + "learning_rate": 9.602826353274235e-06, + "loss": 0.073, + "step": 736 + }, + { + "epoch": 0.15447495284007545, + "grad_norm": 0.16174952685832977, + "learning_rate": 9.601499302808565e-06, + "loss": 0.0688, + "step": 737 + }, + { + "epoch": 0.154684552504716, + "grad_norm": 0.14676834642887115, + "learning_rate": 9.600170131052671e-06, + "loss": 0.0712, + "step": 738 + }, + { + "epoch": 0.15489415216935654, + "grad_norm": 0.1428784430027008, + "learning_rate": 9.598838838619302e-06, + "loss": 0.0703, + "step": 739 + }, + { + "epoch": 0.15510375183399708, + "grad_norm": 0.1900653839111328, + "learning_rate": 9.597505426122184e-06, + "loss": 0.0687, + "step": 740 + }, + { + "epoch": 0.1553133514986376, + "grad_norm": 0.2115175575017929, + "learning_rate": 9.596169894176021e-06, + "loss": 0.0719, + "step": 741 + }, + { + "epoch": 0.15552295116327813, + "grad_norm": 0.16086673736572266, + "learning_rate": 9.59483224339649e-06, + "loss": 0.0721, + "step": 742 + }, + { + "epoch": 0.15573255082791868, + "grad_norm": 0.16307683289051056, + "learning_rate": 9.59349247440025e-06, + "loss": 0.0734, + "step": 743 + }, + { + "epoch": 0.15594215049255922, + "grad_norm": 0.17130297422409058, + "learning_rate": 9.592150587804934e-06, + "loss": 0.0731, + "step": 744 + }, + { + "epoch": 0.15615175015719976, + "grad_norm": 0.1763344258069992, + "learning_rate": 9.590806584229149e-06, + "loss": 0.0758, + "step": 745 + }, + { + "epoch": 0.15636134982184027, + "grad_norm": 0.20289908349514008, + "learning_rate": 9.589460464292483e-06, + "loss": 0.0694, + "step": 746 + }, + { + "epoch": 0.15657094948648081, + "grad_norm": 0.2093002051115036, + "learning_rate": 9.588112228615495e-06, + "loss": 0.0719, + "step": 747 + }, + { + "epoch": 0.15678054915112136, + "grad_norm": 0.19119895994663239, + "learning_rate": 9.586761877819726e-06, + "loss": 0.0685, + "step": 748 + }, + { + "epoch": 0.1569901488157619, + "grad_norm": 0.17849349975585938, + "learning_rate": 9.585409412527682e-06, + "loss": 0.0719, + "step": 749 + }, + { + "epoch": 0.15719974848040244, + "grad_norm": 0.21115368604660034, + "learning_rate": 9.584054833362851e-06, + "loss": 0.0738, + "step": 750 + }, + { + "epoch": 0.15740934814504295, + "grad_norm": 0.19666269421577454, + "learning_rate": 9.582698140949696e-06, + "loss": 0.0716, + "step": 751 + }, + { + "epoch": 0.1576189478096835, + "grad_norm": 0.1261245310306549, + "learning_rate": 9.581339335913647e-06, + "loss": 0.0695, + "step": 752 + }, + { + "epoch": 0.15782854747432404, + "grad_norm": 0.16366009414196014, + "learning_rate": 9.579978418881118e-06, + "loss": 0.0715, + "step": 753 + }, + { + "epoch": 0.15803814713896458, + "grad_norm": 0.21418072283267975, + "learning_rate": 9.57861539047949e-06, + "loss": 0.0721, + "step": 754 + }, + { + "epoch": 0.15824774680360512, + "grad_norm": 0.16937729716300964, + "learning_rate": 9.577250251337114e-06, + "loss": 0.0725, + "step": 755 + }, + { + "epoch": 0.15845734646824566, + "grad_norm": 0.1247527152299881, + "learning_rate": 9.575883002083326e-06, + "loss": 0.0726, + "step": 756 + }, + { + "epoch": 0.15866694613288618, + "grad_norm": 0.1638677716255188, + "learning_rate": 9.574513643348424e-06, + "loss": 0.0717, + "step": 757 + }, + { + "epoch": 0.15887654579752672, + "grad_norm": 0.19583426415920258, + "learning_rate": 9.573142175763683e-06, + "loss": 0.0752, + "step": 758 + }, + { + "epoch": 0.15908614546216726, + "grad_norm": 0.19242113828659058, + "learning_rate": 9.571768599961349e-06, + "loss": 0.0718, + "step": 759 + }, + { + "epoch": 0.1592957451268078, + "grad_norm": 0.18468959629535675, + "learning_rate": 9.570392916574639e-06, + "loss": 0.0712, + "step": 760 + }, + { + "epoch": 0.15950534479144834, + "grad_norm": 0.18113423883914948, + "learning_rate": 9.569015126237744e-06, + "loss": 0.0728, + "step": 761 + }, + { + "epoch": 0.15971494445608886, + "grad_norm": 0.15867562592029572, + "learning_rate": 9.567635229585826e-06, + "loss": 0.0714, + "step": 762 + }, + { + "epoch": 0.1599245441207294, + "grad_norm": 0.14228221774101257, + "learning_rate": 9.566253227255015e-06, + "loss": 0.0751, + "step": 763 + }, + { + "epoch": 0.16013414378536994, + "grad_norm": 0.14296479523181915, + "learning_rate": 9.564869119882414e-06, + "loss": 0.0713, + "step": 764 + }, + { + "epoch": 0.16034374345001048, + "grad_norm": 0.15336044132709503, + "learning_rate": 9.563482908106098e-06, + "loss": 0.0734, + "step": 765 + }, + { + "epoch": 0.16055334311465103, + "grad_norm": 0.16982528567314148, + "learning_rate": 9.56209459256511e-06, + "loss": 0.0748, + "step": 766 + }, + { + "epoch": 0.16076294277929154, + "grad_norm": 0.20288409292697906, + "learning_rate": 9.560704173899461e-06, + "loss": 0.0717, + "step": 767 + }, + { + "epoch": 0.16097254244393208, + "grad_norm": 0.24066397547721863, + "learning_rate": 9.559311652750135e-06, + "loss": 0.0727, + "step": 768 + }, + { + "epoch": 0.16118214210857262, + "grad_norm": 0.24527938663959503, + "learning_rate": 9.557917029759087e-06, + "loss": 0.0755, + "step": 769 + }, + { + "epoch": 0.16139174177321317, + "grad_norm": 0.18823891878128052, + "learning_rate": 9.556520305569232e-06, + "loss": 0.0698, + "step": 770 + }, + { + "epoch": 0.1616013414378537, + "grad_norm": 0.1405959278345108, + "learning_rate": 9.555121480824463e-06, + "loss": 0.0722, + "step": 771 + }, + { + "epoch": 0.16181094110249425, + "grad_norm": 0.1578516960144043, + "learning_rate": 9.553720556169639e-06, + "loss": 0.0736, + "step": 772 + }, + { + "epoch": 0.16202054076713476, + "grad_norm": 0.15858958661556244, + "learning_rate": 9.552317532250584e-06, + "loss": 0.0727, + "step": 773 + }, + { + "epoch": 0.1622301404317753, + "grad_norm": 0.1704108715057373, + "learning_rate": 9.55091240971409e-06, + "loss": 0.0697, + "step": 774 + }, + { + "epoch": 0.16243974009641585, + "grad_norm": 0.18552088737487793, + "learning_rate": 9.549505189207924e-06, + "loss": 0.0715, + "step": 775 + }, + { + "epoch": 0.1626493397610564, + "grad_norm": 0.14782559871673584, + "learning_rate": 9.548095871380808e-06, + "loss": 0.0746, + "step": 776 + }, + { + "epoch": 0.16285893942569693, + "grad_norm": 0.13771621882915497, + "learning_rate": 9.54668445688244e-06, + "loss": 0.0711, + "step": 777 + }, + { + "epoch": 0.16306853909033744, + "grad_norm": 0.15888865292072296, + "learning_rate": 9.545270946363484e-06, + "loss": 0.0713, + "step": 778 + }, + { + "epoch": 0.163278138754978, + "grad_norm": 0.13176760077476501, + "learning_rate": 9.543855340475565e-06, + "loss": 0.0713, + "step": 779 + }, + { + "epoch": 0.16348773841961853, + "grad_norm": 0.1236625537276268, + "learning_rate": 9.542437639871279e-06, + "loss": 0.0736, + "step": 780 + }, + { + "epoch": 0.16369733808425907, + "grad_norm": 0.14519374072551727, + "learning_rate": 9.541017845204182e-06, + "loss": 0.0725, + "step": 781 + }, + { + "epoch": 0.1639069377488996, + "grad_norm": 0.13058701157569885, + "learning_rate": 9.539595957128803e-06, + "loss": 0.0672, + "step": 782 + }, + { + "epoch": 0.16411653741354013, + "grad_norm": 0.13305912911891937, + "learning_rate": 9.53817197630063e-06, + "loss": 0.0706, + "step": 783 + }, + { + "epoch": 0.16432613707818067, + "grad_norm": 0.15352047979831696, + "learning_rate": 9.53674590337612e-06, + "loss": 0.0721, + "step": 784 + }, + { + "epoch": 0.1645357367428212, + "grad_norm": 0.15192946791648865, + "learning_rate": 9.535317739012689e-06, + "loss": 0.075, + "step": 785 + }, + { + "epoch": 0.16474533640746175, + "grad_norm": 0.1807214766740799, + "learning_rate": 9.533887483868723e-06, + "loss": 0.0691, + "step": 786 + }, + { + "epoch": 0.1649549360721023, + "grad_norm": 0.22403304278850555, + "learning_rate": 9.53245513860357e-06, + "loss": 0.0723, + "step": 787 + }, + { + "epoch": 0.16516453573674283, + "grad_norm": 0.22124335169792175, + "learning_rate": 9.531020703877539e-06, + "loss": 0.0738, + "step": 788 + }, + { + "epoch": 0.16537413540138335, + "grad_norm": 0.197389617562294, + "learning_rate": 9.529584180351902e-06, + "loss": 0.0728, + "step": 789 + }, + { + "epoch": 0.1655837350660239, + "grad_norm": 0.19445890188217163, + "learning_rate": 9.528145568688902e-06, + "loss": 0.0709, + "step": 790 + }, + { + "epoch": 0.16579333473066443, + "grad_norm": 0.1834816336631775, + "learning_rate": 9.526704869551736e-06, + "loss": 0.0734, + "step": 791 + }, + { + "epoch": 0.16600293439530497, + "grad_norm": 0.15776723623275757, + "learning_rate": 9.525262083604562e-06, + "loss": 0.0735, + "step": 792 + }, + { + "epoch": 0.16621253405994552, + "grad_norm": 0.1593591719865799, + "learning_rate": 9.523817211512511e-06, + "loss": 0.0708, + "step": 793 + }, + { + "epoch": 0.16642213372458603, + "grad_norm": 0.19884854555130005, + "learning_rate": 9.522370253941664e-06, + "loss": 0.0729, + "step": 794 + }, + { + "epoch": 0.16663173338922657, + "grad_norm": 0.208924800157547, + "learning_rate": 9.520921211559067e-06, + "loss": 0.0696, + "step": 795 + }, + { + "epoch": 0.1668413330538671, + "grad_norm": 0.1883474439382553, + "learning_rate": 9.519470085032733e-06, + "loss": 0.0718, + "step": 796 + }, + { + "epoch": 0.16705093271850766, + "grad_norm": 0.14821593463420868, + "learning_rate": 9.518016875031628e-06, + "loss": 0.0723, + "step": 797 + }, + { + "epoch": 0.1672605323831482, + "grad_norm": 0.12627242505550385, + "learning_rate": 9.516561582225682e-06, + "loss": 0.0725, + "step": 798 + }, + { + "epoch": 0.1674701320477887, + "grad_norm": 0.18372437357902527, + "learning_rate": 9.515104207285785e-06, + "loss": 0.0732, + "step": 799 + }, + { + "epoch": 0.16767973171242925, + "grad_norm": 0.20521710813045502, + "learning_rate": 9.513644750883786e-06, + "loss": 0.0708, + "step": 800 + }, + { + "epoch": 0.1678893313770698, + "grad_norm": 0.1553443819284439, + "learning_rate": 9.512183213692494e-06, + "loss": 0.0744, + "step": 801 + }, + { + "epoch": 0.16809893104171034, + "grad_norm": 0.1297149360179901, + "learning_rate": 9.510719596385678e-06, + "loss": 0.0729, + "step": 802 + }, + { + "epoch": 0.16830853070635088, + "grad_norm": 0.17294561862945557, + "learning_rate": 9.509253899638066e-06, + "loss": 0.0716, + "step": 803 + }, + { + "epoch": 0.16851813037099142, + "grad_norm": 0.1920863389968872, + "learning_rate": 9.507786124125342e-06, + "loss": 0.0718, + "step": 804 + }, + { + "epoch": 0.16872773003563193, + "grad_norm": 0.17762252688407898, + "learning_rate": 9.506316270524152e-06, + "loss": 0.0725, + "step": 805 + }, + { + "epoch": 0.16893732970027248, + "grad_norm": 0.1815175563097, + "learning_rate": 9.504844339512096e-06, + "loss": 0.0707, + "step": 806 + }, + { + "epoch": 0.16914692936491302, + "grad_norm": 0.18983136117458344, + "learning_rate": 9.503370331767736e-06, + "loss": 0.0685, + "step": 807 + }, + { + "epoch": 0.16935652902955356, + "grad_norm": 0.16265596449375153, + "learning_rate": 9.50189424797059e-06, + "loss": 0.0699, + "step": 808 + }, + { + "epoch": 0.1695661286941941, + "grad_norm": 0.14986425638198853, + "learning_rate": 9.500416088801128e-06, + "loss": 0.0718, + "step": 809 + }, + { + "epoch": 0.16977572835883462, + "grad_norm": 0.16179025173187256, + "learning_rate": 9.498935854940785e-06, + "loss": 0.0711, + "step": 810 + }, + { + "epoch": 0.16998532802347516, + "grad_norm": 0.14040148258209229, + "learning_rate": 9.497453547071948e-06, + "loss": 0.0735, + "step": 811 + }, + { + "epoch": 0.1701949276881157, + "grad_norm": 0.14858156442642212, + "learning_rate": 9.495969165877959e-06, + "loss": 0.0714, + "step": 812 + }, + { + "epoch": 0.17040452735275624, + "grad_norm": 0.1716984659433365, + "learning_rate": 9.494482712043119e-06, + "loss": 0.0701, + "step": 813 + }, + { + "epoch": 0.17061412701739678, + "grad_norm": 0.1606951206922531, + "learning_rate": 9.492994186252681e-06, + "loss": 0.0727, + "step": 814 + }, + { + "epoch": 0.1708237266820373, + "grad_norm": 0.14578759670257568, + "learning_rate": 9.491503589192859e-06, + "loss": 0.0709, + "step": 815 + }, + { + "epoch": 0.17103332634667784, + "grad_norm": 0.16695253551006317, + "learning_rate": 9.490010921550814e-06, + "loss": 0.0683, + "step": 816 + }, + { + "epoch": 0.17124292601131838, + "grad_norm": 0.19223074615001678, + "learning_rate": 9.488516184014667e-06, + "loss": 0.0721, + "step": 817 + }, + { + "epoch": 0.17145252567595892, + "grad_norm": 0.2144256979227066, + "learning_rate": 9.48701937727349e-06, + "loss": 0.0729, + "step": 818 + }, + { + "epoch": 0.17166212534059946, + "grad_norm": 0.2188243865966797, + "learning_rate": 9.485520502017314e-06, + "loss": 0.0684, + "step": 819 + }, + { + "epoch": 0.17187172500523998, + "grad_norm": 0.1678440421819687, + "learning_rate": 9.48401955893712e-06, + "loss": 0.0722, + "step": 820 + }, + { + "epoch": 0.17208132466988052, + "grad_norm": 0.13523709774017334, + "learning_rate": 9.482516548724836e-06, + "loss": 0.0705, + "step": 821 + }, + { + "epoch": 0.17229092433452106, + "grad_norm": 0.1449403315782547, + "learning_rate": 9.481011472073359e-06, + "loss": 0.0697, + "step": 822 + }, + { + "epoch": 0.1725005239991616, + "grad_norm": 0.1349162459373474, + "learning_rate": 9.47950432967652e-06, + "loss": 0.0749, + "step": 823 + }, + { + "epoch": 0.17271012366380215, + "grad_norm": 0.1460842341184616, + "learning_rate": 9.477995122229117e-06, + "loss": 0.0669, + "step": 824 + }, + { + "epoch": 0.1729197233284427, + "grad_norm": 0.16330771148204803, + "learning_rate": 9.476483850426895e-06, + "loss": 0.0705, + "step": 825 + }, + { + "epoch": 0.1731293229930832, + "grad_norm": 0.1619548201560974, + "learning_rate": 9.474970514966545e-06, + "loss": 0.0695, + "step": 826 + }, + { + "epoch": 0.17333892265772374, + "grad_norm": 0.19208967685699463, + "learning_rate": 9.473455116545718e-06, + "loss": 0.0697, + "step": 827 + }, + { + "epoch": 0.17354852232236428, + "grad_norm": 0.20909924805164337, + "learning_rate": 9.471937655863011e-06, + "loss": 0.0732, + "step": 828 + }, + { + "epoch": 0.17375812198700483, + "grad_norm": 0.18117384612560272, + "learning_rate": 9.470418133617973e-06, + "loss": 0.0719, + "step": 829 + }, + { + "epoch": 0.17396772165164537, + "grad_norm": 0.1741568148136139, + "learning_rate": 9.468896550511106e-06, + "loss": 0.0695, + "step": 830 + }, + { + "epoch": 0.17417732131628588, + "grad_norm": 0.18191158771514893, + "learning_rate": 9.467372907243858e-06, + "loss": 0.0737, + "step": 831 + }, + { + "epoch": 0.17438692098092642, + "grad_norm": 0.1610174924135208, + "learning_rate": 9.465847204518626e-06, + "loss": 0.069, + "step": 832 + }, + { + "epoch": 0.17459652064556697, + "grad_norm": 0.1415458768606186, + "learning_rate": 9.464319443038759e-06, + "loss": 0.071, + "step": 833 + }, + { + "epoch": 0.1748061203102075, + "grad_norm": 0.12450738251209259, + "learning_rate": 9.462789623508559e-06, + "loss": 0.0735, + "step": 834 + }, + { + "epoch": 0.17501571997484805, + "grad_norm": 0.1258174180984497, + "learning_rate": 9.461257746633267e-06, + "loss": 0.0696, + "step": 835 + }, + { + "epoch": 0.17522531963948856, + "grad_norm": 0.16757184267044067, + "learning_rate": 9.459723813119081e-06, + "loss": 0.0707, + "step": 836 + }, + { + "epoch": 0.1754349193041291, + "grad_norm": 0.18003013730049133, + "learning_rate": 9.458187823673145e-06, + "loss": 0.0681, + "step": 837 + }, + { + "epoch": 0.17564451896876965, + "grad_norm": 0.15927360951900482, + "learning_rate": 9.456649779003548e-06, + "loss": 0.0696, + "step": 838 + }, + { + "epoch": 0.1758541186334102, + "grad_norm": 0.1543167531490326, + "learning_rate": 9.45510967981933e-06, + "loss": 0.0673, + "step": 839 + }, + { + "epoch": 0.17606371829805073, + "grad_norm": 0.1319018304347992, + "learning_rate": 9.453567526830471e-06, + "loss": 0.072, + "step": 840 + }, + { + "epoch": 0.17627331796269127, + "grad_norm": 0.1290387362241745, + "learning_rate": 9.452023320747909e-06, + "loss": 0.0718, + "step": 841 + }, + { + "epoch": 0.1764829176273318, + "grad_norm": 0.1876518577337265, + "learning_rate": 9.450477062283523e-06, + "loss": 0.0717, + "step": 842 + }, + { + "epoch": 0.17669251729197233, + "grad_norm": 0.2217532843351364, + "learning_rate": 9.448928752150134e-06, + "loss": 0.0691, + "step": 843 + }, + { + "epoch": 0.17690211695661287, + "grad_norm": 0.2288810908794403, + "learning_rate": 9.447378391061515e-06, + "loss": 0.0718, + "step": 844 + }, + { + "epoch": 0.1771117166212534, + "grad_norm": 0.23466289043426514, + "learning_rate": 9.44582597973238e-06, + "loss": 0.0716, + "step": 845 + }, + { + "epoch": 0.17732131628589395, + "grad_norm": 0.20989495515823364, + "learning_rate": 9.444271518878393e-06, + "loss": 0.0726, + "step": 846 + }, + { + "epoch": 0.17753091595053447, + "grad_norm": 0.1636846363544464, + "learning_rate": 9.442715009216159e-06, + "loss": 0.0689, + "step": 847 + }, + { + "epoch": 0.177740515615175, + "grad_norm": 0.13839955627918243, + "learning_rate": 9.441156451463228e-06, + "loss": 0.0693, + "step": 848 + }, + { + "epoch": 0.17795011527981555, + "grad_norm": 0.17058932781219482, + "learning_rate": 9.439595846338097e-06, + "loss": 0.0693, + "step": 849 + }, + { + "epoch": 0.1781597149444561, + "grad_norm": 0.20214615762233734, + "learning_rate": 9.4380331945602e-06, + "loss": 0.0727, + "step": 850 + }, + { + "epoch": 0.17836931460909664, + "grad_norm": 0.178322434425354, + "learning_rate": 9.436468496849924e-06, + "loss": 0.0766, + "step": 851 + }, + { + "epoch": 0.17857891427373715, + "grad_norm": 0.14965812861919403, + "learning_rate": 9.434901753928593e-06, + "loss": 0.0709, + "step": 852 + }, + { + "epoch": 0.1787885139383777, + "grad_norm": 0.16711200773715973, + "learning_rate": 9.433332966518473e-06, + "loss": 0.0673, + "step": 853 + }, + { + "epoch": 0.17899811360301823, + "grad_norm": 0.17375777661800385, + "learning_rate": 9.43176213534278e-06, + "loss": 0.0738, + "step": 854 + }, + { + "epoch": 0.17920771326765877, + "grad_norm": 0.1556476205587387, + "learning_rate": 9.43018926112566e-06, + "loss": 0.068, + "step": 855 + }, + { + "epoch": 0.17941731293229932, + "grad_norm": 0.1631685495376587, + "learning_rate": 9.42861434459221e-06, + "loss": 0.0718, + "step": 856 + }, + { + "epoch": 0.17962691259693986, + "grad_norm": 0.1664845198392868, + "learning_rate": 9.42703738646847e-06, + "loss": 0.0689, + "step": 857 + }, + { + "epoch": 0.17983651226158037, + "grad_norm": 0.15820707380771637, + "learning_rate": 9.425458387481412e-06, + "loss": 0.072, + "step": 858 + }, + { + "epoch": 0.18004611192622091, + "grad_norm": 0.15286661684513092, + "learning_rate": 9.423877348358956e-06, + "loss": 0.0743, + "step": 859 + }, + { + "epoch": 0.18025571159086146, + "grad_norm": 0.14925047755241394, + "learning_rate": 9.422294269829963e-06, + "loss": 0.0665, + "step": 860 + }, + { + "epoch": 0.180465311255502, + "grad_norm": 0.15763109922409058, + "learning_rate": 9.420709152624232e-06, + "loss": 0.0725, + "step": 861 + }, + { + "epoch": 0.18067491092014254, + "grad_norm": 0.1531532257795334, + "learning_rate": 9.419121997472497e-06, + "loss": 0.0699, + "step": 862 + }, + { + "epoch": 0.18088451058478305, + "grad_norm": 0.14012256264686584, + "learning_rate": 9.41753280510644e-06, + "loss": 0.0718, + "step": 863 + }, + { + "epoch": 0.1810941102494236, + "grad_norm": 0.14636793732643127, + "learning_rate": 9.415941576258679e-06, + "loss": 0.0683, + "step": 864 + }, + { + "epoch": 0.18130370991406414, + "grad_norm": 0.16276605427265167, + "learning_rate": 9.414348311662766e-06, + "loss": 0.0706, + "step": 865 + }, + { + "epoch": 0.18151330957870468, + "grad_norm": 0.1377471536397934, + "learning_rate": 9.4127530120532e-06, + "loss": 0.0727, + "step": 866 + }, + { + "epoch": 0.18172290924334522, + "grad_norm": 0.11346344649791718, + "learning_rate": 9.41115567816541e-06, + "loss": 0.0699, + "step": 867 + }, + { + "epoch": 0.18193250890798573, + "grad_norm": 0.1656496673822403, + "learning_rate": 9.40955631073577e-06, + "loss": 0.0673, + "step": 868 + }, + { + "epoch": 0.18214210857262628, + "grad_norm": 0.16461622714996338, + "learning_rate": 9.407954910501586e-06, + "loss": 0.0769, + "step": 869 + }, + { + "epoch": 0.18235170823726682, + "grad_norm": 0.12896548211574554, + "learning_rate": 9.406351478201105e-06, + "loss": 0.0699, + "step": 870 + }, + { + "epoch": 0.18256130790190736, + "grad_norm": 0.17316146194934845, + "learning_rate": 9.404746014573508e-06, + "loss": 0.0713, + "step": 871 + }, + { + "epoch": 0.1827709075665479, + "grad_norm": 0.17203350365161896, + "learning_rate": 9.403138520358911e-06, + "loss": 0.0718, + "step": 872 + }, + { + "epoch": 0.18298050723118844, + "grad_norm": 0.17705942690372467, + "learning_rate": 9.401528996298375e-06, + "loss": 0.074, + "step": 873 + }, + { + "epoch": 0.18319010689582896, + "grad_norm": 0.2316497564315796, + "learning_rate": 9.399917443133883e-06, + "loss": 0.0689, + "step": 874 + }, + { + "epoch": 0.1833997065604695, + "grad_norm": 0.23241883516311646, + "learning_rate": 9.398303861608363e-06, + "loss": 0.0707, + "step": 875 + }, + { + "epoch": 0.18360930622511004, + "grad_norm": 0.21609675884246826, + "learning_rate": 9.396688252465678e-06, + "loss": 0.0739, + "step": 876 + }, + { + "epoch": 0.18381890588975058, + "grad_norm": 0.21849660575389862, + "learning_rate": 9.395070616450622e-06, + "loss": 0.0729, + "step": 877 + }, + { + "epoch": 0.18402850555439113, + "grad_norm": 0.17387786507606506, + "learning_rate": 9.393450954308922e-06, + "loss": 0.0715, + "step": 878 + }, + { + "epoch": 0.18423810521903164, + "grad_norm": 0.1386607140302658, + "learning_rate": 9.391829266787248e-06, + "loss": 0.0708, + "step": 879 + }, + { + "epoch": 0.18444770488367218, + "grad_norm": 0.14751608669757843, + "learning_rate": 9.390205554633193e-06, + "loss": 0.0714, + "step": 880 + }, + { + "epoch": 0.18465730454831272, + "grad_norm": 0.14015577733516693, + "learning_rate": 9.38857981859529e-06, + "loss": 0.0699, + "step": 881 + }, + { + "epoch": 0.18486690421295326, + "grad_norm": 0.13030049204826355, + "learning_rate": 9.386952059423e-06, + "loss": 0.0732, + "step": 882 + }, + { + "epoch": 0.1850765038775938, + "grad_norm": 0.14583101868629456, + "learning_rate": 9.385322277866724e-06, + "loss": 0.0681, + "step": 883 + }, + { + "epoch": 0.18528610354223432, + "grad_norm": 0.1531733274459839, + "learning_rate": 9.38369047467779e-06, + "loss": 0.0691, + "step": 884 + }, + { + "epoch": 0.18549570320687486, + "grad_norm": 0.16215740144252777, + "learning_rate": 9.382056650608454e-06, + "loss": 0.0709, + "step": 885 + }, + { + "epoch": 0.1857053028715154, + "grad_norm": 0.1694212257862091, + "learning_rate": 9.380420806411914e-06, + "loss": 0.0694, + "step": 886 + }, + { + "epoch": 0.18591490253615595, + "grad_norm": 0.16096119582653046, + "learning_rate": 9.378782942842292e-06, + "loss": 0.0656, + "step": 887 + }, + { + "epoch": 0.1861245022007965, + "grad_norm": 0.13555142283439636, + "learning_rate": 9.377143060654645e-06, + "loss": 0.0741, + "step": 888 + }, + { + "epoch": 0.186334101865437, + "grad_norm": 0.11763742566108704, + "learning_rate": 9.375501160604955e-06, + "loss": 0.069, + "step": 889 + }, + { + "epoch": 0.18654370153007754, + "grad_norm": 0.14128689467906952, + "learning_rate": 9.373857243450138e-06, + "loss": 0.0691, + "step": 890 + }, + { + "epoch": 0.18675330119471809, + "grad_norm": 0.15519973635673523, + "learning_rate": 9.37221130994804e-06, + "loss": 0.0691, + "step": 891 + }, + { + "epoch": 0.18696290085935863, + "grad_norm": 0.1528463065624237, + "learning_rate": 9.370563360857437e-06, + "loss": 0.0708, + "step": 892 + }, + { + "epoch": 0.18717250052399917, + "grad_norm": 0.1430312991142273, + "learning_rate": 9.368913396938031e-06, + "loss": 0.0702, + "step": 893 + }, + { + "epoch": 0.1873821001886397, + "grad_norm": 0.13003818690776825, + "learning_rate": 9.367261418950459e-06, + "loss": 0.0729, + "step": 894 + }, + { + "epoch": 0.18759169985328022, + "grad_norm": 0.1460046023130417, + "learning_rate": 9.365607427656277e-06, + "loss": 0.0701, + "step": 895 + }, + { + "epoch": 0.18780129951792077, + "grad_norm": 0.1653718799352646, + "learning_rate": 9.363951423817982e-06, + "loss": 0.0713, + "step": 896 + }, + { + "epoch": 0.1880108991825613, + "grad_norm": 0.1808050125837326, + "learning_rate": 9.362293408198983e-06, + "loss": 0.0671, + "step": 897 + }, + { + "epoch": 0.18822049884720185, + "grad_norm": 0.17613857984542847, + "learning_rate": 9.360633381563631e-06, + "loss": 0.0684, + "step": 898 + }, + { + "epoch": 0.1884300985118424, + "grad_norm": 0.17661046981811523, + "learning_rate": 9.358971344677197e-06, + "loss": 0.068, + "step": 899 + }, + { + "epoch": 0.1886396981764829, + "grad_norm": 0.205166295170784, + "learning_rate": 9.357307298305875e-06, + "loss": 0.0695, + "step": 900 + }, + { + "epoch": 0.18884929784112345, + "grad_norm": 0.22776846587657928, + "learning_rate": 9.355641243216798e-06, + "loss": 0.0695, + "step": 901 + }, + { + "epoch": 0.189058897505764, + "grad_norm": 0.21361425518989563, + "learning_rate": 9.35397318017801e-06, + "loss": 0.0736, + "step": 902 + }, + { + "epoch": 0.18926849717040453, + "grad_norm": 0.18417485058307648, + "learning_rate": 9.35230310995849e-06, + "loss": 0.0719, + "step": 903 + }, + { + "epoch": 0.18947809683504507, + "grad_norm": 0.17140980064868927, + "learning_rate": 9.35063103332814e-06, + "loss": 0.0692, + "step": 904 + }, + { + "epoch": 0.1896876964996856, + "grad_norm": 0.17577317357063293, + "learning_rate": 9.348956951057788e-06, + "loss": 0.0702, + "step": 905 + }, + { + "epoch": 0.18989729616432613, + "grad_norm": 0.18869447708129883, + "learning_rate": 9.347280863919186e-06, + "loss": 0.0682, + "step": 906 + }, + { + "epoch": 0.19010689582896667, + "grad_norm": 0.15782123804092407, + "learning_rate": 9.345602772685008e-06, + "loss": 0.0674, + "step": 907 + }, + { + "epoch": 0.1903164954936072, + "grad_norm": 0.15781988203525543, + "learning_rate": 9.343922678128854e-06, + "loss": 0.0703, + "step": 908 + }, + { + "epoch": 0.19052609515824775, + "grad_norm": 0.17203202843666077, + "learning_rate": 9.342240581025248e-06, + "loss": 0.0672, + "step": 909 + }, + { + "epoch": 0.1907356948228883, + "grad_norm": 0.16652663052082062, + "learning_rate": 9.340556482149637e-06, + "loss": 0.0671, + "step": 910 + }, + { + "epoch": 0.1909452944875288, + "grad_norm": 0.15487754344940186, + "learning_rate": 9.33887038227839e-06, + "loss": 0.0699, + "step": 911 + }, + { + "epoch": 0.19115489415216935, + "grad_norm": 0.13165274262428284, + "learning_rate": 9.337182282188794e-06, + "loss": 0.0666, + "step": 912 + }, + { + "epoch": 0.1913644938168099, + "grad_norm": 0.1349601447582245, + "learning_rate": 9.335492182659071e-06, + "loss": 0.0664, + "step": 913 + }, + { + "epoch": 0.19157409348145044, + "grad_norm": 0.1551382839679718, + "learning_rate": 9.33380008446835e-06, + "loss": 0.0677, + "step": 914 + }, + { + "epoch": 0.19178369314609098, + "grad_norm": 0.16102422773838043, + "learning_rate": 9.332105988396692e-06, + "loss": 0.0693, + "step": 915 + }, + { + "epoch": 0.1919932928107315, + "grad_norm": 0.17208023369312286, + "learning_rate": 9.330409895225072e-06, + "loss": 0.0704, + "step": 916 + }, + { + "epoch": 0.19220289247537203, + "grad_norm": 0.16486892104148865, + "learning_rate": 9.32871180573539e-06, + "loss": 0.0695, + "step": 917 + }, + { + "epoch": 0.19241249214001258, + "grad_norm": 0.15366514027118683, + "learning_rate": 9.327011720710464e-06, + "loss": 0.0688, + "step": 918 + }, + { + "epoch": 0.19262209180465312, + "grad_norm": 0.17730504274368286, + "learning_rate": 9.325309640934036e-06, + "loss": 0.0678, + "step": 919 + }, + { + "epoch": 0.19283169146929366, + "grad_norm": 0.2048606127500534, + "learning_rate": 9.323605567190758e-06, + "loss": 0.0696, + "step": 920 + }, + { + "epoch": 0.19304129113393417, + "grad_norm": 0.1947096735239029, + "learning_rate": 9.321899500266216e-06, + "loss": 0.0694, + "step": 921 + }, + { + "epoch": 0.19325089079857471, + "grad_norm": 0.15000185370445251, + "learning_rate": 9.3201914409469e-06, + "loss": 0.0675, + "step": 922 + }, + { + "epoch": 0.19346049046321526, + "grad_norm": 0.15103282034397125, + "learning_rate": 9.318481390020226e-06, + "loss": 0.071, + "step": 923 + }, + { + "epoch": 0.1936700901278558, + "grad_norm": 0.16864833235740662, + "learning_rate": 9.31676934827453e-06, + "loss": 0.0695, + "step": 924 + }, + { + "epoch": 0.19387968979249634, + "grad_norm": 0.1579030156135559, + "learning_rate": 9.31505531649906e-06, + "loss": 0.0652, + "step": 925 + }, + { + "epoch": 0.19408928945713688, + "grad_norm": 0.13766202330589294, + "learning_rate": 9.313339295483985e-06, + "loss": 0.071, + "step": 926 + }, + { + "epoch": 0.1942988891217774, + "grad_norm": 0.1618116796016693, + "learning_rate": 9.311621286020393e-06, + "loss": 0.0692, + "step": 927 + }, + { + "epoch": 0.19450848878641794, + "grad_norm": 0.15055081248283386, + "learning_rate": 9.309901288900283e-06, + "loss": 0.0675, + "step": 928 + }, + { + "epoch": 0.19471808845105848, + "grad_norm": 0.13458986580371857, + "learning_rate": 9.308179304916573e-06, + "loss": 0.0717, + "step": 929 + }, + { + "epoch": 0.19492768811569902, + "grad_norm": 0.13572897017002106, + "learning_rate": 9.306455334863098e-06, + "loss": 0.0708, + "step": 930 + }, + { + "epoch": 0.19513728778033956, + "grad_norm": 0.13782450556755066, + "learning_rate": 9.30472937953461e-06, + "loss": 0.067, + "step": 931 + }, + { + "epoch": 0.19534688744498008, + "grad_norm": 0.1412908285856247, + "learning_rate": 9.303001439726772e-06, + "loss": 0.0667, + "step": 932 + }, + { + "epoch": 0.19555648710962062, + "grad_norm": 0.1327088475227356, + "learning_rate": 9.301271516236162e-06, + "loss": 0.0674, + "step": 933 + }, + { + "epoch": 0.19576608677426116, + "grad_norm": 0.13067220151424408, + "learning_rate": 9.299539609860278e-06, + "loss": 0.0683, + "step": 934 + }, + { + "epoch": 0.1959756864389017, + "grad_norm": 0.13276226818561554, + "learning_rate": 9.297805721397525e-06, + "loss": 0.0717, + "step": 935 + }, + { + "epoch": 0.19618528610354224, + "grad_norm": 0.13292694091796875, + "learning_rate": 9.29606985164723e-06, + "loss": 0.0682, + "step": 936 + }, + { + "epoch": 0.19639488576818276, + "grad_norm": 0.1614527702331543, + "learning_rate": 9.294332001409625e-06, + "loss": 0.0693, + "step": 937 + }, + { + "epoch": 0.1966044854328233, + "grad_norm": 0.20869992673397064, + "learning_rate": 9.292592171485858e-06, + "loss": 0.0687, + "step": 938 + }, + { + "epoch": 0.19681408509746384, + "grad_norm": 0.25103163719177246, + "learning_rate": 9.290850362677993e-06, + "loss": 0.0673, + "step": 939 + }, + { + "epoch": 0.19702368476210438, + "grad_norm": 0.2684808671474457, + "learning_rate": 9.289106575789001e-06, + "loss": 0.0711, + "step": 940 + }, + { + "epoch": 0.19723328442674493, + "grad_norm": 0.2292683720588684, + "learning_rate": 9.28736081162277e-06, + "loss": 0.075, + "step": 941 + }, + { + "epoch": 0.19744288409138544, + "grad_norm": 0.148003488779068, + "learning_rate": 9.285613070984094e-06, + "loss": 0.0736, + "step": 942 + }, + { + "epoch": 0.19765248375602598, + "grad_norm": 0.1527291238307953, + "learning_rate": 9.283863354678683e-06, + "loss": 0.0691, + "step": 943 + }, + { + "epoch": 0.19786208342066652, + "grad_norm": 0.17190136015415192, + "learning_rate": 9.282111663513156e-06, + "loss": 0.0696, + "step": 944 + }, + { + "epoch": 0.19807168308530707, + "grad_norm": 0.16085271537303925, + "learning_rate": 9.280357998295041e-06, + "loss": 0.0692, + "step": 945 + }, + { + "epoch": 0.1982812827499476, + "grad_norm": 0.16574037075042725, + "learning_rate": 9.278602359832778e-06, + "loss": 0.0695, + "step": 946 + }, + { + "epoch": 0.19849088241458815, + "grad_norm": 0.1433473825454712, + "learning_rate": 9.276844748935715e-06, + "loss": 0.066, + "step": 947 + }, + { + "epoch": 0.19870048207922866, + "grad_norm": 0.15432019531726837, + "learning_rate": 9.275085166414113e-06, + "loss": 0.0716, + "step": 948 + }, + { + "epoch": 0.1989100817438692, + "grad_norm": 0.155776709318161, + "learning_rate": 9.273323613079135e-06, + "loss": 0.0687, + "step": 949 + }, + { + "epoch": 0.19911968140850975, + "grad_norm": 0.14493581652641296, + "learning_rate": 9.27156008974286e-06, + "loss": 0.0688, + "step": 950 + }, + { + "epoch": 0.1993292810731503, + "grad_norm": 0.16887947916984558, + "learning_rate": 9.269794597218271e-06, + "loss": 0.0682, + "step": 951 + }, + { + "epoch": 0.19953888073779083, + "grad_norm": 0.15968920290470123, + "learning_rate": 9.26802713631926e-06, + "loss": 0.0652, + "step": 952 + }, + { + "epoch": 0.19974848040243134, + "grad_norm": 0.15079444646835327, + "learning_rate": 9.266257707860625e-06, + "loss": 0.0661, + "step": 953 + }, + { + "epoch": 0.19995808006707189, + "grad_norm": 0.15379679203033447, + "learning_rate": 9.264486312658073e-06, + "loss": 0.0723, + "step": 954 + }, + { + "epoch": 0.20016767973171243, + "grad_norm": 0.1347336769104004, + "learning_rate": 9.262712951528217e-06, + "loss": 0.0729, + "step": 955 + }, + { + "epoch": 0.20037727939635297, + "grad_norm": 0.13610079884529114, + "learning_rate": 9.260937625288576e-06, + "loss": 0.0693, + "step": 956 + }, + { + "epoch": 0.2005868790609935, + "grad_norm": 0.14802809059619904, + "learning_rate": 9.259160334757575e-06, + "loss": 0.0685, + "step": 957 + }, + { + "epoch": 0.20079647872563403, + "grad_norm": 0.15134483575820923, + "learning_rate": 9.257381080754544e-06, + "loss": 0.0655, + "step": 958 + }, + { + "epoch": 0.20100607839027457, + "grad_norm": 0.1678546667098999, + "learning_rate": 9.255599864099718e-06, + "loss": 0.0715, + "step": 959 + }, + { + "epoch": 0.2012156780549151, + "grad_norm": 0.15980494022369385, + "learning_rate": 9.25381668561424e-06, + "loss": 0.0695, + "step": 960 + }, + { + "epoch": 0.20142527771955565, + "grad_norm": 0.13359016180038452, + "learning_rate": 9.252031546120153e-06, + "loss": 0.0671, + "step": 961 + }, + { + "epoch": 0.2016348773841962, + "grad_norm": 0.154948428273201, + "learning_rate": 9.250244446440406e-06, + "loss": 0.0676, + "step": 962 + }, + { + "epoch": 0.20184447704883673, + "grad_norm": 0.16912510991096497, + "learning_rate": 9.248455387398853e-06, + "loss": 0.0687, + "step": 963 + }, + { + "epoch": 0.20205407671347725, + "grad_norm": 0.1481633484363556, + "learning_rate": 9.246664369820249e-06, + "loss": 0.0696, + "step": 964 + }, + { + "epoch": 0.2022636763781178, + "grad_norm": 0.1526942402124405, + "learning_rate": 9.244871394530252e-06, + "loss": 0.0669, + "step": 965 + }, + { + "epoch": 0.20247327604275833, + "grad_norm": 0.1594848930835724, + "learning_rate": 9.243076462355424e-06, + "loss": 0.0695, + "step": 966 + }, + { + "epoch": 0.20268287570739887, + "grad_norm": 0.15428997576236725, + "learning_rate": 9.241279574123228e-06, + "loss": 0.0676, + "step": 967 + }, + { + "epoch": 0.20289247537203942, + "grad_norm": 0.15648357570171356, + "learning_rate": 9.239480730662029e-06, + "loss": 0.0701, + "step": 968 + }, + { + "epoch": 0.20310207503667993, + "grad_norm": 0.14083035290241241, + "learning_rate": 9.237679932801095e-06, + "loss": 0.0707, + "step": 969 + }, + { + "epoch": 0.20331167470132047, + "grad_norm": 0.11451932042837143, + "learning_rate": 9.235877181370592e-06, + "loss": 0.0687, + "step": 970 + }, + { + "epoch": 0.203521274365961, + "grad_norm": 0.12343299388885498, + "learning_rate": 9.234072477201588e-06, + "loss": 0.0672, + "step": 971 + }, + { + "epoch": 0.20373087403060156, + "grad_norm": 0.1318097859621048, + "learning_rate": 9.23226582112605e-06, + "loss": 0.0692, + "step": 972 + }, + { + "epoch": 0.2039404736952421, + "grad_norm": 0.13317148387432098, + "learning_rate": 9.23045721397685e-06, + "loss": 0.0652, + "step": 973 + }, + { + "epoch": 0.2041500733598826, + "grad_norm": 0.14441275596618652, + "learning_rate": 9.228646656587751e-06, + "loss": 0.0669, + "step": 974 + }, + { + "epoch": 0.20435967302452315, + "grad_norm": 0.1513681560754776, + "learning_rate": 9.226834149793422e-06, + "loss": 0.0654, + "step": 975 + }, + { + "epoch": 0.2045692726891637, + "grad_norm": 0.17051391303539276, + "learning_rate": 9.225019694429429e-06, + "loss": 0.0664, + "step": 976 + }, + { + "epoch": 0.20477887235380424, + "grad_norm": 0.19565874338150024, + "learning_rate": 9.223203291332234e-06, + "loss": 0.0691, + "step": 977 + }, + { + "epoch": 0.20498847201844478, + "grad_norm": 0.2343725860118866, + "learning_rate": 9.2213849413392e-06, + "loss": 0.069, + "step": 978 + }, + { + "epoch": 0.20519807168308532, + "grad_norm": 0.27315953373908997, + "learning_rate": 9.219564645288583e-06, + "loss": 0.0675, + "step": 979 + }, + { + "epoch": 0.20540767134772583, + "grad_norm": 0.2724561393260956, + "learning_rate": 9.217742404019544e-06, + "loss": 0.0702, + "step": 980 + }, + { + "epoch": 0.20561727101236638, + "grad_norm": 0.23024438321590424, + "learning_rate": 9.215918218372128e-06, + "loss": 0.0723, + "step": 981 + }, + { + "epoch": 0.20582687067700692, + "grad_norm": 0.15981607139110565, + "learning_rate": 9.214092089187293e-06, + "loss": 0.0696, + "step": 982 + }, + { + "epoch": 0.20603647034164746, + "grad_norm": 0.165577694773674, + "learning_rate": 9.212264017306878e-06, + "loss": 0.0725, + "step": 983 + }, + { + "epoch": 0.206246070006288, + "grad_norm": 0.19346004724502563, + "learning_rate": 9.210434003573627e-06, + "loss": 0.0689, + "step": 984 + }, + { + "epoch": 0.20645566967092852, + "grad_norm": 0.17302066087722778, + "learning_rate": 9.208602048831176e-06, + "loss": 0.0695, + "step": 985 + }, + { + "epoch": 0.20666526933556906, + "grad_norm": 0.17835290729999542, + "learning_rate": 9.206768153924052e-06, + "loss": 0.0684, + "step": 986 + }, + { + "epoch": 0.2068748690002096, + "grad_norm": 0.1855718344449997, + "learning_rate": 9.204932319697686e-06, + "loss": 0.0694, + "step": 987 + }, + { + "epoch": 0.20708446866485014, + "grad_norm": 0.16161462664604187, + "learning_rate": 9.203094546998392e-06, + "loss": 0.0689, + "step": 988 + }, + { + "epoch": 0.20729406832949068, + "grad_norm": 0.1590941995382309, + "learning_rate": 9.201254836673386e-06, + "loss": 0.0676, + "step": 989 + }, + { + "epoch": 0.2075036679941312, + "grad_norm": 0.17780111730098724, + "learning_rate": 9.199413189570772e-06, + "loss": 0.0699, + "step": 990 + }, + { + "epoch": 0.20771326765877174, + "grad_norm": 0.15813374519348145, + "learning_rate": 9.197569606539551e-06, + "loss": 0.0678, + "step": 991 + }, + { + "epoch": 0.20792286732341228, + "grad_norm": 0.14259178936481476, + "learning_rate": 9.195724088429611e-06, + "loss": 0.0682, + "step": 992 + }, + { + "epoch": 0.20813246698805282, + "grad_norm": 0.15479755401611328, + "learning_rate": 9.193876636091741e-06, + "loss": 0.0703, + "step": 993 + }, + { + "epoch": 0.20834206665269336, + "grad_norm": 0.16291916370391846, + "learning_rate": 9.192027250377611e-06, + "loss": 0.0714, + "step": 994 + }, + { + "epoch": 0.2085516663173339, + "grad_norm": 0.16300472617149353, + "learning_rate": 9.19017593213979e-06, + "loss": 0.0686, + "step": 995 + }, + { + "epoch": 0.20876126598197442, + "grad_norm": 0.16274508833885193, + "learning_rate": 9.188322682231733e-06, + "loss": 0.0674, + "step": 996 + }, + { + "epoch": 0.20897086564661496, + "grad_norm": 0.18186375498771667, + "learning_rate": 9.186467501507792e-06, + "loss": 0.0696, + "step": 997 + }, + { + "epoch": 0.2091804653112555, + "grad_norm": 0.20312577486038208, + "learning_rate": 9.184610390823202e-06, + "loss": 0.0659, + "step": 998 + }, + { + "epoch": 0.20939006497589605, + "grad_norm": 0.21246276795864105, + "learning_rate": 9.18275135103409e-06, + "loss": 0.0714, + "step": 999 + }, + { + "epoch": 0.2095996646405366, + "grad_norm": 0.19625847041606903, + "learning_rate": 9.180890382997473e-06, + "loss": 0.0662, + "step": 1000 + }, + { + "epoch": 0.2098092643051771, + "grad_norm": 0.181460440158844, + "learning_rate": 9.179027487571258e-06, + "loss": 0.0662, + "step": 1001 + }, + { + "epoch": 0.21001886396981764, + "grad_norm": 0.17908546328544617, + "learning_rate": 9.177162665614242e-06, + "loss": 0.0685, + "step": 1002 + }, + { + "epoch": 0.21022846363445818, + "grad_norm": 0.19209085404872894, + "learning_rate": 9.175295917986103e-06, + "loss": 0.0682, + "step": 1003 + }, + { + "epoch": 0.21043806329909873, + "grad_norm": 0.21326391398906708, + "learning_rate": 9.173427245547414e-06, + "loss": 0.0691, + "step": 1004 + }, + { + "epoch": 0.21064766296373927, + "grad_norm": 0.19269311428070068, + "learning_rate": 9.17155664915963e-06, + "loss": 0.0727, + "step": 1005 + }, + { + "epoch": 0.21085726262837978, + "grad_norm": 0.19682708382606506, + "learning_rate": 9.169684129685099e-06, + "loss": 0.0713, + "step": 1006 + }, + { + "epoch": 0.21106686229302032, + "grad_norm": 0.18189750611782074, + "learning_rate": 9.16780968798705e-06, + "loss": 0.0714, + "step": 1007 + }, + { + "epoch": 0.21127646195766087, + "grad_norm": 0.169154092669487, + "learning_rate": 9.165933324929599e-06, + "loss": 0.0702, + "step": 1008 + }, + { + "epoch": 0.2114860616223014, + "grad_norm": 0.20186983048915863, + "learning_rate": 9.164055041377754e-06, + "loss": 0.0691, + "step": 1009 + }, + { + "epoch": 0.21169566128694195, + "grad_norm": 0.19826248288154602, + "learning_rate": 9.162174838197396e-06, + "loss": 0.0679, + "step": 1010 + }, + { + "epoch": 0.21190526095158246, + "grad_norm": 0.1933613270521164, + "learning_rate": 9.160292716255303e-06, + "loss": 0.0691, + "step": 1011 + }, + { + "epoch": 0.212114860616223, + "grad_norm": 0.1532355546951294, + "learning_rate": 9.158408676419133e-06, + "loss": 0.0712, + "step": 1012 + }, + { + "epoch": 0.21232446028086355, + "grad_norm": 0.13550697267055511, + "learning_rate": 9.156522719557428e-06, + "loss": 0.0688, + "step": 1013 + }, + { + "epoch": 0.2125340599455041, + "grad_norm": 0.19453023374080658, + "learning_rate": 9.15463484653961e-06, + "loss": 0.0646, + "step": 1014 + }, + { + "epoch": 0.21274365961014463, + "grad_norm": 0.16020941734313965, + "learning_rate": 9.152745058235993e-06, + "loss": 0.0692, + "step": 1015 + }, + { + "epoch": 0.21295325927478517, + "grad_norm": 0.17123140394687653, + "learning_rate": 9.150853355517765e-06, + "loss": 0.0689, + "step": 1016 + }, + { + "epoch": 0.2131628589394257, + "grad_norm": 0.17353491485118866, + "learning_rate": 9.148959739257005e-06, + "loss": 0.0687, + "step": 1017 + }, + { + "epoch": 0.21337245860406623, + "grad_norm": 0.15080298483371735, + "learning_rate": 9.147064210326664e-06, + "loss": 0.0664, + "step": 1018 + }, + { + "epoch": 0.21358205826870677, + "grad_norm": 0.1491314321756363, + "learning_rate": 9.145166769600584e-06, + "loss": 0.0697, + "step": 1019 + }, + { + "epoch": 0.2137916579333473, + "grad_norm": 0.10465126484632492, + "learning_rate": 9.143267417953486e-06, + "loss": 0.068, + "step": 1020 + }, + { + "epoch": 0.21400125759798785, + "grad_norm": 0.15090526640415192, + "learning_rate": 9.141366156260967e-06, + "loss": 0.0727, + "step": 1021 + }, + { + "epoch": 0.21421085726262837, + "grad_norm": 0.11032428592443466, + "learning_rate": 9.139462985399512e-06, + "loss": 0.0682, + "step": 1022 + }, + { + "epoch": 0.2144204569272689, + "grad_norm": 0.13925404846668243, + "learning_rate": 9.137557906246479e-06, + "loss": 0.0682, + "step": 1023 + }, + { + "epoch": 0.21463005659190945, + "grad_norm": 0.15173637866973877, + "learning_rate": 9.135650919680112e-06, + "loss": 0.0653, + "step": 1024 + }, + { + "epoch": 0.21483965625655, + "grad_norm": 0.1346423327922821, + "learning_rate": 9.133742026579528e-06, + "loss": 0.0679, + "step": 1025 + }, + { + "epoch": 0.21504925592119054, + "grad_norm": 0.1535898894071579, + "learning_rate": 9.13183122782473e-06, + "loss": 0.0667, + "step": 1026 + }, + { + "epoch": 0.21525885558583105, + "grad_norm": 0.14157144725322723, + "learning_rate": 9.129918524296596e-06, + "loss": 0.0707, + "step": 1027 + }, + { + "epoch": 0.2154684552504716, + "grad_norm": 0.14217810332775116, + "learning_rate": 9.128003916876878e-06, + "loss": 0.0708, + "step": 1028 + }, + { + "epoch": 0.21567805491511213, + "grad_norm": 0.1338115632534027, + "learning_rate": 9.126087406448211e-06, + "loss": 0.0648, + "step": 1029 + }, + { + "epoch": 0.21588765457975267, + "grad_norm": 0.12991099059581757, + "learning_rate": 9.124168993894107e-06, + "loss": 0.0677, + "step": 1030 + }, + { + "epoch": 0.21609725424439322, + "grad_norm": 0.1184862032532692, + "learning_rate": 9.122248680098956e-06, + "loss": 0.0673, + "step": 1031 + }, + { + "epoch": 0.21630685390903376, + "grad_norm": 0.1306854635477066, + "learning_rate": 9.120326465948016e-06, + "loss": 0.068, + "step": 1032 + }, + { + "epoch": 0.21651645357367427, + "grad_norm": 0.16064853966236115, + "learning_rate": 9.118402352327433e-06, + "loss": 0.0663, + "step": 1033 + }, + { + "epoch": 0.21672605323831481, + "grad_norm": 0.1781821846961975, + "learning_rate": 9.11647634012422e-06, + "loss": 0.0683, + "step": 1034 + }, + { + "epoch": 0.21693565290295536, + "grad_norm": 0.18965835869312286, + "learning_rate": 9.11454843022627e-06, + "loss": 0.0679, + "step": 1035 + }, + { + "epoch": 0.2171452525675959, + "grad_norm": 0.19845952093601227, + "learning_rate": 9.112618623522351e-06, + "loss": 0.0663, + "step": 1036 + }, + { + "epoch": 0.21735485223223644, + "grad_norm": 0.16511160135269165, + "learning_rate": 9.110686920902097e-06, + "loss": 0.0717, + "step": 1037 + }, + { + "epoch": 0.21756445189687695, + "grad_norm": 0.1469397395849228, + "learning_rate": 9.108753323256028e-06, + "loss": 0.0662, + "step": 1038 + }, + { + "epoch": 0.2177740515615175, + "grad_norm": 0.15417161583900452, + "learning_rate": 9.106817831475529e-06, + "loss": 0.0736, + "step": 1039 + }, + { + "epoch": 0.21798365122615804, + "grad_norm": 0.10822763293981552, + "learning_rate": 9.104880446452866e-06, + "loss": 0.0666, + "step": 1040 + }, + { + "epoch": 0.21819325089079858, + "grad_norm": 0.1328975260257721, + "learning_rate": 9.102941169081167e-06, + "loss": 0.0671, + "step": 1041 + }, + { + "epoch": 0.21840285055543912, + "grad_norm": 0.15660911798477173, + "learning_rate": 9.101000000254442e-06, + "loss": 0.0679, + "step": 1042 + }, + { + "epoch": 0.21861245022007963, + "grad_norm": 0.14654120802879333, + "learning_rate": 9.09905694086757e-06, + "loss": 0.0692, + "step": 1043 + }, + { + "epoch": 0.21882204988472018, + "grad_norm": 0.15261763334274292, + "learning_rate": 9.097111991816297e-06, + "loss": 0.0677, + "step": 1044 + }, + { + "epoch": 0.21903164954936072, + "grad_norm": 0.1555902063846588, + "learning_rate": 9.095165153997249e-06, + "loss": 0.0653, + "step": 1045 + }, + { + "epoch": 0.21924124921400126, + "grad_norm": 0.1320633888244629, + "learning_rate": 9.093216428307914e-06, + "loss": 0.0683, + "step": 1046 + }, + { + "epoch": 0.2194508488786418, + "grad_norm": 0.12523691356182098, + "learning_rate": 9.091265815646658e-06, + "loss": 0.0646, + "step": 1047 + }, + { + "epoch": 0.21966044854328234, + "grad_norm": 0.14748716354370117, + "learning_rate": 9.089313316912708e-06, + "loss": 0.067, + "step": 1048 + }, + { + "epoch": 0.21987004820792286, + "grad_norm": 0.13303905725479126, + "learning_rate": 9.08735893300617e-06, + "loss": 0.0677, + "step": 1049 + }, + { + "epoch": 0.2200796478725634, + "grad_norm": 0.1327192634344101, + "learning_rate": 9.085402664828013e-06, + "loss": 0.072, + "step": 1050 + }, + { + "epoch": 0.22028924753720394, + "grad_norm": 0.13275259733200073, + "learning_rate": 9.083444513280076e-06, + "loss": 0.0659, + "step": 1051 + }, + { + "epoch": 0.22049884720184448, + "grad_norm": 0.11244450509548187, + "learning_rate": 9.081484479265067e-06, + "loss": 0.0719, + "step": 1052 + }, + { + "epoch": 0.22070844686648503, + "grad_norm": 0.11435073614120483, + "learning_rate": 9.07952256368656e-06, + "loss": 0.0648, + "step": 1053 + }, + { + "epoch": 0.22091804653112554, + "grad_norm": 0.1512385904788971, + "learning_rate": 9.077558767448999e-06, + "loss": 0.0677, + "step": 1054 + }, + { + "epoch": 0.22112764619576608, + "grad_norm": 0.1435592770576477, + "learning_rate": 9.075593091457692e-06, + "loss": 0.064, + "step": 1055 + }, + { + "epoch": 0.22133724586040662, + "grad_norm": 0.14917264878749847, + "learning_rate": 9.073625536618819e-06, + "loss": 0.0661, + "step": 1056 + }, + { + "epoch": 0.22154684552504716, + "grad_norm": 0.17828944325447083, + "learning_rate": 9.071656103839419e-06, + "loss": 0.0683, + "step": 1057 + }, + { + "epoch": 0.2217564451896877, + "grad_norm": 0.13577091693878174, + "learning_rate": 9.069684794027401e-06, + "loss": 0.0659, + "step": 1058 + }, + { + "epoch": 0.22196604485432822, + "grad_norm": 0.1218876987695694, + "learning_rate": 9.067711608091536e-06, + "loss": 0.0655, + "step": 1059 + }, + { + "epoch": 0.22217564451896876, + "grad_norm": 0.14669421315193176, + "learning_rate": 9.065736546941467e-06, + "loss": 0.0669, + "step": 1060 + }, + { + "epoch": 0.2223852441836093, + "grad_norm": 0.12597538530826569, + "learning_rate": 9.063759611487693e-06, + "loss": 0.0673, + "step": 1061 + }, + { + "epoch": 0.22259484384824985, + "grad_norm": 0.13924571871757507, + "learning_rate": 9.061780802641582e-06, + "loss": 0.0628, + "step": 1062 + }, + { + "epoch": 0.2228044435128904, + "grad_norm": 0.14818163216114044, + "learning_rate": 9.059800121315365e-06, + "loss": 0.0656, + "step": 1063 + }, + { + "epoch": 0.22301404317753093, + "grad_norm": 0.13732635974884033, + "learning_rate": 9.057817568422135e-06, + "loss": 0.0666, + "step": 1064 + }, + { + "epoch": 0.22322364284217144, + "grad_norm": 0.1377662718296051, + "learning_rate": 9.05583314487585e-06, + "loss": 0.0695, + "step": 1065 + }, + { + "epoch": 0.22343324250681199, + "grad_norm": 0.14314699172973633, + "learning_rate": 9.053846851591328e-06, + "loss": 0.0689, + "step": 1066 + }, + { + "epoch": 0.22364284217145253, + "grad_norm": 0.16072338819503784, + "learning_rate": 9.05185868948425e-06, + "loss": 0.0672, + "step": 1067 + }, + { + "epoch": 0.22385244183609307, + "grad_norm": 0.1533009558916092, + "learning_rate": 9.049868659471156e-06, + "loss": 0.0703, + "step": 1068 + }, + { + "epoch": 0.2240620415007336, + "grad_norm": 0.16292600333690643, + "learning_rate": 9.047876762469451e-06, + "loss": 0.0701, + "step": 1069 + }, + { + "epoch": 0.22427164116537412, + "grad_norm": 0.19941456615924835, + "learning_rate": 9.0458829993974e-06, + "loss": 0.0683, + "step": 1070 + }, + { + "epoch": 0.22448124083001467, + "grad_norm": 0.22352056205272675, + "learning_rate": 9.043887371174128e-06, + "loss": 0.0661, + "step": 1071 + }, + { + "epoch": 0.2246908404946552, + "grad_norm": 0.24914206564426422, + "learning_rate": 9.041889878719617e-06, + "loss": 0.0674, + "step": 1072 + }, + { + "epoch": 0.22490044015929575, + "grad_norm": 0.21870644390583038, + "learning_rate": 9.03989052295471e-06, + "loss": 0.0669, + "step": 1073 + }, + { + "epoch": 0.2251100398239363, + "grad_norm": 0.16291193664073944, + "learning_rate": 9.037889304801112e-06, + "loss": 0.0678, + "step": 1074 + }, + { + "epoch": 0.2253196394885768, + "grad_norm": 0.12002480030059814, + "learning_rate": 9.035886225181384e-06, + "loss": 0.0663, + "step": 1075 + }, + { + "epoch": 0.22552923915321735, + "grad_norm": 0.12656140327453613, + "learning_rate": 9.033881285018945e-06, + "loss": 0.0649, + "step": 1076 + }, + { + "epoch": 0.2257388388178579, + "grad_norm": 0.17447209358215332, + "learning_rate": 9.031874485238068e-06, + "loss": 0.0653, + "step": 1077 + }, + { + "epoch": 0.22594843848249843, + "grad_norm": 0.1942858248949051, + "learning_rate": 9.029865826763895e-06, + "loss": 0.068, + "step": 1078 + }, + { + "epoch": 0.22615803814713897, + "grad_norm": 0.16547489166259766, + "learning_rate": 9.027855310522411e-06, + "loss": 0.0645, + "step": 1079 + }, + { + "epoch": 0.2263676378117795, + "grad_norm": 0.1284940242767334, + "learning_rate": 9.025842937440466e-06, + "loss": 0.0648, + "step": 1080 + }, + { + "epoch": 0.22657723747642003, + "grad_norm": 0.12198358029127121, + "learning_rate": 9.023828708445762e-06, + "loss": 0.0707, + "step": 1081 + }, + { + "epoch": 0.22678683714106057, + "grad_norm": 0.13251115381717682, + "learning_rate": 9.02181262446686e-06, + "loss": 0.069, + "step": 1082 + }, + { + "epoch": 0.2269964368057011, + "grad_norm": 0.13787177205085754, + "learning_rate": 9.019794686433174e-06, + "loss": 0.0696, + "step": 1083 + }, + { + "epoch": 0.22720603647034165, + "grad_norm": 0.12242651730775833, + "learning_rate": 9.017774895274971e-06, + "loss": 0.066, + "step": 1084 + }, + { + "epoch": 0.2274156361349822, + "grad_norm": 0.11319856345653534, + "learning_rate": 9.015753251923378e-06, + "loss": 0.0644, + "step": 1085 + }, + { + "epoch": 0.2276252357996227, + "grad_norm": 0.11255019158124924, + "learning_rate": 9.013729757310368e-06, + "loss": 0.0681, + "step": 1086 + }, + { + "epoch": 0.22783483546426325, + "grad_norm": 0.10680454969406128, + "learning_rate": 9.011704412368776e-06, + "loss": 0.0675, + "step": 1087 + }, + { + "epoch": 0.2280444351289038, + "grad_norm": 0.10743577778339386, + "learning_rate": 9.00967721803228e-06, + "loss": 0.0701, + "step": 1088 + }, + { + "epoch": 0.22825403479354434, + "grad_norm": 0.09966851025819778, + "learning_rate": 9.007648175235421e-06, + "loss": 0.0657, + "step": 1089 + }, + { + "epoch": 0.22846363445818488, + "grad_norm": 0.10842377692461014, + "learning_rate": 9.005617284913586e-06, + "loss": 0.0646, + "step": 1090 + }, + { + "epoch": 0.2286732341228254, + "grad_norm": 0.12540468573570251, + "learning_rate": 9.003584548003015e-06, + "loss": 0.0661, + "step": 1091 + }, + { + "epoch": 0.22888283378746593, + "grad_norm": 0.13884596526622772, + "learning_rate": 9.001549965440798e-06, + "loss": 0.0704, + "step": 1092 + }, + { + "epoch": 0.22909243345210648, + "grad_norm": 0.16879726946353912, + "learning_rate": 8.99951353816488e-06, + "loss": 0.0625, + "step": 1093 + }, + { + "epoch": 0.22930203311674702, + "grad_norm": 0.19034424424171448, + "learning_rate": 8.99747526711405e-06, + "loss": 0.0672, + "step": 1094 + }, + { + "epoch": 0.22951163278138756, + "grad_norm": 0.18537910282611847, + "learning_rate": 8.995435153227951e-06, + "loss": 0.0687, + "step": 1095 + }, + { + "epoch": 0.22972123244602807, + "grad_norm": 0.15541553497314453, + "learning_rate": 8.993393197447078e-06, + "loss": 0.0692, + "step": 1096 + }, + { + "epoch": 0.22993083211066861, + "grad_norm": 0.12911075353622437, + "learning_rate": 8.991349400712772e-06, + "loss": 0.0684, + "step": 1097 + }, + { + "epoch": 0.23014043177530916, + "grad_norm": 0.12497539818286896, + "learning_rate": 8.989303763967218e-06, + "loss": 0.0694, + "step": 1098 + }, + { + "epoch": 0.2303500314399497, + "grad_norm": 0.12200130522251129, + "learning_rate": 8.98725628815346e-06, + "loss": 0.0662, + "step": 1099 + }, + { + "epoch": 0.23055963110459024, + "grad_norm": 0.14163267612457275, + "learning_rate": 8.985206974215381e-06, + "loss": 0.0688, + "step": 1100 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 0.18513454496860504, + "learning_rate": 8.983155823097713e-06, + "loss": 0.0702, + "step": 1101 + }, + { + "epoch": 0.2309788304338713, + "grad_norm": 0.20042511820793152, + "learning_rate": 8.98110283574604e-06, + "loss": 0.0672, + "step": 1102 + }, + { + "epoch": 0.23118843009851184, + "grad_norm": 0.1693289577960968, + "learning_rate": 8.979048013106786e-06, + "loss": 0.0691, + "step": 1103 + }, + { + "epoch": 0.23139802976315238, + "grad_norm": 0.1325998604297638, + "learning_rate": 8.976991356127225e-06, + "loss": 0.0681, + "step": 1104 + }, + { + "epoch": 0.23160762942779292, + "grad_norm": 0.14559799432754517, + "learning_rate": 8.974932865755472e-06, + "loss": 0.069, + "step": 1105 + }, + { + "epoch": 0.23181722909243346, + "grad_norm": 0.1867041438817978, + "learning_rate": 8.972872542940496e-06, + "loss": 0.0691, + "step": 1106 + }, + { + "epoch": 0.23202682875707398, + "grad_norm": 0.18667063117027283, + "learning_rate": 8.970810388632102e-06, + "loss": 0.0693, + "step": 1107 + }, + { + "epoch": 0.23223642842171452, + "grad_norm": 0.149042010307312, + "learning_rate": 8.968746403780945e-06, + "loss": 0.0651, + "step": 1108 + }, + { + "epoch": 0.23244602808635506, + "grad_norm": 0.11694446206092834, + "learning_rate": 8.96668058933852e-06, + "loss": 0.0658, + "step": 1109 + }, + { + "epoch": 0.2326556277509956, + "grad_norm": 0.12780682742595673, + "learning_rate": 8.964612946257167e-06, + "loss": 0.0652, + "step": 1110 + }, + { + "epoch": 0.23286522741563614, + "grad_norm": 0.17143504321575165, + "learning_rate": 8.962543475490068e-06, + "loss": 0.0654, + "step": 1111 + }, + { + "epoch": 0.23307482708027666, + "grad_norm": 0.1847982108592987, + "learning_rate": 8.960472177991252e-06, + "loss": 0.0667, + "step": 1112 + }, + { + "epoch": 0.2332844267449172, + "grad_norm": 0.1703869104385376, + "learning_rate": 8.958399054715583e-06, + "loss": 0.0655, + "step": 1113 + }, + { + "epoch": 0.23349402640955774, + "grad_norm": 0.14903445541858673, + "learning_rate": 8.956324106618773e-06, + "loss": 0.066, + "step": 1114 + }, + { + "epoch": 0.23370362607419828, + "grad_norm": 0.13800345361232758, + "learning_rate": 8.954247334657371e-06, + "loss": 0.0662, + "step": 1115 + }, + { + "epoch": 0.23391322573883883, + "grad_norm": 0.14202521741390228, + "learning_rate": 8.952168739788769e-06, + "loss": 0.0662, + "step": 1116 + }, + { + "epoch": 0.23412282540347937, + "grad_norm": 0.1242147833108902, + "learning_rate": 8.9500883229712e-06, + "loss": 0.0659, + "step": 1117 + }, + { + "epoch": 0.23433242506811988, + "grad_norm": 0.13212472200393677, + "learning_rate": 8.948006085163735e-06, + "loss": 0.0659, + "step": 1118 + }, + { + "epoch": 0.23454202473276042, + "grad_norm": 0.13743017613887787, + "learning_rate": 8.945922027326283e-06, + "loss": 0.0653, + "step": 1119 + }, + { + "epoch": 0.23475162439740097, + "grad_norm": 0.12877334654331207, + "learning_rate": 8.943836150419596e-06, + "loss": 0.0652, + "step": 1120 + }, + { + "epoch": 0.2349612240620415, + "grad_norm": 0.13227277994155884, + "learning_rate": 8.941748455405264e-06, + "loss": 0.0677, + "step": 1121 + }, + { + "epoch": 0.23517082372668205, + "grad_norm": 0.1250903308391571, + "learning_rate": 8.939658943245712e-06, + "loss": 0.0655, + "step": 1122 + }, + { + "epoch": 0.23538042339132256, + "grad_norm": 0.10851810872554779, + "learning_rate": 8.937567614904205e-06, + "loss": 0.0631, + "step": 1123 + }, + { + "epoch": 0.2355900230559631, + "grad_norm": 0.13291695713996887, + "learning_rate": 8.935474471344848e-06, + "loss": 0.0655, + "step": 1124 + }, + { + "epoch": 0.23579962272060365, + "grad_norm": 0.16525736451148987, + "learning_rate": 8.933379513532575e-06, + "loss": 0.0637, + "step": 1125 + }, + { + "epoch": 0.2360092223852442, + "grad_norm": 0.20342209935188293, + "learning_rate": 8.931282742433163e-06, + "loss": 0.066, + "step": 1126 + }, + { + "epoch": 0.23621882204988473, + "grad_norm": 0.24097025394439697, + "learning_rate": 8.929184159013225e-06, + "loss": 0.0687, + "step": 1127 + }, + { + "epoch": 0.23642842171452524, + "grad_norm": 0.23975655436515808, + "learning_rate": 8.927083764240205e-06, + "loss": 0.0708, + "step": 1128 + }, + { + "epoch": 0.23663802137916579, + "grad_norm": 0.20488587021827698, + "learning_rate": 8.924981559082386e-06, + "loss": 0.0659, + "step": 1129 + }, + { + "epoch": 0.23684762104380633, + "grad_norm": 0.16722434759140015, + "learning_rate": 8.922877544508882e-06, + "loss": 0.0665, + "step": 1130 + }, + { + "epoch": 0.23705722070844687, + "grad_norm": 0.15572957694530487, + "learning_rate": 8.920771721489646e-06, + "loss": 0.0673, + "step": 1131 + }, + { + "epoch": 0.2372668203730874, + "grad_norm": 0.13812050223350525, + "learning_rate": 8.91866409099546e-06, + "loss": 0.0702, + "step": 1132 + }, + { + "epoch": 0.23747642003772795, + "grad_norm": 0.12852191925048828, + "learning_rate": 8.916554653997943e-06, + "loss": 0.0691, + "step": 1133 + }, + { + "epoch": 0.23768601970236847, + "grad_norm": 0.14980794489383698, + "learning_rate": 8.914443411469544e-06, + "loss": 0.0705, + "step": 1134 + }, + { + "epoch": 0.237895619367009, + "grad_norm": 0.15687119960784912, + "learning_rate": 8.912330364383546e-06, + "loss": 0.0716, + "step": 1135 + }, + { + "epoch": 0.23810521903164955, + "grad_norm": 0.13968421518802643, + "learning_rate": 8.91021551371406e-06, + "loss": 0.0679, + "step": 1136 + }, + { + "epoch": 0.2383148186962901, + "grad_norm": 0.1380040943622589, + "learning_rate": 8.908098860436036e-06, + "loss": 0.0624, + "step": 1137 + }, + { + "epoch": 0.23852441836093063, + "grad_norm": 0.13776816427707672, + "learning_rate": 8.90598040552525e-06, + "loss": 0.0655, + "step": 1138 + }, + { + "epoch": 0.23873401802557115, + "grad_norm": 0.11478232592344284, + "learning_rate": 8.903860149958308e-06, + "loss": 0.0681, + "step": 1139 + }, + { + "epoch": 0.2389436176902117, + "grad_norm": 0.1092422679066658, + "learning_rate": 8.901738094712648e-06, + "loss": 0.0639, + "step": 1140 + }, + { + "epoch": 0.23915321735485223, + "grad_norm": 0.11861050873994827, + "learning_rate": 8.899614240766537e-06, + "loss": 0.0646, + "step": 1141 + }, + { + "epoch": 0.23936281701949277, + "grad_norm": 0.11883353441953659, + "learning_rate": 8.89748858909907e-06, + "loss": 0.0654, + "step": 1142 + }, + { + "epoch": 0.23957241668413332, + "grad_norm": 0.11742465198040009, + "learning_rate": 8.895361140690173e-06, + "loss": 0.0655, + "step": 1143 + }, + { + "epoch": 0.23978201634877383, + "grad_norm": 0.1311412900686264, + "learning_rate": 8.8932318965206e-06, + "loss": 0.0668, + "step": 1144 + }, + { + "epoch": 0.23999161601341437, + "grad_norm": 0.16970883309841156, + "learning_rate": 8.89110085757193e-06, + "loss": 0.0681, + "step": 1145 + }, + { + "epoch": 0.2402012156780549, + "grad_norm": 0.18959131836891174, + "learning_rate": 8.888968024826575e-06, + "loss": 0.0645, + "step": 1146 + }, + { + "epoch": 0.24041081534269546, + "grad_norm": 0.17889679968357086, + "learning_rate": 8.886833399267767e-06, + "loss": 0.0666, + "step": 1147 + }, + { + "epoch": 0.240620415007336, + "grad_norm": 0.1588187962770462, + "learning_rate": 8.88469698187957e-06, + "loss": 0.0651, + "step": 1148 + }, + { + "epoch": 0.2408300146719765, + "grad_norm": 0.13770383596420288, + "learning_rate": 8.88255877364687e-06, + "loss": 0.0638, + "step": 1149 + }, + { + "epoch": 0.24103961433661705, + "grad_norm": 0.12436515837907791, + "learning_rate": 8.880418775555382e-06, + "loss": 0.0687, + "step": 1150 + }, + { + "epoch": 0.2412492140012576, + "grad_norm": 0.13518837094306946, + "learning_rate": 8.878276988591645e-06, + "loss": 0.071, + "step": 1151 + }, + { + "epoch": 0.24145881366589814, + "grad_norm": 0.14451143145561218, + "learning_rate": 8.876133413743023e-06, + "loss": 0.0668, + "step": 1152 + }, + { + "epoch": 0.24166841333053868, + "grad_norm": 0.13057959079742432, + "learning_rate": 8.873988051997702e-06, + "loss": 0.066, + "step": 1153 + }, + { + "epoch": 0.24187801299517922, + "grad_norm": 0.11131347715854645, + "learning_rate": 8.871840904344692e-06, + "loss": 0.0641, + "step": 1154 + }, + { + "epoch": 0.24208761265981973, + "grad_norm": 0.116900235414505, + "learning_rate": 8.86969197177383e-06, + "loss": 0.0639, + "step": 1155 + }, + { + "epoch": 0.24229721232446028, + "grad_norm": 0.12555626034736633, + "learning_rate": 8.867541255275774e-06, + "loss": 0.0681, + "step": 1156 + }, + { + "epoch": 0.24250681198910082, + "grad_norm": 0.12001737207174301, + "learning_rate": 8.865388755842002e-06, + "loss": 0.0634, + "step": 1157 + }, + { + "epoch": 0.24271641165374136, + "grad_norm": 0.12198159843683243, + "learning_rate": 8.863234474464817e-06, + "loss": 0.0657, + "step": 1158 + }, + { + "epoch": 0.2429260113183819, + "grad_norm": 0.14027060568332672, + "learning_rate": 8.86107841213734e-06, + "loss": 0.067, + "step": 1159 + }, + { + "epoch": 0.24313561098302242, + "grad_norm": 0.1360728144645691, + "learning_rate": 8.85892056985352e-06, + "loss": 0.0653, + "step": 1160 + }, + { + "epoch": 0.24334521064766296, + "grad_norm": 0.11649616807699203, + "learning_rate": 8.856760948608117e-06, + "loss": 0.0665, + "step": 1161 + }, + { + "epoch": 0.2435548103123035, + "grad_norm": 0.11606436967849731, + "learning_rate": 8.854599549396717e-06, + "loss": 0.0644, + "step": 1162 + }, + { + "epoch": 0.24376440997694404, + "grad_norm": 0.13204872608184814, + "learning_rate": 8.852436373215727e-06, + "loss": 0.0654, + "step": 1163 + }, + { + "epoch": 0.24397400964158458, + "grad_norm": 0.1417747139930725, + "learning_rate": 8.850271421062368e-06, + "loss": 0.0639, + "step": 1164 + }, + { + "epoch": 0.2441836093062251, + "grad_norm": 0.1318253129720688, + "learning_rate": 8.848104693934683e-06, + "loss": 0.065, + "step": 1165 + }, + { + "epoch": 0.24439320897086564, + "grad_norm": 0.11403854191303253, + "learning_rate": 8.845936192831536e-06, + "loss": 0.065, + "step": 1166 + }, + { + "epoch": 0.24460280863550618, + "grad_norm": 0.11362165957689285, + "learning_rate": 8.8437659187526e-06, + "loss": 0.063, + "step": 1167 + }, + { + "epoch": 0.24481240830014672, + "grad_norm": 0.11161333322525024, + "learning_rate": 8.841593872698377e-06, + "loss": 0.0663, + "step": 1168 + }, + { + "epoch": 0.24502200796478726, + "grad_norm": 0.11067195981740952, + "learning_rate": 8.839420055670175e-06, + "loss": 0.0646, + "step": 1169 + }, + { + "epoch": 0.2452316076294278, + "grad_norm": 0.13394485414028168, + "learning_rate": 8.837244468670126e-06, + "loss": 0.0637, + "step": 1170 + }, + { + "epoch": 0.24544120729406832, + "grad_norm": 0.1596195250749588, + "learning_rate": 8.835067112701172e-06, + "loss": 0.0679, + "step": 1171 + }, + { + "epoch": 0.24565080695870886, + "grad_norm": 0.1561581790447235, + "learning_rate": 8.832887988767076e-06, + "loss": 0.066, + "step": 1172 + }, + { + "epoch": 0.2458604066233494, + "grad_norm": 0.15718278288841248, + "learning_rate": 8.830707097872413e-06, + "loss": 0.0637, + "step": 1173 + }, + { + "epoch": 0.24607000628798995, + "grad_norm": 0.18930961191654205, + "learning_rate": 8.828524441022575e-06, + "loss": 0.0649, + "step": 1174 + }, + { + "epoch": 0.2462796059526305, + "grad_norm": 0.20854079723358154, + "learning_rate": 8.826340019223765e-06, + "loss": 0.0711, + "step": 1175 + }, + { + "epoch": 0.246489205617271, + "grad_norm": 0.17393162846565247, + "learning_rate": 8.824153833483001e-06, + "loss": 0.0639, + "step": 1176 + }, + { + "epoch": 0.24669880528191154, + "grad_norm": 0.12246709316968918, + "learning_rate": 8.821965884808112e-06, + "loss": 0.0641, + "step": 1177 + }, + { + "epoch": 0.24690840494655208, + "grad_norm": 0.11515267938375473, + "learning_rate": 8.819776174207746e-06, + "loss": 0.0629, + "step": 1178 + }, + { + "epoch": 0.24711800461119263, + "grad_norm": 0.14278475940227509, + "learning_rate": 8.817584702691358e-06, + "loss": 0.0679, + "step": 1179 + }, + { + "epoch": 0.24732760427583317, + "grad_norm": 0.17082667350769043, + "learning_rate": 8.815391471269212e-06, + "loss": 0.0655, + "step": 1180 + }, + { + "epoch": 0.24753720394047368, + "grad_norm": 0.17258518934249878, + "learning_rate": 8.813196480952393e-06, + "loss": 0.0674, + "step": 1181 + }, + { + "epoch": 0.24774680360511422, + "grad_norm": 0.15490064024925232, + "learning_rate": 8.810999732752788e-06, + "loss": 0.0649, + "step": 1182 + }, + { + "epoch": 0.24795640326975477, + "grad_norm": 0.1470610648393631, + "learning_rate": 8.808801227683095e-06, + "loss": 0.0647, + "step": 1183 + }, + { + "epoch": 0.2481660029343953, + "grad_norm": 0.14642740786075592, + "learning_rate": 8.80660096675683e-06, + "loss": 0.066, + "step": 1184 + }, + { + "epoch": 0.24837560259903585, + "grad_norm": 0.13396520912647247, + "learning_rate": 8.80439895098831e-06, + "loss": 0.0664, + "step": 1185 + }, + { + "epoch": 0.2485852022636764, + "grad_norm": 0.12936848402023315, + "learning_rate": 8.802195181392663e-06, + "loss": 0.0659, + "step": 1186 + }, + { + "epoch": 0.2487948019283169, + "grad_norm": 0.1439359188079834, + "learning_rate": 8.799989658985828e-06, + "loss": 0.0656, + "step": 1187 + }, + { + "epoch": 0.24900440159295745, + "grad_norm": 0.14836929738521576, + "learning_rate": 8.797782384784549e-06, + "loss": 0.0661, + "step": 1188 + }, + { + "epoch": 0.249214001257598, + "grad_norm": 0.12386839091777802, + "learning_rate": 8.79557335980638e-06, + "loss": 0.0694, + "step": 1189 + }, + { + "epoch": 0.24942360092223853, + "grad_norm": 0.1206989586353302, + "learning_rate": 8.793362585069677e-06, + "loss": 0.0644, + "step": 1190 + }, + { + "epoch": 0.24963320058687907, + "grad_norm": 0.13306036591529846, + "learning_rate": 8.791150061593615e-06, + "loss": 0.066, + "step": 1191 + }, + { + "epoch": 0.2498428002515196, + "grad_norm": 0.12486235052347183, + "learning_rate": 8.78893579039816e-06, + "loss": 0.0662, + "step": 1192 + }, + { + "epoch": 0.25005239991616013, + "grad_norm": 0.12596681714057922, + "learning_rate": 8.786719772504092e-06, + "loss": 0.0653, + "step": 1193 + }, + { + "epoch": 0.2502619995808007, + "grad_norm": 0.1434643715620041, + "learning_rate": 8.784502008932998e-06, + "loss": 0.0651, + "step": 1194 + }, + { + "epoch": 0.2504715992454412, + "grad_norm": 0.13905848562717438, + "learning_rate": 8.782282500707262e-06, + "loss": 0.0611, + "step": 1195 + }, + { + "epoch": 0.2506811989100817, + "grad_norm": 0.14193584024906158, + "learning_rate": 8.780061248850078e-06, + "loss": 0.064, + "step": 1196 + }, + { + "epoch": 0.2508907985747223, + "grad_norm": 0.15785372257232666, + "learning_rate": 8.777838254385444e-06, + "loss": 0.065, + "step": 1197 + }, + { + "epoch": 0.2511003982393628, + "grad_norm": 0.16986316442489624, + "learning_rate": 8.775613518338161e-06, + "loss": 0.0641, + "step": 1198 + }, + { + "epoch": 0.2513099979040034, + "grad_norm": 0.16595768928527832, + "learning_rate": 8.773387041733829e-06, + "loss": 0.0638, + "step": 1199 + }, + { + "epoch": 0.2515195975686439, + "grad_norm": 0.15224842727184296, + "learning_rate": 8.771158825598855e-06, + "loss": 0.0669, + "step": 1200 + }, + { + "epoch": 0.2517291972332844, + "grad_norm": 0.14295248687267303, + "learning_rate": 8.768928870960447e-06, + "loss": 0.0642, + "step": 1201 + }, + { + "epoch": 0.251938796897925, + "grad_norm": 0.1518334299325943, + "learning_rate": 8.766697178846611e-06, + "loss": 0.0665, + "step": 1202 + }, + { + "epoch": 0.2521483965625655, + "grad_norm": 0.15003719925880432, + "learning_rate": 8.764463750286158e-06, + "loss": 0.0665, + "step": 1203 + }, + { + "epoch": 0.25235799622720606, + "grad_norm": 0.13841992616653442, + "learning_rate": 8.762228586308697e-06, + "loss": 0.0639, + "step": 1204 + }, + { + "epoch": 0.2525675958918466, + "grad_norm": 0.13648587465286255, + "learning_rate": 8.75999168794464e-06, + "loss": 0.0656, + "step": 1205 + }, + { + "epoch": 0.2527771955564871, + "grad_norm": 0.1389138102531433, + "learning_rate": 8.757753056225197e-06, + "loss": 0.0641, + "step": 1206 + }, + { + "epoch": 0.25298679522112766, + "grad_norm": 0.15803200006484985, + "learning_rate": 8.755512692182376e-06, + "loss": 0.0641, + "step": 1207 + }, + { + "epoch": 0.2531963948857682, + "grad_norm": 0.17170608043670654, + "learning_rate": 8.753270596848982e-06, + "loss": 0.0642, + "step": 1208 + }, + { + "epoch": 0.25340599455040874, + "grad_norm": 0.1522701382637024, + "learning_rate": 8.751026771258622e-06, + "loss": 0.0679, + "step": 1209 + }, + { + "epoch": 0.25361559421504926, + "grad_norm": 0.137386754155159, + "learning_rate": 8.748781216445702e-06, + "loss": 0.0632, + "step": 1210 + }, + { + "epoch": 0.25382519387968977, + "grad_norm": 0.15736961364746094, + "learning_rate": 8.746533933445418e-06, + "loss": 0.0675, + "step": 1211 + }, + { + "epoch": 0.25403479354433034, + "grad_norm": 0.15517348051071167, + "learning_rate": 8.74428492329377e-06, + "loss": 0.0628, + "step": 1212 + }, + { + "epoch": 0.25424439320897085, + "grad_norm": 0.14377760887145996, + "learning_rate": 8.74203418702755e-06, + "loss": 0.0655, + "step": 1213 + }, + { + "epoch": 0.2544539928736114, + "grad_norm": 0.13686007261276245, + "learning_rate": 8.739781725684346e-06, + "loss": 0.0634, + "step": 1214 + }, + { + "epoch": 0.25466359253825194, + "grad_norm": 0.1474810093641281, + "learning_rate": 8.737527540302543e-06, + "loss": 0.0695, + "step": 1215 + }, + { + "epoch": 0.25487319220289245, + "grad_norm": 0.16937799751758575, + "learning_rate": 8.735271631921322e-06, + "loss": 0.0691, + "step": 1216 + }, + { + "epoch": 0.255082791867533, + "grad_norm": 0.1835523098707199, + "learning_rate": 8.733014001580656e-06, + "loss": 0.0642, + "step": 1217 + }, + { + "epoch": 0.25529239153217353, + "grad_norm": 0.1800583302974701, + "learning_rate": 8.730754650321307e-06, + "loss": 0.0675, + "step": 1218 + }, + { + "epoch": 0.2555019911968141, + "grad_norm": 0.14136351644992828, + "learning_rate": 8.728493579184841e-06, + "loss": 0.0669, + "step": 1219 + }, + { + "epoch": 0.2557115908614546, + "grad_norm": 0.1205543577671051, + "learning_rate": 8.72623078921361e-06, + "loss": 0.0643, + "step": 1220 + }, + { + "epoch": 0.25592119052609513, + "grad_norm": 0.13225074112415314, + "learning_rate": 8.723966281450758e-06, + "loss": 0.0618, + "step": 1221 + }, + { + "epoch": 0.2561307901907357, + "grad_norm": 0.18301644921302795, + "learning_rate": 8.721700056940224e-06, + "loss": 0.0663, + "step": 1222 + }, + { + "epoch": 0.2563403898553762, + "grad_norm": 0.1921045482158661, + "learning_rate": 8.719432116726738e-06, + "loss": 0.0687, + "step": 1223 + }, + { + "epoch": 0.2565499895200168, + "grad_norm": 0.15844593942165375, + "learning_rate": 8.717162461855817e-06, + "loss": 0.0646, + "step": 1224 + }, + { + "epoch": 0.2567595891846573, + "grad_norm": 0.16561459004878998, + "learning_rate": 8.714891093373774e-06, + "loss": 0.0662, + "step": 1225 + }, + { + "epoch": 0.2569691888492978, + "grad_norm": 0.1608954221010208, + "learning_rate": 8.712618012327709e-06, + "loss": 0.0681, + "step": 1226 + }, + { + "epoch": 0.2571787885139384, + "grad_norm": 0.1320895552635193, + "learning_rate": 8.710343219765512e-06, + "loss": 0.0708, + "step": 1227 + }, + { + "epoch": 0.2573883881785789, + "grad_norm": 0.12631681561470032, + "learning_rate": 8.70806671673586e-06, + "loss": 0.065, + "step": 1228 + }, + { + "epoch": 0.25759798784321947, + "grad_norm": 0.1299394816160202, + "learning_rate": 8.705788504288222e-06, + "loss": 0.0662, + "step": 1229 + }, + { + "epoch": 0.25780758750786, + "grad_norm": 0.14543578028678894, + "learning_rate": 8.703508583472855e-06, + "loss": 0.0628, + "step": 1230 + }, + { + "epoch": 0.25801718717250055, + "grad_norm": 0.1560242772102356, + "learning_rate": 8.701226955340797e-06, + "loss": 0.063, + "step": 1231 + }, + { + "epoch": 0.25822678683714106, + "grad_norm": 0.15224745869636536, + "learning_rate": 8.698943620943885e-06, + "loss": 0.0642, + "step": 1232 + }, + { + "epoch": 0.2584363865017816, + "grad_norm": 0.17773905396461487, + "learning_rate": 8.696658581334728e-06, + "loss": 0.0693, + "step": 1233 + }, + { + "epoch": 0.25864598616642215, + "grad_norm": 0.18498320877552032, + "learning_rate": 8.694371837566737e-06, + "loss": 0.0649, + "step": 1234 + }, + { + "epoch": 0.25885558583106266, + "grad_norm": 0.15482178330421448, + "learning_rate": 8.692083390694095e-06, + "loss": 0.0639, + "step": 1235 + }, + { + "epoch": 0.25906518549570323, + "grad_norm": 0.1407817006111145, + "learning_rate": 8.689793241771775e-06, + "loss": 0.0639, + "step": 1236 + }, + { + "epoch": 0.25927478516034375, + "grad_norm": 0.11920575052499771, + "learning_rate": 8.68750139185554e-06, + "loss": 0.0641, + "step": 1237 + }, + { + "epoch": 0.25948438482498426, + "grad_norm": 0.12093566358089447, + "learning_rate": 8.685207842001928e-06, + "loss": 0.0656, + "step": 1238 + }, + { + "epoch": 0.25969398448962483, + "grad_norm": 0.14436878263950348, + "learning_rate": 8.682912593268265e-06, + "loss": 0.0635, + "step": 1239 + }, + { + "epoch": 0.25990358415426534, + "grad_norm": 0.17225278913974762, + "learning_rate": 8.680615646712663e-06, + "loss": 0.0664, + "step": 1240 + }, + { + "epoch": 0.2601131838189059, + "grad_norm": 0.16033266484737396, + "learning_rate": 8.678317003394013e-06, + "loss": 0.0665, + "step": 1241 + }, + { + "epoch": 0.2603227834835464, + "grad_norm": 0.13092049956321716, + "learning_rate": 8.676016664371986e-06, + "loss": 0.0654, + "step": 1242 + }, + { + "epoch": 0.26053238314818694, + "grad_norm": 0.12013121694326401, + "learning_rate": 8.673714630707043e-06, + "loss": 0.0668, + "step": 1243 + }, + { + "epoch": 0.2607419828128275, + "grad_norm": 0.10177928954362869, + "learning_rate": 8.671410903460416e-06, + "loss": 0.0633, + "step": 1244 + }, + { + "epoch": 0.260951582477468, + "grad_norm": 0.11560986191034317, + "learning_rate": 8.669105483694126e-06, + "loss": 0.0653, + "step": 1245 + }, + { + "epoch": 0.2611611821421086, + "grad_norm": 0.1309051811695099, + "learning_rate": 8.666798372470971e-06, + "loss": 0.0649, + "step": 1246 + }, + { + "epoch": 0.2613707818067491, + "grad_norm": 0.13534289598464966, + "learning_rate": 8.664489570854526e-06, + "loss": 0.0646, + "step": 1247 + }, + { + "epoch": 0.2615803814713896, + "grad_norm": 0.13940168917179108, + "learning_rate": 8.662179079909152e-06, + "loss": 0.0652, + "step": 1248 + }, + { + "epoch": 0.2617899811360302, + "grad_norm": 0.16176725924015045, + "learning_rate": 8.659866900699983e-06, + "loss": 0.0652, + "step": 1249 + }, + { + "epoch": 0.2619995808006707, + "grad_norm": 0.18724969029426575, + "learning_rate": 8.657553034292932e-06, + "loss": 0.0638, + "step": 1250 + }, + { + "epoch": 0.2622091804653113, + "grad_norm": 0.198954775929451, + "learning_rate": 8.65523748175469e-06, + "loss": 0.0639, + "step": 1251 + }, + { + "epoch": 0.2624187801299518, + "grad_norm": 0.17343983054161072, + "learning_rate": 8.652920244152732e-06, + "loss": 0.0675, + "step": 1252 + }, + { + "epoch": 0.2626283797945923, + "grad_norm": 0.12653297185897827, + "learning_rate": 8.650601322555299e-06, + "loss": 0.0635, + "step": 1253 + }, + { + "epoch": 0.2628379794592329, + "grad_norm": 0.16065514087677002, + "learning_rate": 8.648280718031412e-06, + "loss": 0.0663, + "step": 1254 + }, + { + "epoch": 0.2630475791238734, + "grad_norm": 0.17681656777858734, + "learning_rate": 8.645958431650875e-06, + "loss": 0.0636, + "step": 1255 + }, + { + "epoch": 0.26325717878851396, + "grad_norm": 0.13592669367790222, + "learning_rate": 8.643634464484257e-06, + "loss": 0.0646, + "step": 1256 + }, + { + "epoch": 0.26346677845315447, + "grad_norm": 0.1315130740404129, + "learning_rate": 8.641308817602908e-06, + "loss": 0.0654, + "step": 1257 + }, + { + "epoch": 0.263676378117795, + "grad_norm": 0.1273263692855835, + "learning_rate": 8.63898149207895e-06, + "loss": 0.0675, + "step": 1258 + }, + { + "epoch": 0.26388597778243555, + "grad_norm": 0.10599275678396225, + "learning_rate": 8.636652488985282e-06, + "loss": 0.0647, + "step": 1259 + }, + { + "epoch": 0.26409557744707607, + "grad_norm": 0.11827687919139862, + "learning_rate": 8.634321809395569e-06, + "loss": 0.0642, + "step": 1260 + }, + { + "epoch": 0.26430517711171664, + "grad_norm": 0.14330358803272247, + "learning_rate": 8.631989454384258e-06, + "loss": 0.063, + "step": 1261 + }, + { + "epoch": 0.26451477677635715, + "grad_norm": 0.13524498045444489, + "learning_rate": 8.629655425026564e-06, + "loss": 0.0629, + "step": 1262 + }, + { + "epoch": 0.2647243764409977, + "grad_norm": 0.12704388797283173, + "learning_rate": 8.627319722398471e-06, + "loss": 0.062, + "step": 1263 + }, + { + "epoch": 0.26493397610563824, + "grad_norm": 0.13736125826835632, + "learning_rate": 8.624982347576741e-06, + "loss": 0.0634, + "step": 1264 + }, + { + "epoch": 0.26514357577027875, + "grad_norm": 0.12946844100952148, + "learning_rate": 8.622643301638902e-06, + "loss": 0.0662, + "step": 1265 + }, + { + "epoch": 0.2653531754349193, + "grad_norm": 0.1286906898021698, + "learning_rate": 8.620302585663252e-06, + "loss": 0.0663, + "step": 1266 + }, + { + "epoch": 0.26556277509955983, + "grad_norm": 0.12868361175060272, + "learning_rate": 8.617960200728863e-06, + "loss": 0.0593, + "step": 1267 + }, + { + "epoch": 0.2657723747642004, + "grad_norm": 0.12836746871471405, + "learning_rate": 8.615616147915573e-06, + "loss": 0.0656, + "step": 1268 + }, + { + "epoch": 0.2659819744288409, + "grad_norm": 0.14441126585006714, + "learning_rate": 8.613270428303991e-06, + "loss": 0.0686, + "step": 1269 + }, + { + "epoch": 0.26619157409348143, + "grad_norm": 0.14020347595214844, + "learning_rate": 8.61092304297549e-06, + "loss": 0.0651, + "step": 1270 + }, + { + "epoch": 0.266401173758122, + "grad_norm": 0.125614196062088, + "learning_rate": 8.608573993012217e-06, + "loss": 0.0661, + "step": 1271 + }, + { + "epoch": 0.2666107734227625, + "grad_norm": 0.14085769653320312, + "learning_rate": 8.606223279497081e-06, + "loss": 0.0643, + "step": 1272 + }, + { + "epoch": 0.2668203730874031, + "grad_norm": 0.14944879710674286, + "learning_rate": 8.603870903513765e-06, + "loss": 0.0641, + "step": 1273 + }, + { + "epoch": 0.2670299727520436, + "grad_norm": 0.13971124589443207, + "learning_rate": 8.601516866146711e-06, + "loss": 0.0644, + "step": 1274 + }, + { + "epoch": 0.2672395724166841, + "grad_norm": 0.1283096969127655, + "learning_rate": 8.599161168481127e-06, + "loss": 0.0622, + "step": 1275 + }, + { + "epoch": 0.2674491720813247, + "grad_norm": 0.13809433579444885, + "learning_rate": 8.596803811602994e-06, + "loss": 0.0658, + "step": 1276 + }, + { + "epoch": 0.2676587717459652, + "grad_norm": 0.13916701078414917, + "learning_rate": 8.594444796599051e-06, + "loss": 0.0636, + "step": 1277 + }, + { + "epoch": 0.26786837141060577, + "grad_norm": 0.1267821043729782, + "learning_rate": 8.592084124556803e-06, + "loss": 0.0632, + "step": 1278 + }, + { + "epoch": 0.2680779710752463, + "grad_norm": 0.14250284433364868, + "learning_rate": 8.589721796564521e-06, + "loss": 0.0617, + "step": 1279 + }, + { + "epoch": 0.2682875707398868, + "grad_norm": 0.14109215140342712, + "learning_rate": 8.587357813711234e-06, + "loss": 0.0637, + "step": 1280 + }, + { + "epoch": 0.26849717040452736, + "grad_norm": 0.14766204357147217, + "learning_rate": 8.584992177086742e-06, + "loss": 0.0653, + "step": 1281 + }, + { + "epoch": 0.2687067700691679, + "grad_norm": 0.1617199033498764, + "learning_rate": 8.5826248877816e-06, + "loss": 0.0659, + "step": 1282 + }, + { + "epoch": 0.26891636973380845, + "grad_norm": 0.17226214706897736, + "learning_rate": 8.580255946887129e-06, + "loss": 0.0618, + "step": 1283 + }, + { + "epoch": 0.26912596939844896, + "grad_norm": 0.1840105652809143, + "learning_rate": 8.577885355495412e-06, + "loss": 0.0641, + "step": 1284 + }, + { + "epoch": 0.2693355690630895, + "grad_norm": 0.1407589465379715, + "learning_rate": 8.575513114699288e-06, + "loss": 0.0621, + "step": 1285 + }, + { + "epoch": 0.26954516872773004, + "grad_norm": 0.14098970592021942, + "learning_rate": 8.57313922559236e-06, + "loss": 0.0632, + "step": 1286 + }, + { + "epoch": 0.26975476839237056, + "grad_norm": 0.14431780576705933, + "learning_rate": 8.57076368926899e-06, + "loss": 0.0636, + "step": 1287 + }, + { + "epoch": 0.26996436805701113, + "grad_norm": 0.11286340653896332, + "learning_rate": 8.568386506824304e-06, + "loss": 0.0634, + "step": 1288 + }, + { + "epoch": 0.27017396772165164, + "grad_norm": 0.13212832808494568, + "learning_rate": 8.566007679354178e-06, + "loss": 0.0621, + "step": 1289 + }, + { + "epoch": 0.27038356738629216, + "grad_norm": 0.1318579912185669, + "learning_rate": 8.563627207955255e-06, + "loss": 0.0629, + "step": 1290 + }, + { + "epoch": 0.2705931670509327, + "grad_norm": 0.12174471467733383, + "learning_rate": 8.561245093724926e-06, + "loss": 0.0654, + "step": 1291 + }, + { + "epoch": 0.27080276671557324, + "grad_norm": 0.12569588422775269, + "learning_rate": 8.558861337761349e-06, + "loss": 0.0644, + "step": 1292 + }, + { + "epoch": 0.2710123663802138, + "grad_norm": 0.09849688410758972, + "learning_rate": 8.556475941163436e-06, + "loss": 0.0645, + "step": 1293 + }, + { + "epoch": 0.2712219660448543, + "grad_norm": 0.11899732053279877, + "learning_rate": 8.554088905030852e-06, + "loss": 0.0658, + "step": 1294 + }, + { + "epoch": 0.27143156570949484, + "grad_norm": 0.12569987773895264, + "learning_rate": 8.551700230464022e-06, + "loss": 0.0658, + "step": 1295 + }, + { + "epoch": 0.2716411653741354, + "grad_norm": 0.13572193682193756, + "learning_rate": 8.549309918564122e-06, + "loss": 0.0599, + "step": 1296 + }, + { + "epoch": 0.2718507650387759, + "grad_norm": 0.16335296630859375, + "learning_rate": 8.546917970433087e-06, + "loss": 0.065, + "step": 1297 + }, + { + "epoch": 0.2720603647034165, + "grad_norm": 0.1542273610830307, + "learning_rate": 8.544524387173605e-06, + "loss": 0.0633, + "step": 1298 + }, + { + "epoch": 0.272269964368057, + "grad_norm": 0.18188713490962982, + "learning_rate": 8.542129169889117e-06, + "loss": 0.0615, + "step": 1299 + }, + { + "epoch": 0.2724795640326976, + "grad_norm": 0.18146541714668274, + "learning_rate": 8.539732319683817e-06, + "loss": 0.0629, + "step": 1300 + }, + { + "epoch": 0.2726891636973381, + "grad_norm": 0.16707837581634521, + "learning_rate": 8.537333837662653e-06, + "loss": 0.0646, + "step": 1301 + }, + { + "epoch": 0.2728987633619786, + "grad_norm": 0.143534317612648, + "learning_rate": 8.534933724931324e-06, + "loss": 0.0642, + "step": 1302 + }, + { + "epoch": 0.27310836302661917, + "grad_norm": 0.10924031585454941, + "learning_rate": 8.532531982596284e-06, + "loss": 0.0634, + "step": 1303 + }, + { + "epoch": 0.2733179626912597, + "grad_norm": 0.11168647557497025, + "learning_rate": 8.530128611764731e-06, + "loss": 0.0634, + "step": 1304 + }, + { + "epoch": 0.27352756235590026, + "grad_norm": 0.14308258891105652, + "learning_rate": 8.527723613544623e-06, + "loss": 0.0638, + "step": 1305 + }, + { + "epoch": 0.27373716202054077, + "grad_norm": 0.18562361598014832, + "learning_rate": 8.525316989044663e-06, + "loss": 0.0666, + "step": 1306 + }, + { + "epoch": 0.2739467616851813, + "grad_norm": 0.21448108553886414, + "learning_rate": 8.5229087393743e-06, + "loss": 0.0633, + "step": 1307 + }, + { + "epoch": 0.27415636134982185, + "grad_norm": 0.20579898357391357, + "learning_rate": 8.520498865643742e-06, + "loss": 0.0624, + "step": 1308 + }, + { + "epoch": 0.27436596101446237, + "grad_norm": 0.18763695657253265, + "learning_rate": 8.518087368963938e-06, + "loss": 0.066, + "step": 1309 + }, + { + "epoch": 0.27457556067910294, + "grad_norm": 0.15348191559314728, + "learning_rate": 8.515674250446588e-06, + "loss": 0.0644, + "step": 1310 + }, + { + "epoch": 0.27478516034374345, + "grad_norm": 0.11642424762248993, + "learning_rate": 8.51325951120414e-06, + "loss": 0.0639, + "step": 1311 + }, + { + "epoch": 0.27499476000838396, + "grad_norm": 0.15585097670555115, + "learning_rate": 8.510843152349786e-06, + "loss": 0.0624, + "step": 1312 + }, + { + "epoch": 0.27520435967302453, + "grad_norm": 0.18433383107185364, + "learning_rate": 8.508425174997467e-06, + "loss": 0.0648, + "step": 1313 + }, + { + "epoch": 0.27541395933766505, + "grad_norm": 0.1758815348148346, + "learning_rate": 8.506005580261872e-06, + "loss": 0.0672, + "step": 1314 + }, + { + "epoch": 0.2756235590023056, + "grad_norm": 0.11144900321960449, + "learning_rate": 8.503584369258434e-06, + "loss": 0.0606, + "step": 1315 + }, + { + "epoch": 0.27583315866694613, + "grad_norm": 0.11539231985807419, + "learning_rate": 8.501161543103327e-06, + "loss": 0.0634, + "step": 1316 + }, + { + "epoch": 0.27604275833158665, + "grad_norm": 0.15406975150108337, + "learning_rate": 8.498737102913476e-06, + "loss": 0.0628, + "step": 1317 + }, + { + "epoch": 0.2762523579962272, + "grad_norm": 0.16817119717597961, + "learning_rate": 8.496311049806549e-06, + "loss": 0.0657, + "step": 1318 + }, + { + "epoch": 0.27646195766086773, + "grad_norm": 0.1416136920452118, + "learning_rate": 8.493883384900953e-06, + "loss": 0.0644, + "step": 1319 + }, + { + "epoch": 0.2766715573255083, + "grad_norm": 0.10792358964681625, + "learning_rate": 8.491454109315844e-06, + "loss": 0.0654, + "step": 1320 + }, + { + "epoch": 0.2768811569901488, + "grad_norm": 0.1336362659931183, + "learning_rate": 8.489023224171114e-06, + "loss": 0.064, + "step": 1321 + }, + { + "epoch": 0.2770907566547893, + "grad_norm": 0.15568262338638306, + "learning_rate": 8.486590730587403e-06, + "loss": 0.0648, + "step": 1322 + }, + { + "epoch": 0.2773003563194299, + "grad_norm": 0.1655251383781433, + "learning_rate": 8.48415662968609e-06, + "loss": 0.0647, + "step": 1323 + }, + { + "epoch": 0.2775099559840704, + "grad_norm": 0.1461930274963379, + "learning_rate": 8.481720922589294e-06, + "loss": 0.0631, + "step": 1324 + }, + { + "epoch": 0.277719555648711, + "grad_norm": 0.13114267587661743, + "learning_rate": 8.479283610419876e-06, + "loss": 0.0632, + "step": 1325 + }, + { + "epoch": 0.2779291553133515, + "grad_norm": 0.12551239132881165, + "learning_rate": 8.476844694301437e-06, + "loss": 0.0617, + "step": 1326 + }, + { + "epoch": 0.278138754977992, + "grad_norm": 0.13701596856117249, + "learning_rate": 8.474404175358315e-06, + "loss": 0.063, + "step": 1327 + }, + { + "epoch": 0.2783483546426326, + "grad_norm": 0.140949547290802, + "learning_rate": 8.47196205471559e-06, + "loss": 0.0644, + "step": 1328 + }, + { + "epoch": 0.2785579543072731, + "grad_norm": 0.13582463562488556, + "learning_rate": 8.469518333499079e-06, + "loss": 0.064, + "step": 1329 + }, + { + "epoch": 0.27876755397191366, + "grad_norm": 0.14434821903705597, + "learning_rate": 8.467073012835338e-06, + "loss": 0.0583, + "step": 1330 + }, + { + "epoch": 0.2789771536365542, + "grad_norm": 0.14979779720306396, + "learning_rate": 8.464626093851657e-06, + "loss": 0.0643, + "step": 1331 + }, + { + "epoch": 0.27918675330119475, + "grad_norm": 0.15899671614170074, + "learning_rate": 8.462177577676066e-06, + "loss": 0.0639, + "step": 1332 + }, + { + "epoch": 0.27939635296583526, + "grad_norm": 0.16557057201862335, + "learning_rate": 8.459727465437332e-06, + "loss": 0.0674, + "step": 1333 + }, + { + "epoch": 0.2796059526304758, + "grad_norm": 0.16371293365955353, + "learning_rate": 8.457275758264956e-06, + "loss": 0.0625, + "step": 1334 + }, + { + "epoch": 0.27981555229511634, + "grad_norm": 0.16819259524345398, + "learning_rate": 8.45482245728917e-06, + "loss": 0.066, + "step": 1335 + }, + { + "epoch": 0.28002515195975686, + "grad_norm": 0.1529543697834015, + "learning_rate": 8.452367563640953e-06, + "loss": 0.0608, + "step": 1336 + }, + { + "epoch": 0.2802347516243974, + "grad_norm": 0.13105508685112, + "learning_rate": 8.449911078452004e-06, + "loss": 0.0615, + "step": 1337 + }, + { + "epoch": 0.28044435128903794, + "grad_norm": 0.13537244498729706, + "learning_rate": 8.447453002854763e-06, + "loss": 0.0608, + "step": 1338 + }, + { + "epoch": 0.28065395095367845, + "grad_norm": 0.13226556777954102, + "learning_rate": 8.444993337982408e-06, + "loss": 0.0627, + "step": 1339 + }, + { + "epoch": 0.280863550618319, + "grad_norm": 0.11894568800926208, + "learning_rate": 8.442532084968836e-06, + "loss": 0.0631, + "step": 1340 + }, + { + "epoch": 0.28107315028295954, + "grad_norm": 0.0991387814283371, + "learning_rate": 8.44006924494869e-06, + "loss": 0.0619, + "step": 1341 + }, + { + "epoch": 0.2812827499476001, + "grad_norm": 0.11959309130907059, + "learning_rate": 8.437604819057336e-06, + "loss": 0.0658, + "step": 1342 + }, + { + "epoch": 0.2814923496122406, + "grad_norm": 0.13812269270420074, + "learning_rate": 8.435138808430873e-06, + "loss": 0.0599, + "step": 1343 + }, + { + "epoch": 0.28170194927688114, + "grad_norm": 0.15743722021579742, + "learning_rate": 8.432671214206135e-06, + "loss": 0.0643, + "step": 1344 + }, + { + "epoch": 0.2819115489415217, + "grad_norm": 0.15632624924182892, + "learning_rate": 8.43020203752068e-06, + "loss": 0.0648, + "step": 1345 + }, + { + "epoch": 0.2821211486061622, + "grad_norm": 0.1336655467748642, + "learning_rate": 8.427731279512797e-06, + "loss": 0.0648, + "step": 1346 + }, + { + "epoch": 0.2823307482708028, + "grad_norm": 0.11537309736013412, + "learning_rate": 8.425258941321508e-06, + "loss": 0.0627, + "step": 1347 + }, + { + "epoch": 0.2825403479354433, + "grad_norm": 0.12339965254068375, + "learning_rate": 8.422785024086557e-06, + "loss": 0.0608, + "step": 1348 + }, + { + "epoch": 0.2827499476000838, + "grad_norm": 0.139318585395813, + "learning_rate": 8.420309528948422e-06, + "loss": 0.065, + "step": 1349 + }, + { + "epoch": 0.2829595472647244, + "grad_norm": 0.12725292146205902, + "learning_rate": 8.417832457048302e-06, + "loss": 0.0621, + "step": 1350 + }, + { + "epoch": 0.2831691469293649, + "grad_norm": 0.1099279448390007, + "learning_rate": 8.415353809528133e-06, + "loss": 0.0631, + "step": 1351 + }, + { + "epoch": 0.28337874659400547, + "grad_norm": 0.11662383377552032, + "learning_rate": 8.412873587530565e-06, + "loss": 0.0633, + "step": 1352 + }, + { + "epoch": 0.283588346258646, + "grad_norm": 0.13634788990020752, + "learning_rate": 8.410391792198982e-06, + "loss": 0.0673, + "step": 1353 + }, + { + "epoch": 0.2837979459232865, + "grad_norm": 0.1415703296661377, + "learning_rate": 8.407908424677493e-06, + "loss": 0.065, + "step": 1354 + }, + { + "epoch": 0.28400754558792707, + "grad_norm": 0.13151812553405762, + "learning_rate": 8.405423486110926e-06, + "loss": 0.0633, + "step": 1355 + }, + { + "epoch": 0.2842171452525676, + "grad_norm": 0.1383858174085617, + "learning_rate": 8.40293697764484e-06, + "loss": 0.0613, + "step": 1356 + }, + { + "epoch": 0.28442674491720815, + "grad_norm": 0.15346135199069977, + "learning_rate": 8.400448900425515e-06, + "loss": 0.0641, + "step": 1357 + }, + { + "epoch": 0.28463634458184867, + "grad_norm": 0.14821362495422363, + "learning_rate": 8.397959255599952e-06, + "loss": 0.0655, + "step": 1358 + }, + { + "epoch": 0.2848459442464892, + "grad_norm": 0.1301778107881546, + "learning_rate": 8.395468044315878e-06, + "loss": 0.063, + "step": 1359 + }, + { + "epoch": 0.28505554391112975, + "grad_norm": 0.13555464148521423, + "learning_rate": 8.392975267721742e-06, + "loss": 0.0635, + "step": 1360 + }, + { + "epoch": 0.28526514357577026, + "grad_norm": 0.1415833681821823, + "learning_rate": 8.39048092696671e-06, + "loss": 0.0608, + "step": 1361 + }, + { + "epoch": 0.28547474324041083, + "grad_norm": 0.1284692883491516, + "learning_rate": 8.387985023200677e-06, + "loss": 0.063, + "step": 1362 + }, + { + "epoch": 0.28568434290505135, + "grad_norm": 0.10931932926177979, + "learning_rate": 8.385487557574253e-06, + "loss": 0.0633, + "step": 1363 + }, + { + "epoch": 0.28589394256969186, + "grad_norm": 0.11410527676343918, + "learning_rate": 8.382988531238766e-06, + "loss": 0.0621, + "step": 1364 + }, + { + "epoch": 0.28610354223433243, + "grad_norm": 0.13259191811084747, + "learning_rate": 8.380487945346269e-06, + "loss": 0.062, + "step": 1365 + }, + { + "epoch": 0.28631314189897294, + "grad_norm": 0.1383572220802307, + "learning_rate": 8.377985801049533e-06, + "loss": 0.0611, + "step": 1366 + }, + { + "epoch": 0.2865227415636135, + "grad_norm": 0.13124366104602814, + "learning_rate": 8.375482099502043e-06, + "loss": 0.0654, + "step": 1367 + }, + { + "epoch": 0.28673234122825403, + "grad_norm": 0.1339644193649292, + "learning_rate": 8.372976841858007e-06, + "loss": 0.0616, + "step": 1368 + }, + { + "epoch": 0.2869419408928946, + "grad_norm": 0.14934468269348145, + "learning_rate": 8.370470029272348e-06, + "loss": 0.0616, + "step": 1369 + }, + { + "epoch": 0.2871515405575351, + "grad_norm": 0.15344421565532684, + "learning_rate": 8.367961662900704e-06, + "loss": 0.0625, + "step": 1370 + }, + { + "epoch": 0.2873611402221756, + "grad_norm": 0.14816948771476746, + "learning_rate": 8.365451743899433e-06, + "loss": 0.065, + "step": 1371 + }, + { + "epoch": 0.2875707398868162, + "grad_norm": 0.16822992265224457, + "learning_rate": 8.362940273425609e-06, + "loss": 0.0622, + "step": 1372 + }, + { + "epoch": 0.2877803395514567, + "grad_norm": 0.19093914330005646, + "learning_rate": 8.360427252637015e-06, + "loss": 0.0627, + "step": 1373 + }, + { + "epoch": 0.2879899392160973, + "grad_norm": 0.17542891204357147, + "learning_rate": 8.357912682692158e-06, + "loss": 0.0645, + "step": 1374 + }, + { + "epoch": 0.2881995388807378, + "grad_norm": 0.13324493169784546, + "learning_rate": 8.355396564750251e-06, + "loss": 0.0636, + "step": 1375 + }, + { + "epoch": 0.2884091385453783, + "grad_norm": 0.11579709500074387, + "learning_rate": 8.352878899971225e-06, + "loss": 0.064, + "step": 1376 + }, + { + "epoch": 0.2886187382100189, + "grad_norm": 0.1267646849155426, + "learning_rate": 8.35035968951572e-06, + "loss": 0.0654, + "step": 1377 + }, + { + "epoch": 0.2888283378746594, + "grad_norm": 0.13641488552093506, + "learning_rate": 8.347838934545097e-06, + "loss": 0.064, + "step": 1378 + }, + { + "epoch": 0.28903793753929996, + "grad_norm": 0.14326706528663635, + "learning_rate": 8.34531663622142e-06, + "loss": 0.0621, + "step": 1379 + }, + { + "epoch": 0.2892475372039405, + "grad_norm": 0.14493903517723083, + "learning_rate": 8.342792795707468e-06, + "loss": 0.0651, + "step": 1380 + }, + { + "epoch": 0.289457136868581, + "grad_norm": 0.1295025646686554, + "learning_rate": 8.340267414166731e-06, + "loss": 0.0628, + "step": 1381 + }, + { + "epoch": 0.28966673653322156, + "grad_norm": 0.11359550803899765, + "learning_rate": 8.337740492763412e-06, + "loss": 0.0608, + "step": 1382 + }, + { + "epoch": 0.2898763361978621, + "grad_norm": 0.1159205511212349, + "learning_rate": 8.33521203266242e-06, + "loss": 0.0629, + "step": 1383 + }, + { + "epoch": 0.29008593586250264, + "grad_norm": 0.1216607317328453, + "learning_rate": 8.33268203502937e-06, + "loss": 0.0619, + "step": 1384 + }, + { + "epoch": 0.29029553552714316, + "grad_norm": 0.12574604153633118, + "learning_rate": 8.330150501030597e-06, + "loss": 0.0611, + "step": 1385 + }, + { + "epoch": 0.29050513519178367, + "grad_norm": 0.13100169599056244, + "learning_rate": 8.327617431833132e-06, + "loss": 0.06, + "step": 1386 + }, + { + "epoch": 0.29071473485642424, + "grad_norm": 0.14436903595924377, + "learning_rate": 8.325082828604724e-06, + "loss": 0.0687, + "step": 1387 + }, + { + "epoch": 0.29092433452106475, + "grad_norm": 0.150770902633667, + "learning_rate": 8.322546692513822e-06, + "loss": 0.0639, + "step": 1388 + }, + { + "epoch": 0.2911339341857053, + "grad_norm": 0.13927258551120758, + "learning_rate": 8.320009024729586e-06, + "loss": 0.0638, + "step": 1389 + }, + { + "epoch": 0.29134353385034584, + "grad_norm": 0.1205689087510109, + "learning_rate": 8.317469826421877e-06, + "loss": 0.0614, + "step": 1390 + }, + { + "epoch": 0.29155313351498635, + "grad_norm": 0.12410090863704681, + "learning_rate": 8.314929098761268e-06, + "loss": 0.064, + "step": 1391 + }, + { + "epoch": 0.2917627331796269, + "grad_norm": 0.1389668583869934, + "learning_rate": 8.312386842919034e-06, + "loss": 0.0623, + "step": 1392 + }, + { + "epoch": 0.29197233284426743, + "grad_norm": 0.1345934271812439, + "learning_rate": 8.309843060067152e-06, + "loss": 0.0655, + "step": 1393 + }, + { + "epoch": 0.292181932508908, + "grad_norm": 0.13228191435337067, + "learning_rate": 8.307297751378309e-06, + "loss": 0.0644, + "step": 1394 + }, + { + "epoch": 0.2923915321735485, + "grad_norm": 0.1374116837978363, + "learning_rate": 8.304750918025888e-06, + "loss": 0.0639, + "step": 1395 + }, + { + "epoch": 0.29260113183818903, + "grad_norm": 0.13251402974128723, + "learning_rate": 8.30220256118398e-06, + "loss": 0.067, + "step": 1396 + }, + { + "epoch": 0.2928107315028296, + "grad_norm": 0.124875508248806, + "learning_rate": 8.29965268202738e-06, + "loss": 0.0629, + "step": 1397 + }, + { + "epoch": 0.2930203311674701, + "grad_norm": 0.12808595597743988, + "learning_rate": 8.297101281731576e-06, + "loss": 0.0595, + "step": 1398 + }, + { + "epoch": 0.2932299308321107, + "grad_norm": 0.13613183796405792, + "learning_rate": 8.294548361472767e-06, + "loss": 0.064, + "step": 1399 + }, + { + "epoch": 0.2934395304967512, + "grad_norm": 0.1279173195362091, + "learning_rate": 8.291993922427848e-06, + "loss": 0.0584, + "step": 1400 + }, + { + "epoch": 0.29364913016139177, + "grad_norm": 0.11164448410272598, + "learning_rate": 8.289437965774414e-06, + "loss": 0.063, + "step": 1401 + }, + { + "epoch": 0.2938587298260323, + "grad_norm": 0.11260772496461868, + "learning_rate": 8.286880492690761e-06, + "loss": 0.0582, + "step": 1402 + }, + { + "epoch": 0.2940683294906728, + "grad_norm": 0.12324314564466476, + "learning_rate": 8.284321504355884e-06, + "loss": 0.0635, + "step": 1403 + }, + { + "epoch": 0.29427792915531337, + "grad_norm": 0.12230779230594635, + "learning_rate": 8.281761001949474e-06, + "loss": 0.063, + "step": 1404 + }, + { + "epoch": 0.2944875288199539, + "grad_norm": 0.11499473452568054, + "learning_rate": 8.279198986651925e-06, + "loss": 0.0613, + "step": 1405 + }, + { + "epoch": 0.29469712848459445, + "grad_norm": 0.12175990641117096, + "learning_rate": 8.276635459644327e-06, + "loss": 0.0623, + "step": 1406 + }, + { + "epoch": 0.29490672814923496, + "grad_norm": 0.14893771708011627, + "learning_rate": 8.274070422108458e-06, + "loss": 0.0619, + "step": 1407 + }, + { + "epoch": 0.2951163278138755, + "grad_norm": 0.17682506144046783, + "learning_rate": 8.271503875226807e-06, + "loss": 0.0652, + "step": 1408 + }, + { + "epoch": 0.29532592747851605, + "grad_norm": 0.1828548014163971, + "learning_rate": 8.268935820182549e-06, + "loss": 0.0639, + "step": 1409 + }, + { + "epoch": 0.29553552714315656, + "grad_norm": 0.16200965642929077, + "learning_rate": 8.266366258159557e-06, + "loss": 0.0657, + "step": 1410 + }, + { + "epoch": 0.29574512680779713, + "grad_norm": 0.12557782232761383, + "learning_rate": 8.263795190342398e-06, + "loss": 0.062, + "step": 1411 + }, + { + "epoch": 0.29595472647243765, + "grad_norm": 0.11403919011354446, + "learning_rate": 8.261222617916335e-06, + "loss": 0.064, + "step": 1412 + }, + { + "epoch": 0.29616432613707816, + "grad_norm": 0.14523234963417053, + "learning_rate": 8.258648542067322e-06, + "loss": 0.0627, + "step": 1413 + }, + { + "epoch": 0.29637392580171873, + "grad_norm": 0.16806814074516296, + "learning_rate": 8.256072963982008e-06, + "loss": 0.0636, + "step": 1414 + }, + { + "epoch": 0.29658352546635924, + "grad_norm": 0.16369985044002533, + "learning_rate": 8.253495884847735e-06, + "loss": 0.0627, + "step": 1415 + }, + { + "epoch": 0.2967931251309998, + "grad_norm": 0.14731934666633606, + "learning_rate": 8.250917305852532e-06, + "loss": 0.0622, + "step": 1416 + }, + { + "epoch": 0.2970027247956403, + "grad_norm": 0.14016751945018768, + "learning_rate": 8.248337228185128e-06, + "loss": 0.0643, + "step": 1417 + }, + { + "epoch": 0.29721232446028084, + "grad_norm": 0.12805722653865814, + "learning_rate": 8.245755653034938e-06, + "loss": 0.0641, + "step": 1418 + }, + { + "epoch": 0.2974219241249214, + "grad_norm": 0.13014602661132812, + "learning_rate": 8.243172581592066e-06, + "loss": 0.0627, + "step": 1419 + }, + { + "epoch": 0.2976315237895619, + "grad_norm": 0.12117176502943039, + "learning_rate": 8.240588015047306e-06, + "loss": 0.0617, + "step": 1420 + }, + { + "epoch": 0.2978411234542025, + "grad_norm": 0.11306478828191757, + "learning_rate": 8.238001954592143e-06, + "loss": 0.0598, + "step": 1421 + }, + { + "epoch": 0.298050723118843, + "grad_norm": 0.1199759840965271, + "learning_rate": 8.235414401418754e-06, + "loss": 0.0624, + "step": 1422 + }, + { + "epoch": 0.2982603227834835, + "grad_norm": 0.12165199220180511, + "learning_rate": 8.232825356719998e-06, + "loss": 0.0645, + "step": 1423 + }, + { + "epoch": 0.2984699224481241, + "grad_norm": 0.1166660338640213, + "learning_rate": 8.230234821689423e-06, + "loss": 0.0657, + "step": 1424 + }, + { + "epoch": 0.2986795221127646, + "grad_norm": 0.11805491894483566, + "learning_rate": 8.227642797521265e-06, + "loss": 0.0615, + "step": 1425 + }, + { + "epoch": 0.2988891217774052, + "grad_norm": 0.11944872885942459, + "learning_rate": 8.22504928541045e-06, + "loss": 0.062, + "step": 1426 + }, + { + "epoch": 0.2990987214420457, + "grad_norm": 0.1310216188430786, + "learning_rate": 8.222454286552583e-06, + "loss": 0.0641, + "step": 1427 + }, + { + "epoch": 0.2993083211066862, + "grad_norm": 0.15212152898311615, + "learning_rate": 8.21985780214396e-06, + "loss": 0.0626, + "step": 1428 + }, + { + "epoch": 0.2995179207713268, + "grad_norm": 0.14432914555072784, + "learning_rate": 8.217259833381559e-06, + "loss": 0.0609, + "step": 1429 + }, + { + "epoch": 0.2997275204359673, + "grad_norm": 0.11263640224933624, + "learning_rate": 8.214660381463043e-06, + "loss": 0.0616, + "step": 1430 + }, + { + "epoch": 0.29993712010060786, + "grad_norm": 0.12339860200881958, + "learning_rate": 8.212059447586758e-06, + "loss": 0.0627, + "step": 1431 + }, + { + "epoch": 0.30014671976524837, + "grad_norm": 0.15246935188770294, + "learning_rate": 8.209457032951735e-06, + "loss": 0.0607, + "step": 1432 + }, + { + "epoch": 0.3003563194298889, + "grad_norm": 0.16209474205970764, + "learning_rate": 8.206853138757687e-06, + "loss": 0.0666, + "step": 1433 + }, + { + "epoch": 0.30056591909452945, + "grad_norm": 0.15916895866394043, + "learning_rate": 8.204247766205008e-06, + "loss": 0.0616, + "step": 1434 + }, + { + "epoch": 0.30077551875916997, + "grad_norm": 0.16150227189064026, + "learning_rate": 8.201640916494776e-06, + "loss": 0.0629, + "step": 1435 + }, + { + "epoch": 0.30098511842381054, + "grad_norm": 0.15138226747512817, + "learning_rate": 8.199032590828745e-06, + "loss": 0.0625, + "step": 1436 + }, + { + "epoch": 0.30119471808845105, + "grad_norm": 0.1358492225408554, + "learning_rate": 8.196422790409352e-06, + "loss": 0.0648, + "step": 1437 + }, + { + "epoch": 0.3014043177530916, + "grad_norm": 0.11498319357633591, + "learning_rate": 8.19381151643972e-06, + "loss": 0.0634, + "step": 1438 + }, + { + "epoch": 0.30161391741773214, + "grad_norm": 0.09225708991289139, + "learning_rate": 8.191198770123643e-06, + "loss": 0.0627, + "step": 1439 + }, + { + "epoch": 0.30182351708237265, + "grad_norm": 0.10581572353839874, + "learning_rate": 8.188584552665592e-06, + "loss": 0.0615, + "step": 1440 + }, + { + "epoch": 0.3020331167470132, + "grad_norm": 0.1425763964653015, + "learning_rate": 8.185968865270729e-06, + "loss": 0.0624, + "step": 1441 + }, + { + "epoch": 0.30224271641165373, + "grad_norm": 0.14036968350410461, + "learning_rate": 8.183351709144877e-06, + "loss": 0.0617, + "step": 1442 + }, + { + "epoch": 0.3024523160762943, + "grad_norm": 0.11378060281276703, + "learning_rate": 8.18073308549455e-06, + "loss": 0.0581, + "step": 1443 + }, + { + "epoch": 0.3026619157409348, + "grad_norm": 0.12382522225379944, + "learning_rate": 8.178112995526932e-06, + "loss": 0.0589, + "step": 1444 + }, + { + "epoch": 0.30287151540557533, + "grad_norm": 0.1571829468011856, + "learning_rate": 8.17549144044988e-06, + "loss": 0.0616, + "step": 1445 + }, + { + "epoch": 0.3030811150702159, + "grad_norm": 0.1604911983013153, + "learning_rate": 8.172868421471936e-06, + "loss": 0.0628, + "step": 1446 + }, + { + "epoch": 0.3032907147348564, + "grad_norm": 0.13987775146961212, + "learning_rate": 8.17024393980231e-06, + "loss": 0.0592, + "step": 1447 + }, + { + "epoch": 0.303500314399497, + "grad_norm": 0.1492827981710434, + "learning_rate": 8.167617996650885e-06, + "loss": 0.0579, + "step": 1448 + }, + { + "epoch": 0.3037099140641375, + "grad_norm": 0.14875125885009766, + "learning_rate": 8.164990593228222e-06, + "loss": 0.0636, + "step": 1449 + }, + { + "epoch": 0.303919513728778, + "grad_norm": 0.12761233747005463, + "learning_rate": 8.16236173074555e-06, + "loss": 0.0606, + "step": 1450 + }, + { + "epoch": 0.3041291133934186, + "grad_norm": 0.13052557408809662, + "learning_rate": 8.15973141041478e-06, + "loss": 0.0608, + "step": 1451 + }, + { + "epoch": 0.3043387130580591, + "grad_norm": 0.14425970613956451, + "learning_rate": 8.157099633448486e-06, + "loss": 0.0626, + "step": 1452 + }, + { + "epoch": 0.30454831272269967, + "grad_norm": 0.13814501464366913, + "learning_rate": 8.154466401059916e-06, + "loss": 0.0627, + "step": 1453 + }, + { + "epoch": 0.3047579123873402, + "grad_norm": 0.160516157746315, + "learning_rate": 8.15183171446299e-06, + "loss": 0.0588, + "step": 1454 + }, + { + "epoch": 0.3049675120519807, + "grad_norm": 0.19766630232334137, + "learning_rate": 8.1491955748723e-06, + "loss": 0.0662, + "step": 1455 + }, + { + "epoch": 0.30517711171662126, + "grad_norm": 0.17199066281318665, + "learning_rate": 8.146557983503103e-06, + "loss": 0.0651, + "step": 1456 + }, + { + "epoch": 0.3053867113812618, + "grad_norm": 0.13344770669937134, + "learning_rate": 8.143918941571329e-06, + "loss": 0.0614, + "step": 1457 + }, + { + "epoch": 0.30559631104590235, + "grad_norm": 0.12617316842079163, + "learning_rate": 8.141278450293576e-06, + "loss": 0.0616, + "step": 1458 + }, + { + "epoch": 0.30580591071054286, + "grad_norm": 0.12648488581180573, + "learning_rate": 8.13863651088711e-06, + "loss": 0.0619, + "step": 1459 + }, + { + "epoch": 0.3060155103751834, + "grad_norm": 0.1468046009540558, + "learning_rate": 8.135993124569865e-06, + "loss": 0.0636, + "step": 1460 + }, + { + "epoch": 0.30622511003982394, + "grad_norm": 0.16080905497074127, + "learning_rate": 8.133348292560442e-06, + "loss": 0.0615, + "step": 1461 + }, + { + "epoch": 0.30643470970446446, + "grad_norm": 0.15127602219581604, + "learning_rate": 8.130702016078105e-06, + "loss": 0.0616, + "step": 1462 + }, + { + "epoch": 0.30664430936910503, + "grad_norm": 0.12708480656147003, + "learning_rate": 8.12805429634279e-06, + "loss": 0.063, + "step": 1463 + }, + { + "epoch": 0.30685390903374554, + "grad_norm": 0.12862573564052582, + "learning_rate": 8.125405134575093e-06, + "loss": 0.0598, + "step": 1464 + }, + { + "epoch": 0.30706350869838606, + "grad_norm": 0.13088904321193695, + "learning_rate": 8.122754531996278e-06, + "loss": 0.0642, + "step": 1465 + }, + { + "epoch": 0.3072731083630266, + "grad_norm": 0.1227186918258667, + "learning_rate": 8.120102489828273e-06, + "loss": 0.0607, + "step": 1466 + }, + { + "epoch": 0.30748270802766714, + "grad_norm": 0.12489646673202515, + "learning_rate": 8.117449009293668e-06, + "loss": 0.058, + "step": 1467 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.13636116683483124, + "learning_rate": 8.114794091615718e-06, + "loss": 0.0638, + "step": 1468 + }, + { + "epoch": 0.3079019073569482, + "grad_norm": 0.15484750270843506, + "learning_rate": 8.11213773801834e-06, + "loss": 0.0628, + "step": 1469 + }, + { + "epoch": 0.3081115070215888, + "grad_norm": 0.15130093693733215, + "learning_rate": 8.109479949726109e-06, + "loss": 0.0613, + "step": 1470 + }, + { + "epoch": 0.3083211066862293, + "grad_norm": 0.13728772103786469, + "learning_rate": 8.106820727964267e-06, + "loss": 0.0608, + "step": 1471 + }, + { + "epoch": 0.3085307063508698, + "grad_norm": 0.13158975541591644, + "learning_rate": 8.104160073958716e-06, + "loss": 0.0656, + "step": 1472 + }, + { + "epoch": 0.3087403060155104, + "grad_norm": 0.14283715188503265, + "learning_rate": 8.101497988936015e-06, + "loss": 0.0615, + "step": 1473 + }, + { + "epoch": 0.3089499056801509, + "grad_norm": 0.14496344327926636, + "learning_rate": 8.098834474123385e-06, + "loss": 0.0608, + "step": 1474 + }, + { + "epoch": 0.3091595053447915, + "grad_norm": 0.11485762149095535, + "learning_rate": 8.096169530748708e-06, + "loss": 0.0628, + "step": 1475 + }, + { + "epoch": 0.309369105009432, + "grad_norm": 0.10256731510162354, + "learning_rate": 8.093503160040517e-06, + "loss": 0.0632, + "step": 1476 + }, + { + "epoch": 0.3095787046740725, + "grad_norm": 0.12671735882759094, + "learning_rate": 8.090835363228016e-06, + "loss": 0.063, + "step": 1477 + }, + { + "epoch": 0.30978830433871307, + "grad_norm": 0.1311434656381607, + "learning_rate": 8.088166141541052e-06, + "loss": 0.0607, + "step": 1478 + }, + { + "epoch": 0.3099979040033536, + "grad_norm": 0.12401635199785233, + "learning_rate": 8.08549549621014e-06, + "loss": 0.0606, + "step": 1479 + }, + { + "epoch": 0.31020750366799416, + "grad_norm": 0.12939530611038208, + "learning_rate": 8.082823428466442e-06, + "loss": 0.0625, + "step": 1480 + }, + { + "epoch": 0.31041710333263467, + "grad_norm": 0.14530347287654877, + "learning_rate": 8.080149939541786e-06, + "loss": 0.0619, + "step": 1481 + }, + { + "epoch": 0.3106267029972752, + "grad_norm": 0.16024935245513916, + "learning_rate": 8.077475030668647e-06, + "loss": 0.057, + "step": 1482 + }, + { + "epoch": 0.31083630266191575, + "grad_norm": 0.16392071545124054, + "learning_rate": 8.074798703080158e-06, + "loss": 0.0627, + "step": 1483 + }, + { + "epoch": 0.31104590232655627, + "grad_norm": 0.16095848381519318, + "learning_rate": 8.072120958010106e-06, + "loss": 0.0598, + "step": 1484 + }, + { + "epoch": 0.31125550199119684, + "grad_norm": 0.13986064493656158, + "learning_rate": 8.069441796692932e-06, + "loss": 0.0633, + "step": 1485 + }, + { + "epoch": 0.31146510165583735, + "grad_norm": 0.11697458475828171, + "learning_rate": 8.066761220363724e-06, + "loss": 0.0608, + "step": 1486 + }, + { + "epoch": 0.31167470132047786, + "grad_norm": 0.1427479237318039, + "learning_rate": 8.064079230258233e-06, + "loss": 0.0633, + "step": 1487 + }, + { + "epoch": 0.31188430098511843, + "grad_norm": 0.1725568026304245, + "learning_rate": 8.061395827612854e-06, + "loss": 0.0622, + "step": 1488 + }, + { + "epoch": 0.31209390064975895, + "grad_norm": 0.15745431184768677, + "learning_rate": 8.058711013664633e-06, + "loss": 0.0627, + "step": 1489 + }, + { + "epoch": 0.3123035003143995, + "grad_norm": 0.13785965740680695, + "learning_rate": 8.056024789651269e-06, + "loss": 0.0605, + "step": 1490 + }, + { + "epoch": 0.31251309997904003, + "grad_norm": 0.12854214012622833, + "learning_rate": 8.053337156811112e-06, + "loss": 0.061, + "step": 1491 + }, + { + "epoch": 0.31272269964368055, + "grad_norm": 0.13071073591709137, + "learning_rate": 8.050648116383162e-06, + "loss": 0.0609, + "step": 1492 + }, + { + "epoch": 0.3129322993083211, + "grad_norm": 0.13808673620224, + "learning_rate": 8.047957669607062e-06, + "loss": 0.0584, + "step": 1493 + }, + { + "epoch": 0.31314189897296163, + "grad_norm": 0.1332629919052124, + "learning_rate": 8.04526581772311e-06, + "loss": 0.0627, + "step": 1494 + }, + { + "epoch": 0.3133514986376022, + "grad_norm": 0.11996429413557053, + "learning_rate": 8.042572561972249e-06, + "loss": 0.0623, + "step": 1495 + }, + { + "epoch": 0.3135610983022427, + "grad_norm": 0.1371801495552063, + "learning_rate": 8.039877903596069e-06, + "loss": 0.0623, + "step": 1496 + }, + { + "epoch": 0.3137706979668832, + "grad_norm": 0.1538374274969101, + "learning_rate": 8.037181843836807e-06, + "loss": 0.0595, + "step": 1497 + }, + { + "epoch": 0.3139802976315238, + "grad_norm": 0.09957927465438843, + "learning_rate": 8.034484383937345e-06, + "loss": 0.0629, + "step": 1498 + }, + { + "epoch": 0.3141898972961643, + "grad_norm": 0.12047518789768219, + "learning_rate": 8.031785525141214e-06, + "loss": 0.0626, + "step": 1499 + }, + { + "epoch": 0.3143994969608049, + "grad_norm": 0.17362119257450104, + "learning_rate": 8.029085268692584e-06, + "loss": 0.0599, + "step": 1500 + }, + { + "epoch": 0.3146090966254454, + "grad_norm": 0.12966394424438477, + "learning_rate": 8.026383615836273e-06, + "loss": 0.0631, + "step": 1501 + }, + { + "epoch": 0.3148186962900859, + "grad_norm": 0.16443343460559845, + "learning_rate": 8.023680567817746e-06, + "loss": 0.0623, + "step": 1502 + }, + { + "epoch": 0.3150282959547265, + "grad_norm": 0.18140558898448944, + "learning_rate": 8.020976125883105e-06, + "loss": 0.0622, + "step": 1503 + }, + { + "epoch": 0.315237895619367, + "grad_norm": 0.1121053621172905, + "learning_rate": 8.018270291279098e-06, + "loss": 0.0598, + "step": 1504 + }, + { + "epoch": 0.31544749528400756, + "grad_norm": 0.1866942197084427, + "learning_rate": 8.015563065253113e-06, + "loss": 0.0623, + "step": 1505 + }, + { + "epoch": 0.3156570949486481, + "grad_norm": 0.1441410928964615, + "learning_rate": 8.01285444905318e-06, + "loss": 0.0621, + "step": 1506 + }, + { + "epoch": 0.31586669461328865, + "grad_norm": 0.13300150632858276, + "learning_rate": 8.010144443927974e-06, + "loss": 0.0604, + "step": 1507 + }, + { + "epoch": 0.31607629427792916, + "grad_norm": 0.17902185022830963, + "learning_rate": 8.007433051126805e-06, + "loss": 0.0619, + "step": 1508 + }, + { + "epoch": 0.3162858939425697, + "grad_norm": 0.12487073242664337, + "learning_rate": 8.004720271899622e-06, + "loss": 0.063, + "step": 1509 + }, + { + "epoch": 0.31649549360721024, + "grad_norm": 0.14495615661144257, + "learning_rate": 8.002006107497018e-06, + "loss": 0.0617, + "step": 1510 + }, + { + "epoch": 0.31670509327185076, + "grad_norm": 0.14461486041545868, + "learning_rate": 7.999290559170222e-06, + "loss": 0.0651, + "step": 1511 + }, + { + "epoch": 0.3169146929364913, + "grad_norm": 0.16186927258968353, + "learning_rate": 7.996573628171103e-06, + "loss": 0.061, + "step": 1512 + }, + { + "epoch": 0.31712429260113184, + "grad_norm": 0.20015843212604523, + "learning_rate": 7.993855315752163e-06, + "loss": 0.063, + "step": 1513 + }, + { + "epoch": 0.31733389226577235, + "grad_norm": 0.1886589378118515, + "learning_rate": 7.991135623166543e-06, + "loss": 0.064, + "step": 1514 + }, + { + "epoch": 0.3175434919304129, + "grad_norm": 0.15665331482887268, + "learning_rate": 7.988414551668025e-06, + "loss": 0.0644, + "step": 1515 + }, + { + "epoch": 0.31775309159505344, + "grad_norm": 0.13003768026828766, + "learning_rate": 7.985692102511018e-06, + "loss": 0.0632, + "step": 1516 + }, + { + "epoch": 0.317962691259694, + "grad_norm": 0.14635911583900452, + "learning_rate": 7.982968276950568e-06, + "loss": 0.0614, + "step": 1517 + }, + { + "epoch": 0.3181722909243345, + "grad_norm": 0.13516655564308167, + "learning_rate": 7.980243076242367e-06, + "loss": 0.0647, + "step": 1518 + }, + { + "epoch": 0.31838189058897504, + "grad_norm": 0.15535247325897217, + "learning_rate": 7.977516501642725e-06, + "loss": 0.0612, + "step": 1519 + }, + { + "epoch": 0.3185914902536156, + "grad_norm": 0.16718165576457977, + "learning_rate": 7.974788554408594e-06, + "loss": 0.0611, + "step": 1520 + }, + { + "epoch": 0.3188010899182561, + "grad_norm": 0.14838221669197083, + "learning_rate": 7.97205923579756e-06, + "loss": 0.0602, + "step": 1521 + }, + { + "epoch": 0.3190106895828967, + "grad_norm": 0.15352647006511688, + "learning_rate": 7.969328547067832e-06, + "loss": 0.0625, + "step": 1522 + }, + { + "epoch": 0.3192202892475372, + "grad_norm": 0.1140708476305008, + "learning_rate": 7.966596489478261e-06, + "loss": 0.059, + "step": 1523 + }, + { + "epoch": 0.3194298889121777, + "grad_norm": 0.11593464761972427, + "learning_rate": 7.963863064288326e-06, + "loss": 0.0605, + "step": 1524 + }, + { + "epoch": 0.3196394885768183, + "grad_norm": 0.11878301203250885, + "learning_rate": 7.961128272758133e-06, + "loss": 0.0617, + "step": 1525 + }, + { + "epoch": 0.3198490882414588, + "grad_norm": 0.10293014347553253, + "learning_rate": 7.958392116148424e-06, + "loss": 0.0601, + "step": 1526 + }, + { + "epoch": 0.32005868790609937, + "grad_norm": 0.12255340069532394, + "learning_rate": 7.95565459572056e-06, + "loss": 0.0616, + "step": 1527 + }, + { + "epoch": 0.3202682875707399, + "grad_norm": 0.1113123744726181, + "learning_rate": 7.952915712736545e-06, + "loss": 0.0629, + "step": 1528 + }, + { + "epoch": 0.3204778872353804, + "grad_norm": 0.11932878196239471, + "learning_rate": 7.950175468458999e-06, + "loss": 0.0632, + "step": 1529 + }, + { + "epoch": 0.32068748690002097, + "grad_norm": 0.13146136701107025, + "learning_rate": 7.947433864151173e-06, + "loss": 0.0624, + "step": 1530 + }, + { + "epoch": 0.3208970865646615, + "grad_norm": 0.11554694920778275, + "learning_rate": 7.944690901076949e-06, + "loss": 0.0591, + "step": 1531 + }, + { + "epoch": 0.32110668622930205, + "grad_norm": 0.11757069826126099, + "learning_rate": 7.94194658050083e-06, + "loss": 0.0659, + "step": 1532 + }, + { + "epoch": 0.32131628589394257, + "grad_norm": 0.12100815027952194, + "learning_rate": 7.93920090368795e-06, + "loss": 0.0579, + "step": 1533 + }, + { + "epoch": 0.3215258855585831, + "grad_norm": 0.12287240475416183, + "learning_rate": 7.936453871904065e-06, + "loss": 0.0601, + "step": 1534 + }, + { + "epoch": 0.32173548522322365, + "grad_norm": 0.12194748222827911, + "learning_rate": 7.933705486415553e-06, + "loss": 0.06, + "step": 1535 + }, + { + "epoch": 0.32194508488786416, + "grad_norm": 0.11489801853895187, + "learning_rate": 7.93095574848942e-06, + "loss": 0.0594, + "step": 1536 + }, + { + "epoch": 0.32215468455250473, + "grad_norm": 0.11319497227668762, + "learning_rate": 7.928204659393297e-06, + "loss": 0.06, + "step": 1537 + }, + { + "epoch": 0.32236428421714525, + "grad_norm": 0.10197950154542923, + "learning_rate": 7.925452220395436e-06, + "loss": 0.064, + "step": 1538 + }, + { + "epoch": 0.3225738838817858, + "grad_norm": 0.10284919291734695, + "learning_rate": 7.922698432764709e-06, + "loss": 0.0616, + "step": 1539 + }, + { + "epoch": 0.32278348354642633, + "grad_norm": 0.10016728937625885, + "learning_rate": 7.919943297770609e-06, + "loss": 0.0588, + "step": 1540 + }, + { + "epoch": 0.32299308321106684, + "grad_norm": 0.10762354731559753, + "learning_rate": 7.917186816683256e-06, + "loss": 0.0587, + "step": 1541 + }, + { + "epoch": 0.3232026828757074, + "grad_norm": 0.13142406940460205, + "learning_rate": 7.914428990773388e-06, + "loss": 0.0594, + "step": 1542 + }, + { + "epoch": 0.32341228254034793, + "grad_norm": 0.15556885302066803, + "learning_rate": 7.91166982131236e-06, + "loss": 0.0594, + "step": 1543 + }, + { + "epoch": 0.3236218822049885, + "grad_norm": 0.18388265371322632, + "learning_rate": 7.908909309572147e-06, + "loss": 0.0606, + "step": 1544 + }, + { + "epoch": 0.323831481869629, + "grad_norm": 0.21343399584293365, + "learning_rate": 7.906147456825349e-06, + "loss": 0.0634, + "step": 1545 + }, + { + "epoch": 0.3240410815342695, + "grad_norm": 0.21992023289203644, + "learning_rate": 7.903384264345177e-06, + "loss": 0.0635, + "step": 1546 + }, + { + "epoch": 0.3242506811989101, + "grad_norm": 0.16700875759124756, + "learning_rate": 7.900619733405462e-06, + "loss": 0.0641, + "step": 1547 + }, + { + "epoch": 0.3244602808635506, + "grad_norm": 0.09242475032806396, + "learning_rate": 7.897853865280652e-06, + "loss": 0.0614, + "step": 1548 + }, + { + "epoch": 0.3246698805281912, + "grad_norm": 0.12979759275913239, + "learning_rate": 7.895086661245811e-06, + "loss": 0.061, + "step": 1549 + }, + { + "epoch": 0.3248794801928317, + "grad_norm": 0.19285419583320618, + "learning_rate": 7.892318122576623e-06, + "loss": 0.0602, + "step": 1550 + }, + { + "epoch": 0.3250890798574722, + "grad_norm": 0.1928529143333435, + "learning_rate": 7.889548250549379e-06, + "loss": 0.0613, + "step": 1551 + }, + { + "epoch": 0.3252986795221128, + "grad_norm": 0.13492649793624878, + "learning_rate": 7.886777046440993e-06, + "loss": 0.061, + "step": 1552 + }, + { + "epoch": 0.3255082791867533, + "grad_norm": 0.1183929294347763, + "learning_rate": 7.884004511528988e-06, + "loss": 0.0637, + "step": 1553 + }, + { + "epoch": 0.32571787885139386, + "grad_norm": 0.17243815958499908, + "learning_rate": 7.881230647091502e-06, + "loss": 0.0601, + "step": 1554 + }, + { + "epoch": 0.3259274785160344, + "grad_norm": 0.1707419902086258, + "learning_rate": 7.878455454407285e-06, + "loss": 0.0625, + "step": 1555 + }, + { + "epoch": 0.3261370781806749, + "grad_norm": 0.14786763489246368, + "learning_rate": 7.875678934755704e-06, + "loss": 0.0632, + "step": 1556 + }, + { + "epoch": 0.32634667784531546, + "grad_norm": 0.1466890424489975, + "learning_rate": 7.87290108941673e-06, + "loss": 0.0613, + "step": 1557 + }, + { + "epoch": 0.326556277509956, + "grad_norm": 0.10979172587394714, + "learning_rate": 7.87012191967095e-06, + "loss": 0.063, + "step": 1558 + }, + { + "epoch": 0.32676587717459654, + "grad_norm": 0.11911406368017197, + "learning_rate": 7.867341426799562e-06, + "loss": 0.0567, + "step": 1559 + }, + { + "epoch": 0.32697547683923706, + "grad_norm": 0.14850041270256042, + "learning_rate": 7.864559612084372e-06, + "loss": 0.0609, + "step": 1560 + }, + { + "epoch": 0.32718507650387757, + "grad_norm": 0.13936297595500946, + "learning_rate": 7.861776476807795e-06, + "loss": 0.0613, + "step": 1561 + }, + { + "epoch": 0.32739467616851814, + "grad_norm": 0.12747827172279358, + "learning_rate": 7.858992022252859e-06, + "loss": 0.0626, + "step": 1562 + }, + { + "epoch": 0.32760427583315865, + "grad_norm": 0.11281277984380722, + "learning_rate": 7.856206249703191e-06, + "loss": 0.0612, + "step": 1563 + }, + { + "epoch": 0.3278138754977992, + "grad_norm": 0.11081743985414505, + "learning_rate": 7.853419160443038e-06, + "loss": 0.0625, + "step": 1564 + }, + { + "epoch": 0.32802347516243974, + "grad_norm": 0.10656673461198807, + "learning_rate": 7.850630755757242e-06, + "loss": 0.0625, + "step": 1565 + }, + { + "epoch": 0.32823307482708025, + "grad_norm": 0.09804133325815201, + "learning_rate": 7.847841036931263e-06, + "loss": 0.0602, + "step": 1566 + }, + { + "epoch": 0.3284426744917208, + "grad_norm": 0.10339375585317612, + "learning_rate": 7.845050005251156e-06, + "loss": 0.06, + "step": 1567 + }, + { + "epoch": 0.32865227415636133, + "grad_norm": 0.09550289064645767, + "learning_rate": 7.842257662003587e-06, + "loss": 0.0571, + "step": 1568 + }, + { + "epoch": 0.3288618738210019, + "grad_norm": 0.09345971792936325, + "learning_rate": 7.839464008475825e-06, + "loss": 0.0604, + "step": 1569 + }, + { + "epoch": 0.3290714734856424, + "grad_norm": 0.10545200109481812, + "learning_rate": 7.836669045955746e-06, + "loss": 0.0598, + "step": 1570 + }, + { + "epoch": 0.32928107315028293, + "grad_norm": 0.11077643185853958, + "learning_rate": 7.83387277573183e-06, + "loss": 0.0586, + "step": 1571 + }, + { + "epoch": 0.3294906728149235, + "grad_norm": 0.12594513595104218, + "learning_rate": 7.831075199093148e-06, + "loss": 0.0598, + "step": 1572 + }, + { + "epoch": 0.329700272479564, + "grad_norm": 0.13642063736915588, + "learning_rate": 7.828276317329388e-06, + "loss": 0.0601, + "step": 1573 + }, + { + "epoch": 0.3299098721442046, + "grad_norm": 0.13808242976665497, + "learning_rate": 7.825476131730836e-06, + "loss": 0.0591, + "step": 1574 + }, + { + "epoch": 0.3301194718088451, + "grad_norm": 0.14252924919128418, + "learning_rate": 7.822674643588372e-06, + "loss": 0.0589, + "step": 1575 + }, + { + "epoch": 0.33032907147348567, + "grad_norm": 0.12825094163417816, + "learning_rate": 7.819871854193484e-06, + "loss": 0.0622, + "step": 1576 + }, + { + "epoch": 0.3305386711381262, + "grad_norm": 0.10428497940301895, + "learning_rate": 7.817067764838257e-06, + "loss": 0.0583, + "step": 1577 + }, + { + "epoch": 0.3307482708027667, + "grad_norm": 0.10690061748027802, + "learning_rate": 7.814262376815375e-06, + "loss": 0.0581, + "step": 1578 + }, + { + "epoch": 0.33095787046740727, + "grad_norm": 0.11123620718717575, + "learning_rate": 7.811455691418123e-06, + "loss": 0.0607, + "step": 1579 + }, + { + "epoch": 0.3311674701320478, + "grad_norm": 0.10212855786085129, + "learning_rate": 7.80864770994038e-06, + "loss": 0.0624, + "step": 1580 + }, + { + "epoch": 0.33137706979668835, + "grad_norm": 0.09154222905635834, + "learning_rate": 7.805838433676627e-06, + "loss": 0.0604, + "step": 1581 + }, + { + "epoch": 0.33158666946132886, + "grad_norm": 0.09208472818136215, + "learning_rate": 7.803027863921939e-06, + "loss": 0.06, + "step": 1582 + }, + { + "epoch": 0.3317962691259694, + "grad_norm": 0.09515499323606491, + "learning_rate": 7.800216001971988e-06, + "loss": 0.0654, + "step": 1583 + }, + { + "epoch": 0.33200586879060995, + "grad_norm": 0.09495844691991806, + "learning_rate": 7.79740284912304e-06, + "loss": 0.063, + "step": 1584 + }, + { + "epoch": 0.33221546845525046, + "grad_norm": 0.09243495017290115, + "learning_rate": 7.794588406671962e-06, + "loss": 0.0622, + "step": 1585 + }, + { + "epoch": 0.33242506811989103, + "grad_norm": 0.08537401258945465, + "learning_rate": 7.791772675916207e-06, + "loss": 0.0601, + "step": 1586 + }, + { + "epoch": 0.33263466778453155, + "grad_norm": 0.08266492187976837, + "learning_rate": 7.788955658153829e-06, + "loss": 0.0582, + "step": 1587 + }, + { + "epoch": 0.33284426744917206, + "grad_norm": 0.10280580073595047, + "learning_rate": 7.786137354683472e-06, + "loss": 0.0598, + "step": 1588 + }, + { + "epoch": 0.33305386711381263, + "grad_norm": 0.13502821326255798, + "learning_rate": 7.783317766804375e-06, + "loss": 0.0586, + "step": 1589 + }, + { + "epoch": 0.33326346677845314, + "grad_norm": 0.16894817352294922, + "learning_rate": 7.780496895816363e-06, + "loss": 0.0596, + "step": 1590 + }, + { + "epoch": 0.3334730664430937, + "grad_norm": 0.21050219237804413, + "learning_rate": 7.77767474301986e-06, + "loss": 0.0609, + "step": 1591 + }, + { + "epoch": 0.3336826661077342, + "grad_norm": 0.203968346118927, + "learning_rate": 7.774851309715878e-06, + "loss": 0.0625, + "step": 1592 + }, + { + "epoch": 0.33389226577237474, + "grad_norm": 0.13803130388259888, + "learning_rate": 7.77202659720602e-06, + "loss": 0.057, + "step": 1593 + }, + { + "epoch": 0.3341018654370153, + "grad_norm": 0.13375061750411987, + "learning_rate": 7.769200606792476e-06, + "loss": 0.057, + "step": 1594 + }, + { + "epoch": 0.3343114651016558, + "grad_norm": 0.12815599143505096, + "learning_rate": 7.766373339778026e-06, + "loss": 0.0587, + "step": 1595 + }, + { + "epoch": 0.3345210647662964, + "grad_norm": 0.14430293440818787, + "learning_rate": 7.763544797466041e-06, + "loss": 0.0586, + "step": 1596 + }, + { + "epoch": 0.3347306644309369, + "grad_norm": 0.15994097292423248, + "learning_rate": 7.76071498116048e-06, + "loss": 0.0569, + "step": 1597 + }, + { + "epoch": 0.3349402640955774, + "grad_norm": 0.12301892787218094, + "learning_rate": 7.757883892165886e-06, + "loss": 0.0596, + "step": 1598 + }, + { + "epoch": 0.335149863760218, + "grad_norm": 0.12102188915014267, + "learning_rate": 7.755051531787388e-06, + "loss": 0.0604, + "step": 1599 + }, + { + "epoch": 0.3353594634248585, + "grad_norm": 0.12119577080011368, + "learning_rate": 7.752217901330707e-06, + "loss": 0.0602, + "step": 1600 + }, + { + "epoch": 0.3355690630894991, + "grad_norm": 0.11957602202892303, + "learning_rate": 7.749383002102147e-06, + "loss": 0.0608, + "step": 1601 + }, + { + "epoch": 0.3357786627541396, + "grad_norm": 0.1391220986843109, + "learning_rate": 7.746546835408593e-06, + "loss": 0.0592, + "step": 1602 + }, + { + "epoch": 0.3359882624187801, + "grad_norm": 0.14518260955810547, + "learning_rate": 7.74370940255752e-06, + "loss": 0.0605, + "step": 1603 + }, + { + "epoch": 0.3361978620834207, + "grad_norm": 0.15618950128555298, + "learning_rate": 7.74087070485698e-06, + "loss": 0.0604, + "step": 1604 + }, + { + "epoch": 0.3364074617480612, + "grad_norm": 0.13505420088768005, + "learning_rate": 7.738030743615615e-06, + "loss": 0.0605, + "step": 1605 + }, + { + "epoch": 0.33661706141270176, + "grad_norm": 0.11345727741718292, + "learning_rate": 7.735189520142645e-06, + "loss": 0.0627, + "step": 1606 + }, + { + "epoch": 0.33682666107734227, + "grad_norm": 0.10767614841461182, + "learning_rate": 7.732347035747878e-06, + "loss": 0.0631, + "step": 1607 + }, + { + "epoch": 0.33703626074198284, + "grad_norm": 0.11709927022457123, + "learning_rate": 7.72950329174169e-06, + "loss": 0.0615, + "step": 1608 + }, + { + "epoch": 0.33724586040662335, + "grad_norm": 0.13534541428089142, + "learning_rate": 7.726658289435055e-06, + "loss": 0.0617, + "step": 1609 + }, + { + "epoch": 0.33745546007126387, + "grad_norm": 0.12872527539730072, + "learning_rate": 7.723812030139514e-06, + "loss": 0.0574, + "step": 1610 + }, + { + "epoch": 0.33766505973590444, + "grad_norm": 0.12165652960538864, + "learning_rate": 7.720964515167193e-06, + "loss": 0.0573, + "step": 1611 + }, + { + "epoch": 0.33787465940054495, + "grad_norm": 0.10215598344802856, + "learning_rate": 7.718115745830797e-06, + "loss": 0.0579, + "step": 1612 + }, + { + "epoch": 0.3380842590651855, + "grad_norm": 0.09414192289113998, + "learning_rate": 7.715265723443606e-06, + "loss": 0.0607, + "step": 1613 + }, + { + "epoch": 0.33829385872982604, + "grad_norm": 0.11345679312944412, + "learning_rate": 7.712414449319478e-06, + "loss": 0.0591, + "step": 1614 + }, + { + "epoch": 0.33850345839446655, + "grad_norm": 0.11734048277139664, + "learning_rate": 7.709561924772855e-06, + "loss": 0.0595, + "step": 1615 + }, + { + "epoch": 0.3387130580591071, + "grad_norm": 0.10740841180086136, + "learning_rate": 7.706708151118747e-06, + "loss": 0.0593, + "step": 1616 + }, + { + "epoch": 0.33892265772374763, + "grad_norm": 0.1052752360701561, + "learning_rate": 7.703853129672742e-06, + "loss": 0.0576, + "step": 1617 + }, + { + "epoch": 0.3391322573883882, + "grad_norm": 0.11419695615768433, + "learning_rate": 7.700996861751009e-06, + "loss": 0.0632, + "step": 1618 + }, + { + "epoch": 0.3393418570530287, + "grad_norm": 0.10033425688743591, + "learning_rate": 7.698139348670281e-06, + "loss": 0.061, + "step": 1619 + }, + { + "epoch": 0.33955145671766923, + "grad_norm": 0.10025940090417862, + "learning_rate": 7.695280591747875e-06, + "loss": 0.0609, + "step": 1620 + }, + { + "epoch": 0.3397610563823098, + "grad_norm": 0.09737148880958557, + "learning_rate": 7.692420592301675e-06, + "loss": 0.0609, + "step": 1621 + }, + { + "epoch": 0.3399706560469503, + "grad_norm": 0.09889432042837143, + "learning_rate": 7.689559351650142e-06, + "loss": 0.0598, + "step": 1622 + }, + { + "epoch": 0.3401802557115909, + "grad_norm": 0.10725081712007523, + "learning_rate": 7.686696871112306e-06, + "loss": 0.0581, + "step": 1623 + }, + { + "epoch": 0.3403898553762314, + "grad_norm": 0.10398346930742264, + "learning_rate": 7.683833152007772e-06, + "loss": 0.0577, + "step": 1624 + }, + { + "epoch": 0.3405994550408719, + "grad_norm": 0.11018167436122894, + "learning_rate": 7.68096819565671e-06, + "loss": 0.0603, + "step": 1625 + }, + { + "epoch": 0.3408090547055125, + "grad_norm": 0.10882928967475891, + "learning_rate": 7.67810200337987e-06, + "loss": 0.0595, + "step": 1626 + }, + { + "epoch": 0.341018654370153, + "grad_norm": 0.09781121462583542, + "learning_rate": 7.675234576498561e-06, + "loss": 0.0611, + "step": 1627 + }, + { + "epoch": 0.34122825403479357, + "grad_norm": 0.08528628945350647, + "learning_rate": 7.672365916334668e-06, + "loss": 0.0576, + "step": 1628 + }, + { + "epoch": 0.3414378536994341, + "grad_norm": 0.08956452459096909, + "learning_rate": 7.66949602421064e-06, + "loss": 0.0624, + "step": 1629 + }, + { + "epoch": 0.3416474533640746, + "grad_norm": 0.09840685874223709, + "learning_rate": 7.666624901449503e-06, + "loss": 0.0581, + "step": 1630 + }, + { + "epoch": 0.34185705302871516, + "grad_norm": 0.11417452245950699, + "learning_rate": 7.66375254937484e-06, + "loss": 0.0581, + "step": 1631 + }, + { + "epoch": 0.3420666526933557, + "grad_norm": 0.11850450932979584, + "learning_rate": 7.660878969310803e-06, + "loss": 0.0607, + "step": 1632 + }, + { + "epoch": 0.34227625235799625, + "grad_norm": 0.12879729270935059, + "learning_rate": 7.658004162582116e-06, + "loss": 0.0619, + "step": 1633 + }, + { + "epoch": 0.34248585202263676, + "grad_norm": 0.1304604858160019, + "learning_rate": 7.655128130514061e-06, + "loss": 0.0632, + "step": 1634 + }, + { + "epoch": 0.3426954516872773, + "grad_norm": 0.10286222398281097, + "learning_rate": 7.652250874432489e-06, + "loss": 0.0611, + "step": 1635 + }, + { + "epoch": 0.34290505135191784, + "grad_norm": 0.11092126369476318, + "learning_rate": 7.649372395663816e-06, + "loss": 0.0615, + "step": 1636 + }, + { + "epoch": 0.34311465101655836, + "grad_norm": 0.109388567507267, + "learning_rate": 7.646492695535018e-06, + "loss": 0.0641, + "step": 1637 + }, + { + "epoch": 0.34332425068119893, + "grad_norm": 0.09455049782991409, + "learning_rate": 7.643611775373637e-06, + "loss": 0.0613, + "step": 1638 + }, + { + "epoch": 0.34353385034583944, + "grad_norm": 0.09676145017147064, + "learning_rate": 7.640729636507778e-06, + "loss": 0.0597, + "step": 1639 + }, + { + "epoch": 0.34374345001047996, + "grad_norm": 0.09647035598754883, + "learning_rate": 7.637846280266103e-06, + "loss": 0.0593, + "step": 1640 + }, + { + "epoch": 0.3439530496751205, + "grad_norm": 0.10977063328027725, + "learning_rate": 7.634961707977843e-06, + "loss": 0.0596, + "step": 1641 + }, + { + "epoch": 0.34416264933976104, + "grad_norm": 0.1262187659740448, + "learning_rate": 7.632075920972782e-06, + "loss": 0.0604, + "step": 1642 + }, + { + "epoch": 0.3443722490044016, + "grad_norm": 0.1421433538198471, + "learning_rate": 7.629188920581267e-06, + "loss": 0.0571, + "step": 1643 + }, + { + "epoch": 0.3445818486690421, + "grad_norm": 0.14843688905239105, + "learning_rate": 7.626300708134207e-06, + "loss": 0.0601, + "step": 1644 + }, + { + "epoch": 0.3447914483336827, + "grad_norm": 0.14421787858009338, + "learning_rate": 7.623411284963066e-06, + "loss": 0.0608, + "step": 1645 + }, + { + "epoch": 0.3450010479983232, + "grad_norm": 0.14530403912067413, + "learning_rate": 7.620520652399867e-06, + "loss": 0.059, + "step": 1646 + }, + { + "epoch": 0.3452106476629637, + "grad_norm": 0.14376644790172577, + "learning_rate": 7.617628811777191e-06, + "loss": 0.0613, + "step": 1647 + }, + { + "epoch": 0.3454202473276043, + "grad_norm": 0.14656515419483185, + "learning_rate": 7.614735764428178e-06, + "loss": 0.0616, + "step": 1648 + }, + { + "epoch": 0.3456298469922448, + "grad_norm": 0.15034882724285126, + "learning_rate": 7.611841511686521e-06, + "loss": 0.0645, + "step": 1649 + }, + { + "epoch": 0.3458394466568854, + "grad_norm": 0.17289340496063232, + "learning_rate": 7.608946054886468e-06, + "loss": 0.0605, + "step": 1650 + }, + { + "epoch": 0.3460490463215259, + "grad_norm": 0.20710116624832153, + "learning_rate": 7.606049395362827e-06, + "loss": 0.0601, + "step": 1651 + }, + { + "epoch": 0.3462586459861664, + "grad_norm": 0.24352329969406128, + "learning_rate": 7.6031515344509545e-06, + "loss": 0.0619, + "step": 1652 + }, + { + "epoch": 0.34646824565080697, + "grad_norm": 0.24319349229335785, + "learning_rate": 7.6002524734867676e-06, + "loss": 0.0637, + "step": 1653 + }, + { + "epoch": 0.3466778453154475, + "grad_norm": 0.18296609818935394, + "learning_rate": 7.597352213806729e-06, + "loss": 0.0614, + "step": 1654 + }, + { + "epoch": 0.34688744498008806, + "grad_norm": 0.11361169070005417, + "learning_rate": 7.5944507567478585e-06, + "loss": 0.0596, + "step": 1655 + }, + { + "epoch": 0.34709704464472857, + "grad_norm": 0.14272648096084595, + "learning_rate": 7.59154810364773e-06, + "loss": 0.0609, + "step": 1656 + }, + { + "epoch": 0.3473066443093691, + "grad_norm": 0.18227772414684296, + "learning_rate": 7.588644255844464e-06, + "loss": 0.0609, + "step": 1657 + }, + { + "epoch": 0.34751624397400965, + "grad_norm": 0.15642352402210236, + "learning_rate": 7.585739214676731e-06, + "loss": 0.0611, + "step": 1658 + }, + { + "epoch": 0.34772584363865017, + "grad_norm": 0.13368570804595947, + "learning_rate": 7.582832981483761e-06, + "loss": 0.0607, + "step": 1659 + }, + { + "epoch": 0.34793544330329074, + "grad_norm": 0.12491488456726074, + "learning_rate": 7.57992555760532e-06, + "loss": 0.0628, + "step": 1660 + }, + { + "epoch": 0.34814504296793125, + "grad_norm": 0.12365151941776276, + "learning_rate": 7.577016944381734e-06, + "loss": 0.0582, + "step": 1661 + }, + { + "epoch": 0.34835464263257176, + "grad_norm": 0.14696553349494934, + "learning_rate": 7.574107143153872e-06, + "loss": 0.0611, + "step": 1662 + }, + { + "epoch": 0.34856424229721233, + "grad_norm": 0.14201070368289948, + "learning_rate": 7.571196155263152e-06, + "loss": 0.0617, + "step": 1663 + }, + { + "epoch": 0.34877384196185285, + "grad_norm": 0.13257716596126556, + "learning_rate": 7.568283982051538e-06, + "loss": 0.062, + "step": 1664 + }, + { + "epoch": 0.3489834416264934, + "grad_norm": 0.10744427889585495, + "learning_rate": 7.565370624861541e-06, + "loss": 0.0615, + "step": 1665 + }, + { + "epoch": 0.34919304129113393, + "grad_norm": 0.11801417917013168, + "learning_rate": 7.562456085036221e-06, + "loss": 0.0591, + "step": 1666 + }, + { + "epoch": 0.34940264095577445, + "grad_norm": 0.13146089017391205, + "learning_rate": 7.5595403639191775e-06, + "loss": 0.0618, + "step": 1667 + }, + { + "epoch": 0.349612240620415, + "grad_norm": 0.11520653963088989, + "learning_rate": 7.556623462854555e-06, + "loss": 0.0604, + "step": 1668 + }, + { + "epoch": 0.34982184028505553, + "grad_norm": 0.111871138215065, + "learning_rate": 7.553705383187051e-06, + "loss": 0.0567, + "step": 1669 + }, + { + "epoch": 0.3500314399496961, + "grad_norm": 0.10216851532459259, + "learning_rate": 7.550786126261893e-06, + "loss": 0.0601, + "step": 1670 + }, + { + "epoch": 0.3502410396143366, + "grad_norm": 0.11334113776683807, + "learning_rate": 7.5478656934248626e-06, + "loss": 0.0607, + "step": 1671 + }, + { + "epoch": 0.3504506392789771, + "grad_norm": 0.12396923452615738, + "learning_rate": 7.544944086022276e-06, + "loss": 0.0611, + "step": 1672 + }, + { + "epoch": 0.3506602389436177, + "grad_norm": 0.11316753178834915, + "learning_rate": 7.5420213054009935e-06, + "loss": 0.0623, + "step": 1673 + }, + { + "epoch": 0.3508698386082582, + "grad_norm": 0.12763440608978271, + "learning_rate": 7.539097352908419e-06, + "loss": 0.0587, + "step": 1674 + }, + { + "epoch": 0.3510794382728988, + "grad_norm": 0.12276781350374222, + "learning_rate": 7.536172229892491e-06, + "loss": 0.0599, + "step": 1675 + }, + { + "epoch": 0.3512890379375393, + "grad_norm": 0.10066503286361694, + "learning_rate": 7.533245937701692e-06, + "loss": 0.0587, + "step": 1676 + }, + { + "epoch": 0.35149863760217986, + "grad_norm": 0.11499958485364914, + "learning_rate": 7.530318477685043e-06, + "loss": 0.0578, + "step": 1677 + }, + { + "epoch": 0.3517082372668204, + "grad_norm": 0.10679785162210464, + "learning_rate": 7.527389851192099e-06, + "loss": 0.0618, + "step": 1678 + }, + { + "epoch": 0.3519178369314609, + "grad_norm": 0.11170266568660736, + "learning_rate": 7.52446005957296e-06, + "loss": 0.0567, + "step": 1679 + }, + { + "epoch": 0.35212743659610146, + "grad_norm": 0.13220727443695068, + "learning_rate": 7.521529104178258e-06, + "loss": 0.0624, + "step": 1680 + }, + { + "epoch": 0.352337036260742, + "grad_norm": 0.11351287364959717, + "learning_rate": 7.518596986359163e-06, + "loss": 0.0576, + "step": 1681 + }, + { + "epoch": 0.35254663592538255, + "grad_norm": 0.1113462969660759, + "learning_rate": 7.51566370746738e-06, + "loss": 0.0627, + "step": 1682 + }, + { + "epoch": 0.35275623559002306, + "grad_norm": 0.11083466559648514, + "learning_rate": 7.51272926885515e-06, + "loss": 0.0582, + "step": 1683 + }, + { + "epoch": 0.3529658352546636, + "grad_norm": 0.09305007755756378, + "learning_rate": 7.50979367187525e-06, + "loss": 0.0585, + "step": 1684 + }, + { + "epoch": 0.35317543491930414, + "grad_norm": 0.09011313319206238, + "learning_rate": 7.506856917880989e-06, + "loss": 0.0612, + "step": 1685 + }, + { + "epoch": 0.35338503458394466, + "grad_norm": 0.09725736081600189, + "learning_rate": 7.503919008226208e-06, + "loss": 0.0565, + "step": 1686 + }, + { + "epoch": 0.3535946342485852, + "grad_norm": 0.10482776910066605, + "learning_rate": 7.5009799442652856e-06, + "loss": 0.0598, + "step": 1687 + }, + { + "epoch": 0.35380423391322574, + "grad_norm": 0.12371653318405151, + "learning_rate": 7.498039727353127e-06, + "loss": 0.0615, + "step": 1688 + }, + { + "epoch": 0.35401383357786625, + "grad_norm": 0.13332313299179077, + "learning_rate": 7.495098358845174e-06, + "loss": 0.0587, + "step": 1689 + }, + { + "epoch": 0.3542234332425068, + "grad_norm": 0.11412235349416733, + "learning_rate": 7.492155840097396e-06, + "loss": 0.0608, + "step": 1690 + }, + { + "epoch": 0.35443303290714734, + "grad_norm": 0.09121813625097275, + "learning_rate": 7.4892121724662915e-06, + "loss": 0.0599, + "step": 1691 + }, + { + "epoch": 0.3546426325717879, + "grad_norm": 0.09299511462450027, + "learning_rate": 7.486267357308896e-06, + "loss": 0.0611, + "step": 1692 + }, + { + "epoch": 0.3548522322364284, + "grad_norm": 0.09584937989711761, + "learning_rate": 7.483321395982762e-06, + "loss": 0.0628, + "step": 1693 + }, + { + "epoch": 0.35506183190106894, + "grad_norm": 0.0966387689113617, + "learning_rate": 7.4803742898459816e-06, + "loss": 0.0595, + "step": 1694 + }, + { + "epoch": 0.3552714315657095, + "grad_norm": 0.09714031964540482, + "learning_rate": 7.4774260402571696e-06, + "loss": 0.0597, + "step": 1695 + }, + { + "epoch": 0.35548103123035, + "grad_norm": 0.09430839866399765, + "learning_rate": 7.474476648575466e-06, + "loss": 0.059, + "step": 1696 + }, + { + "epoch": 0.3556906308949906, + "grad_norm": 0.09601857513189316, + "learning_rate": 7.471526116160542e-06, + "loss": 0.0576, + "step": 1697 + }, + { + "epoch": 0.3559002305596311, + "grad_norm": 0.11064789444208145, + "learning_rate": 7.468574444372593e-06, + "loss": 0.0608, + "step": 1698 + }, + { + "epoch": 0.3561098302242716, + "grad_norm": 0.11107967793941498, + "learning_rate": 7.465621634572336e-06, + "loss": 0.0602, + "step": 1699 + }, + { + "epoch": 0.3563194298889122, + "grad_norm": 0.12140575051307678, + "learning_rate": 7.462667688121019e-06, + "loss": 0.0576, + "step": 1700 + }, + { + "epoch": 0.3565290295535527, + "grad_norm": 0.14458388090133667, + "learning_rate": 7.459712606380409e-06, + "loss": 0.0606, + "step": 1701 + }, + { + "epoch": 0.35673862921819327, + "grad_norm": 0.14192935824394226, + "learning_rate": 7.4567563907127985e-06, + "loss": 0.061, + "step": 1702 + }, + { + "epoch": 0.3569482288828338, + "grad_norm": 0.14503100514411926, + "learning_rate": 7.453799042481002e-06, + "loss": 0.059, + "step": 1703 + }, + { + "epoch": 0.3571578285474743, + "grad_norm": 0.13211016356945038, + "learning_rate": 7.450840563048356e-06, + "loss": 0.0612, + "step": 1704 + }, + { + "epoch": 0.35736742821211487, + "grad_norm": 0.10646926611661911, + "learning_rate": 7.44788095377872e-06, + "loss": 0.0621, + "step": 1705 + }, + { + "epoch": 0.3575770278767554, + "grad_norm": 0.09362011402845383, + "learning_rate": 7.444920216036473e-06, + "loss": 0.0574, + "step": 1706 + }, + { + "epoch": 0.35778662754139595, + "grad_norm": 0.11936776340007782, + "learning_rate": 7.441958351186514e-06, + "loss": 0.0604, + "step": 1707 + }, + { + "epoch": 0.35799622720603647, + "grad_norm": 0.13509775698184967, + "learning_rate": 7.4389953605942634e-06, + "loss": 0.0611, + "step": 1708 + }, + { + "epoch": 0.358205826870677, + "grad_norm": 0.11575739830732346, + "learning_rate": 7.436031245625657e-06, + "loss": 0.0607, + "step": 1709 + }, + { + "epoch": 0.35841542653531755, + "grad_norm": 0.10143768787384033, + "learning_rate": 7.433066007647152e-06, + "loss": 0.0576, + "step": 1710 + }, + { + "epoch": 0.35862502619995806, + "grad_norm": 0.09407970309257507, + "learning_rate": 7.430099648025723e-06, + "loss": 0.0597, + "step": 1711 + }, + { + "epoch": 0.35883462586459863, + "grad_norm": 0.11093775928020477, + "learning_rate": 7.427132168128862e-06, + "loss": 0.0594, + "step": 1712 + }, + { + "epoch": 0.35904422552923915, + "grad_norm": 0.13045847415924072, + "learning_rate": 7.4241635693245766e-06, + "loss": 0.0631, + "step": 1713 + }, + { + "epoch": 0.3592538251938797, + "grad_norm": 0.128498837351799, + "learning_rate": 7.421193852981386e-06, + "loss": 0.0617, + "step": 1714 + }, + { + "epoch": 0.35946342485852023, + "grad_norm": 0.12185259163379669, + "learning_rate": 7.418223020468335e-06, + "loss": 0.0578, + "step": 1715 + }, + { + "epoch": 0.35967302452316074, + "grad_norm": 0.13501186668872833, + "learning_rate": 7.415251073154972e-06, + "loss": 0.0589, + "step": 1716 + }, + { + "epoch": 0.3598826241878013, + "grad_norm": 0.12140613049268723, + "learning_rate": 7.412278012411368e-06, + "loss": 0.0621, + "step": 1717 + }, + { + "epoch": 0.36009222385244183, + "grad_norm": 0.10289863497018814, + "learning_rate": 7.4093038396081e-06, + "loss": 0.0575, + "step": 1718 + }, + { + "epoch": 0.3603018235170824, + "grad_norm": 0.11283747851848602, + "learning_rate": 7.4063285561162624e-06, + "loss": 0.0584, + "step": 1719 + }, + { + "epoch": 0.3605114231817229, + "grad_norm": 0.13699781894683838, + "learning_rate": 7.40335216330746e-06, + "loss": 0.0603, + "step": 1720 + }, + { + "epoch": 0.3607210228463634, + "grad_norm": 0.16813865303993225, + "learning_rate": 7.400374662553811e-06, + "loss": 0.0605, + "step": 1721 + }, + { + "epoch": 0.360930622511004, + "grad_norm": 0.1661754846572876, + "learning_rate": 7.39739605522794e-06, + "loss": 0.0595, + "step": 1722 + }, + { + "epoch": 0.3611402221756445, + "grad_norm": 0.13529466092586517, + "learning_rate": 7.394416342702986e-06, + "loss": 0.0588, + "step": 1723 + }, + { + "epoch": 0.3613498218402851, + "grad_norm": 0.1041502058506012, + "learning_rate": 7.391435526352594e-06, + "loss": 0.0619, + "step": 1724 + }, + { + "epoch": 0.3615594215049256, + "grad_norm": 0.08625433593988419, + "learning_rate": 7.3884536075509225e-06, + "loss": 0.0591, + "step": 1725 + }, + { + "epoch": 0.3617690211695661, + "grad_norm": 0.1282806396484375, + "learning_rate": 7.385470587672634e-06, + "loss": 0.0607, + "step": 1726 + }, + { + "epoch": 0.3619786208342067, + "grad_norm": 0.15391872823238373, + "learning_rate": 7.382486468092899e-06, + "loss": 0.0588, + "step": 1727 + }, + { + "epoch": 0.3621882204988472, + "grad_norm": 0.1479339599609375, + "learning_rate": 7.379501250187399e-06, + "loss": 0.0598, + "step": 1728 + }, + { + "epoch": 0.36239782016348776, + "grad_norm": 0.1250709593296051, + "learning_rate": 7.376514935332314e-06, + "loss": 0.0605, + "step": 1729 + }, + { + "epoch": 0.3626074198281283, + "grad_norm": 0.09578738361597061, + "learning_rate": 7.37352752490434e-06, + "loss": 0.0602, + "step": 1730 + }, + { + "epoch": 0.3628170194927688, + "grad_norm": 0.09304331988096237, + "learning_rate": 7.37053902028067e-06, + "loss": 0.0581, + "step": 1731 + }, + { + "epoch": 0.36302661915740936, + "grad_norm": 0.11090435832738876, + "learning_rate": 7.367549422839002e-06, + "loss": 0.0587, + "step": 1732 + }, + { + "epoch": 0.36323621882204987, + "grad_norm": 0.12240754812955856, + "learning_rate": 7.364558733957543e-06, + "loss": 0.0576, + "step": 1733 + }, + { + "epoch": 0.36344581848669044, + "grad_norm": 0.12205494195222855, + "learning_rate": 7.361566955014999e-06, + "loss": 0.0618, + "step": 1734 + }, + { + "epoch": 0.36365541815133096, + "grad_norm": 0.1121978685259819, + "learning_rate": 7.3585740873905795e-06, + "loss": 0.0586, + "step": 1735 + }, + { + "epoch": 0.36386501781597147, + "grad_norm": 0.09682106226682663, + "learning_rate": 7.355580132463998e-06, + "loss": 0.0604, + "step": 1736 + }, + { + "epoch": 0.36407461748061204, + "grad_norm": 0.08918868750333786, + "learning_rate": 7.352585091615461e-06, + "loss": 0.0609, + "step": 1737 + }, + { + "epoch": 0.36428421714525255, + "grad_norm": 0.08579257130622864, + "learning_rate": 7.349588966225686e-06, + "loss": 0.0565, + "step": 1738 + }, + { + "epoch": 0.3644938168098931, + "grad_norm": 0.08958027511835098, + "learning_rate": 7.346591757675886e-06, + "loss": 0.059, + "step": 1739 + }, + { + "epoch": 0.36470341647453364, + "grad_norm": 0.08919413387775421, + "learning_rate": 7.3435934673477735e-06, + "loss": 0.0588, + "step": 1740 + }, + { + "epoch": 0.36491301613917415, + "grad_norm": 0.08150623738765717, + "learning_rate": 7.340594096623559e-06, + "loss": 0.0612, + "step": 1741 + }, + { + "epoch": 0.3651226158038147, + "grad_norm": 0.08697859197854996, + "learning_rate": 7.337593646885952e-06, + "loss": 0.0579, + "step": 1742 + }, + { + "epoch": 0.36533221546845523, + "grad_norm": 0.09409506618976593, + "learning_rate": 7.3345921195181605e-06, + "loss": 0.0609, + "step": 1743 + }, + { + "epoch": 0.3655418151330958, + "grad_norm": 0.12264561653137207, + "learning_rate": 7.331589515903885e-06, + "loss": 0.0571, + "step": 1744 + }, + { + "epoch": 0.3657514147977363, + "grad_norm": 0.15233023464679718, + "learning_rate": 7.328585837427329e-06, + "loss": 0.0616, + "step": 1745 + }, + { + "epoch": 0.3659610144623769, + "grad_norm": 0.15437071025371552, + "learning_rate": 7.3255810854731845e-06, + "loss": 0.0583, + "step": 1746 + }, + { + "epoch": 0.3661706141270174, + "grad_norm": 0.14194343984127045, + "learning_rate": 7.322575261426643e-06, + "loss": 0.0579, + "step": 1747 + }, + { + "epoch": 0.3663802137916579, + "grad_norm": 0.14112666249275208, + "learning_rate": 7.319568366673389e-06, + "loss": 0.0576, + "step": 1748 + }, + { + "epoch": 0.3665898134562985, + "grad_norm": 0.15609388053417206, + "learning_rate": 7.316560402599598e-06, + "loss": 0.0629, + "step": 1749 + }, + { + "epoch": 0.366799413120939, + "grad_norm": 0.1522039920091629, + "learning_rate": 7.313551370591944e-06, + "loss": 0.0605, + "step": 1750 + }, + { + "epoch": 0.36700901278557957, + "grad_norm": 0.13480624556541443, + "learning_rate": 7.310541272037588e-06, + "loss": 0.0589, + "step": 1751 + }, + { + "epoch": 0.3672186124502201, + "grad_norm": 0.12041886895895004, + "learning_rate": 7.307530108324186e-06, + "loss": 0.0586, + "step": 1752 + }, + { + "epoch": 0.3674282121148606, + "grad_norm": 0.11600147187709808, + "learning_rate": 7.304517880839883e-06, + "loss": 0.058, + "step": 1753 + }, + { + "epoch": 0.36763781177950117, + "grad_norm": 0.11566165834665298, + "learning_rate": 7.3015045909733165e-06, + "loss": 0.0623, + "step": 1754 + }, + { + "epoch": 0.3678474114441417, + "grad_norm": 0.12034988403320312, + "learning_rate": 7.2984902401136115e-06, + "loss": 0.059, + "step": 1755 + }, + { + "epoch": 0.36805701110878225, + "grad_norm": 0.13276949524879456, + "learning_rate": 7.295474829650382e-06, + "loss": 0.0582, + "step": 1756 + }, + { + "epoch": 0.36826661077342276, + "grad_norm": 0.14941634237766266, + "learning_rate": 7.292458360973733e-06, + "loss": 0.059, + "step": 1757 + }, + { + "epoch": 0.3684762104380633, + "grad_norm": 0.14865432679653168, + "learning_rate": 7.289440835474257e-06, + "loss": 0.0625, + "step": 1758 + }, + { + "epoch": 0.36868581010270385, + "grad_norm": 0.11310829967260361, + "learning_rate": 7.286422254543031e-06, + "loss": 0.0582, + "step": 1759 + }, + { + "epoch": 0.36889540976734436, + "grad_norm": 0.0944291204214096, + "learning_rate": 7.28340261957162e-06, + "loss": 0.0588, + "step": 1760 + }, + { + "epoch": 0.36910500943198493, + "grad_norm": 0.11601667106151581, + "learning_rate": 7.2803819319520765e-06, + "loss": 0.0576, + "step": 1761 + }, + { + "epoch": 0.36931460909662545, + "grad_norm": 0.1313844472169876, + "learning_rate": 7.277360193076936e-06, + "loss": 0.062, + "step": 1762 + }, + { + "epoch": 0.36952420876126596, + "grad_norm": 0.12570670247077942, + "learning_rate": 7.274337404339218e-06, + "loss": 0.0607, + "step": 1763 + }, + { + "epoch": 0.36973380842590653, + "grad_norm": 0.10776617377996445, + "learning_rate": 7.271313567132431e-06, + "loss": 0.0618, + "step": 1764 + }, + { + "epoch": 0.36994340809054704, + "grad_norm": 0.09985363483428955, + "learning_rate": 7.26828868285056e-06, + "loss": 0.0583, + "step": 1765 + }, + { + "epoch": 0.3701530077551876, + "grad_norm": 0.10462814569473267, + "learning_rate": 7.265262752888078e-06, + "loss": 0.0603, + "step": 1766 + }, + { + "epoch": 0.3703626074198281, + "grad_norm": 0.10572189837694168, + "learning_rate": 7.262235778639938e-06, + "loss": 0.0587, + "step": 1767 + }, + { + "epoch": 0.37057220708446864, + "grad_norm": 0.12369339168071747, + "learning_rate": 7.259207761501572e-06, + "loss": 0.0567, + "step": 1768 + }, + { + "epoch": 0.3707818067491092, + "grad_norm": 0.14378707110881805, + "learning_rate": 7.256178702868899e-06, + "loss": 0.0586, + "step": 1769 + }, + { + "epoch": 0.3709914064137497, + "grad_norm": 0.13347645103931427, + "learning_rate": 7.253148604138312e-06, + "loss": 0.0572, + "step": 1770 + }, + { + "epoch": 0.3712010060783903, + "grad_norm": 0.12877966463565826, + "learning_rate": 7.250117466706686e-06, + "loss": 0.0575, + "step": 1771 + }, + { + "epoch": 0.3714106057430308, + "grad_norm": 0.15981978178024292, + "learning_rate": 7.247085291971377e-06, + "loss": 0.0589, + "step": 1772 + }, + { + "epoch": 0.3716202054076713, + "grad_norm": 0.17421603202819824, + "learning_rate": 7.244052081330214e-06, + "loss": 0.0602, + "step": 1773 + }, + { + "epoch": 0.3718298050723119, + "grad_norm": 0.166316419839859, + "learning_rate": 7.241017836181508e-06, + "loss": 0.059, + "step": 1774 + }, + { + "epoch": 0.3720394047369524, + "grad_norm": 0.14877326786518097, + "learning_rate": 7.237982557924044e-06, + "loss": 0.0604, + "step": 1775 + }, + { + "epoch": 0.372249004401593, + "grad_norm": 0.11550521105527878, + "learning_rate": 7.234946247957087e-06, + "loss": 0.058, + "step": 1776 + }, + { + "epoch": 0.3724586040662335, + "grad_norm": 0.12925846874713898, + "learning_rate": 7.231908907680373e-06, + "loss": 0.0607, + "step": 1777 + }, + { + "epoch": 0.372668203730874, + "grad_norm": 0.16908065974712372, + "learning_rate": 7.228870538494116e-06, + "loss": 0.0592, + "step": 1778 + }, + { + "epoch": 0.3728778033955146, + "grad_norm": 0.1643764078617096, + "learning_rate": 7.225831141799004e-06, + "loss": 0.0613, + "step": 1779 + }, + { + "epoch": 0.3730874030601551, + "grad_norm": 0.1174246221780777, + "learning_rate": 7.222790718996199e-06, + "loss": 0.058, + "step": 1780 + }, + { + "epoch": 0.37329700272479566, + "grad_norm": 0.11582423746585846, + "learning_rate": 7.219749271487333e-06, + "loss": 0.0582, + "step": 1781 + }, + { + "epoch": 0.37350660238943617, + "grad_norm": 0.1282123178243637, + "learning_rate": 7.216706800674514e-06, + "loss": 0.0604, + "step": 1782 + }, + { + "epoch": 0.37371620205407674, + "grad_norm": 0.12925361096858978, + "learning_rate": 7.213663307960321e-06, + "loss": 0.0586, + "step": 1783 + }, + { + "epoch": 0.37392580171871725, + "grad_norm": 0.12871502339839935, + "learning_rate": 7.210618794747803e-06, + "loss": 0.0582, + "step": 1784 + }, + { + "epoch": 0.37413540138335777, + "grad_norm": 0.11913754045963287, + "learning_rate": 7.207573262440479e-06, + "loss": 0.059, + "step": 1785 + }, + { + "epoch": 0.37434500104799834, + "grad_norm": 0.12632472813129425, + "learning_rate": 7.20452671244234e-06, + "loss": 0.0597, + "step": 1786 + }, + { + "epoch": 0.37455460071263885, + "grad_norm": 0.14089874923229218, + "learning_rate": 7.2014791461578445e-06, + "loss": 0.0598, + "step": 1787 + }, + { + "epoch": 0.3747642003772794, + "grad_norm": 0.1204831525683403, + "learning_rate": 7.1984305649919195e-06, + "loss": 0.0588, + "step": 1788 + }, + { + "epoch": 0.37497380004191994, + "grad_norm": 0.11883781105279922, + "learning_rate": 7.195380970349961e-06, + "loss": 0.0595, + "step": 1789 + }, + { + "epoch": 0.37518339970656045, + "grad_norm": 0.12562531232833862, + "learning_rate": 7.192330363637832e-06, + "loss": 0.0608, + "step": 1790 + }, + { + "epoch": 0.375392999371201, + "grad_norm": 0.1165882870554924, + "learning_rate": 7.18927874626186e-06, + "loss": 0.059, + "step": 1791 + }, + { + "epoch": 0.37560259903584153, + "grad_norm": 0.12378109246492386, + "learning_rate": 7.186226119628841e-06, + "loss": 0.0599, + "step": 1792 + }, + { + "epoch": 0.3758121987004821, + "grad_norm": 0.11985018849372864, + "learning_rate": 7.183172485146033e-06, + "loss": 0.0592, + "step": 1793 + }, + { + "epoch": 0.3760217983651226, + "grad_norm": 0.09804270416498184, + "learning_rate": 7.180117844221166e-06, + "loss": 0.0577, + "step": 1794 + }, + { + "epoch": 0.37623139802976313, + "grad_norm": 0.11297204345464706, + "learning_rate": 7.177062198262424e-06, + "loss": 0.0617, + "step": 1795 + }, + { + "epoch": 0.3764409976944037, + "grad_norm": 0.11378408223390579, + "learning_rate": 7.1740055486784595e-06, + "loss": 0.0601, + "step": 1796 + }, + { + "epoch": 0.3766505973590442, + "grad_norm": 0.10527677834033966, + "learning_rate": 7.170947896878392e-06, + "loss": 0.0609, + "step": 1797 + }, + { + "epoch": 0.3768601970236848, + "grad_norm": 0.11885930597782135, + "learning_rate": 7.167889244271792e-06, + "loss": 0.0564, + "step": 1798 + }, + { + "epoch": 0.3770697966883253, + "grad_norm": 0.11827868223190308, + "learning_rate": 7.164829592268702e-06, + "loss": 0.0605, + "step": 1799 + }, + { + "epoch": 0.3772793963529658, + "grad_norm": 0.09840241819620132, + "learning_rate": 7.16176894227962e-06, + "loss": 0.0598, + "step": 1800 + }, + { + "epoch": 0.3774889960176064, + "grad_norm": 0.11675871908664703, + "learning_rate": 7.158707295715504e-06, + "loss": 0.0584, + "step": 1801 + }, + { + "epoch": 0.3776985956822469, + "grad_norm": 0.1085144504904747, + "learning_rate": 7.155644653987772e-06, + "loss": 0.0565, + "step": 1802 + }, + { + "epoch": 0.37790819534688747, + "grad_norm": 0.0793166384100914, + "learning_rate": 7.152581018508305e-06, + "loss": 0.0586, + "step": 1803 + }, + { + "epoch": 0.378117795011528, + "grad_norm": 0.09427671879529953, + "learning_rate": 7.149516390689433e-06, + "loss": 0.0602, + "step": 1804 + }, + { + "epoch": 0.3783273946761685, + "grad_norm": 0.09693881869316101, + "learning_rate": 7.146450771943953e-06, + "loss": 0.0579, + "step": 1805 + }, + { + "epoch": 0.37853699434080906, + "grad_norm": 0.08774615824222565, + "learning_rate": 7.143384163685112e-06, + "loss": 0.0581, + "step": 1806 + }, + { + "epoch": 0.3787465940054496, + "grad_norm": 0.10799058526754379, + "learning_rate": 7.140316567326617e-06, + "loss": 0.0581, + "step": 1807 + }, + { + "epoch": 0.37895619367009015, + "grad_norm": 0.12135607749223709, + "learning_rate": 7.13724798428263e-06, + "loss": 0.0586, + "step": 1808 + }, + { + "epoch": 0.37916579333473066, + "grad_norm": 0.12671098113059998, + "learning_rate": 7.134178415967765e-06, + "loss": 0.0583, + "step": 1809 + }, + { + "epoch": 0.3793753929993712, + "grad_norm": 0.1515565663576126, + "learning_rate": 7.131107863797093e-06, + "loss": 0.0578, + "step": 1810 + }, + { + "epoch": 0.37958499266401174, + "grad_norm": 0.18114930391311646, + "learning_rate": 7.12803632918614e-06, + "loss": 0.0587, + "step": 1811 + }, + { + "epoch": 0.37979459232865226, + "grad_norm": 0.18494822084903717, + "learning_rate": 7.12496381355088e-06, + "loss": 0.0581, + "step": 1812 + }, + { + "epoch": 0.38000419199329283, + "grad_norm": 0.14751562476158142, + "learning_rate": 7.121890318307745e-06, + "loss": 0.0603, + "step": 1813 + }, + { + "epoch": 0.38021379165793334, + "grad_norm": 0.09215519577264786, + "learning_rate": 7.1188158448736135e-06, + "loss": 0.0607, + "step": 1814 + }, + { + "epoch": 0.38042339132257386, + "grad_norm": 0.10190481692552567, + "learning_rate": 7.115740394665816e-06, + "loss": 0.0595, + "step": 1815 + }, + { + "epoch": 0.3806329909872144, + "grad_norm": 0.10842154920101166, + "learning_rate": 7.112663969102138e-06, + "loss": 0.0562, + "step": 1816 + }, + { + "epoch": 0.38084259065185494, + "grad_norm": 0.11397344619035721, + "learning_rate": 7.1095865696008085e-06, + "loss": 0.0586, + "step": 1817 + }, + { + "epoch": 0.3810521903164955, + "grad_norm": 0.12245843559503555, + "learning_rate": 7.1065081975805086e-06, + "loss": 0.0613, + "step": 1818 + }, + { + "epoch": 0.381261789981136, + "grad_norm": 0.11635053157806396, + "learning_rate": 7.103428854460367e-06, + "loss": 0.0585, + "step": 1819 + }, + { + "epoch": 0.3814713896457766, + "grad_norm": 0.10383999347686768, + "learning_rate": 7.100348541659961e-06, + "loss": 0.0625, + "step": 1820 + }, + { + "epoch": 0.3816809893104171, + "grad_norm": 0.09212907403707504, + "learning_rate": 7.0972672605993106e-06, + "loss": 0.0634, + "step": 1821 + }, + { + "epoch": 0.3818905889750576, + "grad_norm": 0.10867664963006973, + "learning_rate": 7.094185012698893e-06, + "loss": 0.0571, + "step": 1822 + }, + { + "epoch": 0.3821001886396982, + "grad_norm": 0.10434751957654953, + "learning_rate": 7.091101799379617e-06, + "loss": 0.0615, + "step": 1823 + }, + { + "epoch": 0.3823097883043387, + "grad_norm": 0.09127280861139297, + "learning_rate": 7.088017622062847e-06, + "loss": 0.0572, + "step": 1824 + }, + { + "epoch": 0.3825193879689793, + "grad_norm": 0.11248623579740524, + "learning_rate": 7.084932482170385e-06, + "loss": 0.06, + "step": 1825 + }, + { + "epoch": 0.3827289876336198, + "grad_norm": 0.12546822428703308, + "learning_rate": 7.081846381124484e-06, + "loss": 0.0571, + "step": 1826 + }, + { + "epoch": 0.3829385872982603, + "grad_norm": 0.11402714997529984, + "learning_rate": 7.078759320347833e-06, + "loss": 0.0578, + "step": 1827 + }, + { + "epoch": 0.38314818696290087, + "grad_norm": 0.10201937705278397, + "learning_rate": 7.075671301263568e-06, + "loss": 0.0579, + "step": 1828 + }, + { + "epoch": 0.3833577866275414, + "grad_norm": 0.0954611748456955, + "learning_rate": 7.072582325295262e-06, + "loss": 0.0569, + "step": 1829 + }, + { + "epoch": 0.38356738629218196, + "grad_norm": 0.10198832303285599, + "learning_rate": 7.069492393866937e-06, + "loss": 0.0601, + "step": 1830 + }, + { + "epoch": 0.38377698595682247, + "grad_norm": 0.09932437539100647, + "learning_rate": 7.066401508403047e-06, + "loss": 0.0576, + "step": 1831 + }, + { + "epoch": 0.383986585621463, + "grad_norm": 0.08973968774080276, + "learning_rate": 7.063309670328491e-06, + "loss": 0.0588, + "step": 1832 + }, + { + "epoch": 0.38419618528610355, + "grad_norm": 0.10677745938301086, + "learning_rate": 7.060216881068607e-06, + "loss": 0.0569, + "step": 1833 + }, + { + "epoch": 0.38440578495074407, + "grad_norm": 0.1219630241394043, + "learning_rate": 7.057123142049166e-06, + "loss": 0.0576, + "step": 1834 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 0.14446859061717987, + "learning_rate": 7.0540284546963846e-06, + "loss": 0.0577, + "step": 1835 + }, + { + "epoch": 0.38482498428002515, + "grad_norm": 0.17997580766677856, + "learning_rate": 7.050932820436915e-06, + "loss": 0.0598, + "step": 1836 + }, + { + "epoch": 0.38503458394466566, + "grad_norm": 0.17284150421619415, + "learning_rate": 7.047836240697837e-06, + "loss": 0.0602, + "step": 1837 + }, + { + "epoch": 0.38524418360930623, + "grad_norm": 0.12842464447021484, + "learning_rate": 7.044738716906679e-06, + "loss": 0.0594, + "step": 1838 + }, + { + "epoch": 0.38545378327394675, + "grad_norm": 0.12296885251998901, + "learning_rate": 7.041640250491398e-06, + "loss": 0.0619, + "step": 1839 + }, + { + "epoch": 0.3856633829385873, + "grad_norm": 0.14263387024402618, + "learning_rate": 7.038540842880386e-06, + "loss": 0.06, + "step": 1840 + }, + { + "epoch": 0.38587298260322783, + "grad_norm": 0.14516735076904297, + "learning_rate": 7.035440495502469e-06, + "loss": 0.0623, + "step": 1841 + }, + { + "epoch": 0.38608258226786835, + "grad_norm": 0.12423399090766907, + "learning_rate": 7.0323392097869044e-06, + "loss": 0.0591, + "step": 1842 + }, + { + "epoch": 0.3862921819325089, + "grad_norm": 0.10325875133275986, + "learning_rate": 7.029236987163388e-06, + "loss": 0.0577, + "step": 1843 + }, + { + "epoch": 0.38650178159714943, + "grad_norm": 0.09640491008758545, + "learning_rate": 7.026133829062041e-06, + "loss": 0.0592, + "step": 1844 + }, + { + "epoch": 0.38671138126179, + "grad_norm": 0.09991448372602463, + "learning_rate": 7.023029736913421e-06, + "loss": 0.0593, + "step": 1845 + }, + { + "epoch": 0.3869209809264305, + "grad_norm": 0.10741396993398666, + "learning_rate": 7.019924712148511e-06, + "loss": 0.0589, + "step": 1846 + }, + { + "epoch": 0.387130580591071, + "grad_norm": 0.10864882171154022, + "learning_rate": 7.0168187561987285e-06, + "loss": 0.0581, + "step": 1847 + }, + { + "epoch": 0.3873401802557116, + "grad_norm": 0.10337284207344055, + "learning_rate": 7.013711870495919e-06, + "loss": 0.0576, + "step": 1848 + }, + { + "epoch": 0.3875497799203521, + "grad_norm": 0.1069745123386383, + "learning_rate": 7.010604056472355e-06, + "loss": 0.0572, + "step": 1849 + }, + { + "epoch": 0.3877593795849927, + "grad_norm": 0.09350664913654327, + "learning_rate": 7.0074953155607395e-06, + "loss": 0.0566, + "step": 1850 + }, + { + "epoch": 0.3879689792496332, + "grad_norm": 0.08867273479700089, + "learning_rate": 7.004385649194199e-06, + "loss": 0.0596, + "step": 1851 + }, + { + "epoch": 0.38817857891427376, + "grad_norm": 0.109881192445755, + "learning_rate": 7.001275058806292e-06, + "loss": 0.058, + "step": 1852 + }, + { + "epoch": 0.3883881785789143, + "grad_norm": 0.1241583600640297, + "learning_rate": 6.998163545830998e-06, + "loss": 0.0577, + "step": 1853 + }, + { + "epoch": 0.3885977782435548, + "grad_norm": 0.10716693848371506, + "learning_rate": 6.995051111702724e-06, + "loss": 0.0574, + "step": 1854 + }, + { + "epoch": 0.38880737790819536, + "grad_norm": 0.10287656635046005, + "learning_rate": 6.991937757856302e-06, + "loss": 0.0589, + "step": 1855 + }, + { + "epoch": 0.3890169775728359, + "grad_norm": 0.128378227353096, + "learning_rate": 6.9888234857269875e-06, + "loss": 0.0598, + "step": 1856 + }, + { + "epoch": 0.38922657723747645, + "grad_norm": 0.13415974378585815, + "learning_rate": 6.98570829675046e-06, + "loss": 0.0621, + "step": 1857 + }, + { + "epoch": 0.38943617690211696, + "grad_norm": 0.1083337590098381, + "learning_rate": 6.98259219236282e-06, + "loss": 0.0588, + "step": 1858 + }, + { + "epoch": 0.3896457765667575, + "grad_norm": 0.10138161480426788, + "learning_rate": 6.979475174000591e-06, + "loss": 0.0616, + "step": 1859 + }, + { + "epoch": 0.38985537623139804, + "grad_norm": 0.1118636503815651, + "learning_rate": 6.976357243100718e-06, + "loss": 0.0579, + "step": 1860 + }, + { + "epoch": 0.39006497589603856, + "grad_norm": 0.11336030066013336, + "learning_rate": 6.973238401100565e-06, + "loss": 0.0558, + "step": 1861 + }, + { + "epoch": 0.3902745755606791, + "grad_norm": 0.11859306693077087, + "learning_rate": 6.970118649437919e-06, + "loss": 0.0579, + "step": 1862 + }, + { + "epoch": 0.39048417522531964, + "grad_norm": 0.1387511044740677, + "learning_rate": 6.966997989550988e-06, + "loss": 0.0584, + "step": 1863 + }, + { + "epoch": 0.39069377488996015, + "grad_norm": 0.14954118430614471, + "learning_rate": 6.96387642287839e-06, + "loss": 0.0588, + "step": 1864 + }, + { + "epoch": 0.3909033745546007, + "grad_norm": 0.14520269632339478, + "learning_rate": 6.960753950859168e-06, + "loss": 0.0596, + "step": 1865 + }, + { + "epoch": 0.39111297421924124, + "grad_norm": 0.11415694653987885, + "learning_rate": 6.957630574932784e-06, + "loss": 0.0603, + "step": 1866 + }, + { + "epoch": 0.3913225738838818, + "grad_norm": 0.13128365576267242, + "learning_rate": 6.954506296539112e-06, + "loss": 0.0591, + "step": 1867 + }, + { + "epoch": 0.3915321735485223, + "grad_norm": 0.13788260519504547, + "learning_rate": 6.951381117118441e-06, + "loss": 0.059, + "step": 1868 + }, + { + "epoch": 0.39174177321316284, + "grad_norm": 0.1196761503815651, + "learning_rate": 6.948255038111482e-06, + "loss": 0.0558, + "step": 1869 + }, + { + "epoch": 0.3919513728778034, + "grad_norm": 0.13322733342647552, + "learning_rate": 6.945128060959354e-06, + "loss": 0.0576, + "step": 1870 + }, + { + "epoch": 0.3921609725424439, + "grad_norm": 0.12423675507307053, + "learning_rate": 6.942000187103594e-06, + "loss": 0.0593, + "step": 1871 + }, + { + "epoch": 0.3923705722070845, + "grad_norm": 0.1317249983549118, + "learning_rate": 6.938871417986153e-06, + "loss": 0.0581, + "step": 1872 + }, + { + "epoch": 0.392580171871725, + "grad_norm": 0.1220802292227745, + "learning_rate": 6.935741755049389e-06, + "loss": 0.0602, + "step": 1873 + }, + { + "epoch": 0.3927897715363655, + "grad_norm": 0.10902206599712372, + "learning_rate": 6.9326111997360775e-06, + "loss": 0.056, + "step": 1874 + }, + { + "epoch": 0.3929993712010061, + "grad_norm": 0.12212048470973969, + "learning_rate": 6.929479753489406e-06, + "loss": 0.0588, + "step": 1875 + }, + { + "epoch": 0.3932089708656466, + "grad_norm": 0.09955111145973206, + "learning_rate": 6.926347417752969e-06, + "loss": 0.0581, + "step": 1876 + }, + { + "epoch": 0.39341857053028717, + "grad_norm": 0.11181198060512543, + "learning_rate": 6.923214193970773e-06, + "loss": 0.0576, + "step": 1877 + }, + { + "epoch": 0.3936281701949277, + "grad_norm": 0.13282683491706848, + "learning_rate": 6.920080083587231e-06, + "loss": 0.0619, + "step": 1878 + }, + { + "epoch": 0.3938377698595682, + "grad_norm": 0.09793748706579208, + "learning_rate": 6.916945088047171e-06, + "loss": 0.0548, + "step": 1879 + }, + { + "epoch": 0.39404736952420877, + "grad_norm": 0.1649271547794342, + "learning_rate": 6.913809208795823e-06, + "loss": 0.06, + "step": 1880 + }, + { + "epoch": 0.3942569691888493, + "grad_norm": 0.1789517104625702, + "learning_rate": 6.910672447278827e-06, + "loss": 0.061, + "step": 1881 + }, + { + "epoch": 0.39446656885348985, + "grad_norm": 0.17387685179710388, + "learning_rate": 6.90753480494223e-06, + "loss": 0.0588, + "step": 1882 + }, + { + "epoch": 0.39467616851813037, + "grad_norm": 0.1808399260044098, + "learning_rate": 6.904396283232484e-06, + "loss": 0.0592, + "step": 1883 + }, + { + "epoch": 0.3948857681827709, + "grad_norm": 0.12922294437885284, + "learning_rate": 6.9012568835964474e-06, + "loss": 0.0611, + "step": 1884 + }, + { + "epoch": 0.39509536784741145, + "grad_norm": 0.14349722862243652, + "learning_rate": 6.898116607481382e-06, + "loss": 0.0583, + "step": 1885 + }, + { + "epoch": 0.39530496751205196, + "grad_norm": 0.14803367853164673, + "learning_rate": 6.894975456334956e-06, + "loss": 0.0587, + "step": 1886 + }, + { + "epoch": 0.39551456717669253, + "grad_norm": 0.15976615250110626, + "learning_rate": 6.891833431605237e-06, + "loss": 0.0583, + "step": 1887 + }, + { + "epoch": 0.39572416684133305, + "grad_norm": 0.1385822743177414, + "learning_rate": 6.8886905347406985e-06, + "loss": 0.0625, + "step": 1888 + }, + { + "epoch": 0.3959337665059736, + "grad_norm": 0.12182749807834625, + "learning_rate": 6.885546767190217e-06, + "loss": 0.0548, + "step": 1889 + }, + { + "epoch": 0.39614336617061413, + "grad_norm": 0.13791294395923615, + "learning_rate": 6.882402130403066e-06, + "loss": 0.0583, + "step": 1890 + }, + { + "epoch": 0.39635296583525464, + "grad_norm": 0.10044901072978973, + "learning_rate": 6.879256625828925e-06, + "loss": 0.057, + "step": 1891 + }, + { + "epoch": 0.3965625654998952, + "grad_norm": 0.13600683212280273, + "learning_rate": 6.8761102549178706e-06, + "loss": 0.0614, + "step": 1892 + }, + { + "epoch": 0.39677216516453573, + "grad_norm": 0.1270650029182434, + "learning_rate": 6.872963019120377e-06, + "loss": 0.0601, + "step": 1893 + }, + { + "epoch": 0.3969817648291763, + "grad_norm": 0.10988420993089676, + "learning_rate": 6.869814919887321e-06, + "loss": 0.0561, + "step": 1894 + }, + { + "epoch": 0.3971913644938168, + "grad_norm": 0.11587254703044891, + "learning_rate": 6.866665958669976e-06, + "loss": 0.0584, + "step": 1895 + }, + { + "epoch": 0.3974009641584573, + "grad_norm": 0.1132350042462349, + "learning_rate": 6.863516136920012e-06, + "loss": 0.0598, + "step": 1896 + }, + { + "epoch": 0.3976105638230979, + "grad_norm": 0.14506936073303223, + "learning_rate": 6.860365456089495e-06, + "loss": 0.058, + "step": 1897 + }, + { + "epoch": 0.3978201634877384, + "grad_norm": 0.12474244832992554, + "learning_rate": 6.8572139176308885e-06, + "loss": 0.0584, + "step": 1898 + }, + { + "epoch": 0.398029763152379, + "grad_norm": 0.1177903488278389, + "learning_rate": 6.854061522997053e-06, + "loss": 0.0563, + "step": 1899 + }, + { + "epoch": 0.3982393628170195, + "grad_norm": 0.11654134839773178, + "learning_rate": 6.85090827364124e-06, + "loss": 0.0556, + "step": 1900 + }, + { + "epoch": 0.39844896248166, + "grad_norm": 0.10216284543275833, + "learning_rate": 6.847754171017097e-06, + "loss": 0.0629, + "step": 1901 + }, + { + "epoch": 0.3986585621463006, + "grad_norm": 0.11993620544672012, + "learning_rate": 6.844599216578667e-06, + "loss": 0.0565, + "step": 1902 + }, + { + "epoch": 0.3988681618109411, + "grad_norm": 0.10797383636236191, + "learning_rate": 6.8414434117803785e-06, + "loss": 0.0574, + "step": 1903 + }, + { + "epoch": 0.39907776147558166, + "grad_norm": 0.111129529774189, + "learning_rate": 6.83828675807706e-06, + "loss": 0.0593, + "step": 1904 + }, + { + "epoch": 0.3992873611402222, + "grad_norm": 0.1041678711771965, + "learning_rate": 6.835129256923931e-06, + "loss": 0.0569, + "step": 1905 + }, + { + "epoch": 0.3994969608048627, + "grad_norm": 0.09219536185264587, + "learning_rate": 6.831970909776593e-06, + "loss": 0.0573, + "step": 1906 + }, + { + "epoch": 0.39970656046950326, + "grad_norm": 0.10893040150403976, + "learning_rate": 6.828811718091046e-06, + "loss": 0.0586, + "step": 1907 + }, + { + "epoch": 0.39991616013414377, + "grad_norm": 0.08363223820924759, + "learning_rate": 6.8256516833236774e-06, + "loss": 0.0584, + "step": 1908 + }, + { + "epoch": 0.40012575979878434, + "grad_norm": 0.1113632470369339, + "learning_rate": 6.822490806931262e-06, + "loss": 0.0603, + "step": 1909 + }, + { + "epoch": 0.40033535946342486, + "grad_norm": 0.13083884119987488, + "learning_rate": 6.819329090370964e-06, + "loss": 0.0586, + "step": 1910 + }, + { + "epoch": 0.40054495912806537, + "grad_norm": 0.12002018094062805, + "learning_rate": 6.816166535100332e-06, + "loss": 0.0589, + "step": 1911 + }, + { + "epoch": 0.40075455879270594, + "grad_norm": 0.13928495347499847, + "learning_rate": 6.813003142577306e-06, + "loss": 0.0586, + "step": 1912 + }, + { + "epoch": 0.40096415845734645, + "grad_norm": 0.14167679846286774, + "learning_rate": 6.809838914260208e-06, + "loss": 0.0567, + "step": 1913 + }, + { + "epoch": 0.401173758121987, + "grad_norm": 0.13765917718410492, + "learning_rate": 6.806673851607745e-06, + "loss": 0.0596, + "step": 1914 + }, + { + "epoch": 0.40138335778662754, + "grad_norm": 0.11585170775651932, + "learning_rate": 6.803507956079012e-06, + "loss": 0.0562, + "step": 1915 + }, + { + "epoch": 0.40159295745126805, + "grad_norm": 0.1145220398902893, + "learning_rate": 6.800341229133486e-06, + "loss": 0.0572, + "step": 1916 + }, + { + "epoch": 0.4018025571159086, + "grad_norm": 0.11287350952625275, + "learning_rate": 6.797173672231027e-06, + "loss": 0.0581, + "step": 1917 + }, + { + "epoch": 0.40201215678054913, + "grad_norm": 0.0974586084485054, + "learning_rate": 6.794005286831878e-06, + "loss": 0.0552, + "step": 1918 + }, + { + "epoch": 0.4022217564451897, + "grad_norm": 0.10374786704778671, + "learning_rate": 6.790836074396666e-06, + "loss": 0.0583, + "step": 1919 + }, + { + "epoch": 0.4024313561098302, + "grad_norm": 0.11808675527572632, + "learning_rate": 6.7876660363863955e-06, + "loss": 0.0586, + "step": 1920 + }, + { + "epoch": 0.4026409557744708, + "grad_norm": 0.11047716438770294, + "learning_rate": 6.784495174262452e-06, + "loss": 0.0589, + "step": 1921 + }, + { + "epoch": 0.4028505554391113, + "grad_norm": 0.09720434248447418, + "learning_rate": 6.781323489486606e-06, + "loss": 0.0586, + "step": 1922 + }, + { + "epoch": 0.4030601551037518, + "grad_norm": 0.11186391115188599, + "learning_rate": 6.778150983520999e-06, + "loss": 0.0589, + "step": 1923 + }, + { + "epoch": 0.4032697547683924, + "grad_norm": 0.11044489592313766, + "learning_rate": 6.774977657828159e-06, + "loss": 0.0598, + "step": 1924 + }, + { + "epoch": 0.4034793544330329, + "grad_norm": 0.08856026083230972, + "learning_rate": 6.771803513870988e-06, + "loss": 0.0551, + "step": 1925 + }, + { + "epoch": 0.40368895409767347, + "grad_norm": 0.11433659493923187, + "learning_rate": 6.768628553112763e-06, + "loss": 0.0583, + "step": 1926 + }, + { + "epoch": 0.403898553762314, + "grad_norm": 0.12080615013837814, + "learning_rate": 6.765452777017146e-06, + "loss": 0.0574, + "step": 1927 + }, + { + "epoch": 0.4041081534269545, + "grad_norm": 0.09817066043615341, + "learning_rate": 6.762276187048164e-06, + "loss": 0.0604, + "step": 1928 + }, + { + "epoch": 0.40431775309159507, + "grad_norm": 0.14070898294448853, + "learning_rate": 6.759098784670224e-06, + "loss": 0.058, + "step": 1929 + }, + { + "epoch": 0.4045273527562356, + "grad_norm": 0.16221067309379578, + "learning_rate": 6.755920571348111e-06, + "loss": 0.0593, + "step": 1930 + }, + { + "epoch": 0.40473695242087615, + "grad_norm": 0.14239110052585602, + "learning_rate": 6.75274154854698e-06, + "loss": 0.0559, + "step": 1931 + }, + { + "epoch": 0.40494655208551666, + "grad_norm": 0.1578344851732254, + "learning_rate": 6.749561717732359e-06, + "loss": 0.0568, + "step": 1932 + }, + { + "epoch": 0.4051561517501572, + "grad_norm": 0.11431273072957993, + "learning_rate": 6.7463810803701495e-06, + "loss": 0.0581, + "step": 1933 + }, + { + "epoch": 0.40536575141479775, + "grad_norm": 0.10527955740690231, + "learning_rate": 6.743199637926623e-06, + "loss": 0.0593, + "step": 1934 + }, + { + "epoch": 0.40557535107943826, + "grad_norm": 0.13100096583366394, + "learning_rate": 6.740017391868427e-06, + "loss": 0.059, + "step": 1935 + }, + { + "epoch": 0.40578495074407883, + "grad_norm": 0.11144158244132996, + "learning_rate": 6.736834343662576e-06, + "loss": 0.0579, + "step": 1936 + }, + { + "epoch": 0.40599455040871935, + "grad_norm": 0.13539659976959229, + "learning_rate": 6.73365049477645e-06, + "loss": 0.0581, + "step": 1937 + }, + { + "epoch": 0.40620415007335986, + "grad_norm": 0.13399067521095276, + "learning_rate": 6.7304658466778095e-06, + "loss": 0.0582, + "step": 1938 + }, + { + "epoch": 0.40641374973800043, + "grad_norm": 0.11665608733892441, + "learning_rate": 6.7272804008347705e-06, + "loss": 0.0557, + "step": 1939 + }, + { + "epoch": 0.40662334940264094, + "grad_norm": 0.12888075411319733, + "learning_rate": 6.7240941587158274e-06, + "loss": 0.0614, + "step": 1940 + }, + { + "epoch": 0.4068329490672815, + "grad_norm": 0.11636313796043396, + "learning_rate": 6.720907121789835e-06, + "loss": 0.0591, + "step": 1941 + }, + { + "epoch": 0.407042548731922, + "grad_norm": 0.11793368309736252, + "learning_rate": 6.717719291526016e-06, + "loss": 0.0597, + "step": 1942 + }, + { + "epoch": 0.40725214839656254, + "grad_norm": 0.13560403883457184, + "learning_rate": 6.71453066939396e-06, + "loss": 0.0574, + "step": 1943 + }, + { + "epoch": 0.4074617480612031, + "grad_norm": 0.09925508499145508, + "learning_rate": 6.711341256863623e-06, + "loss": 0.0591, + "step": 1944 + }, + { + "epoch": 0.4076713477258436, + "grad_norm": 0.1104302629828453, + "learning_rate": 6.708151055405321e-06, + "loss": 0.0595, + "step": 1945 + }, + { + "epoch": 0.4078809473904842, + "grad_norm": 0.1431284248828888, + "learning_rate": 6.704960066489738e-06, + "loss": 0.0606, + "step": 1946 + }, + { + "epoch": 0.4080905470551247, + "grad_norm": 0.1400553286075592, + "learning_rate": 6.701768291587918e-06, + "loss": 0.0602, + "step": 1947 + }, + { + "epoch": 0.4083001467197652, + "grad_norm": 0.16290639340877533, + "learning_rate": 6.698575732171269e-06, + "loss": 0.0591, + "step": 1948 + }, + { + "epoch": 0.4085097463844058, + "grad_norm": 0.14287681877613068, + "learning_rate": 6.695382389711561e-06, + "loss": 0.0571, + "step": 1949 + }, + { + "epoch": 0.4087193460490463, + "grad_norm": 0.13841015100479126, + "learning_rate": 6.692188265680923e-06, + "loss": 0.0579, + "step": 1950 + }, + { + "epoch": 0.4089289457136869, + "grad_norm": 0.1272697001695633, + "learning_rate": 6.688993361551847e-06, + "loss": 0.0557, + "step": 1951 + }, + { + "epoch": 0.4091385453783274, + "grad_norm": 0.16662158071994781, + "learning_rate": 6.6857976787971815e-06, + "loss": 0.0621, + "step": 1952 + }, + { + "epoch": 0.4093481450429679, + "grad_norm": 0.18397577106952667, + "learning_rate": 6.682601218890136e-06, + "loss": 0.06, + "step": 1953 + }, + { + "epoch": 0.4095577447076085, + "grad_norm": 0.14603066444396973, + "learning_rate": 6.679403983304278e-06, + "loss": 0.0582, + "step": 1954 + }, + { + "epoch": 0.409767344372249, + "grad_norm": 0.1586339771747589, + "learning_rate": 6.6762059735135325e-06, + "loss": 0.0604, + "step": 1955 + }, + { + "epoch": 0.40997694403688956, + "grad_norm": 0.1153809055685997, + "learning_rate": 6.673007190992181e-06, + "loss": 0.0626, + "step": 1956 + }, + { + "epoch": 0.41018654370153007, + "grad_norm": 0.15605716407299042, + "learning_rate": 6.669807637214862e-06, + "loss": 0.0584, + "step": 1957 + }, + { + "epoch": 0.41039614336617064, + "grad_norm": 0.17064720392227173, + "learning_rate": 6.66660731365657e-06, + "loss": 0.0572, + "step": 1958 + }, + { + "epoch": 0.41060574303081115, + "grad_norm": 0.141871839761734, + "learning_rate": 6.663406221792652e-06, + "loss": 0.0552, + "step": 1959 + }, + { + "epoch": 0.41081534269545167, + "grad_norm": 0.14091584086418152, + "learning_rate": 6.660204363098812e-06, + "loss": 0.06, + "step": 1960 + }, + { + "epoch": 0.41102494236009224, + "grad_norm": 0.10206598043441772, + "learning_rate": 6.657001739051105e-06, + "loss": 0.0599, + "step": 1961 + }, + { + "epoch": 0.41123454202473275, + "grad_norm": 0.1338677704334259, + "learning_rate": 6.65379835112594e-06, + "loss": 0.0609, + "step": 1962 + }, + { + "epoch": 0.4114441416893733, + "grad_norm": 0.13361436128616333, + "learning_rate": 6.65059420080008e-06, + "loss": 0.058, + "step": 1963 + }, + { + "epoch": 0.41165374135401384, + "grad_norm": 0.15927137434482574, + "learning_rate": 6.647389289550635e-06, + "loss": 0.061, + "step": 1964 + }, + { + "epoch": 0.41186334101865435, + "grad_norm": 0.11594053357839584, + "learning_rate": 6.64418361885507e-06, + "loss": 0.0563, + "step": 1965 + }, + { + "epoch": 0.4120729406832949, + "grad_norm": 0.10348820686340332, + "learning_rate": 6.640977190191198e-06, + "loss": 0.0613, + "step": 1966 + }, + { + "epoch": 0.41228254034793543, + "grad_norm": 0.14770418405532837, + "learning_rate": 6.637770005037182e-06, + "loss": 0.0593, + "step": 1967 + }, + { + "epoch": 0.412492140012576, + "grad_norm": 0.12195451557636261, + "learning_rate": 6.634562064871535e-06, + "loss": 0.0585, + "step": 1968 + }, + { + "epoch": 0.4127017396772165, + "grad_norm": 0.11958389729261398, + "learning_rate": 6.631353371173115e-06, + "loss": 0.058, + "step": 1969 + }, + { + "epoch": 0.41291133934185703, + "grad_norm": 0.09207571297883987, + "learning_rate": 6.628143925421129e-06, + "loss": 0.0556, + "step": 1970 + }, + { + "epoch": 0.4131209390064976, + "grad_norm": 0.10654924809932709, + "learning_rate": 6.624933729095133e-06, + "loss": 0.0564, + "step": 1971 + }, + { + "epoch": 0.4133305386711381, + "grad_norm": 0.12806440889835358, + "learning_rate": 6.621722783675024e-06, + "loss": 0.0573, + "step": 1972 + }, + { + "epoch": 0.4135401383357787, + "grad_norm": 0.15015651285648346, + "learning_rate": 6.6185110906410485e-06, + "loss": 0.0567, + "step": 1973 + }, + { + "epoch": 0.4137497380004192, + "grad_norm": 0.13939987123012543, + "learning_rate": 6.615298651473799e-06, + "loss": 0.058, + "step": 1974 + }, + { + "epoch": 0.4139593376650597, + "grad_norm": 0.10312556475400925, + "learning_rate": 6.612085467654204e-06, + "loss": 0.0589, + "step": 1975 + }, + { + "epoch": 0.4141689373297003, + "grad_norm": 0.11151403933763504, + "learning_rate": 6.608871540663544e-06, + "loss": 0.0598, + "step": 1976 + }, + { + "epoch": 0.4143785369943408, + "grad_norm": 0.10755238682031631, + "learning_rate": 6.605656871983439e-06, + "loss": 0.0598, + "step": 1977 + }, + { + "epoch": 0.41458813665898137, + "grad_norm": 0.09601911902427673, + "learning_rate": 6.6024414630958475e-06, + "loss": 0.0571, + "step": 1978 + }, + { + "epoch": 0.4147977363236219, + "grad_norm": 0.11368954926729202, + "learning_rate": 6.599225315483076e-06, + "loss": 0.0556, + "step": 1979 + }, + { + "epoch": 0.4150073359882624, + "grad_norm": 0.11148924380540848, + "learning_rate": 6.596008430627766e-06, + "loss": 0.0572, + "step": 1980 + }, + { + "epoch": 0.41521693565290296, + "grad_norm": 0.11703907698392868, + "learning_rate": 6.592790810012901e-06, + "loss": 0.0583, + "step": 1981 + }, + { + "epoch": 0.4154265353175435, + "grad_norm": 0.1259683072566986, + "learning_rate": 6.589572455121804e-06, + "loss": 0.0599, + "step": 1982 + }, + { + "epoch": 0.41563613498218405, + "grad_norm": 0.12415824085474014, + "learning_rate": 6.586353367438134e-06, + "loss": 0.0568, + "step": 1983 + }, + { + "epoch": 0.41584573464682456, + "grad_norm": 0.11688219755887985, + "learning_rate": 6.5831335484458915e-06, + "loss": 0.0564, + "step": 1984 + }, + { + "epoch": 0.4160553343114651, + "grad_norm": 0.10014292597770691, + "learning_rate": 6.579912999629412e-06, + "loss": 0.0595, + "step": 1985 + }, + { + "epoch": 0.41626493397610564, + "grad_norm": 0.09642448276281357, + "learning_rate": 6.576691722473368e-06, + "loss": 0.0587, + "step": 1986 + }, + { + "epoch": 0.41647453364074616, + "grad_norm": 0.10601100325584412, + "learning_rate": 6.573469718462768e-06, + "loss": 0.0572, + "step": 1987 + }, + { + "epoch": 0.41668413330538673, + "grad_norm": 0.10878970474004745, + "learning_rate": 6.570246989082954e-06, + "loss": 0.0598, + "step": 1988 + }, + { + "epoch": 0.41689373297002724, + "grad_norm": 0.11488792300224304, + "learning_rate": 6.567023535819605e-06, + "loss": 0.0606, + "step": 1989 + }, + { + "epoch": 0.4171033326346678, + "grad_norm": 0.1324160397052765, + "learning_rate": 6.5637993601587305e-06, + "loss": 0.058, + "step": 1990 + }, + { + "epoch": 0.4173129322993083, + "grad_norm": 0.1504766047000885, + "learning_rate": 6.560574463586677e-06, + "loss": 0.0565, + "step": 1991 + }, + { + "epoch": 0.41752253196394884, + "grad_norm": 0.1516587734222412, + "learning_rate": 6.557348847590118e-06, + "loss": 0.0564, + "step": 1992 + }, + { + "epoch": 0.4177321316285894, + "grad_norm": 0.14044304192066193, + "learning_rate": 6.554122513656065e-06, + "loss": 0.0586, + "step": 1993 + }, + { + "epoch": 0.4179417312932299, + "grad_norm": 0.11305470019578934, + "learning_rate": 6.550895463271856e-06, + "loss": 0.0593, + "step": 1994 + }, + { + "epoch": 0.4181513309578705, + "grad_norm": 0.11471327394247055, + "learning_rate": 6.547667697925161e-06, + "loss": 0.055, + "step": 1995 + }, + { + "epoch": 0.418360930622511, + "grad_norm": 0.15590576827526093, + "learning_rate": 6.544439219103977e-06, + "loss": 0.059, + "step": 1996 + }, + { + "epoch": 0.4185705302871515, + "grad_norm": 0.16013385355472565, + "learning_rate": 6.5412100282966366e-06, + "loss": 0.0576, + "step": 1997 + }, + { + "epoch": 0.4187801299517921, + "grad_norm": 0.13815461099147797, + "learning_rate": 6.53798012699179e-06, + "loss": 0.0577, + "step": 1998 + }, + { + "epoch": 0.4189897296164326, + "grad_norm": 0.11431432515382767, + "learning_rate": 6.534749516678427e-06, + "loss": 0.0585, + "step": 1999 + }, + { + "epoch": 0.4191993292810732, + "grad_norm": 0.1008606106042862, + "learning_rate": 6.531518198845854e-06, + "loss": 0.0561, + "step": 2000 + }, + { + "epoch": 0.4194089289457137, + "grad_norm": 0.10297921299934387, + "learning_rate": 6.52828617498371e-06, + "loss": 0.0571, + "step": 2001 + }, + { + "epoch": 0.4196185286103542, + "grad_norm": 0.10644082725048065, + "learning_rate": 6.525053446581957e-06, + "loss": 0.0551, + "step": 2002 + }, + { + "epoch": 0.41982812827499477, + "grad_norm": 0.10528367012739182, + "learning_rate": 6.52182001513088e-06, + "loss": 0.0583, + "step": 2003 + }, + { + "epoch": 0.4200377279396353, + "grad_norm": 0.09672725200653076, + "learning_rate": 6.518585882121096e-06, + "loss": 0.0558, + "step": 2004 + }, + { + "epoch": 0.42024732760427586, + "grad_norm": 0.09464428573846817, + "learning_rate": 6.515351049043533e-06, + "loss": 0.058, + "step": 2005 + }, + { + "epoch": 0.42045692726891637, + "grad_norm": 0.10385248810052872, + "learning_rate": 6.5121155173894515e-06, + "loss": 0.0584, + "step": 2006 + }, + { + "epoch": 0.4206665269335569, + "grad_norm": 0.10311248153448105, + "learning_rate": 6.508879288650431e-06, + "loss": 0.0564, + "step": 2007 + }, + { + "epoch": 0.42087612659819745, + "grad_norm": 0.08637700229883194, + "learning_rate": 6.505642364318372e-06, + "loss": 0.0596, + "step": 2008 + }, + { + "epoch": 0.42108572626283797, + "grad_norm": 0.10172644257545471, + "learning_rate": 6.502404745885495e-06, + "loss": 0.0594, + "step": 2009 + }, + { + "epoch": 0.42129532592747854, + "grad_norm": 0.12740617990493774, + "learning_rate": 6.499166434844344e-06, + "loss": 0.058, + "step": 2010 + }, + { + "epoch": 0.42150492559211905, + "grad_norm": 0.12112673372030258, + "learning_rate": 6.495927432687777e-06, + "loss": 0.0566, + "step": 2011 + }, + { + "epoch": 0.42171452525675956, + "grad_norm": 0.10495518893003464, + "learning_rate": 6.492687740908973e-06, + "loss": 0.0581, + "step": 2012 + }, + { + "epoch": 0.42192412492140013, + "grad_norm": 0.11961285024881363, + "learning_rate": 6.489447361001431e-06, + "loss": 0.0561, + "step": 2013 + }, + { + "epoch": 0.42213372458604065, + "grad_norm": 0.1219894215464592, + "learning_rate": 6.486206294458966e-06, + "loss": 0.0599, + "step": 2014 + }, + { + "epoch": 0.4223433242506812, + "grad_norm": 0.11397355794906616, + "learning_rate": 6.482964542775707e-06, + "loss": 0.0564, + "step": 2015 + }, + { + "epoch": 0.42255292391532173, + "grad_norm": 0.10773434489965439, + "learning_rate": 6.479722107446102e-06, + "loss": 0.0577, + "step": 2016 + }, + { + "epoch": 0.42276252357996225, + "grad_norm": 0.09557615965604782, + "learning_rate": 6.476478989964914e-06, + "loss": 0.0588, + "step": 2017 + }, + { + "epoch": 0.4229721232446028, + "grad_norm": 0.09783432632684708, + "learning_rate": 6.473235191827219e-06, + "loss": 0.056, + "step": 2018 + }, + { + "epoch": 0.42318172290924333, + "grad_norm": 0.10292766988277435, + "learning_rate": 6.469990714528403e-06, + "loss": 0.0593, + "step": 2019 + }, + { + "epoch": 0.4233913225738839, + "grad_norm": 0.09302861243486404, + "learning_rate": 6.466745559564175e-06, + "loss": 0.0565, + "step": 2020 + }, + { + "epoch": 0.4236009222385244, + "grad_norm": 0.1034521758556366, + "learning_rate": 6.463499728430549e-06, + "loss": 0.0567, + "step": 2021 + }, + { + "epoch": 0.4238105219031649, + "grad_norm": 0.11888016015291214, + "learning_rate": 6.460253222623851e-06, + "loss": 0.0578, + "step": 2022 + }, + { + "epoch": 0.4240201215678055, + "grad_norm": 0.11663860082626343, + "learning_rate": 6.457006043640722e-06, + "loss": 0.0627, + "step": 2023 + }, + { + "epoch": 0.424229721232446, + "grad_norm": 0.11908293515443802, + "learning_rate": 6.453758192978109e-06, + "loss": 0.0577, + "step": 2024 + }, + { + "epoch": 0.4244393208970866, + "grad_norm": 0.11339572072029114, + "learning_rate": 6.450509672133272e-06, + "loss": 0.0609, + "step": 2025 + }, + { + "epoch": 0.4246489205617271, + "grad_norm": 0.10132212191820145, + "learning_rate": 6.447260482603776e-06, + "loss": 0.0551, + "step": 2026 + }, + { + "epoch": 0.42485852022636766, + "grad_norm": 0.119560107588768, + "learning_rate": 6.444010625887498e-06, + "loss": 0.0589, + "step": 2027 + }, + { + "epoch": 0.4250681198910082, + "grad_norm": 0.12544332444667816, + "learning_rate": 6.4407601034826225e-06, + "loss": 0.0594, + "step": 2028 + }, + { + "epoch": 0.4252777195556487, + "grad_norm": 0.10760358721017838, + "learning_rate": 6.437508916887638e-06, + "loss": 0.0583, + "step": 2029 + }, + { + "epoch": 0.42548731922028926, + "grad_norm": 0.08328445255756378, + "learning_rate": 6.4342570676013415e-06, + "loss": 0.0586, + "step": 2030 + }, + { + "epoch": 0.4256969188849298, + "grad_norm": 0.09850215911865234, + "learning_rate": 6.4310045571228344e-06, + "loss": 0.0574, + "step": 2031 + }, + { + "epoch": 0.42590651854957035, + "grad_norm": 0.11578439176082611, + "learning_rate": 6.427751386951525e-06, + "loss": 0.0607, + "step": 2032 + }, + { + "epoch": 0.42611611821421086, + "grad_norm": 0.10482225567102432, + "learning_rate": 6.424497558587122e-06, + "loss": 0.0572, + "step": 2033 + }, + { + "epoch": 0.4263257178788514, + "grad_norm": 0.11043455451726913, + "learning_rate": 6.421243073529639e-06, + "loss": 0.0568, + "step": 2034 + }, + { + "epoch": 0.42653531754349194, + "grad_norm": 0.12734782695770264, + "learning_rate": 6.417987933279397e-06, + "loss": 0.0573, + "step": 2035 + }, + { + "epoch": 0.42674491720813246, + "grad_norm": 0.11755295097827911, + "learning_rate": 6.41473213933701e-06, + "loss": 0.0586, + "step": 2036 + }, + { + "epoch": 0.426954516872773, + "grad_norm": 0.10255907475948334, + "learning_rate": 6.411475693203402e-06, + "loss": 0.0627, + "step": 2037 + }, + { + "epoch": 0.42716411653741354, + "grad_norm": 0.07925237715244293, + "learning_rate": 6.408218596379792e-06, + "loss": 0.0562, + "step": 2038 + }, + { + "epoch": 0.42737371620205405, + "grad_norm": 0.07128457725048065, + "learning_rate": 6.404960850367701e-06, + "loss": 0.0545, + "step": 2039 + }, + { + "epoch": 0.4275833158666946, + "grad_norm": 0.08888307213783264, + "learning_rate": 6.4017024566689515e-06, + "loss": 0.0587, + "step": 2040 + }, + { + "epoch": 0.42779291553133514, + "grad_norm": 0.09860040247440338, + "learning_rate": 6.39844341678566e-06, + "loss": 0.0571, + "step": 2041 + }, + { + "epoch": 0.4280025151959757, + "grad_norm": 0.09772216528654099, + "learning_rate": 6.395183732220242e-06, + "loss": 0.0591, + "step": 2042 + }, + { + "epoch": 0.4282121148606162, + "grad_norm": 0.08198471367359161, + "learning_rate": 6.391923404475416e-06, + "loss": 0.0592, + "step": 2043 + }, + { + "epoch": 0.42842171452525674, + "grad_norm": 0.07226858288049698, + "learning_rate": 6.388662435054187e-06, + "loss": 0.0569, + "step": 2044 + }, + { + "epoch": 0.4286313141898973, + "grad_norm": 0.1091211661696434, + "learning_rate": 6.385400825459865e-06, + "loss": 0.0546, + "step": 2045 + }, + { + "epoch": 0.4288409138545378, + "grad_norm": 0.11992846429347992, + "learning_rate": 6.382138577196052e-06, + "loss": 0.0588, + "step": 2046 + }, + { + "epoch": 0.4290505135191784, + "grad_norm": 0.09196002781391144, + "learning_rate": 6.378875691766639e-06, + "loss": 0.0565, + "step": 2047 + }, + { + "epoch": 0.4292601131838189, + "grad_norm": 0.15287606418132782, + "learning_rate": 6.375612170675821e-06, + "loss": 0.0568, + "step": 2048 + }, + { + "epoch": 0.4294697128484594, + "grad_norm": 0.15699946880340576, + "learning_rate": 6.372348015428077e-06, + "loss": 0.0595, + "step": 2049 + }, + { + "epoch": 0.4296793125131, + "grad_norm": 0.12692134082317352, + "learning_rate": 6.3690832275281835e-06, + "loss": 0.059, + "step": 2050 + }, + { + "epoch": 0.4298889121777405, + "grad_norm": 0.14513273537158966, + "learning_rate": 6.3658178084812065e-06, + "loss": 0.0585, + "step": 2051 + }, + { + "epoch": 0.43009851184238107, + "grad_norm": 0.13621389865875244, + "learning_rate": 6.3625517597925025e-06, + "loss": 0.0557, + "step": 2052 + }, + { + "epoch": 0.4303081115070216, + "grad_norm": 0.14017432928085327, + "learning_rate": 6.359285082967721e-06, + "loss": 0.0575, + "step": 2053 + }, + { + "epoch": 0.4305177111716621, + "grad_norm": 0.11645541340112686, + "learning_rate": 6.356017779512799e-06, + "loss": 0.0586, + "step": 2054 + }, + { + "epoch": 0.43072731083630267, + "grad_norm": 0.12075808644294739, + "learning_rate": 6.352749850933961e-06, + "loss": 0.0586, + "step": 2055 + }, + { + "epoch": 0.4309369105009432, + "grad_norm": 0.1253325641155243, + "learning_rate": 6.349481298737723e-06, + "loss": 0.057, + "step": 2056 + }, + { + "epoch": 0.43114651016558375, + "grad_norm": 0.12212934345006943, + "learning_rate": 6.346212124430888e-06, + "loss": 0.0561, + "step": 2057 + }, + { + "epoch": 0.43135610983022427, + "grad_norm": 0.10774050652980804, + "learning_rate": 6.342942329520543e-06, + "loss": 0.0599, + "step": 2058 + }, + { + "epoch": 0.43156570949486484, + "grad_norm": 0.08897628635168076, + "learning_rate": 6.339671915514062e-06, + "loss": 0.0602, + "step": 2059 + }, + { + "epoch": 0.43177530915950535, + "grad_norm": 0.12079823017120361, + "learning_rate": 6.336400883919106e-06, + "loss": 0.0568, + "step": 2060 + }, + { + "epoch": 0.43198490882414586, + "grad_norm": 0.11661111563444138, + "learning_rate": 6.33312923624362e-06, + "loss": 0.0562, + "step": 2061 + }, + { + "epoch": 0.43219450848878643, + "grad_norm": 0.09649336338043213, + "learning_rate": 6.329856973995835e-06, + "loss": 0.0592, + "step": 2062 + }, + { + "epoch": 0.43240410815342695, + "grad_norm": 0.12174370884895325, + "learning_rate": 6.32658409868426e-06, + "loss": 0.0586, + "step": 2063 + }, + { + "epoch": 0.4326137078180675, + "grad_norm": 0.13494165241718292, + "learning_rate": 6.32331061181769e-06, + "loss": 0.0575, + "step": 2064 + }, + { + "epoch": 0.43282330748270803, + "grad_norm": 0.1156393364071846, + "learning_rate": 6.320036514905204e-06, + "loss": 0.0577, + "step": 2065 + }, + { + "epoch": 0.43303290714734854, + "grad_norm": 0.11403451859951019, + "learning_rate": 6.316761809456159e-06, + "loss": 0.0599, + "step": 2066 + }, + { + "epoch": 0.4332425068119891, + "grad_norm": 0.1350114345550537, + "learning_rate": 6.313486496980192e-06, + "loss": 0.0572, + "step": 2067 + }, + { + "epoch": 0.43345210647662963, + "grad_norm": 0.10050825029611588, + "learning_rate": 6.310210578987225e-06, + "loss": 0.0613, + "step": 2068 + }, + { + "epoch": 0.4336617061412702, + "grad_norm": 0.1249486580491066, + "learning_rate": 6.306934056987452e-06, + "loss": 0.0597, + "step": 2069 + }, + { + "epoch": 0.4338713058059107, + "grad_norm": 0.15121574699878693, + "learning_rate": 6.303656932491349e-06, + "loss": 0.0558, + "step": 2070 + }, + { + "epoch": 0.4340809054705512, + "grad_norm": 0.12158171832561493, + "learning_rate": 6.3003792070096735e-06, + "loss": 0.059, + "step": 2071 + }, + { + "epoch": 0.4342905051351918, + "grad_norm": 0.11560632288455963, + "learning_rate": 6.297100882053451e-06, + "loss": 0.0586, + "step": 2072 + }, + { + "epoch": 0.4345001047998323, + "grad_norm": 0.1341363489627838, + "learning_rate": 6.293821959133993e-06, + "loss": 0.0616, + "step": 2073 + }, + { + "epoch": 0.4347097044644729, + "grad_norm": 0.11600136011838913, + "learning_rate": 6.2905424397628816e-06, + "loss": 0.0593, + "step": 2074 + }, + { + "epoch": 0.4349193041291134, + "grad_norm": 0.10497263818979263, + "learning_rate": 6.28726232545197e-06, + "loss": 0.0561, + "step": 2075 + }, + { + "epoch": 0.4351289037937539, + "grad_norm": 0.10920844972133636, + "learning_rate": 6.283981617713397e-06, + "loss": 0.0577, + "step": 2076 + }, + { + "epoch": 0.4353385034583945, + "grad_norm": 0.11021441221237183, + "learning_rate": 6.280700318059563e-06, + "loss": 0.0574, + "step": 2077 + }, + { + "epoch": 0.435548103123035, + "grad_norm": 0.09715298563241959, + "learning_rate": 6.277418428003149e-06, + "loss": 0.0599, + "step": 2078 + }, + { + "epoch": 0.43575770278767556, + "grad_norm": 0.0873681902885437, + "learning_rate": 6.274135949057107e-06, + "loss": 0.0556, + "step": 2079 + }, + { + "epoch": 0.4359673024523161, + "grad_norm": 0.11400279402732849, + "learning_rate": 6.270852882734654e-06, + "loss": 0.0596, + "step": 2080 + }, + { + "epoch": 0.4361769021169566, + "grad_norm": 0.10710210353136063, + "learning_rate": 6.267569230549288e-06, + "loss": 0.0607, + "step": 2081 + }, + { + "epoch": 0.43638650178159716, + "grad_norm": 0.1295563280582428, + "learning_rate": 6.26428499401477e-06, + "loss": 0.0584, + "step": 2082 + }, + { + "epoch": 0.43659610144623767, + "grad_norm": 0.1621527224779129, + "learning_rate": 6.261000174645131e-06, + "loss": 0.0576, + "step": 2083 + }, + { + "epoch": 0.43680570111087824, + "grad_norm": 0.17567972838878632, + "learning_rate": 6.257714773954674e-06, + "loss": 0.057, + "step": 2084 + }, + { + "epoch": 0.43701530077551876, + "grad_norm": 0.16140656173229218, + "learning_rate": 6.254428793457967e-06, + "loss": 0.0583, + "step": 2085 + }, + { + "epoch": 0.43722490044015927, + "grad_norm": 0.13268983364105225, + "learning_rate": 6.251142234669848e-06, + "loss": 0.0565, + "step": 2086 + }, + { + "epoch": 0.43743450010479984, + "grad_norm": 0.11374874413013458, + "learning_rate": 6.24785509910542e-06, + "loss": 0.056, + "step": 2087 + }, + { + "epoch": 0.43764409976944035, + "grad_norm": 0.09471353888511658, + "learning_rate": 6.244567388280047e-06, + "loss": 0.0595, + "step": 2088 + }, + { + "epoch": 0.4378536994340809, + "grad_norm": 0.12923859059810638, + "learning_rate": 6.241279103709368e-06, + "loss": 0.0555, + "step": 2089 + }, + { + "epoch": 0.43806329909872144, + "grad_norm": 0.12571561336517334, + "learning_rate": 6.23799024690928e-06, + "loss": 0.0587, + "step": 2090 + }, + { + "epoch": 0.43827289876336195, + "grad_norm": 0.1770731657743454, + "learning_rate": 6.234700819395946e-06, + "loss": 0.0601, + "step": 2091 + }, + { + "epoch": 0.4384824984280025, + "grad_norm": 0.15990307927131653, + "learning_rate": 6.231410822685791e-06, + "loss": 0.0585, + "step": 2092 + }, + { + "epoch": 0.43869209809264303, + "grad_norm": 0.11964087188243866, + "learning_rate": 6.228120258295501e-06, + "loss": 0.0585, + "step": 2093 + }, + { + "epoch": 0.4389016977572836, + "grad_norm": 0.17662663757801056, + "learning_rate": 6.224829127742028e-06, + "loss": 0.0592, + "step": 2094 + }, + { + "epoch": 0.4391112974219241, + "grad_norm": 0.17629125714302063, + "learning_rate": 6.221537432542581e-06, + "loss": 0.0581, + "step": 2095 + }, + { + "epoch": 0.4393208970865647, + "grad_norm": 0.20414410531520844, + "learning_rate": 6.218245174214632e-06, + "loss": 0.0573, + "step": 2096 + }, + { + "epoch": 0.4395304967512052, + "grad_norm": 0.16704221069812775, + "learning_rate": 6.21495235427591e-06, + "loss": 0.0575, + "step": 2097 + }, + { + "epoch": 0.4397400964158457, + "grad_norm": 0.11289860308170319, + "learning_rate": 6.211658974244407e-06, + "loss": 0.0558, + "step": 2098 + }, + { + "epoch": 0.4399496960804863, + "grad_norm": 0.17604808509349823, + "learning_rate": 6.208365035638366e-06, + "loss": 0.0585, + "step": 2099 + }, + { + "epoch": 0.4401592957451268, + "grad_norm": 0.18010827898979187, + "learning_rate": 6.205070539976297e-06, + "loss": 0.0616, + "step": 2100 + }, + { + "epoch": 0.44036889540976737, + "grad_norm": 0.15220797061920166, + "learning_rate": 6.2017754887769576e-06, + "loss": 0.0565, + "step": 2101 + }, + { + "epoch": 0.4405784950744079, + "grad_norm": 0.12355081737041473, + "learning_rate": 6.198479883559367e-06, + "loss": 0.0581, + "step": 2102 + }, + { + "epoch": 0.4407880947390484, + "grad_norm": 0.16100534796714783, + "learning_rate": 6.195183725842799e-06, + "loss": 0.0599, + "step": 2103 + }, + { + "epoch": 0.44099769440368897, + "grad_norm": 0.1588708758354187, + "learning_rate": 6.191887017146784e-06, + "loss": 0.0561, + "step": 2104 + }, + { + "epoch": 0.4412072940683295, + "grad_norm": 0.10125992447137833, + "learning_rate": 6.1885897589911e-06, + "loss": 0.0616, + "step": 2105 + }, + { + "epoch": 0.44141689373297005, + "grad_norm": 0.16263973712921143, + "learning_rate": 6.185291952895784e-06, + "loss": 0.0601, + "step": 2106 + }, + { + "epoch": 0.44162649339761056, + "grad_norm": 0.15129660069942474, + "learning_rate": 6.181993600381126e-06, + "loss": 0.0571, + "step": 2107 + }, + { + "epoch": 0.4418360930622511, + "grad_norm": 0.13963650166988373, + "learning_rate": 6.17869470296766e-06, + "loss": 0.0569, + "step": 2108 + }, + { + "epoch": 0.44204569272689165, + "grad_norm": 0.13959255814552307, + "learning_rate": 6.175395262176184e-06, + "loss": 0.0612, + "step": 2109 + }, + { + "epoch": 0.44225529239153216, + "grad_norm": 0.13720089197158813, + "learning_rate": 6.172095279527735e-06, + "loss": 0.0559, + "step": 2110 + }, + { + "epoch": 0.44246489205617273, + "grad_norm": 0.1654689759016037, + "learning_rate": 6.168794756543605e-06, + "loss": 0.057, + "step": 2111 + }, + { + "epoch": 0.44267449172081325, + "grad_norm": 0.145694300532341, + "learning_rate": 6.1654936947453355e-06, + "loss": 0.0561, + "step": 2112 + }, + { + "epoch": 0.44288409138545376, + "grad_norm": 0.13383303582668304, + "learning_rate": 6.162192095654714e-06, + "loss": 0.0579, + "step": 2113 + }, + { + "epoch": 0.44309369105009433, + "grad_norm": 0.1771506667137146, + "learning_rate": 6.158889960793779e-06, + "loss": 0.0558, + "step": 2114 + }, + { + "epoch": 0.44330329071473484, + "grad_norm": 0.1819649338722229, + "learning_rate": 6.155587291684814e-06, + "loss": 0.0574, + "step": 2115 + }, + { + "epoch": 0.4435128903793754, + "grad_norm": 0.16896513104438782, + "learning_rate": 6.1522840898503446e-06, + "loss": 0.0588, + "step": 2116 + }, + { + "epoch": 0.4437224900440159, + "grad_norm": 0.135847270488739, + "learning_rate": 6.148980356813151e-06, + "loss": 0.0569, + "step": 2117 + }, + { + "epoch": 0.44393208970865644, + "grad_norm": 0.20298628509044647, + "learning_rate": 6.145676094096251e-06, + "loss": 0.0624, + "step": 2118 + }, + { + "epoch": 0.444141689373297, + "grad_norm": 0.15171730518341064, + "learning_rate": 6.142371303222909e-06, + "loss": 0.0567, + "step": 2119 + }, + { + "epoch": 0.4443512890379375, + "grad_norm": 0.1395554542541504, + "learning_rate": 6.139065985716635e-06, + "loss": 0.0578, + "step": 2120 + }, + { + "epoch": 0.4445608887025781, + "grad_norm": 0.14991877973079681, + "learning_rate": 6.135760143101177e-06, + "loss": 0.0587, + "step": 2121 + }, + { + "epoch": 0.4447704883672186, + "grad_norm": 0.16107945144176483, + "learning_rate": 6.13245377690053e-06, + "loss": 0.058, + "step": 2122 + }, + { + "epoch": 0.4449800880318591, + "grad_norm": 0.1380777806043625, + "learning_rate": 6.129146888638928e-06, + "loss": 0.0574, + "step": 2123 + }, + { + "epoch": 0.4451896876964997, + "grad_norm": 0.10554420948028564, + "learning_rate": 6.1258394798408424e-06, + "loss": 0.0586, + "step": 2124 + }, + { + "epoch": 0.4453992873611402, + "grad_norm": 0.1480768322944641, + "learning_rate": 6.122531552030992e-06, + "loss": 0.0602, + "step": 2125 + }, + { + "epoch": 0.4456088870257808, + "grad_norm": 0.1352657824754715, + "learning_rate": 6.119223106734328e-06, + "loss": 0.0581, + "step": 2126 + }, + { + "epoch": 0.4458184866904213, + "grad_norm": 0.1190410628914833, + "learning_rate": 6.115914145476045e-06, + "loss": 0.0586, + "step": 2127 + }, + { + "epoch": 0.44602808635506186, + "grad_norm": 0.13114245235919952, + "learning_rate": 6.112604669781572e-06, + "loss": 0.0564, + "step": 2128 + }, + { + "epoch": 0.4462376860197024, + "grad_norm": 0.1338747888803482, + "learning_rate": 6.109294681176578e-06, + "loss": 0.0603, + "step": 2129 + }, + { + "epoch": 0.4464472856843429, + "grad_norm": 0.12414805591106415, + "learning_rate": 6.105984181186968e-06, + "loss": 0.0564, + "step": 2130 + }, + { + "epoch": 0.44665688534898346, + "grad_norm": 0.09722696989774704, + "learning_rate": 6.102673171338878e-06, + "loss": 0.0565, + "step": 2131 + }, + { + "epoch": 0.44686648501362397, + "grad_norm": 0.13036277890205383, + "learning_rate": 6.099361653158687e-06, + "loss": 0.0595, + "step": 2132 + }, + { + "epoch": 0.44707608467826454, + "grad_norm": 0.12158705294132233, + "learning_rate": 6.0960496281729995e-06, + "loss": 0.0564, + "step": 2133 + }, + { + "epoch": 0.44728568434290505, + "grad_norm": 0.13005104660987854, + "learning_rate": 6.092737097908663e-06, + "loss": 0.0613, + "step": 2134 + }, + { + "epoch": 0.44749528400754557, + "grad_norm": 0.1160380020737648, + "learning_rate": 6.08942406389275e-06, + "loss": 0.0599, + "step": 2135 + }, + { + "epoch": 0.44770488367218614, + "grad_norm": 0.11387168616056442, + "learning_rate": 6.086110527652571e-06, + "loss": 0.0565, + "step": 2136 + }, + { + "epoch": 0.44791448333682665, + "grad_norm": 0.15158362686634064, + "learning_rate": 6.082796490715666e-06, + "loss": 0.0593, + "step": 2137 + }, + { + "epoch": 0.4481240830014672, + "grad_norm": 0.12060023099184036, + "learning_rate": 6.0794819546098006e-06, + "loss": 0.0565, + "step": 2138 + }, + { + "epoch": 0.44833368266610774, + "grad_norm": 0.12229592353105545, + "learning_rate": 6.076166920862979e-06, + "loss": 0.0584, + "step": 2139 + }, + { + "epoch": 0.44854328233074825, + "grad_norm": 0.10849461704492569, + "learning_rate": 6.072851391003432e-06, + "loss": 0.0565, + "step": 2140 + }, + { + "epoch": 0.4487528819953888, + "grad_norm": 0.1278999298810959, + "learning_rate": 6.069535366559615e-06, + "loss": 0.0572, + "step": 2141 + }, + { + "epoch": 0.44896248166002933, + "grad_norm": 0.1158517524600029, + "learning_rate": 6.066218849060217e-06, + "loss": 0.0586, + "step": 2142 + }, + { + "epoch": 0.4491720813246699, + "grad_norm": 0.11099664121866226, + "learning_rate": 6.0629018400341514e-06, + "loss": 0.0569, + "step": 2143 + }, + { + "epoch": 0.4493816809893104, + "grad_norm": 0.10765625536441803, + "learning_rate": 6.059584341010556e-06, + "loss": 0.0578, + "step": 2144 + }, + { + "epoch": 0.44959128065395093, + "grad_norm": 0.10618588328361511, + "learning_rate": 6.056266353518803e-06, + "loss": 0.0557, + "step": 2145 + }, + { + "epoch": 0.4498008803185915, + "grad_norm": 0.11303580552339554, + "learning_rate": 6.052947879088479e-06, + "loss": 0.0572, + "step": 2146 + }, + { + "epoch": 0.450010479983232, + "grad_norm": 0.09245932102203369, + "learning_rate": 6.0496289192494e-06, + "loss": 0.0542, + "step": 2147 + }, + { + "epoch": 0.4502200796478726, + "grad_norm": 0.09954038262367249, + "learning_rate": 6.046309475531609e-06, + "loss": 0.0579, + "step": 2148 + }, + { + "epoch": 0.4504296793125131, + "grad_norm": 0.08930417150259018, + "learning_rate": 6.0429895494653655e-06, + "loss": 0.0557, + "step": 2149 + }, + { + "epoch": 0.4506392789771536, + "grad_norm": 0.10569532960653305, + "learning_rate": 6.039669142581157e-06, + "loss": 0.0588, + "step": 2150 + }, + { + "epoch": 0.4508488786417942, + "grad_norm": 0.10851980745792389, + "learning_rate": 6.036348256409692e-06, + "loss": 0.0586, + "step": 2151 + }, + { + "epoch": 0.4510584783064347, + "grad_norm": 0.10847117751836777, + "learning_rate": 6.0330268924818925e-06, + "loss": 0.0563, + "step": 2152 + }, + { + "epoch": 0.45126807797107527, + "grad_norm": 0.10027307271957397, + "learning_rate": 6.029705052328912e-06, + "loss": 0.058, + "step": 2153 + }, + { + "epoch": 0.4514776776357158, + "grad_norm": 0.08025722205638885, + "learning_rate": 6.026382737482116e-06, + "loss": 0.0541, + "step": 2154 + }, + { + "epoch": 0.4516872773003563, + "grad_norm": 0.09462866187095642, + "learning_rate": 6.023059949473091e-06, + "loss": 0.0561, + "step": 2155 + }, + { + "epoch": 0.45189687696499686, + "grad_norm": 0.0991954430937767, + "learning_rate": 6.019736689833643e-06, + "loss": 0.0566, + "step": 2156 + }, + { + "epoch": 0.4521064766296374, + "grad_norm": 0.11048531532287598, + "learning_rate": 6.016412960095791e-06, + "loss": 0.057, + "step": 2157 + }, + { + "epoch": 0.45231607629427795, + "grad_norm": 0.09113509207963943, + "learning_rate": 6.013088761791776e-06, + "loss": 0.0562, + "step": 2158 + }, + { + "epoch": 0.45252567595891846, + "grad_norm": 0.07936461269855499, + "learning_rate": 6.0097640964540535e-06, + "loss": 0.0601, + "step": 2159 + }, + { + "epoch": 0.452735275623559, + "grad_norm": 0.10202574729919434, + "learning_rate": 6.006438965615291e-06, + "loss": 0.0573, + "step": 2160 + }, + { + "epoch": 0.45294487528819954, + "grad_norm": 0.10225759446620941, + "learning_rate": 6.003113370808375e-06, + "loss": 0.0587, + "step": 2161 + }, + { + "epoch": 0.45315447495284006, + "grad_norm": 0.09442250430583954, + "learning_rate": 5.999787313566403e-06, + "loss": 0.0609, + "step": 2162 + }, + { + "epoch": 0.45336407461748063, + "grad_norm": 0.11422485113143921, + "learning_rate": 5.996460795422688e-06, + "loss": 0.0597, + "step": 2163 + }, + { + "epoch": 0.45357367428212114, + "grad_norm": 0.08849114924669266, + "learning_rate": 5.993133817910752e-06, + "loss": 0.0596, + "step": 2164 + }, + { + "epoch": 0.4537832739467617, + "grad_norm": 0.1041831523180008, + "learning_rate": 5.9898063825643335e-06, + "loss": 0.0561, + "step": 2165 + }, + { + "epoch": 0.4539928736114022, + "grad_norm": 0.12931424379348755, + "learning_rate": 5.986478490917378e-06, + "loss": 0.0563, + "step": 2166 + }, + { + "epoch": 0.45420247327604274, + "grad_norm": 0.07999792695045471, + "learning_rate": 5.983150144504043e-06, + "loss": 0.0577, + "step": 2167 + }, + { + "epoch": 0.4544120729406833, + "grad_norm": 0.10252411663532257, + "learning_rate": 5.979821344858695e-06, + "loss": 0.0587, + "step": 2168 + }, + { + "epoch": 0.4546216726053238, + "grad_norm": 0.10504443198442459, + "learning_rate": 5.976492093515911e-06, + "loss": 0.0596, + "step": 2169 + }, + { + "epoch": 0.4548312722699644, + "grad_norm": 0.09300100803375244, + "learning_rate": 5.973162392010474e-06, + "loss": 0.0626, + "step": 2170 + }, + { + "epoch": 0.4550408719346049, + "grad_norm": 0.11920040845870972, + "learning_rate": 5.969832241877378e-06, + "loss": 0.0583, + "step": 2171 + }, + { + "epoch": 0.4552504715992454, + "grad_norm": 0.10893451422452927, + "learning_rate": 5.966501644651817e-06, + "loss": 0.055, + "step": 2172 + }, + { + "epoch": 0.455460071263886, + "grad_norm": 0.08888144791126251, + "learning_rate": 5.963170601869203e-06, + "loss": 0.0577, + "step": 2173 + }, + { + "epoch": 0.4556696709285265, + "grad_norm": 0.11214505136013031, + "learning_rate": 5.959839115065138e-06, + "loss": 0.057, + "step": 2174 + }, + { + "epoch": 0.4558792705931671, + "grad_norm": 0.10267043858766556, + "learning_rate": 5.956507185775441e-06, + "loss": 0.0584, + "step": 2175 + }, + { + "epoch": 0.4560888702578076, + "grad_norm": 0.10312189161777496, + "learning_rate": 5.953174815536131e-06, + "loss": 0.0589, + "step": 2176 + }, + { + "epoch": 0.4562984699224481, + "grad_norm": 0.11400537937879562, + "learning_rate": 5.949842005883428e-06, + "loss": 0.0566, + "step": 2177 + }, + { + "epoch": 0.45650806958708867, + "grad_norm": 0.0924360379576683, + "learning_rate": 5.9465087583537594e-06, + "loss": 0.0597, + "step": 2178 + }, + { + "epoch": 0.4567176692517292, + "grad_norm": 0.11675729602575302, + "learning_rate": 5.943175074483749e-06, + "loss": 0.0566, + "step": 2179 + }, + { + "epoch": 0.45692726891636976, + "grad_norm": 0.11411453783512115, + "learning_rate": 5.939840955810223e-06, + "loss": 0.0598, + "step": 2180 + }, + { + "epoch": 0.45713686858101027, + "grad_norm": 0.11106470972299576, + "learning_rate": 5.936506403870215e-06, + "loss": 0.0569, + "step": 2181 + }, + { + "epoch": 0.4573464682456508, + "grad_norm": 0.1371942162513733, + "learning_rate": 5.933171420200946e-06, + "loss": 0.0571, + "step": 2182 + }, + { + "epoch": 0.45755606791029135, + "grad_norm": 0.10696960240602493, + "learning_rate": 5.929836006339848e-06, + "loss": 0.0614, + "step": 2183 + }, + { + "epoch": 0.45776566757493187, + "grad_norm": 0.1150413230061531, + "learning_rate": 5.926500163824546e-06, + "loss": 0.0592, + "step": 2184 + }, + { + "epoch": 0.45797526723957244, + "grad_norm": 0.10095971077680588, + "learning_rate": 5.923163894192857e-06, + "loss": 0.0564, + "step": 2185 + }, + { + "epoch": 0.45818486690421295, + "grad_norm": 0.10621986538171768, + "learning_rate": 5.9198271989828075e-06, + "loss": 0.0544, + "step": 2186 + }, + { + "epoch": 0.45839446656885346, + "grad_norm": 0.12330467998981476, + "learning_rate": 5.91649007973261e-06, + "loss": 0.0565, + "step": 2187 + }, + { + "epoch": 0.45860406623349403, + "grad_norm": 0.10255663841962814, + "learning_rate": 5.913152537980674e-06, + "loss": 0.057, + "step": 2188 + }, + { + "epoch": 0.45881366589813455, + "grad_norm": 0.13345320522785187, + "learning_rate": 5.909814575265609e-06, + "loss": 0.0597, + "step": 2189 + }, + { + "epoch": 0.4590232655627751, + "grad_norm": 0.112071193754673, + "learning_rate": 5.9064761931262135e-06, + "loss": 0.0556, + "step": 2190 + }, + { + "epoch": 0.45923286522741563, + "grad_norm": 0.12068294733762741, + "learning_rate": 5.903137393101482e-06, + "loss": 0.0554, + "step": 2191 + }, + { + "epoch": 0.45944246489205615, + "grad_norm": 0.11650517582893372, + "learning_rate": 5.8997981767306e-06, + "loss": 0.0557, + "step": 2192 + }, + { + "epoch": 0.4596520645566967, + "grad_norm": 0.1526482254266739, + "learning_rate": 5.896458545552946e-06, + "loss": 0.0581, + "step": 2193 + }, + { + "epoch": 0.45986166422133723, + "grad_norm": 0.15084873139858246, + "learning_rate": 5.893118501108087e-06, + "loss": 0.0581, + "step": 2194 + }, + { + "epoch": 0.4600712638859778, + "grad_norm": 0.12740178406238556, + "learning_rate": 5.889778044935785e-06, + "loss": 0.0579, + "step": 2195 + }, + { + "epoch": 0.4602808635506183, + "grad_norm": 0.1175624206662178, + "learning_rate": 5.88643717857599e-06, + "loss": 0.0556, + "step": 2196 + }, + { + "epoch": 0.4604904632152589, + "grad_norm": 0.12416189163923264, + "learning_rate": 5.883095903568838e-06, + "loss": 0.0587, + "step": 2197 + }, + { + "epoch": 0.4607000628798994, + "grad_norm": 0.13820405304431915, + "learning_rate": 5.87975422145466e-06, + "loss": 0.0573, + "step": 2198 + }, + { + "epoch": 0.4609096625445399, + "grad_norm": 0.11369778960943222, + "learning_rate": 5.876412133773968e-06, + "loss": 0.0573, + "step": 2199 + }, + { + "epoch": 0.4611192622091805, + "grad_norm": 0.1153719425201416, + "learning_rate": 5.873069642067464e-06, + "loss": 0.0553, + "step": 2200 + }, + { + "epoch": 0.461328861873821, + "grad_norm": 0.10116839408874512, + "learning_rate": 5.869726747876036e-06, + "loss": 0.0588, + "step": 2201 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 0.1170465499162674, + "learning_rate": 5.866383452740758e-06, + "loss": 0.0555, + "step": 2202 + }, + { + "epoch": 0.4617480612031021, + "grad_norm": 0.14628863334655762, + "learning_rate": 5.863039758202889e-06, + "loss": 0.0561, + "step": 2203 + }, + { + "epoch": 0.4619576608677426, + "grad_norm": 0.15336976945400238, + "learning_rate": 5.85969566580387e-06, + "loss": 0.0593, + "step": 2204 + }, + { + "epoch": 0.46216726053238316, + "grad_norm": 0.14001086354255676, + "learning_rate": 5.856351177085327e-06, + "loss": 0.0572, + "step": 2205 + }, + { + "epoch": 0.4623768601970237, + "grad_norm": 0.09859530627727509, + "learning_rate": 5.853006293589071e-06, + "loss": 0.057, + "step": 2206 + }, + { + "epoch": 0.46258645986166425, + "grad_norm": 0.10227430611848831, + "learning_rate": 5.84966101685709e-06, + "loss": 0.0542, + "step": 2207 + }, + { + "epoch": 0.46279605952630476, + "grad_norm": 0.12590292096138, + "learning_rate": 5.846315348431555e-06, + "loss": 0.0573, + "step": 2208 + }, + { + "epoch": 0.4630056591909453, + "grad_norm": 0.11103103309869766, + "learning_rate": 5.842969289854823e-06, + "loss": 0.0576, + "step": 2209 + }, + { + "epoch": 0.46321525885558584, + "grad_norm": 0.12213583290576935, + "learning_rate": 5.839622842669423e-06, + "loss": 0.0586, + "step": 2210 + }, + { + "epoch": 0.46342485852022636, + "grad_norm": 0.10279441624879837, + "learning_rate": 5.836276008418065e-06, + "loss": 0.0569, + "step": 2211 + }, + { + "epoch": 0.4636344581848669, + "grad_norm": 0.09687735140323639, + "learning_rate": 5.832928788643644e-06, + "loss": 0.0582, + "step": 2212 + }, + { + "epoch": 0.46384405784950744, + "grad_norm": 0.09342148154973984, + "learning_rate": 5.8295811848892215e-06, + "loss": 0.0585, + "step": 2213 + }, + { + "epoch": 0.46405365751414795, + "grad_norm": 0.09567330032587051, + "learning_rate": 5.826233198698047e-06, + "loss": 0.0571, + "step": 2214 + }, + { + "epoch": 0.4642632571787885, + "grad_norm": 0.10038771480321884, + "learning_rate": 5.822884831613538e-06, + "loss": 0.0558, + "step": 2215 + }, + { + "epoch": 0.46447285684342904, + "grad_norm": 0.10112529247999191, + "learning_rate": 5.819536085179293e-06, + "loss": 0.0575, + "step": 2216 + }, + { + "epoch": 0.4646824565080696, + "grad_norm": 0.11409956961870193, + "learning_rate": 5.816186960939084e-06, + "loss": 0.0551, + "step": 2217 + }, + { + "epoch": 0.4648920561727101, + "grad_norm": 0.11404009908437729, + "learning_rate": 5.8128374604368534e-06, + "loss": 0.0572, + "step": 2218 + }, + { + "epoch": 0.46510165583735064, + "grad_norm": 0.1052129939198494, + "learning_rate": 5.809487585216725e-06, + "loss": 0.0569, + "step": 2219 + }, + { + "epoch": 0.4653112555019912, + "grad_norm": 0.1068800687789917, + "learning_rate": 5.806137336822987e-06, + "loss": 0.0568, + "step": 2220 + }, + { + "epoch": 0.4655208551666317, + "grad_norm": 0.1187024712562561, + "learning_rate": 5.802786716800102e-06, + "loss": 0.0567, + "step": 2221 + }, + { + "epoch": 0.4657304548312723, + "grad_norm": 0.11063768714666367, + "learning_rate": 5.79943572669271e-06, + "loss": 0.0565, + "step": 2222 + }, + { + "epoch": 0.4659400544959128, + "grad_norm": 0.11005666106939316, + "learning_rate": 5.796084368045612e-06, + "loss": 0.0578, + "step": 2223 + }, + { + "epoch": 0.4661496541605533, + "grad_norm": 0.09957437217235565, + "learning_rate": 5.7927326424037875e-06, + "loss": 0.0566, + "step": 2224 + }, + { + "epoch": 0.4663592538251939, + "grad_norm": 0.10687166452407837, + "learning_rate": 5.789380551312379e-06, + "loss": 0.0565, + "step": 2225 + }, + { + "epoch": 0.4665688534898344, + "grad_norm": 0.12073647975921631, + "learning_rate": 5.7860280963167e-06, + "loss": 0.0587, + "step": 2226 + }, + { + "epoch": 0.46677845315447497, + "grad_norm": 0.11012930423021317, + "learning_rate": 5.782675278962232e-06, + "loss": 0.056, + "step": 2227 + }, + { + "epoch": 0.4669880528191155, + "grad_norm": 0.10260272771120071, + "learning_rate": 5.7793221007946245e-06, + "loss": 0.0595, + "step": 2228 + }, + { + "epoch": 0.467197652483756, + "grad_norm": 0.07964638620615005, + "learning_rate": 5.775968563359688e-06, + "loss": 0.0563, + "step": 2229 + }, + { + "epoch": 0.46740725214839657, + "grad_norm": 0.09134583175182343, + "learning_rate": 5.7726146682034055e-06, + "loss": 0.0548, + "step": 2230 + }, + { + "epoch": 0.4676168518130371, + "grad_norm": 0.10940870642662048, + "learning_rate": 5.7692604168719225e-06, + "loss": 0.0535, + "step": 2231 + }, + { + "epoch": 0.46782645147767765, + "grad_norm": 0.1325157731771469, + "learning_rate": 5.765905810911546e-06, + "loss": 0.0571, + "step": 2232 + }, + { + "epoch": 0.46803605114231817, + "grad_norm": 0.13585898280143738, + "learning_rate": 5.762550851868751e-06, + "loss": 0.057, + "step": 2233 + }, + { + "epoch": 0.46824565080695874, + "grad_norm": 0.11632729321718216, + "learning_rate": 5.759195541290171e-06, + "loss": 0.0581, + "step": 2234 + }, + { + "epoch": 0.46845525047159925, + "grad_norm": 0.08722345530986786, + "learning_rate": 5.7558398807226045e-06, + "loss": 0.0561, + "step": 2235 + }, + { + "epoch": 0.46866485013623976, + "grad_norm": 0.1064450740814209, + "learning_rate": 5.7524838717130095e-06, + "loss": 0.0549, + "step": 2236 + }, + { + "epoch": 0.46887444980088033, + "grad_norm": 0.12046857923269272, + "learning_rate": 5.749127515808506e-06, + "loss": 0.0576, + "step": 2237 + }, + { + "epoch": 0.46908404946552085, + "grad_norm": 0.11908888071775436, + "learning_rate": 5.745770814556373e-06, + "loss": 0.0575, + "step": 2238 + }, + { + "epoch": 0.4692936491301614, + "grad_norm": 0.10716499388217926, + "learning_rate": 5.7424137695040495e-06, + "loss": 0.0562, + "step": 2239 + }, + { + "epoch": 0.46950324879480193, + "grad_norm": 0.09762990474700928, + "learning_rate": 5.7390563821991326e-06, + "loss": 0.0582, + "step": 2240 + }, + { + "epoch": 0.46971284845944244, + "grad_norm": 0.11566614359617233, + "learning_rate": 5.735698654189377e-06, + "loss": 0.0592, + "step": 2241 + }, + { + "epoch": 0.469922448124083, + "grad_norm": 0.09800199419260025, + "learning_rate": 5.7323405870226955e-06, + "loss": 0.0567, + "step": 2242 + }, + { + "epoch": 0.47013204778872353, + "grad_norm": 0.09443154186010361, + "learning_rate": 5.7289821822471545e-06, + "loss": 0.058, + "step": 2243 + }, + { + "epoch": 0.4703416474533641, + "grad_norm": 0.07052087038755417, + "learning_rate": 5.725623441410979e-06, + "loss": 0.0536, + "step": 2244 + }, + { + "epoch": 0.4705512471180046, + "grad_norm": 0.10066118091344833, + "learning_rate": 5.722264366062549e-06, + "loss": 0.0568, + "step": 2245 + }, + { + "epoch": 0.4707608467826451, + "grad_norm": 0.10691467672586441, + "learning_rate": 5.718904957750394e-06, + "loss": 0.0564, + "step": 2246 + }, + { + "epoch": 0.4709704464472857, + "grad_norm": 0.10959997028112411, + "learning_rate": 5.715545218023205e-06, + "loss": 0.0544, + "step": 2247 + }, + { + "epoch": 0.4711800461119262, + "grad_norm": 0.09174855053424835, + "learning_rate": 5.7121851484298184e-06, + "loss": 0.0534, + "step": 2248 + }, + { + "epoch": 0.4713896457765668, + "grad_norm": 0.10001726448535919, + "learning_rate": 5.708824750519225e-06, + "loss": 0.0574, + "step": 2249 + }, + { + "epoch": 0.4715992454412073, + "grad_norm": 0.10282893478870392, + "learning_rate": 5.705464025840571e-06, + "loss": 0.0535, + "step": 2250 + }, + { + "epoch": 0.4718088451058478, + "grad_norm": 0.0975334644317627, + "learning_rate": 5.702102975943147e-06, + "loss": 0.0609, + "step": 2251 + }, + { + "epoch": 0.4720184447704884, + "grad_norm": 0.08588805049657822, + "learning_rate": 5.698741602376395e-06, + "loss": 0.0538, + "step": 2252 + }, + { + "epoch": 0.4722280444351289, + "grad_norm": 0.08347409963607788, + "learning_rate": 5.695379906689912e-06, + "loss": 0.0594, + "step": 2253 + }, + { + "epoch": 0.47243764409976946, + "grad_norm": 0.0910545289516449, + "learning_rate": 5.6920178904334346e-06, + "loss": 0.0592, + "step": 2254 + }, + { + "epoch": 0.47264724376441, + "grad_norm": 0.08542585372924805, + "learning_rate": 5.688655555156854e-06, + "loss": 0.058, + "step": 2255 + }, + { + "epoch": 0.4728568434290505, + "grad_norm": 0.09314323216676712, + "learning_rate": 5.6852929024102065e-06, + "loss": 0.0546, + "step": 2256 + }, + { + "epoch": 0.47306644309369106, + "grad_norm": 0.09594455361366272, + "learning_rate": 5.681929933743672e-06, + "loss": 0.0546, + "step": 2257 + }, + { + "epoch": 0.47327604275833157, + "grad_norm": 0.09240654855966568, + "learning_rate": 5.67856665070758e-06, + "loss": 0.0568, + "step": 2258 + }, + { + "epoch": 0.47348564242297214, + "grad_norm": 0.08861672133207321, + "learning_rate": 5.675203054852403e-06, + "loss": 0.0584, + "step": 2259 + }, + { + "epoch": 0.47369524208761266, + "grad_norm": 0.08311017602682114, + "learning_rate": 5.671839147728758e-06, + "loss": 0.0548, + "step": 2260 + }, + { + "epoch": 0.47390484175225317, + "grad_norm": 0.08416654914617538, + "learning_rate": 5.668474930887406e-06, + "loss": 0.0569, + "step": 2261 + }, + { + "epoch": 0.47411444141689374, + "grad_norm": 0.07767292857170105, + "learning_rate": 5.6651104058792496e-06, + "loss": 0.0558, + "step": 2262 + }, + { + "epoch": 0.47432404108153425, + "grad_norm": 0.09354811161756516, + "learning_rate": 5.661745574255334e-06, + "loss": 0.0562, + "step": 2263 + }, + { + "epoch": 0.4745336407461748, + "grad_norm": 0.1013355404138565, + "learning_rate": 5.658380437566846e-06, + "loss": 0.0563, + "step": 2264 + }, + { + "epoch": 0.47474324041081534, + "grad_norm": 0.08834841102361679, + "learning_rate": 5.655014997365114e-06, + "loss": 0.0581, + "step": 2265 + }, + { + "epoch": 0.4749528400754559, + "grad_norm": 0.07844306528568268, + "learning_rate": 5.651649255201603e-06, + "loss": 0.0559, + "step": 2266 + }, + { + "epoch": 0.4751624397400964, + "grad_norm": 0.08669186383485794, + "learning_rate": 5.648283212627921e-06, + "loss": 0.0571, + "step": 2267 + }, + { + "epoch": 0.47537203940473693, + "grad_norm": 0.08236299455165863, + "learning_rate": 5.6449168711958135e-06, + "loss": 0.057, + "step": 2268 + }, + { + "epoch": 0.4755816390693775, + "grad_norm": 0.0980667769908905, + "learning_rate": 5.641550232457162e-06, + "loss": 0.0546, + "step": 2269 + }, + { + "epoch": 0.475791238734018, + "grad_norm": 0.11645791679620743, + "learning_rate": 5.638183297963986e-06, + "loss": 0.0549, + "step": 2270 + }, + { + "epoch": 0.4760008383986586, + "grad_norm": 0.12081831693649292, + "learning_rate": 5.634816069268442e-06, + "loss": 0.0582, + "step": 2271 + }, + { + "epoch": 0.4762104380632991, + "grad_norm": 0.1103171780705452, + "learning_rate": 5.631448547922822e-06, + "loss": 0.0567, + "step": 2272 + }, + { + "epoch": 0.4764200377279396, + "grad_norm": 0.1022624745965004, + "learning_rate": 5.628080735479553e-06, + "loss": 0.0577, + "step": 2273 + }, + { + "epoch": 0.4766296373925802, + "grad_norm": 0.0808848962187767, + "learning_rate": 5.624712633491196e-06, + "loss": 0.0593, + "step": 2274 + }, + { + "epoch": 0.4768392370572207, + "grad_norm": 0.07358387112617493, + "learning_rate": 5.621344243510444e-06, + "loss": 0.0568, + "step": 2275 + }, + { + "epoch": 0.47704883672186127, + "grad_norm": 0.07857120782136917, + "learning_rate": 5.6179755670901245e-06, + "loss": 0.0569, + "step": 2276 + }, + { + "epoch": 0.4772584363865018, + "grad_norm": 0.06889036297798157, + "learning_rate": 5.614606605783197e-06, + "loss": 0.0586, + "step": 2277 + }, + { + "epoch": 0.4774680360511423, + "grad_norm": 0.07760413736104965, + "learning_rate": 5.611237361142753e-06, + "loss": 0.0551, + "step": 2278 + }, + { + "epoch": 0.47767763571578287, + "grad_norm": 0.08692996203899384, + "learning_rate": 5.607867834722012e-06, + "loss": 0.0545, + "step": 2279 + }, + { + "epoch": 0.4778872353804234, + "grad_norm": 0.08127982169389725, + "learning_rate": 5.604498028074323e-06, + "loss": 0.0571, + "step": 2280 + }, + { + "epoch": 0.47809683504506395, + "grad_norm": 0.07126957178115845, + "learning_rate": 5.601127942753173e-06, + "loss": 0.0577, + "step": 2281 + }, + { + "epoch": 0.47830643470970446, + "grad_norm": 0.08008185029029846, + "learning_rate": 5.597757580312163e-06, + "loss": 0.0542, + "step": 2282 + }, + { + "epoch": 0.478516034374345, + "grad_norm": 0.07614471018314362, + "learning_rate": 5.594386942305035e-06, + "loss": 0.0554, + "step": 2283 + }, + { + "epoch": 0.47872563403898555, + "grad_norm": 0.06831208616495132, + "learning_rate": 5.5910160302856486e-06, + "loss": 0.0555, + "step": 2284 + }, + { + "epoch": 0.47893523370362606, + "grad_norm": 0.07799834758043289, + "learning_rate": 5.587644845807994e-06, + "loss": 0.0569, + "step": 2285 + }, + { + "epoch": 0.47914483336826663, + "grad_norm": 0.08827179670333862, + "learning_rate": 5.584273390426189e-06, + "loss": 0.0555, + "step": 2286 + }, + { + "epoch": 0.47935443303290715, + "grad_norm": 0.07891764491796494, + "learning_rate": 5.580901665694471e-06, + "loss": 0.0582, + "step": 2287 + }, + { + "epoch": 0.47956403269754766, + "grad_norm": 0.06794466078281403, + "learning_rate": 5.577529673167208e-06, + "loss": 0.0563, + "step": 2288 + }, + { + "epoch": 0.47977363236218823, + "grad_norm": 0.06872694939374924, + "learning_rate": 5.574157414398885e-06, + "loss": 0.0551, + "step": 2289 + }, + { + "epoch": 0.47998323202682874, + "grad_norm": 0.06112726777791977, + "learning_rate": 5.570784890944112e-06, + "loss": 0.0571, + "step": 2290 + }, + { + "epoch": 0.4801928316914693, + "grad_norm": 0.07286538183689117, + "learning_rate": 5.567412104357623e-06, + "loss": 0.057, + "step": 2291 + }, + { + "epoch": 0.4804024313561098, + "grad_norm": 0.06900748610496521, + "learning_rate": 5.564039056194274e-06, + "loss": 0.0567, + "step": 2292 + }, + { + "epoch": 0.48061203102075034, + "grad_norm": 0.07319528609514236, + "learning_rate": 5.560665748009034e-06, + "loss": 0.0562, + "step": 2293 + }, + { + "epoch": 0.4808216306853909, + "grad_norm": 0.07866884768009186, + "learning_rate": 5.557292181357003e-06, + "loss": 0.0561, + "step": 2294 + }, + { + "epoch": 0.4810312303500314, + "grad_norm": 0.07217524945735931, + "learning_rate": 5.553918357793391e-06, + "loss": 0.0565, + "step": 2295 + }, + { + "epoch": 0.481240830014672, + "grad_norm": 0.0580948144197464, + "learning_rate": 5.550544278873531e-06, + "loss": 0.0583, + "step": 2296 + }, + { + "epoch": 0.4814504296793125, + "grad_norm": 0.0548657588660717, + "learning_rate": 5.547169946152874e-06, + "loss": 0.0574, + "step": 2297 + }, + { + "epoch": 0.481660029343953, + "grad_norm": 0.06751734763383865, + "learning_rate": 5.543795361186984e-06, + "loss": 0.0567, + "step": 2298 + }, + { + "epoch": 0.4818696290085936, + "grad_norm": 0.0683041512966156, + "learning_rate": 5.540420525531547e-06, + "loss": 0.0561, + "step": 2299 + }, + { + "epoch": 0.4820792286732341, + "grad_norm": 0.0687454417347908, + "learning_rate": 5.537045440742359e-06, + "loss": 0.0569, + "step": 2300 + }, + { + "epoch": 0.4822888283378747, + "grad_norm": 0.060074321925640106, + "learning_rate": 5.533670108375334e-06, + "loss": 0.0553, + "step": 2301 + }, + { + "epoch": 0.4824984280025152, + "grad_norm": 0.05447131395339966, + "learning_rate": 5.5302945299865005e-06, + "loss": 0.057, + "step": 2302 + }, + { + "epoch": 0.48270802766715576, + "grad_norm": 0.051248107105493546, + "learning_rate": 5.526918707132e-06, + "loss": 0.0561, + "step": 2303 + }, + { + "epoch": 0.4829176273317963, + "grad_norm": 0.0613834448158741, + "learning_rate": 5.523542641368083e-06, + "loss": 0.0582, + "step": 2304 + }, + { + "epoch": 0.4831272269964368, + "grad_norm": 0.054669518023729324, + "learning_rate": 5.520166334251118e-06, + "loss": 0.056, + "step": 2305 + }, + { + "epoch": 0.48333682666107736, + "grad_norm": 0.058429256081581116, + "learning_rate": 5.5167897873375805e-06, + "loss": 0.0573, + "step": 2306 + }, + { + "epoch": 0.48354642632571787, + "grad_norm": 0.06477133929729462, + "learning_rate": 5.513413002184059e-06, + "loss": 0.0557, + "step": 2307 + }, + { + "epoch": 0.48375602599035844, + "grad_norm": 0.06107893958687782, + "learning_rate": 5.510035980347249e-06, + "loss": 0.0576, + "step": 2308 + }, + { + "epoch": 0.48396562565499895, + "grad_norm": 0.06686937808990479, + "learning_rate": 5.50665872338396e-06, + "loss": 0.0586, + "step": 2309 + }, + { + "epoch": 0.48417522531963947, + "grad_norm": 0.0777740553021431, + "learning_rate": 5.503281232851102e-06, + "loss": 0.0551, + "step": 2310 + }, + { + "epoch": 0.48438482498428004, + "grad_norm": 0.08693230897188187, + "learning_rate": 5.499903510305703e-06, + "loss": 0.0538, + "step": 2311 + }, + { + "epoch": 0.48459442464892055, + "grad_norm": 0.09828737378120422, + "learning_rate": 5.496525557304888e-06, + "loss": 0.0546, + "step": 2312 + }, + { + "epoch": 0.4848040243135611, + "grad_norm": 0.11315979063510895, + "learning_rate": 5.4931473754058935e-06, + "loss": 0.0586, + "step": 2313 + }, + { + "epoch": 0.48501362397820164, + "grad_norm": 0.1083892285823822, + "learning_rate": 5.489768966166064e-06, + "loss": 0.0606, + "step": 2314 + }, + { + "epoch": 0.48522322364284215, + "grad_norm": 0.08665548264980316, + "learning_rate": 5.486390331142841e-06, + "loss": 0.0561, + "step": 2315 + }, + { + "epoch": 0.4854328233074827, + "grad_norm": 0.06847358494997025, + "learning_rate": 5.483011471893775e-06, + "loss": 0.0552, + "step": 2316 + }, + { + "epoch": 0.48564242297212323, + "grad_norm": 0.08172042667865753, + "learning_rate": 5.479632389976524e-06, + "loss": 0.0575, + "step": 2317 + }, + { + "epoch": 0.4858520226367638, + "grad_norm": 0.120709627866745, + "learning_rate": 5.4762530869488385e-06, + "loss": 0.0572, + "step": 2318 + }, + { + "epoch": 0.4860616223014043, + "grad_norm": 0.1355757862329483, + "learning_rate": 5.4728735643685804e-06, + "loss": 0.0587, + "step": 2319 + }, + { + "epoch": 0.48627122196604483, + "grad_norm": 0.12048780918121338, + "learning_rate": 5.469493823793706e-06, + "loss": 0.0591, + "step": 2320 + }, + { + "epoch": 0.4864808216306854, + "grad_norm": 0.07901732623577118, + "learning_rate": 5.466113866782277e-06, + "loss": 0.0552, + "step": 2321 + }, + { + "epoch": 0.4866904212953259, + "grad_norm": 0.0863695964217186, + "learning_rate": 5.462733694892452e-06, + "loss": 0.0571, + "step": 2322 + }, + { + "epoch": 0.4869000209599665, + "grad_norm": 0.13559545576572418, + "learning_rate": 5.4593533096824906e-06, + "loss": 0.0566, + "step": 2323 + }, + { + "epoch": 0.487109620624607, + "grad_norm": 0.1430426388978958, + "learning_rate": 5.455972712710748e-06, + "loss": 0.058, + "step": 2324 + }, + { + "epoch": 0.4873192202892475, + "grad_norm": 0.09979075938463211, + "learning_rate": 5.4525919055356804e-06, + "loss": 0.0557, + "step": 2325 + }, + { + "epoch": 0.4875288199538881, + "grad_norm": 0.07715443521738052, + "learning_rate": 5.449210889715837e-06, + "loss": 0.0599, + "step": 2326 + }, + { + "epoch": 0.4877384196185286, + "grad_norm": 0.09540057182312012, + "learning_rate": 5.445829666809866e-06, + "loss": 0.0527, + "step": 2327 + }, + { + "epoch": 0.48794801928316917, + "grad_norm": 0.10082437098026276, + "learning_rate": 5.442448238376513e-06, + "loss": 0.058, + "step": 2328 + }, + { + "epoch": 0.4881576189478097, + "grad_norm": 0.09437216818332672, + "learning_rate": 5.439066605974615e-06, + "loss": 0.0533, + "step": 2329 + }, + { + "epoch": 0.4883672186124502, + "grad_norm": 0.10088086873292923, + "learning_rate": 5.4356847711631015e-06, + "loss": 0.0566, + "step": 2330 + }, + { + "epoch": 0.48857681827709076, + "grad_norm": 0.09661588817834854, + "learning_rate": 5.432302735500998e-06, + "loss": 0.0525, + "step": 2331 + }, + { + "epoch": 0.4887864179417313, + "grad_norm": 0.08586642891168594, + "learning_rate": 5.428920500547425e-06, + "loss": 0.0568, + "step": 2332 + }, + { + "epoch": 0.48899601760637185, + "grad_norm": 0.09433707594871521, + "learning_rate": 5.4255380678615885e-06, + "loss": 0.0546, + "step": 2333 + }, + { + "epoch": 0.48920561727101236, + "grad_norm": 0.1241917684674263, + "learning_rate": 5.422155439002793e-06, + "loss": 0.0577, + "step": 2334 + }, + { + "epoch": 0.48941521693565293, + "grad_norm": 0.11522339284420013, + "learning_rate": 5.418772615530426e-06, + "loss": 0.0575, + "step": 2335 + }, + { + "epoch": 0.48962481660029344, + "grad_norm": 0.09254017472267151, + "learning_rate": 5.415389599003972e-06, + "loss": 0.0549, + "step": 2336 + }, + { + "epoch": 0.48983441626493396, + "grad_norm": 0.1002906784415245, + "learning_rate": 5.412006390982999e-06, + "loss": 0.0554, + "step": 2337 + }, + { + "epoch": 0.49004401592957453, + "grad_norm": 0.09268494695425034, + "learning_rate": 5.4086229930271636e-06, + "loss": 0.059, + "step": 2338 + }, + { + "epoch": 0.49025361559421504, + "grad_norm": 0.0953909158706665, + "learning_rate": 5.405239406696216e-06, + "loss": 0.054, + "step": 2339 + }, + { + "epoch": 0.4904632152588556, + "grad_norm": 0.10069115459918976, + "learning_rate": 5.401855633549984e-06, + "loss": 0.0563, + "step": 2340 + }, + { + "epoch": 0.4906728149234961, + "grad_norm": 0.09953933954238892, + "learning_rate": 5.398471675148389e-06, + "loss": 0.0573, + "step": 2341 + }, + { + "epoch": 0.49088241458813664, + "grad_norm": 0.11302805691957474, + "learning_rate": 5.395087533051434e-06, + "loss": 0.0545, + "step": 2342 + }, + { + "epoch": 0.4910920142527772, + "grad_norm": 0.10684003680944443, + "learning_rate": 5.391703208819209e-06, + "loss": 0.0559, + "step": 2343 + }, + { + "epoch": 0.4913016139174177, + "grad_norm": 0.09628332406282425, + "learning_rate": 5.388318704011885e-06, + "loss": 0.0545, + "step": 2344 + }, + { + "epoch": 0.4915112135820583, + "grad_norm": 0.10096313059329987, + "learning_rate": 5.38493402018972e-06, + "loss": 0.059, + "step": 2345 + }, + { + "epoch": 0.4917208132466988, + "grad_norm": 0.08674436807632446, + "learning_rate": 5.3815491589130506e-06, + "loss": 0.0603, + "step": 2346 + }, + { + "epoch": 0.4919304129113393, + "grad_norm": 0.0984383150935173, + "learning_rate": 5.378164121742301e-06, + "loss": 0.0552, + "step": 2347 + }, + { + "epoch": 0.4921400125759799, + "grad_norm": 0.10171261429786682, + "learning_rate": 5.374778910237968e-06, + "loss": 0.0556, + "step": 2348 + }, + { + "epoch": 0.4923496122406204, + "grad_norm": 0.09578763693571091, + "learning_rate": 5.3713935259606345e-06, + "loss": 0.0573, + "step": 2349 + }, + { + "epoch": 0.492559211905261, + "grad_norm": 0.09438461065292358, + "learning_rate": 5.368007970470964e-06, + "loss": 0.0582, + "step": 2350 + }, + { + "epoch": 0.4927688115699015, + "grad_norm": 0.08926350623369217, + "learning_rate": 5.3646222453296936e-06, + "loss": 0.0568, + "step": 2351 + }, + { + "epoch": 0.492978411234542, + "grad_norm": 0.09703266620635986, + "learning_rate": 5.361236352097646e-06, + "loss": 0.057, + "step": 2352 + }, + { + "epoch": 0.49318801089918257, + "grad_norm": 0.0997847244143486, + "learning_rate": 5.357850292335715e-06, + "loss": 0.0538, + "step": 2353 + }, + { + "epoch": 0.4933976105638231, + "grad_norm": 0.09428234398365021, + "learning_rate": 5.354464067604872e-06, + "loss": 0.0584, + "step": 2354 + }, + { + "epoch": 0.49360721022846366, + "grad_norm": 0.10148053616285324, + "learning_rate": 5.35107767946617e-06, + "loss": 0.0593, + "step": 2355 + }, + { + "epoch": 0.49381680989310417, + "grad_norm": 0.11165421456098557, + "learning_rate": 5.3476911294807284e-06, + "loss": 0.057, + "step": 2356 + }, + { + "epoch": 0.4940264095577447, + "grad_norm": 0.0888749286532402, + "learning_rate": 5.344304419209748e-06, + "loss": 0.0558, + "step": 2357 + }, + { + "epoch": 0.49423600922238525, + "grad_norm": 0.10030533373355865, + "learning_rate": 5.340917550214504e-06, + "loss": 0.0587, + "step": 2358 + }, + { + "epoch": 0.49444560888702577, + "grad_norm": 0.10392221808433533, + "learning_rate": 5.337530524056338e-06, + "loss": 0.0557, + "step": 2359 + }, + { + "epoch": 0.49465520855166634, + "grad_norm": 0.09198975563049316, + "learning_rate": 5.334143342296672e-06, + "loss": 0.0559, + "step": 2360 + }, + { + "epoch": 0.49486480821630685, + "grad_norm": 0.0988771989941597, + "learning_rate": 5.3307560064969955e-06, + "loss": 0.0561, + "step": 2361 + }, + { + "epoch": 0.49507440788094736, + "grad_norm": 0.09191461652517319, + "learning_rate": 5.327368518218866e-06, + "loss": 0.0575, + "step": 2362 + }, + { + "epoch": 0.49528400754558793, + "grad_norm": 0.08047354221343994, + "learning_rate": 5.323980879023919e-06, + "loss": 0.0582, + "step": 2363 + }, + { + "epoch": 0.49549360721022845, + "grad_norm": 0.097519151866436, + "learning_rate": 5.3205930904738544e-06, + "loss": 0.0563, + "step": 2364 + }, + { + "epoch": 0.495703206874869, + "grad_norm": 0.09642237424850464, + "learning_rate": 5.317205154130442e-06, + "loss": 0.0562, + "step": 2365 + }, + { + "epoch": 0.49591280653950953, + "grad_norm": 0.09264837205410004, + "learning_rate": 5.3138170715555186e-06, + "loss": 0.0574, + "step": 2366 + }, + { + "epoch": 0.49612240620415005, + "grad_norm": 0.10126832872629166, + "learning_rate": 5.31042884431099e-06, + "loss": 0.0553, + "step": 2367 + }, + { + "epoch": 0.4963320058687906, + "grad_norm": 0.09630407392978668, + "learning_rate": 5.3070404739588285e-06, + "loss": 0.0527, + "step": 2368 + }, + { + "epoch": 0.49654160553343113, + "grad_norm": 0.0988563820719719, + "learning_rate": 5.303651962061074e-06, + "loss": 0.0599, + "step": 2369 + }, + { + "epoch": 0.4967512051980717, + "grad_norm": 0.09341637045145035, + "learning_rate": 5.300263310179826e-06, + "loss": 0.0563, + "step": 2370 + }, + { + "epoch": 0.4969608048627122, + "grad_norm": 0.08345013111829758, + "learning_rate": 5.296874519877256e-06, + "loss": 0.0572, + "step": 2371 + }, + { + "epoch": 0.4971704045273528, + "grad_norm": 0.08412396162748337, + "learning_rate": 5.293485592715593e-06, + "loss": 0.0528, + "step": 2372 + }, + { + "epoch": 0.4973800041919933, + "grad_norm": 0.09522940963506699, + "learning_rate": 5.290096530257134e-06, + "loss": 0.0576, + "step": 2373 + }, + { + "epoch": 0.4975896038566338, + "grad_norm": 0.0857052206993103, + "learning_rate": 5.286707334064234e-06, + "loss": 0.0547, + "step": 2374 + }, + { + "epoch": 0.4977992035212744, + "grad_norm": 0.08157432824373245, + "learning_rate": 5.283318005699313e-06, + "loss": 0.0595, + "step": 2375 + }, + { + "epoch": 0.4980088031859149, + "grad_norm": 0.08909313380718231, + "learning_rate": 5.279928546724849e-06, + "loss": 0.0555, + "step": 2376 + }, + { + "epoch": 0.49821840285055546, + "grad_norm": 0.0913192629814148, + "learning_rate": 5.276538958703384e-06, + "loss": 0.0553, + "step": 2377 + }, + { + "epoch": 0.498428002515196, + "grad_norm": 0.09321916103363037, + "learning_rate": 5.273149243197517e-06, + "loss": 0.0555, + "step": 2378 + }, + { + "epoch": 0.4986376021798365, + "grad_norm": 0.09494626522064209, + "learning_rate": 5.269759401769904e-06, + "loss": 0.0588, + "step": 2379 + }, + { + "epoch": 0.49884720184447706, + "grad_norm": 0.08014019578695297, + "learning_rate": 5.266369435983264e-06, + "loss": 0.0564, + "step": 2380 + }, + { + "epoch": 0.4990568015091176, + "grad_norm": 0.08060725778341293, + "learning_rate": 5.26297934740037e-06, + "loss": 0.0594, + "step": 2381 + }, + { + "epoch": 0.49926640117375815, + "grad_norm": 0.09616056829690933, + "learning_rate": 5.259589137584049e-06, + "loss": 0.0564, + "step": 2382 + }, + { + "epoch": 0.49947600083839866, + "grad_norm": 0.10162606090307236, + "learning_rate": 5.256198808097192e-06, + "loss": 0.0553, + "step": 2383 + }, + { + "epoch": 0.4996856005030392, + "grad_norm": 0.09933992475271225, + "learning_rate": 5.252808360502737e-06, + "loss": 0.0562, + "step": 2384 + }, + { + "epoch": 0.49989520016767974, + "grad_norm": 0.106983482837677, + "learning_rate": 5.2494177963636785e-06, + "loss": 0.0579, + "step": 2385 + }, + { + "epoch": 0.5001047998323203, + "grad_norm": 0.10930163413286209, + "learning_rate": 5.246027117243071e-06, + "loss": 0.0578, + "step": 2386 + }, + { + "epoch": 0.5003143994969608, + "grad_norm": 0.10855498164892197, + "learning_rate": 5.24263632470401e-06, + "loss": 0.0547, + "step": 2387 + }, + { + "epoch": 0.5005239991616014, + "grad_norm": 0.09061628580093384, + "learning_rate": 5.239245420309658e-06, + "loss": 0.0546, + "step": 2388 + }, + { + "epoch": 0.5007335988262419, + "grad_norm": 0.07383181154727936, + "learning_rate": 5.2358544056232145e-06, + "loss": 0.0568, + "step": 2389 + }, + { + "epoch": 0.5009431984908824, + "grad_norm": 0.07915870100259781, + "learning_rate": 5.232463282207937e-06, + "loss": 0.058, + "step": 2390 + }, + { + "epoch": 0.5011527981555229, + "grad_norm": 0.07858150452375412, + "learning_rate": 5.229072051627137e-06, + "loss": 0.0581, + "step": 2391 + }, + { + "epoch": 0.5013623978201635, + "grad_norm": 0.07529259473085403, + "learning_rate": 5.225680715444168e-06, + "loss": 0.0572, + "step": 2392 + }, + { + "epoch": 0.5015719974848041, + "grad_norm": 0.07334654778242111, + "learning_rate": 5.222289275222436e-06, + "loss": 0.0565, + "step": 2393 + }, + { + "epoch": 0.5017815971494446, + "grad_norm": 0.07713853567838669, + "learning_rate": 5.218897732525397e-06, + "loss": 0.0555, + "step": 2394 + }, + { + "epoch": 0.5019911968140851, + "grad_norm": 0.0780978575348854, + "learning_rate": 5.215506088916545e-06, + "loss": 0.0557, + "step": 2395 + }, + { + "epoch": 0.5022007964787256, + "grad_norm": 0.06955696642398834, + "learning_rate": 5.212114345959433e-06, + "loss": 0.0569, + "step": 2396 + }, + { + "epoch": 0.5024103961433661, + "grad_norm": 0.07240567356348038, + "learning_rate": 5.2087225052176515e-06, + "loss": 0.0579, + "step": 2397 + }, + { + "epoch": 0.5026199958080068, + "grad_norm": 0.08363047242164612, + "learning_rate": 5.205330568254837e-06, + "loss": 0.0534, + "step": 2398 + }, + { + "epoch": 0.5028295954726473, + "grad_norm": 0.08383505046367645, + "learning_rate": 5.201938536634674e-06, + "loss": 0.0569, + "step": 2399 + }, + { + "epoch": 0.5030391951372878, + "grad_norm": 0.07523591816425323, + "learning_rate": 5.198546411920888e-06, + "loss": 0.0564, + "step": 2400 + }, + { + "epoch": 0.5032487948019283, + "grad_norm": 0.08412330597639084, + "learning_rate": 5.195154195677247e-06, + "loss": 0.0541, + "step": 2401 + }, + { + "epoch": 0.5034583944665688, + "grad_norm": 0.08403821289539337, + "learning_rate": 5.1917618894675615e-06, + "loss": 0.0546, + "step": 2402 + }, + { + "epoch": 0.5036679941312094, + "grad_norm": 0.07715484499931335, + "learning_rate": 5.188369494855686e-06, + "loss": 0.0564, + "step": 2403 + }, + { + "epoch": 0.50387759379585, + "grad_norm": 0.07596497237682343, + "learning_rate": 5.184977013405512e-06, + "loss": 0.0592, + "step": 2404 + }, + { + "epoch": 0.5040871934604905, + "grad_norm": 0.0813627615571022, + "learning_rate": 5.181584446680974e-06, + "loss": 0.0554, + "step": 2405 + }, + { + "epoch": 0.504296793125131, + "grad_norm": 0.10759492963552475, + "learning_rate": 5.178191796246043e-06, + "loss": 0.0575, + "step": 2406 + }, + { + "epoch": 0.5045063927897715, + "grad_norm": 0.12914326786994934, + "learning_rate": 5.174799063664731e-06, + "loss": 0.0546, + "step": 2407 + }, + { + "epoch": 0.5047159924544121, + "grad_norm": 0.12072386592626572, + "learning_rate": 5.171406250501087e-06, + "loss": 0.0569, + "step": 2408 + }, + { + "epoch": 0.5049255921190526, + "grad_norm": 0.10323359817266464, + "learning_rate": 5.1680133583191975e-06, + "loss": 0.0564, + "step": 2409 + }, + { + "epoch": 0.5051351917836931, + "grad_norm": 0.10388664901256561, + "learning_rate": 5.164620388683183e-06, + "loss": 0.0547, + "step": 2410 + }, + { + "epoch": 0.5053447914483337, + "grad_norm": 0.09523274004459381, + "learning_rate": 5.1612273431572055e-06, + "loss": 0.0558, + "step": 2411 + }, + { + "epoch": 0.5055543911129742, + "grad_norm": 0.08987044543027878, + "learning_rate": 5.1578342233054555e-06, + "loss": 0.0577, + "step": 2412 + }, + { + "epoch": 0.5057639907776148, + "grad_norm": 0.08983159810304642, + "learning_rate": 5.154441030692162e-06, + "loss": 0.0571, + "step": 2413 + }, + { + "epoch": 0.5059735904422553, + "grad_norm": 0.07881300896406174, + "learning_rate": 5.151047766881585e-06, + "loss": 0.0565, + "step": 2414 + }, + { + "epoch": 0.5061831901068958, + "grad_norm": 0.07373305410146713, + "learning_rate": 5.147654433438018e-06, + "loss": 0.0545, + "step": 2415 + }, + { + "epoch": 0.5063927897715363, + "grad_norm": 0.07738931477069855, + "learning_rate": 5.144261031925789e-06, + "loss": 0.0551, + "step": 2416 + }, + { + "epoch": 0.5066023894361769, + "grad_norm": 0.07858271896839142, + "learning_rate": 5.140867563909254e-06, + "loss": 0.0541, + "step": 2417 + }, + { + "epoch": 0.5068119891008175, + "grad_norm": 0.08694379776716232, + "learning_rate": 5.137474030952801e-06, + "loss": 0.0558, + "step": 2418 + }, + { + "epoch": 0.507021588765458, + "grad_norm": 0.09215452522039413, + "learning_rate": 5.134080434620849e-06, + "loss": 0.0572, + "step": 2419 + }, + { + "epoch": 0.5072311884300985, + "grad_norm": 0.08612719178199768, + "learning_rate": 5.1306867764778445e-06, + "loss": 0.054, + "step": 2420 + }, + { + "epoch": 0.507440788094739, + "grad_norm": 0.0733933225274086, + "learning_rate": 5.127293058088261e-06, + "loss": 0.0533, + "step": 2421 + }, + { + "epoch": 0.5076503877593795, + "grad_norm": 0.06744138896465302, + "learning_rate": 5.1238992810166065e-06, + "loss": 0.0542, + "step": 2422 + }, + { + "epoch": 0.5078599874240202, + "grad_norm": 0.062108736485242844, + "learning_rate": 5.120505446827409e-06, + "loss": 0.0584, + "step": 2423 + }, + { + "epoch": 0.5080695870886607, + "grad_norm": 0.06326153874397278, + "learning_rate": 5.117111557085225e-06, + "loss": 0.0542, + "step": 2424 + }, + { + "epoch": 0.5082791867533012, + "grad_norm": 0.07352180778980255, + "learning_rate": 5.1137176133546376e-06, + "loss": 0.0584, + "step": 2425 + }, + { + "epoch": 0.5084887864179417, + "grad_norm": 0.07469355314970016, + "learning_rate": 5.1103236172002534e-06, + "loss": 0.0574, + "step": 2426 + }, + { + "epoch": 0.5086983860825822, + "grad_norm": 0.08976007997989655, + "learning_rate": 5.106929570186706e-06, + "loss": 0.057, + "step": 2427 + }, + { + "epoch": 0.5089079857472228, + "grad_norm": 0.09722107648849487, + "learning_rate": 5.1035354738786465e-06, + "loss": 0.0571, + "step": 2428 + }, + { + "epoch": 0.5091175854118634, + "grad_norm": 0.07933610677719116, + "learning_rate": 5.100141329840757e-06, + "loss": 0.0566, + "step": 2429 + }, + { + "epoch": 0.5093271850765039, + "grad_norm": 0.07435188442468643, + "learning_rate": 5.096747139637737e-06, + "loss": 0.0562, + "step": 2430 + }, + { + "epoch": 0.5095367847411444, + "grad_norm": 0.06793338805437088, + "learning_rate": 5.0933529048343025e-06, + "loss": 0.0573, + "step": 2431 + }, + { + "epoch": 0.5097463844057849, + "grad_norm": 0.07182233035564423, + "learning_rate": 5.089958626995199e-06, + "loss": 0.055, + "step": 2432 + }, + { + "epoch": 0.5099559840704255, + "grad_norm": 0.09133189171552658, + "learning_rate": 5.086564307685188e-06, + "loss": 0.0541, + "step": 2433 + }, + { + "epoch": 0.510165583735066, + "grad_norm": 0.08642099797725677, + "learning_rate": 5.083169948469049e-06, + "loss": 0.0556, + "step": 2434 + }, + { + "epoch": 0.5103751833997066, + "grad_norm": 0.08702914416790009, + "learning_rate": 5.07977555091158e-06, + "loss": 0.0571, + "step": 2435 + }, + { + "epoch": 0.5105847830643471, + "grad_norm": 0.08467281609773636, + "learning_rate": 5.0763811165776e-06, + "loss": 0.0562, + "step": 2436 + }, + { + "epoch": 0.5107943827289876, + "grad_norm": 0.06674730032682419, + "learning_rate": 5.07298664703194e-06, + "loss": 0.0522, + "step": 2437 + }, + { + "epoch": 0.5110039823936282, + "grad_norm": 0.07278907299041748, + "learning_rate": 5.069592143839452e-06, + "loss": 0.0542, + "step": 2438 + }, + { + "epoch": 0.5112135820582687, + "grad_norm": 0.0811818316578865, + "learning_rate": 5.066197608564999e-06, + "loss": 0.0573, + "step": 2439 + }, + { + "epoch": 0.5114231817229092, + "grad_norm": 0.07044421881437302, + "learning_rate": 5.062803042773463e-06, + "loss": 0.0575, + "step": 2440 + }, + { + "epoch": 0.5116327813875498, + "grad_norm": 0.06003498286008835, + "learning_rate": 5.059408448029737e-06, + "loss": 0.0577, + "step": 2441 + }, + { + "epoch": 0.5118423810521903, + "grad_norm": 0.06972794234752655, + "learning_rate": 5.056013825898729e-06, + "loss": 0.0543, + "step": 2442 + }, + { + "epoch": 0.5120519807168309, + "grad_norm": 0.07647814601659775, + "learning_rate": 5.052619177945358e-06, + "loss": 0.0548, + "step": 2443 + }, + { + "epoch": 0.5122615803814714, + "grad_norm": 0.06816070526838303, + "learning_rate": 5.049224505734558e-06, + "loss": 0.0582, + "step": 2444 + }, + { + "epoch": 0.5124711800461119, + "grad_norm": 0.0557573027908802, + "learning_rate": 5.045829810831271e-06, + "loss": 0.0547, + "step": 2445 + }, + { + "epoch": 0.5126807797107524, + "grad_norm": 0.06597109884023666, + "learning_rate": 5.042435094800451e-06, + "loss": 0.056, + "step": 2446 + }, + { + "epoch": 0.512890379375393, + "grad_norm": 0.0730803981423378, + "learning_rate": 5.0390403592070605e-06, + "loss": 0.0547, + "step": 2447 + }, + { + "epoch": 0.5130999790400336, + "grad_norm": 0.07265052199363708, + "learning_rate": 5.0356456056160715e-06, + "loss": 0.0567, + "step": 2448 + }, + { + "epoch": 0.5133095787046741, + "grad_norm": 0.07946612685918808, + "learning_rate": 5.032250835592467e-06, + "loss": 0.0576, + "step": 2449 + }, + { + "epoch": 0.5135191783693146, + "grad_norm": 0.09229972958564758, + "learning_rate": 5.028856050701234e-06, + "loss": 0.0536, + "step": 2450 + }, + { + "epoch": 0.5137287780339551, + "grad_norm": 0.1069934144616127, + "learning_rate": 5.025461252507367e-06, + "loss": 0.0562, + "step": 2451 + }, + { + "epoch": 0.5139383776985956, + "grad_norm": 0.12349528819322586, + "learning_rate": 5.0220664425758695e-06, + "loss": 0.0573, + "step": 2452 + }, + { + "epoch": 0.5141479773632363, + "grad_norm": 0.1274581402540207, + "learning_rate": 5.0186716224717445e-06, + "loss": 0.0576, + "step": 2453 + }, + { + "epoch": 0.5143575770278768, + "grad_norm": 0.11807071417570114, + "learning_rate": 5.0152767937600055e-06, + "loss": 0.0576, + "step": 2454 + }, + { + "epoch": 0.5145671766925173, + "grad_norm": 0.10544098168611526, + "learning_rate": 5.0118819580056686e-06, + "loss": 0.0589, + "step": 2455 + }, + { + "epoch": 0.5147767763571578, + "grad_norm": 0.091020368039608, + "learning_rate": 5.008487116773752e-06, + "loss": 0.0574, + "step": 2456 + }, + { + "epoch": 0.5149863760217984, + "grad_norm": 0.07477325946092606, + "learning_rate": 5.0050922716292745e-06, + "loss": 0.0584, + "step": 2457 + }, + { + "epoch": 0.5151959756864389, + "grad_norm": 0.06757844984531403, + "learning_rate": 5.001697424137264e-06, + "loss": 0.0552, + "step": 2458 + }, + { + "epoch": 0.5154055753510794, + "grad_norm": 0.06685573607683182, + "learning_rate": 4.9983025758627376e-06, + "loss": 0.0533, + "step": 2459 + }, + { + "epoch": 0.51561517501572, + "grad_norm": 0.0714949518442154, + "learning_rate": 4.9949077283707255e-06, + "loss": 0.0563, + "step": 2460 + }, + { + "epoch": 0.5158247746803605, + "grad_norm": 0.06642146408557892, + "learning_rate": 4.991512883226251e-06, + "loss": 0.0599, + "step": 2461 + }, + { + "epoch": 0.5160343743450011, + "grad_norm": 0.06495679169893265, + "learning_rate": 4.988118041994332e-06, + "loss": 0.0593, + "step": 2462 + }, + { + "epoch": 0.5162439740096416, + "grad_norm": 0.06817002594470978, + "learning_rate": 4.984723206239995e-06, + "loss": 0.0564, + "step": 2463 + }, + { + "epoch": 0.5164535736742821, + "grad_norm": 0.0714736357331276, + "learning_rate": 4.981328377528258e-06, + "loss": 0.0588, + "step": 2464 + }, + { + "epoch": 0.5166631733389226, + "grad_norm": 0.07612305879592896, + "learning_rate": 4.977933557424133e-06, + "loss": 0.0566, + "step": 2465 + }, + { + "epoch": 0.5168727730035632, + "grad_norm": 0.0783240795135498, + "learning_rate": 4.974538747492634e-06, + "loss": 0.056, + "step": 2466 + }, + { + "epoch": 0.5170823726682038, + "grad_norm": 0.07756256312131882, + "learning_rate": 4.971143949298769e-06, + "loss": 0.0546, + "step": 2467 + }, + { + "epoch": 0.5172919723328443, + "grad_norm": 0.07845515012741089, + "learning_rate": 4.967749164407535e-06, + "loss": 0.0533, + "step": 2468 + }, + { + "epoch": 0.5175015719974848, + "grad_norm": 0.08055856078863144, + "learning_rate": 4.964354394383929e-06, + "loss": 0.0586, + "step": 2469 + }, + { + "epoch": 0.5177111716621253, + "grad_norm": 0.07212040573358536, + "learning_rate": 4.960959640792943e-06, + "loss": 0.0549, + "step": 2470 + }, + { + "epoch": 0.5179207713267658, + "grad_norm": 0.07273006439208984, + "learning_rate": 4.9575649051995515e-06, + "loss": 0.0545, + "step": 2471 + }, + { + "epoch": 0.5181303709914065, + "grad_norm": 0.08731981366872787, + "learning_rate": 4.954170189168731e-06, + "loss": 0.0581, + "step": 2472 + }, + { + "epoch": 0.518339970656047, + "grad_norm": 0.07897140830755234, + "learning_rate": 4.950775494265443e-06, + "loss": 0.0562, + "step": 2473 + }, + { + "epoch": 0.5185495703206875, + "grad_norm": 0.055604368448257446, + "learning_rate": 4.947380822054643e-06, + "loss": 0.0545, + "step": 2474 + }, + { + "epoch": 0.518759169985328, + "grad_norm": 0.05771341547369957, + "learning_rate": 4.9439861741012726e-06, + "loss": 0.0541, + "step": 2475 + }, + { + "epoch": 0.5189687696499685, + "grad_norm": 0.08119264245033264, + "learning_rate": 4.940591551970264e-06, + "loss": 0.0558, + "step": 2476 + }, + { + "epoch": 0.5191783693146091, + "grad_norm": 0.08719661831855774, + "learning_rate": 4.93719695722654e-06, + "loss": 0.0553, + "step": 2477 + }, + { + "epoch": 0.5193879689792497, + "grad_norm": 0.08087395131587982, + "learning_rate": 4.933802391435002e-06, + "loss": 0.0536, + "step": 2478 + }, + { + "epoch": 0.5195975686438902, + "grad_norm": 0.07597414404153824, + "learning_rate": 4.93040785616055e-06, + "loss": 0.0563, + "step": 2479 + }, + { + "epoch": 0.5198071683085307, + "grad_norm": 0.07065032422542572, + "learning_rate": 4.927013352968063e-06, + "loss": 0.0543, + "step": 2480 + }, + { + "epoch": 0.5200167679731712, + "grad_norm": 0.06194977089762688, + "learning_rate": 4.9236188834224015e-06, + "loss": 0.0575, + "step": 2481 + }, + { + "epoch": 0.5202263676378118, + "grad_norm": 0.059151582419872284, + "learning_rate": 4.920224449088421e-06, + "loss": 0.0577, + "step": 2482 + }, + { + "epoch": 0.5204359673024523, + "grad_norm": 0.07860930263996124, + "learning_rate": 4.9168300515309515e-06, + "loss": 0.0566, + "step": 2483 + }, + { + "epoch": 0.5206455669670929, + "grad_norm": 0.11033756285905838, + "learning_rate": 4.913435692314814e-06, + "loss": 0.056, + "step": 2484 + }, + { + "epoch": 0.5208551666317334, + "grad_norm": 0.13464130461215973, + "learning_rate": 4.910041373004802e-06, + "loss": 0.0548, + "step": 2485 + }, + { + "epoch": 0.5210647662963739, + "grad_norm": 0.1375139057636261, + "learning_rate": 4.906647095165698e-06, + "loss": 0.0568, + "step": 2486 + }, + { + "epoch": 0.5212743659610145, + "grad_norm": 0.10741981118917465, + "learning_rate": 4.903252860362266e-06, + "loss": 0.0593, + "step": 2487 + }, + { + "epoch": 0.521483965625655, + "grad_norm": 0.08266045153141022, + "learning_rate": 4.8998586701592436e-06, + "loss": 0.0558, + "step": 2488 + }, + { + "epoch": 0.5216935652902955, + "grad_norm": 0.0874459519982338, + "learning_rate": 4.8964645261213535e-06, + "loss": 0.0557, + "step": 2489 + }, + { + "epoch": 0.521903164954936, + "grad_norm": 0.10738598555326462, + "learning_rate": 4.8930704298132965e-06, + "loss": 0.0556, + "step": 2490 + }, + { + "epoch": 0.5221127646195766, + "grad_norm": 0.11535639315843582, + "learning_rate": 4.889676382799748e-06, + "loss": 0.0566, + "step": 2491 + }, + { + "epoch": 0.5223223642842172, + "grad_norm": 0.10083887726068497, + "learning_rate": 4.886282386645363e-06, + "loss": 0.0567, + "step": 2492 + }, + { + "epoch": 0.5225319639488577, + "grad_norm": 0.07577608525753021, + "learning_rate": 4.8828884429147775e-06, + "loss": 0.0539, + "step": 2493 + }, + { + "epoch": 0.5227415636134982, + "grad_norm": 0.08031262457370758, + "learning_rate": 4.879494553172594e-06, + "loss": 0.0541, + "step": 2494 + }, + { + "epoch": 0.5229511632781387, + "grad_norm": 0.10877335071563721, + "learning_rate": 4.876100718983394e-06, + "loss": 0.0566, + "step": 2495 + }, + { + "epoch": 0.5231607629427792, + "grad_norm": 0.10455876588821411, + "learning_rate": 4.872706941911739e-06, + "loss": 0.0559, + "step": 2496 + }, + { + "epoch": 0.5233703626074199, + "grad_norm": 0.07597562670707703, + "learning_rate": 4.869313223522159e-06, + "loss": 0.0565, + "step": 2497 + }, + { + "epoch": 0.5235799622720604, + "grad_norm": 0.08931368589401245, + "learning_rate": 4.865919565379152e-06, + "loss": 0.0583, + "step": 2498 + }, + { + "epoch": 0.5237895619367009, + "grad_norm": 0.09384021908044815, + "learning_rate": 4.8625259690472e-06, + "loss": 0.0574, + "step": 2499 + }, + { + "epoch": 0.5239991616013414, + "grad_norm": 0.0911545678973198, + "learning_rate": 4.859132436090748e-06, + "loss": 0.0561, + "step": 2500 + }, + { + "epoch": 0.5242087612659819, + "grad_norm": 0.10391653329133987, + "learning_rate": 4.855738968074212e-06, + "loss": 0.0552, + "step": 2501 + }, + { + "epoch": 0.5244183609306226, + "grad_norm": 0.09222046285867691, + "learning_rate": 4.852345566561983e-06, + "loss": 0.059, + "step": 2502 + }, + { + "epoch": 0.5246279605952631, + "grad_norm": 0.07000227272510529, + "learning_rate": 4.848952233118417e-06, + "loss": 0.0584, + "step": 2503 + }, + { + "epoch": 0.5248375602599036, + "grad_norm": 0.08648461848497391, + "learning_rate": 4.845558969307839e-06, + "loss": 0.0541, + "step": 2504 + }, + { + "epoch": 0.5250471599245441, + "grad_norm": 0.09578924626111984, + "learning_rate": 4.842165776694545e-06, + "loss": 0.0551, + "step": 2505 + }, + { + "epoch": 0.5252567595891846, + "grad_norm": 0.091171033680439, + "learning_rate": 4.8387726568427945e-06, + "loss": 0.055, + "step": 2506 + }, + { + "epoch": 0.5254663592538252, + "grad_norm": 0.09794994443655014, + "learning_rate": 4.835379611316818e-06, + "loss": 0.0602, + "step": 2507 + }, + { + "epoch": 0.5256759589184657, + "grad_norm": 0.08770140260457993, + "learning_rate": 4.831986641680804e-06, + "loss": 0.0555, + "step": 2508 + }, + { + "epoch": 0.5258855585831063, + "grad_norm": 0.07761924713850021, + "learning_rate": 4.828593749498913e-06, + "loss": 0.0558, + "step": 2509 + }, + { + "epoch": 0.5260951582477468, + "grad_norm": 0.09006881713867188, + "learning_rate": 4.825200936335272e-06, + "loss": 0.0575, + "step": 2510 + }, + { + "epoch": 0.5263047579123873, + "grad_norm": 0.08472940325737, + "learning_rate": 4.821808203753959e-06, + "loss": 0.0555, + "step": 2511 + }, + { + "epoch": 0.5265143575770279, + "grad_norm": 0.06748061627149582, + "learning_rate": 4.818415553319027e-06, + "loss": 0.0555, + "step": 2512 + }, + { + "epoch": 0.5267239572416684, + "grad_norm": 0.07318470627069473, + "learning_rate": 4.815022986594491e-06, + "loss": 0.0564, + "step": 2513 + }, + { + "epoch": 0.5269335569063089, + "grad_norm": 0.07728718966245651, + "learning_rate": 4.811630505144316e-06, + "loss": 0.055, + "step": 2514 + }, + { + "epoch": 0.5271431565709495, + "grad_norm": 0.06775672733783722, + "learning_rate": 4.808238110532439e-06, + "loss": 0.0543, + "step": 2515 + }, + { + "epoch": 0.52735275623559, + "grad_norm": 0.08202599734067917, + "learning_rate": 4.804845804322756e-06, + "loss": 0.0562, + "step": 2516 + }, + { + "epoch": 0.5275623559002306, + "grad_norm": 0.09403207898139954, + "learning_rate": 4.801453588079113e-06, + "loss": 0.0548, + "step": 2517 + }, + { + "epoch": 0.5277719555648711, + "grad_norm": 0.08758095651865005, + "learning_rate": 4.798061463365327e-06, + "loss": 0.0533, + "step": 2518 + }, + { + "epoch": 0.5279815552295116, + "grad_norm": 0.07951238006353378, + "learning_rate": 4.7946694317451635e-06, + "loss": 0.0558, + "step": 2519 + }, + { + "epoch": 0.5281911548941521, + "grad_norm": 0.06479839980602264, + "learning_rate": 4.791277494782351e-06, + "loss": 0.056, + "step": 2520 + }, + { + "epoch": 0.5284007545587927, + "grad_norm": 0.060277167707681656, + "learning_rate": 4.787885654040569e-06, + "loss": 0.0572, + "step": 2521 + }, + { + "epoch": 0.5286103542234333, + "grad_norm": 0.06489501148462296, + "learning_rate": 4.784493911083455e-06, + "loss": 0.0559, + "step": 2522 + }, + { + "epoch": 0.5288199538880738, + "grad_norm": 0.07337628304958344, + "learning_rate": 4.781102267474606e-06, + "loss": 0.0576, + "step": 2523 + }, + { + "epoch": 0.5290295535527143, + "grad_norm": 0.08746179193258286, + "learning_rate": 4.777710724777565e-06, + "loss": 0.0556, + "step": 2524 + }, + { + "epoch": 0.5292391532173548, + "grad_norm": 0.09314849972724915, + "learning_rate": 4.774319284555833e-06, + "loss": 0.0566, + "step": 2525 + }, + { + "epoch": 0.5294487528819954, + "grad_norm": 0.08934000134468079, + "learning_rate": 4.770927948372865e-06, + "loss": 0.0522, + "step": 2526 + }, + { + "epoch": 0.529658352546636, + "grad_norm": 0.08320695906877518, + "learning_rate": 4.7675367177920645e-06, + "loss": 0.0547, + "step": 2527 + }, + { + "epoch": 0.5298679522112765, + "grad_norm": 0.07654823362827301, + "learning_rate": 4.764145594376788e-06, + "loss": 0.0531, + "step": 2528 + }, + { + "epoch": 0.530077551875917, + "grad_norm": 0.07065373659133911, + "learning_rate": 4.7607545796903444e-06, + "loss": 0.0584, + "step": 2529 + }, + { + "epoch": 0.5302871515405575, + "grad_norm": 0.06831829994916916, + "learning_rate": 4.757363675295991e-06, + "loss": 0.0557, + "step": 2530 + }, + { + "epoch": 0.5304967512051981, + "grad_norm": 0.06134999543428421, + "learning_rate": 4.753972882756931e-06, + "loss": 0.0548, + "step": 2531 + }, + { + "epoch": 0.5307063508698386, + "grad_norm": 0.05438331514596939, + "learning_rate": 4.7505822036363214e-06, + "loss": 0.0568, + "step": 2532 + }, + { + "epoch": 0.5309159505344792, + "grad_norm": 0.047348763793706894, + "learning_rate": 4.747191639497266e-06, + "loss": 0.0568, + "step": 2533 + }, + { + "epoch": 0.5311255501991197, + "grad_norm": 0.048679519444704056, + "learning_rate": 4.743801191902809e-06, + "loss": 0.0563, + "step": 2534 + }, + { + "epoch": 0.5313351498637602, + "grad_norm": 0.045131586492061615, + "learning_rate": 4.740410862415952e-06, + "loss": 0.0555, + "step": 2535 + }, + { + "epoch": 0.5315447495284008, + "grad_norm": 0.05211302638053894, + "learning_rate": 4.737020652599633e-06, + "loss": 0.0551, + "step": 2536 + }, + { + "epoch": 0.5317543491930413, + "grad_norm": 0.05385665223002434, + "learning_rate": 4.733630564016738e-06, + "loss": 0.0541, + "step": 2537 + }, + { + "epoch": 0.5319639488576818, + "grad_norm": 0.046267565339803696, + "learning_rate": 4.730240598230097e-06, + "loss": 0.0533, + "step": 2538 + }, + { + "epoch": 0.5321735485223223, + "grad_norm": 0.04819132760167122, + "learning_rate": 4.726850756802486e-06, + "loss": 0.0564, + "step": 2539 + }, + { + "epoch": 0.5323831481869629, + "grad_norm": 0.048858221620321274, + "learning_rate": 4.723461041296618e-06, + "loss": 0.0547, + "step": 2540 + }, + { + "epoch": 0.5325927478516035, + "grad_norm": 0.042201049625873566, + "learning_rate": 4.720071453275152e-06, + "loss": 0.0573, + "step": 2541 + }, + { + "epoch": 0.532802347516244, + "grad_norm": 0.04239390045404434, + "learning_rate": 4.716681994300688e-06, + "loss": 0.0573, + "step": 2542 + }, + { + "epoch": 0.5330119471808845, + "grad_norm": 0.05800594016909599, + "learning_rate": 4.7132926659357675e-06, + "loss": 0.0546, + "step": 2543 + }, + { + "epoch": 0.533221546845525, + "grad_norm": 0.05549288168549538, + "learning_rate": 4.7099034697428676e-06, + "loss": 0.0565, + "step": 2544 + }, + { + "epoch": 0.5334311465101655, + "grad_norm": 0.05524330586194992, + "learning_rate": 4.706514407284407e-06, + "loss": 0.0544, + "step": 2545 + }, + { + "epoch": 0.5336407461748062, + "grad_norm": 0.07277749478816986, + "learning_rate": 4.703125480122747e-06, + "loss": 0.0542, + "step": 2546 + }, + { + "epoch": 0.5338503458394467, + "grad_norm": 0.06170268729329109, + "learning_rate": 4.699736689820175e-06, + "loss": 0.0565, + "step": 2547 + }, + { + "epoch": 0.5340599455040872, + "grad_norm": 0.05076323822140694, + "learning_rate": 4.696348037938927e-06, + "loss": 0.0565, + "step": 2548 + }, + { + "epoch": 0.5342695451687277, + "grad_norm": 0.06921599805355072, + "learning_rate": 4.692959526041174e-06, + "loss": 0.0565, + "step": 2549 + }, + { + "epoch": 0.5344791448333682, + "grad_norm": 0.057804301381111145, + "learning_rate": 4.689571155689012e-06, + "loss": 0.053, + "step": 2550 + }, + { + "epoch": 0.5346887444980088, + "grad_norm": 0.04741832986474037, + "learning_rate": 4.686182928444484e-06, + "loss": 0.0552, + "step": 2551 + }, + { + "epoch": 0.5348983441626494, + "grad_norm": 0.07584595680236816, + "learning_rate": 4.682794845869559e-06, + "loss": 0.0525, + "step": 2552 + }, + { + "epoch": 0.5351079438272899, + "grad_norm": 0.07168301194906235, + "learning_rate": 4.679406909526147e-06, + "loss": 0.0579, + "step": 2553 + }, + { + "epoch": 0.5353175434919304, + "grad_norm": 0.04884837195277214, + "learning_rate": 4.676019120976082e-06, + "loss": 0.0574, + "step": 2554 + }, + { + "epoch": 0.5355271431565709, + "grad_norm": 0.06782133877277374, + "learning_rate": 4.672631481781134e-06, + "loss": 0.0565, + "step": 2555 + }, + { + "epoch": 0.5357367428212115, + "grad_norm": 0.07214924693107605, + "learning_rate": 4.669243993503008e-06, + "loss": 0.0584, + "step": 2556 + }, + { + "epoch": 0.535946342485852, + "grad_norm": 0.05515038222074509, + "learning_rate": 4.665856657703329e-06, + "loss": 0.0545, + "step": 2557 + }, + { + "epoch": 0.5361559421504926, + "grad_norm": 0.06522560119628906, + "learning_rate": 4.662469475943662e-06, + "loss": 0.0558, + "step": 2558 + }, + { + "epoch": 0.5363655418151331, + "grad_norm": 0.06113879382610321, + "learning_rate": 4.659082449785498e-06, + "loss": 0.0558, + "step": 2559 + }, + { + "epoch": 0.5365751414797736, + "grad_norm": 0.06243380904197693, + "learning_rate": 4.655695580790254e-06, + "loss": 0.0578, + "step": 2560 + }, + { + "epoch": 0.5367847411444142, + "grad_norm": 0.07164395600557327, + "learning_rate": 4.652308870519272e-06, + "loss": 0.0574, + "step": 2561 + }, + { + "epoch": 0.5369943408090547, + "grad_norm": 0.06726083904504776, + "learning_rate": 4.648922320533833e-06, + "loss": 0.0546, + "step": 2562 + }, + { + "epoch": 0.5372039404736952, + "grad_norm": 0.054155874997377396, + "learning_rate": 4.645535932395129e-06, + "loss": 0.0536, + "step": 2563 + }, + { + "epoch": 0.5374135401383358, + "grad_norm": 0.0629991814494133, + "learning_rate": 4.6421497076642864e-06, + "loss": 0.0576, + "step": 2564 + }, + { + "epoch": 0.5376231398029763, + "grad_norm": 0.06752461940050125, + "learning_rate": 4.638763647902355e-06, + "loss": 0.0567, + "step": 2565 + }, + { + "epoch": 0.5378327394676169, + "grad_norm": 0.0500505268573761, + "learning_rate": 4.635377754670307e-06, + "loss": 0.0546, + "step": 2566 + }, + { + "epoch": 0.5380423391322574, + "grad_norm": 0.043680962175130844, + "learning_rate": 4.631992029529037e-06, + "loss": 0.0556, + "step": 2567 + }, + { + "epoch": 0.5382519387968979, + "grad_norm": 0.06089196354150772, + "learning_rate": 4.628606474039366e-06, + "loss": 0.0545, + "step": 2568 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 0.07623324543237686, + "learning_rate": 4.625221089762034e-06, + "loss": 0.0577, + "step": 2569 + }, + { + "epoch": 0.538671138126179, + "grad_norm": 0.09632382541894913, + "learning_rate": 4.621835878257701e-06, + "loss": 0.0567, + "step": 2570 + }, + { + "epoch": 0.5388807377908196, + "grad_norm": 0.12334061414003372, + "learning_rate": 4.6184508410869486e-06, + "loss": 0.0533, + "step": 2571 + }, + { + "epoch": 0.5390903374554601, + "grad_norm": 0.1471899002790451, + "learning_rate": 4.615065979810282e-06, + "loss": 0.0571, + "step": 2572 + }, + { + "epoch": 0.5392999371201006, + "grad_norm": 0.16688735783100128, + "learning_rate": 4.6116812959881154e-06, + "loss": 0.0543, + "step": 2573 + }, + { + "epoch": 0.5395095367847411, + "grad_norm": 0.17000025510787964, + "learning_rate": 4.608296791180793e-06, + "loss": 0.0555, + "step": 2574 + }, + { + "epoch": 0.5397191364493816, + "grad_norm": 0.13026584684848785, + "learning_rate": 4.604912466948568e-06, + "loss": 0.058, + "step": 2575 + }, + { + "epoch": 0.5399287361140223, + "grad_norm": 0.06306587904691696, + "learning_rate": 4.601528324851613e-06, + "loss": 0.0546, + "step": 2576 + }, + { + "epoch": 0.5401383357786628, + "grad_norm": 0.06573072820901871, + "learning_rate": 4.598144366450018e-06, + "loss": 0.057, + "step": 2577 + }, + { + "epoch": 0.5403479354433033, + "grad_norm": 0.1011551097035408, + "learning_rate": 4.594760593303785e-06, + "loss": 0.0563, + "step": 2578 + }, + { + "epoch": 0.5405575351079438, + "grad_norm": 0.13201986253261566, + "learning_rate": 4.591377006972837e-06, + "loss": 0.0559, + "step": 2579 + }, + { + "epoch": 0.5407671347725843, + "grad_norm": 0.13169518113136292, + "learning_rate": 4.587993609017003e-06, + "loss": 0.0583, + "step": 2580 + }, + { + "epoch": 0.5409767344372249, + "grad_norm": 0.07675211131572723, + "learning_rate": 4.584610400996028e-06, + "loss": 0.0547, + "step": 2581 + }, + { + "epoch": 0.5411863341018655, + "grad_norm": 0.07180249691009521, + "learning_rate": 4.581227384469575e-06, + "loss": 0.056, + "step": 2582 + }, + { + "epoch": 0.541395933766506, + "grad_norm": 0.11214875429868698, + "learning_rate": 4.577844560997208e-06, + "loss": 0.0561, + "step": 2583 + }, + { + "epoch": 0.5416055334311465, + "grad_norm": 0.1112339124083519, + "learning_rate": 4.574461932138412e-06, + "loss": 0.0535, + "step": 2584 + }, + { + "epoch": 0.541815133095787, + "grad_norm": 0.08749670535326004, + "learning_rate": 4.571079499452578e-06, + "loss": 0.057, + "step": 2585 + }, + { + "epoch": 0.5420247327604276, + "grad_norm": 0.08394355326890945, + "learning_rate": 4.567697264499003e-06, + "loss": 0.0547, + "step": 2586 + }, + { + "epoch": 0.5422343324250681, + "grad_norm": 0.12321101129055023, + "learning_rate": 4.564315228836901e-06, + "loss": 0.0577, + "step": 2587 + }, + { + "epoch": 0.5424439320897086, + "grad_norm": 0.13158167898654938, + "learning_rate": 4.560933394025386e-06, + "loss": 0.0583, + "step": 2588 + }, + { + "epoch": 0.5426535317543492, + "grad_norm": 0.08250062167644501, + "learning_rate": 4.5575517616234874e-06, + "loss": 0.0547, + "step": 2589 + }, + { + "epoch": 0.5428631314189897, + "grad_norm": 0.07321292906999588, + "learning_rate": 4.5541703331901346e-06, + "loss": 0.0575, + "step": 2590 + }, + { + "epoch": 0.5430727310836303, + "grad_norm": 0.10240411758422852, + "learning_rate": 4.550789110284164e-06, + "loss": 0.0548, + "step": 2591 + }, + { + "epoch": 0.5432823307482708, + "grad_norm": 0.09402070939540863, + "learning_rate": 4.547408094464322e-06, + "loss": 0.0568, + "step": 2592 + }, + { + "epoch": 0.5434919304129113, + "grad_norm": 0.06143482029438019, + "learning_rate": 4.5440272872892546e-06, + "loss": 0.0552, + "step": 2593 + }, + { + "epoch": 0.5437015300775518, + "grad_norm": 0.07297394424676895, + "learning_rate": 4.54064669031751e-06, + "loss": 0.0561, + "step": 2594 + }, + { + "epoch": 0.5439111297421925, + "grad_norm": 0.08411424607038498, + "learning_rate": 4.537266305107549e-06, + "loss": 0.0554, + "step": 2595 + }, + { + "epoch": 0.544120729406833, + "grad_norm": 0.06542636454105377, + "learning_rate": 4.533886133217725e-06, + "loss": 0.0563, + "step": 2596 + }, + { + "epoch": 0.5443303290714735, + "grad_norm": 0.06550241261720657, + "learning_rate": 4.5305061762062945e-06, + "loss": 0.0535, + "step": 2597 + }, + { + "epoch": 0.544539928736114, + "grad_norm": 0.08619559556245804, + "learning_rate": 4.527126435631422e-06, + "loss": 0.0566, + "step": 2598 + }, + { + "epoch": 0.5447495284007545, + "grad_norm": 0.07745786011219025, + "learning_rate": 4.523746913051163e-06, + "loss": 0.0571, + "step": 2599 + }, + { + "epoch": 0.5449591280653951, + "grad_norm": 0.0609513558447361, + "learning_rate": 4.520367610023477e-06, + "loss": 0.0552, + "step": 2600 + }, + { + "epoch": 0.5451687277300357, + "grad_norm": 0.0679200291633606, + "learning_rate": 4.516988528106225e-06, + "loss": 0.0538, + "step": 2601 + }, + { + "epoch": 0.5453783273946762, + "grad_norm": 0.067595936357975, + "learning_rate": 4.513609668857162e-06, + "loss": 0.0541, + "step": 2602 + }, + { + "epoch": 0.5455879270593167, + "grad_norm": 0.06276869773864746, + "learning_rate": 4.510231033833938e-06, + "loss": 0.0549, + "step": 2603 + }, + { + "epoch": 0.5457975267239572, + "grad_norm": 0.058919332921504974, + "learning_rate": 4.506852624594107e-06, + "loss": 0.0553, + "step": 2604 + }, + { + "epoch": 0.5460071263885978, + "grad_norm": 0.06877582520246506, + "learning_rate": 4.503474442695115e-06, + "loss": 0.0558, + "step": 2605 + }, + { + "epoch": 0.5462167260532383, + "grad_norm": 0.07054632157087326, + "learning_rate": 4.500096489694299e-06, + "loss": 0.0591, + "step": 2606 + }, + { + "epoch": 0.5464263257178789, + "grad_norm": 0.05597640573978424, + "learning_rate": 4.496718767148898e-06, + "loss": 0.0552, + "step": 2607 + }, + { + "epoch": 0.5466359253825194, + "grad_norm": 0.052397601306438446, + "learning_rate": 4.493341276616044e-06, + "loss": 0.0543, + "step": 2608 + }, + { + "epoch": 0.5468455250471599, + "grad_norm": 0.06026868522167206, + "learning_rate": 4.489964019652752e-06, + "loss": 0.0578, + "step": 2609 + }, + { + "epoch": 0.5470551247118005, + "grad_norm": 0.05290864408016205, + "learning_rate": 4.486586997815942e-06, + "loss": 0.0533, + "step": 2610 + }, + { + "epoch": 0.547264724376441, + "grad_norm": 0.05662361532449722, + "learning_rate": 4.48321021266242e-06, + "loss": 0.0552, + "step": 2611 + }, + { + "epoch": 0.5474743240410815, + "grad_norm": 0.057523008435964584, + "learning_rate": 4.479833665748884e-06, + "loss": 0.0595, + "step": 2612 + }, + { + "epoch": 0.547683923705722, + "grad_norm": 0.054744523018598557, + "learning_rate": 4.476457358631918e-06, + "loss": 0.0589, + "step": 2613 + }, + { + "epoch": 0.5478935233703626, + "grad_norm": 0.06096423789858818, + "learning_rate": 4.473081292868001e-06, + "loss": 0.054, + "step": 2614 + }, + { + "epoch": 0.5481031230350032, + "grad_norm": 0.06122094392776489, + "learning_rate": 4.4697054700135e-06, + "loss": 0.0524, + "step": 2615 + }, + { + "epoch": 0.5483127226996437, + "grad_norm": 0.06579925119876862, + "learning_rate": 4.4663298916246665e-06, + "loss": 0.0554, + "step": 2616 + }, + { + "epoch": 0.5485223223642842, + "grad_norm": 0.0574062280356884, + "learning_rate": 4.4629545592576415e-06, + "loss": 0.0563, + "step": 2617 + }, + { + "epoch": 0.5487319220289247, + "grad_norm": 0.05878579616546631, + "learning_rate": 4.459579474468455e-06, + "loss": 0.0541, + "step": 2618 + }, + { + "epoch": 0.5489415216935652, + "grad_norm": 0.06138122081756592, + "learning_rate": 4.456204638813017e-06, + "loss": 0.0564, + "step": 2619 + }, + { + "epoch": 0.5491511213582059, + "grad_norm": 0.04858166351914406, + "learning_rate": 4.452830053847127e-06, + "loss": 0.0547, + "step": 2620 + }, + { + "epoch": 0.5493607210228464, + "grad_norm": 0.046725232154130936, + "learning_rate": 4.4494557211264715e-06, + "loss": 0.0551, + "step": 2621 + }, + { + "epoch": 0.5495703206874869, + "grad_norm": 0.05486287176609039, + "learning_rate": 4.446081642206611e-06, + "loss": 0.0574, + "step": 2622 + }, + { + "epoch": 0.5497799203521274, + "grad_norm": 0.052558574825525284, + "learning_rate": 4.442707818642999e-06, + "loss": 0.0535, + "step": 2623 + }, + { + "epoch": 0.5499895200167679, + "grad_norm": 0.04250088334083557, + "learning_rate": 4.439334251990966e-06, + "loss": 0.0561, + "step": 2624 + }, + { + "epoch": 0.5501991196814086, + "grad_norm": 0.047486599534749985, + "learning_rate": 4.435960943805729e-06, + "loss": 0.0572, + "step": 2625 + }, + { + "epoch": 0.5504087193460491, + "grad_norm": 0.04519723728299141, + "learning_rate": 4.432587895642378e-06, + "loss": 0.0536, + "step": 2626 + }, + { + "epoch": 0.5506183190106896, + "grad_norm": 0.042785726487636566, + "learning_rate": 4.4292151090558884e-06, + "loss": 0.0544, + "step": 2627 + }, + { + "epoch": 0.5508279186753301, + "grad_norm": 0.04462413862347603, + "learning_rate": 4.425842585601117e-06, + "loss": 0.0569, + "step": 2628 + }, + { + "epoch": 0.5510375183399706, + "grad_norm": 0.041346605867147446, + "learning_rate": 4.422470326832794e-06, + "loss": 0.0567, + "step": 2629 + }, + { + "epoch": 0.5512471180046112, + "grad_norm": 0.04267747327685356, + "learning_rate": 4.419098334305529e-06, + "loss": 0.0542, + "step": 2630 + }, + { + "epoch": 0.5514567176692518, + "grad_norm": 0.05388062447309494, + "learning_rate": 4.4157266095738125e-06, + "loss": 0.0559, + "step": 2631 + }, + { + "epoch": 0.5516663173338923, + "grad_norm": 0.064181849360466, + "learning_rate": 4.412355154192007e-06, + "loss": 0.0576, + "step": 2632 + }, + { + "epoch": 0.5518759169985328, + "grad_norm": 0.0693066269159317, + "learning_rate": 4.408983969714353e-06, + "loss": 0.056, + "step": 2633 + }, + { + "epoch": 0.5520855166631733, + "grad_norm": 0.07318708300590515, + "learning_rate": 4.405613057694967e-06, + "loss": 0.054, + "step": 2634 + }, + { + "epoch": 0.5522951163278139, + "grad_norm": 0.07104630768299103, + "learning_rate": 4.402242419687839e-06, + "loss": 0.0536, + "step": 2635 + }, + { + "epoch": 0.5525047159924544, + "grad_norm": 0.057221170514822006, + "learning_rate": 4.398872057246829e-06, + "loss": 0.0522, + "step": 2636 + }, + { + "epoch": 0.552714315657095, + "grad_norm": 0.05032278224825859, + "learning_rate": 4.395501971925677e-06, + "loss": 0.0552, + "step": 2637 + }, + { + "epoch": 0.5529239153217355, + "grad_norm": 0.050015486776828766, + "learning_rate": 4.392132165277991e-06, + "loss": 0.0571, + "step": 2638 + }, + { + "epoch": 0.553133514986376, + "grad_norm": 0.05708552524447441, + "learning_rate": 4.388762638857249e-06, + "loss": 0.054, + "step": 2639 + }, + { + "epoch": 0.5533431146510166, + "grad_norm": 0.06874597072601318, + "learning_rate": 4.385393394216804e-06, + "loss": 0.0553, + "step": 2640 + }, + { + "epoch": 0.5535527143156571, + "grad_norm": 0.059858787804841995, + "learning_rate": 4.382024432909878e-06, + "loss": 0.0597, + "step": 2641 + }, + { + "epoch": 0.5537623139802976, + "grad_norm": 0.04494311660528183, + "learning_rate": 4.378655756489558e-06, + "loss": 0.0579, + "step": 2642 + }, + { + "epoch": 0.5539719136449381, + "grad_norm": 0.062470775097608566, + "learning_rate": 4.3752873665088055e-06, + "loss": 0.0557, + "step": 2643 + }, + { + "epoch": 0.5541815133095787, + "grad_norm": 0.07735218852758408, + "learning_rate": 4.371919264520449e-06, + "loss": 0.0562, + "step": 2644 + }, + { + "epoch": 0.5543911129742193, + "grad_norm": 0.07211725413799286, + "learning_rate": 4.368551452077179e-06, + "loss": 0.056, + "step": 2645 + }, + { + "epoch": 0.5546007126388598, + "grad_norm": 0.0716887041926384, + "learning_rate": 4.365183930731559e-06, + "loss": 0.0586, + "step": 2646 + }, + { + "epoch": 0.5548103123035003, + "grad_norm": 0.06757423281669617, + "learning_rate": 4.361816702036015e-06, + "loss": 0.0548, + "step": 2647 + }, + { + "epoch": 0.5550199119681408, + "grad_norm": 0.05915999040007591, + "learning_rate": 4.35844976754284e-06, + "loss": 0.0561, + "step": 2648 + }, + { + "epoch": 0.5552295116327813, + "grad_norm": 0.06384500861167908, + "learning_rate": 4.355083128804188e-06, + "loss": 0.0545, + "step": 2649 + }, + { + "epoch": 0.555439111297422, + "grad_norm": 0.08520271629095078, + "learning_rate": 4.351716787372079e-06, + "loss": 0.0544, + "step": 2650 + }, + { + "epoch": 0.5556487109620625, + "grad_norm": 0.09145956486463547, + "learning_rate": 4.348350744798399e-06, + "loss": 0.0539, + "step": 2651 + }, + { + "epoch": 0.555858310626703, + "grad_norm": 0.08064839243888855, + "learning_rate": 4.344985002634888e-06, + "loss": 0.0575, + "step": 2652 + }, + { + "epoch": 0.5560679102913435, + "grad_norm": 0.055196382105350494, + "learning_rate": 4.341619562433154e-06, + "loss": 0.0543, + "step": 2653 + }, + { + "epoch": 0.556277509955984, + "grad_norm": 0.05988939851522446, + "learning_rate": 4.338254425744669e-06, + "loss": 0.0568, + "step": 2654 + }, + { + "epoch": 0.5564871096206246, + "grad_norm": 0.06533896923065186, + "learning_rate": 4.334889594120751e-06, + "loss": 0.0567, + "step": 2655 + }, + { + "epoch": 0.5566967092852652, + "grad_norm": 0.0704990103840828, + "learning_rate": 4.331525069112595e-06, + "loss": 0.0541, + "step": 2656 + }, + { + "epoch": 0.5569063089499057, + "grad_norm": 0.07413534820079803, + "learning_rate": 4.328160852271241e-06, + "loss": 0.0561, + "step": 2657 + }, + { + "epoch": 0.5571159086145462, + "grad_norm": 0.06026620417833328, + "learning_rate": 4.324796945147598e-06, + "loss": 0.0552, + "step": 2658 + }, + { + "epoch": 0.5573255082791867, + "grad_norm": 0.05583348870277405, + "learning_rate": 4.321433349292422e-06, + "loss": 0.0569, + "step": 2659 + }, + { + "epoch": 0.5575351079438273, + "grad_norm": 0.06051694229245186, + "learning_rate": 4.318070066256328e-06, + "loss": 0.061, + "step": 2660 + }, + { + "epoch": 0.5577447076084678, + "grad_norm": 0.06352775543928146, + "learning_rate": 4.314707097589796e-06, + "loss": 0.0561, + "step": 2661 + }, + { + "epoch": 0.5579543072731084, + "grad_norm": 0.06398675590753555, + "learning_rate": 4.311344444843147e-06, + "loss": 0.0535, + "step": 2662 + }, + { + "epoch": 0.5581639069377489, + "grad_norm": 0.06229983642697334, + "learning_rate": 4.307982109566566e-06, + "loss": 0.0519, + "step": 2663 + }, + { + "epoch": 0.5583735066023895, + "grad_norm": 0.05653541907668114, + "learning_rate": 4.3046200933100905e-06, + "loss": 0.0557, + "step": 2664 + }, + { + "epoch": 0.55858310626703, + "grad_norm": 0.047429151833057404, + "learning_rate": 4.301258397623606e-06, + "loss": 0.0534, + "step": 2665 + }, + { + "epoch": 0.5587927059316705, + "grad_norm": 0.05258271098136902, + "learning_rate": 4.2978970240568556e-06, + "loss": 0.0542, + "step": 2666 + }, + { + "epoch": 0.559002305596311, + "grad_norm": 0.05701644718647003, + "learning_rate": 4.2945359741594315e-06, + "loss": 0.0556, + "step": 2667 + }, + { + "epoch": 0.5592119052609515, + "grad_norm": 0.06717483699321747, + "learning_rate": 4.291175249480777e-06, + "loss": 0.054, + "step": 2668 + }, + { + "epoch": 0.5594215049255922, + "grad_norm": 0.06235239654779434, + "learning_rate": 4.287814851570183e-06, + "loss": 0.0587, + "step": 2669 + }, + { + "epoch": 0.5596311045902327, + "grad_norm": 0.06835221499204636, + "learning_rate": 4.284454781976796e-06, + "loss": 0.0562, + "step": 2670 + }, + { + "epoch": 0.5598407042548732, + "grad_norm": 0.07527367770671844, + "learning_rate": 4.281095042249608e-06, + "loss": 0.0565, + "step": 2671 + }, + { + "epoch": 0.5600503039195137, + "grad_norm": 0.062950000166893, + "learning_rate": 4.2777356339374526e-06, + "loss": 0.0572, + "step": 2672 + }, + { + "epoch": 0.5602599035841542, + "grad_norm": 0.06541959196329117, + "learning_rate": 4.274376558589022e-06, + "loss": 0.0521, + "step": 2673 + }, + { + "epoch": 0.5604695032487949, + "grad_norm": 0.08955197781324387, + "learning_rate": 4.271017817752847e-06, + "loss": 0.053, + "step": 2674 + }, + { + "epoch": 0.5606791029134354, + "grad_norm": 0.07426553964614868, + "learning_rate": 4.267659412977306e-06, + "loss": 0.0531, + "step": 2675 + }, + { + "epoch": 0.5608887025780759, + "grad_norm": 0.061013299971818924, + "learning_rate": 4.264301345810623e-06, + "loss": 0.0548, + "step": 2676 + }, + { + "epoch": 0.5610983022427164, + "grad_norm": 0.06869885325431824, + "learning_rate": 4.260943617800869e-06, + "loss": 0.0564, + "step": 2677 + }, + { + "epoch": 0.5613079019073569, + "grad_norm": 0.060993682593107224, + "learning_rate": 4.257586230495951e-06, + "loss": 0.0556, + "step": 2678 + }, + { + "epoch": 0.5615175015719975, + "grad_norm": 0.08102905750274658, + "learning_rate": 4.254229185443628e-06, + "loss": 0.0537, + "step": 2679 + }, + { + "epoch": 0.561727101236638, + "grad_norm": 0.0864529013633728, + "learning_rate": 4.250872484191495e-06, + "loss": 0.0589, + "step": 2680 + }, + { + "epoch": 0.5619367009012786, + "grad_norm": 0.07401063293218613, + "learning_rate": 4.247516128286992e-06, + "loss": 0.055, + "step": 2681 + }, + { + "epoch": 0.5621463005659191, + "grad_norm": 0.06574457883834839, + "learning_rate": 4.244160119277397e-06, + "loss": 0.059, + "step": 2682 + }, + { + "epoch": 0.5623559002305596, + "grad_norm": 0.05600839853286743, + "learning_rate": 4.24080445870983e-06, + "loss": 0.0577, + "step": 2683 + }, + { + "epoch": 0.5625654998952002, + "grad_norm": 0.058064643293619156, + "learning_rate": 4.2374491481312506e-06, + "loss": 0.0574, + "step": 2684 + }, + { + "epoch": 0.5627750995598407, + "grad_norm": 0.05819283798336983, + "learning_rate": 4.234094189088455e-06, + "loss": 0.0563, + "step": 2685 + }, + { + "epoch": 0.5629846992244812, + "grad_norm": 0.0627334788441658, + "learning_rate": 4.230739583128078e-06, + "loss": 0.0543, + "step": 2686 + }, + { + "epoch": 0.5631942988891218, + "grad_norm": 0.07298675179481506, + "learning_rate": 4.227385331796596e-06, + "loss": 0.0586, + "step": 2687 + }, + { + "epoch": 0.5634038985537623, + "grad_norm": 0.07162720710039139, + "learning_rate": 4.2240314366403135e-06, + "loss": 0.0575, + "step": 2688 + }, + { + "epoch": 0.5636134982184029, + "grad_norm": 0.06274513155221939, + "learning_rate": 4.220677899205376e-06, + "loss": 0.0544, + "step": 2689 + }, + { + "epoch": 0.5638230978830434, + "grad_norm": 0.049171797931194305, + "learning_rate": 4.21732472103777e-06, + "loss": 0.0564, + "step": 2690 + }, + { + "epoch": 0.5640326975476839, + "grad_norm": 0.04283327981829643, + "learning_rate": 4.213971903683301e-06, + "loss": 0.0541, + "step": 2691 + }, + { + "epoch": 0.5642422972123244, + "grad_norm": 0.05370490252971649, + "learning_rate": 4.210619448687622e-06, + "loss": 0.0539, + "step": 2692 + }, + { + "epoch": 0.564451896876965, + "grad_norm": 0.05934003368020058, + "learning_rate": 4.2072673575962125e-06, + "loss": 0.0545, + "step": 2693 + }, + { + "epoch": 0.5646614965416056, + "grad_norm": 0.06390709429979324, + "learning_rate": 4.203915631954389e-06, + "loss": 0.0548, + "step": 2694 + }, + { + "epoch": 0.5648710962062461, + "grad_norm": 0.0646679550409317, + "learning_rate": 4.200564273307292e-06, + "loss": 0.0554, + "step": 2695 + }, + { + "epoch": 0.5650806958708866, + "grad_norm": 0.05779599770903587, + "learning_rate": 4.197213283199898e-06, + "loss": 0.0591, + "step": 2696 + }, + { + "epoch": 0.5652902955355271, + "grad_norm": 0.06428996473550797, + "learning_rate": 4.193862663177016e-06, + "loss": 0.0544, + "step": 2697 + }, + { + "epoch": 0.5654998952001676, + "grad_norm": 0.07036624848842621, + "learning_rate": 4.190512414783278e-06, + "loss": 0.0551, + "step": 2698 + }, + { + "epoch": 0.5657094948648083, + "grad_norm": 0.07967984676361084, + "learning_rate": 4.187162539563147e-06, + "loss": 0.0584, + "step": 2699 + }, + { + "epoch": 0.5659190945294488, + "grad_norm": 0.09749884158372879, + "learning_rate": 4.183813039060919e-06, + "loss": 0.0543, + "step": 2700 + }, + { + "epoch": 0.5661286941940893, + "grad_norm": 0.09949739277362823, + "learning_rate": 4.180463914820709e-06, + "loss": 0.0562, + "step": 2701 + }, + { + "epoch": 0.5663382938587298, + "grad_norm": 0.09305305033922195, + "learning_rate": 4.177115168386463e-06, + "loss": 0.055, + "step": 2702 + }, + { + "epoch": 0.5665478935233703, + "grad_norm": 0.07777238637208939, + "learning_rate": 4.173766801301956e-06, + "loss": 0.0574, + "step": 2703 + }, + { + "epoch": 0.5667574931880109, + "grad_norm": 0.06988883018493652, + "learning_rate": 4.17041881511078e-06, + "loss": 0.0547, + "step": 2704 + }, + { + "epoch": 0.5669670928526515, + "grad_norm": 0.07308480888605118, + "learning_rate": 4.167071211356358e-06, + "loss": 0.0539, + "step": 2705 + }, + { + "epoch": 0.567176692517292, + "grad_norm": 0.053045183420181274, + "learning_rate": 4.163723991581935e-06, + "loss": 0.057, + "step": 2706 + }, + { + "epoch": 0.5673862921819325, + "grad_norm": 0.05672500282526016, + "learning_rate": 4.160377157330579e-06, + "loss": 0.0545, + "step": 2707 + }, + { + "epoch": 0.567595891846573, + "grad_norm": 0.06733475625514984, + "learning_rate": 4.157030710145178e-06, + "loss": 0.0554, + "step": 2708 + }, + { + "epoch": 0.5678054915112136, + "grad_norm": 0.07970572263002396, + "learning_rate": 4.153684651568445e-06, + "loss": 0.0561, + "step": 2709 + }, + { + "epoch": 0.5680150911758541, + "grad_norm": 0.1028270497918129, + "learning_rate": 4.150338983142913e-06, + "loss": 0.0553, + "step": 2710 + }, + { + "epoch": 0.5682246908404947, + "grad_norm": 0.10650909692049026, + "learning_rate": 4.1469937064109305e-06, + "loss": 0.0545, + "step": 2711 + }, + { + "epoch": 0.5684342905051352, + "grad_norm": 0.09105932712554932, + "learning_rate": 4.1436488229146735e-06, + "loss": 0.0579, + "step": 2712 + }, + { + "epoch": 0.5686438901697757, + "grad_norm": 0.0559769943356514, + "learning_rate": 4.140304334196133e-06, + "loss": 0.0559, + "step": 2713 + }, + { + "epoch": 0.5688534898344163, + "grad_norm": 0.056230757385492325, + "learning_rate": 4.136960241797113e-06, + "loss": 0.0553, + "step": 2714 + }, + { + "epoch": 0.5690630894990568, + "grad_norm": 0.07475091516971588, + "learning_rate": 4.1336165472592434e-06, + "loss": 0.0565, + "step": 2715 + }, + { + "epoch": 0.5692726891636973, + "grad_norm": 0.07072797417640686, + "learning_rate": 4.130273252123965e-06, + "loss": 0.0563, + "step": 2716 + }, + { + "epoch": 0.5694822888283378, + "grad_norm": 0.05199355632066727, + "learning_rate": 4.1269303579325385e-06, + "loss": 0.0579, + "step": 2717 + }, + { + "epoch": 0.5696918884929784, + "grad_norm": 0.047952428460121155, + "learning_rate": 4.123587866226035e-06, + "loss": 0.0544, + "step": 2718 + }, + { + "epoch": 0.569901488157619, + "grad_norm": 0.059139080345630646, + "learning_rate": 4.120245778545341e-06, + "loss": 0.0557, + "step": 2719 + }, + { + "epoch": 0.5701110878222595, + "grad_norm": 0.08245202153921127, + "learning_rate": 4.116904096431163e-06, + "loss": 0.0564, + "step": 2720 + }, + { + "epoch": 0.5703206874869, + "grad_norm": 0.09880422055721283, + "learning_rate": 4.113562821424012e-06, + "loss": 0.0558, + "step": 2721 + }, + { + "epoch": 0.5705302871515405, + "grad_norm": 0.09177903831005096, + "learning_rate": 4.1102219550642154e-06, + "loss": 0.0563, + "step": 2722 + }, + { + "epoch": 0.570739886816181, + "grad_norm": 0.06406284123659134, + "learning_rate": 4.1068814988919156e-06, + "loss": 0.0579, + "step": 2723 + }, + { + "epoch": 0.5709494864808217, + "grad_norm": 0.055943313986063004, + "learning_rate": 4.103541454447057e-06, + "loss": 0.0544, + "step": 2724 + }, + { + "epoch": 0.5711590861454622, + "grad_norm": 0.07301504909992218, + "learning_rate": 4.100201823269401e-06, + "loss": 0.054, + "step": 2725 + }, + { + "epoch": 0.5713686858101027, + "grad_norm": 0.08340940624475479, + "learning_rate": 4.0968626068985205e-06, + "loss": 0.0556, + "step": 2726 + }, + { + "epoch": 0.5715782854747432, + "grad_norm": 0.08021462708711624, + "learning_rate": 4.093523806873787e-06, + "loss": 0.0534, + "step": 2727 + }, + { + "epoch": 0.5717878851393837, + "grad_norm": 0.07112640142440796, + "learning_rate": 4.090185424734392e-06, + "loss": 0.0529, + "step": 2728 + }, + { + "epoch": 0.5719974848040243, + "grad_norm": 0.06481733918190002, + "learning_rate": 4.086847462019326e-06, + "loss": 0.0575, + "step": 2729 + }, + { + "epoch": 0.5722070844686649, + "grad_norm": 0.06660385429859161, + "learning_rate": 4.0835099202673926e-06, + "loss": 0.0535, + "step": 2730 + }, + { + "epoch": 0.5724166841333054, + "grad_norm": 0.07477893680334091, + "learning_rate": 4.080172801017195e-06, + "loss": 0.0567, + "step": 2731 + }, + { + "epoch": 0.5726262837979459, + "grad_norm": 0.07714486122131348, + "learning_rate": 4.076836105807143e-06, + "loss": 0.0576, + "step": 2732 + }, + { + "epoch": 0.5728358834625865, + "grad_norm": 0.05599776282906532, + "learning_rate": 4.073499836175457e-06, + "loss": 0.0559, + "step": 2733 + }, + { + "epoch": 0.573045483127227, + "grad_norm": 0.03909073770046234, + "learning_rate": 4.0701639936601535e-06, + "loss": 0.0551, + "step": 2734 + }, + { + "epoch": 0.5732550827918675, + "grad_norm": 0.06613650918006897, + "learning_rate": 4.066828579799054e-06, + "loss": 0.0566, + "step": 2735 + }, + { + "epoch": 0.5734646824565081, + "grad_norm": 0.0671084001660347, + "learning_rate": 4.063493596129788e-06, + "loss": 0.0562, + "step": 2736 + }, + { + "epoch": 0.5736742821211486, + "grad_norm": 0.058822277933359146, + "learning_rate": 4.060159044189778e-06, + "loss": 0.054, + "step": 2737 + }, + { + "epoch": 0.5738838817857892, + "grad_norm": 0.05256880447268486, + "learning_rate": 4.0568249255162526e-06, + "loss": 0.0573, + "step": 2738 + }, + { + "epoch": 0.5740934814504297, + "grad_norm": 0.05657995492219925, + "learning_rate": 4.053491241646242e-06, + "loss": 0.0553, + "step": 2739 + }, + { + "epoch": 0.5743030811150702, + "grad_norm": 0.05349545180797577, + "learning_rate": 4.050157994116573e-06, + "loss": 0.0562, + "step": 2740 + }, + { + "epoch": 0.5745126807797107, + "grad_norm": 0.04297221451997757, + "learning_rate": 4.04682518446387e-06, + "loss": 0.0556, + "step": 2741 + }, + { + "epoch": 0.5747222804443513, + "grad_norm": 0.04681278392672539, + "learning_rate": 4.043492814224559e-06, + "loss": 0.0543, + "step": 2742 + }, + { + "epoch": 0.5749318801089919, + "grad_norm": 0.06194974109530449, + "learning_rate": 4.040160884934864e-06, + "loss": 0.0546, + "step": 2743 + }, + { + "epoch": 0.5751414797736324, + "grad_norm": 0.08011812716722488, + "learning_rate": 4.036829398130799e-06, + "loss": 0.0552, + "step": 2744 + }, + { + "epoch": 0.5753510794382729, + "grad_norm": 0.08187726140022278, + "learning_rate": 4.033498355348183e-06, + "loss": 0.0533, + "step": 2745 + }, + { + "epoch": 0.5755606791029134, + "grad_norm": 0.06547623127698898, + "learning_rate": 4.030167758122625e-06, + "loss": 0.0533, + "step": 2746 + }, + { + "epoch": 0.5757702787675539, + "grad_norm": 0.0580391101539135, + "learning_rate": 4.026837607989527e-06, + "loss": 0.055, + "step": 2747 + }, + { + "epoch": 0.5759798784321946, + "grad_norm": 0.06372876465320587, + "learning_rate": 4.0235079064840905e-06, + "loss": 0.0547, + "step": 2748 + }, + { + "epoch": 0.5761894780968351, + "grad_norm": 0.0708019807934761, + "learning_rate": 4.020178655141307e-06, + "loss": 0.0543, + "step": 2749 + }, + { + "epoch": 0.5763990777614756, + "grad_norm": 0.06585292518138885, + "learning_rate": 4.016849855495959e-06, + "loss": 0.0568, + "step": 2750 + }, + { + "epoch": 0.5766086774261161, + "grad_norm": 0.05741509795188904, + "learning_rate": 4.013521509082624e-06, + "loss": 0.0556, + "step": 2751 + }, + { + "epoch": 0.5768182770907566, + "grad_norm": 0.05286962538957596, + "learning_rate": 4.0101936174356665e-06, + "loss": 0.0564, + "step": 2752 + }, + { + "epoch": 0.5770278767553972, + "grad_norm": 0.04501480236649513, + "learning_rate": 4.0068661820892485e-06, + "loss": 0.0571, + "step": 2753 + }, + { + "epoch": 0.5772374764200378, + "grad_norm": 0.045843351632356644, + "learning_rate": 4.003539204577313e-06, + "loss": 0.0557, + "step": 2754 + }, + { + "epoch": 0.5774470760846783, + "grad_norm": 0.062015220522880554, + "learning_rate": 4.000212686433597e-06, + "loss": 0.053, + "step": 2755 + }, + { + "epoch": 0.5776566757493188, + "grad_norm": 0.06968379020690918, + "learning_rate": 3.9968866291916254e-06, + "loss": 0.0549, + "step": 2756 + }, + { + "epoch": 0.5778662754139593, + "grad_norm": 0.06381241232156754, + "learning_rate": 3.99356103438471e-06, + "loss": 0.0555, + "step": 2757 + }, + { + "epoch": 0.5780758750785999, + "grad_norm": 0.051279522478580475, + "learning_rate": 3.990235903545947e-06, + "loss": 0.0565, + "step": 2758 + }, + { + "epoch": 0.5782854747432404, + "grad_norm": 0.04932436719536781, + "learning_rate": 3.9869112382082255e-06, + "loss": 0.0562, + "step": 2759 + }, + { + "epoch": 0.578495074407881, + "grad_norm": 0.05086817964911461, + "learning_rate": 3.98358703990421e-06, + "loss": 0.0573, + "step": 2760 + }, + { + "epoch": 0.5787046740725215, + "grad_norm": 0.03960997983813286, + "learning_rate": 3.980263310166359e-06, + "loss": 0.0557, + "step": 2761 + }, + { + "epoch": 0.578914273737162, + "grad_norm": 0.04365481063723564, + "learning_rate": 3.976940050526909e-06, + "loss": 0.0548, + "step": 2762 + }, + { + "epoch": 0.5791238734018026, + "grad_norm": 0.0648135393857956, + "learning_rate": 3.973617262517886e-06, + "loss": 0.0554, + "step": 2763 + }, + { + "epoch": 0.5793334730664431, + "grad_norm": 0.0678715854883194, + "learning_rate": 3.970294947671089e-06, + "loss": 0.0566, + "step": 2764 + }, + { + "epoch": 0.5795430727310836, + "grad_norm": 0.06538737565279007, + "learning_rate": 3.9669731075181074e-06, + "loss": 0.0556, + "step": 2765 + }, + { + "epoch": 0.5797526723957241, + "grad_norm": 0.058645885437726974, + "learning_rate": 3.963651743590311e-06, + "loss": 0.0545, + "step": 2766 + }, + { + "epoch": 0.5799622720603647, + "grad_norm": 0.04971649497747421, + "learning_rate": 3.960330857418844e-06, + "loss": 0.0548, + "step": 2767 + }, + { + "epoch": 0.5801718717250053, + "grad_norm": 0.05847443640232086, + "learning_rate": 3.9570104505346345e-06, + "loss": 0.0528, + "step": 2768 + }, + { + "epoch": 0.5803814713896458, + "grad_norm": 0.0770886018872261, + "learning_rate": 3.953690524468393e-06, + "loss": 0.058, + "step": 2769 + }, + { + "epoch": 0.5805910710542863, + "grad_norm": 0.08351907879114151, + "learning_rate": 3.950371080750602e-06, + "loss": 0.0587, + "step": 2770 + }, + { + "epoch": 0.5808006707189268, + "grad_norm": 0.06995301693677902, + "learning_rate": 3.947052120911523e-06, + "loss": 0.0554, + "step": 2771 + }, + { + "epoch": 0.5810102703835673, + "grad_norm": 0.05648371949791908, + "learning_rate": 3.9437336464812e-06, + "loss": 0.0523, + "step": 2772 + }, + { + "epoch": 0.581219870048208, + "grad_norm": 0.06814776360988617, + "learning_rate": 3.940415658989445e-06, + "loss": 0.0542, + "step": 2773 + }, + { + "epoch": 0.5814294697128485, + "grad_norm": 0.07342953979969025, + "learning_rate": 3.93709815996585e-06, + "loss": 0.0562, + "step": 2774 + }, + { + "epoch": 0.581639069377489, + "grad_norm": 0.06355999410152435, + "learning_rate": 3.933781150939784e-06, + "loss": 0.0555, + "step": 2775 + }, + { + "epoch": 0.5818486690421295, + "grad_norm": 0.05689748376607895, + "learning_rate": 3.9304646334403875e-06, + "loss": 0.054, + "step": 2776 + }, + { + "epoch": 0.58205826870677, + "grad_norm": 0.06319185346364975, + "learning_rate": 3.927148608996569e-06, + "loss": 0.0562, + "step": 2777 + }, + { + "epoch": 0.5822678683714106, + "grad_norm": 0.05822164565324783, + "learning_rate": 3.923833079137022e-06, + "loss": 0.0561, + "step": 2778 + }, + { + "epoch": 0.5824774680360512, + "grad_norm": 0.058942124247550964, + "learning_rate": 3.920518045390201e-06, + "loss": 0.0585, + "step": 2779 + }, + { + "epoch": 0.5826870677006917, + "grad_norm": 0.0649891048669815, + "learning_rate": 3.9172035092843365e-06, + "loss": 0.0539, + "step": 2780 + }, + { + "epoch": 0.5828966673653322, + "grad_norm": 0.06785421073436737, + "learning_rate": 3.91388947234743e-06, + "loss": 0.0573, + "step": 2781 + }, + { + "epoch": 0.5831062670299727, + "grad_norm": 0.07181856036186218, + "learning_rate": 3.9105759361072516e-06, + "loss": 0.0548, + "step": 2782 + }, + { + "epoch": 0.5833158666946133, + "grad_norm": 0.07179451733827591, + "learning_rate": 3.907262902091338e-06, + "loss": 0.0551, + "step": 2783 + }, + { + "epoch": 0.5835254663592538, + "grad_norm": 0.0645546168088913, + "learning_rate": 3.903950371827001e-06, + "loss": 0.0542, + "step": 2784 + }, + { + "epoch": 0.5837350660238944, + "grad_norm": 0.05469752475619316, + "learning_rate": 3.900638346841314e-06, + "loss": 0.0533, + "step": 2785 + }, + { + "epoch": 0.5839446656885349, + "grad_norm": 0.04990299046039581, + "learning_rate": 3.897326828661123e-06, + "loss": 0.0538, + "step": 2786 + }, + { + "epoch": 0.5841542653531754, + "grad_norm": 0.05425114557147026, + "learning_rate": 3.894015818813034e-06, + "loss": 0.0566, + "step": 2787 + }, + { + "epoch": 0.584363865017816, + "grad_norm": 0.07104803621768951, + "learning_rate": 3.890705318823421e-06, + "loss": 0.0536, + "step": 2788 + }, + { + "epoch": 0.5845734646824565, + "grad_norm": 0.0836774930357933, + "learning_rate": 3.887395330218429e-06, + "loss": 0.055, + "step": 2789 + }, + { + "epoch": 0.584783064347097, + "grad_norm": 0.08817489445209503, + "learning_rate": 3.884085854523956e-06, + "loss": 0.0544, + "step": 2790 + }, + { + "epoch": 0.5849926640117376, + "grad_norm": 0.08014960587024689, + "learning_rate": 3.880776893265673e-06, + "loss": 0.0536, + "step": 2791 + }, + { + "epoch": 0.5852022636763781, + "grad_norm": 0.07269839197397232, + "learning_rate": 3.877468447969011e-06, + "loss": 0.0537, + "step": 2792 + }, + { + "epoch": 0.5854118633410187, + "grad_norm": 0.06357665359973907, + "learning_rate": 3.874160520159159e-06, + "loss": 0.0549, + "step": 2793 + }, + { + "epoch": 0.5856214630056592, + "grad_norm": 0.051662154495716095, + "learning_rate": 3.8708531113610735e-06, + "loss": 0.0525, + "step": 2794 + }, + { + "epoch": 0.5858310626702997, + "grad_norm": 0.040864262729883194, + "learning_rate": 3.8675462230994725e-06, + "loss": 0.0547, + "step": 2795 + }, + { + "epoch": 0.5860406623349402, + "grad_norm": 0.04362620413303375, + "learning_rate": 3.864239856898824e-06, + "loss": 0.0537, + "step": 2796 + }, + { + "epoch": 0.5862502619995807, + "grad_norm": 0.047262124717235565, + "learning_rate": 3.860934014283366e-06, + "loss": 0.054, + "step": 2797 + }, + { + "epoch": 0.5864598616642214, + "grad_norm": 0.043658629059791565, + "learning_rate": 3.85762869677709e-06, + "loss": 0.0525, + "step": 2798 + }, + { + "epoch": 0.5866694613288619, + "grad_norm": 0.040076591074466705, + "learning_rate": 3.854323905903751e-06, + "loss": 0.0557, + "step": 2799 + }, + { + "epoch": 0.5868790609935024, + "grad_norm": 0.03843994066119194, + "learning_rate": 3.851019643186851e-06, + "loss": 0.0569, + "step": 2800 + }, + { + "epoch": 0.5870886606581429, + "grad_norm": 0.04533432796597481, + "learning_rate": 3.847715910149656e-06, + "loss": 0.0561, + "step": 2801 + }, + { + "epoch": 0.5872982603227835, + "grad_norm": 0.042377371340990067, + "learning_rate": 3.8444127083151885e-06, + "loss": 0.0557, + "step": 2802 + }, + { + "epoch": 0.587507859987424, + "grad_norm": 0.03570021688938141, + "learning_rate": 3.841110039206222e-06, + "loss": 0.0541, + "step": 2803 + }, + { + "epoch": 0.5877174596520646, + "grad_norm": 0.03966277092695236, + "learning_rate": 3.837807904345286e-06, + "loss": 0.0542, + "step": 2804 + }, + { + "epoch": 0.5879270593167051, + "grad_norm": 0.041379570960998535, + "learning_rate": 3.834506305254667e-06, + "loss": 0.0551, + "step": 2805 + }, + { + "epoch": 0.5881366589813456, + "grad_norm": 0.03879513218998909, + "learning_rate": 3.831205243456397e-06, + "loss": 0.056, + "step": 2806 + }, + { + "epoch": 0.5883462586459862, + "grad_norm": 0.041930824518203735, + "learning_rate": 3.827904720472267e-06, + "loss": 0.0544, + "step": 2807 + }, + { + "epoch": 0.5885558583106267, + "grad_norm": 0.04247865080833435, + "learning_rate": 3.824604737823819e-06, + "loss": 0.0552, + "step": 2808 + }, + { + "epoch": 0.5887654579752672, + "grad_norm": 0.03654211759567261, + "learning_rate": 3.821305297032342e-06, + "loss": 0.0553, + "step": 2809 + }, + { + "epoch": 0.5889750576399078, + "grad_norm": 0.04415356367826462, + "learning_rate": 3.818006399618877e-06, + "loss": 0.0576, + "step": 2810 + }, + { + "epoch": 0.5891846573045483, + "grad_norm": 0.058614566922187805, + "learning_rate": 3.8147080471042166e-06, + "loss": 0.057, + "step": 2811 + }, + { + "epoch": 0.5893942569691889, + "grad_norm": 0.05693338066339493, + "learning_rate": 3.811410241008902e-06, + "loss": 0.053, + "step": 2812 + }, + { + "epoch": 0.5896038566338294, + "grad_norm": 0.04473234340548515, + "learning_rate": 3.808112982853217e-06, + "loss": 0.0539, + "step": 2813 + }, + { + "epoch": 0.5898134562984699, + "grad_norm": 0.04146963357925415, + "learning_rate": 3.8048162741572008e-06, + "loss": 0.056, + "step": 2814 + }, + { + "epoch": 0.5900230559631104, + "grad_norm": 0.03515983745455742, + "learning_rate": 3.801520116440635e-06, + "loss": 0.0558, + "step": 2815 + }, + { + "epoch": 0.590232655627751, + "grad_norm": 0.040382951498031616, + "learning_rate": 3.798224511223044e-06, + "loss": 0.0587, + "step": 2816 + }, + { + "epoch": 0.5904422552923916, + "grad_norm": 0.042147841304540634, + "learning_rate": 3.794929460023705e-06, + "loss": 0.0541, + "step": 2817 + }, + { + "epoch": 0.5906518549570321, + "grad_norm": 0.0410742312669754, + "learning_rate": 3.7916349643616357e-06, + "loss": 0.0568, + "step": 2818 + }, + { + "epoch": 0.5908614546216726, + "grad_norm": 0.03527839481830597, + "learning_rate": 3.788341025755595e-06, + "loss": 0.0572, + "step": 2819 + }, + { + "epoch": 0.5910710542863131, + "grad_norm": 0.04356502741575241, + "learning_rate": 3.7850476457240905e-06, + "loss": 0.0555, + "step": 2820 + }, + { + "epoch": 0.5912806539509536, + "grad_norm": 0.053171779960393906, + "learning_rate": 3.781754825785368e-06, + "loss": 0.0572, + "step": 2821 + }, + { + "epoch": 0.5914902536155943, + "grad_norm": 0.06760048121213913, + "learning_rate": 3.77846256745742e-06, + "loss": 0.0539, + "step": 2822 + }, + { + "epoch": 0.5916998532802348, + "grad_norm": 0.06500531733036041, + "learning_rate": 3.7751708722579733e-06, + "loss": 0.0529, + "step": 2823 + }, + { + "epoch": 0.5919094529448753, + "grad_norm": 0.0501357838511467, + "learning_rate": 3.771879741704499e-06, + "loss": 0.053, + "step": 2824 + }, + { + "epoch": 0.5921190526095158, + "grad_norm": 0.05668150261044502, + "learning_rate": 3.768589177314211e-06, + "loss": 0.0561, + "step": 2825 + }, + { + "epoch": 0.5923286522741563, + "grad_norm": 0.05575096979737282, + "learning_rate": 3.765299180604055e-06, + "loss": 0.0568, + "step": 2826 + }, + { + "epoch": 0.592538251938797, + "grad_norm": 0.05972772836685181, + "learning_rate": 3.7620097530907196e-06, + "loss": 0.0563, + "step": 2827 + }, + { + "epoch": 0.5927478516034375, + "grad_norm": 0.05793767794966698, + "learning_rate": 3.758720896290634e-06, + "loss": 0.0539, + "step": 2828 + }, + { + "epoch": 0.592957451268078, + "grad_norm": 0.05584365129470825, + "learning_rate": 3.755432611719954e-06, + "loss": 0.0543, + "step": 2829 + }, + { + "epoch": 0.5931670509327185, + "grad_norm": 0.06453042477369308, + "learning_rate": 3.752144900894582e-06, + "loss": 0.054, + "step": 2830 + }, + { + "epoch": 0.593376650597359, + "grad_norm": 0.05443593114614487, + "learning_rate": 3.7488577653301538e-06, + "loss": 0.0543, + "step": 2831 + }, + { + "epoch": 0.5935862502619996, + "grad_norm": 0.05260578915476799, + "learning_rate": 3.7455712065420335e-06, + "loss": 0.0575, + "step": 2832 + }, + { + "epoch": 0.5937958499266401, + "grad_norm": 0.057504259049892426, + "learning_rate": 3.7422852260453274e-06, + "loss": 0.0539, + "step": 2833 + }, + { + "epoch": 0.5940054495912807, + "grad_norm": 0.05131387338042259, + "learning_rate": 3.7389998253548698e-06, + "loss": 0.055, + "step": 2834 + }, + { + "epoch": 0.5942150492559212, + "grad_norm": 0.04155094549059868, + "learning_rate": 3.7357150059852325e-06, + "loss": 0.0557, + "step": 2835 + }, + { + "epoch": 0.5944246489205617, + "grad_norm": 0.0489707812666893, + "learning_rate": 3.732430769450714e-06, + "loss": 0.0533, + "step": 2836 + }, + { + "epoch": 0.5946342485852023, + "grad_norm": 0.06095966696739197, + "learning_rate": 3.7291471172653463e-06, + "loss": 0.0531, + "step": 2837 + }, + { + "epoch": 0.5948438482498428, + "grad_norm": 0.05945379659533501, + "learning_rate": 3.7258640509428955e-06, + "loss": 0.0565, + "step": 2838 + }, + { + "epoch": 0.5950534479144833, + "grad_norm": 0.05267561972141266, + "learning_rate": 3.7225815719968522e-06, + "loss": 0.0533, + "step": 2839 + }, + { + "epoch": 0.5952630475791238, + "grad_norm": 0.04585307091474533, + "learning_rate": 3.719299681940437e-06, + "loss": 0.0541, + "step": 2840 + }, + { + "epoch": 0.5954726472437644, + "grad_norm": 0.057470791041851044, + "learning_rate": 3.716018382286605e-06, + "loss": 0.0551, + "step": 2841 + }, + { + "epoch": 0.595682246908405, + "grad_norm": 0.07041089981794357, + "learning_rate": 3.7127376745480313e-06, + "loss": 0.0542, + "step": 2842 + }, + { + "epoch": 0.5958918465730455, + "grad_norm": 0.07431340962648392, + "learning_rate": 3.709457560237121e-06, + "loss": 0.0535, + "step": 2843 + }, + { + "epoch": 0.596101446237686, + "grad_norm": 0.06186486408114433, + "learning_rate": 3.7061780408660075e-06, + "loss": 0.054, + "step": 2844 + }, + { + "epoch": 0.5963110459023265, + "grad_norm": 0.047516003251075745, + "learning_rate": 3.7028991179465502e-06, + "loss": 0.0556, + "step": 2845 + }, + { + "epoch": 0.596520645566967, + "grad_norm": 0.04875975474715233, + "learning_rate": 3.699620792990328e-06, + "loss": 0.0547, + "step": 2846 + }, + { + "epoch": 0.5967302452316077, + "grad_norm": 0.05895498767495155, + "learning_rate": 3.696343067508651e-06, + "loss": 0.0538, + "step": 2847 + }, + { + "epoch": 0.5969398448962482, + "grad_norm": 0.05717795342206955, + "learning_rate": 3.6930659430125506e-06, + "loss": 0.0563, + "step": 2848 + }, + { + "epoch": 0.5971494445608887, + "grad_norm": 0.06923867017030716, + "learning_rate": 3.6897894210127765e-06, + "loss": 0.056, + "step": 2849 + }, + { + "epoch": 0.5973590442255292, + "grad_norm": 0.07923050224781036, + "learning_rate": 3.6865135030198084e-06, + "loss": 0.0544, + "step": 2850 + }, + { + "epoch": 0.5975686438901697, + "grad_norm": 0.07306721806526184, + "learning_rate": 3.683238190543843e-06, + "loss": 0.055, + "step": 2851 + }, + { + "epoch": 0.5977782435548104, + "grad_norm": 0.05905516818165779, + "learning_rate": 3.679963485094797e-06, + "loss": 0.0547, + "step": 2852 + }, + { + "epoch": 0.5979878432194509, + "grad_norm": 0.05046588554978371, + "learning_rate": 3.6766893881823106e-06, + "loss": 0.0556, + "step": 2853 + }, + { + "epoch": 0.5981974428840914, + "grad_norm": 0.06019177287817001, + "learning_rate": 3.673415901315743e-06, + "loss": 0.0556, + "step": 2854 + }, + { + "epoch": 0.5984070425487319, + "grad_norm": 0.06949783116579056, + "learning_rate": 3.6701430260041672e-06, + "loss": 0.0553, + "step": 2855 + }, + { + "epoch": 0.5986166422133724, + "grad_norm": 0.06795257329940796, + "learning_rate": 3.6668707637563804e-06, + "loss": 0.0538, + "step": 2856 + }, + { + "epoch": 0.598826241878013, + "grad_norm": 0.05731310322880745, + "learning_rate": 3.6635991160808943e-06, + "loss": 0.0557, + "step": 2857 + }, + { + "epoch": 0.5990358415426535, + "grad_norm": 0.04602685570716858, + "learning_rate": 3.66032808448594e-06, + "loss": 0.0542, + "step": 2858 + }, + { + "epoch": 0.5992454412072941, + "grad_norm": 0.055196985602378845, + "learning_rate": 3.6570576704794593e-06, + "loss": 0.0562, + "step": 2859 + }, + { + "epoch": 0.5994550408719346, + "grad_norm": 0.06921806186437607, + "learning_rate": 3.6537878755691124e-06, + "loss": 0.0544, + "step": 2860 + }, + { + "epoch": 0.5996646405365751, + "grad_norm": 0.0728953555226326, + "learning_rate": 3.650518701262278e-06, + "loss": 0.0534, + "step": 2861 + }, + { + "epoch": 0.5998742402012157, + "grad_norm": 0.07024196535348892, + "learning_rate": 3.6472501490660407e-06, + "loss": 0.0554, + "step": 2862 + }, + { + "epoch": 0.6000838398658562, + "grad_norm": 0.06623408198356628, + "learning_rate": 3.643982220487202e-06, + "loss": 0.0524, + "step": 2863 + }, + { + "epoch": 0.6002934395304967, + "grad_norm": 0.0592159740626812, + "learning_rate": 3.640714917032281e-06, + "loss": 0.054, + "step": 2864 + }, + { + "epoch": 0.6005030391951373, + "grad_norm": 0.04894890636205673, + "learning_rate": 3.637448240207499e-06, + "loss": 0.0521, + "step": 2865 + }, + { + "epoch": 0.6007126388597778, + "grad_norm": 0.04575192928314209, + "learning_rate": 3.634182191518796e-06, + "loss": 0.059, + "step": 2866 + }, + { + "epoch": 0.6009222385244184, + "grad_norm": 0.05822861194610596, + "learning_rate": 3.630916772471817e-06, + "loss": 0.0532, + "step": 2867 + }, + { + "epoch": 0.6011318381890589, + "grad_norm": 0.08098480850458145, + "learning_rate": 3.6276519845719237e-06, + "loss": 0.0572, + "step": 2868 + }, + { + "epoch": 0.6013414378536994, + "grad_norm": 0.10072686523199081, + "learning_rate": 3.624387829324181e-06, + "loss": 0.0538, + "step": 2869 + }, + { + "epoch": 0.6015510375183399, + "grad_norm": 0.09352651983499527, + "learning_rate": 3.62112430823336e-06, + "loss": 0.0559, + "step": 2870 + }, + { + "epoch": 0.6017606371829806, + "grad_norm": 0.06254759430885315, + "learning_rate": 3.61786142280395e-06, + "loss": 0.0529, + "step": 2871 + }, + { + "epoch": 0.6019702368476211, + "grad_norm": 0.05310570448637009, + "learning_rate": 3.6145991745401354e-06, + "loss": 0.0568, + "step": 2872 + }, + { + "epoch": 0.6021798365122616, + "grad_norm": 0.051381923258304596, + "learning_rate": 3.6113375649458126e-06, + "loss": 0.0545, + "step": 2873 + }, + { + "epoch": 0.6023894361769021, + "grad_norm": 0.04709269851446152, + "learning_rate": 3.6080765955245867e-06, + "loss": 0.0551, + "step": 2874 + }, + { + "epoch": 0.6025990358415426, + "grad_norm": 0.05449269711971283, + "learning_rate": 3.6048162677797595e-06, + "loss": 0.053, + "step": 2875 + }, + { + "epoch": 0.6028086355061832, + "grad_norm": 0.0537823885679245, + "learning_rate": 3.601556583214342e-06, + "loss": 0.0567, + "step": 2876 + }, + { + "epoch": 0.6030182351708238, + "grad_norm": 0.041623324155807495, + "learning_rate": 3.5982975433310506e-06, + "loss": 0.0548, + "step": 2877 + }, + { + "epoch": 0.6032278348354643, + "grad_norm": 0.04960298910737038, + "learning_rate": 3.5950391496323007e-06, + "loss": 0.0561, + "step": 2878 + }, + { + "epoch": 0.6034374345001048, + "grad_norm": 0.06087161973118782, + "learning_rate": 3.591781403620209e-06, + "loss": 0.0552, + "step": 2879 + }, + { + "epoch": 0.6036470341647453, + "grad_norm": 0.050851162523031235, + "learning_rate": 3.5885243067965992e-06, + "loss": 0.0517, + "step": 2880 + }, + { + "epoch": 0.6038566338293859, + "grad_norm": 0.049509234726428986, + "learning_rate": 3.585267860662992e-06, + "loss": 0.0563, + "step": 2881 + }, + { + "epoch": 0.6040662334940264, + "grad_norm": 0.04799444228410721, + "learning_rate": 3.582012066720605e-06, + "loss": 0.0555, + "step": 2882 + }, + { + "epoch": 0.604275833158667, + "grad_norm": 0.05108596384525299, + "learning_rate": 3.5787569264703614e-06, + "loss": 0.0555, + "step": 2883 + }, + { + "epoch": 0.6044854328233075, + "grad_norm": 0.05567923188209534, + "learning_rate": 3.575502441412881e-06, + "loss": 0.0538, + "step": 2884 + }, + { + "epoch": 0.604695032487948, + "grad_norm": 0.052811216562986374, + "learning_rate": 3.572248613048477e-06, + "loss": 0.0541, + "step": 2885 + }, + { + "epoch": 0.6049046321525886, + "grad_norm": 0.05271197855472565, + "learning_rate": 3.568995442877167e-06, + "loss": 0.0536, + "step": 2886 + }, + { + "epoch": 0.6051142318172291, + "grad_norm": 0.056101392954587936, + "learning_rate": 3.565742932398661e-06, + "loss": 0.0548, + "step": 2887 + }, + { + "epoch": 0.6053238314818696, + "grad_norm": 0.06310348957777023, + "learning_rate": 3.5624910831123633e-06, + "loss": 0.0554, + "step": 2888 + }, + { + "epoch": 0.6055334311465101, + "grad_norm": 0.05690898001194, + "learning_rate": 3.559239896517379e-06, + "loss": 0.056, + "step": 2889 + }, + { + "epoch": 0.6057430308111507, + "grad_norm": 0.037709228694438934, + "learning_rate": 3.5559893741125018e-06, + "loss": 0.0556, + "step": 2890 + }, + { + "epoch": 0.6059526304757913, + "grad_norm": 0.043450452387332916, + "learning_rate": 3.5527395173962255e-06, + "loss": 0.053, + "step": 2891 + }, + { + "epoch": 0.6061622301404318, + "grad_norm": 0.051693037152290344, + "learning_rate": 3.5494903278667305e-06, + "loss": 0.0546, + "step": 2892 + }, + { + "epoch": 0.6063718298050723, + "grad_norm": 0.04153525084257126, + "learning_rate": 3.5462418070218913e-06, + "loss": 0.0546, + "step": 2893 + }, + { + "epoch": 0.6065814294697128, + "grad_norm": 0.043478380888700485, + "learning_rate": 3.5429939563592795e-06, + "loss": 0.0527, + "step": 2894 + }, + { + "epoch": 0.6067910291343533, + "grad_norm": 0.053386226296424866, + "learning_rate": 3.5397467773761495e-06, + "loss": 0.0544, + "step": 2895 + }, + { + "epoch": 0.607000628798994, + "grad_norm": 0.037870582193136215, + "learning_rate": 3.536500271569452e-06, + "loss": 0.0549, + "step": 2896 + }, + { + "epoch": 0.6072102284636345, + "grad_norm": 0.043722059577703476, + "learning_rate": 3.533254440435826e-06, + "loss": 0.0551, + "step": 2897 + }, + { + "epoch": 0.607419828128275, + "grad_norm": 0.05376339703798294, + "learning_rate": 3.5300092854715985e-06, + "loss": 0.0537, + "step": 2898 + }, + { + "epoch": 0.6076294277929155, + "grad_norm": 0.04392022639513016, + "learning_rate": 3.5267648081727834e-06, + "loss": 0.057, + "step": 2899 + }, + { + "epoch": 0.607839027457556, + "grad_norm": 0.04517117887735367, + "learning_rate": 3.523521010035089e-06, + "loss": 0.0543, + "step": 2900 + }, + { + "epoch": 0.6080486271221966, + "grad_norm": 0.05903865396976471, + "learning_rate": 3.520277892553899e-06, + "loss": 0.0538, + "step": 2901 + }, + { + "epoch": 0.6082582267868372, + "grad_norm": 0.049636900424957275, + "learning_rate": 3.5170354572242936e-06, + "loss": 0.0557, + "step": 2902 + }, + { + "epoch": 0.6084678264514777, + "grad_norm": 0.037783827632665634, + "learning_rate": 3.5137937055410343e-06, + "loss": 0.0566, + "step": 2903 + }, + { + "epoch": 0.6086774261161182, + "grad_norm": 0.04359547793865204, + "learning_rate": 3.51055263899857e-06, + "loss": 0.0542, + "step": 2904 + }, + { + "epoch": 0.6088870257807587, + "grad_norm": 0.042558085173368454, + "learning_rate": 3.5073122590910285e-06, + "loss": 0.057, + "step": 2905 + }, + { + "epoch": 0.6090966254453993, + "grad_norm": 0.04583901911973953, + "learning_rate": 3.5040725673122246e-06, + "loss": 0.0591, + "step": 2906 + }, + { + "epoch": 0.6093062251100398, + "grad_norm": 0.06440989673137665, + "learning_rate": 3.500833565155658e-06, + "loss": 0.057, + "step": 2907 + }, + { + "epoch": 0.6095158247746804, + "grad_norm": 0.057344041764736176, + "learning_rate": 3.4975952541145063e-06, + "loss": 0.0544, + "step": 2908 + }, + { + "epoch": 0.6097254244393209, + "grad_norm": 0.04767520725727081, + "learning_rate": 3.4943576356816287e-06, + "loss": 0.0573, + "step": 2909 + }, + { + "epoch": 0.6099350241039614, + "grad_norm": 0.04085380956530571, + "learning_rate": 3.4911207113495703e-06, + "loss": 0.0563, + "step": 2910 + }, + { + "epoch": 0.610144623768602, + "grad_norm": 0.03516289219260216, + "learning_rate": 3.4878844826105497e-06, + "loss": 0.0583, + "step": 2911 + }, + { + "epoch": 0.6103542234332425, + "grad_norm": 0.048390522599220276, + "learning_rate": 3.4846489509564674e-06, + "loss": 0.0521, + "step": 2912 + }, + { + "epoch": 0.610563823097883, + "grad_norm": 0.05823476240038872, + "learning_rate": 3.481414117878906e-06, + "loss": 0.0571, + "step": 2913 + }, + { + "epoch": 0.6107734227625236, + "grad_norm": 0.05212031677365303, + "learning_rate": 3.47817998486912e-06, + "loss": 0.0531, + "step": 2914 + }, + { + "epoch": 0.6109830224271641, + "grad_norm": 0.054088983684778214, + "learning_rate": 3.474946553418044e-06, + "loss": 0.0541, + "step": 2915 + }, + { + "epoch": 0.6111926220918047, + "grad_norm": 0.05518088862299919, + "learning_rate": 3.4717138250162908e-06, + "loss": 0.0524, + "step": 2916 + }, + { + "epoch": 0.6114022217564452, + "grad_norm": 0.04526456817984581, + "learning_rate": 3.4684818011541484e-06, + "loss": 0.0545, + "step": 2917 + }, + { + "epoch": 0.6116118214210857, + "grad_norm": 0.036848414689302444, + "learning_rate": 3.465250483321575e-06, + "loss": 0.053, + "step": 2918 + }, + { + "epoch": 0.6118214210857262, + "grad_norm": 0.03322865068912506, + "learning_rate": 3.462019873008211e-06, + "loss": 0.0532, + "step": 2919 + }, + { + "epoch": 0.6120310207503667, + "grad_norm": 0.035156525671482086, + "learning_rate": 3.458789971703367e-06, + "loss": 0.0583, + "step": 2920 + }, + { + "epoch": 0.6122406204150074, + "grad_norm": 0.03802697733044624, + "learning_rate": 3.4555607808960232e-06, + "loss": 0.0545, + "step": 2921 + }, + { + "epoch": 0.6124502200796479, + "grad_norm": 0.0401364266872406, + "learning_rate": 3.4523323020748413e-06, + "loss": 0.0569, + "step": 2922 + }, + { + "epoch": 0.6126598197442884, + "grad_norm": 0.04033052548766136, + "learning_rate": 3.449104536728146e-06, + "loss": 0.054, + "step": 2923 + }, + { + "epoch": 0.6128694194089289, + "grad_norm": 0.0390617661178112, + "learning_rate": 3.4458774863439366e-06, + "loss": 0.0552, + "step": 2924 + }, + { + "epoch": 0.6130790190735694, + "grad_norm": 0.042450230568647385, + "learning_rate": 3.4426511524098834e-06, + "loss": 0.0538, + "step": 2925 + }, + { + "epoch": 0.6132886187382101, + "grad_norm": 0.038516972213983536, + "learning_rate": 3.4394255364133245e-06, + "loss": 0.0565, + "step": 2926 + }, + { + "epoch": 0.6134982184028506, + "grad_norm": 0.029110699892044067, + "learning_rate": 3.436200639841271e-06, + "loss": 0.0553, + "step": 2927 + }, + { + "epoch": 0.6137078180674911, + "grad_norm": 0.0337090939283371, + "learning_rate": 3.432976464180397e-06, + "loss": 0.0579, + "step": 2928 + }, + { + "epoch": 0.6139174177321316, + "grad_norm": 0.04489186406135559, + "learning_rate": 3.4297530109170463e-06, + "loss": 0.0531, + "step": 2929 + }, + { + "epoch": 0.6141270173967721, + "grad_norm": 0.04716401919722557, + "learning_rate": 3.426530281537234e-06, + "loss": 0.0538, + "step": 2930 + }, + { + "epoch": 0.6143366170614127, + "grad_norm": 0.053189124912023544, + "learning_rate": 3.423308277526633e-06, + "loss": 0.0568, + "step": 2931 + }, + { + "epoch": 0.6145462167260533, + "grad_norm": 0.05599994957447052, + "learning_rate": 3.4200870003705883e-06, + "loss": 0.056, + "step": 2932 + }, + { + "epoch": 0.6147558163906938, + "grad_norm": 0.04985819011926651, + "learning_rate": 3.41686645155411e-06, + "loss": 0.0561, + "step": 2933 + }, + { + "epoch": 0.6149654160553343, + "grad_norm": 0.04989127814769745, + "learning_rate": 3.413646632561868e-06, + "loss": 0.0537, + "step": 2934 + }, + { + "epoch": 0.6151750157199748, + "grad_norm": 0.04989814758300781, + "learning_rate": 3.410427544878198e-06, + "loss": 0.0575, + "step": 2935 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.03400817885994911, + "learning_rate": 3.4072091899871016e-06, + "loss": 0.0551, + "step": 2936 + }, + { + "epoch": 0.6155942150492559, + "grad_norm": 0.034855592995882034, + "learning_rate": 3.403991569372235e-06, + "loss": 0.0567, + "step": 2937 + }, + { + "epoch": 0.6158038147138964, + "grad_norm": 0.042559489607810974, + "learning_rate": 3.4007746845169253e-06, + "loss": 0.0539, + "step": 2938 + }, + { + "epoch": 0.616013414378537, + "grad_norm": 0.042176634073257446, + "learning_rate": 3.397558536904152e-06, + "loss": 0.0521, + "step": 2939 + }, + { + "epoch": 0.6162230140431776, + "grad_norm": 0.04478650167584419, + "learning_rate": 3.394343128016563e-06, + "loss": 0.0573, + "step": 2940 + }, + { + "epoch": 0.6164326137078181, + "grad_norm": 0.04596136510372162, + "learning_rate": 3.3911284593364568e-06, + "loss": 0.056, + "step": 2941 + }, + { + "epoch": 0.6166422133724586, + "grad_norm": 0.044569674879312515, + "learning_rate": 3.387914532345796e-06, + "loss": 0.0557, + "step": 2942 + }, + { + "epoch": 0.6168518130370991, + "grad_norm": 0.03586115688085556, + "learning_rate": 3.3847013485262037e-06, + "loss": 0.0556, + "step": 2943 + }, + { + "epoch": 0.6170614127017396, + "grad_norm": 0.05500089004635811, + "learning_rate": 3.381488909358952e-06, + "loss": 0.0532, + "step": 2944 + }, + { + "epoch": 0.6172710123663803, + "grad_norm": 0.06483340263366699, + "learning_rate": 3.3782772163249767e-06, + "loss": 0.0562, + "step": 2945 + }, + { + "epoch": 0.6174806120310208, + "grad_norm": 0.057424839586019516, + "learning_rate": 3.375066270904869e-06, + "loss": 0.0568, + "step": 2946 + }, + { + "epoch": 0.6176902116956613, + "grad_norm": 0.0566512756049633, + "learning_rate": 3.3718560745788724e-06, + "loss": 0.0588, + "step": 2947 + }, + { + "epoch": 0.6178998113603018, + "grad_norm": 0.05279812961816788, + "learning_rate": 3.368646628826886e-06, + "loss": 0.0549, + "step": 2948 + }, + { + "epoch": 0.6181094110249423, + "grad_norm": 0.03966805338859558, + "learning_rate": 3.365437935128466e-06, + "loss": 0.0557, + "step": 2949 + }, + { + "epoch": 0.618319010689583, + "grad_norm": 0.03704335168004036, + "learning_rate": 3.3622299949628197e-06, + "loss": 0.0535, + "step": 2950 + }, + { + "epoch": 0.6185286103542235, + "grad_norm": 0.03752034902572632, + "learning_rate": 3.359022809808803e-06, + "loss": 0.055, + "step": 2951 + }, + { + "epoch": 0.618738210018864, + "grad_norm": 0.04068942740559578, + "learning_rate": 3.3558163811449317e-06, + "loss": 0.0546, + "step": 2952 + }, + { + "epoch": 0.6189478096835045, + "grad_norm": 0.03766583278775215, + "learning_rate": 3.352610710449368e-06, + "loss": 0.0553, + "step": 2953 + }, + { + "epoch": 0.619157409348145, + "grad_norm": 0.044414669275283813, + "learning_rate": 3.349405799199922e-06, + "loss": 0.0573, + "step": 2954 + }, + { + "epoch": 0.6193670090127856, + "grad_norm": 0.06159931421279907, + "learning_rate": 3.3462016488740612e-06, + "loss": 0.0544, + "step": 2955 + }, + { + "epoch": 0.6195766086774261, + "grad_norm": 0.05573199316859245, + "learning_rate": 3.3429982609488976e-06, + "loss": 0.0549, + "step": 2956 + }, + { + "epoch": 0.6197862083420667, + "grad_norm": 0.03836704418063164, + "learning_rate": 3.3397956369011897e-06, + "loss": 0.0552, + "step": 2957 + }, + { + "epoch": 0.6199958080067072, + "grad_norm": 0.05629691854119301, + "learning_rate": 3.3365937782073486e-06, + "loss": 0.0566, + "step": 2958 + }, + { + "epoch": 0.6202054076713477, + "grad_norm": 0.07051456719636917, + "learning_rate": 3.3333926863434317e-06, + "loss": 0.0524, + "step": 2959 + }, + { + "epoch": 0.6204150073359883, + "grad_norm": 0.0706133171916008, + "learning_rate": 3.3301923627851385e-06, + "loss": 0.0567, + "step": 2960 + }, + { + "epoch": 0.6206246070006288, + "grad_norm": 0.058882150799036026, + "learning_rate": 3.3269928090078204e-06, + "loss": 0.054, + "step": 2961 + }, + { + "epoch": 0.6208342066652693, + "grad_norm": 0.04647863656282425, + "learning_rate": 3.3237940264864684e-06, + "loss": 0.0558, + "step": 2962 + }, + { + "epoch": 0.6210438063299099, + "grad_norm": 0.046870891004800797, + "learning_rate": 3.320596016695724e-06, + "loss": 0.0534, + "step": 2963 + }, + { + "epoch": 0.6212534059945504, + "grad_norm": 0.043968793004751205, + "learning_rate": 3.3173987811098664e-06, + "loss": 0.0568, + "step": 2964 + }, + { + "epoch": 0.621463005659191, + "grad_norm": 0.05325428396463394, + "learning_rate": 3.3142023212028197e-06, + "loss": 0.0536, + "step": 2965 + }, + { + "epoch": 0.6216726053238315, + "grad_norm": 0.049214769154787064, + "learning_rate": 3.311006638448155e-06, + "loss": 0.0545, + "step": 2966 + }, + { + "epoch": 0.621882204988472, + "grad_norm": 0.04188595712184906, + "learning_rate": 3.307811734319078e-06, + "loss": 0.0546, + "step": 2967 + }, + { + "epoch": 0.6220918046531125, + "grad_norm": 0.040931567549705505, + "learning_rate": 3.304617610288439e-06, + "loss": 0.0557, + "step": 2968 + }, + { + "epoch": 0.622301404317753, + "grad_norm": 0.03652331605553627, + "learning_rate": 3.3014242678287327e-06, + "loss": 0.054, + "step": 2969 + }, + { + "epoch": 0.6225110039823937, + "grad_norm": 0.0368216410279274, + "learning_rate": 3.298231708412083e-06, + "loss": 0.0558, + "step": 2970 + }, + { + "epoch": 0.6227206036470342, + "grad_norm": 0.03968511521816254, + "learning_rate": 3.295039933510262e-06, + "loss": 0.0557, + "step": 2971 + }, + { + "epoch": 0.6229302033116747, + "grad_norm": 0.04443355277180672, + "learning_rate": 3.291848944594679e-06, + "loss": 0.0524, + "step": 2972 + }, + { + "epoch": 0.6231398029763152, + "grad_norm": 0.05209925025701523, + "learning_rate": 3.288658743136378e-06, + "loss": 0.0553, + "step": 2973 + }, + { + "epoch": 0.6233494026409557, + "grad_norm": 0.05424797162413597, + "learning_rate": 3.2854693306060407e-06, + "loss": 0.0547, + "step": 2974 + }, + { + "epoch": 0.6235590023055964, + "grad_norm": 0.04404807463288307, + "learning_rate": 3.282280708473985e-06, + "loss": 0.0548, + "step": 2975 + }, + { + "epoch": 0.6237686019702369, + "grad_norm": 0.03802521154284477, + "learning_rate": 3.2790928782101674e-06, + "loss": 0.0596, + "step": 2976 + }, + { + "epoch": 0.6239782016348774, + "grad_norm": 0.045362699776887894, + "learning_rate": 3.2759058412841742e-06, + "loss": 0.0551, + "step": 2977 + }, + { + "epoch": 0.6241878012995179, + "grad_norm": 0.05245572328567505, + "learning_rate": 3.2727195991652295e-06, + "loss": 0.0543, + "step": 2978 + }, + { + "epoch": 0.6243974009641584, + "grad_norm": 0.056894753128290176, + "learning_rate": 3.2695341533221926e-06, + "loss": 0.0573, + "step": 2979 + }, + { + "epoch": 0.624607000628799, + "grad_norm": 0.05126248672604561, + "learning_rate": 3.2663495052235505e-06, + "loss": 0.0536, + "step": 2980 + }, + { + "epoch": 0.6248166002934396, + "grad_norm": 0.05021638423204422, + "learning_rate": 3.263165656337426e-06, + "loss": 0.0562, + "step": 2981 + }, + { + "epoch": 0.6250261999580801, + "grad_norm": 0.04641691595315933, + "learning_rate": 3.2599826081315744e-06, + "loss": 0.0555, + "step": 2982 + }, + { + "epoch": 0.6252357996227206, + "grad_norm": 0.04853666201233864, + "learning_rate": 3.2568003620733778e-06, + "loss": 0.0559, + "step": 2983 + }, + { + "epoch": 0.6254453992873611, + "grad_norm": 0.05443096533417702, + "learning_rate": 3.2536189196298518e-06, + "loss": 0.0531, + "step": 2984 + }, + { + "epoch": 0.6256549989520017, + "grad_norm": 0.048564642667770386, + "learning_rate": 3.250438282267642e-06, + "loss": 0.0532, + "step": 2985 + }, + { + "epoch": 0.6258645986166422, + "grad_norm": 0.04952483996748924, + "learning_rate": 3.247258451453022e-06, + "loss": 0.0535, + "step": 2986 + }, + { + "epoch": 0.6260741982812827, + "grad_norm": 0.038559552282094955, + "learning_rate": 3.2440794286518896e-06, + "loss": 0.0568, + "step": 2987 + }, + { + "epoch": 0.6262837979459233, + "grad_norm": 0.0397375226020813, + "learning_rate": 3.2409012153297762e-06, + "loss": 0.0548, + "step": 2988 + }, + { + "epoch": 0.6264933976105638, + "grad_norm": 0.050432708114385605, + "learning_rate": 3.2377238129518392e-06, + "loss": 0.0539, + "step": 2989 + }, + { + "epoch": 0.6267029972752044, + "grad_norm": 0.04017667844891548, + "learning_rate": 3.2345472229828556e-06, + "loss": 0.0548, + "step": 2990 + }, + { + "epoch": 0.6269125969398449, + "grad_norm": 0.035998113453388214, + "learning_rate": 3.231371446887237e-06, + "loss": 0.0556, + "step": 2991 + }, + { + "epoch": 0.6271221966044854, + "grad_norm": 0.044560160487890244, + "learning_rate": 3.2281964861290146e-06, + "loss": 0.0553, + "step": 2992 + }, + { + "epoch": 0.6273317962691259, + "grad_norm": 0.04030834883451462, + "learning_rate": 3.225022342171842e-06, + "loss": 0.0535, + "step": 2993 + }, + { + "epoch": 0.6275413959337665, + "grad_norm": 0.03907003253698349, + "learning_rate": 3.2218490164790015e-06, + "loss": 0.0563, + "step": 2994 + }, + { + "epoch": 0.6277509955984071, + "grad_norm": 0.037859588861465454, + "learning_rate": 3.2186765105133955e-06, + "loss": 0.0553, + "step": 2995 + }, + { + "epoch": 0.6279605952630476, + "grad_norm": 0.04589460417628288, + "learning_rate": 3.215504825737549e-06, + "loss": 0.0531, + "step": 2996 + }, + { + "epoch": 0.6281701949276881, + "grad_norm": 0.05734090879559517, + "learning_rate": 3.2123339636136065e-06, + "loss": 0.0535, + "step": 2997 + }, + { + "epoch": 0.6283797945923286, + "grad_norm": 0.04286714643239975, + "learning_rate": 3.209163925603335e-06, + "loss": 0.0539, + "step": 2998 + }, + { + "epoch": 0.6285893942569691, + "grad_norm": 0.04124724492430687, + "learning_rate": 3.2059947131681226e-06, + "loss": 0.0566, + "step": 2999 + }, + { + "epoch": 0.6287989939216098, + "grad_norm": 0.06533816456794739, + "learning_rate": 3.202826327768974e-06, + "loss": 0.0547, + "step": 3000 + }, + { + "epoch": 0.6290085935862503, + "grad_norm": 0.08003267645835876, + "learning_rate": 3.199658770866515e-06, + "loss": 0.0532, + "step": 3001 + }, + { + "epoch": 0.6292181932508908, + "grad_norm": 0.05155742168426514, + "learning_rate": 3.19649204392099e-06, + "loss": 0.0543, + "step": 3002 + }, + { + "epoch": 0.6294277929155313, + "grad_norm": 0.0488094687461853, + "learning_rate": 3.193326148392257e-06, + "loss": 0.0571, + "step": 3003 + }, + { + "epoch": 0.6296373925801718, + "grad_norm": 0.07249227166175842, + "learning_rate": 3.1901610857397936e-06, + "loss": 0.0532, + "step": 3004 + }, + { + "epoch": 0.6298469922448124, + "grad_norm": 0.09708064049482346, + "learning_rate": 3.1869968574226966e-06, + "loss": 0.0545, + "step": 3005 + }, + { + "epoch": 0.630056591909453, + "grad_norm": 0.11073527485132217, + "learning_rate": 3.183833464899669e-06, + "loss": 0.055, + "step": 3006 + }, + { + "epoch": 0.6302661915740935, + "grad_norm": 0.10113838315010071, + "learning_rate": 3.1806709096290377e-06, + "loss": 0.0557, + "step": 3007 + }, + { + "epoch": 0.630475791238734, + "grad_norm": 0.06188422441482544, + "learning_rate": 3.1775091930687374e-06, + "loss": 0.0557, + "step": 3008 + }, + { + "epoch": 0.6306853909033746, + "grad_norm": 0.04998209327459335, + "learning_rate": 3.1743483166763234e-06, + "loss": 0.0554, + "step": 3009 + }, + { + "epoch": 0.6308949905680151, + "grad_norm": 0.11552390456199646, + "learning_rate": 3.1711882819089553e-06, + "loss": 0.0551, + "step": 3010 + }, + { + "epoch": 0.6311045902326556, + "grad_norm": 0.1522252857685089, + "learning_rate": 3.1680290902234078e-06, + "loss": 0.0541, + "step": 3011 + }, + { + "epoch": 0.6313141898972962, + "grad_norm": 0.12954400479793549, + "learning_rate": 3.164870743076072e-06, + "loss": 0.0543, + "step": 3012 + }, + { + "epoch": 0.6315237895619367, + "grad_norm": 0.06520622968673706, + "learning_rate": 3.16171324192294e-06, + "loss": 0.0547, + "step": 3013 + }, + { + "epoch": 0.6317333892265773, + "grad_norm": 0.06795618683099747, + "learning_rate": 3.1585565882196223e-06, + "loss": 0.052, + "step": 3014 + }, + { + "epoch": 0.6319429888912178, + "grad_norm": 0.12307886779308319, + "learning_rate": 3.1554007834213357e-06, + "loss": 0.0545, + "step": 3015 + }, + { + "epoch": 0.6321525885558583, + "grad_norm": 0.10787169635295868, + "learning_rate": 3.1522458289829045e-06, + "loss": 0.0564, + "step": 3016 + }, + { + "epoch": 0.6323621882204988, + "grad_norm": 0.04827128350734711, + "learning_rate": 3.1490917263587607e-06, + "loss": 0.0542, + "step": 3017 + }, + { + "epoch": 0.6325717878851393, + "grad_norm": 0.10941686481237411, + "learning_rate": 3.1459384770029476e-06, + "loss": 0.0537, + "step": 3018 + }, + { + "epoch": 0.63278138754978, + "grad_norm": 0.13342756032943726, + "learning_rate": 3.1427860823691136e-06, + "loss": 0.054, + "step": 3019 + }, + { + "epoch": 0.6329909872144205, + "grad_norm": 0.0916086882352829, + "learning_rate": 3.139634543910507e-06, + "loss": 0.0536, + "step": 3020 + }, + { + "epoch": 0.633200586879061, + "grad_norm": 0.06962735950946808, + "learning_rate": 3.13648386307999e-06, + "loss": 0.0578, + "step": 3021 + }, + { + "epoch": 0.6334101865437015, + "grad_norm": 0.11727678030729294, + "learning_rate": 3.1333340413300263e-06, + "loss": 0.054, + "step": 3022 + }, + { + "epoch": 0.633619786208342, + "grad_norm": 0.11866160482168198, + "learning_rate": 3.1301850801126797e-06, + "loss": 0.0551, + "step": 3023 + }, + { + "epoch": 0.6338293858729827, + "grad_norm": 0.06939590722322464, + "learning_rate": 3.127036980879624e-06, + "loss": 0.0589, + "step": 3024 + }, + { + "epoch": 0.6340389855376232, + "grad_norm": 0.07698585093021393, + "learning_rate": 3.123889745082132e-06, + "loss": 0.0568, + "step": 3025 + }, + { + "epoch": 0.6342485852022637, + "grad_norm": 0.10211010277271271, + "learning_rate": 3.1207433741710757e-06, + "loss": 0.0565, + "step": 3026 + }, + { + "epoch": 0.6344581848669042, + "grad_norm": 0.08808134496212006, + "learning_rate": 3.1175978695969344e-06, + "loss": 0.0556, + "step": 3027 + }, + { + "epoch": 0.6346677845315447, + "grad_norm": 0.05355629697442055, + "learning_rate": 3.1144532328097853e-06, + "loss": 0.0566, + "step": 3028 + }, + { + "epoch": 0.6348773841961853, + "grad_norm": 0.06826507300138474, + "learning_rate": 3.1113094652593023e-06, + "loss": 0.0551, + "step": 3029 + }, + { + "epoch": 0.6350869838608258, + "grad_norm": 0.07204960286617279, + "learning_rate": 3.108166568394765e-06, + "loss": 0.0561, + "step": 3030 + }, + { + "epoch": 0.6352965835254664, + "grad_norm": 0.06383077800273895, + "learning_rate": 3.105024543665045e-06, + "loss": 0.0559, + "step": 3031 + }, + { + "epoch": 0.6355061831901069, + "grad_norm": 0.050440918654203415, + "learning_rate": 3.1018833925186194e-06, + "loss": 0.0551, + "step": 3032 + }, + { + "epoch": 0.6357157828547474, + "grad_norm": 0.05051702260971069, + "learning_rate": 3.0987431164035542e-06, + "loss": 0.0548, + "step": 3033 + }, + { + "epoch": 0.635925382519388, + "grad_norm": 0.05574216693639755, + "learning_rate": 3.0956037167675164e-06, + "loss": 0.0549, + "step": 3034 + }, + { + "epoch": 0.6361349821840285, + "grad_norm": 0.047421135008335114, + "learning_rate": 3.092465195057771e-06, + "loss": 0.0559, + "step": 3035 + }, + { + "epoch": 0.636344581848669, + "grad_norm": 0.0535261407494545, + "learning_rate": 3.0893275527211742e-06, + "loss": 0.055, + "step": 3036 + }, + { + "epoch": 0.6365541815133096, + "grad_norm": 0.05517066270112991, + "learning_rate": 3.0861907912041776e-06, + "loss": 0.0564, + "step": 3037 + }, + { + "epoch": 0.6367637811779501, + "grad_norm": 0.04538208991289139, + "learning_rate": 3.083054911952831e-06, + "loss": 0.0563, + "step": 3038 + }, + { + "epoch": 0.6369733808425907, + "grad_norm": 0.054025497287511826, + "learning_rate": 3.0799199164127704e-06, + "loss": 0.0538, + "step": 3039 + }, + { + "epoch": 0.6371829805072312, + "grad_norm": 0.049201078712940216, + "learning_rate": 3.0767858060292285e-06, + "loss": 0.057, + "step": 3040 + }, + { + "epoch": 0.6373925801718717, + "grad_norm": 0.042893946170806885, + "learning_rate": 3.073652582247033e-06, + "loss": 0.0548, + "step": 3041 + }, + { + "epoch": 0.6376021798365122, + "grad_norm": 0.04716379567980766, + "learning_rate": 3.070520246510595e-06, + "loss": 0.0529, + "step": 3042 + }, + { + "epoch": 0.6378117795011528, + "grad_norm": 0.05528158321976662, + "learning_rate": 3.067388800263923e-06, + "loss": 0.055, + "step": 3043 + }, + { + "epoch": 0.6380213791657934, + "grad_norm": 0.052032019942998886, + "learning_rate": 3.064258244950612e-06, + "loss": 0.0524, + "step": 3044 + }, + { + "epoch": 0.6382309788304339, + "grad_norm": 0.041680578142404556, + "learning_rate": 3.0611285820138493e-06, + "loss": 0.0556, + "step": 3045 + }, + { + "epoch": 0.6384405784950744, + "grad_norm": 0.050715796649456024, + "learning_rate": 3.0579998128964073e-06, + "loss": 0.0546, + "step": 3046 + }, + { + "epoch": 0.6386501781597149, + "grad_norm": 0.057833991944789886, + "learning_rate": 3.0548719390406467e-06, + "loss": 0.0522, + "step": 3047 + }, + { + "epoch": 0.6388597778243554, + "grad_norm": 0.05053876340389252, + "learning_rate": 3.051744961888521e-06, + "loss": 0.0559, + "step": 3048 + }, + { + "epoch": 0.6390693774889961, + "grad_norm": 0.047344524413347244, + "learning_rate": 3.048618882881561e-06, + "loss": 0.0547, + "step": 3049 + }, + { + "epoch": 0.6392789771536366, + "grad_norm": 0.05755054950714111, + "learning_rate": 3.04549370346089e-06, + "loss": 0.0537, + "step": 3050 + }, + { + "epoch": 0.6394885768182771, + "grad_norm": 0.062311019748449326, + "learning_rate": 3.0423694250672182e-06, + "loss": 0.0551, + "step": 3051 + }, + { + "epoch": 0.6396981764829176, + "grad_norm": 0.05935823917388916, + "learning_rate": 3.0392460491408333e-06, + "loss": 0.0545, + "step": 3052 + }, + { + "epoch": 0.6399077761475581, + "grad_norm": 0.05066816136240959, + "learning_rate": 3.0361235771216114e-06, + "loss": 0.0509, + "step": 3053 + }, + { + "epoch": 0.6401173758121987, + "grad_norm": 0.06608299165964127, + "learning_rate": 3.033002010449014e-06, + "loss": 0.0565, + "step": 3054 + }, + { + "epoch": 0.6403269754768393, + "grad_norm": 0.05467569828033447, + "learning_rate": 3.0298813505620816e-06, + "loss": 0.0547, + "step": 3055 + }, + { + "epoch": 0.6405365751414798, + "grad_norm": 0.048877786844968796, + "learning_rate": 3.0267615988994357e-06, + "loss": 0.0563, + "step": 3056 + }, + { + "epoch": 0.6407461748061203, + "grad_norm": 0.053627368062734604, + "learning_rate": 3.0236427568992845e-06, + "loss": 0.0565, + "step": 3057 + }, + { + "epoch": 0.6409557744707608, + "grad_norm": 0.05301428213715553, + "learning_rate": 3.020524825999412e-06, + "loss": 0.0546, + "step": 3058 + }, + { + "epoch": 0.6411653741354014, + "grad_norm": 0.0528411827981472, + "learning_rate": 3.0174078076371815e-06, + "loss": 0.053, + "step": 3059 + }, + { + "epoch": 0.6413749738000419, + "grad_norm": 0.055189281702041626, + "learning_rate": 3.014291703249541e-06, + "loss": 0.0531, + "step": 3060 + }, + { + "epoch": 0.6415845734646825, + "grad_norm": 0.0514843687415123, + "learning_rate": 3.011176514273014e-06, + "loss": 0.0567, + "step": 3061 + }, + { + "epoch": 0.641794173129323, + "grad_norm": 0.04860663786530495, + "learning_rate": 3.008062242143699e-06, + "loss": 0.0563, + "step": 3062 + }, + { + "epoch": 0.6420037727939635, + "grad_norm": 0.05139222368597984, + "learning_rate": 3.0049488882972773e-06, + "loss": 0.0565, + "step": 3063 + }, + { + "epoch": 0.6422133724586041, + "grad_norm": 0.04456450790166855, + "learning_rate": 3.0018364541690048e-06, + "loss": 0.0534, + "step": 3064 + }, + { + "epoch": 0.6424229721232446, + "grad_norm": 0.05100685730576515, + "learning_rate": 2.9987249411937096e-06, + "loss": 0.0531, + "step": 3065 + }, + { + "epoch": 0.6426325717878851, + "grad_norm": 0.04359569400548935, + "learning_rate": 2.9956143508058023e-06, + "loss": 0.0551, + "step": 3066 + }, + { + "epoch": 0.6428421714525256, + "grad_norm": 0.049825966358184814, + "learning_rate": 2.9925046844392613e-06, + "loss": 0.0554, + "step": 3067 + }, + { + "epoch": 0.6430517711171662, + "grad_norm": 0.05026031658053398, + "learning_rate": 2.9893959435276464e-06, + "loss": 0.0543, + "step": 3068 + }, + { + "epoch": 0.6432613707818068, + "grad_norm": 0.04848875477910042, + "learning_rate": 2.9862881295040826e-06, + "loss": 0.055, + "step": 3069 + }, + { + "epoch": 0.6434709704464473, + "grad_norm": 0.0475265197455883, + "learning_rate": 2.983181243801272e-06, + "loss": 0.0545, + "step": 3070 + }, + { + "epoch": 0.6436805701110878, + "grad_norm": 0.05732462555170059, + "learning_rate": 2.9800752878514903e-06, + "loss": 0.0564, + "step": 3071 + }, + { + "epoch": 0.6438901697757283, + "grad_norm": 0.049768634140491486, + "learning_rate": 2.9769702630865814e-06, + "loss": 0.0524, + "step": 3072 + }, + { + "epoch": 0.6440997694403688, + "grad_norm": 0.0468343161046505, + "learning_rate": 2.973866170937959e-06, + "loss": 0.0559, + "step": 3073 + }, + { + "epoch": 0.6443093691050095, + "grad_norm": 0.05665425956249237, + "learning_rate": 2.9707630128366137e-06, + "loss": 0.0539, + "step": 3074 + }, + { + "epoch": 0.64451896876965, + "grad_norm": 0.04687637463212013, + "learning_rate": 2.9676607902130972e-06, + "loss": 0.0534, + "step": 3075 + }, + { + "epoch": 0.6447285684342905, + "grad_norm": 0.04909089207649231, + "learning_rate": 2.9645595044975328e-06, + "loss": 0.0555, + "step": 3076 + }, + { + "epoch": 0.644938168098931, + "grad_norm": 0.046408187597990036, + "learning_rate": 2.961459157119615e-06, + "loss": 0.0535, + "step": 3077 + }, + { + "epoch": 0.6451477677635716, + "grad_norm": 0.04547589272260666, + "learning_rate": 2.958359749508603e-06, + "loss": 0.0563, + "step": 3078 + }, + { + "epoch": 0.6453573674282121, + "grad_norm": 0.04302079603075981, + "learning_rate": 2.9552612830933216e-06, + "loss": 0.0552, + "step": 3079 + }, + { + "epoch": 0.6455669670928527, + "grad_norm": 0.0457628108561039, + "learning_rate": 2.9521637593021636e-06, + "loss": 0.0536, + "step": 3080 + }, + { + "epoch": 0.6457765667574932, + "grad_norm": 0.04492702707648277, + "learning_rate": 2.9490671795630884e-06, + "loss": 0.0543, + "step": 3081 + }, + { + "epoch": 0.6459861664221337, + "grad_norm": 0.04878842458128929, + "learning_rate": 2.9459715453036163e-06, + "loss": 0.0547, + "step": 3082 + }, + { + "epoch": 0.6461957660867743, + "grad_norm": 0.052640918642282486, + "learning_rate": 2.9428768579508347e-06, + "loss": 0.0546, + "step": 3083 + }, + { + "epoch": 0.6464053657514148, + "grad_norm": 0.05040021613240242, + "learning_rate": 2.9397831189313963e-06, + "loss": 0.0541, + "step": 3084 + }, + { + "epoch": 0.6466149654160553, + "grad_norm": 0.04280049726366997, + "learning_rate": 2.936690329671511e-06, + "loss": 0.0556, + "step": 3085 + }, + { + "epoch": 0.6468245650806959, + "grad_norm": 0.03978101909160614, + "learning_rate": 2.933598491596954e-06, + "loss": 0.0547, + "step": 3086 + }, + { + "epoch": 0.6470341647453364, + "grad_norm": 0.03757026419043541, + "learning_rate": 2.930507606133065e-06, + "loss": 0.0551, + "step": 3087 + }, + { + "epoch": 0.647243764409977, + "grad_norm": 0.039234839379787445, + "learning_rate": 2.927417674704739e-06, + "loss": 0.0582, + "step": 3088 + }, + { + "epoch": 0.6474533640746175, + "grad_norm": 0.03372887521982193, + "learning_rate": 2.924328698736434e-06, + "loss": 0.0578, + "step": 3089 + }, + { + "epoch": 0.647662963739258, + "grad_norm": 0.04041403532028198, + "learning_rate": 2.9212406796521675e-06, + "loss": 0.0554, + "step": 3090 + }, + { + "epoch": 0.6478725634038985, + "grad_norm": 0.04249253869056702, + "learning_rate": 2.9181536188755178e-06, + "loss": 0.0546, + "step": 3091 + }, + { + "epoch": 0.648082163068539, + "grad_norm": 0.04245641827583313, + "learning_rate": 2.915067517829615e-06, + "loss": 0.0551, + "step": 3092 + }, + { + "epoch": 0.6482917627331797, + "grad_norm": 0.044679149985313416, + "learning_rate": 2.9119823779371536e-06, + "loss": 0.0552, + "step": 3093 + }, + { + "epoch": 0.6485013623978202, + "grad_norm": 0.03879356011748314, + "learning_rate": 2.9088982006203835e-06, + "loss": 0.0526, + "step": 3094 + }, + { + "epoch": 0.6487109620624607, + "grad_norm": 0.04242287203669548, + "learning_rate": 2.90581498730111e-06, + "loss": 0.0546, + "step": 3095 + }, + { + "epoch": 0.6489205617271012, + "grad_norm": 0.05261940881609917, + "learning_rate": 2.902732739400689e-06, + "loss": 0.0519, + "step": 3096 + }, + { + "epoch": 0.6491301613917417, + "grad_norm": 0.054223619401454926, + "learning_rate": 2.8996514583400425e-06, + "loss": 0.0544, + "step": 3097 + }, + { + "epoch": 0.6493397610563824, + "grad_norm": 0.04815622419118881, + "learning_rate": 2.8965711455396343e-06, + "loss": 0.0534, + "step": 3098 + }, + { + "epoch": 0.6495493607210229, + "grad_norm": 0.0753355398774147, + "learning_rate": 2.893491802419492e-06, + "loss": 0.0551, + "step": 3099 + }, + { + "epoch": 0.6497589603856634, + "grad_norm": 0.07607095688581467, + "learning_rate": 2.8904134303991928e-06, + "loss": 0.0535, + "step": 3100 + }, + { + "epoch": 0.6499685600503039, + "grad_norm": 0.05531083047389984, + "learning_rate": 2.887336030897864e-06, + "loss": 0.0524, + "step": 3101 + }, + { + "epoch": 0.6501781597149444, + "grad_norm": 0.05754781886935234, + "learning_rate": 2.884259605334184e-06, + "loss": 0.0536, + "step": 3102 + }, + { + "epoch": 0.650387759379585, + "grad_norm": 0.05980090796947479, + "learning_rate": 2.8811841551263873e-06, + "loss": 0.0591, + "step": 3103 + }, + { + "epoch": 0.6505973590442256, + "grad_norm": 0.05938847362995148, + "learning_rate": 2.878109681692256e-06, + "loss": 0.0561, + "step": 3104 + }, + { + "epoch": 0.6508069587088661, + "grad_norm": 0.06025892496109009, + "learning_rate": 2.8750361864491195e-06, + "loss": 0.0583, + "step": 3105 + }, + { + "epoch": 0.6510165583735066, + "grad_norm": 0.0442025326192379, + "learning_rate": 2.871963670813861e-06, + "loss": 0.057, + "step": 3106 + }, + { + "epoch": 0.6512261580381471, + "grad_norm": 0.05523305386304855, + "learning_rate": 2.868892136202909e-06, + "loss": 0.0543, + "step": 3107 + }, + { + "epoch": 0.6514357577027877, + "grad_norm": 0.05948861315846443, + "learning_rate": 2.865821584032237e-06, + "loss": 0.0551, + "step": 3108 + }, + { + "epoch": 0.6516453573674282, + "grad_norm": 0.05912363529205322, + "learning_rate": 2.862752015717371e-06, + "loss": 0.0561, + "step": 3109 + }, + { + "epoch": 0.6518549570320687, + "grad_norm": 0.05048060417175293, + "learning_rate": 2.859683432673385e-06, + "loss": 0.056, + "step": 3110 + }, + { + "epoch": 0.6520645566967093, + "grad_norm": 0.07560472190380096, + "learning_rate": 2.856615836314889e-06, + "loss": 0.0548, + "step": 3111 + }, + { + "epoch": 0.6522741563613498, + "grad_norm": 0.07296677678823471, + "learning_rate": 2.8535492280560487e-06, + "loss": 0.055, + "step": 3112 + }, + { + "epoch": 0.6524837560259904, + "grad_norm": 0.0519208163022995, + "learning_rate": 2.850483609310567e-06, + "loss": 0.0532, + "step": 3113 + }, + { + "epoch": 0.6526933556906309, + "grad_norm": 0.05058110132813454, + "learning_rate": 2.8474189814916973e-06, + "loss": 0.0544, + "step": 3114 + }, + { + "epoch": 0.6529029553552714, + "grad_norm": 0.06586900353431702, + "learning_rate": 2.844355346012228e-06, + "loss": 0.0592, + "step": 3115 + }, + { + "epoch": 0.653112555019912, + "grad_norm": 0.056768354028463364, + "learning_rate": 2.8412927042844985e-06, + "loss": 0.0548, + "step": 3116 + }, + { + "epoch": 0.6533221546845525, + "grad_norm": 0.04887418821454048, + "learning_rate": 2.838231057720383e-06, + "loss": 0.053, + "step": 3117 + }, + { + "epoch": 0.6535317543491931, + "grad_norm": 0.06352341920137405, + "learning_rate": 2.8351704077312998e-06, + "loss": 0.0511, + "step": 3118 + }, + { + "epoch": 0.6537413540138336, + "grad_norm": 0.0803912952542305, + "learning_rate": 2.8321107557282083e-06, + "loss": 0.054, + "step": 3119 + }, + { + "epoch": 0.6539509536784741, + "grad_norm": 0.052244700491428375, + "learning_rate": 2.829052103121611e-06, + "loss": 0.0523, + "step": 3120 + }, + { + "epoch": 0.6541605533431146, + "grad_norm": 0.04686099290847778, + "learning_rate": 2.8259944513215405e-06, + "loss": 0.0567, + "step": 3121 + }, + { + "epoch": 0.6543701530077551, + "grad_norm": 0.07411143183708191, + "learning_rate": 2.8229378017375784e-06, + "loss": 0.055, + "step": 3122 + }, + { + "epoch": 0.6545797526723958, + "grad_norm": 0.06782218813896179, + "learning_rate": 2.819882155778836e-06, + "loss": 0.0541, + "step": 3123 + }, + { + "epoch": 0.6547893523370363, + "grad_norm": 0.04807867109775543, + "learning_rate": 2.816827514853968e-06, + "loss": 0.0563, + "step": 3124 + }, + { + "epoch": 0.6549989520016768, + "grad_norm": 0.047602880746126175, + "learning_rate": 2.8137738803711607e-06, + "loss": 0.0538, + "step": 3125 + }, + { + "epoch": 0.6552085516663173, + "grad_norm": 0.05647674947977066, + "learning_rate": 2.81072125373814e-06, + "loss": 0.0552, + "step": 3126 + }, + { + "epoch": 0.6554181513309578, + "grad_norm": 0.04610821232199669, + "learning_rate": 2.807669636362169e-06, + "loss": 0.0572, + "step": 3127 + }, + { + "epoch": 0.6556277509955984, + "grad_norm": 0.04513276740908623, + "learning_rate": 2.8046190296500407e-06, + "loss": 0.0562, + "step": 3128 + }, + { + "epoch": 0.655837350660239, + "grad_norm": 0.06186923012137413, + "learning_rate": 2.8015694350080813e-06, + "loss": 0.0527, + "step": 3129 + }, + { + "epoch": 0.6560469503248795, + "grad_norm": 0.04966472089290619, + "learning_rate": 2.7985208538421576e-06, + "loss": 0.0564, + "step": 3130 + }, + { + "epoch": 0.65625654998952, + "grad_norm": 0.04229920357465744, + "learning_rate": 2.7954732875576606e-06, + "loss": 0.0569, + "step": 3131 + }, + { + "epoch": 0.6564661496541605, + "grad_norm": 0.0393625944852829, + "learning_rate": 2.792426737559521e-06, + "loss": 0.0555, + "step": 3132 + }, + { + "epoch": 0.6566757493188011, + "grad_norm": 0.04377404600381851, + "learning_rate": 2.7893812052521984e-06, + "loss": 0.0534, + "step": 3133 + }, + { + "epoch": 0.6568853489834416, + "grad_norm": 0.04195486381649971, + "learning_rate": 2.7863366920396805e-06, + "loss": 0.0553, + "step": 3134 + }, + { + "epoch": 0.6570949486480822, + "grad_norm": 0.03995470330119133, + "learning_rate": 2.7832931993254865e-06, + "loss": 0.0516, + "step": 3135 + }, + { + "epoch": 0.6573045483127227, + "grad_norm": 0.041193921118974686, + "learning_rate": 2.7802507285126666e-06, + "loss": 0.0564, + "step": 3136 + }, + { + "epoch": 0.6575141479773632, + "grad_norm": 0.04172206670045853, + "learning_rate": 2.7772092810038027e-06, + "loss": 0.052, + "step": 3137 + }, + { + "epoch": 0.6577237476420038, + "grad_norm": 0.040663424879312515, + "learning_rate": 2.774168858200996e-06, + "loss": 0.0556, + "step": 3138 + }, + { + "epoch": 0.6579333473066443, + "grad_norm": 0.03617934137582779, + "learning_rate": 2.771129461505885e-06, + "loss": 0.0541, + "step": 3139 + }, + { + "epoch": 0.6581429469712848, + "grad_norm": 0.04274788871407509, + "learning_rate": 2.7680910923196293e-06, + "loss": 0.061, + "step": 3140 + }, + { + "epoch": 0.6583525466359254, + "grad_norm": 0.040159981697797775, + "learning_rate": 2.765053752042915e-06, + "loss": 0.0571, + "step": 3141 + }, + { + "epoch": 0.6585621463005659, + "grad_norm": 0.04021866247057915, + "learning_rate": 2.762017442075956e-06, + "loss": 0.0527, + "step": 3142 + }, + { + "epoch": 0.6587717459652065, + "grad_norm": 0.036748629063367844, + "learning_rate": 2.7589821638184942e-06, + "loss": 0.0545, + "step": 3143 + }, + { + "epoch": 0.658981345629847, + "grad_norm": 0.047550056129693985, + "learning_rate": 2.7559479186697868e-06, + "loss": 0.0544, + "step": 3144 + }, + { + "epoch": 0.6591909452944875, + "grad_norm": 0.05852500721812248, + "learning_rate": 2.7529147080286246e-06, + "loss": 0.0522, + "step": 3145 + }, + { + "epoch": 0.659400544959128, + "grad_norm": 0.05024390295147896, + "learning_rate": 2.7498825332933154e-06, + "loss": 0.0573, + "step": 3146 + }, + { + "epoch": 0.6596101446237687, + "grad_norm": 0.03166329488158226, + "learning_rate": 2.7468513958616895e-06, + "loss": 0.0534, + "step": 3147 + }, + { + "epoch": 0.6598197442884092, + "grad_norm": 0.03988707438111305, + "learning_rate": 2.7438212971311016e-06, + "loss": 0.0526, + "step": 3148 + }, + { + "epoch": 0.6600293439530497, + "grad_norm": 0.0417884886264801, + "learning_rate": 2.740792238498427e-06, + "loss": 0.0536, + "step": 3149 + }, + { + "epoch": 0.6602389436176902, + "grad_norm": 0.04264437034726143, + "learning_rate": 2.7377642213600652e-06, + "loss": 0.0557, + "step": 3150 + }, + { + "epoch": 0.6604485432823307, + "grad_norm": 0.039088521152734756, + "learning_rate": 2.734737247111924e-06, + "loss": 0.0562, + "step": 3151 + }, + { + "epoch": 0.6606581429469713, + "grad_norm": 0.0382147878408432, + "learning_rate": 2.7317113171494407e-06, + "loss": 0.054, + "step": 3152 + }, + { + "epoch": 0.6608677426116119, + "grad_norm": 0.05300328880548477, + "learning_rate": 2.728686432867571e-06, + "loss": 0.0522, + "step": 3153 + }, + { + "epoch": 0.6610773422762524, + "grad_norm": 0.043220195919275284, + "learning_rate": 2.7256625956607822e-06, + "loss": 0.0544, + "step": 3154 + }, + { + "epoch": 0.6612869419408929, + "grad_norm": 0.034808751195669174, + "learning_rate": 2.722639806923066e-06, + "loss": 0.0513, + "step": 3155 + }, + { + "epoch": 0.6614965416055334, + "grad_norm": 0.045243218541145325, + "learning_rate": 2.719618068047927e-06, + "loss": 0.0548, + "step": 3156 + }, + { + "epoch": 0.661706141270174, + "grad_norm": 0.0599578395485878, + "learning_rate": 2.716597380428382e-06, + "loss": 0.052, + "step": 3157 + }, + { + "epoch": 0.6619157409348145, + "grad_norm": 0.0630265325307846, + "learning_rate": 2.7135777454569703e-06, + "loss": 0.0534, + "step": 3158 + }, + { + "epoch": 0.662125340599455, + "grad_norm": 0.04426709935069084, + "learning_rate": 2.7105591645257432e-06, + "loss": 0.0565, + "step": 3159 + }, + { + "epoch": 0.6623349402640956, + "grad_norm": 0.030477387830615044, + "learning_rate": 2.7075416390262676e-06, + "loss": 0.0565, + "step": 3160 + }, + { + "epoch": 0.6625445399287361, + "grad_norm": 0.04804522171616554, + "learning_rate": 2.70452517034962e-06, + "loss": 0.0542, + "step": 3161 + }, + { + "epoch": 0.6627541395933767, + "grad_norm": 0.06287235021591187, + "learning_rate": 2.7015097598863906e-06, + "loss": 0.052, + "step": 3162 + }, + { + "epoch": 0.6629637392580172, + "grad_norm": 0.04516349360346794, + "learning_rate": 2.6984954090266856e-06, + "loss": 0.0546, + "step": 3163 + }, + { + "epoch": 0.6631733389226577, + "grad_norm": 0.03801724314689636, + "learning_rate": 2.6954821191601175e-06, + "loss": 0.0541, + "step": 3164 + }, + { + "epoch": 0.6633829385872982, + "grad_norm": 0.07248124480247498, + "learning_rate": 2.692469891675814e-06, + "loss": 0.0553, + "step": 3165 + }, + { + "epoch": 0.6635925382519388, + "grad_norm": 0.06941161304712296, + "learning_rate": 2.689458727962413e-06, + "loss": 0.0552, + "step": 3166 + }, + { + "epoch": 0.6638021379165794, + "grad_norm": 0.042739253491163254, + "learning_rate": 2.6864486294080585e-06, + "loss": 0.0544, + "step": 3167 + }, + { + "epoch": 0.6640117375812199, + "grad_norm": 0.05017360299825668, + "learning_rate": 2.683439597400403e-06, + "loss": 0.0566, + "step": 3168 + }, + { + "epoch": 0.6642213372458604, + "grad_norm": 0.07375095039606094, + "learning_rate": 2.680431633326614e-06, + "loss": 0.0569, + "step": 3169 + }, + { + "epoch": 0.6644309369105009, + "grad_norm": 0.0569414347410202, + "learning_rate": 2.677424738573359e-06, + "loss": 0.0524, + "step": 3170 + }, + { + "epoch": 0.6646405365751414, + "grad_norm": 0.041695497930049896, + "learning_rate": 2.6744189145268155e-06, + "loss": 0.0548, + "step": 3171 + }, + { + "epoch": 0.6648501362397821, + "grad_norm": 0.06951986998319626, + "learning_rate": 2.6714141625726725e-06, + "loss": 0.0538, + "step": 3172 + }, + { + "epoch": 0.6650597359044226, + "grad_norm": 0.06547250598669052, + "learning_rate": 2.6684104840961167e-06, + "loss": 0.0563, + "step": 3173 + }, + { + "epoch": 0.6652693355690631, + "grad_norm": 0.041869353502988815, + "learning_rate": 2.665407880481841e-06, + "loss": 0.0529, + "step": 3174 + }, + { + "epoch": 0.6654789352337036, + "grad_norm": 0.05186137557029724, + "learning_rate": 2.6624063531140477e-06, + "loss": 0.0539, + "step": 3175 + }, + { + "epoch": 0.6656885348983441, + "grad_norm": 0.06323316693305969, + "learning_rate": 2.659405903376442e-06, + "loss": 0.0561, + "step": 3176 + }, + { + "epoch": 0.6658981345629847, + "grad_norm": 0.05578729510307312, + "learning_rate": 2.656406532652227e-06, + "loss": 0.0543, + "step": 3177 + }, + { + "epoch": 0.6661077342276253, + "grad_norm": 0.04175778478384018, + "learning_rate": 2.6534082423241154e-06, + "loss": 0.0538, + "step": 3178 + }, + { + "epoch": 0.6663173338922658, + "grad_norm": 0.039774637669324875, + "learning_rate": 2.6504110337743166e-06, + "loss": 0.0528, + "step": 3179 + }, + { + "epoch": 0.6665269335569063, + "grad_norm": 0.04265711456537247, + "learning_rate": 2.6474149083845412e-06, + "loss": 0.0546, + "step": 3180 + }, + { + "epoch": 0.6667365332215468, + "grad_norm": 0.04271739721298218, + "learning_rate": 2.6444198675360044e-06, + "loss": 0.055, + "step": 3181 + }, + { + "epoch": 0.6669461328861874, + "grad_norm": 0.037498895078897476, + "learning_rate": 2.6414259126094192e-06, + "loss": 0.0567, + "step": 3182 + }, + { + "epoch": 0.6671557325508279, + "grad_norm": 0.040308877825737, + "learning_rate": 2.6384330449850028e-06, + "loss": 0.0524, + "step": 3183 + }, + { + "epoch": 0.6673653322154685, + "grad_norm": 0.04669109731912613, + "learning_rate": 2.6354412660424577e-06, + "loss": 0.0537, + "step": 3184 + }, + { + "epoch": 0.667574931880109, + "grad_norm": 0.03690009191632271, + "learning_rate": 2.6324505771609976e-06, + "loss": 0.0583, + "step": 3185 + }, + { + "epoch": 0.6677845315447495, + "grad_norm": 0.04083159938454628, + "learning_rate": 2.6294609797193326e-06, + "loss": 0.0584, + "step": 3186 + }, + { + "epoch": 0.6679941312093901, + "grad_norm": 0.041967883706092834, + "learning_rate": 2.6264724750956605e-06, + "loss": 0.055, + "step": 3187 + }, + { + "epoch": 0.6682037308740306, + "grad_norm": 0.034479230642318726, + "learning_rate": 2.623485064667687e-06, + "loss": 0.0565, + "step": 3188 + }, + { + "epoch": 0.6684133305386711, + "grad_norm": 0.04071040078997612, + "learning_rate": 2.6204987498126046e-06, + "loss": 0.0576, + "step": 3189 + }, + { + "epoch": 0.6686229302033116, + "grad_norm": 0.04605560004711151, + "learning_rate": 2.617513531907103e-06, + "loss": 0.0516, + "step": 3190 + }, + { + "epoch": 0.6688325298679522, + "grad_norm": 0.042507972568273544, + "learning_rate": 2.6145294123273677e-06, + "loss": 0.0521, + "step": 3191 + }, + { + "epoch": 0.6690421295325928, + "grad_norm": 0.03638184443116188, + "learning_rate": 2.6115463924490796e-06, + "loss": 0.0575, + "step": 3192 + }, + { + "epoch": 0.6692517291972333, + "grad_norm": 0.035932447761297226, + "learning_rate": 2.608564473647407e-06, + "loss": 0.0562, + "step": 3193 + }, + { + "epoch": 0.6694613288618738, + "grad_norm": 0.0350482314825058, + "learning_rate": 2.605583657297017e-06, + "loss": 0.0577, + "step": 3194 + }, + { + "epoch": 0.6696709285265143, + "grad_norm": 0.03746052831411362, + "learning_rate": 2.602603944772062e-06, + "loss": 0.0553, + "step": 3195 + }, + { + "epoch": 0.6698805281911548, + "grad_norm": 0.031886544078588486, + "learning_rate": 2.5996253374461924e-06, + "loss": 0.0555, + "step": 3196 + }, + { + "epoch": 0.6700901278557955, + "grad_norm": 0.03672702610492706, + "learning_rate": 2.5966478366925406e-06, + "loss": 0.055, + "step": 3197 + }, + { + "epoch": 0.670299727520436, + "grad_norm": 0.03667857125401497, + "learning_rate": 2.593671443883738e-06, + "loss": 0.0557, + "step": 3198 + }, + { + "epoch": 0.6705093271850765, + "grad_norm": 0.03002052754163742, + "learning_rate": 2.590696160391901e-06, + "loss": 0.0557, + "step": 3199 + }, + { + "epoch": 0.670718926849717, + "grad_norm": 0.034151580184698105, + "learning_rate": 2.587721987588635e-06, + "loss": 0.0556, + "step": 3200 + }, + { + "epoch": 0.6709285265143575, + "grad_norm": 0.037925586104393005, + "learning_rate": 2.5847489268450287e-06, + "loss": 0.0564, + "step": 3201 + }, + { + "epoch": 0.6711381261789982, + "grad_norm": 0.0316159762442112, + "learning_rate": 2.5817769795316674e-06, + "loss": 0.0528, + "step": 3202 + }, + { + "epoch": 0.6713477258436387, + "grad_norm": 0.03222265467047691, + "learning_rate": 2.578806147018614e-06, + "loss": 0.0547, + "step": 3203 + }, + { + "epoch": 0.6715573255082792, + "grad_norm": 0.0310065895318985, + "learning_rate": 2.5758364306754247e-06, + "loss": 0.0515, + "step": 3204 + }, + { + "epoch": 0.6717669251729197, + "grad_norm": 0.03238251060247421, + "learning_rate": 2.5728678318711385e-06, + "loss": 0.058, + "step": 3205 + }, + { + "epoch": 0.6719765248375602, + "grad_norm": 0.026389988139271736, + "learning_rate": 2.5699003519742783e-06, + "loss": 0.0564, + "step": 3206 + }, + { + "epoch": 0.6721861245022008, + "grad_norm": 0.028875682502985, + "learning_rate": 2.566933992352849e-06, + "loss": 0.0546, + "step": 3207 + }, + { + "epoch": 0.6723957241668413, + "grad_norm": 0.03320210054516792, + "learning_rate": 2.563968754374344e-06, + "loss": 0.0556, + "step": 3208 + }, + { + "epoch": 0.6726053238314819, + "grad_norm": 0.02969476394355297, + "learning_rate": 2.5610046394057386e-06, + "loss": 0.0555, + "step": 3209 + }, + { + "epoch": 0.6728149234961224, + "grad_norm": 0.029616905376315117, + "learning_rate": 2.5580416488134864e-06, + "loss": 0.056, + "step": 3210 + }, + { + "epoch": 0.6730245231607629, + "grad_norm": 0.023642316460609436, + "learning_rate": 2.5550797839635283e-06, + "loss": 0.0539, + "step": 3211 + }, + { + "epoch": 0.6732341228254035, + "grad_norm": 0.028511328622698784, + "learning_rate": 2.552119046221282e-06, + "loss": 0.0526, + "step": 3212 + }, + { + "epoch": 0.673443722490044, + "grad_norm": 0.030537253245711327, + "learning_rate": 2.5491594369516452e-06, + "loss": 0.0572, + "step": 3213 + }, + { + "epoch": 0.6736533221546845, + "grad_norm": 0.029667936265468597, + "learning_rate": 2.546200957518999e-06, + "loss": 0.0561, + "step": 3214 + }, + { + "epoch": 0.6738629218193251, + "grad_norm": 0.025500137358903885, + "learning_rate": 2.5432436092872036e-06, + "loss": 0.0569, + "step": 3215 + }, + { + "epoch": 0.6740725214839657, + "grad_norm": 0.024678369984030724, + "learning_rate": 2.5402873936195914e-06, + "loss": 0.0559, + "step": 3216 + }, + { + "epoch": 0.6742821211486062, + "grad_norm": 0.031020864844322205, + "learning_rate": 2.537332311878983e-06, + "loss": 0.0552, + "step": 3217 + }, + { + "epoch": 0.6744917208132467, + "grad_norm": 0.034975748509168625, + "learning_rate": 2.5343783654276644e-06, + "loss": 0.0541, + "step": 3218 + }, + { + "epoch": 0.6747013204778872, + "grad_norm": 0.03894275054335594, + "learning_rate": 2.5314255556274092e-06, + "loss": 0.0545, + "step": 3219 + }, + { + "epoch": 0.6749109201425277, + "grad_norm": 0.029810387641191483, + "learning_rate": 2.5284738838394586e-06, + "loss": 0.0543, + "step": 3220 + }, + { + "epoch": 0.6751205198071684, + "grad_norm": 0.02972550317645073, + "learning_rate": 2.5255233514245358e-06, + "loss": 0.0532, + "step": 3221 + }, + { + "epoch": 0.6753301194718089, + "grad_norm": 0.02595905028283596, + "learning_rate": 2.5225739597428334e-06, + "loss": 0.0548, + "step": 3222 + }, + { + "epoch": 0.6755397191364494, + "grad_norm": 0.025289386510849, + "learning_rate": 2.51962571015402e-06, + "loss": 0.0532, + "step": 3223 + }, + { + "epoch": 0.6757493188010899, + "grad_norm": 0.028770189732313156, + "learning_rate": 2.5166786040172387e-06, + "loss": 0.0516, + "step": 3224 + }, + { + "epoch": 0.6759589184657304, + "grad_norm": 0.024205055087804794, + "learning_rate": 2.5137326426911067e-06, + "loss": 0.052, + "step": 3225 + }, + { + "epoch": 0.676168518130371, + "grad_norm": 0.02634621225297451, + "learning_rate": 2.5107878275337084e-06, + "loss": 0.0547, + "step": 3226 + }, + { + "epoch": 0.6763781177950116, + "grad_norm": 0.031177837401628494, + "learning_rate": 2.507844159902606e-06, + "loss": 0.0581, + "step": 3227 + }, + { + "epoch": 0.6765877174596521, + "grad_norm": 0.032176099717617035, + "learning_rate": 2.5049016411548273e-06, + "loss": 0.0547, + "step": 3228 + }, + { + "epoch": 0.6767973171242926, + "grad_norm": 0.03147950395941734, + "learning_rate": 2.501960272646875e-06, + "loss": 0.0561, + "step": 3229 + }, + { + "epoch": 0.6770069167889331, + "grad_norm": 0.02897576615214348, + "learning_rate": 2.499020055734716e-06, + "loss": 0.0564, + "step": 3230 + }, + { + "epoch": 0.6772165164535737, + "grad_norm": 0.03353496268391609, + "learning_rate": 2.496080991773792e-06, + "loss": 0.0531, + "step": 3231 + }, + { + "epoch": 0.6774261161182142, + "grad_norm": 0.03561830520629883, + "learning_rate": 2.493143082119013e-06, + "loss": 0.0565, + "step": 3232 + }, + { + "epoch": 0.6776357157828548, + "grad_norm": 0.027927443385124207, + "learning_rate": 2.490206328124752e-06, + "loss": 0.0541, + "step": 3233 + }, + { + "epoch": 0.6778453154474953, + "grad_norm": 0.037114545702934265, + "learning_rate": 2.4872707311448504e-06, + "loss": 0.0534, + "step": 3234 + }, + { + "epoch": 0.6780549151121358, + "grad_norm": 0.03275038301944733, + "learning_rate": 2.484336292532622e-06, + "loss": 0.0519, + "step": 3235 + }, + { + "epoch": 0.6782645147767764, + "grad_norm": 0.02878388576209545, + "learning_rate": 2.481403013640838e-06, + "loss": 0.0572, + "step": 3236 + }, + { + "epoch": 0.6784741144414169, + "grad_norm": 0.02609802968800068, + "learning_rate": 2.478470895821742e-06, + "loss": 0.052, + "step": 3237 + }, + { + "epoch": 0.6786837141060574, + "grad_norm": 0.024750111624598503, + "learning_rate": 2.475539940427041e-06, + "loss": 0.0553, + "step": 3238 + }, + { + "epoch": 0.678893313770698, + "grad_norm": 0.02313164807856083, + "learning_rate": 2.472610148807903e-06, + "loss": 0.0561, + "step": 3239 + }, + { + "epoch": 0.6791029134353385, + "grad_norm": 0.02588365226984024, + "learning_rate": 2.469681522314959e-06, + "loss": 0.0566, + "step": 3240 + }, + { + "epoch": 0.6793125130999791, + "grad_norm": 0.02386978268623352, + "learning_rate": 2.4667540622983083e-06, + "loss": 0.053, + "step": 3241 + }, + { + "epoch": 0.6795221127646196, + "grad_norm": 0.025168968364596367, + "learning_rate": 2.4638277701075103e-06, + "loss": 0.0536, + "step": 3242 + }, + { + "epoch": 0.6797317124292601, + "grad_norm": 0.02500263787806034, + "learning_rate": 2.460902647091582e-06, + "loss": 0.0524, + "step": 3243 + }, + { + "epoch": 0.6799413120939006, + "grad_norm": 0.02609972469508648, + "learning_rate": 2.4579786945990073e-06, + "loss": 0.0566, + "step": 3244 + }, + { + "epoch": 0.6801509117585411, + "grad_norm": 0.03376591205596924, + "learning_rate": 2.4550559139777264e-06, + "loss": 0.0542, + "step": 3245 + }, + { + "epoch": 0.6803605114231818, + "grad_norm": 0.03760766610503197, + "learning_rate": 2.452134306575139e-06, + "loss": 0.0535, + "step": 3246 + }, + { + "epoch": 0.6805701110878223, + "grad_norm": 0.029737234115600586, + "learning_rate": 2.4492138737381066e-06, + "loss": 0.0522, + "step": 3247 + }, + { + "epoch": 0.6807797107524628, + "grad_norm": 0.030677396804094315, + "learning_rate": 2.446294616812951e-06, + "loss": 0.0523, + "step": 3248 + }, + { + "epoch": 0.6809893104171033, + "grad_norm": 0.05189737677574158, + "learning_rate": 2.443376537145444e-06, + "loss": 0.0541, + "step": 3249 + }, + { + "epoch": 0.6811989100817438, + "grad_norm": 0.0667051374912262, + "learning_rate": 2.4404596360808255e-06, + "loss": 0.056, + "step": 3250 + }, + { + "epoch": 0.6814085097463844, + "grad_norm": 0.05018071457743645, + "learning_rate": 2.437543914963782e-06, + "loss": 0.0542, + "step": 3251 + }, + { + "epoch": 0.681618109411025, + "grad_norm": 0.02675667405128479, + "learning_rate": 2.4346293751384597e-06, + "loss": 0.0551, + "step": 3252 + }, + { + "epoch": 0.6818277090756655, + "grad_norm": 0.04382751137018204, + "learning_rate": 2.431716017948462e-06, + "loss": 0.0544, + "step": 3253 + }, + { + "epoch": 0.682037308740306, + "grad_norm": 0.051815591752529144, + "learning_rate": 2.428803844736848e-06, + "loss": 0.0546, + "step": 3254 + }, + { + "epoch": 0.6822469084049465, + "grad_norm": 0.034245654940605164, + "learning_rate": 2.4258928568461303e-06, + "loss": 0.0518, + "step": 3255 + }, + { + "epoch": 0.6824565080695871, + "grad_norm": 0.029311811551451683, + "learning_rate": 2.422983055618267e-06, + "loss": 0.0555, + "step": 3256 + }, + { + "epoch": 0.6826661077342276, + "grad_norm": 0.03641844168305397, + "learning_rate": 2.4200744423946803e-06, + "loss": 0.0532, + "step": 3257 + }, + { + "epoch": 0.6828757073988682, + "grad_norm": 0.03218969702720642, + "learning_rate": 2.4171670185162415e-06, + "loss": 0.0543, + "step": 3258 + }, + { + "epoch": 0.6830853070635087, + "grad_norm": 0.024815967306494713, + "learning_rate": 2.4142607853232687e-06, + "loss": 0.0584, + "step": 3259 + }, + { + "epoch": 0.6832949067281492, + "grad_norm": 0.025883564725518227, + "learning_rate": 2.4113557441555384e-06, + "loss": 0.0544, + "step": 3260 + }, + { + "epoch": 0.6835045063927898, + "grad_norm": 0.03328178450465202, + "learning_rate": 2.4084518963522724e-06, + "loss": 0.0541, + "step": 3261 + }, + { + "epoch": 0.6837141060574303, + "grad_norm": 0.028745753690600395, + "learning_rate": 2.4055492432521428e-06, + "loss": 0.0541, + "step": 3262 + }, + { + "epoch": 0.6839237057220708, + "grad_norm": 0.023591186851263046, + "learning_rate": 2.402647786193272e-06, + "loss": 0.0531, + "step": 3263 + }, + { + "epoch": 0.6841333053867114, + "grad_norm": 0.026261072605848312, + "learning_rate": 2.3997475265132333e-06, + "loss": 0.058, + "step": 3264 + }, + { + "epoch": 0.6843429050513519, + "grad_norm": 0.023627085611224174, + "learning_rate": 2.3968484655490455e-06, + "loss": 0.0551, + "step": 3265 + }, + { + "epoch": 0.6845525047159925, + "grad_norm": 0.024966632947325706, + "learning_rate": 2.3939506046371753e-06, + "loss": 0.0562, + "step": 3266 + }, + { + "epoch": 0.684762104380633, + "grad_norm": 0.02931971289217472, + "learning_rate": 2.391053945113533e-06, + "loss": 0.0526, + "step": 3267 + }, + { + "epoch": 0.6849717040452735, + "grad_norm": 0.035215772688388824, + "learning_rate": 2.388158488313481e-06, + "loss": 0.0566, + "step": 3268 + }, + { + "epoch": 0.685181303709914, + "grad_norm": 0.02961471863090992, + "learning_rate": 2.3852642355718224e-06, + "loss": 0.0554, + "step": 3269 + }, + { + "epoch": 0.6853909033745545, + "grad_norm": 0.028823746368288994, + "learning_rate": 2.3823711882228077e-06, + "loss": 0.053, + "step": 3270 + }, + { + "epoch": 0.6856005030391952, + "grad_norm": 0.04585007578134537, + "learning_rate": 2.379479347600134e-06, + "loss": 0.0535, + "step": 3271 + }, + { + "epoch": 0.6858101027038357, + "grad_norm": 0.03799903392791748, + "learning_rate": 2.3765887150369366e-06, + "loss": 0.0529, + "step": 3272 + }, + { + "epoch": 0.6860197023684762, + "grad_norm": 0.02885265089571476, + "learning_rate": 2.373699291865794e-06, + "loss": 0.0559, + "step": 3273 + }, + { + "epoch": 0.6862293020331167, + "grad_norm": 0.03915941342711449, + "learning_rate": 2.370811079418735e-06, + "loss": 0.0528, + "step": 3274 + }, + { + "epoch": 0.6864389016977572, + "grad_norm": 0.03368467465043068, + "learning_rate": 2.3679240790272203e-06, + "loss": 0.0532, + "step": 3275 + }, + { + "epoch": 0.6866485013623979, + "grad_norm": 0.03050885535776615, + "learning_rate": 2.3650382920221578e-06, + "loss": 0.0564, + "step": 3276 + }, + { + "epoch": 0.6868581010270384, + "grad_norm": 0.03771447017788887, + "learning_rate": 2.3621537197338977e-06, + "loss": 0.0533, + "step": 3277 + }, + { + "epoch": 0.6870677006916789, + "grad_norm": 0.040800098329782486, + "learning_rate": 2.359270363492225e-06, + "loss": 0.056, + "step": 3278 + }, + { + "epoch": 0.6872773003563194, + "grad_norm": 0.03476854786276817, + "learning_rate": 2.356388224626364e-06, + "loss": 0.0529, + "step": 3279 + }, + { + "epoch": 0.6874869000209599, + "grad_norm": 0.026112914085388184, + "learning_rate": 2.3535073044649824e-06, + "loss": 0.055, + "step": 3280 + }, + { + "epoch": 0.6876964996856005, + "grad_norm": 0.03609404340386391, + "learning_rate": 2.350627604336186e-06, + "loss": 0.0527, + "step": 3281 + }, + { + "epoch": 0.687906099350241, + "grad_norm": 0.039491958916187286, + "learning_rate": 2.347749125567511e-06, + "loss": 0.054, + "step": 3282 + }, + { + "epoch": 0.6881156990148816, + "grad_norm": 0.02687055617570877, + "learning_rate": 2.344871869485941e-06, + "loss": 0.0556, + "step": 3283 + }, + { + "epoch": 0.6883252986795221, + "grad_norm": 0.03269565477967262, + "learning_rate": 2.341995837417887e-06, + "loss": 0.0552, + "step": 3284 + }, + { + "epoch": 0.6885348983441627, + "grad_norm": 0.037071727216243744, + "learning_rate": 2.3391210306891977e-06, + "loss": 0.0513, + "step": 3285 + }, + { + "epoch": 0.6887444980088032, + "grad_norm": 0.03246400132775307, + "learning_rate": 2.336247450625161e-06, + "loss": 0.0536, + "step": 3286 + }, + { + "epoch": 0.6889540976734437, + "grad_norm": 0.02836901880800724, + "learning_rate": 2.333375098550496e-06, + "loss": 0.0533, + "step": 3287 + }, + { + "epoch": 0.6891636973380842, + "grad_norm": 0.04530966654419899, + "learning_rate": 2.330503975789361e-06, + "loss": 0.0553, + "step": 3288 + }, + { + "epoch": 0.6893732970027248, + "grad_norm": 0.05676478147506714, + "learning_rate": 2.3276340836653342e-06, + "loss": 0.0544, + "step": 3289 + }, + { + "epoch": 0.6895828966673654, + "grad_norm": 0.03974361717700958, + "learning_rate": 2.3247654235014403e-06, + "loss": 0.056, + "step": 3290 + }, + { + "epoch": 0.6897924963320059, + "grad_norm": 0.022683048620820045, + "learning_rate": 2.321897996620132e-06, + "loss": 0.0571, + "step": 3291 + }, + { + "epoch": 0.6900020959966464, + "grad_norm": 0.03632507100701332, + "learning_rate": 2.3190318043432892e-06, + "loss": 0.053, + "step": 3292 + }, + { + "epoch": 0.6902116956612869, + "grad_norm": 0.029882922768592834, + "learning_rate": 2.3161668479922293e-06, + "loss": 0.0551, + "step": 3293 + }, + { + "epoch": 0.6904212953259274, + "grad_norm": 0.0291314534842968, + "learning_rate": 2.3133031288876955e-06, + "loss": 0.0527, + "step": 3294 + }, + { + "epoch": 0.6906308949905681, + "grad_norm": 0.03613777831196785, + "learning_rate": 2.3104406483498593e-06, + "loss": 0.0545, + "step": 3295 + }, + { + "epoch": 0.6908404946552086, + "grad_norm": 0.04006670415401459, + "learning_rate": 2.3075794076983254e-06, + "loss": 0.0551, + "step": 3296 + }, + { + "epoch": 0.6910500943198491, + "grad_norm": 0.031675100326538086, + "learning_rate": 2.304719408252127e-06, + "loss": 0.0566, + "step": 3297 + }, + { + "epoch": 0.6912596939844896, + "grad_norm": 0.03170279040932655, + "learning_rate": 2.30186065132972e-06, + "loss": 0.0545, + "step": 3298 + }, + { + "epoch": 0.6914692936491301, + "grad_norm": 0.04302024096250534, + "learning_rate": 2.2990031382489935e-06, + "loss": 0.055, + "step": 3299 + }, + { + "epoch": 0.6916788933137707, + "grad_norm": 0.03898413106799126, + "learning_rate": 2.296146870327258e-06, + "loss": 0.0566, + "step": 3300 + }, + { + "epoch": 0.6918884929784113, + "grad_norm": 0.021424876525998116, + "learning_rate": 2.293291848881255e-06, + "loss": 0.0572, + "step": 3301 + }, + { + "epoch": 0.6920980926430518, + "grad_norm": 0.03237781301140785, + "learning_rate": 2.290438075227146e-06, + "loss": 0.0543, + "step": 3302 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 0.03710688278079033, + "learning_rate": 2.2875855506805217e-06, + "loss": 0.0557, + "step": 3303 + }, + { + "epoch": 0.6925172919723328, + "grad_norm": 0.030332237482070923, + "learning_rate": 2.284734276556396e-06, + "loss": 0.056, + "step": 3304 + }, + { + "epoch": 0.6927268916369734, + "grad_norm": 0.02461421862244606, + "learning_rate": 2.281884254169206e-06, + "loss": 0.0517, + "step": 3305 + }, + { + "epoch": 0.6929364913016139, + "grad_norm": 0.02470548450946808, + "learning_rate": 2.279035484832808e-06, + "loss": 0.0545, + "step": 3306 + }, + { + "epoch": 0.6931460909662545, + "grad_norm": 0.025578899309039116, + "learning_rate": 2.276187969860488e-06, + "loss": 0.0556, + "step": 3307 + }, + { + "epoch": 0.693355690630895, + "grad_norm": 0.024182526394724846, + "learning_rate": 2.2733417105649458e-06, + "loss": 0.0549, + "step": 3308 + }, + { + "epoch": 0.6935652902955355, + "grad_norm": 0.02719123847782612, + "learning_rate": 2.270496708258309e-06, + "loss": 0.0541, + "step": 3309 + }, + { + "epoch": 0.6937748899601761, + "grad_norm": 0.03566061705350876, + "learning_rate": 2.2676529642521244e-06, + "loss": 0.0548, + "step": 3310 + }, + { + "epoch": 0.6939844896248166, + "grad_norm": 0.036818746477365494, + "learning_rate": 2.264810479857356e-06, + "loss": 0.0577, + "step": 3311 + }, + { + "epoch": 0.6941940892894571, + "grad_norm": 0.03650686517357826, + "learning_rate": 2.2619692563843865e-06, + "loss": 0.0557, + "step": 3312 + }, + { + "epoch": 0.6944036889540977, + "grad_norm": 0.02722245827317238, + "learning_rate": 2.259129295143021e-06, + "loss": 0.0572, + "step": 3313 + }, + { + "epoch": 0.6946132886187382, + "grad_norm": 0.023747239261865616, + "learning_rate": 2.2562905974424824e-06, + "loss": 0.0539, + "step": 3314 + }, + { + "epoch": 0.6948228882833788, + "grad_norm": 0.027470340952277184, + "learning_rate": 2.2534531645914078e-06, + "loss": 0.0524, + "step": 3315 + }, + { + "epoch": 0.6950324879480193, + "grad_norm": 0.02598888985812664, + "learning_rate": 2.2506169978978543e-06, + "loss": 0.0545, + "step": 3316 + }, + { + "epoch": 0.6952420876126598, + "grad_norm": 0.024863192811608315, + "learning_rate": 2.2477820986692945e-06, + "loss": 0.0519, + "step": 3317 + }, + { + "epoch": 0.6954516872773003, + "grad_norm": 0.0245378315448761, + "learning_rate": 2.2449484682126133e-06, + "loss": 0.0544, + "step": 3318 + }, + { + "epoch": 0.6956612869419408, + "grad_norm": 0.02471884898841381, + "learning_rate": 2.2421161078341163e-06, + "loss": 0.0544, + "step": 3319 + }, + { + "epoch": 0.6958708866065815, + "grad_norm": 0.02821512520313263, + "learning_rate": 2.2392850188395227e-06, + "loss": 0.0583, + "step": 3320 + }, + { + "epoch": 0.696080486271222, + "grad_norm": 0.024967767298221588, + "learning_rate": 2.23645520253396e-06, + "loss": 0.0552, + "step": 3321 + }, + { + "epoch": 0.6962900859358625, + "grad_norm": 0.022636888548731804, + "learning_rate": 2.2336266602219762e-06, + "loss": 0.0556, + "step": 3322 + }, + { + "epoch": 0.696499685600503, + "grad_norm": 0.02549467422068119, + "learning_rate": 2.230799393207526e-06, + "loss": 0.0551, + "step": 3323 + }, + { + "epoch": 0.6967092852651435, + "grad_norm": 0.022535137832164764, + "learning_rate": 2.227973402793982e-06, + "loss": 0.0537, + "step": 3324 + }, + { + "epoch": 0.6969188849297842, + "grad_norm": 0.028567228466272354, + "learning_rate": 2.225148690284122e-06, + "loss": 0.0531, + "step": 3325 + }, + { + "epoch": 0.6971284845944247, + "grad_norm": 0.025192799046635628, + "learning_rate": 2.2223252569801412e-06, + "loss": 0.0575, + "step": 3326 + }, + { + "epoch": 0.6973380842590652, + "grad_norm": 0.02626647800207138, + "learning_rate": 2.2195031041836396e-06, + "loss": 0.0538, + "step": 3327 + }, + { + "epoch": 0.6975476839237057, + "grad_norm": 0.026845086365938187, + "learning_rate": 2.216682233195628e-06, + "loss": 0.0532, + "step": 3328 + }, + { + "epoch": 0.6977572835883462, + "grad_norm": 0.027518998831510544, + "learning_rate": 2.2138626453165286e-06, + "loss": 0.0531, + "step": 3329 + }, + { + "epoch": 0.6979668832529868, + "grad_norm": 0.0274403914809227, + "learning_rate": 2.2110443418461723e-06, + "loss": 0.0507, + "step": 3330 + }, + { + "epoch": 0.6981764829176274, + "grad_norm": 0.03426744416356087, + "learning_rate": 2.208227324083794e-06, + "loss": 0.0554, + "step": 3331 + }, + { + "epoch": 0.6983860825822679, + "grad_norm": 0.029531171545386314, + "learning_rate": 2.2054115933280407e-06, + "loss": 0.0547, + "step": 3332 + }, + { + "epoch": 0.6985956822469084, + "grad_norm": 0.02566445991396904, + "learning_rate": 2.2025971508769607e-06, + "loss": 0.0586, + "step": 3333 + }, + { + "epoch": 0.6988052819115489, + "grad_norm": 0.04020659625530243, + "learning_rate": 2.199783998028015e-06, + "loss": 0.0545, + "step": 3334 + }, + { + "epoch": 0.6990148815761895, + "grad_norm": 0.03343269228935242, + "learning_rate": 2.1969721360780626e-06, + "loss": 0.0539, + "step": 3335 + }, + { + "epoch": 0.69922448124083, + "grad_norm": 0.02812224067747593, + "learning_rate": 2.1941615663233733e-06, + "loss": 0.0545, + "step": 3336 + }, + { + "epoch": 0.6994340809054705, + "grad_norm": 0.04151785001158714, + "learning_rate": 2.191352290059621e-06, + "loss": 0.055, + "step": 3337 + }, + { + "epoch": 0.6996436805701111, + "grad_norm": 0.028555648401379585, + "learning_rate": 2.1885443085818795e-06, + "loss": 0.0527, + "step": 3338 + }, + { + "epoch": 0.6998532802347516, + "grad_norm": 0.034111034125089645, + "learning_rate": 2.185737623184626e-06, + "loss": 0.0539, + "step": 3339 + }, + { + "epoch": 0.7000628798993922, + "grad_norm": 0.035869497805833817, + "learning_rate": 2.1829322351617456e-06, + "loss": 0.0548, + "step": 3340 + }, + { + "epoch": 0.7002724795640327, + "grad_norm": 0.029365766793489456, + "learning_rate": 2.1801281458065177e-06, + "loss": 0.0542, + "step": 3341 + }, + { + "epoch": 0.7004820792286732, + "grad_norm": 0.03768596425652504, + "learning_rate": 2.1773253564116288e-06, + "loss": 0.0529, + "step": 3342 + }, + { + "epoch": 0.7006916788933137, + "grad_norm": 0.03441116586327553, + "learning_rate": 2.174523868269166e-06, + "loss": 0.056, + "step": 3343 + }, + { + "epoch": 0.7009012785579543, + "grad_norm": 0.03129161149263382, + "learning_rate": 2.171723682670613e-06, + "loss": 0.054, + "step": 3344 + }, + { + "epoch": 0.7011108782225949, + "grad_norm": 0.03608183562755585, + "learning_rate": 2.1689248009068532e-06, + "loss": 0.0549, + "step": 3345 + }, + { + "epoch": 0.7013204778872354, + "grad_norm": 0.026896623894572258, + "learning_rate": 2.166127224268172e-06, + "loss": 0.055, + "step": 3346 + }, + { + "epoch": 0.7015300775518759, + "grad_norm": 0.03324010595679283, + "learning_rate": 2.163330954044253e-06, + "loss": 0.0538, + "step": 3347 + }, + { + "epoch": 0.7017396772165164, + "grad_norm": 0.02656789869070053, + "learning_rate": 2.160535991524174e-06, + "loss": 0.0549, + "step": 3348 + }, + { + "epoch": 0.7019492768811569, + "grad_norm": 0.029164310544729233, + "learning_rate": 2.1577423379964147e-06, + "loss": 0.0556, + "step": 3349 + }, + { + "epoch": 0.7021588765457976, + "grad_norm": 0.02474060468375683, + "learning_rate": 2.154949994748847e-06, + "loss": 0.0542, + "step": 3350 + }, + { + "epoch": 0.7023684762104381, + "grad_norm": 0.031023385003209114, + "learning_rate": 2.152158963068739e-06, + "loss": 0.0539, + "step": 3351 + }, + { + "epoch": 0.7025780758750786, + "grad_norm": 0.02580900862812996, + "learning_rate": 2.149369244242758e-06, + "loss": 0.0536, + "step": 3352 + }, + { + "epoch": 0.7027876755397191, + "grad_norm": 0.027426833286881447, + "learning_rate": 2.1465808395569644e-06, + "loss": 0.0532, + "step": 3353 + }, + { + "epoch": 0.7029972752043597, + "grad_norm": 0.031186439096927643, + "learning_rate": 2.1437937502968093e-06, + "loss": 0.0555, + "step": 3354 + }, + { + "epoch": 0.7032068748690002, + "grad_norm": 0.031483497470617294, + "learning_rate": 2.1410079777471444e-06, + "loss": 0.0527, + "step": 3355 + }, + { + "epoch": 0.7034164745336408, + "grad_norm": 0.03852154687047005, + "learning_rate": 2.1382235231922053e-06, + "loss": 0.0592, + "step": 3356 + }, + { + "epoch": 0.7036260741982813, + "grad_norm": 0.03263459354639053, + "learning_rate": 2.13544038791563e-06, + "loss": 0.0546, + "step": 3357 + }, + { + "epoch": 0.7038356738629218, + "grad_norm": 0.028121495619416237, + "learning_rate": 2.1326585732004384e-06, + "loss": 0.0508, + "step": 3358 + }, + { + "epoch": 0.7040452735275624, + "grad_norm": 0.036633770912885666, + "learning_rate": 2.1298780803290497e-06, + "loss": 0.0558, + "step": 3359 + }, + { + "epoch": 0.7042548731922029, + "grad_norm": 0.03133592754602432, + "learning_rate": 2.127098910583273e-06, + "loss": 0.0547, + "step": 3360 + }, + { + "epoch": 0.7044644728568434, + "grad_norm": 0.026572570204734802, + "learning_rate": 2.124321065244298e-06, + "loss": 0.0551, + "step": 3361 + }, + { + "epoch": 0.704674072521484, + "grad_norm": 0.023017263039946556, + "learning_rate": 2.121544545592715e-06, + "loss": 0.0539, + "step": 3362 + }, + { + "epoch": 0.7048836721861245, + "grad_norm": 0.021083930507302284, + "learning_rate": 2.1187693529085e-06, + "loss": 0.0557, + "step": 3363 + }, + { + "epoch": 0.7050932718507651, + "grad_norm": 0.02307903952896595, + "learning_rate": 2.1159954884710133e-06, + "loss": 0.0571, + "step": 3364 + }, + { + "epoch": 0.7053028715154056, + "grad_norm": 0.02702009305357933, + "learning_rate": 2.1132229535590092e-06, + "loss": 0.054, + "step": 3365 + }, + { + "epoch": 0.7055124711800461, + "grad_norm": 0.029108747839927673, + "learning_rate": 2.110451749450624e-06, + "loss": 0.0525, + "step": 3366 + }, + { + "epoch": 0.7057220708446866, + "grad_norm": 0.024553043767809868, + "learning_rate": 2.1076818774233796e-06, + "loss": 0.0525, + "step": 3367 + }, + { + "epoch": 0.7059316705093271, + "grad_norm": 0.03205152973532677, + "learning_rate": 2.104913338754189e-06, + "loss": 0.0532, + "step": 3368 + }, + { + "epoch": 0.7061412701739678, + "grad_norm": 0.04320382699370384, + "learning_rate": 2.102146134719349e-06, + "loss": 0.0527, + "step": 3369 + }, + { + "epoch": 0.7063508698386083, + "grad_norm": 0.04006104916334152, + "learning_rate": 2.0993802665945396e-06, + "loss": 0.0532, + "step": 3370 + }, + { + "epoch": 0.7065604695032488, + "grad_norm": 0.022940274327993393, + "learning_rate": 2.0966157356548255e-06, + "loss": 0.0541, + "step": 3371 + }, + { + "epoch": 0.7067700691678893, + "grad_norm": 0.03261176869273186, + "learning_rate": 2.093852543174652e-06, + "loss": 0.057, + "step": 3372 + }, + { + "epoch": 0.7069796688325298, + "grad_norm": 0.033332716673612595, + "learning_rate": 2.0910906904278542e-06, + "loss": 0.0528, + "step": 3373 + }, + { + "epoch": 0.7071892684971705, + "grad_norm": 0.02474953606724739, + "learning_rate": 2.088330178687642e-06, + "loss": 0.0564, + "step": 3374 + }, + { + "epoch": 0.707398868161811, + "grad_norm": 0.028840798884630203, + "learning_rate": 2.085571009226613e-06, + "loss": 0.0528, + "step": 3375 + }, + { + "epoch": 0.7076084678264515, + "grad_norm": 0.03333436697721481, + "learning_rate": 2.082813183316745e-06, + "loss": 0.0579, + "step": 3376 + }, + { + "epoch": 0.707818067491092, + "grad_norm": 0.03125330060720444, + "learning_rate": 2.080056702229393e-06, + "loss": 0.0536, + "step": 3377 + }, + { + "epoch": 0.7080276671557325, + "grad_norm": 0.03265562653541565, + "learning_rate": 2.0773015672352938e-06, + "loss": 0.0561, + "step": 3378 + }, + { + "epoch": 0.7082372668203731, + "grad_norm": 0.030047666281461716, + "learning_rate": 2.0745477796045664e-06, + "loss": 0.0535, + "step": 3379 + }, + { + "epoch": 0.7084468664850136, + "grad_norm": 0.03281790018081665, + "learning_rate": 2.0717953406067033e-06, + "loss": 0.0539, + "step": 3380 + }, + { + "epoch": 0.7086564661496542, + "grad_norm": 0.031788330525159836, + "learning_rate": 2.0690442515105797e-06, + "loss": 0.0548, + "step": 3381 + }, + { + "epoch": 0.7088660658142947, + "grad_norm": 0.03671342507004738, + "learning_rate": 2.0662945135844493e-06, + "loss": 0.0547, + "step": 3382 + }, + { + "epoch": 0.7090756654789352, + "grad_norm": 0.05137075111269951, + "learning_rate": 2.0635461280959386e-06, + "loss": 0.054, + "step": 3383 + }, + { + "epoch": 0.7092852651435758, + "grad_norm": 0.025410549715161324, + "learning_rate": 2.060799096312051e-06, + "loss": 0.0519, + "step": 3384 + }, + { + "epoch": 0.7094948648082163, + "grad_norm": 0.041060563176870346, + "learning_rate": 2.0580534194991696e-06, + "loss": 0.0533, + "step": 3385 + }, + { + "epoch": 0.7097044644728568, + "grad_norm": 0.04743944853544235, + "learning_rate": 2.0553090989230527e-06, + "loss": 0.0556, + "step": 3386 + }, + { + "epoch": 0.7099140641374974, + "grad_norm": 0.02690894342958927, + "learning_rate": 2.052566135848828e-06, + "loss": 0.0542, + "step": 3387 + }, + { + "epoch": 0.7101236638021379, + "grad_norm": 0.028761066496372223, + "learning_rate": 2.0498245315410037e-06, + "loss": 0.0551, + "step": 3388 + }, + { + "epoch": 0.7103332634667785, + "grad_norm": 0.027141369879245758, + "learning_rate": 2.047084287263458e-06, + "loss": 0.0542, + "step": 3389 + }, + { + "epoch": 0.710542863131419, + "grad_norm": 0.02281133271753788, + "learning_rate": 2.0443454042794404e-06, + "loss": 0.0546, + "step": 3390 + }, + { + "epoch": 0.7107524627960595, + "grad_norm": 0.025255044922232628, + "learning_rate": 2.0416078838515773e-06, + "loss": 0.0542, + "step": 3391 + }, + { + "epoch": 0.7109620624607, + "grad_norm": 0.02403366006910801, + "learning_rate": 2.0388717272418653e-06, + "loss": 0.0549, + "step": 3392 + }, + { + "epoch": 0.7111716621253406, + "grad_norm": 0.022493207827210426, + "learning_rate": 2.036136935711674e-06, + "loss": 0.054, + "step": 3393 + }, + { + "epoch": 0.7113812617899812, + "grad_norm": 0.022795764729380608, + "learning_rate": 2.0334035105217394e-06, + "loss": 0.0582, + "step": 3394 + }, + { + "epoch": 0.7115908614546217, + "grad_norm": 0.024398164823651314, + "learning_rate": 2.0306714529321687e-06, + "loss": 0.0541, + "step": 3395 + }, + { + "epoch": 0.7118004611192622, + "grad_norm": 0.022561099380254745, + "learning_rate": 2.0279407642024427e-06, + "loss": 0.0557, + "step": 3396 + }, + { + "epoch": 0.7120100607839027, + "grad_norm": 0.02206028625369072, + "learning_rate": 2.0252114455914056e-06, + "loss": 0.0549, + "step": 3397 + }, + { + "epoch": 0.7122196604485432, + "grad_norm": 0.021681906655430794, + "learning_rate": 2.0224834983572766e-06, + "loss": 0.0546, + "step": 3398 + }, + { + "epoch": 0.7124292601131839, + "grad_norm": 0.02355903573334217, + "learning_rate": 2.0197569237576352e-06, + "loss": 0.0529, + "step": 3399 + }, + { + "epoch": 0.7126388597778244, + "grad_norm": 0.02560109831392765, + "learning_rate": 2.017031723049432e-06, + "loss": 0.0537, + "step": 3400 + }, + { + "epoch": 0.7128484594424649, + "grad_norm": 0.021792568266391754, + "learning_rate": 2.0143078974889846e-06, + "loss": 0.0549, + "step": 3401 + }, + { + "epoch": 0.7130580591071054, + "grad_norm": 0.022112544625997543, + "learning_rate": 2.011585448331978e-06, + "loss": 0.0541, + "step": 3402 + }, + { + "epoch": 0.7132676587717459, + "grad_norm": 0.02488597482442856, + "learning_rate": 2.008864376833457e-06, + "loss": 0.0531, + "step": 3403 + }, + { + "epoch": 0.7134772584363865, + "grad_norm": 0.022757617756724358, + "learning_rate": 2.0061446842478393e-06, + "loss": 0.0544, + "step": 3404 + }, + { + "epoch": 0.713686858101027, + "grad_norm": 0.028682250529527664, + "learning_rate": 2.003426371828898e-06, + "loss": 0.0521, + "step": 3405 + }, + { + "epoch": 0.7138964577656676, + "grad_norm": 0.026006808504462242, + "learning_rate": 2.000709440829779e-06, + "loss": 0.055, + "step": 3406 + }, + { + "epoch": 0.7141060574303081, + "grad_norm": 0.020952891558408737, + "learning_rate": 1.997993892502983e-06, + "loss": 0.0541, + "step": 3407 + }, + { + "epoch": 0.7143156570949486, + "grad_norm": 0.030959462746977806, + "learning_rate": 1.9952797281003784e-06, + "loss": 0.0533, + "step": 3408 + }, + { + "epoch": 0.7145252567595892, + "grad_norm": 0.022278886288404465, + "learning_rate": 1.9925669488731975e-06, + "loss": 0.0538, + "step": 3409 + }, + { + "epoch": 0.7147348564242297, + "grad_norm": 0.02709982544183731, + "learning_rate": 1.989855556072028e-06, + "loss": 0.0511, + "step": 3410 + }, + { + "epoch": 0.7149444560888703, + "grad_norm": 0.02348734438419342, + "learning_rate": 1.98714555094682e-06, + "loss": 0.0545, + "step": 3411 + }, + { + "epoch": 0.7151540557535108, + "grad_norm": 0.02444126456975937, + "learning_rate": 1.9844369347468895e-06, + "loss": 0.0546, + "step": 3412 + }, + { + "epoch": 0.7153636554181513, + "grad_norm": 0.020223500207066536, + "learning_rate": 1.981729708720903e-06, + "loss": 0.0556, + "step": 3413 + }, + { + "epoch": 0.7155732550827919, + "grad_norm": 0.028548697009682655, + "learning_rate": 1.979023874116895e-06, + "loss": 0.0556, + "step": 3414 + }, + { + "epoch": 0.7157828547474324, + "grad_norm": 0.021214155480265617, + "learning_rate": 1.976319432182254e-06, + "loss": 0.0555, + "step": 3415 + }, + { + "epoch": 0.7159924544120729, + "grad_norm": 0.022887524217367172, + "learning_rate": 1.973616384163728e-06, + "loss": 0.0523, + "step": 3416 + }, + { + "epoch": 0.7162020540767134, + "grad_norm": 0.024027662351727486, + "learning_rate": 1.9709147313074177e-06, + "loss": 0.0544, + "step": 3417 + }, + { + "epoch": 0.716411653741354, + "grad_norm": 0.028380177915096283, + "learning_rate": 1.968214474858787e-06, + "loss": 0.053, + "step": 3418 + }, + { + "epoch": 0.7166212534059946, + "grad_norm": 0.025715503841638565, + "learning_rate": 1.965515616062656e-06, + "loss": 0.0545, + "step": 3419 + }, + { + "epoch": 0.7168308530706351, + "grad_norm": 0.03151887282729149, + "learning_rate": 1.962818156163194e-06, + "loss": 0.0551, + "step": 3420 + }, + { + "epoch": 0.7170404527352756, + "grad_norm": 0.04665770381689072, + "learning_rate": 1.9601220964039324e-06, + "loss": 0.056, + "step": 3421 + }, + { + "epoch": 0.7172500523999161, + "grad_norm": 0.02261737547814846, + "learning_rate": 1.957427438027753e-06, + "loss": 0.0548, + "step": 3422 + }, + { + "epoch": 0.7174596520645568, + "grad_norm": 0.03728951886296272, + "learning_rate": 1.9547341822768906e-06, + "loss": 0.0542, + "step": 3423 + }, + { + "epoch": 0.7176692517291973, + "grad_norm": 0.032367952167987823, + "learning_rate": 1.9520423303929383e-06, + "loss": 0.0532, + "step": 3424 + }, + { + "epoch": 0.7178788513938378, + "grad_norm": 0.028157230466604233, + "learning_rate": 1.9493518836168403e-06, + "loss": 0.0551, + "step": 3425 + }, + { + "epoch": 0.7180884510584783, + "grad_norm": 0.029264342039823532, + "learning_rate": 1.946662843188888e-06, + "loss": 0.0559, + "step": 3426 + }, + { + "epoch": 0.7182980507231188, + "grad_norm": 0.02802872471511364, + "learning_rate": 1.9439752103487324e-06, + "loss": 0.0525, + "step": 3427 + }, + { + "epoch": 0.7185076503877594, + "grad_norm": 0.02839348092675209, + "learning_rate": 1.9412889863353683e-06, + "loss": 0.0532, + "step": 3428 + }, + { + "epoch": 0.7187172500524, + "grad_norm": 0.026173194870352745, + "learning_rate": 1.9386041723871485e-06, + "loss": 0.0528, + "step": 3429 + }, + { + "epoch": 0.7189268497170405, + "grad_norm": 0.028481315821409225, + "learning_rate": 1.9359207697417677e-06, + "loss": 0.0537, + "step": 3430 + }, + { + "epoch": 0.719136449381681, + "grad_norm": 0.028069039806723595, + "learning_rate": 1.9332387796362744e-06, + "loss": 0.0566, + "step": 3431 + }, + { + "epoch": 0.7193460490463215, + "grad_norm": 0.031697165220975876, + "learning_rate": 1.9305582033070714e-06, + "loss": 0.0542, + "step": 3432 + }, + { + "epoch": 0.7195556487109621, + "grad_norm": 0.029638856649398804, + "learning_rate": 1.927879041989895e-06, + "loss": 0.054, + "step": 3433 + }, + { + "epoch": 0.7197652483756026, + "grad_norm": 0.022818326950073242, + "learning_rate": 1.925201296919842e-06, + "loss": 0.0546, + "step": 3434 + }, + { + "epoch": 0.7199748480402431, + "grad_norm": 0.03616581857204437, + "learning_rate": 1.9225249693313547e-06, + "loss": 0.0542, + "step": 3435 + }, + { + "epoch": 0.7201844477048837, + "grad_norm": 0.028139004483819008, + "learning_rate": 1.919850060458215e-06, + "loss": 0.053, + "step": 3436 + }, + { + "epoch": 0.7203940473695242, + "grad_norm": 0.028990762308239937, + "learning_rate": 1.91717657153356e-06, + "loss": 0.0549, + "step": 3437 + }, + { + "epoch": 0.7206036470341648, + "grad_norm": 0.022995000705122948, + "learning_rate": 1.914504503789863e-06, + "loss": 0.0554, + "step": 3438 + }, + { + "epoch": 0.7208132466988053, + "grad_norm": 0.026573555544018745, + "learning_rate": 1.9118338584589503e-06, + "loss": 0.0556, + "step": 3439 + }, + { + "epoch": 0.7210228463634458, + "grad_norm": 0.024811234325170517, + "learning_rate": 1.909164636771986e-06, + "loss": 0.0537, + "step": 3440 + }, + { + "epoch": 0.7212324460280863, + "grad_norm": 0.02453632839024067, + "learning_rate": 1.9064968399594818e-06, + "loss": 0.0517, + "step": 3441 + }, + { + "epoch": 0.7214420456927269, + "grad_norm": 0.024129144847393036, + "learning_rate": 1.9038304692512943e-06, + "loss": 0.0534, + "step": 3442 + }, + { + "epoch": 0.7216516453573675, + "grad_norm": 0.024486854672431946, + "learning_rate": 1.9011655258766165e-06, + "loss": 0.0531, + "step": 3443 + }, + { + "epoch": 0.721861245022008, + "grad_norm": 0.022883350029587746, + "learning_rate": 1.8985020110639862e-06, + "loss": 0.0555, + "step": 3444 + }, + { + "epoch": 0.7220708446866485, + "grad_norm": 0.02430296316742897, + "learning_rate": 1.8958399260412864e-06, + "loss": 0.0543, + "step": 3445 + }, + { + "epoch": 0.722280444351289, + "grad_norm": 0.023016586899757385, + "learning_rate": 1.893179272035734e-06, + "loss": 0.0543, + "step": 3446 + }, + { + "epoch": 0.7224900440159295, + "grad_norm": 0.022667940706014633, + "learning_rate": 1.890520050273892e-06, + "loss": 0.0557, + "step": 3447 + }, + { + "epoch": 0.7226996436805702, + "grad_norm": 0.025942491367459297, + "learning_rate": 1.8878622619816629e-06, + "loss": 0.0539, + "step": 3448 + }, + { + "epoch": 0.7229092433452107, + "grad_norm": 0.027111142873764038, + "learning_rate": 1.8852059083842838e-06, + "loss": 0.0553, + "step": 3449 + }, + { + "epoch": 0.7231188430098512, + "grad_norm": 0.024661056697368622, + "learning_rate": 1.8825509907063328e-06, + "loss": 0.055, + "step": 3450 + }, + { + "epoch": 0.7233284426744917, + "grad_norm": 0.028169486671686172, + "learning_rate": 1.8798975101717275e-06, + "loss": 0.0546, + "step": 3451 + }, + { + "epoch": 0.7235380423391322, + "grad_norm": 0.02489306405186653, + "learning_rate": 1.8772454680037233e-06, + "loss": 0.0551, + "step": 3452 + }, + { + "epoch": 0.7237476420037728, + "grad_norm": 0.024785684421658516, + "learning_rate": 1.8745948654249085e-06, + "loss": 0.0548, + "step": 3453 + }, + { + "epoch": 0.7239572416684134, + "grad_norm": 0.021179838106036186, + "learning_rate": 1.871945703657213e-06, + "loss": 0.0556, + "step": 3454 + }, + { + "epoch": 0.7241668413330539, + "grad_norm": 0.02177123725414276, + "learning_rate": 1.8692979839218984e-06, + "loss": 0.0575, + "step": 3455 + }, + { + "epoch": 0.7243764409976944, + "grad_norm": 0.025510409846901894, + "learning_rate": 1.8666517074395607e-06, + "loss": 0.0527, + "step": 3456 + }, + { + "epoch": 0.7245860406623349, + "grad_norm": 0.01962488703429699, + "learning_rate": 1.8640068754301355e-06, + "loss": 0.0554, + "step": 3457 + }, + { + "epoch": 0.7247956403269755, + "grad_norm": 0.023709991946816444, + "learning_rate": 1.8613634891128912e-06, + "loss": 0.0541, + "step": 3458 + }, + { + "epoch": 0.725005239991616, + "grad_norm": 0.02632421813905239, + "learning_rate": 1.8587215497064242e-06, + "loss": 0.0557, + "step": 3459 + }, + { + "epoch": 0.7252148396562565, + "grad_norm": 0.0217702928930521, + "learning_rate": 1.8560810584286726e-06, + "loss": 0.0557, + "step": 3460 + }, + { + "epoch": 0.7254244393208971, + "grad_norm": 0.02565830387175083, + "learning_rate": 1.853442016496898e-06, + "loss": 0.0528, + "step": 3461 + }, + { + "epoch": 0.7256340389855376, + "grad_norm": 0.034673016518354416, + "learning_rate": 1.8508044251277019e-06, + "loss": 0.0535, + "step": 3462 + }, + { + "epoch": 0.7258436386501782, + "grad_norm": 0.02693593129515648, + "learning_rate": 1.8481682855370098e-06, + "loss": 0.0535, + "step": 3463 + }, + { + "epoch": 0.7260532383148187, + "grad_norm": 0.018855255097150803, + "learning_rate": 1.8455335989400836e-06, + "loss": 0.0527, + "step": 3464 + }, + { + "epoch": 0.7262628379794592, + "grad_norm": 0.039858147501945496, + "learning_rate": 1.8429003665515165e-06, + "loss": 0.0535, + "step": 3465 + }, + { + "epoch": 0.7264724376440997, + "grad_norm": 0.03316653147339821, + "learning_rate": 1.8402685895852213e-06, + "loss": 0.0561, + "step": 3466 + }, + { + "epoch": 0.7266820373087403, + "grad_norm": 0.018629280850291252, + "learning_rate": 1.8376382692544498e-06, + "loss": 0.0525, + "step": 3467 + }, + { + "epoch": 0.7268916369733809, + "grad_norm": 0.024704501032829285, + "learning_rate": 1.835009406771781e-06, + "loss": 0.0541, + "step": 3468 + }, + { + "epoch": 0.7271012366380214, + "grad_norm": 0.020816821604967117, + "learning_rate": 1.8323820033491163e-06, + "loss": 0.0552, + "step": 3469 + }, + { + "epoch": 0.7273108363026619, + "grad_norm": 0.021062025800347328, + "learning_rate": 1.829756060197692e-06, + "loss": 0.0548, + "step": 3470 + }, + { + "epoch": 0.7275204359673024, + "grad_norm": 0.026151493191719055, + "learning_rate": 1.8271315785280658e-06, + "loss": 0.0545, + "step": 3471 + }, + { + "epoch": 0.7277300356319429, + "grad_norm": 0.019930191338062286, + "learning_rate": 1.8245085595501205e-06, + "loss": 0.0541, + "step": 3472 + }, + { + "epoch": 0.7279396352965836, + "grad_norm": 0.028385937213897705, + "learning_rate": 1.8218870044730702e-06, + "loss": 0.0535, + "step": 3473 + }, + { + "epoch": 0.7281492349612241, + "grad_norm": 0.021223368123173714, + "learning_rate": 1.8192669145054503e-06, + "loss": 0.0507, + "step": 3474 + }, + { + "epoch": 0.7283588346258646, + "grad_norm": 0.02485741302371025, + "learning_rate": 1.8166482908551242e-06, + "loss": 0.0555, + "step": 3475 + }, + { + "epoch": 0.7285684342905051, + "grad_norm": 0.022934934124350548, + "learning_rate": 1.8140311347292744e-06, + "loss": 0.0534, + "step": 3476 + }, + { + "epoch": 0.7287780339551456, + "grad_norm": 0.020589333027601242, + "learning_rate": 1.8114154473344081e-06, + "loss": 0.0537, + "step": 3477 + }, + { + "epoch": 0.7289876336197862, + "grad_norm": 0.021330052986741066, + "learning_rate": 1.8088012298763603e-06, + "loss": 0.0529, + "step": 3478 + }, + { + "epoch": 0.7291972332844268, + "grad_norm": 0.025633566081523895, + "learning_rate": 1.8061884835602805e-06, + "loss": 0.0564, + "step": 3479 + }, + { + "epoch": 0.7294068329490673, + "grad_norm": 0.021011043339967728, + "learning_rate": 1.8035772095906462e-06, + "loss": 0.0529, + "step": 3480 + }, + { + "epoch": 0.7296164326137078, + "grad_norm": 0.023229733109474182, + "learning_rate": 1.8009674091712565e-06, + "loss": 0.0541, + "step": 3481 + }, + { + "epoch": 0.7298260322783483, + "grad_norm": 0.02509014680981636, + "learning_rate": 1.7983590835052267e-06, + "loss": 0.054, + "step": 3482 + }, + { + "epoch": 0.7300356319429889, + "grad_norm": 0.021912094205617905, + "learning_rate": 1.7957522337949924e-06, + "loss": 0.0527, + "step": 3483 + }, + { + "epoch": 0.7302452316076294, + "grad_norm": 0.024635696783661842, + "learning_rate": 1.7931468612423142e-06, + "loss": 0.0531, + "step": 3484 + }, + { + "epoch": 0.73045483127227, + "grad_norm": 0.018132206052541733, + "learning_rate": 1.7905429670482654e-06, + "loss": 0.0552, + "step": 3485 + }, + { + "epoch": 0.7306644309369105, + "grad_norm": 0.023826826363801956, + "learning_rate": 1.7879405524132426e-06, + "loss": 0.0526, + "step": 3486 + }, + { + "epoch": 0.730874030601551, + "grad_norm": 0.020080413669347763, + "learning_rate": 1.7853396185369592e-06, + "loss": 0.0542, + "step": 3487 + }, + { + "epoch": 0.7310836302661916, + "grad_norm": 0.02045084349811077, + "learning_rate": 1.7827401666184434e-06, + "loss": 0.0543, + "step": 3488 + }, + { + "epoch": 0.7312932299308321, + "grad_norm": 0.023532617837190628, + "learning_rate": 1.7801421978560418e-06, + "loss": 0.0532, + "step": 3489 + }, + { + "epoch": 0.7315028295954726, + "grad_norm": 0.02120790258049965, + "learning_rate": 1.7775457134474177e-06, + "loss": 0.0526, + "step": 3490 + }, + { + "epoch": 0.7317124292601132, + "grad_norm": 0.024196181446313858, + "learning_rate": 1.7749507145895518e-06, + "loss": 0.0563, + "step": 3491 + }, + { + "epoch": 0.7319220289247538, + "grad_norm": 0.023171545937657356, + "learning_rate": 1.772357202478735e-06, + "loss": 0.0559, + "step": 3492 + }, + { + "epoch": 0.7321316285893943, + "grad_norm": 0.02514570765197277, + "learning_rate": 1.7697651783105795e-06, + "loss": 0.054, + "step": 3493 + }, + { + "epoch": 0.7323412282540348, + "grad_norm": 0.02238095924258232, + "learning_rate": 1.7671746432800058e-06, + "loss": 0.0535, + "step": 3494 + }, + { + "epoch": 0.7325508279186753, + "grad_norm": 0.030948419123888016, + "learning_rate": 1.7645855985812476e-06, + "loss": 0.0521, + "step": 3495 + }, + { + "epoch": 0.7327604275833158, + "grad_norm": 0.02502829022705555, + "learning_rate": 1.7619980454078572e-06, + "loss": 0.0525, + "step": 3496 + }, + { + "epoch": 0.7329700272479565, + "grad_norm": 0.02190042845904827, + "learning_rate": 1.7594119849526947e-06, + "loss": 0.0547, + "step": 3497 + }, + { + "epoch": 0.733179626912597, + "grad_norm": 0.031756095588207245, + "learning_rate": 1.756827418407936e-06, + "loss": 0.0517, + "step": 3498 + }, + { + "epoch": 0.7333892265772375, + "grad_norm": 0.03542139008641243, + "learning_rate": 1.7542443469650638e-06, + "loss": 0.0533, + "step": 3499 + }, + { + "epoch": 0.733598826241878, + "grad_norm": 0.024449272081255913, + "learning_rate": 1.7516627718148716e-06, + "loss": 0.0501, + "step": 3500 + }, + { + "epoch": 0.7338084259065185, + "grad_norm": 0.0264756977558136, + "learning_rate": 1.7490826941474687e-06, + "loss": 0.0556, + "step": 3501 + }, + { + "epoch": 0.7340180255711591, + "grad_norm": 0.026716774329543114, + "learning_rate": 1.7465041151522666e-06, + "loss": 0.0549, + "step": 3502 + }, + { + "epoch": 0.7342276252357997, + "grad_norm": 0.03247566148638725, + "learning_rate": 1.7439270360179933e-06, + "loss": 0.0522, + "step": 3503 + }, + { + "epoch": 0.7344372249004402, + "grad_norm": 0.04316284507513046, + "learning_rate": 1.7413514579326806e-06, + "loss": 0.0556, + "step": 3504 + }, + { + "epoch": 0.7346468245650807, + "grad_norm": 0.021153295412659645, + "learning_rate": 1.7387773820836668e-06, + "loss": 0.055, + "step": 3505 + }, + { + "epoch": 0.7348564242297212, + "grad_norm": 0.045009203255176544, + "learning_rate": 1.7362048096576023e-06, + "loss": 0.0544, + "step": 3506 + }, + { + "epoch": 0.7350660238943618, + "grad_norm": 0.040110033005476, + "learning_rate": 1.7336337418404442e-06, + "loss": 0.0585, + "step": 3507 + }, + { + "epoch": 0.7352756235590023, + "grad_norm": 0.024850115180015564, + "learning_rate": 1.7310641798174516e-06, + "loss": 0.0576, + "step": 3508 + }, + { + "epoch": 0.7354852232236428, + "grad_norm": 0.03829395771026611, + "learning_rate": 1.728496124773194e-06, + "loss": 0.0543, + "step": 3509 + }, + { + "epoch": 0.7356948228882834, + "grad_norm": 0.02593119814991951, + "learning_rate": 1.7259295778915419e-06, + "loss": 0.0555, + "step": 3510 + }, + { + "epoch": 0.7359044225529239, + "grad_norm": 0.027632324025034904, + "learning_rate": 1.7233645403556765e-06, + "loss": 0.0559, + "step": 3511 + }, + { + "epoch": 0.7361140222175645, + "grad_norm": 0.03341587260365486, + "learning_rate": 1.7208010133480751e-06, + "loss": 0.0545, + "step": 3512 + }, + { + "epoch": 0.736323621882205, + "grad_norm": 0.023744579404592514, + "learning_rate": 1.7182389980505254e-06, + "loss": 0.0524, + "step": 3513 + }, + { + "epoch": 0.7365332215468455, + "grad_norm": 0.02730092778801918, + "learning_rate": 1.7156784956441181e-06, + "loss": 0.058, + "step": 3514 + }, + { + "epoch": 0.736742821211486, + "grad_norm": 0.04322980344295502, + "learning_rate": 1.7131195073092417e-06, + "loss": 0.0545, + "step": 3515 + }, + { + "epoch": 0.7369524208761266, + "grad_norm": 0.02667373977601528, + "learning_rate": 1.710562034225588e-06, + "loss": 0.0557, + "step": 3516 + }, + { + "epoch": 0.7371620205407672, + "grad_norm": 0.03887426108121872, + "learning_rate": 1.7080060775721546e-06, + "loss": 0.0514, + "step": 3517 + }, + { + "epoch": 0.7373716202054077, + "grad_norm": 0.06282377243041992, + "learning_rate": 1.7054516385272345e-06, + "loss": 0.0567, + "step": 3518 + }, + { + "epoch": 0.7375812198700482, + "grad_norm": 0.041183438152074814, + "learning_rate": 1.7028987182684248e-06, + "loss": 0.0551, + "step": 3519 + }, + { + "epoch": 0.7377908195346887, + "grad_norm": 0.026868589222431183, + "learning_rate": 1.7003473179726226e-06, + "loss": 0.0558, + "step": 3520 + }, + { + "epoch": 0.7380004191993292, + "grad_norm": 0.061398621648550034, + "learning_rate": 1.6977974388160213e-06, + "loss": 0.0527, + "step": 3521 + }, + { + "epoch": 0.7382100188639699, + "grad_norm": 0.05005694553256035, + "learning_rate": 1.695249081974113e-06, + "loss": 0.0526, + "step": 3522 + }, + { + "epoch": 0.7384196185286104, + "grad_norm": 0.01858423836529255, + "learning_rate": 1.6927022486216915e-06, + "loss": 0.0522, + "step": 3523 + }, + { + "epoch": 0.7386292181932509, + "grad_norm": 0.045225538313388824, + "learning_rate": 1.6901569399328483e-06, + "loss": 0.0529, + "step": 3524 + }, + { + "epoch": 0.7388388178578914, + "grad_norm": 0.03772973269224167, + "learning_rate": 1.6876131570809667e-06, + "loss": 0.0558, + "step": 3525 + }, + { + "epoch": 0.7390484175225319, + "grad_norm": 0.029959680512547493, + "learning_rate": 1.6850709012387328e-06, + "loss": 0.0574, + "step": 3526 + }, + { + "epoch": 0.7392580171871725, + "grad_norm": 0.06689772009849548, + "learning_rate": 1.6825301735781252e-06, + "loss": 0.0507, + "step": 3527 + }, + { + "epoch": 0.7394676168518131, + "grad_norm": 0.04782269150018692, + "learning_rate": 1.679990975270417e-06, + "loss": 0.0543, + "step": 3528 + }, + { + "epoch": 0.7396772165164536, + "grad_norm": 0.035698726773262024, + "learning_rate": 1.6774533074861793e-06, + "loss": 0.053, + "step": 3529 + }, + { + "epoch": 0.7398868161810941, + "grad_norm": 0.07200644910335541, + "learning_rate": 1.6749171713952783e-06, + "loss": 0.0537, + "step": 3530 + }, + { + "epoch": 0.7400964158457346, + "grad_norm": 0.05012660101056099, + "learning_rate": 1.6723825681668692e-06, + "loss": 0.0529, + "step": 3531 + }, + { + "epoch": 0.7403060155103752, + "grad_norm": 0.031019916757941246, + "learning_rate": 1.6698494989694064e-06, + "loss": 0.0542, + "step": 3532 + }, + { + "epoch": 0.7405156151750157, + "grad_norm": 0.07774803787469864, + "learning_rate": 1.6673179649706312e-06, + "loss": 0.0546, + "step": 3533 + }, + { + "epoch": 0.7407252148396563, + "grad_norm": 0.06452113389968872, + "learning_rate": 1.664787967337584e-06, + "loss": 0.053, + "step": 3534 + }, + { + "epoch": 0.7409348145042968, + "grad_norm": 0.024198826402425766, + "learning_rate": 1.6622595072365887e-06, + "loss": 0.0523, + "step": 3535 + }, + { + "epoch": 0.7411444141689373, + "grad_norm": 0.07494359463453293, + "learning_rate": 1.6597325858332675e-06, + "loss": 0.0525, + "step": 3536 + }, + { + "epoch": 0.7413540138335779, + "grad_norm": 0.060714419931173325, + "learning_rate": 1.6572072042925335e-06, + "loss": 0.0551, + "step": 3537 + }, + { + "epoch": 0.7415636134982184, + "grad_norm": 0.028920724987983704, + "learning_rate": 1.6546833637785814e-06, + "loss": 0.055, + "step": 3538 + }, + { + "epoch": 0.7417732131628589, + "grad_norm": 0.07191255688667297, + "learning_rate": 1.652161065454903e-06, + "loss": 0.0569, + "step": 3539 + }, + { + "epoch": 0.7419828128274994, + "grad_norm": 0.045189227908849716, + "learning_rate": 1.64964031048428e-06, + "loss": 0.0538, + "step": 3540 + }, + { + "epoch": 0.74219241249214, + "grad_norm": 0.034395113587379456, + "learning_rate": 1.6471211000287762e-06, + "loss": 0.0559, + "step": 3541 + }, + { + "epoch": 0.7424020121567806, + "grad_norm": 0.062001634389162064, + "learning_rate": 1.6446034352497504e-06, + "loss": 0.0567, + "step": 3542 + }, + { + "epoch": 0.7426116118214211, + "grad_norm": 0.03360583260655403, + "learning_rate": 1.6420873173078422e-06, + "loss": 0.0552, + "step": 3543 + }, + { + "epoch": 0.7428212114860616, + "grad_norm": 0.0365772545337677, + "learning_rate": 1.6395727473629852e-06, + "loss": 0.051, + "step": 3544 + }, + { + "epoch": 0.7430308111507021, + "grad_norm": 0.046734243631362915, + "learning_rate": 1.637059726574392e-06, + "loss": 0.0532, + "step": 3545 + }, + { + "epoch": 0.7432404108153426, + "grad_norm": 0.021529680117964745, + "learning_rate": 1.634548256100566e-06, + "loss": 0.053, + "step": 3546 + }, + { + "epoch": 0.7434500104799833, + "grad_norm": 0.04237625375390053, + "learning_rate": 1.632038337099297e-06, + "loss": 0.0539, + "step": 3547 + }, + { + "epoch": 0.7436596101446238, + "grad_norm": 0.029763460159301758, + "learning_rate": 1.6295299707276546e-06, + "loss": 0.0511, + "step": 3548 + }, + { + "epoch": 0.7438692098092643, + "grad_norm": 0.031175674870610237, + "learning_rate": 1.6270231581419943e-06, + "loss": 0.0551, + "step": 3549 + }, + { + "epoch": 0.7440788094739048, + "grad_norm": 0.04852377995848656, + "learning_rate": 1.6245179004979588e-06, + "loss": 0.054, + "step": 3550 + }, + { + "epoch": 0.7442884091385453, + "grad_norm": 0.027154749259352684, + "learning_rate": 1.6220141989504683e-06, + "loss": 0.0528, + "step": 3551 + }, + { + "epoch": 0.744498008803186, + "grad_norm": 0.026955176144838333, + "learning_rate": 1.6195120546537307e-06, + "loss": 0.0542, + "step": 3552 + }, + { + "epoch": 0.7447076084678265, + "grad_norm": 0.029728444293141365, + "learning_rate": 1.6170114687612349e-06, + "loss": 0.0566, + "step": 3553 + }, + { + "epoch": 0.744917208132467, + "grad_norm": 0.018385691568255424, + "learning_rate": 1.6145124424257497e-06, + "loss": 0.0548, + "step": 3554 + }, + { + "epoch": 0.7451268077971075, + "grad_norm": 0.02659742347896099, + "learning_rate": 1.6120149767993237e-06, + "loss": 0.0572, + "step": 3555 + }, + { + "epoch": 0.745336407461748, + "grad_norm": 0.030509591102600098, + "learning_rate": 1.6095190730332893e-06, + "loss": 0.0547, + "step": 3556 + }, + { + "epoch": 0.7455460071263886, + "grad_norm": 0.02252962999045849, + "learning_rate": 1.60702473227826e-06, + "loss": 0.0554, + "step": 3557 + }, + { + "epoch": 0.7457556067910291, + "grad_norm": 0.029321059584617615, + "learning_rate": 1.6045319556841227e-06, + "loss": 0.0525, + "step": 3558 + }, + { + "epoch": 0.7459652064556697, + "grad_norm": 0.026201212778687477, + "learning_rate": 1.6020407444000497e-06, + "loss": 0.0564, + "step": 3559 + }, + { + "epoch": 0.7461748061203102, + "grad_norm": 0.028670430183410645, + "learning_rate": 1.5995510995744879e-06, + "loss": 0.0548, + "step": 3560 + }, + { + "epoch": 0.7463844057849508, + "grad_norm": 0.028581839054822922, + "learning_rate": 1.5970630223551614e-06, + "loss": 0.0519, + "step": 3561 + }, + { + "epoch": 0.7465940054495913, + "grad_norm": 0.025404004380106926, + "learning_rate": 1.5945765138890746e-06, + "loss": 0.0549, + "step": 3562 + }, + { + "epoch": 0.7468036051142318, + "grad_norm": 0.04105531796813011, + "learning_rate": 1.5920915753225097e-06, + "loss": 0.0537, + "step": 3563 + }, + { + "epoch": 0.7470132047788723, + "grad_norm": 0.02948727272450924, + "learning_rate": 1.5896082078010183e-06, + "loss": 0.0519, + "step": 3564 + }, + { + "epoch": 0.7472228044435129, + "grad_norm": 0.02374359965324402, + "learning_rate": 1.5871264124694368e-06, + "loss": 0.0529, + "step": 3565 + }, + { + "epoch": 0.7474324041081535, + "grad_norm": 0.02481861598789692, + "learning_rate": 1.5846461904718686e-06, + "loss": 0.0539, + "step": 3566 + }, + { + "epoch": 0.747642003772794, + "grad_norm": 0.019914066419005394, + "learning_rate": 1.5821675429516981e-06, + "loss": 0.0535, + "step": 3567 + }, + { + "epoch": 0.7478516034374345, + "grad_norm": 0.027355968952178955, + "learning_rate": 1.5796904710515792e-06, + "loss": 0.054, + "step": 3568 + }, + { + "epoch": 0.748061203102075, + "grad_norm": 0.02385944128036499, + "learning_rate": 1.577214975913443e-06, + "loss": 0.0551, + "step": 3569 + }, + { + "epoch": 0.7482708027667155, + "grad_norm": 0.022167477756738663, + "learning_rate": 1.574741058678495e-06, + "loss": 0.0533, + "step": 3570 + }, + { + "epoch": 0.7484804024313562, + "grad_norm": 0.024447597563266754, + "learning_rate": 1.5722687204872038e-06, + "loss": 0.0569, + "step": 3571 + }, + { + "epoch": 0.7486900020959967, + "grad_norm": 0.019285105168819427, + "learning_rate": 1.569797962479321e-06, + "loss": 0.0547, + "step": 3572 + }, + { + "epoch": 0.7488996017606372, + "grad_norm": 0.026579929515719414, + "learning_rate": 1.5673287857938663e-06, + "loss": 0.0544, + "step": 3573 + }, + { + "epoch": 0.7491092014252777, + "grad_norm": 0.021767770871520042, + "learning_rate": 1.564861191569127e-06, + "loss": 0.0549, + "step": 3574 + }, + { + "epoch": 0.7493188010899182, + "grad_norm": 0.022724144160747528, + "learning_rate": 1.5623951809426663e-06, + "loss": 0.0553, + "step": 3575 + }, + { + "epoch": 0.7495284007545588, + "grad_norm": 0.02660524845123291, + "learning_rate": 1.5599307550513132e-06, + "loss": 0.0526, + "step": 3576 + }, + { + "epoch": 0.7497380004191994, + "grad_norm": 0.02129141427576542, + "learning_rate": 1.5574679150311656e-06, + "loss": 0.0547, + "step": 3577 + }, + { + "epoch": 0.7499476000838399, + "grad_norm": 0.02047175168991089, + "learning_rate": 1.555006662017594e-06, + "loss": 0.0529, + "step": 3578 + }, + { + "epoch": 0.7501571997484804, + "grad_norm": 0.02189241163432598, + "learning_rate": 1.5525469971452362e-06, + "loss": 0.0533, + "step": 3579 + }, + { + "epoch": 0.7503667994131209, + "grad_norm": 0.019183902069926262, + "learning_rate": 1.5500889215479974e-06, + "loss": 0.0535, + "step": 3580 + }, + { + "epoch": 0.7505763990777615, + "grad_norm": 0.022731618955731392, + "learning_rate": 1.54763243635905e-06, + "loss": 0.054, + "step": 3581 + }, + { + "epoch": 0.750785998742402, + "grad_norm": 0.02083863690495491, + "learning_rate": 1.5451775427108302e-06, + "loss": 0.0566, + "step": 3582 + }, + { + "epoch": 0.7509955984070426, + "grad_norm": 0.021793080493807793, + "learning_rate": 1.5427242417350474e-06, + "loss": 0.0539, + "step": 3583 + }, + { + "epoch": 0.7512051980716831, + "grad_norm": 0.024363208562135696, + "learning_rate": 1.540272534562669e-06, + "loss": 0.0542, + "step": 3584 + }, + { + "epoch": 0.7514147977363236, + "grad_norm": 0.020569128915667534, + "learning_rate": 1.5378224223239341e-06, + "loss": 0.0554, + "step": 3585 + }, + { + "epoch": 0.7516243974009642, + "grad_norm": 0.02453795075416565, + "learning_rate": 1.5353739061483446e-06, + "loss": 0.0513, + "step": 3586 + }, + { + "epoch": 0.7518339970656047, + "grad_norm": 0.022484583780169487, + "learning_rate": 1.5329269871646646e-06, + "loss": 0.0539, + "step": 3587 + }, + { + "epoch": 0.7520435967302452, + "grad_norm": 0.021373063325881958, + "learning_rate": 1.530481666500922e-06, + "loss": 0.0554, + "step": 3588 + }, + { + "epoch": 0.7522531963948857, + "grad_norm": 0.026554672047495842, + "learning_rate": 1.5280379452844124e-06, + "loss": 0.0527, + "step": 3589 + }, + { + "epoch": 0.7524627960595263, + "grad_norm": 0.023381482809782028, + "learning_rate": 1.525595824641687e-06, + "loss": 0.0568, + "step": 3590 + }, + { + "epoch": 0.7526723957241669, + "grad_norm": 0.027381494641304016, + "learning_rate": 1.5231553056985642e-06, + "loss": 0.0522, + "step": 3591 + }, + { + "epoch": 0.7528819953888074, + "grad_norm": 0.03211834654211998, + "learning_rate": 1.5207163895801252e-06, + "loss": 0.0554, + "step": 3592 + }, + { + "epoch": 0.7530915950534479, + "grad_norm": 0.021453700959682465, + "learning_rate": 1.5182790774107082e-06, + "loss": 0.0549, + "step": 3593 + }, + { + "epoch": 0.7533011947180884, + "grad_norm": 0.037080686539411545, + "learning_rate": 1.5158433703139114e-06, + "loss": 0.0532, + "step": 3594 + }, + { + "epoch": 0.7535107943827289, + "grad_norm": 0.02473445050418377, + "learning_rate": 1.5134092694125968e-06, + "loss": 0.0549, + "step": 3595 + }, + { + "epoch": 0.7537203940473696, + "grad_norm": 0.02276427671313286, + "learning_rate": 1.510976775828887e-06, + "loss": 0.0568, + "step": 3596 + }, + { + "epoch": 0.7539299937120101, + "grad_norm": 0.020384134724736214, + "learning_rate": 1.508545890684157e-06, + "loss": 0.054, + "step": 3597 + }, + { + "epoch": 0.7541395933766506, + "grad_norm": 0.025425709784030914, + "learning_rate": 1.5061166150990475e-06, + "loss": 0.0536, + "step": 3598 + }, + { + "epoch": 0.7543491930412911, + "grad_norm": 0.022402847185730934, + "learning_rate": 1.5036889501934533e-06, + "loss": 0.0543, + "step": 3599 + }, + { + "epoch": 0.7545587927059316, + "grad_norm": 0.02631525509059429, + "learning_rate": 1.5012628970865245e-06, + "loss": 0.0548, + "step": 3600 + }, + { + "epoch": 0.7547683923705722, + "grad_norm": 0.019039105623960495, + "learning_rate": 1.498838456896674e-06, + "loss": 0.0555, + "step": 3601 + }, + { + "epoch": 0.7549779920352128, + "grad_norm": 0.028720203787088394, + "learning_rate": 1.4964156307415673e-06, + "loss": 0.0541, + "step": 3602 + }, + { + "epoch": 0.7551875916998533, + "grad_norm": 0.02376984804868698, + "learning_rate": 1.493994419738129e-06, + "loss": 0.053, + "step": 3603 + }, + { + "epoch": 0.7553971913644938, + "grad_norm": 0.028366727754473686, + "learning_rate": 1.4915748250025346e-06, + "loss": 0.0518, + "step": 3604 + }, + { + "epoch": 0.7556067910291343, + "grad_norm": 0.024544060230255127, + "learning_rate": 1.4891568476502154e-06, + "loss": 0.0542, + "step": 3605 + }, + { + "epoch": 0.7558163906937749, + "grad_norm": 0.029393775388598442, + "learning_rate": 1.486740488795862e-06, + "loss": 0.0566, + "step": 3606 + }, + { + "epoch": 0.7560259903584154, + "grad_norm": 0.030081739649176598, + "learning_rate": 1.484325749553412e-06, + "loss": 0.054, + "step": 3607 + }, + { + "epoch": 0.756235590023056, + "grad_norm": 0.03067159280180931, + "learning_rate": 1.4819126310360626e-06, + "loss": 0.0546, + "step": 3608 + }, + { + "epoch": 0.7564451896876965, + "grad_norm": 0.02665751427412033, + "learning_rate": 1.4795011343562594e-06, + "loss": 0.0545, + "step": 3609 + }, + { + "epoch": 0.756654789352337, + "grad_norm": 0.025258727371692657, + "learning_rate": 1.4770912606257003e-06, + "loss": 0.0552, + "step": 3610 + }, + { + "epoch": 0.7568643890169776, + "grad_norm": 0.028663890436291695, + "learning_rate": 1.4746830109553388e-06, + "loss": 0.0521, + "step": 3611 + }, + { + "epoch": 0.7570739886816181, + "grad_norm": 0.02796069160103798, + "learning_rate": 1.472276386455378e-06, + "loss": 0.0549, + "step": 3612 + }, + { + "epoch": 0.7572835883462586, + "grad_norm": 0.028017984703183174, + "learning_rate": 1.4698713882352694e-06, + "loss": 0.0551, + "step": 3613 + }, + { + "epoch": 0.7574931880108992, + "grad_norm": 0.036044858396053314, + "learning_rate": 1.4674680174037186e-06, + "loss": 0.0549, + "step": 3614 + }, + { + "epoch": 0.7577027876755397, + "grad_norm": 0.024651629850268364, + "learning_rate": 1.465066275068676e-06, + "loss": 0.0535, + "step": 3615 + }, + { + "epoch": 0.7579123873401803, + "grad_norm": 0.02148495241999626, + "learning_rate": 1.462666162337349e-06, + "loss": 0.0529, + "step": 3616 + }, + { + "epoch": 0.7581219870048208, + "grad_norm": 0.023804927244782448, + "learning_rate": 1.4602676803161842e-06, + "loss": 0.0557, + "step": 3617 + }, + { + "epoch": 0.7583315866694613, + "grad_norm": 0.017874160781502724, + "learning_rate": 1.4578708301108835e-06, + "loss": 0.0548, + "step": 3618 + }, + { + "epoch": 0.7585411863341018, + "grad_norm": 0.021595729514956474, + "learning_rate": 1.4554756128263958e-06, + "loss": 0.0567, + "step": 3619 + }, + { + "epoch": 0.7587507859987423, + "grad_norm": 0.022326629608869553, + "learning_rate": 1.4530820295669145e-06, + "loss": 0.0534, + "step": 3620 + }, + { + "epoch": 0.758960385663383, + "grad_norm": 0.021938448771834373, + "learning_rate": 1.4506900814358794e-06, + "loss": 0.0532, + "step": 3621 + }, + { + "epoch": 0.7591699853280235, + "grad_norm": 0.02145432122051716, + "learning_rate": 1.4482997695359807e-06, + "loss": 0.0548, + "step": 3622 + }, + { + "epoch": 0.759379584992664, + "grad_norm": 0.019530145451426506, + "learning_rate": 1.445911094969149e-06, + "loss": 0.0541, + "step": 3623 + }, + { + "epoch": 0.7595891846573045, + "grad_norm": 0.01897241175174713, + "learning_rate": 1.4435240588365645e-06, + "loss": 0.0528, + "step": 3624 + }, + { + "epoch": 0.759798784321945, + "grad_norm": 0.02352256327867508, + "learning_rate": 1.4411386622386519e-06, + "loss": 0.0556, + "step": 3625 + }, + { + "epoch": 0.7600083839865857, + "grad_norm": 0.016778651624917984, + "learning_rate": 1.4387549062750767e-06, + "loss": 0.055, + "step": 3626 + }, + { + "epoch": 0.7602179836512262, + "grad_norm": 0.02301890030503273, + "learning_rate": 1.4363727920447478e-06, + "loss": 0.053, + "step": 3627 + }, + { + "epoch": 0.7604275833158667, + "grad_norm": 0.020533403381705284, + "learning_rate": 1.4339923206458222e-06, + "loss": 0.0549, + "step": 3628 + }, + { + "epoch": 0.7606371829805072, + "grad_norm": 0.02557838149368763, + "learning_rate": 1.431613493175697e-06, + "loss": 0.0544, + "step": 3629 + }, + { + "epoch": 0.7608467826451477, + "grad_norm": 0.026523873209953308, + "learning_rate": 1.4292363107310091e-06, + "loss": 0.0518, + "step": 3630 + }, + { + "epoch": 0.7610563823097883, + "grad_norm": 0.02507561445236206, + "learning_rate": 1.4268607744076419e-06, + "loss": 0.0542, + "step": 3631 + }, + { + "epoch": 0.7612659819744289, + "grad_norm": 0.034792933613061905, + "learning_rate": 1.424486885300715e-06, + "loss": 0.0514, + "step": 3632 + }, + { + "epoch": 0.7614755816390694, + "grad_norm": 0.025153324007987976, + "learning_rate": 1.4221146445045903e-06, + "loss": 0.0534, + "step": 3633 + }, + { + "epoch": 0.7616851813037099, + "grad_norm": 0.0239273589104414, + "learning_rate": 1.419744053112871e-06, + "loss": 0.0546, + "step": 3634 + }, + { + "epoch": 0.7618947809683505, + "grad_norm": 0.027409644797444344, + "learning_rate": 1.417375112218401e-06, + "loss": 0.053, + "step": 3635 + }, + { + "epoch": 0.762104380632991, + "grad_norm": 0.021910734474658966, + "learning_rate": 1.4150078229132586e-06, + "loss": 0.0506, + "step": 3636 + }, + { + "epoch": 0.7623139802976315, + "grad_norm": 0.028264936059713364, + "learning_rate": 1.4126421862887668e-06, + "loss": 0.0526, + "step": 3637 + }, + { + "epoch": 0.762523579962272, + "grad_norm": 0.017890270799398422, + "learning_rate": 1.41027820343548e-06, + "loss": 0.0564, + "step": 3638 + }, + { + "epoch": 0.7627331796269126, + "grad_norm": 0.02689182385802269, + "learning_rate": 1.4079158754431981e-06, + "loss": 0.0544, + "step": 3639 + }, + { + "epoch": 0.7629427792915532, + "grad_norm": 0.024087607860565186, + "learning_rate": 1.4055552034009496e-06, + "loss": 0.0558, + "step": 3640 + }, + { + "epoch": 0.7631523789561937, + "grad_norm": 0.019843678921461105, + "learning_rate": 1.4031961883970053e-06, + "loss": 0.0523, + "step": 3641 + }, + { + "epoch": 0.7633619786208342, + "grad_norm": 0.029412388801574707, + "learning_rate": 1.4008388315188743e-06, + "loss": 0.0544, + "step": 3642 + }, + { + "epoch": 0.7635715782854747, + "grad_norm": 0.022888310253620148, + "learning_rate": 1.3984831338532916e-06, + "loss": 0.0546, + "step": 3643 + }, + { + "epoch": 0.7637811779501152, + "grad_norm": 0.02272786572575569, + "learning_rate": 1.3961290964862356e-06, + "loss": 0.053, + "step": 3644 + }, + { + "epoch": 0.7639907776147559, + "grad_norm": 0.028499163687229156, + "learning_rate": 1.3937767205029196e-06, + "loss": 0.0532, + "step": 3645 + }, + { + "epoch": 0.7642003772793964, + "grad_norm": 0.02279391512274742, + "learning_rate": 1.3914260069877844e-06, + "loss": 0.0518, + "step": 3646 + }, + { + "epoch": 0.7644099769440369, + "grad_norm": 0.03257404640316963, + "learning_rate": 1.3890769570245122e-06, + "loss": 0.056, + "step": 3647 + }, + { + "epoch": 0.7646195766086774, + "grad_norm": 0.030943913385272026, + "learning_rate": 1.386729571696011e-06, + "loss": 0.0548, + "step": 3648 + }, + { + "epoch": 0.7648291762733179, + "grad_norm": 0.017855705693364143, + "learning_rate": 1.3843838520844288e-06, + "loss": 0.0545, + "step": 3649 + }, + { + "epoch": 0.7650387759379585, + "grad_norm": 0.029183411970734596, + "learning_rate": 1.3820397992711377e-06, + "loss": 0.0562, + "step": 3650 + }, + { + "epoch": 0.7652483756025991, + "grad_norm": 0.02148706465959549, + "learning_rate": 1.3796974143367475e-06, + "loss": 0.0561, + "step": 3651 + }, + { + "epoch": 0.7654579752672396, + "grad_norm": 0.018655741587281227, + "learning_rate": 1.3773566983610992e-06, + "loss": 0.0533, + "step": 3652 + }, + { + "epoch": 0.7656675749318801, + "grad_norm": 0.022678757086396217, + "learning_rate": 1.3750176524232605e-06, + "loss": 0.0535, + "step": 3653 + }, + { + "epoch": 0.7658771745965206, + "grad_norm": 0.02211586944758892, + "learning_rate": 1.372680277601529e-06, + "loss": 0.0545, + "step": 3654 + }, + { + "epoch": 0.7660867742611612, + "grad_norm": 0.019549356773495674, + "learning_rate": 1.3703445749734384e-06, + "loss": 0.0548, + "step": 3655 + }, + { + "epoch": 0.7662963739258017, + "grad_norm": 0.02576516941189766, + "learning_rate": 1.3680105456157427e-06, + "loss": 0.0552, + "step": 3656 + }, + { + "epoch": 0.7665059735904423, + "grad_norm": 0.024852564558386803, + "learning_rate": 1.3656781906044315e-06, + "loss": 0.0538, + "step": 3657 + }, + { + "epoch": 0.7667155732550828, + "grad_norm": 0.022067122161388397, + "learning_rate": 1.3633475110147204e-06, + "loss": 0.0551, + "step": 3658 + }, + { + "epoch": 0.7669251729197233, + "grad_norm": 0.029038861393928528, + "learning_rate": 1.3610185079210514e-06, + "loss": 0.0531, + "step": 3659 + }, + { + "epoch": 0.7671347725843639, + "grad_norm": 0.019402772188186646, + "learning_rate": 1.3586911823970933e-06, + "loss": 0.0537, + "step": 3660 + }, + { + "epoch": 0.7673443722490044, + "grad_norm": 0.024672850966453552, + "learning_rate": 1.3563655355157434e-06, + "loss": 0.0522, + "step": 3661 + }, + { + "epoch": 0.7675539719136449, + "grad_norm": 0.022120898589491844, + "learning_rate": 1.3540415683491265e-06, + "loss": 0.0542, + "step": 3662 + }, + { + "epoch": 0.7677635715782855, + "grad_norm": 0.021284013986587524, + "learning_rate": 1.3517192819685875e-06, + "loss": 0.0516, + "step": 3663 + }, + { + "epoch": 0.767973171242926, + "grad_norm": 0.024125682190060616, + "learning_rate": 1.3493986774447032e-06, + "loss": 0.0575, + "step": 3664 + }, + { + "epoch": 0.7681827709075666, + "grad_norm": 0.022346889600157738, + "learning_rate": 1.3470797558472709e-06, + "loss": 0.0539, + "step": 3665 + }, + { + "epoch": 0.7683923705722071, + "grad_norm": 0.022748976945877075, + "learning_rate": 1.34476251824531e-06, + "loss": 0.0563, + "step": 3666 + }, + { + "epoch": 0.7686019702368476, + "grad_norm": 0.02310675010085106, + "learning_rate": 1.3424469657070693e-06, + "loss": 0.0534, + "step": 3667 + }, + { + "epoch": 0.7688115699014881, + "grad_norm": 0.016234230250120163, + "learning_rate": 1.3401330993000195e-06, + "loss": 0.0568, + "step": 3668 + }, + { + "epoch": 0.7690211695661286, + "grad_norm": 0.02276422083377838, + "learning_rate": 1.3378209200908487e-06, + "loss": 0.0533, + "step": 3669 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.020701710134744644, + "learning_rate": 1.3355104291454751e-06, + "loss": 0.0522, + "step": 3670 + }, + { + "epoch": 0.7694403688954098, + "grad_norm": 0.017248544842004776, + "learning_rate": 1.3332016275290304e-06, + "loss": 0.0541, + "step": 3671 + }, + { + "epoch": 0.7696499685600503, + "grad_norm": 0.01811116933822632, + "learning_rate": 1.3308945163058757e-06, + "loss": 0.0532, + "step": 3672 + }, + { + "epoch": 0.7698595682246908, + "grad_norm": 0.017278380692005157, + "learning_rate": 1.3285890965395853e-06, + "loss": 0.0548, + "step": 3673 + }, + { + "epoch": 0.7700691678893313, + "grad_norm": 0.017872022464871407, + "learning_rate": 1.3262853692929583e-06, + "loss": 0.0554, + "step": 3674 + }, + { + "epoch": 0.770278767553972, + "grad_norm": 0.019185813143849373, + "learning_rate": 1.3239833356280152e-06, + "loss": 0.0519, + "step": 3675 + }, + { + "epoch": 0.7704883672186125, + "grad_norm": 0.0181709136813879, + "learning_rate": 1.3216829966059902e-06, + "loss": 0.0555, + "step": 3676 + }, + { + "epoch": 0.770697966883253, + "grad_norm": 0.020711593329906464, + "learning_rate": 1.3193843532873385e-06, + "loss": 0.0529, + "step": 3677 + }, + { + "epoch": 0.7709075665478935, + "grad_norm": 0.015872521325945854, + "learning_rate": 1.3170874067317362e-06, + "loss": 0.0539, + "step": 3678 + }, + { + "epoch": 0.771117166212534, + "grad_norm": 0.02129794843494892, + "learning_rate": 1.3147921579980739e-06, + "loss": 0.0543, + "step": 3679 + }, + { + "epoch": 0.7713267658771746, + "grad_norm": 0.015224761329591274, + "learning_rate": 1.3124986081444625e-06, + "loss": 0.0548, + "step": 3680 + }, + { + "epoch": 0.7715363655418152, + "grad_norm": 0.016702933236956596, + "learning_rate": 1.3102067582282264e-06, + "loss": 0.0549, + "step": 3681 + }, + { + "epoch": 0.7717459652064557, + "grad_norm": 0.01623140461742878, + "learning_rate": 1.307916609305907e-06, + "loss": 0.0532, + "step": 3682 + }, + { + "epoch": 0.7719555648710962, + "grad_norm": 0.014785263687372208, + "learning_rate": 1.305628162433264e-06, + "loss": 0.055, + "step": 3683 + }, + { + "epoch": 0.7721651645357367, + "grad_norm": 0.014699485152959824, + "learning_rate": 1.3033414186652705e-06, + "loss": 0.0561, + "step": 3684 + }, + { + "epoch": 0.7723747642003773, + "grad_norm": 0.017208058387041092, + "learning_rate": 1.3010563790561165e-06, + "loss": 0.0548, + "step": 3685 + }, + { + "epoch": 0.7725843638650178, + "grad_norm": 0.019409308210015297, + "learning_rate": 1.298773044659204e-06, + "loss": 0.0524, + "step": 3686 + }, + { + "epoch": 0.7727939635296583, + "grad_norm": 0.015810973942279816, + "learning_rate": 1.296491416527147e-06, + "loss": 0.0546, + "step": 3687 + }, + { + "epoch": 0.7730035631942989, + "grad_norm": 0.020034223794937134, + "learning_rate": 1.2942114957117797e-06, + "loss": 0.0545, + "step": 3688 + }, + { + "epoch": 0.7732131628589394, + "grad_norm": 0.021472373977303505, + "learning_rate": 1.2919332832641413e-06, + "loss": 0.0534, + "step": 3689 + }, + { + "epoch": 0.77342276252358, + "grad_norm": 0.012456726282835007, + "learning_rate": 1.2896567802344888e-06, + "loss": 0.0529, + "step": 3690 + }, + { + "epoch": 0.7736323621882205, + "grad_norm": 0.018201405182480812, + "learning_rate": 1.287381987672292e-06, + "loss": 0.0526, + "step": 3691 + }, + { + "epoch": 0.773841961852861, + "grad_norm": 0.012917966581881046, + "learning_rate": 1.2851089066262272e-06, + "loss": 0.0538, + "step": 3692 + }, + { + "epoch": 0.7740515615175015, + "grad_norm": 0.019098002463579178, + "learning_rate": 1.2828375381441837e-06, + "loss": 0.055, + "step": 3693 + }, + { + "epoch": 0.774261161182142, + "grad_norm": 0.015969131141901016, + "learning_rate": 1.2805678832732627e-06, + "loss": 0.0544, + "step": 3694 + }, + { + "epoch": 0.7744707608467827, + "grad_norm": 0.016314983367919922, + "learning_rate": 1.2782999430597764e-06, + "loss": 0.0513, + "step": 3695 + }, + { + "epoch": 0.7746803605114232, + "grad_norm": 0.021401531994342804, + "learning_rate": 1.2760337185492423e-06, + "loss": 0.0557, + "step": 3696 + }, + { + "epoch": 0.7748899601760637, + "grad_norm": 0.015217507258057594, + "learning_rate": 1.2737692107863914e-06, + "loss": 0.0566, + "step": 3697 + }, + { + "epoch": 0.7750995598407042, + "grad_norm": 0.013445749878883362, + "learning_rate": 1.2715064208151606e-06, + "loss": 0.0534, + "step": 3698 + }, + { + "epoch": 0.7753091595053447, + "grad_norm": 0.01527884230017662, + "learning_rate": 1.2692453496786933e-06, + "loss": 0.0554, + "step": 3699 + }, + { + "epoch": 0.7755187591699854, + "grad_norm": 0.013215204700827599, + "learning_rate": 1.2669859984193456e-06, + "loss": 0.05, + "step": 3700 + }, + { + "epoch": 0.7757283588346259, + "grad_norm": 0.013395383022725582, + "learning_rate": 1.264728368078678e-06, + "loss": 0.0579, + "step": 3701 + }, + { + "epoch": 0.7759379584992664, + "grad_norm": 0.013562247157096863, + "learning_rate": 1.2624724596974557e-06, + "loss": 0.0546, + "step": 3702 + }, + { + "epoch": 0.7761475581639069, + "grad_norm": 0.0144569780677557, + "learning_rate": 1.260218274315655e-06, + "loss": 0.0544, + "step": 3703 + }, + { + "epoch": 0.7763571578285475, + "grad_norm": 0.014783012680709362, + "learning_rate": 1.2579658129724526e-06, + "loss": 0.0539, + "step": 3704 + }, + { + "epoch": 0.776566757493188, + "grad_norm": 0.014723209664225578, + "learning_rate": 1.2557150767062315e-06, + "loss": 0.0554, + "step": 3705 + }, + { + "epoch": 0.7767763571578286, + "grad_norm": 0.016170332208275795, + "learning_rate": 1.2534660665545822e-06, + "loss": 0.0526, + "step": 3706 + }, + { + "epoch": 0.7769859568224691, + "grad_norm": 0.019932882860302925, + "learning_rate": 1.2512187835542982e-06, + "loss": 0.0526, + "step": 3707 + }, + { + "epoch": 0.7771955564871096, + "grad_norm": 0.016744259744882584, + "learning_rate": 1.248973228741378e-06, + "loss": 0.0535, + "step": 3708 + }, + { + "epoch": 0.7774051561517502, + "grad_norm": 0.016560077667236328, + "learning_rate": 1.2467294031510202e-06, + "loss": 0.0526, + "step": 3709 + }, + { + "epoch": 0.7776147558163907, + "grad_norm": 0.016798479482531548, + "learning_rate": 1.2444873078176262e-06, + "loss": 0.0535, + "step": 3710 + }, + { + "epoch": 0.7778243554810312, + "grad_norm": 0.01939672976732254, + "learning_rate": 1.2422469437748046e-06, + "loss": 0.0531, + "step": 3711 + }, + { + "epoch": 0.7780339551456718, + "grad_norm": 0.019812364131212234, + "learning_rate": 1.2400083120553602e-06, + "loss": 0.0537, + "step": 3712 + }, + { + "epoch": 0.7782435548103123, + "grad_norm": 0.0169843677431345, + "learning_rate": 1.2377714136913028e-06, + "loss": 0.0537, + "step": 3713 + }, + { + "epoch": 0.7784531544749529, + "grad_norm": 0.019338861107826233, + "learning_rate": 1.235536249713845e-06, + "loss": 0.0538, + "step": 3714 + }, + { + "epoch": 0.7786627541395934, + "grad_norm": 0.018493298441171646, + "learning_rate": 1.2333028211533916e-06, + "loss": 0.0527, + "step": 3715 + }, + { + "epoch": 0.7788723538042339, + "grad_norm": 0.0183496605604887, + "learning_rate": 1.2310711290395545e-06, + "loss": 0.0549, + "step": 3716 + }, + { + "epoch": 0.7790819534688744, + "grad_norm": 0.019363224506378174, + "learning_rate": 1.2288411744011464e-06, + "loss": 0.0513, + "step": 3717 + }, + { + "epoch": 0.779291553133515, + "grad_norm": 0.015527610667049885, + "learning_rate": 1.2266129582661712e-06, + "loss": 0.0545, + "step": 3718 + }, + { + "epoch": 0.7795011527981556, + "grad_norm": 0.018898479640483856, + "learning_rate": 1.2243864816618407e-06, + "loss": 0.0539, + "step": 3719 + }, + { + "epoch": 0.7797107524627961, + "grad_norm": 0.014554295688867569, + "learning_rate": 1.2221617456145556e-06, + "loss": 0.0532, + "step": 3720 + }, + { + "epoch": 0.7799203521274366, + "grad_norm": 0.01795900985598564, + "learning_rate": 1.2199387511499234e-06, + "loss": 0.0542, + "step": 3721 + }, + { + "epoch": 0.7801299517920771, + "grad_norm": 0.018716957420110703, + "learning_rate": 1.21771749929274e-06, + "loss": 0.0534, + "step": 3722 + }, + { + "epoch": 0.7803395514567176, + "grad_norm": 0.015783434733748436, + "learning_rate": 1.2154979910670033e-06, + "loss": 0.055, + "step": 3723 + }, + { + "epoch": 0.7805491511213583, + "grad_norm": 0.016636742278933525, + "learning_rate": 1.2132802274959082e-06, + "loss": 0.0527, + "step": 3724 + }, + { + "epoch": 0.7807587507859988, + "grad_norm": 0.017774462699890137, + "learning_rate": 1.2110642096018421e-06, + "loss": 0.0556, + "step": 3725 + }, + { + "epoch": 0.7809683504506393, + "grad_norm": 0.01745484583079815, + "learning_rate": 1.2088499384063868e-06, + "loss": 0.0521, + "step": 3726 + }, + { + "epoch": 0.7811779501152798, + "grad_norm": 0.018969282507896423, + "learning_rate": 1.2066374149303234e-06, + "loss": 0.0524, + "step": 3727 + }, + { + "epoch": 0.7813875497799203, + "grad_norm": 0.0174677986651659, + "learning_rate": 1.2044266401936228e-06, + "loss": 0.0541, + "step": 3728 + }, + { + "epoch": 0.7815971494445609, + "grad_norm": 0.01889014057815075, + "learning_rate": 1.2022176152154525e-06, + "loss": 0.0532, + "step": 3729 + }, + { + "epoch": 0.7818067491092014, + "grad_norm": 0.020782019942998886, + "learning_rate": 1.200010341014174e-06, + "loss": 0.0517, + "step": 3730 + }, + { + "epoch": 0.782016348773842, + "grad_norm": 0.017878524959087372, + "learning_rate": 1.1978048186073388e-06, + "loss": 0.0548, + "step": 3731 + }, + { + "epoch": 0.7822259484384825, + "grad_norm": 0.01691311225295067, + "learning_rate": 1.1956010490116915e-06, + "loss": 0.0516, + "step": 3732 + }, + { + "epoch": 0.782435548103123, + "grad_norm": 0.014464441686868668, + "learning_rate": 1.1933990332431699e-06, + "loss": 0.055, + "step": 3733 + }, + { + "epoch": 0.7826451477677636, + "grad_norm": 0.02182583324611187, + "learning_rate": 1.191198772316905e-06, + "loss": 0.0515, + "step": 3734 + }, + { + "epoch": 0.7828547474324041, + "grad_norm": 0.01545246597379446, + "learning_rate": 1.1890002672472133e-06, + "loss": 0.0551, + "step": 3735 + }, + { + "epoch": 0.7830643470970446, + "grad_norm": 0.022399727255105972, + "learning_rate": 1.1868035190476085e-06, + "loss": 0.0572, + "step": 3736 + }, + { + "epoch": 0.7832739467616852, + "grad_norm": 0.01581631414592266, + "learning_rate": 1.184608528730789e-06, + "loss": 0.0539, + "step": 3737 + }, + { + "epoch": 0.7834835464263257, + "grad_norm": 0.022165510803461075, + "learning_rate": 1.1824152973086444e-06, + "loss": 0.0579, + "step": 3738 + }, + { + "epoch": 0.7836931460909663, + "grad_norm": 0.018284514546394348, + "learning_rate": 1.1802238257922543e-06, + "loss": 0.056, + "step": 3739 + }, + { + "epoch": 0.7839027457556068, + "grad_norm": 0.017083924263715744, + "learning_rate": 1.1780341151918883e-06, + "loss": 0.0517, + "step": 3740 + }, + { + "epoch": 0.7841123454202473, + "grad_norm": 0.01769246533513069, + "learning_rate": 1.1758461665170001e-06, + "loss": 0.0558, + "step": 3741 + }, + { + "epoch": 0.7843219450848878, + "grad_norm": 0.018215294927358627, + "learning_rate": 1.1736599807762366e-06, + "loss": 0.0544, + "step": 3742 + }, + { + "epoch": 0.7845315447495284, + "grad_norm": 0.018238717690110207, + "learning_rate": 1.1714755589774252e-06, + "loss": 0.0523, + "step": 3743 + }, + { + "epoch": 0.784741144414169, + "grad_norm": 0.01696399226784706, + "learning_rate": 1.1692929021275874e-06, + "loss": 0.0542, + "step": 3744 + }, + { + "epoch": 0.7849507440788095, + "grad_norm": 0.020423993468284607, + "learning_rate": 1.1671120112329248e-06, + "loss": 0.0547, + "step": 3745 + }, + { + "epoch": 0.78516034374345, + "grad_norm": 0.016111532226204872, + "learning_rate": 1.1649328872988286e-06, + "loss": 0.0546, + "step": 3746 + }, + { + "epoch": 0.7853699434080905, + "grad_norm": 0.023867281153798103, + "learning_rate": 1.1627555313298777e-06, + "loss": 0.0544, + "step": 3747 + }, + { + "epoch": 0.785579543072731, + "grad_norm": 0.017127446830272675, + "learning_rate": 1.160579944329827e-06, + "loss": 0.0551, + "step": 3748 + }, + { + "epoch": 0.7857891427373717, + "grad_norm": 0.0323978066444397, + "learning_rate": 1.1584061273016245e-06, + "loss": 0.0584, + "step": 3749 + }, + { + "epoch": 0.7859987424020122, + "grad_norm": 0.015841031447052956, + "learning_rate": 1.1562340812474004e-06, + "loss": 0.0511, + "step": 3750 + }, + { + "epoch": 0.7862083420666527, + "grad_norm": 0.03065073862671852, + "learning_rate": 1.154063807168465e-06, + "loss": 0.0526, + "step": 3751 + }, + { + "epoch": 0.7864179417312932, + "grad_norm": 0.02133885584771633, + "learning_rate": 1.1518953060653177e-06, + "loss": 0.054, + "step": 3752 + }, + { + "epoch": 0.7866275413959337, + "grad_norm": 0.02793465554714203, + "learning_rate": 1.1497285789376327e-06, + "loss": 0.0542, + "step": 3753 + }, + { + "epoch": 0.7868371410605743, + "grad_norm": 0.024401402100920677, + "learning_rate": 1.1475636267842754e-06, + "loss": 0.0541, + "step": 3754 + }, + { + "epoch": 0.7870467407252149, + "grad_norm": 0.02825610339641571, + "learning_rate": 1.145400450603284e-06, + "loss": 0.053, + "step": 3755 + }, + { + "epoch": 0.7872563403898554, + "grad_norm": 0.031988829374313354, + "learning_rate": 1.143239051391884e-06, + "loss": 0.0519, + "step": 3756 + }, + { + "epoch": 0.7874659400544959, + "grad_norm": 0.01913175918161869, + "learning_rate": 1.1410794301464817e-06, + "loss": 0.0559, + "step": 3757 + }, + { + "epoch": 0.7876755397191364, + "grad_norm": 0.02814309112727642, + "learning_rate": 1.1389215878626608e-06, + "loss": 0.0569, + "step": 3758 + }, + { + "epoch": 0.787885139383777, + "grad_norm": 0.019799359142780304, + "learning_rate": 1.1367655255351845e-06, + "loss": 0.0542, + "step": 3759 + }, + { + "epoch": 0.7880947390484175, + "grad_norm": 0.023299338296055794, + "learning_rate": 1.1346112441579998e-06, + "loss": 0.0548, + "step": 3760 + }, + { + "epoch": 0.788304338713058, + "grad_norm": 0.022608846426010132, + "learning_rate": 1.132458744724227e-06, + "loss": 0.0527, + "step": 3761 + }, + { + "epoch": 0.7885139383776986, + "grad_norm": 0.03028971515595913, + "learning_rate": 1.1303080282261698e-06, + "loss": 0.0561, + "step": 3762 + }, + { + "epoch": 0.7887235380423391, + "grad_norm": 0.019988486543297768, + "learning_rate": 1.128159095655309e-06, + "loss": 0.0526, + "step": 3763 + }, + { + "epoch": 0.7889331377069797, + "grad_norm": 0.036664996296167374, + "learning_rate": 1.1260119480023008e-06, + "loss": 0.0525, + "step": 3764 + }, + { + "epoch": 0.7891427373716202, + "grad_norm": 0.02409091591835022, + "learning_rate": 1.1238665862569786e-06, + "loss": 0.0527, + "step": 3765 + }, + { + "epoch": 0.7893523370362607, + "grad_norm": 0.02567540481686592, + "learning_rate": 1.121723011408355e-06, + "loss": 0.054, + "step": 3766 + }, + { + "epoch": 0.7895619367009012, + "grad_norm": 0.028370145708322525, + "learning_rate": 1.1195812244446185e-06, + "loss": 0.0561, + "step": 3767 + }, + { + "epoch": 0.7897715363655418, + "grad_norm": 0.015024359337985516, + "learning_rate": 1.117441226353131e-06, + "loss": 0.0532, + "step": 3768 + }, + { + "epoch": 0.7899811360301824, + "grad_norm": 0.02175975777208805, + "learning_rate": 1.115303018120432e-06, + "loss": 0.0534, + "step": 3769 + }, + { + "epoch": 0.7901907356948229, + "grad_norm": 0.01616404764354229, + "learning_rate": 1.1131666007322356e-06, + "loss": 0.0545, + "step": 3770 + }, + { + "epoch": 0.7904003353594634, + "grad_norm": 0.01787029393017292, + "learning_rate": 1.1110319751734271e-06, + "loss": 0.0541, + "step": 3771 + }, + { + "epoch": 0.7906099350241039, + "grad_norm": 0.015742896124720573, + "learning_rate": 1.1088991424280705e-06, + "loss": 0.0542, + "step": 3772 + }, + { + "epoch": 0.7908195346887446, + "grad_norm": 0.018519075587391853, + "learning_rate": 1.106768103479402e-06, + "loss": 0.0527, + "step": 3773 + }, + { + "epoch": 0.7910291343533851, + "grad_norm": 0.012830471619963646, + "learning_rate": 1.1046388593098284e-06, + "loss": 0.0523, + "step": 3774 + }, + { + "epoch": 0.7912387340180256, + "grad_norm": 0.018937064334750175, + "learning_rate": 1.1025114109009321e-06, + "loss": 0.0537, + "step": 3775 + }, + { + "epoch": 0.7914483336826661, + "grad_norm": 0.016385626047849655, + "learning_rate": 1.100385759233465e-06, + "loss": 0.0537, + "step": 3776 + }, + { + "epoch": 0.7916579333473066, + "grad_norm": 0.01458564680069685, + "learning_rate": 1.098261905287354e-06, + "loss": 0.0535, + "step": 3777 + }, + { + "epoch": 0.7918675330119472, + "grad_norm": 0.01804785430431366, + "learning_rate": 1.0961398500416926e-06, + "loss": 0.0504, + "step": 3778 + }, + { + "epoch": 0.7920771326765877, + "grad_norm": 0.015284632332623005, + "learning_rate": 1.0940195944747494e-06, + "loss": 0.0537, + "step": 3779 + }, + { + "epoch": 0.7922867323412283, + "grad_norm": 0.013636418618261814, + "learning_rate": 1.091901139563964e-06, + "loss": 0.0531, + "step": 3780 + }, + { + "epoch": 0.7924963320058688, + "grad_norm": 0.013568844646215439, + "learning_rate": 1.0897844862859407e-06, + "loss": 0.0519, + "step": 3781 + }, + { + "epoch": 0.7927059316705093, + "grad_norm": 0.015443854033946991, + "learning_rate": 1.0876696356164556e-06, + "loss": 0.0566, + "step": 3782 + }, + { + "epoch": 0.7929155313351499, + "grad_norm": 0.013935340568423271, + "learning_rate": 1.0855565885304575e-06, + "loss": 0.0503, + "step": 3783 + }, + { + "epoch": 0.7931251309997904, + "grad_norm": 0.01774810254573822, + "learning_rate": 1.0834453460020577e-06, + "loss": 0.0548, + "step": 3784 + }, + { + "epoch": 0.7933347306644309, + "grad_norm": 0.01960287056863308, + "learning_rate": 1.0813359090045412e-06, + "loss": 0.0535, + "step": 3785 + }, + { + "epoch": 0.7935443303290715, + "grad_norm": 0.014412653632462025, + "learning_rate": 1.0792282785103565e-06, + "loss": 0.0538, + "step": 3786 + }, + { + "epoch": 0.793753929993712, + "grad_norm": 0.01678672805428505, + "learning_rate": 1.0771224554911197e-06, + "loss": 0.0539, + "step": 3787 + }, + { + "epoch": 0.7939635296583526, + "grad_norm": 0.017591232433915138, + "learning_rate": 1.0750184409176156e-06, + "loss": 0.0577, + "step": 3788 + }, + { + "epoch": 0.7941731293229931, + "grad_norm": 0.025450995191931725, + "learning_rate": 1.0729162357597956e-06, + "loss": 0.0556, + "step": 3789 + }, + { + "epoch": 0.7943827289876336, + "grad_norm": 0.01504075713455677, + "learning_rate": 1.0708158409867763e-06, + "loss": 0.0564, + "step": 3790 + }, + { + "epoch": 0.7945923286522741, + "grad_norm": 0.029961617663502693, + "learning_rate": 1.0687172575668381e-06, + "loss": 0.0532, + "step": 3791 + }, + { + "epoch": 0.7948019283169147, + "grad_norm": 0.01688455045223236, + "learning_rate": 1.0666204864674263e-06, + "loss": 0.0561, + "step": 3792 + }, + { + "epoch": 0.7950115279815553, + "grad_norm": 0.022862810641527176, + "learning_rate": 1.0645255286551548e-06, + "loss": 0.0555, + "step": 3793 + }, + { + "epoch": 0.7952211276461958, + "grad_norm": 0.016880135983228683, + "learning_rate": 1.0624323850957952e-06, + "loss": 0.0528, + "step": 3794 + }, + { + "epoch": 0.7954307273108363, + "grad_norm": 0.02419593743979931, + "learning_rate": 1.0603410567542882e-06, + "loss": 0.054, + "step": 3795 + }, + { + "epoch": 0.7956403269754768, + "grad_norm": 0.02624059095978737, + "learning_rate": 1.0582515445947377e-06, + "loss": 0.0534, + "step": 3796 + }, + { + "epoch": 0.7958499266401173, + "grad_norm": 0.013864974491298199, + "learning_rate": 1.056163849580406e-06, + "loss": 0.0516, + "step": 3797 + }, + { + "epoch": 0.796059526304758, + "grad_norm": 0.018372228369116783, + "learning_rate": 1.0540779726737187e-06, + "loss": 0.0544, + "step": 3798 + }, + { + "epoch": 0.7962691259693985, + "grad_norm": 0.017514994367957115, + "learning_rate": 1.0519939148362667e-06, + "loss": 0.0556, + "step": 3799 + }, + { + "epoch": 0.796478725634039, + "grad_norm": 0.026039427146315575, + "learning_rate": 1.0499116770288015e-06, + "loss": 0.0531, + "step": 3800 + }, + { + "epoch": 0.7966883252986795, + "grad_norm": 0.019942620769143105, + "learning_rate": 1.0478312602112312e-06, + "loss": 0.0527, + "step": 3801 + }, + { + "epoch": 0.79689792496332, + "grad_norm": 0.03383920341730118, + "learning_rate": 1.0457526653426303e-06, + "loss": 0.0536, + "step": 3802 + }, + { + "epoch": 0.7971075246279606, + "grad_norm": 0.018814336508512497, + "learning_rate": 1.0436758933812292e-06, + "loss": 0.0547, + "step": 3803 + }, + { + "epoch": 0.7973171242926012, + "grad_norm": 0.035695578902959824, + "learning_rate": 1.0416009452844178e-06, + "loss": 0.0525, + "step": 3804 + }, + { + "epoch": 0.7975267239572417, + "grad_norm": 0.019596830010414124, + "learning_rate": 1.039527822008749e-06, + "loss": 0.0549, + "step": 3805 + }, + { + "epoch": 0.7977363236218822, + "grad_norm": 0.030918627977371216, + "learning_rate": 1.0374565245099328e-06, + "loss": 0.0572, + "step": 3806 + }, + { + "epoch": 0.7979459232865227, + "grad_norm": 0.020429056137800217, + "learning_rate": 1.035387053742834e-06, + "loss": 0.0537, + "step": 3807 + }, + { + "epoch": 0.7981555229511633, + "grad_norm": 0.026732752099633217, + "learning_rate": 1.0333194106614813e-06, + "loss": 0.0556, + "step": 3808 + }, + { + "epoch": 0.7983651226158038, + "grad_norm": 0.01719031110405922, + "learning_rate": 1.0312535962190567e-06, + "loss": 0.0563, + "step": 3809 + }, + { + "epoch": 0.7985747222804443, + "grad_norm": 0.03157135099172592, + "learning_rate": 1.0291896113678983e-06, + "loss": 0.0525, + "step": 3810 + }, + { + "epoch": 0.7987843219450849, + "grad_norm": 0.02437172830104828, + "learning_rate": 1.0271274570595041e-06, + "loss": 0.054, + "step": 3811 + }, + { + "epoch": 0.7989939216097254, + "grad_norm": 0.017890898510813713, + "learning_rate": 1.0250671342445273e-06, + "loss": 0.0557, + "step": 3812 + }, + { + "epoch": 0.799203521274366, + "grad_norm": 0.022037314251065254, + "learning_rate": 1.0230086438727771e-06, + "loss": 0.0532, + "step": 3813 + }, + { + "epoch": 0.7994131209390065, + "grad_norm": 0.014611390419304371, + "learning_rate": 1.020951986893216e-06, + "loss": 0.056, + "step": 3814 + }, + { + "epoch": 0.799622720603647, + "grad_norm": 0.02562752366065979, + "learning_rate": 1.0188971642539614e-06, + "loss": 0.0536, + "step": 3815 + }, + { + "epoch": 0.7998323202682875, + "grad_norm": 0.015589050948619843, + "learning_rate": 1.016844176902288e-06, + "loss": 0.0522, + "step": 3816 + }, + { + "epoch": 0.8000419199329281, + "grad_norm": 0.030634846538305283, + "learning_rate": 1.0147930257846206e-06, + "loss": 0.0525, + "step": 3817 + }, + { + "epoch": 0.8002515195975687, + "grad_norm": 0.018007168546319008, + "learning_rate": 1.0127437118465405e-06, + "loss": 0.0567, + "step": 3818 + }, + { + "epoch": 0.8004611192622092, + "grad_norm": 0.02537359483540058, + "learning_rate": 1.0106962360327832e-06, + "loss": 0.0564, + "step": 3819 + }, + { + "epoch": 0.8006707189268497, + "grad_norm": 0.015520203858613968, + "learning_rate": 1.0086505992872304e-06, + "loss": 0.0566, + "step": 3820 + }, + { + "epoch": 0.8008803185914902, + "grad_norm": 0.02692110277712345, + "learning_rate": 1.0066068025529219e-06, + "loss": 0.0548, + "step": 3821 + }, + { + "epoch": 0.8010899182561307, + "grad_norm": 0.021093720570206642, + "learning_rate": 1.0045648467720492e-06, + "loss": 0.0524, + "step": 3822 + }, + { + "epoch": 0.8012995179207714, + "grad_norm": 0.01891363225877285, + "learning_rate": 1.002524732885951e-06, + "loss": 0.053, + "step": 3823 + }, + { + "epoch": 0.8015091175854119, + "grad_norm": 0.016529306769371033, + "learning_rate": 1.0004864618351223e-06, + "loss": 0.054, + "step": 3824 + }, + { + "epoch": 0.8017187172500524, + "grad_norm": 0.01724756881594658, + "learning_rate": 9.984500345592023e-07, + "loss": 0.0582, + "step": 3825 + }, + { + "epoch": 0.8019283169146929, + "grad_norm": 0.014622312039136887, + "learning_rate": 9.964154519969865e-07, + "loss": 0.0617, + "step": 3826 + }, + { + "epoch": 0.8021379165793334, + "grad_norm": 0.016424983739852905, + "learning_rate": 9.943827150864143e-07, + "loss": 0.0527, + "step": 3827 + }, + { + "epoch": 0.802347516243974, + "grad_norm": 0.016418717801570892, + "learning_rate": 9.923518247645785e-07, + "loss": 0.0525, + "step": 3828 + }, + { + "epoch": 0.8025571159086146, + "grad_norm": 0.01681547239422798, + "learning_rate": 9.903227819677203e-07, + "loss": 0.0546, + "step": 3829 + }, + { + "epoch": 0.8027667155732551, + "grad_norm": 0.016537809744477272, + "learning_rate": 9.882955876312266e-07, + "loss": 0.052, + "step": 3830 + }, + { + "epoch": 0.8029763152378956, + "grad_norm": 0.018042949959635735, + "learning_rate": 9.862702426896327e-07, + "loss": 0.0549, + "step": 3831 + }, + { + "epoch": 0.8031859149025361, + "grad_norm": 0.019126849249005318, + "learning_rate": 9.842467480766243e-07, + "loss": 0.0537, + "step": 3832 + }, + { + "epoch": 0.8033955145671767, + "grad_norm": 0.01958855427801609, + "learning_rate": 9.822251047250298e-07, + "loss": 0.0544, + "step": 3833 + }, + { + "epoch": 0.8036051142318172, + "grad_norm": 0.017055636271834373, + "learning_rate": 9.80205313566827e-07, + "loss": 0.0512, + "step": 3834 + }, + { + "epoch": 0.8038147138964578, + "grad_norm": 0.0194768775254488, + "learning_rate": 9.781873755331412e-07, + "loss": 0.0541, + "step": 3835 + }, + { + "epoch": 0.8040243135610983, + "grad_norm": 0.015344620682299137, + "learning_rate": 9.7617129155424e-07, + "loss": 0.0538, + "step": 3836 + }, + { + "epoch": 0.8042339132257388, + "grad_norm": 0.016386136412620544, + "learning_rate": 9.74157062559536e-07, + "loss": 0.0537, + "step": 3837 + }, + { + "epoch": 0.8044435128903794, + "grad_norm": 0.016445597633719444, + "learning_rate": 9.7214468947759e-07, + "loss": 0.0538, + "step": 3838 + }, + { + "epoch": 0.8046531125550199, + "grad_norm": 0.017396828159689903, + "learning_rate": 9.701341732361068e-07, + "loss": 0.0533, + "step": 3839 + }, + { + "epoch": 0.8048627122196604, + "grad_norm": 0.018001766875386238, + "learning_rate": 9.681255147619317e-07, + "loss": 0.0519, + "step": 3840 + }, + { + "epoch": 0.805072311884301, + "grad_norm": 0.014202550053596497, + "learning_rate": 9.66118714981058e-07, + "loss": 0.0547, + "step": 3841 + }, + { + "epoch": 0.8052819115489416, + "grad_norm": 0.01720808818936348, + "learning_rate": 9.641137748186186e-07, + "loss": 0.0543, + "step": 3842 + }, + { + "epoch": 0.8054915112135821, + "grad_norm": 0.012487749569118023, + "learning_rate": 9.62110695198889e-07, + "loss": 0.0572, + "step": 3843 + }, + { + "epoch": 0.8057011108782226, + "grad_norm": 0.014534911140799522, + "learning_rate": 9.601094770452907e-07, + "loss": 0.0555, + "step": 3844 + }, + { + "epoch": 0.8059107105428631, + "grad_norm": 0.012713300064206123, + "learning_rate": 9.581101212803857e-07, + "loss": 0.0567, + "step": 3845 + }, + { + "epoch": 0.8061203102075036, + "grad_norm": 0.01954994909465313, + "learning_rate": 9.561126288258738e-07, + "loss": 0.0562, + "step": 3846 + }, + { + "epoch": 0.8063299098721443, + "grad_norm": 0.01227479986846447, + "learning_rate": 9.541170006026012e-07, + "loss": 0.0526, + "step": 3847 + }, + { + "epoch": 0.8065395095367848, + "grad_norm": 0.013360395096242428, + "learning_rate": 9.521232375305494e-07, + "loss": 0.0531, + "step": 3848 + }, + { + "epoch": 0.8067491092014253, + "grad_norm": 0.012572353705763817, + "learning_rate": 9.50131340528846e-07, + "loss": 0.0548, + "step": 3849 + }, + { + "epoch": 0.8069587088660658, + "grad_norm": 0.012260310351848602, + "learning_rate": 9.481413105157517e-07, + "loss": 0.0508, + "step": 3850 + }, + { + "epoch": 0.8071683085307063, + "grad_norm": 0.012266767211258411, + "learning_rate": 9.461531484086722e-07, + "loss": 0.0564, + "step": 3851 + }, + { + "epoch": 0.8073779081953469, + "grad_norm": 0.013467447832226753, + "learning_rate": 9.441668551241511e-07, + "loss": 0.0564, + "step": 3852 + }, + { + "epoch": 0.8075875078599875, + "grad_norm": 0.013181211426854134, + "learning_rate": 9.421824315778649e-07, + "loss": 0.0548, + "step": 3853 + }, + { + "epoch": 0.807797107524628, + "grad_norm": 0.014856358990073204, + "learning_rate": 9.401998786846356e-07, + "loss": 0.0543, + "step": 3854 + }, + { + "epoch": 0.8080067071892685, + "grad_norm": 0.013289416208863258, + "learning_rate": 9.382191973584193e-07, + "loss": 0.0524, + "step": 3855 + }, + { + "epoch": 0.808216306853909, + "grad_norm": 0.017398502677679062, + "learning_rate": 9.362403885123084e-07, + "loss": 0.0515, + "step": 3856 + }, + { + "epoch": 0.8084259065185496, + "grad_norm": 0.012360899709165096, + "learning_rate": 9.342634530585354e-07, + "loss": 0.0531, + "step": 3857 + }, + { + "epoch": 0.8086355061831901, + "grad_norm": 0.014594539068639278, + "learning_rate": 9.322883919084652e-07, + "loss": 0.0537, + "step": 3858 + }, + { + "epoch": 0.8088451058478306, + "grad_norm": 0.014027457684278488, + "learning_rate": 9.303152059726023e-07, + "loss": 0.0545, + "step": 3859 + }, + { + "epoch": 0.8090547055124712, + "grad_norm": 0.015334793366491795, + "learning_rate": 9.283438961605829e-07, + "loss": 0.0515, + "step": 3860 + }, + { + "epoch": 0.8092643051771117, + "grad_norm": 0.018460331484675407, + "learning_rate": 9.263744633811816e-07, + "loss": 0.0543, + "step": 3861 + }, + { + "epoch": 0.8094739048417523, + "grad_norm": 0.017743943259119987, + "learning_rate": 9.244069085423074e-07, + "loss": 0.0559, + "step": 3862 + }, + { + "epoch": 0.8096835045063928, + "grad_norm": 0.014662131667137146, + "learning_rate": 9.224412325510024e-07, + "loss": 0.055, + "step": 3863 + }, + { + "epoch": 0.8098931041710333, + "grad_norm": 0.016809482127428055, + "learning_rate": 9.204774363134405e-07, + "loss": 0.0564, + "step": 3864 + }, + { + "epoch": 0.8101027038356738, + "grad_norm": 0.022974979132413864, + "learning_rate": 9.185155207349344e-07, + "loss": 0.0545, + "step": 3865 + }, + { + "epoch": 0.8103123035003144, + "grad_norm": 0.018833018839359283, + "learning_rate": 9.165554867199245e-07, + "loss": 0.0561, + "step": 3866 + }, + { + "epoch": 0.810521903164955, + "grad_norm": 0.021687567234039307, + "learning_rate": 9.145973351719867e-07, + "loss": 0.0569, + "step": 3867 + }, + { + "epoch": 0.8107315028295955, + "grad_norm": 0.016966206952929497, + "learning_rate": 9.126410669938302e-07, + "loss": 0.0518, + "step": 3868 + }, + { + "epoch": 0.810941102494236, + "grad_norm": 0.023966066539287567, + "learning_rate": 9.106866830872929e-07, + "loss": 0.0569, + "step": 3869 + }, + { + "epoch": 0.8111507021588765, + "grad_norm": 0.016374409198760986, + "learning_rate": 9.087341843533437e-07, + "loss": 0.0552, + "step": 3870 + }, + { + "epoch": 0.811360301823517, + "grad_norm": 0.025269117206335068, + "learning_rate": 9.067835716920859e-07, + "loss": 0.0551, + "step": 3871 + }, + { + "epoch": 0.8115699014881577, + "grad_norm": 0.01591191627085209, + "learning_rate": 9.048348460027528e-07, + "loss": 0.0532, + "step": 3872 + }, + { + "epoch": 0.8117795011527982, + "grad_norm": 0.020468199625611305, + "learning_rate": 9.028880081837032e-07, + "loss": 0.0539, + "step": 3873 + }, + { + "epoch": 0.8119891008174387, + "grad_norm": 0.0173234511166811, + "learning_rate": 9.009430591324325e-07, + "loss": 0.0537, + "step": 3874 + }, + { + "epoch": 0.8121987004820792, + "grad_norm": 0.015647042542696, + "learning_rate": 8.989999997455601e-07, + "loss": 0.0563, + "step": 3875 + }, + { + "epoch": 0.8124083001467197, + "grad_norm": 0.022868547588586807, + "learning_rate": 8.970588309188343e-07, + "loss": 0.0527, + "step": 3876 + }, + { + "epoch": 0.8126178998113603, + "grad_norm": 0.018496761098504066, + "learning_rate": 8.951195535471357e-07, + "loss": 0.0546, + "step": 3877 + }, + { + "epoch": 0.8128274994760009, + "grad_norm": 0.023296529427170753, + "learning_rate": 8.931821685244712e-07, + "loss": 0.0546, + "step": 3878 + }, + { + "epoch": 0.8130370991406414, + "grad_norm": 0.019333289936184883, + "learning_rate": 8.912466767439726e-07, + "loss": 0.0532, + "step": 3879 + }, + { + "epoch": 0.8132466988052819, + "grad_norm": 0.021376557648181915, + "learning_rate": 8.893130790979038e-07, + "loss": 0.0532, + "step": 3880 + }, + { + "epoch": 0.8134562984699224, + "grad_norm": 0.015562393702566624, + "learning_rate": 8.873813764776506e-07, + "loss": 0.0536, + "step": 3881 + }, + { + "epoch": 0.813665898134563, + "grad_norm": 0.026157313957810402, + "learning_rate": 8.854515697737298e-07, + "loss": 0.0531, + "step": 3882 + }, + { + "epoch": 0.8138754977992035, + "grad_norm": 0.017569879069924355, + "learning_rate": 8.835236598757796e-07, + "loss": 0.0528, + "step": 3883 + }, + { + "epoch": 0.814085097463844, + "grad_norm": 0.021584592759609222, + "learning_rate": 8.815976476725668e-07, + "loss": 0.053, + "step": 3884 + }, + { + "epoch": 0.8142946971284846, + "grad_norm": 0.01839163526892662, + "learning_rate": 8.796735340519847e-07, + "loss": 0.0539, + "step": 3885 + }, + { + "epoch": 0.8145042967931251, + "grad_norm": 0.01819518581032753, + "learning_rate": 8.777513199010468e-07, + "loss": 0.0558, + "step": 3886 + }, + { + "epoch": 0.8147138964577657, + "grad_norm": 0.01771095208823681, + "learning_rate": 8.758310061058934e-07, + "loss": 0.0527, + "step": 3887 + }, + { + "epoch": 0.8149234961224062, + "grad_norm": 0.01995546743273735, + "learning_rate": 8.739125935517906e-07, + "loss": 0.0531, + "step": 3888 + }, + { + "epoch": 0.8151330957870467, + "grad_norm": 0.016256198287010193, + "learning_rate": 8.719960831231239e-07, + "loss": 0.0523, + "step": 3889 + }, + { + "epoch": 0.8153426954516872, + "grad_norm": 0.020824657753109932, + "learning_rate": 8.70081475703406e-07, + "loss": 0.055, + "step": 3890 + }, + { + "epoch": 0.8155522951163278, + "grad_norm": 0.0172143392264843, + "learning_rate": 8.681687721752719e-07, + "loss": 0.0564, + "step": 3891 + }, + { + "epoch": 0.8157618947809684, + "grad_norm": 0.022133000195026398, + "learning_rate": 8.66257973420473e-07, + "loss": 0.053, + "step": 3892 + }, + { + "epoch": 0.8159714944456089, + "grad_norm": 0.01741599105298519, + "learning_rate": 8.643490803198895e-07, + "loss": 0.0574, + "step": 3893 + }, + { + "epoch": 0.8161810941102494, + "grad_norm": 0.01774427853524685, + "learning_rate": 8.62442093753521e-07, + "loss": 0.0556, + "step": 3894 + }, + { + "epoch": 0.8163906937748899, + "grad_norm": 0.019820379093289375, + "learning_rate": 8.605370146004894e-07, + "loss": 0.0536, + "step": 3895 + }, + { + "epoch": 0.8166002934395304, + "grad_norm": 0.017903191968798637, + "learning_rate": 8.58633843739034e-07, + "loss": 0.056, + "step": 3896 + }, + { + "epoch": 0.8168098931041711, + "grad_norm": 0.02086867205798626, + "learning_rate": 8.567325820465156e-07, + "loss": 0.053, + "step": 3897 + }, + { + "epoch": 0.8170194927688116, + "grad_norm": 0.02020520158112049, + "learning_rate": 8.548332303994167e-07, + "loss": 0.055, + "step": 3898 + }, + { + "epoch": 0.8172290924334521, + "grad_norm": 0.015635685995221138, + "learning_rate": 8.52935789673337e-07, + "loss": 0.0539, + "step": 3899 + }, + { + "epoch": 0.8174386920980926, + "grad_norm": 0.02270682342350483, + "learning_rate": 8.510402607429963e-07, + "loss": 0.056, + "step": 3900 + }, + { + "epoch": 0.8176482917627331, + "grad_norm": 0.014987417496740818, + "learning_rate": 8.491466444822355e-07, + "loss": 0.0546, + "step": 3901 + }, + { + "epoch": 0.8178578914273738, + "grad_norm": 0.0187685564160347, + "learning_rate": 8.472549417640092e-07, + "loss": 0.0521, + "step": 3902 + }, + { + "epoch": 0.8180674910920143, + "grad_norm": 0.01810646429657936, + "learning_rate": 8.453651534603901e-07, + "loss": 0.0542, + "step": 3903 + }, + { + "epoch": 0.8182770907566548, + "grad_norm": 0.01981244422495365, + "learning_rate": 8.434772804425734e-07, + "loss": 0.0551, + "step": 3904 + }, + { + "epoch": 0.8184866904212953, + "grad_norm": 0.017939042299985886, + "learning_rate": 8.415913235808675e-07, + "loss": 0.0553, + "step": 3905 + }, + { + "epoch": 0.8186962900859358, + "grad_norm": 0.02355090342462063, + "learning_rate": 8.397072837446968e-07, + "loss": 0.052, + "step": 3906 + }, + { + "epoch": 0.8189058897505764, + "grad_norm": 0.01684572361409664, + "learning_rate": 8.378251618026051e-07, + "loss": 0.0561, + "step": 3907 + }, + { + "epoch": 0.819115489415217, + "grad_norm": 0.024392616003751755, + "learning_rate": 8.35944958622249e-07, + "loss": 0.0536, + "step": 3908 + }, + { + "epoch": 0.8193250890798575, + "grad_norm": 0.01632305234670639, + "learning_rate": 8.340666750704013e-07, + "loss": 0.0538, + "step": 3909 + }, + { + "epoch": 0.819534688744498, + "grad_norm": 0.023946644738316536, + "learning_rate": 8.32190312012951e-07, + "loss": 0.0542, + "step": 3910 + }, + { + "epoch": 0.8197442884091386, + "grad_norm": 0.016722572967410088, + "learning_rate": 8.303158703149023e-07, + "loss": 0.0546, + "step": 3911 + }, + { + "epoch": 0.8199538880737791, + "grad_norm": 0.02079438418149948, + "learning_rate": 8.2844335084037e-07, + "loss": 0.0552, + "step": 3912 + }, + { + "epoch": 0.8201634877384196, + "grad_norm": 0.016205811873078346, + "learning_rate": 8.265727544525876e-07, + "loss": 0.0574, + "step": 3913 + }, + { + "epoch": 0.8203730874030601, + "grad_norm": 0.020107150077819824, + "learning_rate": 8.247040820138985e-07, + "loss": 0.0529, + "step": 3914 + }, + { + "epoch": 0.8205826870677007, + "grad_norm": 0.02017829939723015, + "learning_rate": 8.228373343857593e-07, + "loss": 0.0559, + "step": 3915 + }, + { + "epoch": 0.8207922867323413, + "grad_norm": 0.015335801057517529, + "learning_rate": 8.20972512428741e-07, + "loss": 0.0539, + "step": 3916 + }, + { + "epoch": 0.8210018863969818, + "grad_norm": 0.01876046508550644, + "learning_rate": 8.191096170025265e-07, + "loss": 0.0531, + "step": 3917 + }, + { + "epoch": 0.8212114860616223, + "grad_norm": 0.022027568891644478, + "learning_rate": 8.172486489659115e-07, + "loss": 0.0528, + "step": 3918 + }, + { + "epoch": 0.8214210857262628, + "grad_norm": 0.013593902811408043, + "learning_rate": 8.153896091768004e-07, + "loss": 0.056, + "step": 3919 + }, + { + "epoch": 0.8216306853909033, + "grad_norm": 0.02417590655386448, + "learning_rate": 8.135324984922088e-07, + "loss": 0.0555, + "step": 3920 + }, + { + "epoch": 0.821840285055544, + "grad_norm": 0.016407081857323647, + "learning_rate": 8.116773177682674e-07, + "loss": 0.0532, + "step": 3921 + }, + { + "epoch": 0.8220498847201845, + "grad_norm": 0.021131981164216995, + "learning_rate": 8.09824067860211e-07, + "loss": 0.0534, + "step": 3922 + }, + { + "epoch": 0.822259484384825, + "grad_norm": 0.01895805634558201, + "learning_rate": 8.079727496223894e-07, + "loss": 0.0551, + "step": 3923 + }, + { + "epoch": 0.8224690840494655, + "grad_norm": 0.027057627215981483, + "learning_rate": 8.061233639082616e-07, + "loss": 0.0553, + "step": 3924 + }, + { + "epoch": 0.822678683714106, + "grad_norm": 0.016223613172769547, + "learning_rate": 8.042759115703891e-07, + "loss": 0.0534, + "step": 3925 + }, + { + "epoch": 0.8228882833787466, + "grad_norm": 0.030798260122537613, + "learning_rate": 8.024303934604505e-07, + "loss": 0.0537, + "step": 3926 + }, + { + "epoch": 0.8230978830433872, + "grad_norm": 0.01324604731053114, + "learning_rate": 8.005868104292291e-07, + "loss": 0.0519, + "step": 3927 + }, + { + "epoch": 0.8233074827080277, + "grad_norm": 0.023022161796689034, + "learning_rate": 7.987451633266153e-07, + "loss": 0.0524, + "step": 3928 + }, + { + "epoch": 0.8235170823726682, + "grad_norm": 0.015440089628100395, + "learning_rate": 7.969054530016091e-07, + "loss": 0.0538, + "step": 3929 + }, + { + "epoch": 0.8237266820373087, + "grad_norm": 0.023025307804346085, + "learning_rate": 7.950676803023149e-07, + "loss": 0.0521, + "step": 3930 + }, + { + "epoch": 0.8239362817019493, + "grad_norm": 0.01681152544915676, + "learning_rate": 7.93231846075948e-07, + "loss": 0.0525, + "step": 3931 + }, + { + "epoch": 0.8241458813665898, + "grad_norm": 0.016274144873023033, + "learning_rate": 7.913979511688252e-07, + "loss": 0.0558, + "step": 3932 + }, + { + "epoch": 0.8243554810312304, + "grad_norm": 0.016948441043496132, + "learning_rate": 7.895659964263725e-07, + "loss": 0.0526, + "step": 3933 + }, + { + "epoch": 0.8245650806958709, + "grad_norm": 0.01407658401876688, + "learning_rate": 7.877359826931225e-07, + "loss": 0.055, + "step": 3934 + }, + { + "epoch": 0.8247746803605114, + "grad_norm": 0.011790010146796703, + "learning_rate": 7.859079108127088e-07, + "loss": 0.0548, + "step": 3935 + }, + { + "epoch": 0.824984280025152, + "grad_norm": 0.014163525775074959, + "learning_rate": 7.840817816278723e-07, + "loss": 0.055, + "step": 3936 + }, + { + "epoch": 0.8251938796897925, + "grad_norm": 0.011464595794677734, + "learning_rate": 7.822575959804596e-07, + "loss": 0.0534, + "step": 3937 + }, + { + "epoch": 0.825403479354433, + "grad_norm": 0.012643976137042046, + "learning_rate": 7.80435354711418e-07, + "loss": 0.0527, + "step": 3938 + }, + { + "epoch": 0.8256130790190735, + "grad_norm": 0.012431315146386623, + "learning_rate": 7.78615058660801e-07, + "loss": 0.0529, + "step": 3939 + }, + { + "epoch": 0.8258226786837141, + "grad_norm": 0.011443213559687138, + "learning_rate": 7.767967086677669e-07, + "loss": 0.0546, + "step": 3940 + }, + { + "epoch": 0.8260322783483547, + "grad_norm": 0.012839680537581444, + "learning_rate": 7.749803055705723e-07, + "loss": 0.054, + "step": 3941 + }, + { + "epoch": 0.8262418780129952, + "grad_norm": 0.010854445397853851, + "learning_rate": 7.731658502065786e-07, + "loss": 0.0539, + "step": 3942 + }, + { + "epoch": 0.8264514776776357, + "grad_norm": 0.01064964011311531, + "learning_rate": 7.713533434122494e-07, + "loss": 0.0548, + "step": 3943 + }, + { + "epoch": 0.8266610773422762, + "grad_norm": 0.012149804271757603, + "learning_rate": 7.695427860231519e-07, + "loss": 0.0553, + "step": 3944 + }, + { + "epoch": 0.8268706770069167, + "grad_norm": 0.01205404382199049, + "learning_rate": 7.677341788739507e-07, + "loss": 0.0537, + "step": 3945 + }, + { + "epoch": 0.8270802766715574, + "grad_norm": 0.014352019876241684, + "learning_rate": 7.659275227984142e-07, + "loss": 0.0514, + "step": 3946 + }, + { + "epoch": 0.8272898763361979, + "grad_norm": 0.01178305596113205, + "learning_rate": 7.641228186294108e-07, + "loss": 0.054, + "step": 3947 + }, + { + "epoch": 0.8274994760008384, + "grad_norm": 0.020204555243253708, + "learning_rate": 7.623200671989067e-07, + "loss": 0.0541, + "step": 3948 + }, + { + "epoch": 0.8277090756654789, + "grad_norm": 0.011678210459649563, + "learning_rate": 7.605192693379715e-07, + "loss": 0.0558, + "step": 3949 + }, + { + "epoch": 0.8279186753301194, + "grad_norm": 0.014891520142555237, + "learning_rate": 7.587204258767733e-07, + "loss": 0.0533, + "step": 3950 + }, + { + "epoch": 0.82812827499476, + "grad_norm": 0.012660454027354717, + "learning_rate": 7.569235376445772e-07, + "loss": 0.0535, + "step": 3951 + }, + { + "epoch": 0.8283378746594006, + "grad_norm": 0.01889774762094021, + "learning_rate": 7.551286054697498e-07, + "loss": 0.0552, + "step": 3952 + }, + { + "epoch": 0.8285474743240411, + "grad_norm": 0.011206352151930332, + "learning_rate": 7.533356301797523e-07, + "loss": 0.0553, + "step": 3953 + }, + { + "epoch": 0.8287570739886816, + "grad_norm": 0.015002798289060593, + "learning_rate": 7.515446126011484e-07, + "loss": 0.0552, + "step": 3954 + }, + { + "epoch": 0.8289666736533221, + "grad_norm": 0.012600685469806194, + "learning_rate": 7.497555535595946e-07, + "loss": 0.0534, + "step": 3955 + }, + { + "epoch": 0.8291762733179627, + "grad_norm": 0.01465271133929491, + "learning_rate": 7.479684538798476e-07, + "loss": 0.0585, + "step": 3956 + }, + { + "epoch": 0.8293858729826032, + "grad_norm": 0.015729481354355812, + "learning_rate": 7.461833143857611e-07, + "loss": 0.0515, + "step": 3957 + }, + { + "epoch": 0.8295954726472438, + "grad_norm": 0.012866144999861717, + "learning_rate": 7.444001359002833e-07, + "loss": 0.0558, + "step": 3958 + }, + { + "epoch": 0.8298050723118843, + "grad_norm": 0.019521057605743408, + "learning_rate": 7.426189192454575e-07, + "loss": 0.0552, + "step": 3959 + }, + { + "epoch": 0.8300146719765248, + "grad_norm": 0.018146377056837082, + "learning_rate": 7.408396652424271e-07, + "loss": 0.0508, + "step": 3960 + }, + { + "epoch": 0.8302242716411654, + "grad_norm": 0.01978515274822712, + "learning_rate": 7.39062374711425e-07, + "loss": 0.0542, + "step": 3961 + }, + { + "epoch": 0.8304338713058059, + "grad_norm": 0.015556586906313896, + "learning_rate": 7.372870484717843e-07, + "loss": 0.0571, + "step": 3962 + }, + { + "epoch": 0.8306434709704464, + "grad_norm": 0.02219201996922493, + "learning_rate": 7.355136873419277e-07, + "loss": 0.0563, + "step": 3963 + }, + { + "epoch": 0.830853070635087, + "grad_norm": 0.01369245070964098, + "learning_rate": 7.337422921393767e-07, + "loss": 0.0554, + "step": 3964 + }, + { + "epoch": 0.8310626702997275, + "grad_norm": 0.021102454513311386, + "learning_rate": 7.319728636807411e-07, + "loss": 0.0518, + "step": 3965 + }, + { + "epoch": 0.8312722699643681, + "grad_norm": 0.013064729049801826, + "learning_rate": 7.302054027817291e-07, + "loss": 0.0556, + "step": 3966 + }, + { + "epoch": 0.8314818696290086, + "grad_norm": 0.015176662243902683, + "learning_rate": 7.28439910257141e-07, + "loss": 0.057, + "step": 3967 + }, + { + "epoch": 0.8316914692936491, + "grad_norm": 0.014463878236711025, + "learning_rate": 7.266763869208665e-07, + "loss": 0.0547, + "step": 3968 + }, + { + "epoch": 0.8319010689582896, + "grad_norm": 0.012370740063488483, + "learning_rate": 7.249148335858891e-07, + "loss": 0.0541, + "step": 3969 + }, + { + "epoch": 0.8321106686229301, + "grad_norm": 0.020610513165593147, + "learning_rate": 7.231552510642864e-07, + "loss": 0.0552, + "step": 3970 + }, + { + "epoch": 0.8323202682875708, + "grad_norm": 0.010926956310868263, + "learning_rate": 7.213976401672235e-07, + "loss": 0.0535, + "step": 3971 + }, + { + "epoch": 0.8325298679522113, + "grad_norm": 0.021732453256845474, + "learning_rate": 7.196420017049599e-07, + "loss": 0.0549, + "step": 3972 + }, + { + "epoch": 0.8327394676168518, + "grad_norm": 0.01048396248370409, + "learning_rate": 7.178883364868455e-07, + "loss": 0.0557, + "step": 3973 + }, + { + "epoch": 0.8329490672814923, + "grad_norm": 0.02181028202176094, + "learning_rate": 7.161366453213181e-07, + "loss": 0.0557, + "step": 3974 + }, + { + "epoch": 0.8331586669461328, + "grad_norm": 0.012369881384074688, + "learning_rate": 7.143869290159067e-07, + "loss": 0.0522, + "step": 3975 + }, + { + "epoch": 0.8333682666107735, + "grad_norm": 0.02149048261344433, + "learning_rate": 7.12639188377231e-07, + "loss": 0.059, + "step": 3976 + }, + { + "epoch": 0.833577866275414, + "grad_norm": 0.013264495879411697, + "learning_rate": 7.108934242109994e-07, + "loss": 0.0581, + "step": 3977 + }, + { + "epoch": 0.8337874659400545, + "grad_norm": 0.019909605383872986, + "learning_rate": 7.091496373220075e-07, + "loss": 0.0549, + "step": 3978 + }, + { + "epoch": 0.833997065604695, + "grad_norm": 0.02005838230252266, + "learning_rate": 7.074078285141428e-07, + "loss": 0.0534, + "step": 3979 + }, + { + "epoch": 0.8342066652693356, + "grad_norm": 0.0182628370821476, + "learning_rate": 7.056679985903774e-07, + "loss": 0.0565, + "step": 3980 + }, + { + "epoch": 0.8344162649339761, + "grad_norm": 0.01993626542389393, + "learning_rate": 7.03930148352771e-07, + "loss": 0.0518, + "step": 3981 + }, + { + "epoch": 0.8346258645986167, + "grad_norm": 0.017620259895920753, + "learning_rate": 7.021942786024743e-07, + "loss": 0.0566, + "step": 3982 + }, + { + "epoch": 0.8348354642632572, + "grad_norm": 0.012461712583899498, + "learning_rate": 7.004603901397239e-07, + "loss": 0.0549, + "step": 3983 + }, + { + "epoch": 0.8350450639278977, + "grad_norm": 0.009824151173233986, + "learning_rate": 6.987284837638391e-07, + "loss": 0.0525, + "step": 3984 + }, + { + "epoch": 0.8352546635925383, + "grad_norm": 0.018029648810625076, + "learning_rate": 6.969985602732309e-07, + "loss": 0.0559, + "step": 3985 + }, + { + "epoch": 0.8354642632571788, + "grad_norm": 0.01331211719661951, + "learning_rate": 6.952706204653914e-07, + "loss": 0.0536, + "step": 3986 + }, + { + "epoch": 0.8356738629218193, + "grad_norm": 0.0163306575268507, + "learning_rate": 6.935446651369027e-07, + "loss": 0.0557, + "step": 3987 + }, + { + "epoch": 0.8358834625864598, + "grad_norm": 0.012105715461075306, + "learning_rate": 6.918206950834283e-07, + "loss": 0.054, + "step": 3988 + }, + { + "epoch": 0.8360930622511004, + "grad_norm": 0.013904665596783161, + "learning_rate": 6.900987110997182e-07, + "loss": 0.0538, + "step": 3989 + }, + { + "epoch": 0.836302661915741, + "grad_norm": 0.012371689081192017, + "learning_rate": 6.883787139796078e-07, + "loss": 0.0539, + "step": 3990 + }, + { + "epoch": 0.8365122615803815, + "grad_norm": 0.012469745241105556, + "learning_rate": 6.866607045160151e-07, + "loss": 0.0568, + "step": 3991 + }, + { + "epoch": 0.836721861245022, + "grad_norm": 0.012731354683637619, + "learning_rate": 6.849446835009405e-07, + "loss": 0.0524, + "step": 3992 + }, + { + "epoch": 0.8369314609096625, + "grad_norm": 0.012761647813022137, + "learning_rate": 6.832306517254716e-07, + "loss": 0.0523, + "step": 3993 + }, + { + "epoch": 0.837141060574303, + "grad_norm": 0.013718758709728718, + "learning_rate": 6.815186099797744e-07, + "loss": 0.0522, + "step": 3994 + }, + { + "epoch": 0.8373506602389437, + "grad_norm": 0.015886375680565834, + "learning_rate": 6.798085590531012e-07, + "loss": 0.0548, + "step": 3995 + }, + { + "epoch": 0.8375602599035842, + "grad_norm": 0.01095049548894167, + "learning_rate": 6.78100499733787e-07, + "loss": 0.0551, + "step": 3996 + }, + { + "epoch": 0.8377698595682247, + "grad_norm": 0.015172593295574188, + "learning_rate": 6.76394432809242e-07, + "loss": 0.0546, + "step": 3997 + }, + { + "epoch": 0.8379794592328652, + "grad_norm": 0.015550515614449978, + "learning_rate": 6.746903590659659e-07, + "loss": 0.0529, + "step": 3998 + }, + { + "epoch": 0.8381890588975057, + "grad_norm": 0.014822781085968018, + "learning_rate": 6.729882792895359e-07, + "loss": 0.0542, + "step": 3999 + }, + { + "epoch": 0.8383986585621463, + "grad_norm": 0.01649445667862892, + "learning_rate": 6.71288194264611e-07, + "loss": 0.0533, + "step": 4000 + }, + { + "epoch": 0.8386082582267869, + "grad_norm": 0.015881575644016266, + "learning_rate": 6.695901047749298e-07, + "loss": 0.0535, + "step": 4001 + }, + { + "epoch": 0.8388178578914274, + "grad_norm": 0.020809467881917953, + "learning_rate": 6.678940116033095e-07, + "loss": 0.0549, + "step": 4002 + }, + { + "epoch": 0.8390274575560679, + "grad_norm": 0.013579810969531536, + "learning_rate": 6.661999155316512e-07, + "loss": 0.0518, + "step": 4003 + }, + { + "epoch": 0.8392370572207084, + "grad_norm": 0.028101500123739243, + "learning_rate": 6.645078173409303e-07, + "loss": 0.052, + "step": 4004 + }, + { + "epoch": 0.839446656885349, + "grad_norm": 0.013599547557532787, + "learning_rate": 6.628177178112055e-07, + "loss": 0.0546, + "step": 4005 + }, + { + "epoch": 0.8396562565499895, + "grad_norm": 0.028915736824274063, + "learning_rate": 6.611296177216125e-07, + "loss": 0.0564, + "step": 4006 + }, + { + "epoch": 0.8398658562146301, + "grad_norm": 0.016699662432074547, + "learning_rate": 6.594435178503644e-07, + "loss": 0.0523, + "step": 4007 + }, + { + "epoch": 0.8400754558792706, + "grad_norm": 0.02233414351940155, + "learning_rate": 6.577594189747521e-07, + "loss": 0.0537, + "step": 4008 + }, + { + "epoch": 0.8402850555439111, + "grad_norm": 0.021178172901272774, + "learning_rate": 6.560773218711458e-07, + "loss": 0.0539, + "step": 4009 + }, + { + "epoch": 0.8404946552085517, + "grad_norm": 0.022526554763317108, + "learning_rate": 6.543972273149928e-07, + "loss": 0.0546, + "step": 4010 + }, + { + "epoch": 0.8407042548731922, + "grad_norm": 0.02022298239171505, + "learning_rate": 6.527191360808144e-07, + "loss": 0.0568, + "step": 4011 + }, + { + "epoch": 0.8409138545378327, + "grad_norm": 0.020070351660251617, + "learning_rate": 6.51043048942212e-07, + "loss": 0.0559, + "step": 4012 + }, + { + "epoch": 0.8411234542024733, + "grad_norm": 0.019938215613365173, + "learning_rate": 6.493689666718611e-07, + "loss": 0.0538, + "step": 4013 + }, + { + "epoch": 0.8413330538671138, + "grad_norm": 0.021108483895659447, + "learning_rate": 6.476968900415115e-07, + "loss": 0.053, + "step": 4014 + }, + { + "epoch": 0.8415426535317544, + "grad_norm": 0.02614649012684822, + "learning_rate": 6.460268198219916e-07, + "loss": 0.0533, + "step": 4015 + }, + { + "epoch": 0.8417522531963949, + "grad_norm": 0.021795378997921944, + "learning_rate": 6.443587567832044e-07, + "loss": 0.0542, + "step": 4016 + }, + { + "epoch": 0.8419618528610354, + "grad_norm": 0.030953634530305862, + "learning_rate": 6.426927016941248e-07, + "loss": 0.0536, + "step": 4017 + }, + { + "epoch": 0.8421714525256759, + "grad_norm": 0.0239299014210701, + "learning_rate": 6.410286553228052e-07, + "loss": 0.0575, + "step": 4018 + }, + { + "epoch": 0.8423810521903164, + "grad_norm": 0.035018209367990494, + "learning_rate": 6.393666184363701e-07, + "loss": 0.0552, + "step": 4019 + }, + { + "epoch": 0.8425906518549571, + "grad_norm": 0.01738305762410164, + "learning_rate": 6.377065918010173e-07, + "loss": 0.0514, + "step": 4020 + }, + { + "epoch": 0.8428002515195976, + "grad_norm": 0.027201887220144272, + "learning_rate": 6.360485761820195e-07, + "loss": 0.0553, + "step": 4021 + }, + { + "epoch": 0.8430098511842381, + "grad_norm": 0.01937497965991497, + "learning_rate": 6.343925723437217e-07, + "loss": 0.0526, + "step": 4022 + }, + { + "epoch": 0.8432194508488786, + "grad_norm": 0.018232090398669243, + "learning_rate": 6.327385810495423e-07, + "loss": 0.0567, + "step": 4023 + }, + { + "epoch": 0.8434290505135191, + "grad_norm": 0.01994934491813183, + "learning_rate": 6.310866030619694e-07, + "loss": 0.0548, + "step": 4024 + }, + { + "epoch": 0.8436386501781598, + "grad_norm": 0.016245190054178238, + "learning_rate": 6.294366391425643e-07, + "loss": 0.0534, + "step": 4025 + }, + { + "epoch": 0.8438482498428003, + "grad_norm": 0.018503889441490173, + "learning_rate": 6.27788690051962e-07, + "loss": 0.0547, + "step": 4026 + }, + { + "epoch": 0.8440578495074408, + "grad_norm": 0.015852371230721474, + "learning_rate": 6.26142756549864e-07, + "loss": 0.0561, + "step": 4027 + }, + { + "epoch": 0.8442674491720813, + "grad_norm": 0.012544246390461922, + "learning_rate": 6.244988393950469e-07, + "loss": 0.0558, + "step": 4028 + }, + { + "epoch": 0.8444770488367218, + "grad_norm": 0.01918352209031582, + "learning_rate": 6.228569393453582e-07, + "loss": 0.052, + "step": 4029 + }, + { + "epoch": 0.8446866485013624, + "grad_norm": 0.014716164208948612, + "learning_rate": 6.212170571577087e-07, + "loss": 0.0517, + "step": 4030 + }, + { + "epoch": 0.844896248166003, + "grad_norm": 0.015544608235359192, + "learning_rate": 6.195791935880868e-07, + "loss": 0.0523, + "step": 4031 + }, + { + "epoch": 0.8451058478306435, + "grad_norm": 0.016333509236574173, + "learning_rate": 6.17943349391546e-07, + "loss": 0.0548, + "step": 4032 + }, + { + "epoch": 0.845315447495284, + "grad_norm": 0.018222004175186157, + "learning_rate": 6.163095253222129e-07, + "loss": 0.052, + "step": 4033 + }, + { + "epoch": 0.8455250471599245, + "grad_norm": 0.022488001734018326, + "learning_rate": 6.146777221332772e-07, + "loss": 0.0538, + "step": 4034 + }, + { + "epoch": 0.8457346468245651, + "grad_norm": 0.019987892359495163, + "learning_rate": 6.130479405770002e-07, + "loss": 0.0545, + "step": 4035 + }, + { + "epoch": 0.8459442464892056, + "grad_norm": 0.01848592422902584, + "learning_rate": 6.114201814047122e-07, + "loss": 0.053, + "step": 4036 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 0.011879503726959229, + "learning_rate": 6.097944453668081e-07, + "loss": 0.0537, + "step": 4037 + }, + { + "epoch": 0.8463634458184867, + "grad_norm": 0.019580353051424026, + "learning_rate": 6.081707332127523e-07, + "loss": 0.0525, + "step": 4038 + }, + { + "epoch": 0.8465730454831272, + "grad_norm": 0.015657026320695877, + "learning_rate": 6.065490456910777e-07, + "loss": 0.0554, + "step": 4039 + }, + { + "epoch": 0.8467826451477678, + "grad_norm": 0.021646184846758842, + "learning_rate": 6.049293835493803e-07, + "loss": 0.0539, + "step": 4040 + }, + { + "epoch": 0.8469922448124083, + "grad_norm": 0.019581638276576996, + "learning_rate": 6.03311747534323e-07, + "loss": 0.0546, + "step": 4041 + }, + { + "epoch": 0.8472018444770488, + "grad_norm": 0.019119396805763245, + "learning_rate": 6.016961383916381e-07, + "loss": 0.0526, + "step": 4042 + }, + { + "epoch": 0.8474114441416893, + "grad_norm": 0.018327955156564713, + "learning_rate": 6.000825568661184e-07, + "loss": 0.0533, + "step": 4043 + }, + { + "epoch": 0.8476210438063299, + "grad_norm": 0.012688934803009033, + "learning_rate": 5.984710037016267e-07, + "loss": 0.0557, + "step": 4044 + }, + { + "epoch": 0.8478306434709705, + "grad_norm": 0.01570408046245575, + "learning_rate": 5.968614796410882e-07, + "loss": 0.0533, + "step": 4045 + }, + { + "epoch": 0.848040243135611, + "grad_norm": 0.013904701918363571, + "learning_rate": 5.952539854264938e-07, + "loss": 0.0549, + "step": 4046 + }, + { + "epoch": 0.8482498428002515, + "grad_norm": 0.0192690659314394, + "learning_rate": 5.936485217988958e-07, + "loss": 0.0524, + "step": 4047 + }, + { + "epoch": 0.848459442464892, + "grad_norm": 0.013410338200628757, + "learning_rate": 5.920450894984137e-07, + "loss": 0.0537, + "step": 4048 + }, + { + "epoch": 0.8486690421295326, + "grad_norm": 0.014187265187501907, + "learning_rate": 5.904436892642306e-07, + "loss": 0.0548, + "step": 4049 + }, + { + "epoch": 0.8488786417941732, + "grad_norm": 0.012049532495439053, + "learning_rate": 5.888443218345907e-07, + "loss": 0.0562, + "step": 4050 + }, + { + "epoch": 0.8490882414588137, + "grad_norm": 0.012612142600119114, + "learning_rate": 5.872469879468024e-07, + "loss": 0.057, + "step": 4051 + }, + { + "epoch": 0.8492978411234542, + "grad_norm": 0.016537649556994438, + "learning_rate": 5.856516883372365e-07, + "loss": 0.0538, + "step": 4052 + }, + { + "epoch": 0.8495074407880947, + "grad_norm": 0.012568135745823383, + "learning_rate": 5.840584237413239e-07, + "loss": 0.0545, + "step": 4053 + }, + { + "epoch": 0.8497170404527353, + "grad_norm": 0.013640418648719788, + "learning_rate": 5.824671948935606e-07, + "loss": 0.0567, + "step": 4054 + }, + { + "epoch": 0.8499266401173758, + "grad_norm": 0.016170648857951164, + "learning_rate": 5.808780025275045e-07, + "loss": 0.054, + "step": 4055 + }, + { + "epoch": 0.8501362397820164, + "grad_norm": 0.014185959473252296, + "learning_rate": 5.792908473757697e-07, + "loss": 0.0535, + "step": 4056 + }, + { + "epoch": 0.8503458394466569, + "grad_norm": 0.01882059872150421, + "learning_rate": 5.777057301700372e-07, + "loss": 0.0532, + "step": 4057 + }, + { + "epoch": 0.8505554391112974, + "grad_norm": 0.01410512626171112, + "learning_rate": 5.761226516410434e-07, + "loss": 0.0541, + "step": 4058 + }, + { + "epoch": 0.850765038775938, + "grad_norm": 0.01758696883916855, + "learning_rate": 5.745416125185898e-07, + "loss": 0.0543, + "step": 4059 + }, + { + "epoch": 0.8509746384405785, + "grad_norm": 0.015843121334910393, + "learning_rate": 5.729626135315319e-07, + "loss": 0.0518, + "step": 4060 + }, + { + "epoch": 0.851184238105219, + "grad_norm": 0.015141872689127922, + "learning_rate": 5.713856554077901e-07, + "loss": 0.056, + "step": 4061 + }, + { + "epoch": 0.8513938377698596, + "grad_norm": 0.016413336619734764, + "learning_rate": 5.698107388743418e-07, + "loss": 0.0559, + "step": 4062 + }, + { + "epoch": 0.8516034374345001, + "grad_norm": 0.016290897503495216, + "learning_rate": 5.682378646572229e-07, + "loss": 0.0511, + "step": 4063 + }, + { + "epoch": 0.8518130370991407, + "grad_norm": 0.01656182110309601, + "learning_rate": 5.666670334815267e-07, + "loss": 0.0546, + "step": 4064 + }, + { + "epoch": 0.8520226367637812, + "grad_norm": 0.01612510345876217, + "learning_rate": 5.650982460714083e-07, + "loss": 0.0572, + "step": 4065 + }, + { + "epoch": 0.8522322364284217, + "grad_norm": 0.01949678361415863, + "learning_rate": 5.635315031500766e-07, + "loss": 0.0533, + "step": 4066 + }, + { + "epoch": 0.8524418360930622, + "grad_norm": 0.017564570531249046, + "learning_rate": 5.619668054398008e-07, + "loss": 0.0558, + "step": 4067 + }, + { + "epoch": 0.8526514357577027, + "grad_norm": 0.024784216657280922, + "learning_rate": 5.604041536619048e-07, + "loss": 0.0532, + "step": 4068 + }, + { + "epoch": 0.8528610354223434, + "grad_norm": 0.020440300926566124, + "learning_rate": 5.588435485367733e-07, + "loss": 0.0522, + "step": 4069 + }, + { + "epoch": 0.8530706350869839, + "grad_norm": 0.024173369631171227, + "learning_rate": 5.572849907838423e-07, + "loss": 0.0533, + "step": 4070 + }, + { + "epoch": 0.8532802347516244, + "grad_norm": 0.016502492129802704, + "learning_rate": 5.557284811216074e-07, + "loss": 0.0526, + "step": 4071 + }, + { + "epoch": 0.8534898344162649, + "grad_norm": 0.020477591082453728, + "learning_rate": 5.54174020267621e-07, + "loss": 0.051, + "step": 4072 + }, + { + "epoch": 0.8536994340809054, + "grad_norm": 0.01553434506058693, + "learning_rate": 5.526216089384872e-07, + "loss": 0.0532, + "step": 4073 + }, + { + "epoch": 0.853909033745546, + "grad_norm": 0.016066189855337143, + "learning_rate": 5.510712478498675e-07, + "loss": 0.0515, + "step": 4074 + }, + { + "epoch": 0.8541186334101866, + "grad_norm": 0.017956750467419624, + "learning_rate": 5.49522937716479e-07, + "loss": 0.054, + "step": 4075 + }, + { + "epoch": 0.8543282330748271, + "grad_norm": 0.016211628913879395, + "learning_rate": 5.479766792520908e-07, + "loss": 0.0554, + "step": 4076 + }, + { + "epoch": 0.8545378327394676, + "grad_norm": 0.02058112435042858, + "learning_rate": 5.464324731695286e-07, + "loss": 0.0564, + "step": 4077 + }, + { + "epoch": 0.8547474324041081, + "grad_norm": 0.020162228494882584, + "learning_rate": 5.448903201806727e-07, + "loss": 0.0581, + "step": 4078 + }, + { + "epoch": 0.8549570320687487, + "grad_norm": 0.017797335982322693, + "learning_rate": 5.433502209964531e-07, + "loss": 0.0549, + "step": 4079 + }, + { + "epoch": 0.8551666317333892, + "grad_norm": 0.017947765067219734, + "learning_rate": 5.418121763268553e-07, + "loss": 0.0527, + "step": 4080 + }, + { + "epoch": 0.8553762313980298, + "grad_norm": 0.02111784741282463, + "learning_rate": 5.402761868809181e-07, + "loss": 0.0541, + "step": 4081 + }, + { + "epoch": 0.8555858310626703, + "grad_norm": 0.023570656776428223, + "learning_rate": 5.387422533667336e-07, + "loss": 0.0538, + "step": 4082 + }, + { + "epoch": 0.8557954307273108, + "grad_norm": 0.023569107055664062, + "learning_rate": 5.372103764914421e-07, + "loss": 0.0557, + "step": 4083 + }, + { + "epoch": 0.8560050303919514, + "grad_norm": 0.02265586517751217, + "learning_rate": 5.356805569612417e-07, + "loss": 0.0549, + "step": 4084 + }, + { + "epoch": 0.8562146300565919, + "grad_norm": 0.03135437145829201, + "learning_rate": 5.341527954813763e-07, + "loss": 0.0536, + "step": 4085 + }, + { + "epoch": 0.8564242297212324, + "grad_norm": 0.015480936504900455, + "learning_rate": 5.326270927561444e-07, + "loss": 0.0559, + "step": 4086 + }, + { + "epoch": 0.856633829385873, + "grad_norm": 0.025915948674082756, + "learning_rate": 5.311034494888945e-07, + "loss": 0.0521, + "step": 4087 + }, + { + "epoch": 0.8568434290505135, + "grad_norm": 0.025078527629375458, + "learning_rate": 5.295818663820268e-07, + "loss": 0.0523, + "step": 4088 + }, + { + "epoch": 0.8570530287151541, + "grad_norm": 0.015537966974079609, + "learning_rate": 5.280623441369897e-07, + "loss": 0.0529, + "step": 4089 + }, + { + "epoch": 0.8572626283797946, + "grad_norm": 0.028883641585707664, + "learning_rate": 5.265448834542836e-07, + "loss": 0.0587, + "step": 4090 + }, + { + "epoch": 0.8574722280444351, + "grad_norm": 0.018950091674923897, + "learning_rate": 5.250294850334564e-07, + "loss": 0.0531, + "step": 4091 + }, + { + "epoch": 0.8576818277090756, + "grad_norm": 0.019102994352579117, + "learning_rate": 5.235161495731079e-07, + "loss": 0.0513, + "step": 4092 + }, + { + "epoch": 0.8578914273737162, + "grad_norm": 0.02294645458459854, + "learning_rate": 5.22004877770883e-07, + "loss": 0.0553, + "step": 4093 + }, + { + "epoch": 0.8581010270383568, + "grad_norm": 0.018397770822048187, + "learning_rate": 5.204956703234804e-07, + "loss": 0.0538, + "step": 4094 + }, + { + "epoch": 0.8583106267029973, + "grad_norm": 0.019318964332342148, + "learning_rate": 5.189885279266433e-07, + "loss": 0.0546, + "step": 4095 + }, + { + "epoch": 0.8585202263676378, + "grad_norm": 0.018967201933264732, + "learning_rate": 5.174834512751647e-07, + "loss": 0.055, + "step": 4096 + }, + { + "epoch": 0.8587298260322783, + "grad_norm": 0.016506996005773544, + "learning_rate": 5.159804410628827e-07, + "loss": 0.0524, + "step": 4097 + }, + { + "epoch": 0.8589394256969188, + "grad_norm": 0.015648337081074715, + "learning_rate": 5.144794979826867e-07, + "loss": 0.0533, + "step": 4098 + }, + { + "epoch": 0.8591490253615595, + "grad_norm": 0.016446251422166824, + "learning_rate": 5.129806227265099e-07, + "loss": 0.0533, + "step": 4099 + }, + { + "epoch": 0.8593586250262, + "grad_norm": 0.020193303003907204, + "learning_rate": 5.114838159853336e-07, + "loss": 0.0549, + "step": 4100 + }, + { + "epoch": 0.8595682246908405, + "grad_norm": 0.015607200562953949, + "learning_rate": 5.099890784491879e-07, + "loss": 0.0559, + "step": 4101 + }, + { + "epoch": 0.859777824355481, + "grad_norm": 0.01757110469043255, + "learning_rate": 5.084964108071428e-07, + "loss": 0.0546, + "step": 4102 + }, + { + "epoch": 0.8599874240201215, + "grad_norm": 0.021767864003777504, + "learning_rate": 5.070058137473188e-07, + "loss": 0.0536, + "step": 4103 + }, + { + "epoch": 0.8601970236847621, + "grad_norm": 0.01381634920835495, + "learning_rate": 5.055172879568815e-07, + "loss": 0.0543, + "step": 4104 + }, + { + "epoch": 0.8604066233494027, + "grad_norm": 0.016660235822200775, + "learning_rate": 5.040308341220418e-07, + "loss": 0.0551, + "step": 4105 + }, + { + "epoch": 0.8606162230140432, + "grad_norm": 0.0208145659416914, + "learning_rate": 5.025464529280538e-07, + "loss": 0.0551, + "step": 4106 + }, + { + "epoch": 0.8608258226786837, + "grad_norm": 0.013750380836427212, + "learning_rate": 5.010641450592158e-07, + "loss": 0.0538, + "step": 4107 + }, + { + "epoch": 0.8610354223433242, + "grad_norm": 0.01817614957690239, + "learning_rate": 4.995839111988737e-07, + "loss": 0.0553, + "step": 4108 + }, + { + "epoch": 0.8612450220079648, + "grad_norm": 0.019916629418730736, + "learning_rate": 4.981057520294124e-07, + "loss": 0.0522, + "step": 4109 + }, + { + "epoch": 0.8614546216726053, + "grad_norm": 0.012952596880495548, + "learning_rate": 4.966296682322641e-07, + "loss": 0.0554, + "step": 4110 + }, + { + "epoch": 0.8616642213372459, + "grad_norm": 0.01568775624036789, + "learning_rate": 4.951556604879049e-07, + "loss": 0.0523, + "step": 4111 + }, + { + "epoch": 0.8618738210018864, + "grad_norm": 0.020983314141631126, + "learning_rate": 4.936837294758501e-07, + "loss": 0.0545, + "step": 4112 + }, + { + "epoch": 0.8620834206665269, + "grad_norm": 0.017767738550901413, + "learning_rate": 4.922138758746587e-07, + "loss": 0.0548, + "step": 4113 + }, + { + "epoch": 0.8622930203311675, + "grad_norm": 0.016531750559806824, + "learning_rate": 4.907461003619346e-07, + "loss": 0.0573, + "step": 4114 + }, + { + "epoch": 0.862502619995808, + "grad_norm": 0.019049130380153656, + "learning_rate": 4.892804036143223e-07, + "loss": 0.0538, + "step": 4115 + }, + { + "epoch": 0.8627122196604485, + "grad_norm": 0.01858980767428875, + "learning_rate": 4.878167863075061e-07, + "loss": 0.054, + "step": 4116 + }, + { + "epoch": 0.862921819325089, + "grad_norm": 0.013377662748098373, + "learning_rate": 4.863552491162149e-07, + "loss": 0.0547, + "step": 4117 + }, + { + "epoch": 0.8631314189897297, + "grad_norm": 0.022448167204856873, + "learning_rate": 4.848957927142167e-07, + "loss": 0.0553, + "step": 4118 + }, + { + "epoch": 0.8633410186543702, + "grad_norm": 0.019559383392333984, + "learning_rate": 4.83438417774319e-07, + "loss": 0.0549, + "step": 4119 + }, + { + "epoch": 0.8635506183190107, + "grad_norm": 0.017378434538841248, + "learning_rate": 4.819831249683726e-07, + "loss": 0.0533, + "step": 4120 + }, + { + "epoch": 0.8637602179836512, + "grad_norm": 0.022112280130386353, + "learning_rate": 4.805299149672682e-07, + "loss": 0.0558, + "step": 4121 + }, + { + "epoch": 0.8639698176482917, + "grad_norm": 0.02191154658794403, + "learning_rate": 4.790787884409332e-07, + "loss": 0.0577, + "step": 4122 + }, + { + "epoch": 0.8641794173129324, + "grad_norm": 0.015262911096215248, + "learning_rate": 4.776297460583384e-07, + "loss": 0.057, + "step": 4123 + }, + { + "epoch": 0.8643890169775729, + "grad_norm": 0.01982884109020233, + "learning_rate": 4.7618278848749146e-07, + "loss": 0.0536, + "step": 4124 + }, + { + "epoch": 0.8645986166422134, + "grad_norm": 0.024044396355748177, + "learning_rate": 4.7473791639543853e-07, + "loss": 0.0529, + "step": 4125 + }, + { + "epoch": 0.8648082163068539, + "grad_norm": 0.015614238567650318, + "learning_rate": 4.732951304482658e-07, + "loss": 0.0541, + "step": 4126 + }, + { + "epoch": 0.8650178159714944, + "grad_norm": 0.02073797397315502, + "learning_rate": 4.7185443131109785e-07, + "loss": 0.0526, + "step": 4127 + }, + { + "epoch": 0.865227415636135, + "grad_norm": 0.026310456916689873, + "learning_rate": 4.7041581964809733e-07, + "loss": 0.0539, + "step": 4128 + }, + { + "epoch": 0.8654370153007755, + "grad_norm": 0.02114870585501194, + "learning_rate": 4.6897929612246317e-07, + "loss": 0.0532, + "step": 4129 + }, + { + "epoch": 0.8656466149654161, + "grad_norm": 0.020745746791362762, + "learning_rate": 4.675448613964317e-07, + "loss": 0.0538, + "step": 4130 + }, + { + "epoch": 0.8658562146300566, + "grad_norm": 0.02710759826004505, + "learning_rate": 4.6611251613127793e-07, + "loss": 0.0548, + "step": 4131 + }, + { + "epoch": 0.8660658142946971, + "grad_norm": 0.020626511424779892, + "learning_rate": 4.64682260987312e-07, + "loss": 0.0534, + "step": 4132 + }, + { + "epoch": 0.8662754139593377, + "grad_norm": 0.020137401297688484, + "learning_rate": 4.6325409662388133e-07, + "loss": 0.0547, + "step": 4133 + }, + { + "epoch": 0.8664850136239782, + "grad_norm": 0.0274630356580019, + "learning_rate": 4.618280236993711e-07, + "loss": 0.0579, + "step": 4134 + }, + { + "epoch": 0.8666946132886187, + "grad_norm": 0.02381911128759384, + "learning_rate": 4.6040404287119924e-07, + "loss": 0.0563, + "step": 4135 + }, + { + "epoch": 0.8669042129532593, + "grad_norm": 0.014909719116985798, + "learning_rate": 4.589821547958195e-07, + "loss": 0.0531, + "step": 4136 + }, + { + "epoch": 0.8671138126178998, + "grad_norm": 0.02371630258858204, + "learning_rate": 4.5756236012872324e-07, + "loss": 0.053, + "step": 4137 + }, + { + "epoch": 0.8673234122825404, + "grad_norm": 0.028402652591466904, + "learning_rate": 4.561446595244362e-07, + "loss": 0.0536, + "step": 4138 + }, + { + "epoch": 0.8675330119471809, + "grad_norm": 0.01664336584508419, + "learning_rate": 4.547290536365173e-07, + "loss": 0.0545, + "step": 4139 + }, + { + "epoch": 0.8677426116118214, + "grad_norm": 0.022892151027917862, + "learning_rate": 4.5331554311755956e-07, + "loss": 0.0527, + "step": 4140 + }, + { + "epoch": 0.8679522112764619, + "grad_norm": 0.027045048773288727, + "learning_rate": 4.519041286191933e-07, + "loss": 0.053, + "step": 4141 + }, + { + "epoch": 0.8681618109411025, + "grad_norm": 0.021929344162344933, + "learning_rate": 4.504948107920781e-07, + "loss": 0.0518, + "step": 4142 + }, + { + "epoch": 0.8683714106057431, + "grad_norm": 0.016940593719482422, + "learning_rate": 4.490875902859099e-07, + "loss": 0.0526, + "step": 4143 + }, + { + "epoch": 0.8685810102703836, + "grad_norm": 0.03185591474175453, + "learning_rate": 4.476824677494179e-07, + "loss": 0.0529, + "step": 4144 + }, + { + "epoch": 0.8687906099350241, + "grad_norm": 0.02771037444472313, + "learning_rate": 4.46279443830363e-07, + "loss": 0.0572, + "step": 4145 + }, + { + "epoch": 0.8690002095996646, + "grad_norm": 0.017459120601415634, + "learning_rate": 4.448785191755378e-07, + "loss": 0.0521, + "step": 4146 + }, + { + "epoch": 0.8692098092643051, + "grad_norm": 0.03185920789837837, + "learning_rate": 4.4347969443076956e-07, + "loss": 0.0549, + "step": 4147 + }, + { + "epoch": 0.8694194089289458, + "grad_norm": 0.03139664977788925, + "learning_rate": 4.420829702409152e-07, + "loss": 0.053, + "step": 4148 + }, + { + "epoch": 0.8696290085935863, + "grad_norm": 0.023265665397047997, + "learning_rate": 4.4068834724986466e-07, + "loss": 0.054, + "step": 4149 + }, + { + "epoch": 0.8698386082582268, + "grad_norm": 0.026613622903823853, + "learning_rate": 4.3929582610053976e-07, + "loss": 0.0566, + "step": 4150 + }, + { + "epoch": 0.8700482079228673, + "grad_norm": 0.03510505333542824, + "learning_rate": 4.3790540743489207e-07, + "loss": 0.0543, + "step": 4151 + }, + { + "epoch": 0.8702578075875078, + "grad_norm": 0.03017215058207512, + "learning_rate": 4.365170918939027e-07, + "loss": 0.0546, + "step": 4152 + }, + { + "epoch": 0.8704674072521484, + "grad_norm": 0.020732155069708824, + "learning_rate": 4.351308801175863e-07, + "loss": 0.0524, + "step": 4153 + }, + { + "epoch": 0.870677006916789, + "grad_norm": 0.032381054013967514, + "learning_rate": 4.337467727449862e-07, + "loss": 0.0544, + "step": 4154 + }, + { + "epoch": 0.8708866065814295, + "grad_norm": 0.03272484615445137, + "learning_rate": 4.323647704141754e-07, + "loss": 0.0546, + "step": 4155 + }, + { + "epoch": 0.87109620624607, + "grad_norm": 0.019726725295186043, + "learning_rate": 4.309848737622568e-07, + "loss": 0.0547, + "step": 4156 + }, + { + "epoch": 0.8713058059107105, + "grad_norm": 0.0256227757781744, + "learning_rate": 4.2960708342536295e-07, + "loss": 0.0544, + "step": 4157 + }, + { + "epoch": 0.8715154055753511, + "grad_norm": 0.03493373841047287, + "learning_rate": 4.2823140003865283e-07, + "loss": 0.0541, + "step": 4158 + }, + { + "epoch": 0.8717250052399916, + "grad_norm": 0.022653086110949516, + "learning_rate": 4.2685782423631806e-07, + "loss": 0.0534, + "step": 4159 + }, + { + "epoch": 0.8719346049046321, + "grad_norm": 0.021094655618071556, + "learning_rate": 4.2548635665157713e-07, + "loss": 0.0501, + "step": 4160 + }, + { + "epoch": 0.8721442045692727, + "grad_norm": 0.03143506497144699, + "learning_rate": 4.241169979166748e-07, + "loss": 0.0533, + "step": 4161 + }, + { + "epoch": 0.8723538042339132, + "grad_norm": 0.029018806293606758, + "learning_rate": 4.2274974866288675e-07, + "loss": 0.0539, + "step": 4162 + }, + { + "epoch": 0.8725634038985538, + "grad_norm": 0.01749360002577305, + "learning_rate": 4.213846095205126e-07, + "loss": 0.0518, + "step": 4163 + }, + { + "epoch": 0.8727730035631943, + "grad_norm": 0.02830035425722599, + "learning_rate": 4.2002158111888345e-07, + "loss": 0.0551, + "step": 4164 + }, + { + "epoch": 0.8729826032278348, + "grad_norm": 0.03221756964921951, + "learning_rate": 4.186606640863533e-07, + "loss": 0.054, + "step": 4165 + }, + { + "epoch": 0.8731922028924753, + "grad_norm": 0.021834442391991615, + "learning_rate": 4.1730185905030527e-07, + "loss": 0.0535, + "step": 4166 + }, + { + "epoch": 0.8734018025571159, + "grad_norm": 0.025496196001768112, + "learning_rate": 4.1594516663714946e-07, + "loss": 0.0545, + "step": 4167 + }, + { + "epoch": 0.8736114022217565, + "grad_norm": 0.031498175114393234, + "learning_rate": 4.145905874723194e-07, + "loss": 0.0569, + "step": 4168 + }, + { + "epoch": 0.873821001886397, + "grad_norm": 0.023885555565357208, + "learning_rate": 4.1323812218027506e-07, + "loss": 0.0548, + "step": 4169 + }, + { + "epoch": 0.8740306015510375, + "grad_norm": 0.019355174154043198, + "learning_rate": 4.1188777138450487e-07, + "loss": 0.0546, + "step": 4170 + }, + { + "epoch": 0.874240201215678, + "grad_norm": 0.029070964083075523, + "learning_rate": 4.1053953570751813e-07, + "loss": 0.055, + "step": 4171 + }, + { + "epoch": 0.8744498008803185, + "grad_norm": 0.030413752421736717, + "learning_rate": 4.0919341577085157e-07, + "loss": 0.0537, + "step": 4172 + }, + { + "epoch": 0.8746594005449592, + "grad_norm": 0.0210228580981493, + "learning_rate": 4.078494121950682e-07, + "loss": 0.0545, + "step": 4173 + }, + { + "epoch": 0.8748690002095997, + "grad_norm": 0.026022691279649734, + "learning_rate": 4.065075255997514e-07, + "loss": 0.0524, + "step": 4174 + }, + { + "epoch": 0.8750785998742402, + "grad_norm": 0.03338058665394783, + "learning_rate": 4.051677566035106e-07, + "loss": 0.0534, + "step": 4175 + }, + { + "epoch": 0.8752881995388807, + "grad_norm": 0.024644313380122185, + "learning_rate": 4.038301058239796e-07, + "loss": 0.0516, + "step": 4176 + }, + { + "epoch": 0.8754977992035212, + "grad_norm": 0.019061360508203506, + "learning_rate": 4.024945738778163e-07, + "loss": 0.0525, + "step": 4177 + }, + { + "epoch": 0.8757073988681618, + "grad_norm": 0.035856083035469055, + "learning_rate": 4.011611613806987e-07, + "loss": 0.0563, + "step": 4178 + }, + { + "epoch": 0.8759169985328024, + "grad_norm": 0.03140418231487274, + "learning_rate": 3.998298689473301e-07, + "loss": 0.0531, + "step": 4179 + }, + { + "epoch": 0.8761265981974429, + "grad_norm": 0.01924203895032406, + "learning_rate": 3.9850069719143735e-07, + "loss": 0.0564, + "step": 4180 + }, + { + "epoch": 0.8763361978620834, + "grad_norm": 0.028375735506415367, + "learning_rate": 3.971736467257659e-07, + "loss": 0.0528, + "step": 4181 + }, + { + "epoch": 0.8765457975267239, + "grad_norm": 0.03424501046538353, + "learning_rate": 3.958487181620879e-07, + "loss": 0.0542, + "step": 4182 + }, + { + "epoch": 0.8767553971913645, + "grad_norm": 0.02565137669444084, + "learning_rate": 3.9452591211119496e-07, + "loss": 0.0556, + "step": 4183 + }, + { + "epoch": 0.876964996856005, + "grad_norm": 0.021563977003097534, + "learning_rate": 3.9320522918289973e-07, + "loss": 0.0527, + "step": 4184 + }, + { + "epoch": 0.8771745965206456, + "grad_norm": 0.027671657502651215, + "learning_rate": 3.918866699860363e-07, + "loss": 0.0522, + "step": 4185 + }, + { + "epoch": 0.8773841961852861, + "grad_norm": 0.032790180295705795, + "learning_rate": 3.9057023512846127e-07, + "loss": 0.056, + "step": 4186 + }, + { + "epoch": 0.8775937958499267, + "grad_norm": 0.023974338546395302, + "learning_rate": 3.892559252170508e-07, + "loss": 0.0528, + "step": 4187 + }, + { + "epoch": 0.8778033955145672, + "grad_norm": 0.02199658937752247, + "learning_rate": 3.879437408576997e-07, + "loss": 0.0536, + "step": 4188 + }, + { + "epoch": 0.8780129951792077, + "grad_norm": 0.03704606741666794, + "learning_rate": 3.866336826553274e-07, + "loss": 0.0531, + "step": 4189 + }, + { + "epoch": 0.8782225948438482, + "grad_norm": 0.027375850826501846, + "learning_rate": 3.8532575121386916e-07, + "loss": 0.0519, + "step": 4190 + }, + { + "epoch": 0.8784321945084888, + "grad_norm": 0.01718190312385559, + "learning_rate": 3.8401994713628044e-07, + "loss": 0.0551, + "step": 4191 + }, + { + "epoch": 0.8786417941731294, + "grad_norm": 0.031402431428432465, + "learning_rate": 3.827162710245369e-07, + "loss": 0.0557, + "step": 4192 + }, + { + "epoch": 0.8788513938377699, + "grad_norm": 0.033082105219364166, + "learning_rate": 3.814147234796345e-07, + "loss": 0.0532, + "step": 4193 + }, + { + "epoch": 0.8790609935024104, + "grad_norm": 0.02236991748213768, + "learning_rate": 3.8011530510158377e-07, + "loss": 0.0544, + "step": 4194 + }, + { + "epoch": 0.8792705931670509, + "grad_norm": 0.02740001678466797, + "learning_rate": 3.78818016489419e-07, + "loss": 0.0549, + "step": 4195 + }, + { + "epoch": 0.8794801928316914, + "grad_norm": 0.03492108732461929, + "learning_rate": 3.775228582411877e-07, + "loss": 0.0536, + "step": 4196 + }, + { + "epoch": 0.8796897924963321, + "grad_norm": 0.029871255159378052, + "learning_rate": 3.7622983095395973e-07, + "loss": 0.0522, + "step": 4197 + }, + { + "epoch": 0.8798993921609726, + "grad_norm": 0.02398385852575302, + "learning_rate": 3.7493893522381866e-07, + "loss": 0.0529, + "step": 4198 + }, + { + "epoch": 0.8801089918256131, + "grad_norm": 0.028454309329390526, + "learning_rate": 3.736501716458668e-07, + "loss": 0.0572, + "step": 4199 + }, + { + "epoch": 0.8803185914902536, + "grad_norm": 0.038288019597530365, + "learning_rate": 3.723635408142262e-07, + "loss": 0.0538, + "step": 4200 + }, + { + "epoch": 0.8805281911548941, + "grad_norm": 0.030530139803886414, + "learning_rate": 3.710790433220324e-07, + "loss": 0.0522, + "step": 4201 + }, + { + "epoch": 0.8807377908195347, + "grad_norm": 0.021036000922322273, + "learning_rate": 3.6979667976143663e-07, + "loss": 0.0532, + "step": 4202 + }, + { + "epoch": 0.8809473904841753, + "grad_norm": 0.035266853868961334, + "learning_rate": 3.685164507236111e-07, + "loss": 0.0538, + "step": 4203 + }, + { + "epoch": 0.8811569901488158, + "grad_norm": 0.03357855603098869, + "learning_rate": 3.6723835679873864e-07, + "loss": 0.0546, + "step": 4204 + }, + { + "epoch": 0.8813665898134563, + "grad_norm": 0.025351261720061302, + "learning_rate": 3.6596239857602136e-07, + "loss": 0.0522, + "step": 4205 + }, + { + "epoch": 0.8815761894780968, + "grad_norm": 0.027122756466269493, + "learning_rate": 3.6468857664367753e-07, + "loss": 0.0541, + "step": 4206 + }, + { + "epoch": 0.8817857891427374, + "grad_norm": 0.03549753874540329, + "learning_rate": 3.6341689158893524e-07, + "loss": 0.0535, + "step": 4207 + }, + { + "epoch": 0.8819953888073779, + "grad_norm": 0.03277155011892319, + "learning_rate": 3.6214734399804277e-07, + "loss": 0.0532, + "step": 4208 + }, + { + "epoch": 0.8822049884720184, + "grad_norm": 0.019641457125544548, + "learning_rate": 3.6087993445626135e-07, + "loss": 0.0543, + "step": 4209 + }, + { + "epoch": 0.882414588136659, + "grad_norm": 0.027548063546419144, + "learning_rate": 3.596146635478676e-07, + "loss": 0.0563, + "step": 4210 + }, + { + "epoch": 0.8826241878012995, + "grad_norm": 0.03249534219503403, + "learning_rate": 3.583515318561498e-07, + "loss": 0.0534, + "step": 4211 + }, + { + "epoch": 0.8828337874659401, + "grad_norm": 0.030830688774585724, + "learning_rate": 3.570905399634111e-07, + "loss": 0.0559, + "step": 4212 + }, + { + "epoch": 0.8830433871305806, + "grad_norm": 0.019546935334801674, + "learning_rate": 3.558316884509694e-07, + "loss": 0.054, + "step": 4213 + }, + { + "epoch": 0.8832529867952211, + "grad_norm": 0.025954263284802437, + "learning_rate": 3.545749778991542e-07, + "loss": 0.0538, + "step": 4214 + }, + { + "epoch": 0.8834625864598616, + "grad_norm": 0.03318324312567711, + "learning_rate": 3.5332040888730935e-07, + "loss": 0.052, + "step": 4215 + }, + { + "epoch": 0.8836721861245022, + "grad_norm": 0.0273283664137125, + "learning_rate": 3.5206798199379166e-07, + "loss": 0.0548, + "step": 4216 + }, + { + "epoch": 0.8838817857891428, + "grad_norm": 0.02014850080013275, + "learning_rate": 3.508176977959682e-07, + "loss": 0.0536, + "step": 4217 + }, + { + "epoch": 0.8840913854537833, + "grad_norm": 0.02724394015967846, + "learning_rate": 3.495695568702201e-07, + "loss": 0.0559, + "step": 4218 + }, + { + "epoch": 0.8843009851184238, + "grad_norm": 0.028185635805130005, + "learning_rate": 3.483235597919404e-07, + "loss": 0.0514, + "step": 4219 + }, + { + "epoch": 0.8845105847830643, + "grad_norm": 0.02001328580081463, + "learning_rate": 3.4707970713553364e-07, + "loss": 0.0539, + "step": 4220 + }, + { + "epoch": 0.8847201844477048, + "grad_norm": 0.01998419128358364, + "learning_rate": 3.4583799947441487e-07, + "loss": 0.0523, + "step": 4221 + }, + { + "epoch": 0.8849297841123455, + "grad_norm": 0.030080687254667282, + "learning_rate": 3.445984373810124e-07, + "loss": 0.0544, + "step": 4222 + }, + { + "epoch": 0.885139383776986, + "grad_norm": 0.026855189353227615, + "learning_rate": 3.4336102142676354e-07, + "loss": 0.0522, + "step": 4223 + }, + { + "epoch": 0.8853489834416265, + "grad_norm": 0.01764325052499771, + "learning_rate": 3.421257521821153e-07, + "loss": 0.0546, + "step": 4224 + }, + { + "epoch": 0.885558583106267, + "grad_norm": 0.02428247220814228, + "learning_rate": 3.408926302165283e-07, + "loss": 0.0541, + "step": 4225 + }, + { + "epoch": 0.8857681827709075, + "grad_norm": 0.025871314108371735, + "learning_rate": 3.396616560984711e-07, + "loss": 0.0529, + "step": 4226 + }, + { + "epoch": 0.8859777824355481, + "grad_norm": 0.02489456534385681, + "learning_rate": 3.384328303954221e-07, + "loss": 0.0562, + "step": 4227 + }, + { + "epoch": 0.8861873821001887, + "grad_norm": 0.017987683415412903, + "learning_rate": 3.372061536738708e-07, + "loss": 0.0529, + "step": 4228 + }, + { + "epoch": 0.8863969817648292, + "grad_norm": 0.025899479165673256, + "learning_rate": 3.3598162649931374e-07, + "loss": 0.0577, + "step": 4229 + }, + { + "epoch": 0.8866065814294697, + "grad_norm": 0.030789077281951904, + "learning_rate": 3.347592494362578e-07, + "loss": 0.0536, + "step": 4230 + }, + { + "epoch": 0.8868161810941102, + "grad_norm": 0.024812959134578705, + "learning_rate": 3.3353902304821826e-07, + "loss": 0.0562, + "step": 4231 + }, + { + "epoch": 0.8870257807587508, + "grad_norm": 0.01769872196018696, + "learning_rate": 3.323209478977202e-07, + "loss": 0.0534, + "step": 4232 + }, + { + "epoch": 0.8872353804233913, + "grad_norm": 0.026812558993697166, + "learning_rate": 3.31105024546296e-07, + "loss": 0.054, + "step": 4233 + }, + { + "epoch": 0.8874449800880319, + "grad_norm": 0.0294424407184124, + "learning_rate": 3.2989125355448623e-07, + "loss": 0.0524, + "step": 4234 + }, + { + "epoch": 0.8876545797526724, + "grad_norm": 0.02184954471886158, + "learning_rate": 3.2867963548183723e-07, + "loss": 0.052, + "step": 4235 + }, + { + "epoch": 0.8878641794173129, + "grad_norm": 0.01684270054101944, + "learning_rate": 3.274701708869066e-07, + "loss": 0.0565, + "step": 4236 + }, + { + "epoch": 0.8880737790819535, + "grad_norm": 0.028860921040177345, + "learning_rate": 3.2626286032725664e-07, + "loss": 0.0527, + "step": 4237 + }, + { + "epoch": 0.888283378746594, + "grad_norm": 0.02757331356406212, + "learning_rate": 3.250577043594566e-07, + "loss": 0.0569, + "step": 4238 + }, + { + "epoch": 0.8884929784112345, + "grad_norm": 0.020717767998576164, + "learning_rate": 3.238547035390843e-07, + "loss": 0.0569, + "step": 4239 + }, + { + "epoch": 0.888702578075875, + "grad_norm": 0.019048750400543213, + "learning_rate": 3.226538584207228e-07, + "loss": 0.0549, + "step": 4240 + }, + { + "epoch": 0.8889121777405156, + "grad_norm": 0.030015893280506134, + "learning_rate": 3.214551695579604e-07, + "loss": 0.0528, + "step": 4241 + }, + { + "epoch": 0.8891217774051562, + "grad_norm": 0.02680326998233795, + "learning_rate": 3.202586375033928e-07, + "loss": 0.0538, + "step": 4242 + }, + { + "epoch": 0.8893313770697967, + "grad_norm": 0.020373962819576263, + "learning_rate": 3.190642628086221e-07, + "loss": 0.0539, + "step": 4243 + }, + { + "epoch": 0.8895409767344372, + "grad_norm": 0.020322667434811592, + "learning_rate": 3.178720460242535e-07, + "loss": 0.0539, + "step": 4244 + }, + { + "epoch": 0.8897505763990777, + "grad_norm": 0.025189222767949104, + "learning_rate": 3.166819876998989e-07, + "loss": 0.0532, + "step": 4245 + }, + { + "epoch": 0.8899601760637182, + "grad_norm": 0.027642201632261276, + "learning_rate": 3.154940883841756e-07, + "loss": 0.0551, + "step": 4246 + }, + { + "epoch": 0.8901697757283589, + "grad_norm": 0.023163793608546257, + "learning_rate": 3.1430834862470395e-07, + "loss": 0.0545, + "step": 4247 + }, + { + "epoch": 0.8903793753929994, + "grad_norm": 0.01851850189268589, + "learning_rate": 3.131247689681099e-07, + "loss": 0.0553, + "step": 4248 + }, + { + "epoch": 0.8905889750576399, + "grad_norm": 0.028064940124750137, + "learning_rate": 3.1194334996002497e-07, + "loss": 0.0544, + "step": 4249 + }, + { + "epoch": 0.8907985747222804, + "grad_norm": 0.02554197795689106, + "learning_rate": 3.1076409214508164e-07, + "loss": 0.0533, + "step": 4250 + }, + { + "epoch": 0.8910081743869209, + "grad_norm": 0.018633106723427773, + "learning_rate": 3.095869960669173e-07, + "loss": 0.0544, + "step": 4251 + }, + { + "epoch": 0.8912177740515616, + "grad_norm": 0.019019601866602898, + "learning_rate": 3.084120622681741e-07, + "loss": 0.0535, + "step": 4252 + }, + { + "epoch": 0.8914273737162021, + "grad_norm": 0.026201697066426277, + "learning_rate": 3.072392912904948e-07, + "loss": 0.0555, + "step": 4253 + }, + { + "epoch": 0.8916369733808426, + "grad_norm": 0.027933111414313316, + "learning_rate": 3.0606868367452746e-07, + "loss": 0.0539, + "step": 4254 + }, + { + "epoch": 0.8918465730454831, + "grad_norm": 0.018472185358405113, + "learning_rate": 3.049002399599232e-07, + "loss": 0.0544, + "step": 4255 + }, + { + "epoch": 0.8920561727101237, + "grad_norm": 0.017080184072256088, + "learning_rate": 3.0373396068533234e-07, + "loss": 0.054, + "step": 4256 + }, + { + "epoch": 0.8922657723747642, + "grad_norm": 0.02512308955192566, + "learning_rate": 3.0256984638840967e-07, + "loss": 0.0548, + "step": 4257 + }, + { + "epoch": 0.8924753720394047, + "grad_norm": 0.02607639506459236, + "learning_rate": 3.0140789760581214e-07, + "loss": 0.0531, + "step": 4258 + }, + { + "epoch": 0.8926849717040453, + "grad_norm": 0.01739831455051899, + "learning_rate": 3.0024811487319837e-07, + "loss": 0.0551, + "step": 4259 + }, + { + "epoch": 0.8928945713686858, + "grad_norm": 0.017008259892463684, + "learning_rate": 2.990904987252269e-07, + "loss": 0.0544, + "step": 4260 + }, + { + "epoch": 0.8931041710333264, + "grad_norm": 0.024107355624437332, + "learning_rate": 2.9793504969555965e-07, + "loss": 0.053, + "step": 4261 + }, + { + "epoch": 0.8933137706979669, + "grad_norm": 0.026618480682373047, + "learning_rate": 2.9678176831685776e-07, + "loss": 0.0535, + "step": 4262 + }, + { + "epoch": 0.8935233703626074, + "grad_norm": 0.019204149022698402, + "learning_rate": 2.9563065512078325e-07, + "loss": 0.053, + "step": 4263 + }, + { + "epoch": 0.8937329700272479, + "grad_norm": 0.018961802124977112, + "learning_rate": 2.9448171063799933e-07, + "loss": 0.0525, + "step": 4264 + }, + { + "epoch": 0.8939425696918885, + "grad_norm": 0.022955385968089104, + "learning_rate": 2.933349353981701e-07, + "loss": 0.0565, + "step": 4265 + }, + { + "epoch": 0.8941521693565291, + "grad_norm": 0.023665906861424446, + "learning_rate": 2.921903299299572e-07, + "loss": 0.0532, + "step": 4266 + }, + { + "epoch": 0.8943617690211696, + "grad_norm": 0.02021893300116062, + "learning_rate": 2.9104789476102515e-07, + "loss": 0.0554, + "step": 4267 + }, + { + "epoch": 0.8945713686858101, + "grad_norm": 0.02034766785800457, + "learning_rate": 2.899076304180348e-07, + "loss": 0.0534, + "step": 4268 + }, + { + "epoch": 0.8947809683504506, + "grad_norm": 0.019094116985797882, + "learning_rate": 2.8876953742664914e-07, + "loss": 0.0563, + "step": 4269 + }, + { + "epoch": 0.8949905680150911, + "grad_norm": 0.022905079647898674, + "learning_rate": 2.876336163115273e-07, + "loss": 0.0533, + "step": 4270 + }, + { + "epoch": 0.8952001676797318, + "grad_norm": 0.022755665704607964, + "learning_rate": 2.8649986759632985e-07, + "loss": 0.0598, + "step": 4271 + }, + { + "epoch": 0.8954097673443723, + "grad_norm": 0.01669159345328808, + "learning_rate": 2.8536829180371485e-07, + "loss": 0.0517, + "step": 4272 + }, + { + "epoch": 0.8956193670090128, + "grad_norm": 0.021552741527557373, + "learning_rate": 2.842388894553377e-07, + "loss": 0.0517, + "step": 4273 + }, + { + "epoch": 0.8958289666736533, + "grad_norm": 0.0235520601272583, + "learning_rate": 2.8311166107185263e-07, + "loss": 0.0553, + "step": 4274 + }, + { + "epoch": 0.8960385663382938, + "grad_norm": 0.019750304520130157, + "learning_rate": 2.819866071729127e-07, + "loss": 0.0529, + "step": 4275 + }, + { + "epoch": 0.8962481660029344, + "grad_norm": 0.01832771860063076, + "learning_rate": 2.808637282771659e-07, + "loss": 0.0518, + "step": 4276 + }, + { + "epoch": 0.896457765667575, + "grad_norm": 0.019599363207817078, + "learning_rate": 2.7974302490226034e-07, + "loss": 0.05, + "step": 4277 + }, + { + "epoch": 0.8966673653322155, + "grad_norm": 0.02147594839334488, + "learning_rate": 2.786244975648406e-07, + "loss": 0.0559, + "step": 4278 + }, + { + "epoch": 0.896876964996856, + "grad_norm": 0.020394155755639076, + "learning_rate": 2.7750814678054626e-07, + "loss": 0.0533, + "step": 4279 + }, + { + "epoch": 0.8970865646614965, + "grad_norm": 0.014958519488573074, + "learning_rate": 2.7639397306401527e-07, + "loss": 0.052, + "step": 4280 + }, + { + "epoch": 0.8972961643261371, + "grad_norm": 0.017371637746691704, + "learning_rate": 2.7528197692888114e-07, + "loss": 0.0491, + "step": 4281 + }, + { + "epoch": 0.8975057639907776, + "grad_norm": 0.021931590512394905, + "learning_rate": 2.7417215888777493e-07, + "loss": 0.0539, + "step": 4282 + }, + { + "epoch": 0.8977153636554182, + "grad_norm": 0.022836145013570786, + "learning_rate": 2.730645194523218e-07, + "loss": 0.0569, + "step": 4283 + }, + { + "epoch": 0.8979249633200587, + "grad_norm": 0.01579204574227333, + "learning_rate": 2.719590591331428e-07, + "loss": 0.0532, + "step": 4284 + }, + { + "epoch": 0.8981345629846992, + "grad_norm": 0.018433870747685432, + "learning_rate": 2.7085577843985634e-07, + "loss": 0.0533, + "step": 4285 + }, + { + "epoch": 0.8983441626493398, + "grad_norm": 0.025535134598612785, + "learning_rate": 2.697546778810728e-07, + "loss": 0.0572, + "step": 4286 + }, + { + "epoch": 0.8985537623139803, + "grad_norm": 0.022626137360930443, + "learning_rate": 2.6865575796440004e-07, + "loss": 0.0534, + "step": 4287 + }, + { + "epoch": 0.8987633619786208, + "grad_norm": 0.01860016956925392, + "learning_rate": 2.675590191964406e-07, + "loss": 0.0553, + "step": 4288 + }, + { + "epoch": 0.8989729616432613, + "grad_norm": 0.020209958776831627, + "learning_rate": 2.6646446208279054e-07, + "loss": 0.0535, + "step": 4289 + }, + { + "epoch": 0.8991825613079019, + "grad_norm": 0.023047301918268204, + "learning_rate": 2.653720871280396e-07, + "loss": 0.0545, + "step": 4290 + }, + { + "epoch": 0.8993921609725425, + "grad_norm": 0.022766290232539177, + "learning_rate": 2.6428189483577283e-07, + "loss": 0.0541, + "step": 4291 + }, + { + "epoch": 0.899601760637183, + "grad_norm": 0.021814832463860512, + "learning_rate": 2.631938857085697e-07, + "loss": 0.0514, + "step": 4292 + }, + { + "epoch": 0.8998113603018235, + "grad_norm": 0.017104798927903175, + "learning_rate": 2.6210806024800083e-07, + "loss": 0.055, + "step": 4293 + }, + { + "epoch": 0.900020959966464, + "grad_norm": 0.019604802131652832, + "learning_rate": 2.6102441895463237e-07, + "loss": 0.0555, + "step": 4294 + }, + { + "epoch": 0.9002305596311045, + "grad_norm": 0.022560935467481613, + "learning_rate": 2.5994296232802254e-07, + "loss": 0.0527, + "step": 4295 + }, + { + "epoch": 0.9004401592957452, + "grad_norm": 0.017508791759610176, + "learning_rate": 2.5886369086672193e-07, + "loss": 0.0542, + "step": 4296 + }, + { + "epoch": 0.9006497589603857, + "grad_norm": 0.010546859353780746, + "learning_rate": 2.577866050682748e-07, + "loss": 0.0537, + "step": 4297 + }, + { + "epoch": 0.9008593586250262, + "grad_norm": 0.017833339050412178, + "learning_rate": 2.567117054292184e-07, + "loss": 0.0552, + "step": 4298 + }, + { + "epoch": 0.9010689582896667, + "grad_norm": 0.02061723917722702, + "learning_rate": 2.5563899244507974e-07, + "loss": 0.0548, + "step": 4299 + }, + { + "epoch": 0.9012785579543072, + "grad_norm": 0.016052190214395523, + "learning_rate": 2.545684666103809e-07, + "loss": 0.0593, + "step": 4300 + }, + { + "epoch": 0.9014881576189478, + "grad_norm": 0.013846572488546371, + "learning_rate": 2.5350012841863283e-07, + "loss": 0.0579, + "step": 4301 + }, + { + "epoch": 0.9016977572835884, + "grad_norm": 0.014581656083464622, + "learning_rate": 2.5243397836233975e-07, + "loss": 0.0558, + "step": 4302 + }, + { + "epoch": 0.9019073569482289, + "grad_norm": 0.021665887907147408, + "learning_rate": 2.513700169329963e-07, + "loss": 0.0534, + "step": 4303 + }, + { + "epoch": 0.9021169566128694, + "grad_norm": 0.020303577184677124, + "learning_rate": 2.503082446210886e-07, + "loss": 0.0511, + "step": 4304 + }, + { + "epoch": 0.9023265562775099, + "grad_norm": 0.012806370854377747, + "learning_rate": 2.4924866191609387e-07, + "loss": 0.0514, + "step": 4305 + }, + { + "epoch": 0.9025361559421505, + "grad_norm": 0.013476556167006493, + "learning_rate": 2.4819126930647976e-07, + "loss": 0.0545, + "step": 4306 + }, + { + "epoch": 0.902745755606791, + "grad_norm": 0.019602788612246513, + "learning_rate": 2.471360672797019e-07, + "loss": 0.0532, + "step": 4307 + }, + { + "epoch": 0.9029553552714316, + "grad_norm": 0.021197538822889328, + "learning_rate": 2.460830563222111e-07, + "loss": 0.0515, + "step": 4308 + }, + { + "epoch": 0.9031649549360721, + "grad_norm": 0.014859777875244617, + "learning_rate": 2.45032236919443e-07, + "loss": 0.0555, + "step": 4309 + }, + { + "epoch": 0.9033745546007126, + "grad_norm": 0.010106687434017658, + "learning_rate": 2.439836095558262e-07, + "loss": 0.053, + "step": 4310 + }, + { + "epoch": 0.9035841542653532, + "grad_norm": 0.016903823241591454, + "learning_rate": 2.429371747147785e-07, + "loss": 0.0536, + "step": 4311 + }, + { + "epoch": 0.9037937539299937, + "grad_norm": 0.018577901646494865, + "learning_rate": 2.41892932878704e-07, + "loss": 0.0544, + "step": 4312 + }, + { + "epoch": 0.9040033535946342, + "grad_norm": 0.01648200862109661, + "learning_rate": 2.408508845289992e-07, + "loss": 0.054, + "step": 4313 + }, + { + "epoch": 0.9042129532592748, + "grad_norm": 0.012114167213439941, + "learning_rate": 2.3981103014604765e-07, + "loss": 0.0536, + "step": 4314 + }, + { + "epoch": 0.9044225529239153, + "grad_norm": 0.012820238247513771, + "learning_rate": 2.387733702092232e-07, + "loss": 0.0555, + "step": 4315 + }, + { + "epoch": 0.9046321525885559, + "grad_norm": 0.018219808116555214, + "learning_rate": 2.3773790519688644e-07, + "loss": 0.0574, + "step": 4316 + }, + { + "epoch": 0.9048417522531964, + "grad_norm": 0.022166509181261063, + "learning_rate": 2.3670463558638556e-07, + "loss": 0.0546, + "step": 4317 + }, + { + "epoch": 0.9050513519178369, + "grad_norm": 0.013082697987556458, + "learning_rate": 2.3567356185405844e-07, + "loss": 0.0553, + "step": 4318 + }, + { + "epoch": 0.9052609515824774, + "grad_norm": 0.0153275141492486, + "learning_rate": 2.3464468447522925e-07, + "loss": 0.0527, + "step": 4319 + }, + { + "epoch": 0.905470551247118, + "grad_norm": 0.019890129566192627, + "learning_rate": 2.3361800392421086e-07, + "loss": 0.0537, + "step": 4320 + }, + { + "epoch": 0.9056801509117586, + "grad_norm": 0.020730622112751007, + "learning_rate": 2.3259352067430298e-07, + "loss": 0.0548, + "step": 4321 + }, + { + "epoch": 0.9058897505763991, + "grad_norm": 0.019399119541049004, + "learning_rate": 2.3157123519779168e-07, + "loss": 0.0541, + "step": 4322 + }, + { + "epoch": 0.9060993502410396, + "grad_norm": 0.02034193091094494, + "learning_rate": 2.3055114796595e-07, + "loss": 0.0571, + "step": 4323 + }, + { + "epoch": 0.9063089499056801, + "grad_norm": 0.02012833207845688, + "learning_rate": 2.2953325944903848e-07, + "loss": 0.0539, + "step": 4324 + }, + { + "epoch": 0.9065185495703207, + "grad_norm": 0.020468810573220253, + "learning_rate": 2.2851757011630393e-07, + "loss": 0.057, + "step": 4325 + }, + { + "epoch": 0.9067281492349613, + "grad_norm": 0.0244952030479908, + "learning_rate": 2.2750408043597794e-07, + "loss": 0.0518, + "step": 4326 + }, + { + "epoch": 0.9069377488996018, + "grad_norm": 0.023987367749214172, + "learning_rate": 2.264927908752801e-07, + "loss": 0.0568, + "step": 4327 + }, + { + "epoch": 0.9071473485642423, + "grad_norm": 0.017585035413503647, + "learning_rate": 2.2548370190041414e-07, + "loss": 0.0534, + "step": 4328 + }, + { + "epoch": 0.9073569482288828, + "grad_norm": 0.017734866589307785, + "learning_rate": 2.2447681397656908e-07, + "loss": 0.0527, + "step": 4329 + }, + { + "epoch": 0.9075665478935234, + "grad_norm": 0.023448040708899498, + "learning_rate": 2.234721275679208e-07, + "loss": 0.0557, + "step": 4330 + }, + { + "epoch": 0.9077761475581639, + "grad_norm": 0.02103973738849163, + "learning_rate": 2.2246964313763053e-07, + "loss": 0.055, + "step": 4331 + }, + { + "epoch": 0.9079857472228045, + "grad_norm": 0.019696585834026337, + "learning_rate": 2.2146936114784134e-07, + "loss": 0.0544, + "step": 4332 + }, + { + "epoch": 0.908195346887445, + "grad_norm": 0.016549983993172646, + "learning_rate": 2.2047128205968494e-07, + "loss": 0.0539, + "step": 4333 + }, + { + "epoch": 0.9084049465520855, + "grad_norm": 0.02317041903734207, + "learning_rate": 2.1947540633327437e-07, + "loss": 0.0528, + "step": 4334 + }, + { + "epoch": 0.9086145462167261, + "grad_norm": 0.020171795040369034, + "learning_rate": 2.184817344277085e-07, + "loss": 0.0543, + "step": 4335 + }, + { + "epoch": 0.9088241458813666, + "grad_norm": 0.019286734983325005, + "learning_rate": 2.1749026680106922e-07, + "loss": 0.0522, + "step": 4336 + }, + { + "epoch": 0.9090337455460071, + "grad_norm": 0.011374424211680889, + "learning_rate": 2.1650100391042373e-07, + "loss": 0.0576, + "step": 4337 + }, + { + "epoch": 0.9092433452106476, + "grad_norm": 0.022341901436448097, + "learning_rate": 2.1551394621182277e-07, + "loss": 0.0552, + "step": 4338 + }, + { + "epoch": 0.9094529448752882, + "grad_norm": 0.026063280180096626, + "learning_rate": 2.1452909416029844e-07, + "loss": 0.0539, + "step": 4339 + }, + { + "epoch": 0.9096625445399288, + "grad_norm": 0.01974085345864296, + "learning_rate": 2.1354644820986703e-07, + "loss": 0.0536, + "step": 4340 + }, + { + "epoch": 0.9098721442045693, + "grad_norm": 0.011266632936894894, + "learning_rate": 2.1256600881352951e-07, + "loss": 0.0542, + "step": 4341 + }, + { + "epoch": 0.9100817438692098, + "grad_norm": 0.02346237748861313, + "learning_rate": 2.1158777642326656e-07, + "loss": 0.0558, + "step": 4342 + }, + { + "epoch": 0.9102913435338503, + "grad_norm": 0.020998260006308556, + "learning_rate": 2.1061175149004464e-07, + "loss": 0.0536, + "step": 4343 + }, + { + "epoch": 0.9105009431984908, + "grad_norm": 0.01797637529671192, + "learning_rate": 2.0963793446381053e-07, + "loss": 0.0532, + "step": 4344 + }, + { + "epoch": 0.9107105428631315, + "grad_norm": 0.016491252928972244, + "learning_rate": 2.08666325793494e-07, + "loss": 0.0522, + "step": 4345 + }, + { + "epoch": 0.910920142527772, + "grad_norm": 0.011900043115019798, + "learning_rate": 2.076969259270051e-07, + "loss": 0.0581, + "step": 4346 + }, + { + "epoch": 0.9111297421924125, + "grad_norm": 0.014170379377901554, + "learning_rate": 2.0672973531123796e-07, + "loss": 0.0534, + "step": 4347 + }, + { + "epoch": 0.911339341857053, + "grad_norm": 0.01760711334645748, + "learning_rate": 2.0576475439206767e-07, + "loss": 0.0555, + "step": 4348 + }, + { + "epoch": 0.9115489415216935, + "grad_norm": 0.017065905034542084, + "learning_rate": 2.0480198361435e-07, + "loss": 0.055, + "step": 4349 + }, + { + "epoch": 0.9117585411863341, + "grad_norm": 0.012907272204756737, + "learning_rate": 2.038414234219216e-07, + "loss": 0.0532, + "step": 4350 + }, + { + "epoch": 0.9119681408509747, + "grad_norm": 0.010160834528505802, + "learning_rate": 2.0288307425760046e-07, + "loss": 0.0543, + "step": 4351 + }, + { + "epoch": 0.9121777405156152, + "grad_norm": 0.013258119113743305, + "learning_rate": 2.0192693656318597e-07, + "loss": 0.0518, + "step": 4352 + }, + { + "epoch": 0.9123873401802557, + "grad_norm": 0.0151562774553895, + "learning_rate": 2.0097301077945607e-07, + "loss": 0.0527, + "step": 4353 + }, + { + "epoch": 0.9125969398448962, + "grad_norm": 0.016773242503404617, + "learning_rate": 2.0002129734617292e-07, + "loss": 0.0534, + "step": 4354 + }, + { + "epoch": 0.9128065395095368, + "grad_norm": 0.012901091016829014, + "learning_rate": 1.9907179670207387e-07, + "loss": 0.0518, + "step": 4355 + }, + { + "epoch": 0.9130161391741773, + "grad_norm": 0.010573726147413254, + "learning_rate": 1.9812450928487936e-07, + "loss": 0.0518, + "step": 4356 + }, + { + "epoch": 0.9132257388388179, + "grad_norm": 0.012841584160923958, + "learning_rate": 1.9717943553128893e-07, + "loss": 0.0556, + "step": 4357 + }, + { + "epoch": 0.9134353385034584, + "grad_norm": 0.014390083961188793, + "learning_rate": 1.962365758769802e-07, + "loss": 0.0526, + "step": 4358 + }, + { + "epoch": 0.9136449381680989, + "grad_norm": 0.014265509322285652, + "learning_rate": 1.9529593075661267e-07, + "loss": 0.0531, + "step": 4359 + }, + { + "epoch": 0.9138545378327395, + "grad_norm": 0.013005812652409077, + "learning_rate": 1.943575006038234e-07, + "loss": 0.0535, + "step": 4360 + }, + { + "epoch": 0.91406413749738, + "grad_norm": 0.009814724326133728, + "learning_rate": 1.9342128585122798e-07, + "loss": 0.0574, + "step": 4361 + }, + { + "epoch": 0.9142737371620205, + "grad_norm": 0.011820555664598942, + "learning_rate": 1.9248728693042117e-07, + "loss": 0.0519, + "step": 4362 + }, + { + "epoch": 0.914483336826661, + "grad_norm": 0.014412821270525455, + "learning_rate": 1.9155550427197577e-07, + "loss": 0.0579, + "step": 4363 + }, + { + "epoch": 0.9146929364913016, + "grad_norm": 0.013178630731999874, + "learning_rate": 1.906259383054454e-07, + "loss": 0.0536, + "step": 4364 + }, + { + "epoch": 0.9149025361559422, + "grad_norm": 0.01012821588665247, + "learning_rate": 1.8969858945935783e-07, + "loss": 0.0566, + "step": 4365 + }, + { + "epoch": 0.9151121358205827, + "grad_norm": 0.009884363040328026, + "learning_rate": 1.8877345816122162e-07, + "loss": 0.0551, + "step": 4366 + }, + { + "epoch": 0.9153217354852232, + "grad_norm": 0.011177174746990204, + "learning_rate": 1.8785054483752174e-07, + "loss": 0.0518, + "step": 4367 + }, + { + "epoch": 0.9155313351498637, + "grad_norm": 0.012874050997197628, + "learning_rate": 1.8692984991372065e-07, + "loss": 0.0554, + "step": 4368 + }, + { + "epoch": 0.9157409348145042, + "grad_norm": 0.012516334652900696, + "learning_rate": 1.8601137381425938e-07, + "loss": 0.0551, + "step": 4369 + }, + { + "epoch": 0.9159505344791449, + "grad_norm": 0.010518589057028294, + "learning_rate": 1.8509511696255421e-07, + "loss": 0.0543, + "step": 4370 + }, + { + "epoch": 0.9161601341437854, + "grad_norm": 0.00973634421825409, + "learning_rate": 1.841810797810012e-07, + "loss": 0.0542, + "step": 4371 + }, + { + "epoch": 0.9163697338084259, + "grad_norm": 0.01056087389588356, + "learning_rate": 1.8326926269096935e-07, + "loss": 0.0525, + "step": 4372 + }, + { + "epoch": 0.9165793334730664, + "grad_norm": 0.012748325243592262, + "learning_rate": 1.8235966611280687e-07, + "loss": 0.0534, + "step": 4373 + }, + { + "epoch": 0.9167889331377069, + "grad_norm": 0.010445375926792622, + "learning_rate": 1.8145229046583778e-07, + "loss": 0.0553, + "step": 4374 + }, + { + "epoch": 0.9169985328023476, + "grad_norm": 0.00820910930633545, + "learning_rate": 1.805471361683614e-07, + "loss": 0.0551, + "step": 4375 + }, + { + "epoch": 0.9172081324669881, + "grad_norm": 0.008532201871275902, + "learning_rate": 1.7964420363765444e-07, + "loss": 0.0562, + "step": 4376 + }, + { + "epoch": 0.9174177321316286, + "grad_norm": 0.01010076142847538, + "learning_rate": 1.78743493289969e-07, + "loss": 0.0537, + "step": 4377 + }, + { + "epoch": 0.9176273317962691, + "grad_norm": 0.011571811512112617, + "learning_rate": 1.778450055405312e-07, + "loss": 0.0552, + "step": 4378 + }, + { + "epoch": 0.9178369314609096, + "grad_norm": 0.01056272629648447, + "learning_rate": 1.7694874080354362e-07, + "loss": 0.056, + "step": 4379 + }, + { + "epoch": 0.9180465311255502, + "grad_norm": 0.008779754862189293, + "learning_rate": 1.760546994921858e-07, + "loss": 0.0541, + "step": 4380 + }, + { + "epoch": 0.9182561307901907, + "grad_norm": 0.008502528071403503, + "learning_rate": 1.7516288201860853e-07, + "loss": 0.0514, + "step": 4381 + }, + { + "epoch": 0.9184657304548313, + "grad_norm": 0.009822116233408451, + "learning_rate": 1.742732887939408e-07, + "loss": 0.0528, + "step": 4382 + }, + { + "epoch": 0.9186753301194718, + "grad_norm": 0.012791884131729603, + "learning_rate": 1.733859202282845e-07, + "loss": 0.0535, + "step": 4383 + }, + { + "epoch": 0.9188849297841123, + "grad_norm": 0.010147576220333576, + "learning_rate": 1.7250077673071685e-07, + "loss": 0.0537, + "step": 4384 + }, + { + "epoch": 0.9190945294487529, + "grad_norm": 0.009435161016881466, + "learning_rate": 1.716178587092876e-07, + "loss": 0.0526, + "step": 4385 + }, + { + "epoch": 0.9193041291133934, + "grad_norm": 0.007458524778485298, + "learning_rate": 1.7073716657102278e-07, + "loss": 0.0523, + "step": 4386 + }, + { + "epoch": 0.919513728778034, + "grad_norm": 0.008520993404090405, + "learning_rate": 1.6985870072192156e-07, + "loss": 0.0538, + "step": 4387 + }, + { + "epoch": 0.9197233284426745, + "grad_norm": 0.01144067570567131, + "learning_rate": 1.6898246156695552e-07, + "loss": 0.0559, + "step": 4388 + }, + { + "epoch": 0.919932928107315, + "grad_norm": 0.01067439466714859, + "learning_rate": 1.6810844951007099e-07, + "loss": 0.0563, + "step": 4389 + }, + { + "epoch": 0.9201425277719556, + "grad_norm": 0.012738638557493687, + "learning_rate": 1.6723666495418844e-07, + "loss": 0.0553, + "step": 4390 + }, + { + "epoch": 0.9203521274365961, + "grad_norm": 0.009737935848534107, + "learning_rate": 1.6636710830119863e-07, + "loss": 0.054, + "step": 4391 + }, + { + "epoch": 0.9205617271012366, + "grad_norm": 0.0069859870709478855, + "learning_rate": 1.6549977995196809e-07, + "loss": 0.0557, + "step": 4392 + }, + { + "epoch": 0.9207713267658771, + "grad_norm": 0.011632826179265976, + "learning_rate": 1.6463468030633478e-07, + "loss": 0.0526, + "step": 4393 + }, + { + "epoch": 0.9209809264305178, + "grad_norm": 0.011854654178023338, + "learning_rate": 1.6377180976310968e-07, + "loss": 0.0561, + "step": 4394 + }, + { + "epoch": 0.9211905260951583, + "grad_norm": 0.010901962406933308, + "learning_rate": 1.6291116872007573e-07, + "loss": 0.0511, + "step": 4395 + }, + { + "epoch": 0.9214001257597988, + "grad_norm": 0.009396993555128574, + "learning_rate": 1.6205275757398774e-07, + "loss": 0.0547, + "step": 4396 + }, + { + "epoch": 0.9216097254244393, + "grad_norm": 0.00782528892159462, + "learning_rate": 1.611965767205742e-07, + "loss": 0.0529, + "step": 4397 + }, + { + "epoch": 0.9218193250890798, + "grad_norm": 0.00804727990180254, + "learning_rate": 1.6034262655453269e-07, + "loss": 0.0538, + "step": 4398 + }, + { + "epoch": 0.9220289247537204, + "grad_norm": 0.011399338953197002, + "learning_rate": 1.594909074695361e-07, + "loss": 0.0523, + "step": 4399 + }, + { + "epoch": 0.922238524418361, + "grad_norm": 0.013382563367486, + "learning_rate": 1.586414198582259e-07, + "loss": 0.0557, + "step": 4400 + }, + { + "epoch": 0.9224481240830015, + "grad_norm": 0.010052556172013283, + "learning_rate": 1.5779416411221437e-07, + "loss": 0.0554, + "step": 4401 + }, + { + "epoch": 0.922657723747642, + "grad_norm": 0.010469266213476658, + "learning_rate": 1.5694914062208799e-07, + "loss": 0.0544, + "step": 4402 + }, + { + "epoch": 0.9228673234122825, + "grad_norm": 0.011628585867583752, + "learning_rate": 1.561063497774018e-07, + "loss": 0.0535, + "step": 4403 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.016579635441303253, + "learning_rate": 1.552657919666817e-07, + "loss": 0.0547, + "step": 4404 + }, + { + "epoch": 0.9232865227415636, + "grad_norm": 0.01323653757572174, + "learning_rate": 1.54427467577426e-07, + "loss": 0.0525, + "step": 4405 + }, + { + "epoch": 0.9234961224062042, + "grad_norm": 0.01618046499788761, + "learning_rate": 1.535913769961006e-07, + "loss": 0.0535, + "step": 4406 + }, + { + "epoch": 0.9237057220708447, + "grad_norm": 0.012092587538063526, + "learning_rate": 1.527575206081444e-07, + "loss": 0.0524, + "step": 4407 + }, + { + "epoch": 0.9239153217354852, + "grad_norm": 0.014828270301222801, + "learning_rate": 1.5192589879796383e-07, + "loss": 0.0533, + "step": 4408 + }, + { + "epoch": 0.9241249214001258, + "grad_norm": 0.008504010736942291, + "learning_rate": 1.510965119489366e-07, + "loss": 0.0541, + "step": 4409 + }, + { + "epoch": 0.9243345210647663, + "grad_norm": 0.010867651551961899, + "learning_rate": 1.5026936044341078e-07, + "loss": 0.0546, + "step": 4410 + }, + { + "epoch": 0.9245441207294068, + "grad_norm": 0.015342487022280693, + "learning_rate": 1.4944444466270248e-07, + "loss": 0.0543, + "step": 4411 + }, + { + "epoch": 0.9247537203940474, + "grad_norm": 0.01415248867124319, + "learning_rate": 1.486217649870969e-07, + "loss": 0.0539, + "step": 4412 + }, + { + "epoch": 0.9249633200586879, + "grad_norm": 0.009957171976566315, + "learning_rate": 1.478013217958507e-07, + "loss": 0.0545, + "step": 4413 + }, + { + "epoch": 0.9251729197233285, + "grad_norm": 0.00818433053791523, + "learning_rate": 1.4698311546718635e-07, + "loss": 0.0556, + "step": 4414 + }, + { + "epoch": 0.925382519387969, + "grad_norm": 0.010151896625757217, + "learning_rate": 1.4616714637829822e-07, + "loss": 0.0533, + "step": 4415 + }, + { + "epoch": 0.9255921190526095, + "grad_norm": 0.012620055116713047, + "learning_rate": 1.453534149053476e-07, + "loss": 0.0556, + "step": 4416 + }, + { + "epoch": 0.92580171871725, + "grad_norm": 0.011959872208535671, + "learning_rate": 1.4454192142346446e-07, + "loss": 0.0549, + "step": 4417 + }, + { + "epoch": 0.9260113183818905, + "grad_norm": 0.010501948185265064, + "learning_rate": 1.437326663067462e-07, + "loss": 0.0526, + "step": 4418 + }, + { + "epoch": 0.9262209180465312, + "grad_norm": 0.009827865287661552, + "learning_rate": 1.429256499282605e-07, + "loss": 0.0529, + "step": 4419 + }, + { + "epoch": 0.9264305177111717, + "grad_norm": 0.009340137243270874, + "learning_rate": 1.421208726600415e-07, + "loss": 0.0524, + "step": 4420 + }, + { + "epoch": 0.9266401173758122, + "grad_norm": 0.010111922398209572, + "learning_rate": 1.4131833487309122e-07, + "loss": 0.0547, + "step": 4421 + }, + { + "epoch": 0.9268497170404527, + "grad_norm": 0.01139815803617239, + "learning_rate": 1.4051803693737876e-07, + "loss": 0.0537, + "step": 4422 + }, + { + "epoch": 0.9270593167050932, + "grad_norm": 0.009999927133321762, + "learning_rate": 1.3971997922184234e-07, + "loss": 0.0532, + "step": 4423 + }, + { + "epoch": 0.9272689163697339, + "grad_norm": 0.010446673259139061, + "learning_rate": 1.3892416209438542e-07, + "loss": 0.0524, + "step": 4424 + }, + { + "epoch": 0.9274785160343744, + "grad_norm": 0.012159881182014942, + "learning_rate": 1.381305859218801e-07, + "loss": 0.051, + "step": 4425 + }, + { + "epoch": 0.9276881156990149, + "grad_norm": 0.013391540385782719, + "learning_rate": 1.3733925107016542e-07, + "loss": 0.0519, + "step": 4426 + }, + { + "epoch": 0.9278977153636554, + "grad_norm": 0.006831051781773567, + "learning_rate": 1.3655015790404568e-07, + "loss": 0.0536, + "step": 4427 + }, + { + "epoch": 0.9281073150282959, + "grad_norm": 0.007939697243273258, + "learning_rate": 1.3576330678729266e-07, + "loss": 0.0535, + "step": 4428 + }, + { + "epoch": 0.9283169146929365, + "grad_norm": 0.015715105459094048, + "learning_rate": 1.3497869808264453e-07, + "loss": 0.054, + "step": 4429 + }, + { + "epoch": 0.928526514357577, + "grad_norm": 0.008145746774971485, + "learning_rate": 1.341963321518064e-07, + "loss": 0.057, + "step": 4430 + }, + { + "epoch": 0.9287361140222176, + "grad_norm": 0.0074055916629731655, + "learning_rate": 1.3341620935544864e-07, + "loss": 0.0559, + "step": 4431 + }, + { + "epoch": 0.9289457136868581, + "grad_norm": 0.00775395892560482, + "learning_rate": 1.3263833005320747e-07, + "loss": 0.0521, + "step": 4432 + }, + { + "epoch": 0.9291553133514986, + "grad_norm": 0.008558698929846287, + "learning_rate": 1.3186269460368496e-07, + "loss": 0.0523, + "step": 4433 + }, + { + "epoch": 0.9293649130161392, + "grad_norm": 0.009375239722430706, + "learning_rate": 1.3108930336444893e-07, + "loss": 0.0522, + "step": 4434 + }, + { + "epoch": 0.9295745126807797, + "grad_norm": 0.007077595219016075, + "learning_rate": 1.303181566920325e-07, + "loss": 0.0538, + "step": 4435 + }, + { + "epoch": 0.9297841123454202, + "grad_norm": 0.0054700132459402084, + "learning_rate": 1.2954925494193472e-07, + "loss": 0.0545, + "step": 4436 + }, + { + "epoch": 0.9299937120100608, + "grad_norm": 0.008495796471834183, + "learning_rate": 1.2878259846861862e-07, + "loss": 0.0587, + "step": 4437 + }, + { + "epoch": 0.9302033116747013, + "grad_norm": 0.008338769897818565, + "learning_rate": 1.2801818762551265e-07, + "loss": 0.051, + "step": 4438 + }, + { + "epoch": 0.9304129113393419, + "grad_norm": 0.009145451709628105, + "learning_rate": 1.2725602276501047e-07, + "loss": 0.0544, + "step": 4439 + }, + { + "epoch": 0.9306225110039824, + "grad_norm": 0.00860004685819149, + "learning_rate": 1.2649610423846937e-07, + "loss": 0.0549, + "step": 4440 + }, + { + "epoch": 0.9308321106686229, + "grad_norm": 0.006629549898207188, + "learning_rate": 1.2573843239621185e-07, + "loss": 0.0548, + "step": 4441 + }, + { + "epoch": 0.9310417103332634, + "grad_norm": 0.007576174568384886, + "learning_rate": 1.2498300758752413e-07, + "loss": 0.0517, + "step": 4442 + }, + { + "epoch": 0.931251309997904, + "grad_norm": 0.007479529827833176, + "learning_rate": 1.2422983016065816e-07, + "loss": 0.0575, + "step": 4443 + }, + { + "epoch": 0.9314609096625446, + "grad_norm": 0.006873736623674631, + "learning_rate": 1.2347890046282785e-07, + "loss": 0.0543, + "step": 4444 + }, + { + "epoch": 0.9316705093271851, + "grad_norm": 0.008335386402904987, + "learning_rate": 1.2273021884021074e-07, + "loss": 0.0526, + "step": 4445 + }, + { + "epoch": 0.9318801089918256, + "grad_norm": 0.007241726852953434, + "learning_rate": 1.2198378563795023e-07, + "loss": 0.0534, + "step": 4446 + }, + { + "epoch": 0.9320897086564661, + "grad_norm": 0.008679443039000034, + "learning_rate": 1.2123960120015045e-07, + "loss": 0.0542, + "step": 4447 + }, + { + "epoch": 0.9322993083211066, + "grad_norm": 0.0065467506647109985, + "learning_rate": 1.2049766586988142e-07, + "loss": 0.0506, + "step": 4448 + }, + { + "epoch": 0.9325089079857473, + "grad_norm": 0.0077818529680371284, + "learning_rate": 1.1975797998917514e-07, + "loss": 0.0526, + "step": 4449 + }, + { + "epoch": 0.9327185076503878, + "grad_norm": 0.008923080749809742, + "learning_rate": 1.1902054389902662e-07, + "loss": 0.0511, + "step": 4450 + }, + { + "epoch": 0.9329281073150283, + "grad_norm": 0.009406541474163532, + "learning_rate": 1.1828535793939277e-07, + "loss": 0.0507, + "step": 4451 + }, + { + "epoch": 0.9331377069796688, + "grad_norm": 0.008926275186240673, + "learning_rate": 1.1755242244919528e-07, + "loss": 0.0553, + "step": 4452 + }, + { + "epoch": 0.9333473066443093, + "grad_norm": 0.007040916942059994, + "learning_rate": 1.1682173776631722e-07, + "loss": 0.054, + "step": 4453 + }, + { + "epoch": 0.9335569063089499, + "grad_norm": 0.009015001356601715, + "learning_rate": 1.16093304227603e-07, + "loss": 0.0555, + "step": 4454 + }, + { + "epoch": 0.9337665059735905, + "grad_norm": 0.010234599001705647, + "learning_rate": 1.1536712216886181e-07, + "loss": 0.0542, + "step": 4455 + }, + { + "epoch": 0.933976105638231, + "grad_norm": 0.008125402964651585, + "learning_rate": 1.1464319192486251e-07, + "loss": 0.0515, + "step": 4456 + }, + { + "epoch": 0.9341857053028715, + "grad_norm": 0.009704657830297947, + "learning_rate": 1.1392151382933647e-07, + "loss": 0.0555, + "step": 4457 + }, + { + "epoch": 0.934395304967512, + "grad_norm": 0.009614041075110435, + "learning_rate": 1.1320208821497758e-07, + "loss": 0.0529, + "step": 4458 + }, + { + "epoch": 0.9346049046321526, + "grad_norm": 0.007132851053029299, + "learning_rate": 1.1248491541344164e-07, + "loss": 0.0539, + "step": 4459 + }, + { + "epoch": 0.9348145042967931, + "grad_norm": 0.008845999836921692, + "learning_rate": 1.117699957553442e-07, + "loss": 0.0558, + "step": 4460 + }, + { + "epoch": 0.9350241039614337, + "grad_norm": 0.009133417159318924, + "learning_rate": 1.1105732957026272e-07, + "loss": 0.0528, + "step": 4461 + }, + { + "epoch": 0.9352337036260742, + "grad_norm": 0.008758967742323875, + "learning_rate": 1.1034691718673774e-07, + "loss": 0.0551, + "step": 4462 + }, + { + "epoch": 0.9354433032907148, + "grad_norm": 0.009887007996439934, + "learning_rate": 1.0963875893226728e-07, + "loss": 0.0545, + "step": 4463 + }, + { + "epoch": 0.9356529029553553, + "grad_norm": 0.00951511599123478, + "learning_rate": 1.0893285513331353e-07, + "loss": 0.0566, + "step": 4464 + }, + { + "epoch": 0.9358625026199958, + "grad_norm": 0.0072890762239694595, + "learning_rate": 1.0822920611529786e-07, + "loss": 0.0538, + "step": 4465 + }, + { + "epoch": 0.9360721022846363, + "grad_norm": 0.008058588020503521, + "learning_rate": 1.0752781220260245e-07, + "loss": 0.0518, + "step": 4466 + }, + { + "epoch": 0.9362817019492768, + "grad_norm": 0.008425997570157051, + "learning_rate": 1.0682867371856864e-07, + "loss": 0.0534, + "step": 4467 + }, + { + "epoch": 0.9364913016139175, + "grad_norm": 0.008281690068542957, + "learning_rate": 1.0613179098549975e-07, + "loss": 0.055, + "step": 4468 + }, + { + "epoch": 0.936700901278558, + "grad_norm": 0.009938735514879227, + "learning_rate": 1.0543716432465933e-07, + "loss": 0.0524, + "step": 4469 + }, + { + "epoch": 0.9369105009431985, + "grad_norm": 0.009986302815377712, + "learning_rate": 1.0474479405626847e-07, + "loss": 0.0527, + "step": 4470 + }, + { + "epoch": 0.937120100607839, + "grad_norm": 0.007245698943734169, + "learning_rate": 1.0405468049951184e-07, + "loss": 0.0539, + "step": 4471 + }, + { + "epoch": 0.9373297002724795, + "grad_norm": 0.009867161512374878, + "learning_rate": 1.0336682397252995e-07, + "loss": 0.0551, + "step": 4472 + }, + { + "epoch": 0.9375392999371202, + "grad_norm": 0.008164730854332447, + "learning_rate": 1.0268122479242526e-07, + "loss": 0.0547, + "step": 4473 + }, + { + "epoch": 0.9377488996017607, + "grad_norm": 0.0104014091193676, + "learning_rate": 1.0199788327525828e-07, + "loss": 0.0508, + "step": 4474 + }, + { + "epoch": 0.9379584992664012, + "grad_norm": 0.009538762271404266, + "learning_rate": 1.013167997360498e-07, + "loss": 0.0505, + "step": 4475 + }, + { + "epoch": 0.9381680989310417, + "grad_norm": 0.008720876649022102, + "learning_rate": 1.0063797448877977e-07, + "loss": 0.0534, + "step": 4476 + }, + { + "epoch": 0.9383776985956822, + "grad_norm": 0.010985384695231915, + "learning_rate": 9.996140784638564e-08, + "loss": 0.0572, + "step": 4477 + }, + { + "epoch": 0.9385872982603228, + "grad_norm": 0.008751952089369297, + "learning_rate": 9.928710012076404e-08, + "loss": 0.0542, + "step": 4478 + }, + { + "epoch": 0.9387968979249633, + "grad_norm": 0.007031100802123547, + "learning_rate": 9.861505162277186e-08, + "loss": 0.0552, + "step": 4479 + }, + { + "epoch": 0.9390064975896039, + "grad_norm": 0.008988458663225174, + "learning_rate": 9.794526266222237e-08, + "loss": 0.0531, + "step": 4480 + }, + { + "epoch": 0.9392160972542444, + "grad_norm": 0.007348395884037018, + "learning_rate": 9.727773354788861e-08, + "loss": 0.0547, + "step": 4481 + }, + { + "epoch": 0.9394256969188849, + "grad_norm": 0.009616075083613396, + "learning_rate": 9.661246458750162e-08, + "loss": 0.053, + "step": 4482 + }, + { + "epoch": 0.9396352965835255, + "grad_norm": 0.00715535506606102, + "learning_rate": 9.594945608774997e-08, + "loss": 0.0563, + "step": 4483 + }, + { + "epoch": 0.939844896248166, + "grad_norm": 0.008326810784637928, + "learning_rate": 9.528870835427972e-08, + "loss": 0.0532, + "step": 4484 + }, + { + "epoch": 0.9400544959128065, + "grad_norm": 0.009740813635289669, + "learning_rate": 9.463022169169666e-08, + "loss": 0.0514, + "step": 4485 + }, + { + "epoch": 0.9402640955774471, + "grad_norm": 0.0066096787340939045, + "learning_rate": 9.397399640356242e-08, + "loss": 0.0543, + "step": 4486 + }, + { + "epoch": 0.9404736952420876, + "grad_norm": 0.009233192540705204, + "learning_rate": 9.332003279239665e-08, + "loss": 0.0582, + "step": 4487 + }, + { + "epoch": 0.9406832949067282, + "grad_norm": 0.01031689066439867, + "learning_rate": 9.266833115967655e-08, + "loss": 0.0556, + "step": 4488 + }, + { + "epoch": 0.9408928945713687, + "grad_norm": 0.010228820145130157, + "learning_rate": 9.201889180583679e-08, + "loss": 0.0538, + "step": 4489 + }, + { + "epoch": 0.9411024942360092, + "grad_norm": 0.0081089548766613, + "learning_rate": 9.137171503026787e-08, + "loss": 0.0541, + "step": 4490 + }, + { + "epoch": 0.9413120939006497, + "grad_norm": 0.007212504744529724, + "learning_rate": 9.072680113131893e-08, + "loss": 0.0517, + "step": 4491 + }, + { + "epoch": 0.9415216935652903, + "grad_norm": 0.006384396459907293, + "learning_rate": 9.008415040629548e-08, + "loss": 0.0528, + "step": 4492 + }, + { + "epoch": 0.9417312932299309, + "grad_norm": 0.007916389964520931, + "learning_rate": 8.944376315145831e-08, + "loss": 0.0533, + "step": 4493 + }, + { + "epoch": 0.9419408928945714, + "grad_norm": 0.009434894658625126, + "learning_rate": 8.880563966202627e-08, + "loss": 0.0538, + "step": 4494 + }, + { + "epoch": 0.9421504925592119, + "grad_norm": 0.009904226288199425, + "learning_rate": 8.816978023217404e-08, + "loss": 0.0542, + "step": 4495 + }, + { + "epoch": 0.9423600922238524, + "grad_norm": 0.008942014537751675, + "learning_rate": 8.753618515503325e-08, + "loss": 0.0543, + "step": 4496 + }, + { + "epoch": 0.9425696918884929, + "grad_norm": 0.008763092570006847, + "learning_rate": 8.690485472269028e-08, + "loss": 0.0546, + "step": 4497 + }, + { + "epoch": 0.9427792915531336, + "grad_norm": 0.006246887147426605, + "learning_rate": 8.627578922618895e-08, + "loss": 0.0525, + "step": 4498 + }, + { + "epoch": 0.9429888912177741, + "grad_norm": 0.00860271230340004, + "learning_rate": 8.564898895552843e-08, + "loss": 0.0542, + "step": 4499 + }, + { + "epoch": 0.9431984908824146, + "grad_norm": 0.005993400234729052, + "learning_rate": 8.502445419966254e-08, + "loss": 0.0541, + "step": 4500 + }, + { + "epoch": 0.9434080905470551, + "grad_norm": 0.009251521900296211, + "learning_rate": 8.440218524650268e-08, + "loss": 0.0515, + "step": 4501 + }, + { + "epoch": 0.9436176902116956, + "grad_norm": 0.01032980252057314, + "learning_rate": 8.378218238291492e-08, + "loss": 0.0538, + "step": 4502 + }, + { + "epoch": 0.9438272898763362, + "grad_norm": 0.007855959236621857, + "learning_rate": 8.316444589471895e-08, + "loss": 0.0531, + "step": 4503 + }, + { + "epoch": 0.9440368895409768, + "grad_norm": 0.008385205641388893, + "learning_rate": 8.254897606669254e-08, + "loss": 0.0547, + "step": 4504 + }, + { + "epoch": 0.9442464892056173, + "grad_norm": 0.0065010362304747105, + "learning_rate": 8.193577318256707e-08, + "loss": 0.056, + "step": 4505 + }, + { + "epoch": 0.9444560888702578, + "grad_norm": 0.007750457618385553, + "learning_rate": 8.132483752502806e-08, + "loss": 0.0555, + "step": 4506 + }, + { + "epoch": 0.9446656885348983, + "grad_norm": 0.007757239043712616, + "learning_rate": 8.071616937571692e-08, + "loss": 0.0538, + "step": 4507 + }, + { + "epoch": 0.9448752881995389, + "grad_norm": 0.006676631513983011, + "learning_rate": 8.010976901523082e-08, + "loss": 0.0536, + "step": 4508 + }, + { + "epoch": 0.9450848878641794, + "grad_norm": 0.007698145695030689, + "learning_rate": 7.95056367231184e-08, + "loss": 0.0557, + "step": 4509 + }, + { + "epoch": 0.94529448752882, + "grad_norm": 0.009001096710562706, + "learning_rate": 7.89037727778863e-08, + "loss": 0.0539, + "step": 4510 + }, + { + "epoch": 0.9455040871934605, + "grad_norm": 0.008882983587682247, + "learning_rate": 7.830417745699204e-08, + "loss": 0.0516, + "step": 4511 + }, + { + "epoch": 0.945713686858101, + "grad_norm": 0.006819653324782848, + "learning_rate": 7.770685103685005e-08, + "loss": 0.0526, + "step": 4512 + }, + { + "epoch": 0.9459232865227416, + "grad_norm": 0.006686016917228699, + "learning_rate": 7.711179379282674e-08, + "loss": 0.0529, + "step": 4513 + }, + { + "epoch": 0.9461328861873821, + "grad_norm": 0.0065452903509140015, + "learning_rate": 7.651900599924378e-08, + "loss": 0.0526, + "step": 4514 + }, + { + "epoch": 0.9463424858520226, + "grad_norm": 0.006739321630448103, + "learning_rate": 7.592848792937701e-08, + "loss": 0.0535, + "step": 4515 + }, + { + "epoch": 0.9465520855166631, + "grad_norm": 0.009856310673058033, + "learning_rate": 7.53402398554548e-08, + "loss": 0.0527, + "step": 4516 + }, + { + "epoch": 0.9467616851813037, + "grad_norm": 0.00838090106844902, + "learning_rate": 7.475426204865854e-08, + "loss": 0.0539, + "step": 4517 + }, + { + "epoch": 0.9469712848459443, + "grad_norm": 0.0068512773141264915, + "learning_rate": 7.417055477912438e-08, + "loss": 0.0536, + "step": 4518 + }, + { + "epoch": 0.9471808845105848, + "grad_norm": 0.006120042875409126, + "learning_rate": 7.358911831594095e-08, + "loss": 0.0537, + "step": 4519 + }, + { + "epoch": 0.9473904841752253, + "grad_norm": 0.008014840073883533, + "learning_rate": 7.300995292715107e-08, + "loss": 0.0535, + "step": 4520 + }, + { + "epoch": 0.9476000838398658, + "grad_norm": 0.006846890784800053, + "learning_rate": 7.24330588797495e-08, + "loss": 0.0555, + "step": 4521 + }, + { + "epoch": 0.9478096835045063, + "grad_norm": 0.006406300701200962, + "learning_rate": 7.18584364396846e-08, + "loss": 0.0551, + "step": 4522 + }, + { + "epoch": 0.948019283169147, + "grad_norm": 0.006220379378646612, + "learning_rate": 7.128608587185615e-08, + "loss": 0.0531, + "step": 4523 + }, + { + "epoch": 0.9482288828337875, + "grad_norm": 0.006792591419070959, + "learning_rate": 7.071600744011865e-08, + "loss": 0.0516, + "step": 4524 + }, + { + "epoch": 0.948438482498428, + "grad_norm": 0.0063816942274570465, + "learning_rate": 7.014820140727797e-08, + "loss": 0.0542, + "step": 4525 + }, + { + "epoch": 0.9486480821630685, + "grad_norm": 0.006532501429319382, + "learning_rate": 6.958266803509195e-08, + "loss": 0.0515, + "step": 4526 + }, + { + "epoch": 0.948857681827709, + "grad_norm": 0.005698981694877148, + "learning_rate": 6.901940758427206e-08, + "loss": 0.0547, + "step": 4527 + }, + { + "epoch": 0.9490672814923496, + "grad_norm": 0.007572156842797995, + "learning_rate": 6.845842031448113e-08, + "loss": 0.0508, + "step": 4528 + }, + { + "epoch": 0.9492768811569902, + "grad_norm": 0.005259588360786438, + "learning_rate": 6.789970648433397e-08, + "loss": 0.0532, + "step": 4529 + }, + { + "epoch": 0.9494864808216307, + "grad_norm": 0.00802881084382534, + "learning_rate": 6.734326635139732e-08, + "loss": 0.0535, + "step": 4530 + }, + { + "epoch": 0.9496960804862712, + "grad_norm": 0.008471885696053505, + "learning_rate": 6.678910017219098e-08, + "loss": 0.0531, + "step": 4531 + }, + { + "epoch": 0.9499056801509118, + "grad_norm": 0.006917075254023075, + "learning_rate": 6.623720820218449e-08, + "loss": 0.0513, + "step": 4532 + }, + { + "epoch": 0.9501152798155523, + "grad_norm": 0.006756001152098179, + "learning_rate": 6.568759069579988e-08, + "loss": 0.0524, + "step": 4533 + }, + { + "epoch": 0.9503248794801928, + "grad_norm": 0.006958952639251947, + "learning_rate": 6.514024790641116e-08, + "loss": 0.054, + "step": 4534 + }, + { + "epoch": 0.9505344791448334, + "grad_norm": 0.006947326939553022, + "learning_rate": 6.459518008634313e-08, + "loss": 0.0548, + "step": 4535 + }, + { + "epoch": 0.9507440788094739, + "grad_norm": 0.006160185672342777, + "learning_rate": 6.405238748687203e-08, + "loss": 0.0538, + "step": 4536 + }, + { + "epoch": 0.9509536784741145, + "grad_norm": 0.007486341055482626, + "learning_rate": 6.351187035822492e-08, + "loss": 0.054, + "step": 4537 + }, + { + "epoch": 0.951163278138755, + "grad_norm": 0.006339498329907656, + "learning_rate": 6.297362894958025e-08, + "loss": 0.0563, + "step": 4538 + }, + { + "epoch": 0.9513728778033955, + "grad_norm": 0.006274237297475338, + "learning_rate": 6.243766350906733e-08, + "loss": 0.0538, + "step": 4539 + }, + { + "epoch": 0.951582477468036, + "grad_norm": 0.005623075179755688, + "learning_rate": 6.190397428376515e-08, + "loss": 0.0534, + "step": 4540 + }, + { + "epoch": 0.9517920771326766, + "grad_norm": 0.0076430076733231544, + "learning_rate": 6.137256151970583e-08, + "loss": 0.0539, + "step": 4541 + }, + { + "epoch": 0.9520016767973172, + "grad_norm": 0.006511925719678402, + "learning_rate": 6.08434254618695e-08, + "loss": 0.0525, + "step": 4542 + }, + { + "epoch": 0.9522112764619577, + "grad_norm": 0.0067582023330032825, + "learning_rate": 6.031656635418825e-08, + "loss": 0.0536, + "step": 4543 + }, + { + "epoch": 0.9524208761265982, + "grad_norm": 0.006065226625651121, + "learning_rate": 5.979198443954393e-08, + "loss": 0.0538, + "step": 4544 + }, + { + "epoch": 0.9526304757912387, + "grad_norm": 0.005706054624170065, + "learning_rate": 5.926967995976807e-08, + "loss": 0.0517, + "step": 4545 + }, + { + "epoch": 0.9528400754558792, + "grad_norm": 0.005716789048165083, + "learning_rate": 5.8749653155643626e-08, + "loss": 0.0539, + "step": 4546 + }, + { + "epoch": 0.9530496751205199, + "grad_norm": 0.0065559083595871925, + "learning_rate": 5.8231904266902726e-08, + "loss": 0.0522, + "step": 4547 + }, + { + "epoch": 0.9532592747851604, + "grad_norm": 0.005127377342432737, + "learning_rate": 5.771643353222778e-08, + "loss": 0.0544, + "step": 4548 + }, + { + "epoch": 0.9534688744498009, + "grad_norm": 0.005965971853584051, + "learning_rate": 5.720324118925036e-08, + "loss": 0.0529, + "step": 4549 + }, + { + "epoch": 0.9536784741144414, + "grad_norm": 0.0051533933728933334, + "learning_rate": 5.669232747455178e-08, + "loss": 0.0544, + "step": 4550 + }, + { + "epoch": 0.9538880737790819, + "grad_norm": 0.007161029148846865, + "learning_rate": 5.618369262366363e-08, + "loss": 0.0548, + "step": 4551 + }, + { + "epoch": 0.9540976734437225, + "grad_norm": 0.006360564846545458, + "learning_rate": 5.567733687106558e-08, + "loss": 0.0558, + "step": 4552 + }, + { + "epoch": 0.954307273108363, + "grad_norm": 0.006422116421163082, + "learning_rate": 5.517326045018867e-08, + "loss": 0.0535, + "step": 4553 + }, + { + "epoch": 0.9545168727730036, + "grad_norm": 0.00639104750007391, + "learning_rate": 5.4671463593412025e-08, + "loss": 0.0536, + "step": 4554 + }, + { + "epoch": 0.9547264724376441, + "grad_norm": 0.005469374358654022, + "learning_rate": 5.417194653206337e-08, + "loss": 0.0555, + "step": 4555 + }, + { + "epoch": 0.9549360721022846, + "grad_norm": 0.006682547274976969, + "learning_rate": 5.367470949641906e-08, + "loss": 0.0536, + "step": 4556 + }, + { + "epoch": 0.9551456717669252, + "grad_norm": 0.008950907737016678, + "learning_rate": 5.317975271570686e-08, + "loss": 0.0548, + "step": 4557 + }, + { + "epoch": 0.9553552714315657, + "grad_norm": 0.00793489534407854, + "learning_rate": 5.268707641810144e-08, + "loss": 0.0519, + "step": 4558 + }, + { + "epoch": 0.9555648710962062, + "grad_norm": 0.0063106887973845005, + "learning_rate": 5.2196680830725596e-08, + "loss": 0.0545, + "step": 4559 + }, + { + "epoch": 0.9557744707608468, + "grad_norm": 0.006787710823118687, + "learning_rate": 5.1708566179652363e-08, + "loss": 0.0508, + "step": 4560 + }, + { + "epoch": 0.9559840704254873, + "grad_norm": 0.008157514967024326, + "learning_rate": 5.122273268990285e-08, + "loss": 0.0546, + "step": 4561 + }, + { + "epoch": 0.9561936700901279, + "grad_norm": 0.00778998015448451, + "learning_rate": 5.073918058544458e-08, + "loss": 0.0536, + "step": 4562 + }, + { + "epoch": 0.9564032697547684, + "grad_norm": 0.009132279083132744, + "learning_rate": 5.025791008919645e-08, + "loss": 0.0527, + "step": 4563 + }, + { + "epoch": 0.9566128694194089, + "grad_norm": 0.006497305817902088, + "learning_rate": 4.977892142302376e-08, + "loss": 0.0539, + "step": 4564 + }, + { + "epoch": 0.9568224690840494, + "grad_norm": 0.005645697936415672, + "learning_rate": 4.930221480773989e-08, + "loss": 0.0531, + "step": 4565 + }, + { + "epoch": 0.95703206874869, + "grad_norm": 0.007060209289193153, + "learning_rate": 4.882779046310682e-08, + "loss": 0.053, + "step": 4566 + }, + { + "epoch": 0.9572416684133306, + "grad_norm": 0.005475896876305342, + "learning_rate": 4.835564860783404e-08, + "loss": 0.054, + "step": 4567 + }, + { + "epoch": 0.9574512680779711, + "grad_norm": 0.007286901585757732, + "learning_rate": 4.7885789459578e-08, + "loss": 0.0506, + "step": 4568 + }, + { + "epoch": 0.9576608677426116, + "grad_norm": 0.007155861239880323, + "learning_rate": 4.741821323494489e-08, + "loss": 0.0533, + "step": 4569 + }, + { + "epoch": 0.9578704674072521, + "grad_norm": 0.005204516928642988, + "learning_rate": 4.6952920149486715e-08, + "loss": 0.054, + "step": 4570 + }, + { + "epoch": 0.9580800670718926, + "grad_norm": 0.006855521816760302, + "learning_rate": 4.6489910417703564e-08, + "loss": 0.054, + "step": 4571 + }, + { + "epoch": 0.9582896667365333, + "grad_norm": 0.005472875200212002, + "learning_rate": 4.602918425304248e-08, + "loss": 0.0533, + "step": 4572 + }, + { + "epoch": 0.9584992664011738, + "grad_norm": 0.005690296180546284, + "learning_rate": 4.5570741867898563e-08, + "loss": 0.0535, + "step": 4573 + }, + { + "epoch": 0.9587088660658143, + "grad_norm": 0.0066895815543830395, + "learning_rate": 4.511458347361386e-08, + "loss": 0.0539, + "step": 4574 + }, + { + "epoch": 0.9589184657304548, + "grad_norm": 0.008229286409914494, + "learning_rate": 4.4660709280476275e-08, + "loss": 0.0512, + "step": 4575 + }, + { + "epoch": 0.9591280653950953, + "grad_norm": 0.0072631631046533585, + "learning_rate": 4.4209119497722883e-08, + "loss": 0.0546, + "step": 4576 + }, + { + "epoch": 0.959337665059736, + "grad_norm": 0.006190068554133177, + "learning_rate": 4.375981433353604e-08, + "loss": 0.0516, + "step": 4577 + }, + { + "epoch": 0.9595472647243765, + "grad_norm": 0.0051482743583619595, + "learning_rate": 4.331279399504507e-08, + "loss": 0.0529, + "step": 4578 + }, + { + "epoch": 0.959756864389017, + "grad_norm": 0.005935709923505783, + "learning_rate": 4.286805868832622e-08, + "loss": 0.0524, + "step": 4579 + }, + { + "epoch": 0.9599664640536575, + "grad_norm": 0.005591293331235647, + "learning_rate": 4.242560861840273e-08, + "loss": 0.0534, + "step": 4580 + }, + { + "epoch": 0.960176063718298, + "grad_norm": 0.006417642813175917, + "learning_rate": 4.19854439892442e-08, + "loss": 0.0546, + "step": 4581 + }, + { + "epoch": 0.9603856633829386, + "grad_norm": 0.006241221912205219, + "learning_rate": 4.154756500376611e-08, + "loss": 0.0545, + "step": 4582 + }, + { + "epoch": 0.9605952630475791, + "grad_norm": 0.004910702351480722, + "learning_rate": 4.1111971863830866e-08, + "loss": 0.054, + "step": 4583 + }, + { + "epoch": 0.9608048627122197, + "grad_norm": 0.005474326200783253, + "learning_rate": 4.0678664770246177e-08, + "loss": 0.0539, + "step": 4584 + }, + { + "epoch": 0.9610144623768602, + "grad_norm": 0.006352483760565519, + "learning_rate": 4.02476439227667e-08, + "loss": 0.0556, + "step": 4585 + }, + { + "epoch": 0.9612240620415007, + "grad_norm": 0.006457141134887934, + "learning_rate": 3.981890952009293e-08, + "loss": 0.0547, + "step": 4586 + }, + { + "epoch": 0.9614336617061413, + "grad_norm": 0.00615812698379159, + "learning_rate": 3.939246175987232e-08, + "loss": 0.0556, + "step": 4587 + }, + { + "epoch": 0.9616432613707818, + "grad_norm": 0.006070506758987904, + "learning_rate": 3.896830083869596e-08, + "loss": 0.0511, + "step": 4588 + }, + { + "epoch": 0.9618528610354223, + "grad_norm": 0.00562552735209465, + "learning_rate": 3.8546426952102425e-08, + "loss": 0.0568, + "step": 4589 + }, + { + "epoch": 0.9620624607000628, + "grad_norm": 0.006280606612563133, + "learning_rate": 3.812684029457614e-08, + "loss": 0.0551, + "step": 4590 + }, + { + "epoch": 0.9622720603647034, + "grad_norm": 0.006904164794832468, + "learning_rate": 3.770954105954461e-08, + "loss": 0.0524, + "step": 4591 + }, + { + "epoch": 0.962481660029344, + "grad_norm": 0.007641474716365337, + "learning_rate": 3.7294529439384494e-08, + "loss": 0.0559, + "step": 4592 + }, + { + "epoch": 0.9626912596939845, + "grad_norm": 0.005529316142201424, + "learning_rate": 3.6881805625415544e-08, + "loss": 0.0517, + "step": 4593 + }, + { + "epoch": 0.962900859358625, + "grad_norm": 0.004727725870907307, + "learning_rate": 3.647136980790333e-08, + "loss": 0.0551, + "step": 4594 + }, + { + "epoch": 0.9631104590232655, + "grad_norm": 0.00712125189602375, + "learning_rate": 3.606322217605873e-08, + "loss": 0.0546, + "step": 4595 + }, + { + "epoch": 0.963320058687906, + "grad_norm": 0.006122817751020193, + "learning_rate": 3.565736291803734e-08, + "loss": 0.0553, + "step": 4596 + }, + { + "epoch": 0.9635296583525467, + "grad_norm": 0.004824989475309849, + "learning_rate": 3.525379222094061e-08, + "loss": 0.0539, + "step": 4597 + }, + { + "epoch": 0.9637392580171872, + "grad_norm": 0.007551755756139755, + "learning_rate": 3.485251027081415e-08, + "loss": 0.0536, + "step": 4598 + }, + { + "epoch": 0.9639488576818277, + "grad_norm": 0.005360460840165615, + "learning_rate": 3.445351725264945e-08, + "loss": 0.0547, + "step": 4599 + }, + { + "epoch": 0.9641584573464682, + "grad_norm": 0.0053915646858513355, + "learning_rate": 3.405681335038158e-08, + "loss": 0.0536, + "step": 4600 + }, + { + "epoch": 0.9643680570111088, + "grad_norm": 0.0051850867457687855, + "learning_rate": 3.3662398746890924e-08, + "loss": 0.0531, + "step": 4601 + }, + { + "epoch": 0.9645776566757494, + "grad_norm": 0.0051136258989572525, + "learning_rate": 3.327027362400315e-08, + "loss": 0.0537, + "step": 4602 + }, + { + "epoch": 0.9647872563403899, + "grad_norm": 0.008604098111391068, + "learning_rate": 3.288043816248809e-08, + "loss": 0.0521, + "step": 4603 + }, + { + "epoch": 0.9649968560050304, + "grad_norm": 0.00728649040684104, + "learning_rate": 3.249289254205867e-08, + "loss": 0.0521, + "step": 4604 + }, + { + "epoch": 0.9652064556696709, + "grad_norm": 0.005971864331513643, + "learning_rate": 3.21076369413742e-08, + "loss": 0.0518, + "step": 4605 + }, + { + "epoch": 0.9654160553343115, + "grad_norm": 0.004847438540309668, + "learning_rate": 3.172467153803704e-08, + "loss": 0.0536, + "step": 4606 + }, + { + "epoch": 0.965625654998952, + "grad_norm": 0.004828069359064102, + "learning_rate": 3.134399650859432e-08, + "loss": 0.0529, + "step": 4607 + }, + { + "epoch": 0.9658352546635925, + "grad_norm": 0.004983065649867058, + "learning_rate": 3.096561202853676e-08, + "loss": 0.0563, + "step": 4608 + }, + { + "epoch": 0.9660448543282331, + "grad_norm": 0.005727593321353197, + "learning_rate": 3.0589518272300946e-08, + "loss": 0.052, + "step": 4609 + }, + { + "epoch": 0.9662544539928736, + "grad_norm": 0.005493259988725185, + "learning_rate": 3.0215715413264294e-08, + "loss": 0.0526, + "step": 4610 + }, + { + "epoch": 0.9664640536575142, + "grad_norm": 0.005342698656022549, + "learning_rate": 2.984420362375007e-08, + "loss": 0.0517, + "step": 4611 + }, + { + "epoch": 0.9666736533221547, + "grad_norm": 0.0063081239350140095, + "learning_rate": 2.9474983075026276e-08, + "loss": 0.0536, + "step": 4612 + }, + { + "epoch": 0.9668832529867952, + "grad_norm": 0.005974064581096172, + "learning_rate": 2.9108053937302316e-08, + "loss": 0.0562, + "step": 4613 + }, + { + "epoch": 0.9670928526514357, + "grad_norm": 0.005807976704090834, + "learning_rate": 2.8743416379733435e-08, + "loss": 0.0499, + "step": 4614 + }, + { + "epoch": 0.9673024523160763, + "grad_norm": 0.005039132200181484, + "learning_rate": 2.8381070570416835e-08, + "loss": 0.0521, + "step": 4615 + }, + { + "epoch": 0.9675120519807169, + "grad_norm": 0.007256725803017616, + "learning_rate": 2.8021016676393897e-08, + "loss": 0.0539, + "step": 4616 + }, + { + "epoch": 0.9677216516453574, + "grad_norm": 0.006315998733043671, + "learning_rate": 2.7663254863649625e-08, + "loss": 0.0531, + "step": 4617 + }, + { + "epoch": 0.9679312513099979, + "grad_norm": 0.006833828054368496, + "learning_rate": 2.7307785297111533e-08, + "loss": 0.0571, + "step": 4618 + }, + { + "epoch": 0.9681408509746384, + "grad_norm": 0.007867738604545593, + "learning_rate": 2.6954608140651872e-08, + "loss": 0.0566, + "step": 4619 + }, + { + "epoch": 0.9683504506392789, + "grad_norm": 0.005606554448604584, + "learning_rate": 2.6603723557085402e-08, + "loss": 0.0547, + "step": 4620 + }, + { + "epoch": 0.9685600503039196, + "grad_norm": 0.00556945102289319, + "learning_rate": 2.6255131708168845e-08, + "loss": 0.0523, + "step": 4621 + }, + { + "epoch": 0.9687696499685601, + "grad_norm": 0.006726409774273634, + "learning_rate": 2.5908832754603097e-08, + "loss": 0.0541, + "step": 4622 + }, + { + "epoch": 0.9689792496332006, + "grad_norm": 0.0066106924787163734, + "learning_rate": 2.5564826856032677e-08, + "loss": 0.054, + "step": 4623 + }, + { + "epoch": 0.9691888492978411, + "grad_norm": 0.004752601031213999, + "learning_rate": 2.5223114171043507e-08, + "loss": 0.0539, + "step": 4624 + }, + { + "epoch": 0.9693984489624816, + "grad_norm": 0.006882491055876017, + "learning_rate": 2.488369485716513e-08, + "loss": 0.0553, + "step": 4625 + }, + { + "epoch": 0.9696080486271222, + "grad_norm": 0.008670981973409653, + "learning_rate": 2.4546569070870717e-08, + "loss": 0.0556, + "step": 4626 + }, + { + "epoch": 0.9698176482917628, + "grad_norm": 0.006038348656147718, + "learning_rate": 2.4211736967574283e-08, + "loss": 0.0529, + "step": 4627 + }, + { + "epoch": 0.9700272479564033, + "grad_norm": 0.00571649894118309, + "learning_rate": 2.387919870163291e-08, + "loss": 0.054, + "step": 4628 + }, + { + "epoch": 0.9702368476210438, + "grad_norm": 0.004810945596545935, + "learning_rate": 2.354895442634786e-08, + "loss": 0.054, + "step": 4629 + }, + { + "epoch": 0.9704464472856843, + "grad_norm": 0.00811366643756628, + "learning_rate": 2.3221004293961237e-08, + "loss": 0.0542, + "step": 4630 + }, + { + "epoch": 0.9706560469503249, + "grad_norm": 0.005908562336117029, + "learning_rate": 2.289534845565766e-08, + "loss": 0.0528, + "step": 4631 + }, + { + "epoch": 0.9708656466149654, + "grad_norm": 0.00523727061226964, + "learning_rate": 2.2571987061564827e-08, + "loss": 0.0542, + "step": 4632 + }, + { + "epoch": 0.971075246279606, + "grad_norm": 0.006445094011723995, + "learning_rate": 2.225092026075182e-08, + "loss": 0.0511, + "step": 4633 + }, + { + "epoch": 0.9712848459442465, + "grad_norm": 0.006017652805894613, + "learning_rate": 2.193214820123024e-08, + "loss": 0.0585, + "step": 4634 + }, + { + "epoch": 0.971494445608887, + "grad_norm": 0.00453847274184227, + "learning_rate": 2.1615671029954765e-08, + "loss": 0.0548, + "step": 4635 + }, + { + "epoch": 0.9717040452735276, + "grad_norm": 0.0046917712315917015, + "learning_rate": 2.1301488892820908e-08, + "loss": 0.0533, + "step": 4636 + }, + { + "epoch": 0.9719136449381681, + "grad_norm": 0.004841329529881477, + "learning_rate": 2.098960193466615e-08, + "loss": 0.0536, + "step": 4637 + }, + { + "epoch": 0.9721232446028086, + "grad_norm": 0.0073344954289495945, + "learning_rate": 2.0680010299271024e-08, + "loss": 0.0557, + "step": 4638 + }, + { + "epoch": 0.9723328442674491, + "grad_norm": 0.006531344726681709, + "learning_rate": 2.0372714129356375e-08, + "loss": 0.0546, + "step": 4639 + }, + { + "epoch": 0.9725424439320897, + "grad_norm": 0.00542111974209547, + "learning_rate": 2.0067713566586654e-08, + "loss": 0.0539, + "step": 4640 + }, + { + "epoch": 0.9727520435967303, + "grad_norm": 0.004822442773729563, + "learning_rate": 1.9765008751566618e-08, + "loss": 0.0562, + "step": 4641 + }, + { + "epoch": 0.9729616432613708, + "grad_norm": 0.004666461609303951, + "learning_rate": 1.9464599823842966e-08, + "loss": 0.0558, + "step": 4642 + }, + { + "epoch": 0.9731712429260113, + "grad_norm": 0.005237584933638573, + "learning_rate": 1.9166486921903814e-08, + "loss": 0.0542, + "step": 4643 + }, + { + "epoch": 0.9733808425906518, + "grad_norm": 0.005628027021884918, + "learning_rate": 1.8870670183179783e-08, + "loss": 0.052, + "step": 4644 + }, + { + "epoch": 0.9735904422552923, + "grad_norm": 0.005040889140218496, + "learning_rate": 1.8577149744042343e-08, + "loss": 0.0522, + "step": 4645 + }, + { + "epoch": 0.973800041919933, + "grad_norm": 0.004789900500327349, + "learning_rate": 1.8285925739803812e-08, + "loss": 0.0521, + "step": 4646 + }, + { + "epoch": 0.9740096415845735, + "grad_norm": 0.00599799444898963, + "learning_rate": 1.7996998304719016e-08, + "loss": 0.0553, + "step": 4647 + }, + { + "epoch": 0.974219241249214, + "grad_norm": 0.004553031176328659, + "learning_rate": 1.7710367571983077e-08, + "loss": 0.0505, + "step": 4648 + }, + { + "epoch": 0.9744288409138545, + "grad_norm": 0.00728783430531621, + "learning_rate": 1.7426033673733077e-08, + "loss": 0.0537, + "step": 4649 + }, + { + "epoch": 0.974638440578495, + "grad_norm": 0.004845589864999056, + "learning_rate": 1.7143996741045832e-08, + "loss": 0.0554, + "step": 4650 + }, + { + "epoch": 0.9748480402431356, + "grad_norm": 0.005394340958446264, + "learning_rate": 1.686425690394178e-08, + "loss": 0.0516, + "step": 4651 + }, + { + "epoch": 0.9750576399077762, + "grad_norm": 0.006395083852112293, + "learning_rate": 1.6586814291379428e-08, + "loss": 0.0553, + "step": 4652 + }, + { + "epoch": 0.9752672395724167, + "grad_norm": 0.005952565465122461, + "learning_rate": 1.631166903126147e-08, + "loss": 0.0554, + "step": 4653 + }, + { + "epoch": 0.9754768392370572, + "grad_norm": 0.004673383664339781, + "learning_rate": 1.603882125042866e-08, + "loss": 0.0555, + "step": 4654 + }, + { + "epoch": 0.9756864389016977, + "grad_norm": 0.005786755122244358, + "learning_rate": 1.576827107466372e-08, + "loss": 0.0529, + "step": 4655 + }, + { + "epoch": 0.9758960385663383, + "grad_norm": 0.006783796474337578, + "learning_rate": 1.5500018628690216e-08, + "loss": 0.0526, + "step": 4656 + }, + { + "epoch": 0.9761056382309788, + "grad_norm": 0.006686339620500803, + "learning_rate": 1.5234064036173114e-08, + "loss": 0.0541, + "step": 4657 + }, + { + "epoch": 0.9763152378956194, + "grad_norm": 0.006544138304889202, + "learning_rate": 1.4970407419717116e-08, + "loss": 0.0517, + "step": 4658 + }, + { + "epoch": 0.9765248375602599, + "grad_norm": 0.004879649728536606, + "learning_rate": 1.4709048900867772e-08, + "loss": 0.0521, + "step": 4659 + }, + { + "epoch": 0.9767344372249004, + "grad_norm": 0.006231387611478567, + "learning_rate": 1.4449988600111486e-08, + "loss": 0.0551, + "step": 4660 + }, + { + "epoch": 0.976944036889541, + "grad_norm": 0.005122179165482521, + "learning_rate": 1.4193226636874391e-08, + "loss": 0.0516, + "step": 4661 + }, + { + "epoch": 0.9771536365541815, + "grad_norm": 0.004648258443921804, + "learning_rate": 1.393876312952458e-08, + "loss": 0.0523, + "step": 4662 + }, + { + "epoch": 0.977363236218822, + "grad_norm": 0.005532658658921719, + "learning_rate": 1.3686598195369327e-08, + "loss": 0.0589, + "step": 4663 + }, + { + "epoch": 0.9775728358834626, + "grad_norm": 0.00559549406170845, + "learning_rate": 1.34367319506562e-08, + "loss": 0.0499, + "step": 4664 + }, + { + "epoch": 0.9777824355481031, + "grad_norm": 0.004705341067165136, + "learning_rate": 1.318916451057417e-08, + "loss": 0.0535, + "step": 4665 + }, + { + "epoch": 0.9779920352127437, + "grad_norm": 0.005659153684973717, + "learning_rate": 1.2943895989251387e-08, + "loss": 0.0543, + "step": 4666 + }, + { + "epoch": 0.9782016348773842, + "grad_norm": 0.004608216229826212, + "learning_rate": 1.2700926499756295e-08, + "loss": 0.058, + "step": 4667 + }, + { + "epoch": 0.9784112345420247, + "grad_norm": 0.005427349358797073, + "learning_rate": 1.2460256154098738e-08, + "loss": 0.0566, + "step": 4668 + }, + { + "epoch": 0.9786208342066652, + "grad_norm": 0.0053130644373595715, + "learning_rate": 1.2221885063226635e-08, + "loss": 0.0516, + "step": 4669 + }, + { + "epoch": 0.9788304338713059, + "grad_norm": 0.004453903995454311, + "learning_rate": 1.198581333702986e-08, + "loss": 0.053, + "step": 4670 + }, + { + "epoch": 0.9790400335359464, + "grad_norm": 0.0048965164460241795, + "learning_rate": 1.1752041084336364e-08, + "loss": 0.0536, + "step": 4671 + }, + { + "epoch": 0.9792496332005869, + "grad_norm": 0.008032665587961674, + "learning_rate": 1.15205684129166e-08, + "loss": 0.0548, + "step": 4672 + }, + { + "epoch": 0.9794592328652274, + "grad_norm": 0.004445253871381283, + "learning_rate": 1.1291395429477991e-08, + "loss": 0.052, + "step": 4673 + }, + { + "epoch": 0.9796688325298679, + "grad_norm": 0.004714785609394312, + "learning_rate": 1.1064522239669916e-08, + "loss": 0.0553, + "step": 4674 + }, + { + "epoch": 0.9798784321945085, + "grad_norm": 0.004439793061465025, + "learning_rate": 1.0839948948080937e-08, + "loss": 0.0509, + "step": 4675 + }, + { + "epoch": 0.9800880318591491, + "grad_norm": 0.004678604193031788, + "learning_rate": 1.0617675658239345e-08, + "loss": 0.0568, + "step": 4676 + }, + { + "epoch": 0.9802976315237896, + "grad_norm": 0.007188865914940834, + "learning_rate": 1.0397702472612625e-08, + "loss": 0.053, + "step": 4677 + }, + { + "epoch": 0.9805072311884301, + "grad_norm": 0.006724335718899965, + "learning_rate": 1.0180029492608546e-08, + "loss": 0.0556, + "step": 4678 + }, + { + "epoch": 0.9807168308530706, + "grad_norm": 0.006979641038924456, + "learning_rate": 9.964656818574614e-09, + "loss": 0.0528, + "step": 4679 + }, + { + "epoch": 0.9809264305177112, + "grad_norm": 0.0065865106880664825, + "learning_rate": 9.751584549796966e-09, + "loss": 0.0538, + "step": 4680 + }, + { + "epoch": 0.9811360301823517, + "grad_norm": 0.005535759497433901, + "learning_rate": 9.54081278450314e-09, + "loss": 0.0547, + "step": 4681 + }, + { + "epoch": 0.9813456298469923, + "grad_norm": 0.006264574825763702, + "learning_rate": 9.332341619857078e-09, + "loss": 0.0541, + "step": 4682 + }, + { + "epoch": 0.9815552295116328, + "grad_norm": 0.006483092904090881, + "learning_rate": 9.126171151965235e-09, + "loss": 0.0536, + "step": 4683 + }, + { + "epoch": 0.9817648291762733, + "grad_norm": 0.005037569906562567, + "learning_rate": 8.922301475872141e-09, + "loss": 0.0548, + "step": 4684 + }, + { + "epoch": 0.9819744288409139, + "grad_norm": 0.005114252213388681, + "learning_rate": 8.72073268556095e-09, + "loss": 0.055, + "step": 4685 + }, + { + "epoch": 0.9821840285055544, + "grad_norm": 0.006643661763519049, + "learning_rate": 8.52146487395511e-09, + "loss": 0.0564, + "step": 4686 + }, + { + "epoch": 0.9823936281701949, + "grad_norm": 0.007684056647121906, + "learning_rate": 8.324498132917248e-09, + "loss": 0.055, + "step": 4687 + }, + { + "epoch": 0.9826032278348354, + "grad_norm": 0.005348839331418276, + "learning_rate": 8.129832553249173e-09, + "loss": 0.0527, + "step": 4688 + }, + { + "epoch": 0.982812827499476, + "grad_norm": 0.006276331376284361, + "learning_rate": 7.937468224691325e-09, + "loss": 0.0553, + "step": 4689 + }, + { + "epoch": 0.9830224271641166, + "grad_norm": 0.005827800370752811, + "learning_rate": 7.747405235923322e-09, + "loss": 0.0534, + "step": 4690 + }, + { + "epoch": 0.9832320268287571, + "grad_norm": 0.005027239676564932, + "learning_rate": 7.55964367456452e-09, + "loss": 0.0505, + "step": 4691 + }, + { + "epoch": 0.9834416264933976, + "grad_norm": 0.004755694884806871, + "learning_rate": 7.374183627173459e-09, + "loss": 0.0544, + "step": 4692 + }, + { + "epoch": 0.9836512261580381, + "grad_norm": 0.004315654281526804, + "learning_rate": 7.191025179246192e-09, + "loss": 0.0505, + "step": 4693 + }, + { + "epoch": 0.9838608258226786, + "grad_norm": 0.006336344871670008, + "learning_rate": 7.010168415219621e-09, + "loss": 0.0546, + "step": 4694 + }, + { + "epoch": 0.9840704254873193, + "grad_norm": 0.004240935202687979, + "learning_rate": 6.831613418468163e-09, + "loss": 0.0529, + "step": 4695 + }, + { + "epoch": 0.9842800251519598, + "grad_norm": 0.00471118651330471, + "learning_rate": 6.655360271305422e-09, + "loss": 0.0562, + "step": 4696 + }, + { + "epoch": 0.9844896248166003, + "grad_norm": 0.005362183786928654, + "learning_rate": 6.4814090549847334e-09, + "loss": 0.0535, + "step": 4697 + }, + { + "epoch": 0.9846992244812408, + "grad_norm": 0.003966295626014471, + "learning_rate": 6.30975984969695e-09, + "loss": 0.0536, + "step": 4698 + }, + { + "epoch": 0.9849088241458813, + "grad_norm": 0.005599298048764467, + "learning_rate": 6.140412734572665e-09, + "loss": 0.0529, + "step": 4699 + }, + { + "epoch": 0.985118423810522, + "grad_norm": 0.00638514244928956, + "learning_rate": 5.973367787681095e-09, + "loss": 0.0514, + "step": 4700 + }, + { + "epoch": 0.9853280234751625, + "grad_norm": 0.004646534100174904, + "learning_rate": 5.808625086029529e-09, + "loss": 0.0538, + "step": 4701 + }, + { + "epoch": 0.985537623139803, + "grad_norm": 0.0049830214120447636, + "learning_rate": 5.646184705563884e-09, + "loss": 0.0535, + "step": 4702 + }, + { + "epoch": 0.9857472228044435, + "grad_norm": 0.005619076080620289, + "learning_rate": 5.486046721170368e-09, + "loss": 0.0554, + "step": 4703 + }, + { + "epoch": 0.985956822469084, + "grad_norm": 0.004761868622153997, + "learning_rate": 5.328211206671596e-09, + "loss": 0.0546, + "step": 4704 + }, + { + "epoch": 0.9861664221337246, + "grad_norm": 0.004782952833920717, + "learning_rate": 5.17267823482992e-09, + "loss": 0.0549, + "step": 4705 + }, + { + "epoch": 0.9863760217983651, + "grad_norm": 0.005984222050756216, + "learning_rate": 5.019447877346317e-09, + "loss": 0.055, + "step": 4706 + }, + { + "epoch": 0.9865856214630057, + "grad_norm": 0.004718398675322533, + "learning_rate": 4.868520204859284e-09, + "loss": 0.0522, + "step": 4707 + }, + { + "epoch": 0.9867952211276462, + "grad_norm": 0.005698632914572954, + "learning_rate": 4.719895286947052e-09, + "loss": 0.0547, + "step": 4708 + }, + { + "epoch": 0.9870048207922867, + "grad_norm": 0.005634862929582596, + "learning_rate": 4.573573192125369e-09, + "loss": 0.0523, + "step": 4709 + }, + { + "epoch": 0.9872144204569273, + "grad_norm": 0.0050827860832214355, + "learning_rate": 4.429553987849167e-09, + "loss": 0.0547, + "step": 4710 + }, + { + "epoch": 0.9874240201215678, + "grad_norm": 0.005555478390306234, + "learning_rate": 4.287837740510336e-09, + "loss": 0.0524, + "step": 4711 + }, + { + "epoch": 0.9876336197862083, + "grad_norm": 0.0045521133579313755, + "learning_rate": 4.148424515441063e-09, + "loss": 0.0535, + "step": 4712 + }, + { + "epoch": 0.9878432194508489, + "grad_norm": 0.005545974709093571, + "learning_rate": 4.011314376909936e-09, + "loss": 0.0513, + "step": 4713 + }, + { + "epoch": 0.9880528191154894, + "grad_norm": 0.005865436978638172, + "learning_rate": 3.876507388125839e-09, + "loss": 0.0522, + "step": 4714 + }, + { + "epoch": 0.98826241878013, + "grad_norm": 0.004759861622005701, + "learning_rate": 3.744003611233505e-09, + "loss": 0.0529, + "step": 4715 + }, + { + "epoch": 0.9884720184447705, + "grad_norm": 0.004680339712649584, + "learning_rate": 3.613803107317959e-09, + "loss": 0.0536, + "step": 4716 + }, + { + "epoch": 0.988681618109411, + "grad_norm": 0.005101062823086977, + "learning_rate": 3.4859059364006354e-09, + "loss": 0.0506, + "step": 4717 + }, + { + "epoch": 0.9888912177740515, + "grad_norm": 0.005664270371198654, + "learning_rate": 3.3603121574438126e-09, + "loss": 0.0565, + "step": 4718 + }, + { + "epoch": 0.989100817438692, + "grad_norm": 0.005366276018321514, + "learning_rate": 3.237021828344511e-09, + "loss": 0.0542, + "step": 4719 + }, + { + "epoch": 0.9893104171033327, + "grad_norm": 0.004395569209009409, + "learning_rate": 3.1160350059405986e-09, + "loss": 0.0537, + "step": 4720 + }, + { + "epoch": 0.9895200167679732, + "grad_norm": 0.00447695842012763, + "learning_rate": 2.9973517460063496e-09, + "loss": 0.0532, + "step": 4721 + }, + { + "epoch": 0.9897296164326137, + "grad_norm": 0.00491610262542963, + "learning_rate": 2.8809721032552197e-09, + "loss": 0.054, + "step": 4722 + }, + { + "epoch": 0.9899392160972542, + "grad_norm": 0.0043014525435864925, + "learning_rate": 2.7668961313376263e-09, + "loss": 0.0561, + "step": 4723 + }, + { + "epoch": 0.9901488157618947, + "grad_norm": 0.007249589078128338, + "learning_rate": 2.6551238828431692e-09, + "loss": 0.0538, + "step": 4724 + }, + { + "epoch": 0.9903584154265354, + "grad_norm": 0.005701399873942137, + "learning_rate": 2.5456554092984087e-09, + "loss": 0.0568, + "step": 4725 + }, + { + "epoch": 0.9905680150911759, + "grad_norm": 0.00624644011259079, + "learning_rate": 2.438490761168533e-09, + "loss": 0.0554, + "step": 4726 + }, + { + "epoch": 0.9907776147558164, + "grad_norm": 0.005006265826523304, + "learning_rate": 2.3336299878562453e-09, + "loss": 0.0531, + "step": 4727 + }, + { + "epoch": 0.9909872144204569, + "grad_norm": 0.005804962012916803, + "learning_rate": 2.231073137702877e-09, + "loss": 0.0542, + "step": 4728 + }, + { + "epoch": 0.9911968140850974, + "grad_norm": 0.005389668978750706, + "learning_rate": 2.1308202579861657e-09, + "loss": 0.0529, + "step": 4729 + }, + { + "epoch": 0.991406413749738, + "grad_norm": 0.005474920384585857, + "learning_rate": 2.0328713949230304e-09, + "loss": 0.0545, + "step": 4730 + }, + { + "epoch": 0.9916160134143785, + "grad_norm": 0.0048176608979702, + "learning_rate": 1.937226593668462e-09, + "loss": 0.0534, + "step": 4731 + }, + { + "epoch": 0.9918256130790191, + "grad_norm": 0.004726291634142399, + "learning_rate": 1.8438858983138575e-09, + "loss": 0.0535, + "step": 4732 + }, + { + "epoch": 0.9920352127436596, + "grad_norm": 0.0048382277600467205, + "learning_rate": 1.752849351889796e-09, + "loss": 0.0548, + "step": 4733 + }, + { + "epoch": 0.9922448124083001, + "grad_norm": 0.006312186364084482, + "learning_rate": 1.6641169963638182e-09, + "loss": 0.0554, + "step": 4734 + }, + { + "epoch": 0.9924544120729407, + "grad_norm": 0.005486776586622, + "learning_rate": 1.5776888726420913e-09, + "loss": 0.0535, + "step": 4735 + }, + { + "epoch": 0.9926640117375812, + "grad_norm": 0.005428542383015156, + "learning_rate": 1.4935650205671893e-09, + "loss": 0.0557, + "step": 4736 + }, + { + "epoch": 0.9928736114022217, + "grad_norm": 0.0053994497284293175, + "learning_rate": 1.4117454789208673e-09, + "loss": 0.0514, + "step": 4737 + }, + { + "epoch": 0.9930832110668623, + "grad_norm": 0.005145637784153223, + "learning_rate": 1.3322302854212876e-09, + "loss": 0.0544, + "step": 4738 + }, + { + "epoch": 0.9932928107315029, + "grad_norm": 0.005606017541140318, + "learning_rate": 1.2550194767252387e-09, + "loss": 0.054, + "step": 4739 + }, + { + "epoch": 0.9935024103961434, + "grad_norm": 0.004900652449578047, + "learning_rate": 1.1801130884270262e-09, + "loss": 0.0527, + "step": 4740 + }, + { + "epoch": 0.9937120100607839, + "grad_norm": 0.0050704628229141235, + "learning_rate": 1.1075111550579166e-09, + "loss": 0.0548, + "step": 4741 + }, + { + "epoch": 0.9939216097254244, + "grad_norm": 0.004520857241004705, + "learning_rate": 1.0372137100883584e-09, + "loss": 0.0558, + "step": 4742 + }, + { + "epoch": 0.9941312093900649, + "grad_norm": 0.004727307241410017, + "learning_rate": 9.692207859246516e-10, + "loss": 0.0543, + "step": 4743 + }, + { + "epoch": 0.9943408090547056, + "grad_norm": 0.0052885450422763824, + "learning_rate": 9.03532413911723e-10, + "loss": 0.0537, + "step": 4744 + }, + { + "epoch": 0.9945504087193461, + "grad_norm": 0.00510826800018549, + "learning_rate": 8.401486243320156e-10, + "loss": 0.0523, + "step": 4745 + }, + { + "epoch": 0.9947600083839866, + "grad_norm": 0.005960457026958466, + "learning_rate": 7.790694464054893e-10, + "loss": 0.0528, + "step": 4746 + }, + { + "epoch": 0.9949696080486271, + "grad_norm": 0.006908085662871599, + "learning_rate": 7.202949082890654e-10, + "loss": 0.054, + "step": 4747 + }, + { + "epoch": 0.9951792077132676, + "grad_norm": 0.004360835067927837, + "learning_rate": 6.638250370788468e-10, + "loss": 0.0512, + "step": 4748 + }, + { + "epoch": 0.9953888073779082, + "grad_norm": 0.00422016391530633, + "learning_rate": 6.09659858806233e-10, + "loss": 0.0539, + "step": 4749 + }, + { + "epoch": 0.9955984070425488, + "grad_norm": 0.0072461930103600025, + "learning_rate": 5.577993984423602e-10, + "loss": 0.0521, + "step": 4750 + }, + { + "epoch": 0.9958080067071893, + "grad_norm": 0.0055481684394180775, + "learning_rate": 5.08243679894771e-10, + "loss": 0.054, + "step": 4751 + }, + { + "epoch": 0.9960176063718298, + "grad_norm": 0.004346661269664764, + "learning_rate": 4.609927260079694e-10, + "loss": 0.0544, + "step": 4752 + }, + { + "epoch": 0.9962272060364703, + "grad_norm": 0.004471504595130682, + "learning_rate": 4.1604655856508633e-10, + "loss": 0.0528, + "step": 4753 + }, + { + "epoch": 0.9964368057011109, + "grad_norm": 0.004563014954328537, + "learning_rate": 3.7340519828621415e-10, + "loss": 0.0531, + "step": 4754 + }, + { + "epoch": 0.9966464053657514, + "grad_norm": 0.005326093640178442, + "learning_rate": 3.330686648289616e-10, + "loss": 0.0527, + "step": 4755 + }, + { + "epoch": 0.996856005030392, + "grad_norm": 0.005793221294879913, + "learning_rate": 2.950369767884542e-10, + "loss": 0.0533, + "step": 4756 + }, + { + "epoch": 0.9970656046950325, + "grad_norm": 0.005273030139505863, + "learning_rate": 2.59310151697334e-10, + "loss": 0.0521, + "step": 4757 + }, + { + "epoch": 0.997275204359673, + "grad_norm": 0.004422128200531006, + "learning_rate": 2.2588820602631457e-10, + "loss": 0.0517, + "step": 4758 + }, + { + "epoch": 0.9974848040243136, + "grad_norm": 0.006338967010378838, + "learning_rate": 1.9477115518140577e-10, + "loss": 0.0555, + "step": 4759 + }, + { + "epoch": 0.9976944036889541, + "grad_norm": 0.004579038359224796, + "learning_rate": 1.6595901350890954e-10, + "loss": 0.0539, + "step": 4760 + }, + { + "epoch": 0.9979040033535946, + "grad_norm": 0.005021790973842144, + "learning_rate": 1.39451794290979e-10, + "loss": 0.0528, + "step": 4761 + }, + { + "epoch": 0.9981136030182352, + "grad_norm": 0.004821773152798414, + "learning_rate": 1.1524950974672878e-10, + "loss": 0.0525, + "step": 4762 + }, + { + "epoch": 0.9983232026828757, + "grad_norm": 0.004908620845526457, + "learning_rate": 9.335217103445538e-11, + "loss": 0.0524, + "step": 4763 + }, + { + "epoch": 0.9985328023475163, + "grad_norm": 0.007371780462563038, + "learning_rate": 7.375978824775143e-11, + "loss": 0.0526, + "step": 4764 + }, + { + "epoch": 0.9987424020121568, + "grad_norm": 0.005421594250947237, + "learning_rate": 5.6472370419391464e-11, + "loss": 0.0543, + "step": 4765 + }, + { + "epoch": 0.9989520016767973, + "grad_norm": 0.0057752360589802265, + "learning_rate": 4.148992551855635e-11, + "loss": 0.053, + "step": 4766 + }, + { + "epoch": 0.9991616013414378, + "grad_norm": 0.007910934276878834, + "learning_rate": 2.8812460452498637e-11, + "loss": 0.0517, + "step": 4767 + }, + { + "epoch": 0.9993712010060783, + "grad_norm": 0.005225565284490585, + "learning_rate": 1.843998106543232e-11, + "loss": 0.0547, + "step": 4768 + }, + { + "epoch": 0.999580800670719, + "grad_norm": 0.005672250874340534, + "learning_rate": 1.0372492138532864e-11, + "loss": 0.0564, + "step": 4769 + }, + { + "epoch": 0.9997904003353595, + "grad_norm": 0.005982316564768553, + "learning_rate": 4.609997391602505e-12, + "loss": 0.0538, + "step": 4770 + }, + { + "epoch": 1.0, + "grad_norm": 0.005909937433898449, + "learning_rate": 1.1524994808498335e-12, + "loss": 0.0547, + "step": 4771 + }, + { + "epoch": 1.0, + "step": 4771, + "total_flos": 0.0, + "train_loss": 0.06302105758305591, + "train_runtime": 301007.0841, + "train_samples_per_second": 3.043, + "train_steps_per_second": 0.016 + } + ], + "logging_steps": 1.0, + "max_steps": 4771, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}