diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,19742 +10,19742 @@ "log_history": [ { "epoch": 0.0035714285714285713, - "grad_norm": 476.70025634765625, + "grad_norm": 48544.6953125, "learning_rate": 3.571428571428572e-08, - "loss": 26.9688, + "loss": 2525.0, "step": 1 }, { "epoch": 0.007142857142857143, - "grad_norm": 482.3134765625, + "grad_norm": 55155.546875, "learning_rate": 7.142857142857144e-08, - "loss": 30.4844, + "loss": 2636.0, "step": 2 }, { "epoch": 0.010714285714285714, - "grad_norm": 414.4317321777344, + "grad_norm": 54588.11328125, "learning_rate": 1.0714285714285716e-07, - "loss": 29.6406, + "loss": 2860.0, "step": 3 }, { "epoch": 0.014285714285714285, - "grad_norm": 474.867919921875, + "grad_norm": 55787.01171875, "learning_rate": 1.4285714285714287e-07, - "loss": 26.9062, + "loss": 2540.0, "step": 4 }, { "epoch": 0.017857142857142856, - "grad_norm": 458.3918151855469, + "grad_norm": 50037.16796875, "learning_rate": 1.7857142857142858e-07, - "loss": 26.625, + "loss": 2776.0, "step": 5 }, { "epoch": 0.02142857142857143, - "grad_norm": 491.6147766113281, + "grad_norm": 55643.56640625, "learning_rate": 2.142857142857143e-07, - "loss": 28.4844, + "loss": 2620.0, "step": 6 }, { "epoch": 0.025, - "grad_norm": 464.2335205078125, + "grad_norm": 56512.84375, "learning_rate": 2.5000000000000004e-07, - "loss": 25.9688, + "loss": 2538.0, "step": 7 }, { "epoch": 0.02857142857142857, - "grad_norm": 400.991455078125, + "grad_norm": 61987.41015625, "learning_rate": 2.8571428571428575e-07, - "loss": 22.9219, + "loss": 2732.0, "step": 8 }, { "epoch": 0.03214285714285714, - "grad_norm": 467.1754150390625, + "grad_norm": 58430.15234375, "learning_rate": 3.214285714285714e-07, - "loss": 25.5938, + "loss": 2618.0, "step": 9 }, { "epoch": 0.03571428571428571, - "grad_norm": 477.78631591796875, + "grad_norm": 56901.875, "learning_rate": 3.5714285714285716e-07, - "loss": 29.0156, + "loss": 2552.0, "step": 10 }, { "epoch": 0.039285714285714285, - "grad_norm": 467.178466796875, + "grad_norm": 55610.13671875, "learning_rate": 3.9285714285714286e-07, - "loss": 29.3906, + "loss": 2820.0, "step": 11 }, { "epoch": 0.04285714285714286, - "grad_norm": 467.70025634765625, + "grad_norm": 54006.9140625, "learning_rate": 4.285714285714286e-07, - "loss": 28.8594, + "loss": 2769.0, "step": 12 }, { "epoch": 0.04642857142857143, - "grad_norm": 465.14013671875, + "grad_norm": 62984.54296875, "learning_rate": 4.642857142857143e-07, - "loss": 34.3906, + "loss": 2829.0, "step": 13 }, { "epoch": 0.05, - "grad_norm": 429.67596435546875, + "grad_norm": 54368.875, "learning_rate": 5.000000000000001e-07, - "loss": 24.8906, + "loss": 2940.0, "step": 14 }, { "epoch": 0.05357142857142857, - "grad_norm": 443.1952209472656, + "grad_norm": 61938.92578125, "learning_rate": 5.357142857142857e-07, - "loss": 26.6875, + "loss": 2810.0, "step": 15 }, { "epoch": 0.05714285714285714, - "grad_norm": 494.4329528808594, + "grad_norm": 53331.5625, "learning_rate": 5.714285714285715e-07, - "loss": 30.2031, + "loss": 2394.0, "step": 16 }, { "epoch": 0.060714285714285714, - "grad_norm": 425.93463134765625, + "grad_norm": 59003.05078125, "learning_rate": 6.071428571428572e-07, - "loss": 25.6719, + "loss": 3010.0, "step": 17 }, { "epoch": 0.06428571428571428, - "grad_norm": 501.7166442871094, + "grad_norm": 54923.78515625, "learning_rate": 6.428571428571428e-07, - "loss": 30.4844, + "loss": 2698.0, "step": 18 }, { "epoch": 0.06785714285714285, - "grad_norm": 461.9896240234375, + "grad_norm": 57235.73828125, "learning_rate": 6.785714285714286e-07, - "loss": 25.2031, + "loss": 2504.0, "step": 19 }, { "epoch": 0.07142857142857142, - "grad_norm": 491.8294372558594, + "grad_norm": 60191.07421875, "learning_rate": 7.142857142857143e-07, - "loss": 30.1406, + "loss": 2696.0, "step": 20 }, { "epoch": 0.075, - "grad_norm": 450.2153625488281, + "grad_norm": 56713.77734375, "learning_rate": 7.5e-07, - "loss": 26.3594, + "loss": 2734.0, "step": 21 }, { "epoch": 0.07857142857142857, - "grad_norm": 447.2486572265625, + "grad_norm": 61983.03515625, "learning_rate": 7.857142857142857e-07, - "loss": 25.4844, + "loss": 2682.0, "step": 22 }, { "epoch": 0.08214285714285714, - "grad_norm": 451.3460693359375, + "grad_norm": 57473.671875, "learning_rate": 8.214285714285715e-07, - "loss": 24.9219, + "loss": 2676.0, "step": 23 }, { "epoch": 0.08571428571428572, - "grad_norm": 437.9570617675781, + "grad_norm": 55743.8125, "learning_rate": 8.571428571428572e-07, - "loss": 25.0312, + "loss": 2652.0, "step": 24 }, { "epoch": 0.08928571428571429, - "grad_norm": 474.4172058105469, + "grad_norm": 56975.93359375, "learning_rate": 8.928571428571429e-07, - "loss": 31.7344, + "loss": 2614.0, "step": 25 }, { "epoch": 0.09285714285714286, - "grad_norm": 454.866455078125, + "grad_norm": 59308.09765625, "learning_rate": 9.285714285714287e-07, - "loss": 29.4531, + "loss": 2786.0, "step": 26 }, { "epoch": 0.09642857142857143, - "grad_norm": 501.000244140625, + "grad_norm": 59939.6171875, "learning_rate": 9.642857142857145e-07, - "loss": 31.0938, + "loss": 2478.0, "step": 27 }, { "epoch": 0.1, - "grad_norm": 460.5111389160156, + "grad_norm": 56436.0390625, "learning_rate": 1.0000000000000002e-06, - "loss": 27.0469, + "loss": 2638.0, "step": 28 }, { "epoch": 0.10357142857142858, - "grad_norm": 450.7170104980469, + "grad_norm": 56832.61328125, "learning_rate": 1.0357142857142859e-06, - "loss": 27.125, + "loss": 2716.0, "step": 29 }, { "epoch": 0.10714285714285714, - "grad_norm": 459.7214050292969, + "grad_norm": 54977.24609375, "learning_rate": 1.0714285714285714e-06, - "loss": 27.9688, + "loss": 2622.0, "step": 30 }, { "epoch": 0.11071428571428571, - "grad_norm": 468.7563781738281, + "grad_norm": 52479.5234375, "learning_rate": 1.1071428571428573e-06, - "loss": 30.0, + "loss": 2711.0, "step": 31 }, { "epoch": 0.11428571428571428, - "grad_norm": 435.9480285644531, + "grad_norm": 58123.73046875, "learning_rate": 1.142857142857143e-06, - "loss": 29.9844, + "loss": 2741.0, "step": 32 }, { "epoch": 0.11785714285714285, - "grad_norm": 459.3067932128906, + "grad_norm": 53508.328125, "learning_rate": 1.1785714285714287e-06, - "loss": 30.1875, + "loss": 2648.0, "step": 33 }, { "epoch": 0.12142857142857143, - "grad_norm": 414.72552490234375, + "grad_norm": 55233.74609375, "learning_rate": 1.2142857142857144e-06, - "loss": 27.5312, + "loss": 2722.0, "step": 34 }, { "epoch": 0.125, - "grad_norm": 505.076171875, + "grad_norm": 59158.7109375, "learning_rate": 1.25e-06, - "loss": 30.7344, + "loss": 2568.0, "step": 35 }, { "epoch": 0.12857142857142856, - "grad_norm": 435.8720397949219, + "grad_norm": 57440.38671875, "learning_rate": 1.2857142857142856e-06, - "loss": 28.4062, + "loss": 2576.0, "step": 36 }, { "epoch": 0.13214285714285715, - "grad_norm": 420.0755310058594, + "grad_norm": 58374.6875, "learning_rate": 1.3214285714285715e-06, - "loss": 27.125, + "loss": 2782.0, "step": 37 }, { "epoch": 0.1357142857142857, - "grad_norm": 432.12689208984375, + "grad_norm": 57305.36328125, "learning_rate": 1.3571428571428572e-06, - "loss": 26.4688, + "loss": 2798.0, "step": 38 }, { "epoch": 0.1392857142857143, - "grad_norm": 442.9870300292969, + "grad_norm": 52514.9375, "learning_rate": 1.392857142857143e-06, - "loss": 29.0781, + "loss": 2854.0, "step": 39 }, { "epoch": 0.14285714285714285, - "grad_norm": 456.2323913574219, + "grad_norm": 56448.4140625, "learning_rate": 1.4285714285714286e-06, - "loss": 27.1562, + "loss": 2780.0, "step": 40 }, { "epoch": 0.14642857142857144, - "grad_norm": 411.92791748046875, + "grad_norm": 57320.1640625, "learning_rate": 1.4642857142857145e-06, - "loss": 24.2969, + "loss": 2770.0, "step": 41 }, { "epoch": 0.15, - "grad_norm": 426.4077453613281, + "grad_norm": 58692.0234375, "learning_rate": 1.5e-06, - "loss": 24.8125, + "loss": 2800.0, "step": 42 }, { "epoch": 0.15357142857142858, - "grad_norm": 476.18780517578125, + "grad_norm": 55001.59765625, "learning_rate": 1.535714285714286e-06, - "loss": 29.6719, + "loss": 2644.0, "step": 43 }, { "epoch": 0.15714285714285714, - "grad_norm": 412.37921142578125, + "grad_norm": 54533.80859375, "learning_rate": 1.5714285714285714e-06, - "loss": 22.6562, + "loss": 2572.0, "step": 44 }, { "epoch": 0.16071428571428573, - "grad_norm": 442.13079833984375, + "grad_norm": 57930.265625, "learning_rate": 1.6071428571428574e-06, - "loss": 27.0, + "loss": 2788.0, "step": 45 }, { "epoch": 0.16428571428571428, - "grad_norm": 411.8236389160156, + "grad_norm": 53036.37109375, "learning_rate": 1.642857142857143e-06, - "loss": 25.4375, + "loss": 2768.0, "step": 46 }, { "epoch": 0.16785714285714284, - "grad_norm": 440.1373291015625, + "grad_norm": 56918.08984375, "learning_rate": 1.6785714285714286e-06, - "loss": 28.0938, + "loss": 2646.0, "step": 47 }, { "epoch": 0.17142857142857143, - "grad_norm": 447.80682373046875, + "grad_norm": 54108.76171875, "learning_rate": 1.7142857142857145e-06, - "loss": 23.7188, + "loss": 2628.0, "step": 48 }, { "epoch": 0.175, - "grad_norm": 441.1207275390625, + "grad_norm": 56238.75390625, "learning_rate": 1.75e-06, - "loss": 28.125, + "loss": 2660.0, "step": 49 }, { "epoch": 0.17857142857142858, - "grad_norm": 431.698486328125, + "grad_norm": 56509.8515625, "learning_rate": 1.7857142857142859e-06, - "loss": 26.0156, + "loss": 2624.0, "step": 50 }, { "epoch": 0.18214285714285713, - "grad_norm": 503.77008056640625, + "grad_norm": 59810.55859375, "learning_rate": 1.8214285714285716e-06, - "loss": 28.7656, + "loss": 2320.0, "step": 51 }, { "epoch": 0.18571428571428572, - "grad_norm": 401.3651428222656, + "grad_norm": 52871.17578125, "learning_rate": 1.8571428571428573e-06, - "loss": 23.625, + "loss": 2613.0, "step": 52 }, { "epoch": 0.18928571428571428, - "grad_norm": 489.8802185058594, + "grad_norm": 57674.58984375, "learning_rate": 1.892857142857143e-06, - "loss": 29.875, + "loss": 2414.0, "step": 53 }, { "epoch": 0.19285714285714287, - "grad_norm": 437.7277526855469, + "grad_norm": 54902.953125, "learning_rate": 1.928571428571429e-06, - "loss": 27.25, + "loss": 2710.0, "step": 54 }, { "epoch": 0.19642857142857142, - "grad_norm": 413.51300048828125, + "grad_norm": 57680.69921875, "learning_rate": 1.9642857142857144e-06, - "loss": 25.9062, + "loss": 2669.0, "step": 55 }, { "epoch": 0.2, - "grad_norm": 449.22222900390625, + "grad_norm": 56207.1796875, "learning_rate": 2.0000000000000003e-06, - "loss": 27.8281, + "loss": 2626.0, "step": 56 }, { "epoch": 0.20357142857142857, - "grad_norm": 467.76702880859375, + "grad_norm": 57754.87109375, "learning_rate": 2.035714285714286e-06, - "loss": 30.4062, + "loss": 2594.0, "step": 57 }, { "epoch": 0.20714285714285716, - "grad_norm": 471.6856689453125, + "grad_norm": 51905.11328125, "learning_rate": 2.0714285714285717e-06, - "loss": 33.0625, + "loss": 2558.0, "step": 58 }, { "epoch": 0.21071428571428572, - "grad_norm": 443.81787109375, + "grad_norm": 54741.5625, "learning_rate": 2.1071428571428572e-06, - "loss": 23.9688, + "loss": 2504.0, "step": 59 }, { "epoch": 0.21428571428571427, - "grad_norm": 405.0802917480469, + "grad_norm": 59795.40625, "learning_rate": 2.1428571428571427e-06, - "loss": 23.5938, + "loss": 2522.0, "step": 60 }, { "epoch": 0.21785714285714286, - "grad_norm": 398.11767578125, + "grad_norm": 57522.73828125, "learning_rate": 2.1785714285714286e-06, - "loss": 25.375, + "loss": 2522.0, "step": 61 }, { "epoch": 0.22142857142857142, - "grad_norm": 380.9560241699219, + "grad_norm": 54045.98046875, "learning_rate": 2.2142857142857146e-06, - "loss": 25.7656, + "loss": 2596.0, "step": 62 }, { "epoch": 0.225, - "grad_norm": 437.2050476074219, + "grad_norm": 51539.40234375, "learning_rate": 2.25e-06, - "loss": 33.5625, + "loss": 2636.0, "step": 63 }, { "epoch": 0.22857142857142856, - "grad_norm": 405.1700744628906, + "grad_norm": 52762.08203125, "learning_rate": 2.285714285714286e-06, - "loss": 22.4297, + "loss": 2476.0, "step": 64 }, { "epoch": 0.23214285714285715, - "grad_norm": 430.02227783203125, + "grad_norm": 50739.83984375, "learning_rate": 2.321428571428572e-06, - "loss": 28.3594, + "loss": 2358.0, "step": 65 }, { "epoch": 0.2357142857142857, - "grad_norm": 432.0417785644531, + "grad_norm": 55836.32421875, "learning_rate": 2.3571428571428574e-06, - "loss": 25.7656, + "loss": 2440.0, "step": 66 }, { "epoch": 0.2392857142857143, - "grad_norm": 410.22540283203125, + "grad_norm": 48759.96484375, "learning_rate": 2.3928571428571433e-06, - "loss": 28.8281, + "loss": 2710.0, "step": 67 }, { "epoch": 0.24285714285714285, - "grad_norm": 468.97833251953125, + "grad_norm": 57551.83203125, "learning_rate": 2.428571428571429e-06, - "loss": 26.2812, + "loss": 2485.0, "step": 68 }, { "epoch": 0.24642857142857144, - "grad_norm": 330.7679748535156, + "grad_norm": 53682.7578125, "learning_rate": 2.4642857142857147e-06, - "loss": 21.3828, + "loss": 2572.0, "step": 69 }, { "epoch": 0.25, - "grad_norm": 391.85589599609375, + "grad_norm": 52824.26171875, "learning_rate": 2.5e-06, - "loss": 22.4844, + "loss": 2337.0, "step": 70 }, { "epoch": 0.25357142857142856, - "grad_norm": 433.45831298828125, + "grad_norm": 53040.15625, "learning_rate": 2.5357142857142857e-06, - "loss": 28.1406, + "loss": 2283.0, "step": 71 }, { "epoch": 0.2571428571428571, - "grad_norm": 395.9306945800781, + "grad_norm": 57390.953125, "learning_rate": 2.571428571428571e-06, - "loss": 21.8906, + "loss": 2500.0, "step": 72 }, { "epoch": 0.26071428571428573, - "grad_norm": 382.7699890136719, + "grad_norm": 53383.375, "learning_rate": 2.6071428571428575e-06, - "loss": 28.6719, + "loss": 2536.0, "step": 73 }, { "epoch": 0.2642857142857143, - "grad_norm": 424.3962097167969, + "grad_norm": 55229.26953125, "learning_rate": 2.642857142857143e-06, - "loss": 24.4219, + "loss": 2346.0, "step": 74 }, { "epoch": 0.26785714285714285, - "grad_norm": 404.885009765625, + "grad_norm": 50438.17578125, "learning_rate": 2.6785714285714285e-06, - "loss": 26.125, + "loss": 2307.0, "step": 75 }, { "epoch": 0.2714285714285714, - "grad_norm": 437.3279113769531, + "grad_norm": 51793.6796875, "learning_rate": 2.7142857142857144e-06, - "loss": 25.9531, + "loss": 2159.0, "step": 76 }, { "epoch": 0.275, - "grad_norm": 418.9220886230469, + "grad_norm": 56010.3125, "learning_rate": 2.7500000000000004e-06, - "loss": 23.7344, + "loss": 2201.0, "step": 77 }, { "epoch": 0.2785714285714286, - "grad_norm": 428.2545471191406, + "grad_norm": 49001.52734375, "learning_rate": 2.785714285714286e-06, - "loss": 34.5938, + "loss": 2416.0, "step": 78 }, { "epoch": 0.28214285714285714, - "grad_norm": 371.60833740234375, + "grad_norm": 51914.92578125, "learning_rate": 2.8214285714285718e-06, - "loss": 25.8906, + "loss": 2514.0, "step": 79 }, { "epoch": 0.2857142857142857, - "grad_norm": 364.85064697265625, + "grad_norm": 57334.23828125, "learning_rate": 2.8571428571428573e-06, - "loss": 26.2188, + "loss": 2584.0, "step": 80 }, { "epoch": 0.2892857142857143, - "grad_norm": 426.8570861816406, + "grad_norm": 46454.109375, "learning_rate": 2.892857142857143e-06, - "loss": 26.8281, + "loss": 2156.0, "step": 81 }, { "epoch": 0.29285714285714287, - "grad_norm": 421.036865234375, + "grad_norm": 52924.03125, "learning_rate": 2.928571428571429e-06, - "loss": 24.1875, + "loss": 2237.0, "step": 82 }, { "epoch": 0.29642857142857143, - "grad_norm": 397.99755859375, + "grad_norm": 52062.296875, "learning_rate": 2.9642857142857146e-06, - "loss": 27.75, + "loss": 2308.0, "step": 83 }, { "epoch": 0.3, - "grad_norm": 390.9758605957031, + "grad_norm": 54490.3046875, "learning_rate": 3e-06, - "loss": 27.125, + "loss": 2548.0, "step": 84 }, { "epoch": 0.30357142857142855, - "grad_norm": 411.8300476074219, + "grad_norm": 51692.0078125, "learning_rate": 3.0357142857142856e-06, - "loss": 28.4844, + "loss": 2440.0, "step": 85 }, { "epoch": 0.30714285714285716, - "grad_norm": 391.6376953125, + "grad_norm": 53973.59375, "learning_rate": 3.071428571428572e-06, - "loss": 25.125, + "loss": 2301.0, "step": 86 }, { "epoch": 0.3107142857142857, - "grad_norm": 362.9507751464844, + "grad_norm": 50207.29296875, "learning_rate": 3.1071428571428574e-06, - "loss": 22.7812, + "loss": 2323.0, "step": 87 }, { "epoch": 0.3142857142857143, - "grad_norm": 399.1080017089844, + "grad_norm": 45083.03125, "learning_rate": 3.142857142857143e-06, - "loss": 25.6094, + "loss": 2299.0, "step": 88 }, { "epoch": 0.31785714285714284, - "grad_norm": 381.00555419921875, + "grad_norm": 49912.5859375, "learning_rate": 3.178571428571429e-06, - "loss": 25.6719, + "loss": 2367.0, "step": 89 }, { "epoch": 0.32142857142857145, - "grad_norm": 390.6072082519531, + "grad_norm": 46730.38671875, "learning_rate": 3.2142857142857147e-06, - "loss": 24.7188, + "loss": 2303.0, "step": 90 }, { "epoch": 0.325, - "grad_norm": 389.594970703125, + "grad_norm": 50786.91796875, "learning_rate": 3.2500000000000002e-06, - "loss": 29.7188, + "loss": 2120.0, "step": 91 }, { "epoch": 0.32857142857142857, - "grad_norm": 410.5198974609375, + "grad_norm": 50003.14453125, "learning_rate": 3.285714285714286e-06, - "loss": 27.4375, + "loss": 2351.0, "step": 92 }, { "epoch": 0.33214285714285713, - "grad_norm": 438.47784423828125, + "grad_norm": 50678.0703125, "learning_rate": 3.3214285714285716e-06, - "loss": 30.0156, + "loss": 2270.0, "step": 93 }, { "epoch": 0.3357142857142857, - "grad_norm": 371.4592590332031, + "grad_norm": 51310.421875, "learning_rate": 3.357142857142857e-06, - "loss": 22.4531, + "loss": 2044.0, "step": 94 }, { "epoch": 0.3392857142857143, - "grad_norm": 364.02886962890625, + "grad_norm": 51872.7578125, "learning_rate": 3.3928571428571435e-06, - "loss": 28.125, + "loss": 2450.0, "step": 95 }, { "epoch": 0.34285714285714286, - "grad_norm": 346.2029113769531, + "grad_norm": 43206.97265625, "learning_rate": 3.428571428571429e-06, - "loss": 24.2578, + "loss": 2240.0, "step": 96 }, { "epoch": 0.3464285714285714, - "grad_norm": 404.9358825683594, + "grad_norm": 48175.49609375, "learning_rate": 3.4642857142857145e-06, - "loss": 26.9688, + "loss": 1990.0, "step": 97 }, { "epoch": 0.35, - "grad_norm": 350.8284912109375, + "grad_norm": 49124.59765625, "learning_rate": 3.5e-06, - "loss": 22.6562, + "loss": 2334.0, "step": 98 }, { "epoch": 0.3535714285714286, - "grad_norm": 397.1616516113281, + "grad_norm": 48831.9296875, "learning_rate": 3.5357142857142863e-06, - "loss": 24.6719, + "loss": 2150.0, "step": 99 }, { "epoch": 0.35714285714285715, - "grad_norm": 405.81585693359375, + "grad_norm": 56516.67578125, "learning_rate": 3.5714285714285718e-06, - "loss": 23.0938, + "loss": 1985.0, "step": 100 }, { "epoch": 0.3607142857142857, - "grad_norm": 343.4232177734375, + "grad_norm": 57121.9140625, "learning_rate": 3.6071428571428573e-06, - "loss": 24.2656, + "loss": 2344.0, "step": 101 }, { "epoch": 0.36428571428571427, - "grad_norm": 344.626708984375, + "grad_norm": 50754.53515625, "learning_rate": 3.642857142857143e-06, - "loss": 23.9375, + "loss": 2304.0, "step": 102 }, { "epoch": 0.3678571428571429, - "grad_norm": 404.7317810058594, + "grad_norm": 53306.16796875, "learning_rate": 3.678571428571429e-06, - "loss": 28.8281, + "loss": 2034.0, "step": 103 }, { "epoch": 0.37142857142857144, - "grad_norm": 344.7197265625, + "grad_norm": 44559.53125, "learning_rate": 3.7142857142857146e-06, - "loss": 27.3125, + "loss": 2195.0, "step": 104 }, { "epoch": 0.375, - "grad_norm": 361.006591796875, + "grad_norm": 46505.51171875, "learning_rate": 3.7500000000000005e-06, - "loss": 25.75, + "loss": 2124.0, "step": 105 }, { "epoch": 0.37857142857142856, - "grad_norm": 353.9184875488281, + "grad_norm": 49290.10546875, "learning_rate": 3.785714285714286e-06, - "loss": 24.5781, + "loss": 2158.0, "step": 106 }, { "epoch": 0.3821428571428571, - "grad_norm": 342.6656494140625, + "grad_norm": 47605.75, "learning_rate": 3.8214285714285715e-06, - "loss": 22.5938, + "loss": 2190.0, "step": 107 }, { "epoch": 0.38571428571428573, - "grad_norm": 316.21295166015625, + "grad_norm": 48017.82421875, "learning_rate": 3.857142857142858e-06, - "loss": 21.0781, + "loss": 2042.0, "step": 108 }, { "epoch": 0.3892857142857143, - "grad_norm": 329.8937683105469, + "grad_norm": 49979.39453125, "learning_rate": 3.892857142857143e-06, - "loss": 25.6875, + "loss": 2049.0, "step": 109 }, { "epoch": 0.39285714285714285, - "grad_norm": 293.50909423828125, + "grad_norm": 48653.3203125, "learning_rate": 3.928571428571429e-06, - "loss": 19.7344, + "loss": 2225.0, "step": 110 }, { "epoch": 0.3964285714285714, - "grad_norm": 317.79583740234375, + "grad_norm": 50144.546875, "learning_rate": 3.964285714285714e-06, - "loss": 22.4844, + "loss": 2195.0, "step": 111 }, { "epoch": 0.4, - "grad_norm": 322.17462158203125, + "grad_norm": 49567.1953125, "learning_rate": 4.000000000000001e-06, - "loss": 21.9453, + "loss": 2131.0, "step": 112 }, { "epoch": 0.4035714285714286, - "grad_norm": 344.13323974609375, + "grad_norm": 43937.37109375, "learning_rate": 4.035714285714286e-06, - "loss": 24.0938, + "loss": 1926.0, "step": 113 }, { "epoch": 0.40714285714285714, - "grad_norm": 331.3674621582031, + "grad_norm": 53035.12890625, "learning_rate": 4.071428571428572e-06, - "loss": 22.3906, + "loss": 2015.0, "step": 114 }, { "epoch": 0.4107142857142857, - "grad_norm": 344.12530517578125, + "grad_norm": 48111.984375, "learning_rate": 4.107142857142857e-06, - "loss": 24.9844, + "loss": 1865.0, "step": 115 }, { "epoch": 0.4142857142857143, - "grad_norm": 338.5205078125, + "grad_norm": 45911.875, "learning_rate": 4.1428571428571435e-06, - "loss": 24.3594, + "loss": 1974.0, "step": 116 }, { "epoch": 0.41785714285714287, - "grad_norm": 339.5569152832031, + "grad_norm": 50860.16796875, "learning_rate": 4.178571428571429e-06, - "loss": 23.2578, + "loss": 1884.0, "step": 117 }, { "epoch": 0.42142857142857143, - "grad_norm": 351.39208984375, + "grad_norm": 45759.2890625, "learning_rate": 4.2142857142857145e-06, - "loss": 22.4062, + "loss": 1827.0, "step": 118 }, { "epoch": 0.425, - "grad_norm": 370.0450439453125, + "grad_norm": 41331.12890625, "learning_rate": 4.25e-06, - "loss": 25.6719, + "loss": 1670.0, "step": 119 }, { "epoch": 0.42857142857142855, - "grad_norm": 302.0252380371094, + "grad_norm": 48222.890625, "learning_rate": 4.2857142857142855e-06, - "loss": 21.7656, + "loss": 2185.0, "step": 120 }, { "epoch": 0.43214285714285716, - "grad_norm": 325.2033386230469, + "grad_norm": 48716.6796875, "learning_rate": 4.321428571428572e-06, - "loss": 23.5938, + "loss": 1977.0, "step": 121 }, { "epoch": 0.4357142857142857, - "grad_norm": 316.630859375, + "grad_norm": 45521.70703125, "learning_rate": 4.357142857142857e-06, - "loss": 24.6719, + "loss": 2167.0, "step": 122 }, { "epoch": 0.4392857142857143, - "grad_norm": 300.1486511230469, + "grad_norm": 49179.828125, "learning_rate": 4.392857142857143e-06, - "loss": 26.0938, + "loss": 2276.0, "step": 123 }, { "epoch": 0.44285714285714284, - "grad_norm": 283.73199462890625, + "grad_norm": 50002.54296875, "learning_rate": 4.428571428571429e-06, - "loss": 22.8438, + "loss": 2027.0, "step": 124 }, { "epoch": 0.44642857142857145, - "grad_norm": 319.7061767578125, + "grad_norm": 47304.29296875, "learning_rate": 4.464285714285715e-06, - "loss": 23.5938, + "loss": 1886.0, "step": 125 }, { "epoch": 0.45, - "grad_norm": 299.60302734375, + "grad_norm": 48354.08203125, "learning_rate": 4.5e-06, - "loss": 23.1172, + "loss": 2147.0, "step": 126 }, { "epoch": 0.45357142857142857, - "grad_norm": 279.7106628417969, + "grad_norm": 43458.8046875, "learning_rate": 4.5357142857142865e-06, - "loss": 21.125, + "loss": 2043.0, "step": 127 }, { "epoch": 0.45714285714285713, - "grad_norm": 347.5083923339844, + "grad_norm": 49427.7734375, "learning_rate": 4.571428571428572e-06, - "loss": 23.4062, + "loss": 1974.0, "step": 128 }, { "epoch": 0.4607142857142857, - "grad_norm": 307.5142517089844, + "grad_norm": 48812.6328125, "learning_rate": 4.6071428571428574e-06, - "loss": 21.8594, + "loss": 2039.0, "step": 129 }, { "epoch": 0.4642857142857143, - "grad_norm": 306.3246765136719, + "grad_norm": 43462.1171875, "learning_rate": 4.642857142857144e-06, - "loss": 23.7031, + "loss": 1777.0, "step": 130 }, { "epoch": 0.46785714285714286, - "grad_norm": 304.927734375, + "grad_norm": 47502.78125, "learning_rate": 4.678571428571429e-06, - "loss": 19.5781, + "loss": 1830.0, "step": 131 }, { "epoch": 0.4714285714285714, - "grad_norm": 344.84893798828125, + "grad_norm": 45883.8828125, "learning_rate": 4.714285714285715e-06, - "loss": 27.5312, + "loss": 1883.0, "step": 132 }, { "epoch": 0.475, - "grad_norm": 321.4259948730469, + "grad_norm": 42061.08203125, "learning_rate": 4.75e-06, - "loss": 21.1953, + "loss": 1689.0, "step": 133 }, { "epoch": 0.4785714285714286, - "grad_norm": 308.4122314453125, + "grad_norm": 46696.37109375, "learning_rate": 4.785714285714287e-06, - "loss": 24.7656, + "loss": 1876.0, "step": 134 }, { "epoch": 0.48214285714285715, - "grad_norm": 296.87640380859375, + "grad_norm": 40520.76953125, "learning_rate": 4.821428571428572e-06, - "loss": 22.8203, + "loss": 1805.0, "step": 135 }, { "epoch": 0.4857142857142857, - "grad_norm": 312.9284973144531, + "grad_norm": 42581.24609375, "learning_rate": 4.857142857142858e-06, - "loss": 23.2656, + "loss": 1731.0, "step": 136 }, { "epoch": 0.48928571428571427, - "grad_norm": 272.53240966796875, + "grad_norm": 43324.4609375, "learning_rate": 4.892857142857143e-06, - "loss": 25.9453, + "loss": 1993.0, "step": 137 }, { "epoch": 0.4928571428571429, - "grad_norm": 322.1316223144531, + "grad_norm": 40790.68359375, "learning_rate": 4.928571428571429e-06, - "loss": 26.8594, + "loss": 1873.0, "step": 138 }, { "epoch": 0.49642857142857144, - "grad_norm": 315.32196044921875, + "grad_norm": 37050.62109375, "learning_rate": 4.964285714285715e-06, - "loss": 26.4297, + "loss": 1659.0, "step": 139 }, { "epoch": 0.5, - "grad_norm": 277.9049072265625, + "grad_norm": 49885.3046875, "learning_rate": 5e-06, - "loss": 21.2422, + "loss": 1637.5, "step": 140 }, { "epoch": 0.5035714285714286, - "grad_norm": 290.2340087890625, + "grad_norm": 44591.82421875, "learning_rate": 5.035714285714286e-06, - "loss": 20.9297, + "loss": 1697.0, "step": 141 }, { "epoch": 0.5071428571428571, - "grad_norm": 284.7858581542969, + "grad_norm": 40223.59765625, "learning_rate": 5.071428571428571e-06, - "loss": 23.1094, + "loss": 1781.0, "step": 142 }, { "epoch": 0.5107142857142857, - "grad_norm": 228.74746704101562, + "grad_norm": 43106.30859375, "learning_rate": 5.107142857142857e-06, - "loss": 25.1406, + "loss": 2012.0, "step": 143 }, { "epoch": 0.5142857142857142, - "grad_norm": 289.68939208984375, + "grad_norm": 42699.32421875, "learning_rate": 5.142857142857142e-06, - "loss": 26.0234, + "loss": 1645.0, "step": 144 }, { "epoch": 0.5178571428571429, - "grad_norm": 298.378173828125, + "grad_norm": 38726.9296875, "learning_rate": 5.1785714285714296e-06, - "loss": 24.1875, + "loss": 1537.5, "step": 145 }, { "epoch": 0.5214285714285715, - "grad_norm": 269.7729797363281, + "grad_norm": 41239.7421875, "learning_rate": 5.214285714285715e-06, - "loss": 19.8203, + "loss": 1625.0, "step": 146 }, { "epoch": 0.525, - "grad_norm": 241.9813690185547, + "grad_norm": 42875.859375, "learning_rate": 5.2500000000000006e-06, - "loss": 22.3906, + "loss": 1782.0, "step": 147 }, { "epoch": 0.5285714285714286, - "grad_norm": 260.9550476074219, + "grad_norm": 38225.578125, "learning_rate": 5.285714285714286e-06, - "loss": 21.9062, + "loss": 1757.0, "step": 148 }, { "epoch": 0.5321428571428571, - "grad_norm": 269.8868408203125, + "grad_norm": 40768.36328125, "learning_rate": 5.3214285714285715e-06, - "loss": 25.8203, + "loss": 1665.0, "step": 149 }, { "epoch": 0.5357142857142857, - "grad_norm": 274.3963928222656, + "grad_norm": 38799.9453125, "learning_rate": 5.357142857142857e-06, - "loss": 21.6094, + "loss": 1617.0, "step": 150 }, { "epoch": 0.5392857142857143, - "grad_norm": 315.60205078125, + "grad_norm": 43229.2578125, "learning_rate": 5.392857142857143e-06, - "loss": 25.9531, + "loss": 1513.0, "step": 151 }, { "epoch": 0.5428571428571428, - "grad_norm": 231.7419891357422, + "grad_norm": 41194.19921875, "learning_rate": 5.428571428571429e-06, - "loss": 19.2734, + "loss": 1440.0, "step": 152 }, { "epoch": 0.5464285714285714, - "grad_norm": 239.33724975585938, + "grad_norm": 39372.83984375, "learning_rate": 5.464285714285714e-06, - "loss": 21.6719, + "loss": 1614.0, "step": 153 }, { "epoch": 0.55, - "grad_norm": 269.93218994140625, + "grad_norm": 39602.26953125, "learning_rate": 5.500000000000001e-06, - "loss": 21.6562, + "loss": 1398.0, "step": 154 }, { "epoch": 0.5535714285714286, - "grad_norm": 285.39227294921875, + "grad_norm": 36637.0546875, "learning_rate": 5.535714285714286e-06, - "loss": 23.0547, + "loss": 1615.0, "step": 155 }, { "epoch": 0.5571428571428572, - "grad_norm": 216.80662536621094, + "grad_norm": 38556.140625, "learning_rate": 5.571428571428572e-06, - "loss": 19.6406, + "loss": 1666.0, "step": 156 }, { "epoch": 0.5607142857142857, - "grad_norm": 231.36837768554688, + "grad_norm": 38034.23828125, "learning_rate": 5.607142857142858e-06, - "loss": 20.1641, + "loss": 1511.0, "step": 157 }, { "epoch": 0.5642857142857143, - "grad_norm": 228.6242218017578, + "grad_norm": 44598.8125, "learning_rate": 5.6428571428571435e-06, - "loss": 19.9375, + "loss": 1643.0, "step": 158 }, { "epoch": 0.5678571428571428, - "grad_norm": 215.09642028808594, + "grad_norm": 38049.1953125, "learning_rate": 5.678571428571429e-06, - "loss": 20.4531, + "loss": 1464.5, "step": 159 }, { "epoch": 0.5714285714285714, - "grad_norm": 237.20187377929688, + "grad_norm": 35484.2578125, "learning_rate": 5.7142857142857145e-06, - "loss": 25.6406, + "loss": 1478.0, "step": 160 }, { "epoch": 0.575, - "grad_norm": 245.66937255859375, + "grad_norm": 37643.3671875, "learning_rate": 5.75e-06, - "loss": 20.2578, + "loss": 1410.5, "step": 161 }, { "epoch": 0.5785714285714286, - "grad_norm": 251.03172302246094, + "grad_norm": 37466.31640625, "learning_rate": 5.785714285714286e-06, - "loss": 23.8906, + "loss": 1402.0, "step": 162 }, { "epoch": 0.5821428571428572, - "grad_norm": 239.97976684570312, + "grad_norm": 35002.24609375, "learning_rate": 5.821428571428573e-06, - "loss": 19.2656, + "loss": 1237.0, "step": 163 }, { "epoch": 0.5857142857142857, - "grad_norm": 253.80308532714844, + "grad_norm": 31511.01171875, "learning_rate": 5.857142857142858e-06, - "loss": 21.5938, + "loss": 1275.0, "step": 164 }, { "epoch": 0.5892857142857143, - "grad_norm": 241.61058044433594, + "grad_norm": 40428.31640625, "learning_rate": 5.892857142857144e-06, - "loss": 22.9375, + "loss": 1416.0, "step": 165 }, { "epoch": 0.5928571428571429, - "grad_norm": 205.09938049316406, + "grad_norm": 38300.34765625, "learning_rate": 5.928571428571429e-06, - "loss": 16.9297, + "loss": 1431.0, "step": 166 }, { "epoch": 0.5964285714285714, - "grad_norm": 227.73577880859375, + "grad_norm": 37506.6875, "learning_rate": 5.964285714285715e-06, - "loss": 20.7344, + "loss": 1278.5, "step": 167 }, { "epoch": 0.6, - "grad_norm": 201.16065979003906, + "grad_norm": 37629.4140625, "learning_rate": 6e-06, - "loss": 22.0625, + "loss": 1352.0, "step": 168 }, { "epoch": 0.6035714285714285, - "grad_norm": 233.2169647216797, + "grad_norm": 38953.62890625, "learning_rate": 6.035714285714286e-06, - "loss": 20.3984, + "loss": 1243.5, "step": 169 }, { "epoch": 0.6071428571428571, - "grad_norm": 242.8660888671875, + "grad_norm": 34488.328125, "learning_rate": 6.071428571428571e-06, - "loss": 27.125, + "loss": 1352.0, "step": 170 }, { "epoch": 0.6107142857142858, - "grad_norm": 194.48208618164062, + "grad_norm": 36888.90625, "learning_rate": 6.107142857142858e-06, - "loss": 19.9844, + "loss": 1455.5, "step": 171 }, { "epoch": 0.6142857142857143, - "grad_norm": 202.074462890625, + "grad_norm": 35487.7421875, "learning_rate": 6.142857142857144e-06, - "loss": 22.3672, + "loss": 1426.0, "step": 172 }, { "epoch": 0.6178571428571429, - "grad_norm": 235.34242248535156, + "grad_norm": 36121.4921875, "learning_rate": 6.178571428571429e-06, - "loss": 19.8125, + "loss": 1333.0, "step": 173 }, { "epoch": 0.6214285714285714, - "grad_norm": 170.82933044433594, + "grad_norm": 37063.6953125, "learning_rate": 6.214285714285715e-06, - "loss": 21.0625, + "loss": 1284.5, "step": 174 }, { "epoch": 0.625, - "grad_norm": 206.0693359375, + "grad_norm": 35706.61328125, "learning_rate": 6.25e-06, - "loss": 22.2734, + "loss": 1242.0, "step": 175 }, { "epoch": 0.6285714285714286, - "grad_norm": 177.68365478515625, + "grad_norm": 40051.47265625, "learning_rate": 6.285714285714286e-06, - "loss": 19.6797, + "loss": 1330.0, "step": 176 }, { "epoch": 0.6321428571428571, - "grad_norm": 224.065185546875, + "grad_norm": 36092.5234375, "learning_rate": 6.321428571428571e-06, - "loss": 21.2188, + "loss": 1234.5, "step": 177 }, { "epoch": 0.6357142857142857, - "grad_norm": 183.49310302734375, + "grad_norm": 35734.7734375, "learning_rate": 6.357142857142858e-06, - "loss": 20.4453, + "loss": 1246.0, "step": 178 }, { "epoch": 0.6392857142857142, - "grad_norm": 206.50790405273438, + "grad_norm": 34745.9296875, "learning_rate": 6.392857142857143e-06, - "loss": 20.8203, + "loss": 1319.5, "step": 179 }, { "epoch": 0.6428571428571429, - "grad_norm": 189.37969970703125, + "grad_norm": 31167.529296875, "learning_rate": 6.4285714285714295e-06, - "loss": 24.5625, + "loss": 1277.0, "step": 180 }, { "epoch": 0.6464285714285715, - "grad_norm": 173.31056213378906, + "grad_norm": 29240.9140625, "learning_rate": 6.464285714285715e-06, - "loss": 19.3281, + "loss": 1274.0, "step": 181 }, { "epoch": 0.65, - "grad_norm": 187.03831481933594, + "grad_norm": 31659.38671875, "learning_rate": 6.5000000000000004e-06, - "loss": 19.4297, + "loss": 1239.0, "step": 182 }, { "epoch": 0.6535714285714286, - "grad_norm": 185.00762939453125, + "grad_norm": 33609.12109375, "learning_rate": 6.535714285714286e-06, - "loss": 21.1875, + "loss": 1226.5, "step": 183 }, { "epoch": 0.6571428571428571, - "grad_norm": 188.84010314941406, + "grad_norm": 30414.06640625, "learning_rate": 6.571428571428572e-06, - "loss": 21.7656, + "loss": 1142.5, "step": 184 }, { "epoch": 0.6607142857142857, - "grad_norm": 192.83676147460938, + "grad_norm": 31541.8359375, "learning_rate": 6.607142857142858e-06, - "loss": 18.1484, + "loss": 891.5, "step": 185 }, { "epoch": 0.6642857142857143, - "grad_norm": 177.14833068847656, + "grad_norm": 30813.146484375, "learning_rate": 6.642857142857143e-06, - "loss": 17.2812, + "loss": 1181.0, "step": 186 }, { "epoch": 0.6678571428571428, - "grad_norm": 199.4510955810547, + "grad_norm": 28017.86328125, "learning_rate": 6.678571428571429e-06, - "loss": 21.1953, + "loss": 1194.0, "step": 187 }, { "epoch": 0.6714285714285714, - "grad_norm": 154.5773468017578, + "grad_norm": 30904.638671875, "learning_rate": 6.714285714285714e-06, - "loss": 19.9219, + "loss": 1177.5, "step": 188 }, { "epoch": 0.675, - "grad_norm": 180.60301208496094, + "grad_norm": 30676.01953125, "learning_rate": 6.750000000000001e-06, - "loss": 20.625, + "loss": 1056.5, "step": 189 }, { "epoch": 0.6785714285714286, - "grad_norm": 173.22206115722656, + "grad_norm": 30213.75390625, "learning_rate": 6.785714285714287e-06, - "loss": 19.9531, + "loss": 1208.0, "step": 190 }, { "epoch": 0.6821428571428572, - "grad_norm": 184.2520751953125, + "grad_norm": 33366.171875, "learning_rate": 6.8214285714285724e-06, - "loss": 23.375, + "loss": 1220.0, "step": 191 }, { "epoch": 0.6857142857142857, - "grad_norm": 159.26380920410156, + "grad_norm": 34670.984375, "learning_rate": 6.857142857142858e-06, - "loss": 23.125, + "loss": 982.0, "step": 192 }, { "epoch": 0.6892857142857143, - "grad_norm": 139.96575927734375, + "grad_norm": 30760.220703125, "learning_rate": 6.892857142857143e-06, - "loss": 17.9297, + "loss": 1143.0, "step": 193 }, { "epoch": 0.6928571428571428, - "grad_norm": 170.7452392578125, + "grad_norm": 26211.34375, "learning_rate": 6.928571428571429e-06, - "loss": 17.625, + "loss": 907.75, "step": 194 }, { "epoch": 0.6964285714285714, - "grad_norm": 123.6897964477539, + "grad_norm": 28831.35546875, "learning_rate": 6.964285714285714e-06, - "loss": 17.6484, + "loss": 984.5, "step": 195 }, { "epoch": 0.7, - "grad_norm": 145.02731323242188, + "grad_norm": 32666.31640625, "learning_rate": 7e-06, - "loss": 21.3906, + "loss": 1286.0, "step": 196 }, { "epoch": 0.7035714285714286, - "grad_norm": 127.48950958251953, + "grad_norm": 29338.609375, "learning_rate": 7.035714285714287e-06, - "loss": 17.8359, + "loss": 1067.5, "step": 197 }, { "epoch": 0.7071428571428572, - "grad_norm": 163.2288055419922, + "grad_norm": 27221.5859375, "learning_rate": 7.0714285714285726e-06, - "loss": 22.8281, + "loss": 1042.0, "step": 198 }, { "epoch": 0.7107142857142857, - "grad_norm": 130.36390686035156, + "grad_norm": 34019.1015625, "learning_rate": 7.107142857142858e-06, - "loss": 15.0859, + "loss": 1186.5, "step": 199 }, { "epoch": 0.7142857142857143, - "grad_norm": 163.37734985351562, + "grad_norm": 26549.19140625, "learning_rate": 7.1428571428571436e-06, - "loss": 20.9766, + "loss": 982.5, "step": 200 }, { "epoch": 0.7178571428571429, - "grad_norm": 162.68939208984375, + "grad_norm": 33324.21484375, "learning_rate": 7.178571428571429e-06, - "loss": 24.8047, + "loss": 1206.0, "step": 201 }, { "epoch": 0.7214285714285714, - "grad_norm": 148.17105102539062, + "grad_norm": 23376.7265625, "learning_rate": 7.2142857142857145e-06, - "loss": 19.0703, + "loss": 787.0, "step": 202 }, { "epoch": 0.725, - "grad_norm": 167.50244140625, + "grad_norm": 25867.71875, "learning_rate": 7.25e-06, - "loss": 21.2578, + "loss": 936.5, "step": 203 }, { "epoch": 0.7285714285714285, - "grad_norm": 145.0186004638672, + "grad_norm": 26746.14453125, "learning_rate": 7.285714285714286e-06, - "loss": 21.4688, + "loss": 1035.0, "step": 204 }, { "epoch": 0.7321428571428571, - "grad_norm": 124.78711700439453, + "grad_norm": 30135.052734375, "learning_rate": 7.321428571428572e-06, - "loss": 18.5078, + "loss": 1153.0, "step": 205 }, { "epoch": 0.7357142857142858, - "grad_norm": 116.82051849365234, + "grad_norm": 27071.125, "learning_rate": 7.357142857142858e-06, - "loss": 21.7734, + "loss": 959.0, "step": 206 }, { "epoch": 0.7392857142857143, - "grad_norm": 122.78929138183594, + "grad_norm": 26113.98828125, "learning_rate": 7.392857142857144e-06, - "loss": 16.0391, + "loss": 860.75, "step": 207 }, { "epoch": 0.7428571428571429, - "grad_norm": 115.99458312988281, + "grad_norm": 26341.416015625, "learning_rate": 7.428571428571429e-06, - "loss": 18.8047, + "loss": 969.5, "step": 208 }, { "epoch": 0.7464285714285714, - "grad_norm": 126.74948120117188, + "grad_norm": 25817.244140625, "learning_rate": 7.464285714285715e-06, - "loss": 17.4844, + "loss": 973.75, "step": 209 }, { "epoch": 0.75, - "grad_norm": 107.44823455810547, + "grad_norm": 23402.08203125, "learning_rate": 7.500000000000001e-06, - "loss": 19.5234, + "loss": 928.5, "step": 210 }, { "epoch": 0.7535714285714286, - "grad_norm": 131.0528564453125, + "grad_norm": 24869.328125, "learning_rate": 7.5357142857142865e-06, - "loss": 22.2656, + "loss": 782.0, "step": 211 }, { "epoch": 0.7571428571428571, - "grad_norm": 96.97832489013672, + "grad_norm": 26426.349609375, "learning_rate": 7.571428571428572e-06, - "loss": 19.2734, + "loss": 921.5, "step": 212 }, { "epoch": 0.7607142857142857, - "grad_norm": 140.7826690673828, + "grad_norm": 24473.81640625, "learning_rate": 7.6071428571428575e-06, - "loss": 17.3047, + "loss": 896.5, "step": 213 }, { "epoch": 0.7642857142857142, - "grad_norm": 123.25503540039062, + "grad_norm": 25129.603515625, "learning_rate": 7.642857142857143e-06, - "loss": 17.9062, + "loss": 798.0, "step": 214 }, { "epoch": 0.7678571428571429, - "grad_norm": 102.89736938476562, + "grad_norm": 23632.783203125, "learning_rate": 7.67857142857143e-06, - "loss": 18.6406, + "loss": 824.0, "step": 215 }, { "epoch": 0.7714285714285715, - "grad_norm": 148.74058532714844, + "grad_norm": 25266.703125, "learning_rate": 7.714285714285716e-06, - "loss": 20.6016, + "loss": 816.5, "step": 216 }, { "epoch": 0.775, - "grad_norm": 114.43161010742188, + "grad_norm": 27692.029296875, "learning_rate": 7.75e-06, - "loss": 20.0156, + "loss": 898.25, "step": 217 }, { "epoch": 0.7785714285714286, - "grad_norm": 92.57781982421875, + "grad_norm": 22708.42578125, "learning_rate": 7.785714285714287e-06, - "loss": 15.9453, + "loss": 812.0, "step": 218 }, { "epoch": 0.7821428571428571, - "grad_norm": 102.472412109375, + "grad_norm": 23687.54296875, "learning_rate": 7.821428571428571e-06, - "loss": 17.1094, + "loss": 956.5, "step": 219 }, { "epoch": 0.7857142857142857, - "grad_norm": 104.74394226074219, + "grad_norm": 24890.970703125, "learning_rate": 7.857142857142858e-06, - "loss": 18.25, + "loss": 801.5, "step": 220 }, { "epoch": 0.7892857142857143, - "grad_norm": 109.19405364990234, + "grad_norm": 21245.04296875, "learning_rate": 7.892857142857144e-06, - "loss": 18.9219, + "loss": 729.875, "step": 221 }, { "epoch": 0.7928571428571428, - "grad_norm": 92.25491333007812, + "grad_norm": 24234.861328125, "learning_rate": 7.928571428571429e-06, - "loss": 18.8984, + "loss": 835.75, "step": 222 }, { "epoch": 0.7964285714285714, - "grad_norm": 84.87105560302734, + "grad_norm": 22802.3046875, "learning_rate": 7.964285714285715e-06, - "loss": 13.3281, + "loss": 879.5, "step": 223 }, { "epoch": 0.8, - "grad_norm": 99.42977905273438, + "grad_norm": 18312.732421875, "learning_rate": 8.000000000000001e-06, - "loss": 18.1562, + "loss": 690.875, "step": 224 }, { "epoch": 0.8035714285714286, - "grad_norm": 95.29884338378906, + "grad_norm": 23139.6484375, "learning_rate": 8.035714285714286e-06, - "loss": 16.5, + "loss": 893.5, "step": 225 }, { "epoch": 0.8071428571428572, - "grad_norm": 110.34087371826172, + "grad_norm": 23195.873046875, "learning_rate": 8.071428571428572e-06, - "loss": 18.0391, + "loss": 703.75, "step": 226 }, { "epoch": 0.8107142857142857, - "grad_norm": 100.99089813232422, + "grad_norm": 21371.83203125, "learning_rate": 8.107142857142859e-06, - "loss": 16.9531, + "loss": 710.25, "step": 227 }, { "epoch": 0.8142857142857143, - "grad_norm": 125.95132446289062, + "grad_norm": 21577.89453125, "learning_rate": 8.142857142857143e-06, - "loss": 20.4141, + "loss": 849.5, "step": 228 }, { "epoch": 0.8178571428571428, - "grad_norm": 98.25389099121094, + "grad_norm": 17650.26171875, "learning_rate": 8.17857142857143e-06, - "loss": 20.5859, + "loss": 655.0, "step": 229 }, { "epoch": 0.8214285714285714, - "grad_norm": 89.53688049316406, + "grad_norm": 22850.76953125, "learning_rate": 8.214285714285714e-06, - "loss": 17.7891, + "loss": 717.25, "step": 230 }, { "epoch": 0.825, - "grad_norm": 79.43860626220703, + "grad_norm": 20951.990234375, "learning_rate": 8.25e-06, - "loss": 15.9844, + "loss": 649.25, "step": 231 }, { "epoch": 0.8285714285714286, - "grad_norm": 97.28184509277344, + "grad_norm": 19439.328125, "learning_rate": 8.285714285714287e-06, - "loss": 16.2969, + "loss": 682.5, "step": 232 }, { "epoch": 0.8321428571428572, - "grad_norm": 86.7956314086914, + "grad_norm": 22502.7734375, "learning_rate": 8.321428571428573e-06, - "loss": 17.0078, + "loss": 713.5, "step": 233 }, { "epoch": 0.8357142857142857, - "grad_norm": 81.40847778320312, + "grad_norm": 22971.576171875, "learning_rate": 8.357142857142858e-06, - "loss": 14.4297, + "loss": 611.5, "step": 234 }, { "epoch": 0.8392857142857143, - "grad_norm": 96.3760986328125, + "grad_norm": 20937.8359375, "learning_rate": 8.392857142857144e-06, - "loss": 17.7969, + "loss": 699.25, "step": 235 }, { "epoch": 0.8428571428571429, - "grad_norm": 101.95112609863281, + "grad_norm": 22469.3203125, "learning_rate": 8.428571428571429e-06, - "loss": 16.8594, + "loss": 715.5, "step": 236 }, { "epoch": 0.8464285714285714, - "grad_norm": 81.51654052734375, + "grad_norm": 17387.869140625, "learning_rate": 8.464285714285715e-06, - "loss": 18.7109, + "loss": 506.25, "step": 237 }, { "epoch": 0.85, - "grad_norm": 75.49237823486328, + "grad_norm": 16981.8984375, "learning_rate": 8.5e-06, - "loss": 16.0781, + "loss": 571.75, "step": 238 }, { "epoch": 0.8535714285714285, - "grad_norm": 104.47551727294922, + "grad_norm": 22693.2890625, "learning_rate": 8.535714285714286e-06, - "loss": 18.4844, + "loss": 805.25, "step": 239 }, { "epoch": 0.8571428571428571, - "grad_norm": 82.41503143310547, + "grad_norm": 14894.5986328125, "learning_rate": 8.571428571428571e-06, - "loss": 14.3906, + "loss": 534.125, "step": 240 }, { "epoch": 0.8607142857142858, - "grad_norm": 75.3221206665039, + "grad_norm": 20919.419921875, "learning_rate": 8.607142857142859e-06, - "loss": 15.5234, + "loss": 645.0, "step": 241 }, { "epoch": 0.8642857142857143, - "grad_norm": 99.53022766113281, + "grad_norm": 17273.087890625, "learning_rate": 8.642857142857144e-06, - "loss": 18.0391, + "loss": 498.75, "step": 242 }, { "epoch": 0.8678571428571429, - "grad_norm": 95.59163665771484, + "grad_norm": 18586.759765625, "learning_rate": 8.67857142857143e-06, - "loss": 17.0703, + "loss": 766.25, "step": 243 }, { "epoch": 0.8714285714285714, - "grad_norm": 134.81008911132812, + "grad_norm": 13377.4697265625, "learning_rate": 8.714285714285715e-06, - "loss": 23.0547, + "loss": 450.125, "step": 244 }, { "epoch": 0.875, - "grad_norm": 89.14547729492188, + "grad_norm": 17875.6875, "learning_rate": 8.750000000000001e-06, - "loss": 17.5312, + "loss": 635.25, "step": 245 }, { "epoch": 0.8785714285714286, - "grad_norm": 86.16332244873047, + "grad_norm": 17747.962890625, "learning_rate": 8.785714285714286e-06, - "loss": 15.7656, + "loss": 477.75, "step": 246 }, { "epoch": 0.8821428571428571, - "grad_norm": 86.12939453125, + "grad_norm": 19076.22265625, "learning_rate": 8.821428571428572e-06, - "loss": 16.1641, + "loss": 587.5, "step": 247 }, { "epoch": 0.8857142857142857, - "grad_norm": 96.31692504882812, + "grad_norm": 16513.00390625, "learning_rate": 8.857142857142858e-06, - "loss": 18.0703, + "loss": 656.25, "step": 248 }, { "epoch": 0.8892857142857142, - "grad_norm": 99.14350891113281, + "grad_norm": 17273.529296875, "learning_rate": 8.892857142857143e-06, - "loss": 16.1406, + "loss": 427.0, "step": 249 }, { "epoch": 0.8928571428571429, - "grad_norm": 107.43618774414062, + "grad_norm": 14970.4912109375, "learning_rate": 8.92857142857143e-06, - "loss": 20.0078, + "loss": 469.5, "step": 250 }, { "epoch": 0.8964285714285715, - "grad_norm": 85.40766143798828, + "grad_norm": 13040.6708984375, "learning_rate": 8.964285714285716e-06, - "loss": 19.125, + "loss": 466.375, "step": 251 }, { "epoch": 0.9, - "grad_norm": 92.74526977539062, + "grad_norm": 16325.146484375, "learning_rate": 9e-06, - "loss": 18.8125, + "loss": 530.0, "step": 252 }, { "epoch": 0.9035714285714286, - "grad_norm": 74.12761688232422, + "grad_norm": 15406.16015625, "learning_rate": 9.035714285714287e-06, - "loss": 14.1172, + "loss": 433.5, "step": 253 }, { "epoch": 0.9071428571428571, - "grad_norm": 83.7098617553711, + "grad_norm": 15148.9345703125, "learning_rate": 9.071428571428573e-06, - "loss": 16.7734, + "loss": 485.25, "step": 254 }, { "epoch": 0.9107142857142857, - "grad_norm": 88.60262298583984, + "grad_norm": 14272.46484375, "learning_rate": 9.107142857142858e-06, - "loss": 19.4609, + "loss": 505.5, "step": 255 }, { "epoch": 0.9142857142857143, - "grad_norm": 74.61520385742188, + "grad_norm": 17446.931640625, "learning_rate": 9.142857142857144e-06, - "loss": 16.3203, + "loss": 486.875, "step": 256 }, { "epoch": 0.9178571428571428, - "grad_norm": 80.32473754882812, + "grad_norm": 11477.19140625, "learning_rate": 9.178571428571429e-06, - "loss": 12.8047, + "loss": 358.625, "step": 257 }, { "epoch": 0.9214285714285714, - "grad_norm": 74.33063507080078, + "grad_norm": 14684.0693359375, "learning_rate": 9.214285714285715e-06, - "loss": 14.5391, + "loss": 428.5, "step": 258 }, { "epoch": 0.925, - "grad_norm": 92.73950958251953, + "grad_norm": 15391.5625, "learning_rate": 9.250000000000001e-06, - "loss": 14.7109, + "loss": 433.125, "step": 259 }, { "epoch": 0.9285714285714286, - "grad_norm": 73.21745300292969, + "grad_norm": 16857.1328125, "learning_rate": 9.285714285714288e-06, - "loss": 14.5859, + "loss": 462.625, "step": 260 }, { "epoch": 0.9321428571428572, - "grad_norm": 95.37288665771484, + "grad_norm": 16069.93359375, "learning_rate": 9.321428571428572e-06, - "loss": 19.4922, + "loss": 460.875, "step": 261 }, { "epoch": 0.9357142857142857, - "grad_norm": 93.61527252197266, + "grad_norm": 12793.6064453125, "learning_rate": 9.357142857142859e-06, - "loss": 19.2266, + "loss": 373.8125, "step": 262 }, { "epoch": 0.9392857142857143, - "grad_norm": 114.50502014160156, + "grad_norm": 14879.3154296875, "learning_rate": 9.392857142857143e-06, - "loss": 21.0391, + "loss": 439.125, "step": 263 }, { "epoch": 0.9428571428571428, - "grad_norm": 72.6736831665039, + "grad_norm": 10189.052734375, "learning_rate": 9.42857142857143e-06, - "loss": 16.5312, + "loss": 379.5, "step": 264 }, { "epoch": 0.9464285714285714, - "grad_norm": 86.33624267578125, + "grad_norm": 14227.330078125, "learning_rate": 9.464285714285714e-06, - "loss": 17.1484, + "loss": 452.5, "step": 265 }, { "epoch": 0.95, - "grad_norm": 92.04947662353516, + "grad_norm": 12111.03515625, "learning_rate": 9.5e-06, - "loss": 18.2812, + "loss": 377.5, "step": 266 }, { "epoch": 0.9535714285714286, - "grad_norm": 103.0252685546875, + "grad_norm": 10455.41796875, "learning_rate": 9.535714285714287e-06, - "loss": 19.9141, + "loss": 347.0625, "step": 267 }, { "epoch": 0.9571428571428572, - "grad_norm": 70.00375366210938, + "grad_norm": 13098.6826171875, "learning_rate": 9.571428571428573e-06, - "loss": 14.7812, + "loss": 380.25, "step": 268 }, { "epoch": 0.9607142857142857, - "grad_norm": 92.1401138305664, + "grad_norm": 12830.8662109375, "learning_rate": 9.607142857142858e-06, - "loss": 17.1797, + "loss": 364.75, "step": 269 }, { "epoch": 0.9642857142857143, - "grad_norm": 80.43658447265625, + "grad_norm": 12711.3046875, "learning_rate": 9.642857142857144e-06, - "loss": 15.7109, + "loss": 551.875, "step": 270 }, { "epoch": 0.9678571428571429, - "grad_norm": 102.74592590332031, + "grad_norm": 12514.1494140625, "learning_rate": 9.678571428571429e-06, - "loss": 25.3594, + "loss": 405.625, "step": 271 }, { "epoch": 0.9714285714285714, - "grad_norm": 83.2950668334961, + "grad_norm": 13504.0927734375, "learning_rate": 9.714285714285715e-06, - "loss": 18.4922, + "loss": 376.75, "step": 272 }, { "epoch": 0.975, - "grad_norm": 70.38327026367188, + "grad_norm": 11305.9609375, "learning_rate": 9.75e-06, - "loss": 16.2344, + "loss": 303.25, "step": 273 }, { "epoch": 0.9785714285714285, - "grad_norm": 90.3486557006836, + "grad_norm": 11952.7431640625, "learning_rate": 9.785714285714286e-06, - "loss": 16.0625, + "loss": 324.25, "step": 274 }, { "epoch": 0.9821428571428571, - "grad_norm": 76.84796142578125, + "grad_norm": 12138.9736328125, "learning_rate": 9.821428571428573e-06, - "loss": 17.1328, + "loss": 375.9375, "step": 275 }, { "epoch": 0.9857142857142858, - "grad_norm": 78.52225494384766, + "grad_norm": 12093.4111328125, "learning_rate": 9.857142857142859e-06, - "loss": 16.2188, + "loss": 333.5, "step": 276 }, { "epoch": 0.9892857142857143, - "grad_norm": 65.8594741821289, + "grad_norm": 9759.865234375, "learning_rate": 9.892857142857143e-06, - "loss": 13.8984, + "loss": 303.25, "step": 277 }, { "epoch": 0.9928571428571429, - "grad_norm": 68.17979431152344, + "grad_norm": 11395.50390625, "learning_rate": 9.92857142857143e-06, - "loss": 13.7812, + "loss": 289.375, "step": 278 }, { "epoch": 0.9964285714285714, - "grad_norm": 74.56209564208984, + "grad_norm": 11235.5107421875, "learning_rate": 9.964285714285714e-06, - "loss": 17.3594, + "loss": 395.375, "step": 279 }, { "epoch": 1.0, - "grad_norm": 94.02787017822266, + "grad_norm": 12736.294921875, "learning_rate": 1e-05, - "loss": 16.8047, + "loss": 428.5, "step": 280 }, { "epoch": 1.0, - "eval_loss": 16.45554542541504, - "eval_mse": 16.459409681935433, - "eval_runtime": 11.5104, - "eval_samples_per_second": 246.734, - "eval_steps_per_second": 1.303, - "eval_target_0_mse": 27.76799211944593, - "eval_target_1_mse": 15.505072080667183, - "eval_target_2_mse": 11.161297277536558, - "eval_target_3_mse": 11.403277250092078, + "eval_loss": 347.7662048339844, + "eval_mse": 347.8058602778353, + "eval_runtime": 11.5661, + "eval_samples_per_second": 245.544, + "eval_steps_per_second": 1.297, + "eval_target_0_mse": 633.5776236437076, + "eval_target_1_mse": 263.57286043847563, + "eval_target_2_mse": 370.2541447403749, + "eval_target_3_mse": 123.81881228878296, "step": 280 }, { "epoch": 1.0035714285714286, - "grad_norm": 96.48439025878906, + "grad_norm": 11203.009765625, "learning_rate": 9.999996114574232e-06, - "loss": 17.25, + "loss": 249.375, "step": 281 }, { "epoch": 1.0071428571428571, - "grad_norm": 89.92938995361328, + "grad_norm": 8239.037109375, "learning_rate": 9.99998445830296e-06, - "loss": 17.7266, + "loss": 290.5625, "step": 282 }, { "epoch": 1.0107142857142857, - "grad_norm": 73.15606689453125, + "grad_norm": 8889.88671875, "learning_rate": 9.999965031204306e-06, - "loss": 18.5156, + "loss": 257.125, "step": 283 }, { "epoch": 1.0142857142857142, - "grad_norm": 80.01873779296875, + "grad_norm": 10844.080078125, "learning_rate": 9.999937833308459e-06, - "loss": 15.4688, + "loss": 307.375, "step": 284 }, { "epoch": 1.0178571428571428, - "grad_norm": 74.69602966308594, + "grad_norm": 8675.119140625, "learning_rate": 9.999902864657691e-06, - "loss": 15.2734, + "loss": 288.75, "step": 285 }, { "epoch": 1.0214285714285714, - "grad_norm": 86.97518157958984, + "grad_norm": 14556.5224609375, "learning_rate": 9.99986012530635e-06, - "loss": 20.5, + "loss": 411.0, "step": 286 }, { "epoch": 1.025, - "grad_norm": 72.83181762695312, + "grad_norm": 8876.8232421875, "learning_rate": 9.999809615320857e-06, - "loss": 17.1641, + "loss": 222.625, "step": 287 }, { "epoch": 1.0285714285714285, - "grad_norm": 74.2786636352539, + "grad_norm": 9253.29296875, "learning_rate": 9.999751334779716e-06, - "loss": 20.0078, + "loss": 237.75, "step": 288 }, { "epoch": 1.032142857142857, - "grad_norm": 77.30772399902344, + "grad_norm": 8294.7705078125, "learning_rate": 9.999685283773504e-06, - "loss": 17.75, + "loss": 245.1875, "step": 289 }, { "epoch": 1.0357142857142858, - "grad_norm": 99.25684356689453, + "grad_norm": 8309.5673828125, "learning_rate": 9.999611462404874e-06, - "loss": 14.2266, + "loss": 231.9062, "step": 290 }, { "epoch": 1.0392857142857144, - "grad_norm": 85.888916015625, + "grad_norm": 8937.1982421875, "learning_rate": 9.99952987078856e-06, - "loss": 15.9141, + "loss": 205.9062, "step": 291 }, { "epoch": 1.042857142857143, - "grad_norm": 70.37421417236328, + "grad_norm": 7173.86474609375, "learning_rate": 9.999440509051367e-06, - "loss": 16.5, + "loss": 141.0625, "step": 292 }, { "epoch": 1.0464285714285715, - "grad_norm": 79.93342590332031, + "grad_norm": 9779.6572265625, "learning_rate": 9.99934337733218e-06, - "loss": 19.4766, + "loss": 240.875, "step": 293 }, { "epoch": 1.05, - "grad_norm": 88.60183715820312, + "grad_norm": 11559.123046875, "learning_rate": 9.999238475781957e-06, - "loss": 17.1797, + "loss": 351.5, "step": 294 }, { "epoch": 1.0535714285714286, - "grad_norm": 89.8875503540039, + "grad_norm": 8323.0263671875, "learning_rate": 9.999125804563732e-06, - "loss": 15.6094, + "loss": 209.4688, "step": 295 }, { "epoch": 1.0571428571428572, - "grad_norm": 66.15046691894531, + "grad_norm": 9182.6162109375, "learning_rate": 9.999005363852619e-06, - "loss": 14.8438, + "loss": 258.75, "step": 296 }, { "epoch": 1.0607142857142857, - "grad_norm": 73.1620101928711, + "grad_norm": 4893.7666015625, "learning_rate": 9.998877153835798e-06, - "loss": 17.3125, + "loss": 134.6875, "step": 297 }, { "epoch": 1.0642857142857143, - "grad_norm": 85.9134521484375, + "grad_norm": 7344.5830078125, "learning_rate": 9.998741174712534e-06, - "loss": 17.7422, + "loss": 219.2188, "step": 298 }, { "epoch": 1.0678571428571428, - "grad_norm": 81.31625366210938, + "grad_norm": 8295.7138671875, "learning_rate": 9.998597426694158e-06, - "loss": 16.6953, + "loss": 211.9375, "step": 299 }, { "epoch": 1.0714285714285714, - "grad_norm": 65.39875793457031, + "grad_norm": 9789.7998046875, "learning_rate": 9.998445910004082e-06, - "loss": 16.1562, + "loss": 254.75, "step": 300 }, { "epoch": 1.075, - "grad_norm": 78.95276641845703, + "grad_norm": 10685.927734375, "learning_rate": 9.998286624877786e-06, - "loss": 16.8516, + "loss": 292.0, "step": 301 }, { "epoch": 1.0785714285714285, - "grad_norm": 71.36321258544922, + "grad_norm": 8568.583984375, "learning_rate": 9.99811957156283e-06, - "loss": 13.6875, + "loss": 223.25, "step": 302 }, { "epoch": 1.082142857142857, - "grad_norm": 74.26119232177734, + "grad_norm": 6249.4912109375, "learning_rate": 9.99794475031884e-06, - "loss": 13.1172, + "loss": 165.2812, "step": 303 }, { "epoch": 1.0857142857142856, - "grad_norm": 66.24810028076172, + "grad_norm": 6934.5986328125, "learning_rate": 9.997762161417517e-06, - "loss": 15.8984, + "loss": 156.7188, "step": 304 }, { "epoch": 1.0892857142857142, - "grad_norm": 65.53324890136719, + "grad_norm": 8256.572265625, "learning_rate": 9.997571805142638e-06, - "loss": 17.4844, + "loss": 172.7812, "step": 305 }, { "epoch": 1.092857142857143, - "grad_norm": 87.03485107421875, + "grad_norm": 8154.33642578125, "learning_rate": 9.99737368179005e-06, - "loss": 15.9922, + "loss": 218.25, "step": 306 }, { "epoch": 1.0964285714285715, - "grad_norm": 68.19409942626953, + "grad_norm": 7247.60107421875, "learning_rate": 9.997167791667668e-06, - "loss": 13.0234, + "loss": 167.9375, "step": 307 }, { "epoch": 1.1, - "grad_norm": 108.7411880493164, + "grad_norm": 6042.0498046875, "learning_rate": 9.99695413509548e-06, - "loss": 16.2422, + "loss": 206.6875, "step": 308 }, { "epoch": 1.1035714285714286, - "grad_norm": 93.99723815917969, + "grad_norm": 6101.53515625, "learning_rate": 9.996732712405545e-06, - "loss": 17.5156, + "loss": 136.75, "step": 309 }, { "epoch": 1.1071428571428572, - "grad_norm": 70.84767150878906, + "grad_norm": 6769.41845703125, "learning_rate": 9.996503523941994e-06, - "loss": 16.0156, + "loss": 180.125, "step": 310 }, { "epoch": 1.1107142857142858, - "grad_norm": 82.52654266357422, + "grad_norm": 5267.03857421875, "learning_rate": 9.996266570061022e-06, - "loss": 14.1797, + "loss": 216.2578, "step": 311 }, { "epoch": 1.1142857142857143, - "grad_norm": 92.99340057373047, + "grad_norm": 7919.57080078125, "learning_rate": 9.996021851130897e-06, - "loss": 14.0391, + "loss": 183.625, "step": 312 }, { "epoch": 1.1178571428571429, - "grad_norm": 68.71207427978516, + "grad_norm": 7638.78515625, "learning_rate": 9.995769367531953e-06, - "loss": 15.125, + "loss": 203.25, "step": 313 }, { "epoch": 1.1214285714285714, - "grad_norm": 80.76944732666016, + "grad_norm": 5843.927734375, "learning_rate": 9.995509119656595e-06, - "loss": 16.1641, + "loss": 154.9688, "step": 314 }, { "epoch": 1.125, - "grad_norm": 73.514404296875, + "grad_norm": 6214.84375, "learning_rate": 9.99524110790929e-06, - "loss": 15.4219, + "loss": 176.6172, "step": 315 }, { "epoch": 1.1285714285714286, - "grad_norm": 83.92344665527344, + "grad_norm": 7034.54638671875, "learning_rate": 9.994965332706574e-06, - "loss": 18.9297, + "loss": 194.5625, "step": 316 }, { "epoch": 1.1321428571428571, - "grad_norm": 76.00177764892578, + "grad_norm": 7158.00439453125, "learning_rate": 9.99468179447705e-06, - "loss": 15.4766, + "loss": 232.0, "step": 317 }, { "epoch": 1.1357142857142857, - "grad_norm": 72.25801849365234, + "grad_norm": 5556.1591796875, "learning_rate": 9.994390493661384e-06, - "loss": 16.3672, + "loss": 155.5625, "step": 318 }, { "epoch": 1.1392857142857142, - "grad_norm": 70.91494750976562, + "grad_norm": 5756.087890625, "learning_rate": 9.994091430712307e-06, - "loss": 13.1328, + "loss": 155.8438, "step": 319 }, { "epoch": 1.1428571428571428, - "grad_norm": 68.69308471679688, + "grad_norm": 5221.12158203125, "learning_rate": 9.993784606094612e-06, - "loss": 16.5312, + "loss": 146.3125, "step": 320 }, { "epoch": 1.1464285714285714, - "grad_norm": 67.94559478759766, + "grad_norm": 4459.2451171875, "learning_rate": 9.993470020285161e-06, - "loss": 15.2656, + "loss": 101.3516, "step": 321 }, { "epoch": 1.15, - "grad_norm": 76.75133514404297, + "grad_norm": 6024.9287109375, "learning_rate": 9.993147673772869e-06, - "loss": 14.1211, + "loss": 156.5391, "step": 322 }, { "epoch": 1.1535714285714285, - "grad_norm": 72.26853942871094, + "grad_norm": 7707.34033203125, "learning_rate": 9.992817567058722e-06, - "loss": 14.8984, + "loss": 165.875, "step": 323 }, { "epoch": 1.157142857142857, - "grad_norm": 76.29134368896484, + "grad_norm": 5177.4072265625, "learning_rate": 9.99247970065576e-06, - "loss": 14.9297, + "loss": 89.0938, "step": 324 }, { "epoch": 1.1607142857142858, - "grad_norm": 72.95942687988281, + "grad_norm": 4396.01416015625, "learning_rate": 9.992134075089085e-06, - "loss": 14.7031, + "loss": 112.0625, "step": 325 }, { "epoch": 1.1642857142857144, - "grad_norm": 69.09703826904297, + "grad_norm": 4506.82861328125, "learning_rate": 9.991780690895856e-06, - "loss": 15.0469, + "loss": 114.9062, "step": 326 }, { "epoch": 1.167857142857143, - "grad_norm": 65.2791519165039, + "grad_norm": 5738.66259765625, "learning_rate": 9.991419548625294e-06, - "loss": 16.75, + "loss": 174.2812, "step": 327 }, { "epoch": 1.1714285714285715, - "grad_norm": 68.60404968261719, + "grad_norm": 6642.04931640625, "learning_rate": 9.991050648838676e-06, - "loss": 12.168, + "loss": 116.2969, "step": 328 }, { "epoch": 1.175, - "grad_norm": 90.77375793457031, + "grad_norm": 5424.1396484375, "learning_rate": 9.990673992109335e-06, - "loss": 17.9688, + "loss": 126.6172, "step": 329 }, { "epoch": 1.1785714285714286, - "grad_norm": 84.22817993164062, + "grad_norm": 5690.1943359375, "learning_rate": 9.990289579022661e-06, - "loss": 12.9219, + "loss": 132.4727, "step": 330 }, { "epoch": 1.1821428571428572, - "grad_norm": 75.16531372070312, + "grad_norm": 5434.8779296875, "learning_rate": 9.989897410176093e-06, - "loss": 18.0859, + "loss": 113.0625, "step": 331 }, { "epoch": 1.1857142857142857, - "grad_norm": 65.63926696777344, + "grad_norm": 4840.16259765625, "learning_rate": 9.989497486179132e-06, - "loss": 12.0547, + "loss": 120.0156, "step": 332 }, { "epoch": 1.1892857142857143, - "grad_norm": 73.19938659667969, + "grad_norm": 3887.0908203125, "learning_rate": 9.989089807653329e-06, - "loss": 15.0391, + "loss": 108.1875, "step": 333 }, { "epoch": 1.1928571428571428, - "grad_norm": 59.28972625732422, + "grad_norm": 3818.36474609375, "learning_rate": 9.98867437523228e-06, - "loss": 14.8203, + "loss": 118.7969, "step": 334 }, { "epoch": 1.1964285714285714, - "grad_norm": 92.87635803222656, + "grad_norm": 6250.7724609375, "learning_rate": 9.988251189561645e-06, - "loss": 17.0703, + "loss": 167.5, "step": 335 }, { "epoch": 1.2, - "grad_norm": 76.77757263183594, + "grad_norm": 3988.795654296875, "learning_rate": 9.987820251299121e-06, - "loss": 12.875, + "loss": 100.7617, "step": 336 }, { "epoch": 1.2035714285714285, - "grad_norm": 60.58586120605469, + "grad_norm": 4587.75390625, "learning_rate": 9.987381561114464e-06, - "loss": 12.6797, + "loss": 113.0859, "step": 337 }, { "epoch": 1.207142857142857, - "grad_norm": 73.91436767578125, + "grad_norm": 5280.708984375, "learning_rate": 9.986935119689469e-06, - "loss": 16.0625, + "loss": 142.25, "step": 338 }, { "epoch": 1.2107142857142856, - "grad_norm": 59.389591217041016, + "grad_norm": 4789.30810546875, "learning_rate": 9.986480927717986e-06, - "loss": 14.2031, + "loss": 108.1172, "step": 339 }, { "epoch": 1.2142857142857142, - "grad_norm": 64.7526626586914, + "grad_norm": 4502.4921875, "learning_rate": 9.986018985905901e-06, - "loss": 14.6406, + "loss": 80.9062, "step": 340 }, { "epoch": 1.217857142857143, - "grad_norm": 80.5587158203125, + "grad_norm": 4086.18408203125, "learning_rate": 9.985549294971157e-06, - "loss": 17.3203, + "loss": 93.4727, "step": 341 }, { "epoch": 1.2214285714285715, - "grad_norm": 75.2529525756836, + "grad_norm": 6365.3095703125, "learning_rate": 9.98507185564373e-06, - "loss": 14.9922, + "loss": 160.1172, "step": 342 }, { "epoch": 1.225, - "grad_norm": 69.25627136230469, + "grad_norm": 4317.46630859375, "learning_rate": 9.984586668665641e-06, - "loss": 13.0625, + "loss": 131.125, "step": 343 }, { "epoch": 1.2285714285714286, - "grad_norm": 65.46245574951172, + "grad_norm": 4163.40380859375, "learning_rate": 9.984093734790955e-06, - "loss": 15.5703, + "loss": 127.2266, "step": 344 }, { "epoch": 1.2321428571428572, - "grad_norm": 80.66325378417969, + "grad_norm": 3753.249755859375, "learning_rate": 9.983593054785776e-06, - "loss": 15.4844, + "loss": 64.9531, "step": 345 }, { "epoch": 1.2357142857142858, - "grad_norm": 71.53549194335938, + "grad_norm": 4974.337890625, "learning_rate": 9.983084629428244e-06, - "loss": 14.5625, + "loss": 109.9062, "step": 346 }, { "epoch": 1.2392857142857143, - "grad_norm": 72.56149291992188, + "grad_norm": 3194.146484375, "learning_rate": 9.98256845950854e-06, - "loss": 16.8984, + "loss": 93.7344, "step": 347 }, { "epoch": 1.2428571428571429, - "grad_norm": 99.11085510253906, + "grad_norm": 3322.313232421875, "learning_rate": 9.98204454582888e-06, - "loss": 19.0391, + "loss": 89.8906, "step": 348 }, { "epoch": 1.2464285714285714, - "grad_norm": 59.95326614379883, + "grad_norm": 3412.546142578125, "learning_rate": 9.981512889203515e-06, - "loss": 14.8516, + "loss": 71.4219, "step": 349 }, { "epoch": 1.25, - "grad_norm": 58.47057342529297, + "grad_norm": 2498.410400390625, "learning_rate": 9.980973490458728e-06, - "loss": 13.1406, + "loss": 38.3633, "step": 350 }, { "epoch": 1.2535714285714286, - "grad_norm": 55.850589752197266, + "grad_norm": 2873.92919921875, "learning_rate": 9.98042635043284e-06, - "loss": 11.5547, + "loss": 56.0625, "step": 351 }, { "epoch": 1.2571428571428571, - "grad_norm": 60.66499328613281, + "grad_norm": 4673.09619140625, "learning_rate": 9.979871469976197e-06, - "loss": 12.9609, + "loss": 151.8281, "step": 352 }, { "epoch": 1.2607142857142857, - "grad_norm": 63.89755630493164, + "grad_norm": 4280.66650390625, "learning_rate": 9.979308849951177e-06, - "loss": 12.3594, + "loss": 85.1562, "step": 353 }, { "epoch": 1.2642857142857142, - "grad_norm": 61.41006851196289, + "grad_norm": 3086.531982421875, "learning_rate": 9.978738491232191e-06, - "loss": 13.8594, + "loss": 89.2109, "step": 354 }, { "epoch": 1.2678571428571428, - "grad_norm": 86.45226287841797, + "grad_norm": 3903.843505859375, "learning_rate": 9.978160394705669e-06, - "loss": 16.0469, + "loss": 74.875, "step": 355 }, { "epoch": 1.2714285714285714, - "grad_norm": 66.66681671142578, + "grad_norm": 2326.87744140625, "learning_rate": 9.977574561270075e-06, - "loss": 14.0156, + "loss": 66.4688, "step": 356 }, { "epoch": 1.275, - "grad_norm": 79.0351333618164, + "grad_norm": 3560.80224609375, "learning_rate": 9.976980991835896e-06, - "loss": 13.1484, + "loss": 82.7227, "step": 357 }, { "epoch": 1.2785714285714285, - "grad_norm": 68.78445434570312, + "grad_norm": 2009.2374267578125, "learning_rate": 9.976379687325633e-06, - "loss": 14.0859, + "loss": 49.0859, "step": 358 }, { "epoch": 1.282142857142857, - "grad_norm": 77.4033432006836, + "grad_norm": 3895.22509765625, "learning_rate": 9.975770648673821e-06, - "loss": 13.9609, + "loss": 103.4844, "step": 359 }, { "epoch": 1.2857142857142856, - "grad_norm": 63.17131042480469, + "grad_norm": 2713.777099609375, "learning_rate": 9.975153876827008e-06, - "loss": 14.9297, + "loss": 51.8477, "step": 360 }, { "epoch": 1.2892857142857144, - "grad_norm": 71.58943939208984, + "grad_norm": 3304.460205078125, "learning_rate": 9.974529372743762e-06, - "loss": 16.0469, + "loss": 96.1094, "step": 361 }, { "epoch": 1.292857142857143, - "grad_norm": 69.42976379394531, + "grad_norm": 3778.38623046875, "learning_rate": 9.97389713739467e-06, - "loss": 13.9766, + "loss": 56.75, "step": 362 }, { "epoch": 1.2964285714285715, - "grad_norm": 68.25914001464844, + "grad_norm": 4105.6650390625, "learning_rate": 9.973257171762334e-06, - "loss": 14.5, + "loss": 99.3594, "step": 363 }, { "epoch": 1.3, - "grad_norm": 70.60839080810547, + "grad_norm": 3779.646728515625, "learning_rate": 9.972609476841368e-06, - "loss": 15.2344, + "loss": 75.7969, "step": 364 }, { "epoch": 1.3035714285714286, - "grad_norm": 77.7553482055664, + "grad_norm": 3980.998291015625, "learning_rate": 9.9719540536384e-06, - "loss": 15.2578, + "loss": 89.3438, "step": 365 }, { "epoch": 1.3071428571428572, - "grad_norm": 88.90721130371094, + "grad_norm": 2299.5966796875, "learning_rate": 9.97129090317207e-06, - "loss": 20.8828, + "loss": 68.6094, "step": 366 }, { "epoch": 1.3107142857142857, - "grad_norm": 77.04537963867188, + "grad_norm": 2828.774169921875, "learning_rate": 9.970620026473028e-06, - "loss": 14.2031, + "loss": 42.9219, "step": 367 }, { "epoch": 1.3142857142857143, - "grad_norm": 67.82777404785156, + "grad_norm": 3052.263916015625, "learning_rate": 9.969941424583926e-06, - "loss": 16.6406, + "loss": 112.6953, "step": 368 }, { "epoch": 1.3178571428571428, - "grad_norm": 66.92823028564453, + "grad_norm": 2110.2109375, "learning_rate": 9.969255098559434e-06, - "loss": 16.5391, + "loss": 58.4453, "step": 369 }, { "epoch": 1.3214285714285714, - "grad_norm": 73.28927612304688, + "grad_norm": 3233.62841796875, "learning_rate": 9.968561049466214e-06, - "loss": 13.293, + "loss": 77.4297, "step": 370 }, { "epoch": 1.325, - "grad_norm": 92.24927520751953, + "grad_norm": 3164.963134765625, "learning_rate": 9.967859278382939e-06, - "loss": 16.9453, + "loss": 40.6953, "step": 371 }, { "epoch": 1.3285714285714285, - "grad_norm": 68.39715576171875, + "grad_norm": 4477.97216796875, "learning_rate": 9.967149786400278e-06, - "loss": 13.5234, + "loss": 121.1953, "step": 372 }, { "epoch": 1.332142857142857, - "grad_norm": 78.75071716308594, + "grad_norm": 2896.68017578125, "learning_rate": 9.966432574620906e-06, - "loss": 14.6719, + "loss": 74.6875, "step": 373 }, { "epoch": 1.3357142857142856, - "grad_norm": 60.6776123046875, + "grad_norm": 1990.000244140625, "learning_rate": 9.965707644159492e-06, - "loss": 13.625, + "loss": 61.5312, "step": 374 }, { "epoch": 1.3392857142857144, - "grad_norm": 74.01213073730469, + "grad_norm": 3974.928955078125, "learning_rate": 9.964974996142699e-06, - "loss": 14.3047, + "loss": 77.2031, "step": 375 }, { "epoch": 1.342857142857143, - "grad_norm": 60.50319290161133, + "grad_norm": 2968.148193359375, "learning_rate": 9.964234631709188e-06, - "loss": 13.7109, + "loss": 53.8828, "step": 376 }, { "epoch": 1.3464285714285715, - "grad_norm": 72.34317779541016, + "grad_norm": 2551.07861328125, "learning_rate": 9.96348655200961e-06, - "loss": 15.1641, + "loss": 48.5195, "step": 377 }, { "epoch": 1.35, - "grad_norm": 70.07368469238281, + "grad_norm": 2384.084716796875, "learning_rate": 9.962730758206612e-06, - "loss": 12.9922, + "loss": 63.7344, "step": 378 }, { "epoch": 1.3535714285714286, - "grad_norm": 67.06788635253906, + "grad_norm": 2091.879638671875, "learning_rate": 9.961967251474823e-06, - "loss": 15.2734, + "loss": 45.9297, "step": 379 }, { "epoch": 1.3571428571428572, - "grad_norm": 76.16040802001953, + "grad_norm": 2729.066162109375, "learning_rate": 9.961196033000862e-06, - "loss": 16.7344, + "loss": 52.6328, "step": 380 }, { "epoch": 1.3607142857142858, - "grad_norm": 65.53759765625, + "grad_norm": 2769.593994140625, "learning_rate": 9.960417103983335e-06, - "loss": 12.8125, + "loss": 50.7852, "step": 381 }, { "epoch": 1.3642857142857143, - "grad_norm": 77.16561126708984, + "grad_norm": 1611.6575927734375, "learning_rate": 9.959630465632833e-06, - "loss": 18.2188, + "loss": 34.3672, "step": 382 }, { "epoch": 1.3678571428571429, - "grad_norm": 71.51244354248047, + "grad_norm": 2532.72314453125, "learning_rate": 9.95883611917192e-06, - "loss": 16.625, + "loss": 65.5, "step": 383 }, { "epoch": 1.3714285714285714, - "grad_norm": 77.27005767822266, + "grad_norm": 2208.116455078125, "learning_rate": 9.958034065835151e-06, - "loss": 15.8516, + "loss": 58.6016, "step": 384 }, { "epoch": 1.375, - "grad_norm": 60.978599548339844, + "grad_norm": 2299.150390625, "learning_rate": 9.957224306869053e-06, - "loss": 13.0469, + "loss": 41.3633, "step": 385 }, { "epoch": 1.3785714285714286, - "grad_norm": 79.08712005615234, + "grad_norm": 1293.28955078125, "learning_rate": 9.956406843532128e-06, - "loss": 15.5, + "loss": 19.8047, "step": 386 }, { "epoch": 1.3821428571428571, - "grad_norm": 69.45923614501953, + "grad_norm": 2699.1650390625, "learning_rate": 9.955581677094851e-06, - "loss": 16.8906, + "loss": 83.4609, "step": 387 }, { "epoch": 1.3857142857142857, - "grad_norm": 98.14091491699219, + "grad_norm": 2260.285888671875, "learning_rate": 9.954748808839675e-06, - "loss": 18.6875, + "loss": 67.7109, "step": 388 }, { "epoch": 1.3892857142857142, - "grad_norm": 70.88829803466797, + "grad_norm": 1547.4879150390625, "learning_rate": 9.953908240061016e-06, - "loss": 16.4062, + "loss": 20.1055, "step": 389 }, { "epoch": 1.3928571428571428, - "grad_norm": 63.097434997558594, + "grad_norm": 3047.328125, "learning_rate": 9.953059972065264e-06, - "loss": 16.2734, + "loss": 95.75, "step": 390 }, { "epoch": 1.3964285714285714, - "grad_norm": 76.92569732666016, + "grad_norm": 1244.9755859375, "learning_rate": 9.952204006170771e-06, - "loss": 14.4453, + "loss": 19.0527, "step": 391 }, { "epoch": 1.4, - "grad_norm": 71.4322738647461, + "grad_norm": 2506.983642578125, "learning_rate": 9.951340343707852e-06, - "loss": 15.3906, + "loss": 63.1562, "step": 392 }, { "epoch": 1.4035714285714285, - "grad_norm": 82.21317291259766, + "grad_norm": 3441.695068359375, "learning_rate": 9.950468986018789e-06, - "loss": 14.6875, + "loss": 70.8789, "step": 393 }, { "epoch": 1.407142857142857, - "grad_norm": 64.30828094482422, + "grad_norm": 1669.4014892578125, "learning_rate": 9.949589934457815e-06, - "loss": 14.7344, + "loss": 33.0078, "step": 394 }, { "epoch": 1.4107142857142856, - "grad_norm": 61.286376953125, + "grad_norm": 1171.032470703125, "learning_rate": 9.948703190391131e-06, - "loss": 13.8594, + "loss": 34.8633, "step": 395 }, { "epoch": 1.4142857142857144, - "grad_norm": 68.73880004882812, + "grad_norm": 3057.765380859375, "learning_rate": 9.947808755196886e-06, - "loss": 15.0312, + "loss": 74.3086, "step": 396 }, { "epoch": 1.417857142857143, - "grad_norm": 73.20220947265625, + "grad_norm": 2590.413330078125, "learning_rate": 9.946906630265184e-06, - "loss": 15.6016, + "loss": 73.625, "step": 397 }, { "epoch": 1.4214285714285715, - "grad_norm": 63.83852005004883, + "grad_norm": 2334.88232421875, "learning_rate": 9.945996816998082e-06, - "loss": 15.9883, + "loss": 43.6855, "step": 398 }, { "epoch": 1.425, - "grad_norm": 70.45362854003906, + "grad_norm": 2801.4169921875, "learning_rate": 9.945079316809585e-06, - "loss": 16.0547, + "loss": 60.2109, "step": 399 }, { "epoch": 1.4285714285714286, - "grad_norm": 105.73292541503906, + "grad_norm": 2209.259033203125, "learning_rate": 9.944154131125643e-06, - "loss": 18.3984, + "loss": 74.3223, "step": 400 }, { "epoch": 1.4321428571428572, - "grad_norm": 65.84974670410156, + "grad_norm": 2347.47021484375, "learning_rate": 9.943221261384155e-06, - "loss": 13.7266, + "loss": 41.8281, "step": 401 }, { "epoch": 1.4357142857142857, - "grad_norm": 78.2030029296875, + "grad_norm": 1365.155029296875, "learning_rate": 9.942280709034954e-06, - "loss": 17.0, + "loss": 50.2812, "step": 402 }, { "epoch": 1.4392857142857143, - "grad_norm": 64.22380828857422, + "grad_norm": 1975.1627197265625, "learning_rate": 9.941332475539826e-06, - "loss": 15.3164, + "loss": 54.5254, "step": 403 }, { "epoch": 1.4428571428571428, - "grad_norm": 61.12931442260742, + "grad_norm": 2279.263916015625, "learning_rate": 9.940376562372482e-06, - "loss": 12.5703, + "loss": 71.7969, "step": 404 }, { "epoch": 1.4464285714285714, - "grad_norm": 64.36003875732422, + "grad_norm": 1792.5882568359375, "learning_rate": 9.939412971018574e-06, - "loss": 11.9375, + "loss": 27.3594, "step": 405 }, { "epoch": 1.45, - "grad_norm": 60.016326904296875, + "grad_norm": 1565.5513916015625, "learning_rate": 9.938441702975689e-06, - "loss": 10.8828, + "loss": 30.9727, "step": 406 }, { "epoch": 1.4535714285714285, - "grad_norm": 73.8875503540039, + "grad_norm": 2725.62744140625, "learning_rate": 9.937462759753343e-06, - "loss": 16.9844, + "loss": 75.3945, "step": 407 }, { "epoch": 1.457142857142857, - "grad_norm": 61.275062561035156, + "grad_norm": 1409.57373046875, "learning_rate": 9.936476142872979e-06, - "loss": 12.8516, + "loss": 23.7344, "step": 408 }, { "epoch": 1.4607142857142856, - "grad_norm": 64.1805191040039, + "grad_norm": 2241.506591796875, "learning_rate": 9.93548185386797e-06, - "loss": 13.25, + "loss": 46.4414, "step": 409 }, { "epoch": 1.4642857142857144, - "grad_norm": 71.62825012207031, + "grad_norm": 1719.4794921875, "learning_rate": 9.934479894283607e-06, - "loss": 16.625, + "loss": 29.5859, "step": 410 }, { "epoch": 1.467857142857143, - "grad_norm": 83.94905090332031, + "grad_norm": 2002.0841064453125, "learning_rate": 9.933470265677107e-06, - "loss": 13.7812, + "loss": 43.3281, "step": 411 }, { "epoch": 1.4714285714285715, - "grad_norm": 80.55525970458984, + "grad_norm": 1425.798828125, "learning_rate": 9.932452969617607e-06, - "loss": 19.2656, + "loss": 56.2266, "step": 412 }, { "epoch": 1.475, - "grad_norm": 76.33503723144531, + "grad_norm": 1546.5391845703125, "learning_rate": 9.931428007686158e-06, - "loss": 13.0234, + "loss": 31.6367, "step": 413 }, { "epoch": 1.4785714285714286, - "grad_norm": 73.42916870117188, + "grad_norm": 2006.9598388671875, "learning_rate": 9.930395381475723e-06, - "loss": 15.9688, + "loss": 57.375, "step": 414 }, { "epoch": 1.4821428571428572, - "grad_norm": 91.70386505126953, + "grad_norm": 1218.7559814453125, "learning_rate": 9.92935509259118e-06, - "loss": 14.375, + "loss": 30.5977, "step": 415 }, { "epoch": 1.4857142857142858, - "grad_norm": 62.46748352050781, + "grad_norm": 1769.1646728515625, "learning_rate": 9.928307142649315e-06, - "loss": 14.6562, + "loss": 58.3457, "step": 416 }, { "epoch": 1.4892857142857143, - "grad_norm": 77.73207092285156, + "grad_norm": 2074.3671875, "learning_rate": 9.927251533278823e-06, - "loss": 14.7656, + "loss": 59.4062, "step": 417 }, { "epoch": 1.4928571428571429, - "grad_norm": 56.19773864746094, + "grad_norm": 874.3988037109375, "learning_rate": 9.926188266120297e-06, - "loss": 11.2734, + "loss": 13.1953, "step": 418 }, { "epoch": 1.4964285714285714, - "grad_norm": 67.06712341308594, + "grad_norm": 1742.0382080078125, "learning_rate": 9.925117342826239e-06, - "loss": 14.0625, + "loss": 44.1328, "step": 419 }, { "epoch": 1.5, - "grad_norm": 59.32160568237305, + "grad_norm": 748.84130859375, "learning_rate": 9.924038765061042e-06, - "loss": 11.1328, + "loss": 10.2383, "step": 420 }, { "epoch": 1.5035714285714286, - "grad_norm": 68.36906433105469, + "grad_norm": 1448.7000732421875, "learning_rate": 9.922952534501002e-06, - "loss": 12.6875, + "loss": 40.4375, "step": 421 }, { "epoch": 1.5071428571428571, - "grad_norm": 66.32378387451172, + "grad_norm": 1774.9552001953125, "learning_rate": 9.921858652834306e-06, - "loss": 15.8984, + "loss": 40.5781, "step": 422 }, { "epoch": 1.5107142857142857, - "grad_norm": 72.64904022216797, + "grad_norm": 1131.154296875, "learning_rate": 9.920757121761033e-06, - "loss": 16.7578, + "loss": 29.3359, "step": 423 }, { "epoch": 1.5142857142857142, - "grad_norm": 51.21018600463867, + "grad_norm": 1577.4127197265625, "learning_rate": 9.91964794299315e-06, - "loss": 10.4531, + "loss": 43.7617, "step": 424 }, { "epoch": 1.5178571428571428, - "grad_norm": 70.82195281982422, + "grad_norm": 2438.007568359375, "learning_rate": 9.918531118254507e-06, - "loss": 13.8203, + "loss": 46.043, "step": 425 }, { "epoch": 1.5214285714285714, - "grad_norm": 65.88298797607422, + "grad_norm": 945.3380737304688, "learning_rate": 9.917406649280843e-06, - "loss": 14.6016, + "loss": 25.1875, "step": 426 }, { "epoch": 1.525, - "grad_norm": 77.01053619384766, + "grad_norm": 1807.56103515625, "learning_rate": 9.916274537819774e-06, - "loss": 14.7266, + "loss": 35.8086, "step": 427 }, { "epoch": 1.5285714285714285, - "grad_norm": 67.13703155517578, + "grad_norm": 1233.183349609375, "learning_rate": 9.915134785630793e-06, - "loss": 14.6797, + "loss": 21.7422, "step": 428 }, { "epoch": 1.532142857142857, - "grad_norm": 53.29026412963867, + "grad_norm": 377.82257080078125, "learning_rate": 9.913987394485268e-06, - "loss": 13.8203, + "loss": 11.5742, "step": 429 }, { "epoch": 1.5357142857142856, - "grad_norm": 72.39093017578125, + "grad_norm": 2343.8466796875, "learning_rate": 9.912832366166443e-06, - "loss": 15.0703, + "loss": 70.3203, "step": 430 }, { "epoch": 1.5392857142857141, - "grad_norm": 115.60740661621094, + "grad_norm": 1628.3466796875, "learning_rate": 9.911669702469425e-06, - "loss": 18.6797, + "loss": 58.3086, "step": 431 }, { "epoch": 1.5428571428571427, - "grad_norm": 69.79154205322266, + "grad_norm": 1873.8726806640625, "learning_rate": 9.910499405201195e-06, - "loss": 14.375, + "loss": 29.2734, "step": 432 }, { "epoch": 1.5464285714285713, - "grad_norm": 55.368431091308594, + "grad_norm": 2067.472412109375, "learning_rate": 9.909321476180594e-06, - "loss": 16.5391, + "loss": 43.1992, "step": 433 }, { "epoch": 1.55, - "grad_norm": 104.6972885131836, + "grad_norm": 1328.9373779296875, "learning_rate": 9.908135917238321e-06, - "loss": 16.7578, + "loss": 22.9453, "step": 434 }, { "epoch": 1.5535714285714286, - "grad_norm": 80.23822021484375, + "grad_norm": 1920.828125, "learning_rate": 9.90694273021694e-06, - "loss": 13.6641, + "loss": 44.1445, "step": 435 }, { "epoch": 1.5571428571428572, - "grad_norm": 77.43599700927734, + "grad_norm": 1758.8486328125, "learning_rate": 9.905741916970863e-06, - "loss": 18.4219, + "loss": 46.1172, "step": 436 }, { "epoch": 1.5607142857142857, - "grad_norm": 61.506351470947266, + "grad_norm": 1688.185302734375, "learning_rate": 9.904533479366364e-06, - "loss": 14.2109, + "loss": 25.9531, "step": 437 }, { "epoch": 1.5642857142857143, - "grad_norm": 60.63215255737305, + "grad_norm": 1698.635498046875, "learning_rate": 9.903317419281557e-06, - "loss": 12.1875, + "loss": 37.0625, "step": 438 }, { "epoch": 1.5678571428571428, - "grad_norm": 58.2626838684082, + "grad_norm": 1835.8955078125, "learning_rate": 9.902093738606405e-06, - "loss": 10.8906, + "loss": 31.7109, "step": 439 }, { "epoch": 1.5714285714285714, - "grad_norm": 63.255191802978516, + "grad_norm": 916.6587524414062, "learning_rate": 9.900862439242719e-06, - "loss": 15.0469, + "loss": 16.875, "step": 440 }, { "epoch": 1.575, - "grad_norm": 59.10268783569336, + "grad_norm": 1504.1815185546875, "learning_rate": 9.899623523104149e-06, - "loss": 10.9375, + "loss": 17.0352, "step": 441 }, { "epoch": 1.5785714285714287, - "grad_norm": 64.71354675292969, + "grad_norm": 1168.5439453125, "learning_rate": 9.898376992116179e-06, - "loss": 15.0156, + "loss": 29.457, "step": 442 }, { "epoch": 1.5821428571428573, - "grad_norm": 55.60033416748047, + "grad_norm": 1242.0263671875, "learning_rate": 9.897122848216131e-06, - "loss": 12.6719, + "loss": 25.7461, "step": 443 }, { "epoch": 1.5857142857142859, - "grad_norm": 55.70918655395508, + "grad_norm": 1354.6993408203125, "learning_rate": 9.895861093353159e-06, - "loss": 11.5977, + "loss": 20.2578, "step": 444 }, { "epoch": 1.5892857142857144, - "grad_norm": 64.78665161132812, + "grad_norm": 1570.2513427734375, "learning_rate": 9.894591729488243e-06, - "loss": 13.125, + "loss": 19.6914, "step": 445 }, { "epoch": 1.592857142857143, - "grad_norm": 69.9896469116211, + "grad_norm": 1027.1549072265625, "learning_rate": 9.893314758594192e-06, - "loss": 11.8633, + "loss": 13.2383, "step": 446 }, { "epoch": 1.5964285714285715, - "grad_norm": 64.56148529052734, + "grad_norm": 968.5897827148438, "learning_rate": 9.892030182655638e-06, - "loss": 17.8672, + "loss": 22.6855, "step": 447 }, { "epoch": 1.6, - "grad_norm": 69.98963165283203, + "grad_norm": 813.89013671875, "learning_rate": 9.890738003669029e-06, - "loss": 12.6836, + "loss": 13.2578, "step": 448 }, { "epoch": 1.6035714285714286, - "grad_norm": 73.51358032226562, + "grad_norm": 1217.337646484375, "learning_rate": 9.889438223642632e-06, - "loss": 14.75, + "loss": 34.8789, "step": 449 }, { "epoch": 1.6071428571428572, - "grad_norm": 55.47947692871094, + "grad_norm": 721.6350708007812, "learning_rate": 9.888130844596525e-06, - "loss": 12.9609, + "loss": 13.2109, "step": 450 }, { "epoch": 1.6107142857142858, - "grad_norm": 57.51869201660156, + "grad_norm": 397.8067321777344, "learning_rate": 9.886815868562596e-06, - "loss": 13.5039, + "loss": 23.9453, "step": 451 }, { "epoch": 1.6142857142857143, - "grad_norm": 60.760433197021484, + "grad_norm": 262.0512390136719, "learning_rate": 9.885493297584548e-06, - "loss": 13.7266, + "loss": 12.4531, "step": 452 }, { "epoch": 1.6178571428571429, - "grad_norm": 85.88954162597656, + "grad_norm": 1929.118896484375, "learning_rate": 9.884163133717876e-06, - "loss": 12.2266, + "loss": 48.1484, "step": 453 }, { "epoch": 1.6214285714285714, - "grad_norm": 87.56583404541016, + "grad_norm": 388.3246154785156, "learning_rate": 9.882825379029883e-06, - "loss": 15.6719, + "loss": 28.2852, "step": 454 }, { "epoch": 1.625, - "grad_norm": 60.022605895996094, + "grad_norm": 1570.833251953125, "learning_rate": 9.881480035599667e-06, - "loss": 13.0312, + "loss": 23.6113, "step": 455 }, { "epoch": 1.6285714285714286, - "grad_norm": 61.26583480834961, + "grad_norm": 1388.6534423828125, "learning_rate": 9.880127105518122e-06, - "loss": 12.7266, + "loss": 20.4531, "step": 456 }, { "epoch": 1.6321428571428571, - "grad_norm": 66.49527740478516, + "grad_norm": 1253.453857421875, "learning_rate": 9.878766590887932e-06, - "loss": 13.0703, + "loss": 33.5586, "step": 457 }, { "epoch": 1.6357142857142857, - "grad_norm": 56.891326904296875, + "grad_norm": 976.625, "learning_rate": 9.877398493823567e-06, - "loss": 13.2188, + "loss": 36.3633, "step": 458 }, { "epoch": 1.6392857142857142, - "grad_norm": 52.832061767578125, + "grad_norm": 1472.9024658203125, "learning_rate": 9.876022816451284e-06, - "loss": 12.75, + "loss": 23.4766, "step": 459 }, { "epoch": 1.6428571428571428, - "grad_norm": 61.07321548461914, + "grad_norm": 395.1318664550781, "learning_rate": 9.874639560909118e-06, - "loss": 14.0625, + "loss": 24.5977, "step": 460 }, { "epoch": 1.6464285714285714, - "grad_norm": 66.30066680908203, + "grad_norm": 2294.47705078125, "learning_rate": 9.873248729346888e-06, - "loss": 15.0938, + "loss": 50.8984, "step": 461 }, { "epoch": 1.65, - "grad_norm": 61.22555160522461, + "grad_norm": 1772.104736328125, "learning_rate": 9.871850323926178e-06, - "loss": 12.5078, + "loss": 49.7695, "step": 462 }, { "epoch": 1.6535714285714285, - "grad_norm": 65.93433380126953, + "grad_norm": 1440.6239013671875, "learning_rate": 9.870444346820349e-06, - "loss": 13.0625, + "loss": 22.4766, "step": 463 }, { "epoch": 1.657142857142857, - "grad_norm": 61.884849548339844, + "grad_norm": 2393.2177734375, "learning_rate": 9.869030800214531e-06, - "loss": 12.8555, + "loss": 53.2422, "step": 464 }, { "epoch": 1.6607142857142856, - "grad_norm": 63.299739837646484, + "grad_norm": 1742.3726806640625, "learning_rate": 9.867609686305616e-06, - "loss": 14.125, + "loss": 40.1953, "step": 465 }, { "epoch": 1.6642857142857141, - "grad_norm": 60.81603240966797, + "grad_norm": 1181.1842041015625, "learning_rate": 9.866181007302258e-06, - "loss": 16.5469, + "loss": 21.0312, "step": 466 }, { "epoch": 1.6678571428571427, - "grad_norm": 59.329383850097656, + "grad_norm": 778.9828491210938, "learning_rate": 9.864744765424864e-06, - "loss": 15.1602, + "loss": 20.2773, "step": 467 }, { "epoch": 1.6714285714285713, - "grad_norm": 87.3580551147461, + "grad_norm": 519.1237182617188, "learning_rate": 9.863300962905602e-06, - "loss": 21.1641, + "loss": 30.7852, "step": 468 }, { "epoch": 1.675, - "grad_norm": 73.7926254272461, + "grad_norm": 914.5255126953125, "learning_rate": 9.861849601988384e-06, - "loss": 15.6094, + "loss": 18.8555, "step": 469 }, { "epoch": 1.6785714285714286, - "grad_norm": 65.26078033447266, + "grad_norm": 1399.2432861328125, "learning_rate": 9.860390684928873e-06, - "loss": 17.1016, + "loss": 34.4062, "step": 470 }, { "epoch": 1.6821428571428572, - "grad_norm": 69.60477447509766, + "grad_norm": 1298.0614013671875, "learning_rate": 9.858924213994477e-06, - "loss": 17.1641, + "loss": 24.2891, "step": 471 }, { "epoch": 1.6857142857142857, - "grad_norm": 67.26019287109375, + "grad_norm": 1299.804443359375, "learning_rate": 9.857450191464337e-06, - "loss": 15.4688, + "loss": 24.4121, "step": 472 }, { "epoch": 1.6892857142857143, - "grad_norm": 67.14057159423828, + "grad_norm": 1474.2772216796875, "learning_rate": 9.85596861962934e-06, - "loss": 11.8125, + "loss": 44.7754, "step": 473 }, { "epoch": 1.6928571428571428, - "grad_norm": 64.00554656982422, + "grad_norm": 727.7659912109375, "learning_rate": 9.854479500792099e-06, - "loss": 17.0312, + "loss": 25.3242, "step": 474 }, { "epoch": 1.6964285714285714, - "grad_norm": 70.2386474609375, + "grad_norm": 1467.8577880859375, "learning_rate": 9.852982837266955e-06, - "loss": 16.7578, + "loss": 32.5547, "step": 475 }, { "epoch": 1.7, - "grad_norm": 89.21781921386719, + "grad_norm": 641.2784423828125, "learning_rate": 9.851478631379982e-06, - "loss": 14.4062, + "loss": 27.1875, "step": 476 }, { "epoch": 1.7035714285714287, - "grad_norm": 86.43087005615234, + "grad_norm": 1494.5640869140625, "learning_rate": 9.849966885468974e-06, - "loss": 10.3008, + "loss": 32.5527, "step": 477 }, { "epoch": 1.7071428571428573, - "grad_norm": 50.75283432006836, + "grad_norm": 153.75148010253906, "learning_rate": 9.848447601883436e-06, - "loss": 12.2031, + "loss": 8.3477, "step": 478 }, { "epoch": 1.7107142857142859, - "grad_norm": 67.03600311279297, + "grad_norm": 734.0425415039062, "learning_rate": 9.846920782984595e-06, - "loss": 11.7344, + "loss": 11.7773, "step": 479 }, { "epoch": 1.7142857142857144, - "grad_norm": 62.450584411621094, + "grad_norm": 101.44495391845703, "learning_rate": 9.84538643114539e-06, - "loss": 13.0781, + "loss": 7.6504, "step": 480 }, { "epoch": 1.717857142857143, - "grad_norm": 60.48087692260742, + "grad_norm": 1316.48779296875, "learning_rate": 9.843844548750463e-06, - "loss": 14.9844, + "loss": 44.5312, "step": 481 }, { "epoch": 1.7214285714285715, - "grad_norm": 67.67215728759766, + "grad_norm": 2000.48974609375, "learning_rate": 9.842295138196165e-06, - "loss": 13.5312, + "loss": 29.1562, "step": 482 }, { "epoch": 1.725, - "grad_norm": 66.13870239257812, + "grad_norm": 844.6709594726562, "learning_rate": 9.84073820189054e-06, - "loss": 13.8125, + "loss": 22.6836, "step": 483 }, { "epoch": 1.7285714285714286, - "grad_norm": 79.39622497558594, + "grad_norm": 642.6068725585938, "learning_rate": 9.839173742253334e-06, - "loss": 16.1328, + "loss": 16.5781, "step": 484 }, { "epoch": 1.7321428571428572, - "grad_norm": 61.412322998046875, + "grad_norm": 908.5521850585938, "learning_rate": 9.837601761715982e-06, - "loss": 12.5938, + "loss": 58.0508, "step": 485 }, { "epoch": 1.7357142857142858, - "grad_norm": 57.348175048828125, + "grad_norm": 39.22201919555664, "learning_rate": 9.836022262721611e-06, - "loss": 11.7383, + "loss": 8.7656, "step": 486 }, { "epoch": 1.7392857142857143, - "grad_norm": 68.28793334960938, + "grad_norm": 1468.984130859375, "learning_rate": 9.834435247725032e-06, - "loss": 14.3125, + "loss": 23.0742, "step": 487 }, { "epoch": 1.7428571428571429, - "grad_norm": 61.77914047241211, + "grad_norm": 2076.000732421875, "learning_rate": 9.832840719192737e-06, - "loss": 13.4844, + "loss": 49.0703, "step": 488 }, { "epoch": 1.7464285714285714, - "grad_norm": 69.30245208740234, + "grad_norm": 1671.769287109375, "learning_rate": 9.831238679602893e-06, - "loss": 11.0938, + "loss": 19.5352, "step": 489 }, { "epoch": 1.75, - "grad_norm": 65.97408294677734, + "grad_norm": 836.016845703125, "learning_rate": 9.829629131445342e-06, - "loss": 15.0781, + "loss": 15.793, "step": 490 }, { "epoch": 1.7535714285714286, - "grad_norm": 74.2400894165039, + "grad_norm": 1569.8265380859375, "learning_rate": 9.828012077221598e-06, - "loss": 14.6797, + "loss": 23.875, "step": 491 }, { "epoch": 1.7571428571428571, - "grad_norm": 60.69493103027344, + "grad_norm": 751.0060424804688, "learning_rate": 9.826387519444838e-06, - "loss": 12.0781, + "loss": 20.1016, "step": 492 }, { "epoch": 1.7607142857142857, - "grad_norm": 75.23729705810547, + "grad_norm": 847.2117919921875, "learning_rate": 9.824755460639901e-06, - "loss": 12.625, + "loss": 13.0664, "step": 493 }, { "epoch": 1.7642857142857142, - "grad_norm": 55.42323303222656, + "grad_norm": 770.7418823242188, "learning_rate": 9.823115903343283e-06, - "loss": 13.4453, + "loss": 13.2109, "step": 494 }, { "epoch": 1.7678571428571428, - "grad_norm": 62.02582550048828, + "grad_norm": 1768.5833740234375, "learning_rate": 9.82146885010314e-06, - "loss": 14.1094, + "loss": 27.1172, "step": 495 }, { "epoch": 1.7714285714285714, - "grad_norm": 72.0544662475586, + "grad_norm": 800.9346313476562, "learning_rate": 9.819814303479268e-06, - "loss": 16.1953, + "loss": 24.7305, "step": 496 }, { "epoch": 1.775, - "grad_norm": 63.89875030517578, + "grad_norm": 1684.244384765625, "learning_rate": 9.818152266043115e-06, - "loss": 14.1797, + "loss": 37.6719, "step": 497 }, { "epoch": 1.7785714285714285, - "grad_norm": 64.88561248779297, + "grad_norm": 822.6416015625, "learning_rate": 9.816482740377775e-06, - "loss": 13.9453, + "loss": 24.0547, "step": 498 }, { "epoch": 1.782142857142857, - "grad_norm": 64.00762176513672, + "grad_norm": 1615.4383544921875, "learning_rate": 9.814805729077968e-06, - "loss": 11.5, + "loss": 32.3242, "step": 499 }, { "epoch": 1.7857142857142856, - "grad_norm": 75.17144775390625, + "grad_norm": 1251.168701171875, "learning_rate": 9.81312123475006e-06, - "loss": 13.3281, + "loss": 38.2988, "step": 500 }, { "epoch": 1.7892857142857141, - "grad_norm": 76.5600814819336, + "grad_norm": 485.68603515625, "learning_rate": 9.811429260012044e-06, - "loss": 13.5078, + "loss": 11.4961, "step": 501 }, { "epoch": 1.7928571428571427, - "grad_norm": 58.287132263183594, + "grad_norm": 1440.1732177734375, "learning_rate": 9.80972980749353e-06, - "loss": 12.1875, + "loss": 38.2617, "step": 502 }, { "epoch": 1.7964285714285713, - "grad_norm": 79.07958984375, + "grad_norm": 1040.0146484375, "learning_rate": 9.808022879835762e-06, - "loss": 18.2109, + "loss": 20.7227, "step": 503 }, { "epoch": 1.8, - "grad_norm": 72.38505554199219, + "grad_norm": 1593.7366943359375, "learning_rate": 9.806308479691595e-06, - "loss": 15.9219, + "loss": 42.0938, "step": 504 }, { "epoch": 1.8035714285714286, - "grad_norm": 55.1053466796875, + "grad_norm": 1172.6595458984375, "learning_rate": 9.804586609725499e-06, - "loss": 13.6719, + "loss": 26.9746, "step": 505 }, { "epoch": 1.8071428571428572, - "grad_norm": 65.40480041503906, + "grad_norm": 1183.6171875, "learning_rate": 9.802857272613552e-06, - "loss": 18.5312, + "loss": 21.6641, "step": 506 }, { "epoch": 1.8107142857142857, - "grad_norm": 54.70572280883789, + "grad_norm": 515.4976806640625, "learning_rate": 9.80112047104344e-06, - "loss": 12.1484, + "loss": 10.4727, "step": 507 }, { "epoch": 1.8142857142857143, - "grad_norm": 61.03078842163086, + "grad_norm": 2090.161376953125, "learning_rate": 9.799376207714446e-06, - "loss": 15.8125, + "loss": 39.2969, "step": 508 }, { "epoch": 1.8178571428571428, - "grad_norm": 70.83861541748047, + "grad_norm": 1059.23486328125, "learning_rate": 9.797624485337454e-06, - "loss": 21.2578, + "loss": 24.6133, "step": 509 }, { "epoch": 1.8214285714285714, - "grad_norm": 69.0458984375, + "grad_norm": 1305.21484375, "learning_rate": 9.795865306634939e-06, - "loss": 14.8516, + "loss": 23.2148, "step": 510 }, { "epoch": 1.825, - "grad_norm": 71.186279296875, + "grad_norm": 1501.465576171875, "learning_rate": 9.794098674340966e-06, - "loss": 14.3984, + "loss": 21.1953, "step": 511 }, { "epoch": 1.8285714285714287, - "grad_norm": 59.495662689208984, + "grad_norm": 704.1776733398438, "learning_rate": 9.792324591201179e-06, - "loss": 11.1445, + "loss": 28.5508, "step": 512 }, { "epoch": 1.8321428571428573, - "grad_norm": 52.97871780395508, + "grad_norm": 1712.2322998046875, "learning_rate": 9.790543059972807e-06, - "loss": 11.6719, + "loss": 23.8047, "step": 513 }, { "epoch": 1.8357142857142859, - "grad_norm": 53.318233489990234, + "grad_norm": 721.8836059570312, "learning_rate": 9.788754083424654e-06, - "loss": 10.4922, + "loss": 13.6543, "step": 514 }, { "epoch": 1.8392857142857144, - "grad_norm": 49.11600112915039, + "grad_norm": 747.8984985351562, "learning_rate": 9.78695766433709e-06, - "loss": 10.9609, + "loss": 12.1211, "step": 515 }, { "epoch": 1.842857142857143, - "grad_norm": 67.2564697265625, + "grad_norm": 1446.3438720703125, "learning_rate": 9.785153805502062e-06, - "loss": 13.625, + "loss": 31.7891, "step": 516 }, { "epoch": 1.8464285714285715, - "grad_norm": 65.34046936035156, + "grad_norm": 893.0841674804688, "learning_rate": 9.78334250972307e-06, - "loss": 13.4688, + "loss": 14.0586, "step": 517 }, { "epoch": 1.85, - "grad_norm": 78.73178100585938, + "grad_norm": 1067.0, "learning_rate": 9.781523779815178e-06, - "loss": 15.5391, + "loss": 31.6211, "step": 518 }, { "epoch": 1.8535714285714286, - "grad_norm": 69.47505187988281, + "grad_norm": 1260.3907470703125, "learning_rate": 9.779697618605001e-06, - "loss": 14.1875, + "loss": 41.9258, "step": 519 }, { "epoch": 1.8571428571428572, - "grad_norm": 60.11038589477539, + "grad_norm": 854.4818115234375, "learning_rate": 9.777864028930705e-06, - "loss": 12.3984, + "loss": 17.873, "step": 520 }, { "epoch": 1.8607142857142858, - "grad_norm": 56.6129035949707, + "grad_norm": 1230.445068359375, "learning_rate": 9.776023013642e-06, - "loss": 12.1641, + "loss": 16.7461, "step": 521 }, { "epoch": 1.8642857142857143, - "grad_norm": 78.31788635253906, + "grad_norm": 1226.038818359375, "learning_rate": 9.774174575600137e-06, - "loss": 15.5, + "loss": 31.6328, "step": 522 }, { "epoch": 1.8678571428571429, - "grad_norm": 64.33763122558594, + "grad_norm": 1383.8424072265625, "learning_rate": 9.772318717677905e-06, - "loss": 13.875, + "loss": 21.9414, "step": 523 }, { "epoch": 1.8714285714285714, - "grad_norm": 63.95622634887695, + "grad_norm": 219.9924774169922, "learning_rate": 9.770455442759622e-06, - "loss": 13.625, + "loss": 11.5195, "step": 524 }, { "epoch": 1.875, - "grad_norm": 53.69536209106445, + "grad_norm": 1618.654052734375, "learning_rate": 9.768584753741134e-06, - "loss": 10.5195, + "loss": 26.0156, "step": 525 }, { "epoch": 1.8785714285714286, - "grad_norm": 62.87694549560547, + "grad_norm": 1037.7147216796875, "learning_rate": 9.766706653529814e-06, - "loss": 12.3125, + "loss": 17.5859, "step": 526 }, { "epoch": 1.8821428571428571, - "grad_norm": 67.30829620361328, + "grad_norm": 1345.6417236328125, "learning_rate": 9.764821145044545e-06, - "loss": 12.8672, + "loss": 36.7227, "step": 527 }, { "epoch": 1.8857142857142857, - "grad_norm": 76.14859771728516, + "grad_norm": 1183.8675537109375, "learning_rate": 9.762928231215731e-06, - "loss": 13.5469, + "loss": 18.1953, "step": 528 }, { "epoch": 1.8892857142857142, - "grad_norm": 72.80113983154297, + "grad_norm": 965.0231323242188, "learning_rate": 9.761027914985282e-06, - "loss": 12.9648, + "loss": 28.7773, "step": 529 }, { "epoch": 1.8928571428571428, - "grad_norm": 72.81568908691406, + "grad_norm": 792.8912963867188, "learning_rate": 9.759120199306613e-06, - "loss": 14.3828, + "loss": 17.5664, "step": 530 }, { "epoch": 1.8964285714285714, - "grad_norm": 54.340057373046875, + "grad_norm": 1331.437744140625, "learning_rate": 9.75720508714464e-06, - "loss": 10.9609, + "loss": 17.4141, "step": 531 }, { "epoch": 1.9, - "grad_norm": 62.46900939941406, + "grad_norm": 248.95648193359375, "learning_rate": 9.755282581475769e-06, - "loss": 11.2695, + "loss": 19.0527, "step": 532 }, { "epoch": 1.9035714285714285, - "grad_norm": 60.12433624267578, + "grad_norm": 1096.2860107421875, "learning_rate": 9.753352685287906e-06, - "loss": 12.8125, + "loss": 29.416, "step": 533 }, { "epoch": 1.907142857142857, - "grad_norm": 56.85470199584961, + "grad_norm": 361.1824951171875, "learning_rate": 9.751415401580437e-06, - "loss": 12.0078, + "loss": 10.2695, "step": 534 }, { "epoch": 1.9107142857142856, - "grad_norm": 113.504638671875, + "grad_norm": 640.0327758789062, "learning_rate": 9.749470733364231e-06, - "loss": 13.4453, + "loss": 14.3906, "step": 535 }, { "epoch": 1.9142857142857141, - "grad_norm": 71.07494354248047, + "grad_norm": 244.1167755126953, "learning_rate": 9.747518683661632e-06, - "loss": 13.418, + "loss": 11.1719, "step": 536 }, { "epoch": 1.9178571428571427, - "grad_norm": 65.5964126586914, + "grad_norm": 785.2843017578125, "learning_rate": 9.74555925550646e-06, - "loss": 13.0156, + "loss": 37.2734, "step": 537 }, { "epoch": 1.9214285714285713, - "grad_norm": 63.37857437133789, + "grad_norm": 683.5971069335938, "learning_rate": 9.743592451944e-06, - "loss": 14.5234, + "loss": 27.6543, "step": 538 }, { "epoch": 1.925, - "grad_norm": 58.5995979309082, + "grad_norm": 344.5482482910156, "learning_rate": 9.741618276030998e-06, - "loss": 11.9688, + "loss": 9.9102, "step": 539 }, { "epoch": 1.9285714285714286, - "grad_norm": 57.571231842041016, + "grad_norm": 297.43255615234375, "learning_rate": 9.73963673083566e-06, - "loss": 14.1641, + "loss": 13.1309, "step": 540 }, { "epoch": 1.9321428571428572, - "grad_norm": 61.73564147949219, + "grad_norm": 680.2230834960938, "learning_rate": 9.737647819437645e-06, - "loss": 13.9688, + "loss": 26.7637, "step": 541 }, { "epoch": 1.9357142857142857, - "grad_norm": 67.2490463256836, + "grad_norm": 305.0251770019531, "learning_rate": 9.73565154492806e-06, - "loss": 14.8828, + "loss": 13.0859, "step": 542 }, { "epoch": 1.9392857142857143, - "grad_norm": 73.42760467529297, + "grad_norm": 1296.7769775390625, "learning_rate": 9.733647910409456e-06, - "loss": 15.8516, + "loss": 24.0195, "step": 543 }, { "epoch": 1.9428571428571428, - "grad_norm": 63.2861213684082, + "grad_norm": 747.3027954101562, "learning_rate": 9.731636918995821e-06, - "loss": 16.6797, + "loss": 30.0625, "step": 544 }, { "epoch": 1.9464285714285714, - "grad_norm": 59.45108413696289, + "grad_norm": 363.14117431640625, "learning_rate": 9.72961857381258e-06, - "loss": 15.1953, + "loss": 13.957, "step": 545 }, { "epoch": 1.95, - "grad_norm": 53.372352600097656, + "grad_norm": 1508.800537109375, "learning_rate": 9.727592877996585e-06, - "loss": 12.3672, + "loss": 36.5723, "step": 546 }, { "epoch": 1.9535714285714287, - "grad_norm": 58.048973083496094, + "grad_norm": 195.5558624267578, "learning_rate": 9.72555983469611e-06, - "loss": 11.4766, + "loss": 19.3633, "step": 547 }, { "epoch": 1.9571428571428573, - "grad_norm": 57.8227424621582, + "grad_norm": 350.47967529296875, "learning_rate": 9.723519447070854e-06, - "loss": 11.7812, + "loss": 22.9609, "step": 548 }, { "epoch": 1.9607142857142859, - "grad_norm": 69.44131469726562, + "grad_norm": 1207.3375244140625, "learning_rate": 9.721471718291922e-06, - "loss": 14.6641, + "loss": 19.5234, "step": 549 }, { "epoch": 1.9642857142857144, - "grad_norm": 77.03851318359375, + "grad_norm": 1065.8724365234375, "learning_rate": 9.719416651541839e-06, - "loss": 12.8594, + "loss": 18.8242, "step": 550 }, { "epoch": 1.967857142857143, - "grad_norm": 63.37330627441406, + "grad_norm": 1623.127197265625, "learning_rate": 9.717354250014524e-06, - "loss": 14.8281, + "loss": 37.1992, "step": 551 }, { "epoch": 1.9714285714285715, - "grad_norm": 46.10747528076172, + "grad_norm": 34.10625457763672, "learning_rate": 9.715284516915303e-06, - "loss": 11.7812, + "loss": 8.8203, "step": 552 }, { "epoch": 1.975, - "grad_norm": 81.59782409667969, + "grad_norm": 1360.32861328125, "learning_rate": 9.713207455460893e-06, - "loss": 15.7188, + "loss": 23.9414, "step": 553 }, { "epoch": 1.9785714285714286, - "grad_norm": 68.83948516845703, + "grad_norm": 983.7730712890625, "learning_rate": 9.7111230688794e-06, - "loss": 11.8711, + "loss": 15.4492, "step": 554 }, { "epoch": 1.9821428571428572, - "grad_norm": 63.49802780151367, + "grad_norm": 1024.8736572265625, "learning_rate": 9.709031360410318e-06, - "loss": 12.1719, + "loss": 27.3281, "step": 555 }, { "epoch": 1.9857142857142858, - "grad_norm": 69.13542938232422, + "grad_norm": 1014.2055053710938, "learning_rate": 9.706932333304518e-06, - "loss": 12.1562, + "loss": 27.8203, "step": 556 }, { "epoch": 1.9892857142857143, - "grad_norm": 50.63701248168945, + "grad_norm": 499.101318359375, "learning_rate": 9.704825990824243e-06, - "loss": 11.1016, + "loss": 21.3906, "step": 557 }, { "epoch": 1.9928571428571429, - "grad_norm": 63.36759567260742, + "grad_norm": 609.7255249023438, "learning_rate": 9.702712336243109e-06, - "loss": 10.7734, + "loss": 10.248, "step": 558 }, { "epoch": 1.9964285714285714, - "grad_norm": 54.72822570800781, + "grad_norm": 9.642609596252441, "learning_rate": 9.700591372846096e-06, - "loss": 11.5352, + "loss": 9.1289, "step": 559 }, { "epoch": 2.0, - "grad_norm": 46.7043571472168, + "grad_norm": 464.7775573730469, "learning_rate": 9.698463103929542e-06, - "loss": 11.1602, + "loss": 9.9395, "step": 560 }, { "epoch": 2.0, - "eval_loss": 13.173151016235352, - "eval_mse": 13.177350813527328, - "eval_runtime": 11.5341, - "eval_samples_per_second": 246.227, - "eval_steps_per_second": 1.3, - "eval_target_0_mse": 23.31787680982535, - "eval_target_1_mse": 13.17604352203358, - "eval_target_2_mse": 8.288725219540341, - "eval_target_3_mse": 7.926757702710042, + "eval_loss": 20.696178436279297, + "eval_mse": 20.73352995061185, + "eval_runtime": 10.7402, + "eval_samples_per_second": 264.426, + "eval_steps_per_second": 1.397, + "eval_target_0_mse": 42.07642225136172, + "eval_target_1_mse": 18.629381458363742, + "eval_target_2_mse": 16.85529584827585, + "eval_target_3_mse": 5.373020244446087, "step": 560 }, { "epoch": 2.0035714285714286, - "grad_norm": 65.37533569335938, + "grad_norm": 42.97891616821289, "learning_rate": 9.69632753280114e-06, - "loss": 12.5078, + "loss": 7.3105, "step": 561 }, { "epoch": 2.007142857142857, - "grad_norm": 63.120948791503906, + "grad_norm": 464.99609375, "learning_rate": 9.694184662779931e-06, - "loss": 14.2656, + "loss": 15.5703, "step": 562 }, { "epoch": 2.0107142857142857, - "grad_norm": 51.067588806152344, + "grad_norm": 1045.445068359375, "learning_rate": 9.6920344971963e-06, - "loss": 12.5547, + "loss": 28.3184, "step": 563 }, { "epoch": 2.0142857142857142, - "grad_norm": 58.87617492675781, + "grad_norm": 755.9244995117188, "learning_rate": 9.68987703939197e-06, - "loss": 10.5469, + "loss": 23.1211, "step": 564 }, { "epoch": 2.017857142857143, - "grad_norm": 65.44974517822266, + "grad_norm": 770.8403930664062, "learning_rate": 9.687712292719997e-06, - "loss": 17.6875, + "loss": 37.2656, "step": 565 }, { "epoch": 2.0214285714285714, - "grad_norm": 53.74982833862305, + "grad_norm": 390.13323974609375, "learning_rate": 9.685540260544768e-06, - "loss": 12.4844, + "loss": 14.1953, "step": 566 }, { "epoch": 2.025, - "grad_norm": 55.018131256103516, + "grad_norm": 704.2529907226562, "learning_rate": 9.683360946241988e-06, - "loss": 13.3672, + "loss": 15.7617, "step": 567 }, { "epoch": 2.0285714285714285, - "grad_norm": 71.43266296386719, + "grad_norm": 613.5941162109375, "learning_rate": 9.681174353198687e-06, - "loss": 12.0625, + "loss": 24.1152, "step": 568 }, { "epoch": 2.032142857142857, - "grad_norm": 57.66063690185547, + "grad_norm": 756.5528564453125, "learning_rate": 9.678980484813199e-06, - "loss": 15.7969, + "loss": 15.832, "step": 569 }, { "epoch": 2.0357142857142856, - "grad_norm": 53.663631439208984, + "grad_norm": 168.2481231689453, "learning_rate": 9.67677934449517e-06, - "loss": 13.9297, + "loss": 11.5391, "step": 570 }, { "epoch": 2.039285714285714, - "grad_norm": 76.1527328491211, + "grad_norm": 205.36883544921875, "learning_rate": 9.67457093566555e-06, - "loss": 12.9961, + "loss": 11.9062, "step": 571 }, { "epoch": 2.0428571428571427, - "grad_norm": 57.869136810302734, + "grad_norm": 926.4354858398438, "learning_rate": 9.672355261756578e-06, - "loss": 13.0391, + "loss": 17.5742, "step": 572 }, { "epoch": 2.0464285714285713, - "grad_norm": 71.52044677734375, + "grad_norm": 200.5098114013672, "learning_rate": 9.670132326211792e-06, - "loss": 12.4297, + "loss": 11.9375, "step": 573 }, { "epoch": 2.05, - "grad_norm": 56.6915397644043, + "grad_norm": 1315.3004150390625, "learning_rate": 9.667902132486009e-06, - "loss": 12.5391, + "loss": 20.2012, "step": 574 }, { "epoch": 2.0535714285714284, - "grad_norm": 59.381282806396484, + "grad_norm": 850.7864379882812, "learning_rate": 9.665664684045332e-06, - "loss": 12.0156, + "loss": 17.5547, "step": 575 }, { "epoch": 2.057142857142857, - "grad_norm": 56.226402282714844, + "grad_norm": 499.75567626953125, "learning_rate": 9.663419984367139e-06, - "loss": 9.8008, + "loss": 16.0547, "step": 576 }, { "epoch": 2.0607142857142855, - "grad_norm": 60.33580780029297, + "grad_norm": 210.24156188964844, "learning_rate": 9.661168036940071e-06, - "loss": 13.3281, + "loss": 12.3711, "step": 577 }, { "epoch": 2.064285714285714, - "grad_norm": 58.95866012573242, + "grad_norm": 480.1747741699219, "learning_rate": 9.658908845264043e-06, - "loss": 10.793, + "loss": 8.8887, "step": 578 }, { "epoch": 2.067857142857143, - "grad_norm": 72.72016906738281, + "grad_norm": 1047.54150390625, "learning_rate": 9.65664241285022e-06, - "loss": 16.5859, + "loss": 23.2148, "step": 579 }, { "epoch": 2.0714285714285716, - "grad_norm": 55.30221176147461, + "grad_norm": 188.54397583007812, "learning_rate": 9.654368743221022e-06, - "loss": 12.9375, + "loss": 11.6523, "step": 580 }, { "epoch": 2.075, - "grad_norm": 66.87218475341797, + "grad_norm": 1569.677734375, "learning_rate": 9.652087839910123e-06, - "loss": 12.0117, + "loss": 25.2891, "step": 581 }, { "epoch": 2.0785714285714287, - "grad_norm": 75.04557037353516, + "grad_norm": 1176.041748046875, "learning_rate": 9.649799706462435e-06, - "loss": 21.2578, + "loss": 25.0156, "step": 582 }, { "epoch": 2.0821428571428573, - "grad_norm": 68.70262908935547, + "grad_norm": 330.2607421875, "learning_rate": 9.647504346434105e-06, - "loss": 12.7422, + "loss": 23.875, "step": 583 }, { "epoch": 2.085714285714286, - "grad_norm": 61.93150329589844, + "grad_norm": 809.8572998046875, "learning_rate": 9.645201763392513e-06, - "loss": 12.4141, + "loss": 16.3477, "step": 584 }, { "epoch": 2.0892857142857144, - "grad_norm": 64.75306701660156, + "grad_norm": 811.7422485351562, "learning_rate": 9.642891960916269e-06, - "loss": 14.9141, + "loss": 16.8906, "step": 585 }, { "epoch": 2.092857142857143, - "grad_norm": 60.123695373535156, + "grad_norm": 55.9590950012207, "learning_rate": 9.640574942595195e-06, - "loss": 15.8984, + "loss": 13.5352, "step": 586 }, { "epoch": 2.0964285714285715, - "grad_norm": 60.153987884521484, + "grad_norm": 866.0266723632812, "learning_rate": 9.638250712030334e-06, - "loss": 14.9531, + "loss": 19.1406, "step": 587 }, { "epoch": 2.1, - "grad_norm": 62.738433837890625, + "grad_norm": 161.6846923828125, "learning_rate": 9.635919272833938e-06, - "loss": 13.5703, + "loss": 11.6484, "step": 588 }, { "epoch": 2.1035714285714286, - "grad_norm": 62.45803451538086, + "grad_norm": 1493.354248046875, "learning_rate": 9.633580628629458e-06, - "loss": 14.2734, + "loss": 39.4082, "step": 589 }, { "epoch": 2.107142857142857, - "grad_norm": 77.72651672363281, + "grad_norm": 407.9542236328125, "learning_rate": 9.631234783051544e-06, - "loss": 16.2578, + "loss": 14.6523, "step": 590 }, { "epoch": 2.1107142857142858, - "grad_norm": 63.42686080932617, + "grad_norm": 1179.7882080078125, "learning_rate": 9.628881739746043e-06, - "loss": 11.3945, + "loss": 16.7852, "step": 591 }, { "epoch": 2.1142857142857143, - "grad_norm": 66.50565338134766, + "grad_norm": 83.4468994140625, "learning_rate": 9.626521502369984e-06, - "loss": 10.9844, + "loss": 7.4453, "step": 592 }, { "epoch": 2.117857142857143, - "grad_norm": 72.35333251953125, + "grad_norm": 191.75244140625, "learning_rate": 9.624154074591577e-06, - "loss": 10.0078, + "loss": 5.9434, "step": 593 }, { "epoch": 2.1214285714285714, - "grad_norm": 71.17774200439453, + "grad_norm": 732.6151733398438, "learning_rate": 9.621779460090209e-06, - "loss": 11.918, + "loss": 19.207, "step": 594 }, { "epoch": 2.125, - "grad_norm": 71.8630142211914, + "grad_norm": 1637.144287109375, "learning_rate": 9.619397662556434e-06, - "loss": 12.4219, + "loss": 55.1992, "step": 595 }, { "epoch": 2.1285714285714286, - "grad_norm": 61.686256408691406, + "grad_norm": 1011.1781616210938, "learning_rate": 9.617008685691973e-06, - "loss": 14.4531, + "loss": 18.1992, "step": 596 }, { "epoch": 2.132142857142857, - "grad_norm": 56.4715576171875, + "grad_norm": 593.2000732421875, "learning_rate": 9.6146125332097e-06, - "loss": 9.5078, + "loss": 7.6738, "step": 597 }, { "epoch": 2.1357142857142857, - "grad_norm": 54.050472259521484, + "grad_norm": 35.583248138427734, "learning_rate": 9.612209208833648e-06, - "loss": 11.918, + "loss": 10.6914, "step": 598 }, { "epoch": 2.1392857142857142, - "grad_norm": 57.5831413269043, + "grad_norm": 967.1884765625, "learning_rate": 9.609798716298987e-06, - "loss": 12.3594, + "loss": 13.8477, "step": 599 }, { "epoch": 2.142857142857143, - "grad_norm": 67.2144775390625, + "grad_norm": 443.8343200683594, "learning_rate": 9.60738105935204e-06, - "loss": 13.7031, + "loss": 13.4414, "step": 600 }, { "epoch": 2.1464285714285714, - "grad_norm": 61.33454132080078, + "grad_norm": 341.4071044921875, "learning_rate": 9.60495624175025e-06, - "loss": 11.4766, + "loss": 8.1895, "step": 601 }, { "epoch": 2.15, - "grad_norm": 64.48047637939453, + "grad_norm": 177.2566680908203, "learning_rate": 9.602524267262202e-06, - "loss": 12.5938, + "loss": 9.6211, "step": 602 }, { "epoch": 2.1535714285714285, - "grad_norm": 84.2440414428711, + "grad_norm": 422.60040283203125, "learning_rate": 9.600085139667597e-06, - "loss": 14.625, + "loss": 14.2891, "step": 603 }, { "epoch": 2.157142857142857, - "grad_norm": 52.14349365234375, + "grad_norm": 331.1619567871094, "learning_rate": 9.597638862757255e-06, - "loss": 10.7891, + "loss": 8.875, "step": 604 }, { "epoch": 2.1607142857142856, - "grad_norm": 52.08406448364258, + "grad_norm": 1335.163330078125, "learning_rate": 9.595185440333103e-06, - "loss": 11.4922, + "loss": 22.5156, "step": 605 }, { "epoch": 2.164285714285714, - "grad_norm": 61.841373443603516, + "grad_norm": 31.296445846557617, "learning_rate": 9.592724876208183e-06, - "loss": 11.8047, + "loss": 11.0391, "step": 606 }, { "epoch": 2.1678571428571427, - "grad_norm": 67.14802551269531, + "grad_norm": 840.38720703125, "learning_rate": 9.59025717420663e-06, - "loss": 15.0352, + "loss": 25.3398, "step": 607 }, { "epoch": 2.1714285714285713, - "grad_norm": 60.209320068359375, + "grad_norm": 9.601303100585938, "learning_rate": 9.58778233816367e-06, - "loss": 11.3633, + "loss": 9.3535, "step": 608 }, { "epoch": 2.175, - "grad_norm": 72.41310119628906, + "grad_norm": 85.21630859375, "learning_rate": 9.58530037192562e-06, - "loss": 13.0312, + "loss": 11.9531, "step": 609 }, { "epoch": 2.1785714285714284, - "grad_norm": 61.04594802856445, + "grad_norm": 149.02520751953125, "learning_rate": 9.582811279349881e-06, - "loss": 13.5, + "loss": 13.4609, "step": 610 }, { "epoch": 2.182142857142857, - "grad_norm": 76.74407196044922, + "grad_norm": 728.3556518554688, "learning_rate": 9.580315064304925e-06, - "loss": 10.5391, + "loss": 9.8281, "step": 611 }, { "epoch": 2.185714285714286, - "grad_norm": 64.79609680175781, + "grad_norm": 46.285491943359375, "learning_rate": 9.577811730670297e-06, - "loss": 14.9141, + "loss": 12.125, "step": 612 }, { "epoch": 2.189285714285714, - "grad_norm": 74.13015747070312, + "grad_norm": 1128.743408203125, "learning_rate": 9.5753012823366e-06, - "loss": 11.0938, + "loss": 36.7422, "step": 613 }, { "epoch": 2.192857142857143, - "grad_norm": 69.55266571044922, + "grad_norm": 503.4261169433594, "learning_rate": 9.572783723205502e-06, - "loss": 13.4219, + "loss": 33.5859, "step": 614 }, { "epoch": 2.1964285714285716, - "grad_norm": 51.268680572509766, + "grad_norm": 339.5541687011719, "learning_rate": 9.570259057189716e-06, - "loss": 9.5078, + "loss": 7.0586, "step": 615 }, { "epoch": 2.2, - "grad_norm": 64.8949203491211, + "grad_norm": 351.4658203125, "learning_rate": 9.567727288213005e-06, - "loss": 12.3711, + "loss": 9.918, "step": 616 }, { "epoch": 2.2035714285714287, - "grad_norm": 53.33942413330078, + "grad_norm": 331.2039794921875, "learning_rate": 9.565188420210169e-06, - "loss": 10.6914, + "loss": 10.2539, "step": 617 }, { "epoch": 2.2071428571428573, - "grad_norm": 58.641971588134766, + "grad_norm": 711.2646484375, "learning_rate": 9.56264245712704e-06, - "loss": 12.3906, + "loss": 18.8242, "step": 618 }, { "epoch": 2.210714285714286, - "grad_norm": 63.460411071777344, + "grad_norm": 682.3560791015625, "learning_rate": 9.560089402920478e-06, - "loss": 15.5156, + "loss": 16.1875, "step": 619 }, { "epoch": 2.2142857142857144, - "grad_norm": 67.26868438720703, + "grad_norm": 810.7847900390625, "learning_rate": 9.557529261558367e-06, - "loss": 12.625, + "loss": 13.0781, "step": 620 }, { "epoch": 2.217857142857143, - "grad_norm": 60.48006820678711, + "grad_norm": 721.2135620117188, "learning_rate": 9.554962037019599e-06, - "loss": 13.0547, + "loss": 14.2383, "step": 621 }, { "epoch": 2.2214285714285715, - "grad_norm": 60.59569549560547, + "grad_norm": 89.69808959960938, "learning_rate": 9.552387733294081e-06, - "loss": 12.1953, + "loss": 10.957, "step": 622 }, { "epoch": 2.225, - "grad_norm": 54.08411407470703, + "grad_norm": 753.7745971679688, "learning_rate": 9.549806354382716e-06, - "loss": 10.8203, + "loss": 12.0742, "step": 623 }, { "epoch": 2.2285714285714286, - "grad_norm": 71.39938354492188, + "grad_norm": 63.763885498046875, "learning_rate": 9.547217904297411e-06, - "loss": 10.1719, + "loss": 7.2266, "step": 624 }, { "epoch": 2.232142857142857, - "grad_norm": 51.62982940673828, + "grad_norm": 644.8640747070312, "learning_rate": 9.544622387061055e-06, - "loss": 11.4648, + "loss": 12.3691, "step": 625 }, { "epoch": 2.2357142857142858, - "grad_norm": 65.6781997680664, + "grad_norm": 525.536865234375, "learning_rate": 9.542019806707526e-06, - "loss": 12.8359, + "loss": 12.043, "step": 626 }, { "epoch": 2.2392857142857143, - "grad_norm": 73.69727325439453, + "grad_norm": 116.72285461425781, "learning_rate": 9.539410167281673e-06, - "loss": 10.2812, + "loss": 5.7578, "step": 627 }, { "epoch": 2.242857142857143, - "grad_norm": 68.20236206054688, + "grad_norm": 113.781005859375, "learning_rate": 9.536793472839325e-06, - "loss": 13.7734, + "loss": 11.5664, "step": 628 }, { "epoch": 2.2464285714285714, - "grad_norm": 64.7838134765625, + "grad_norm": 1189.43896484375, "learning_rate": 9.534169727447268e-06, - "loss": 11.5781, + "loss": 28.1816, "step": 629 }, { "epoch": 2.25, - "grad_norm": 61.53121566772461, + "grad_norm": 627.6077270507812, "learning_rate": 9.531538935183252e-06, - "loss": 12.2734, + "loss": 12.0176, "step": 630 }, { "epoch": 2.2535714285714286, - "grad_norm": 66.82312774658203, + "grad_norm": 25.101303100585938, "learning_rate": 9.528901100135971e-06, - "loss": 14.5781, + "loss": 12.2383, "step": 631 }, { "epoch": 2.257142857142857, - "grad_norm": 55.4050407409668, + "grad_norm": 26.2918643951416, "learning_rate": 9.526256226405075e-06, - "loss": 11.3047, + "loss": 7.7695, "step": 632 }, { "epoch": 2.2607142857142857, - "grad_norm": 57.75661849975586, + "grad_norm": 20.98931884765625, "learning_rate": 9.523604318101145e-06, - "loss": 11.1914, + "loss": 10.1406, "step": 633 }, { "epoch": 2.2642857142857142, - "grad_norm": 53.95859909057617, + "grad_norm": 1083.3123779296875, "learning_rate": 9.520945379345701e-06, - "loss": 11.7891, + "loss": 21.3984, "step": 634 }, { "epoch": 2.267857142857143, - "grad_norm": 54.36275863647461, + "grad_norm": 1202.2374267578125, "learning_rate": 9.518279414271184e-06, - "loss": 11.793, + "loss": 14.3984, "step": 635 }, { "epoch": 2.2714285714285714, - "grad_norm": 66.67405700683594, + "grad_norm": 416.3262939453125, "learning_rate": 9.51560642702096e-06, - "loss": 14.4531, + "loss": 13.873, "step": 636 }, { "epoch": 2.275, - "grad_norm": 77.34737396240234, + "grad_norm": 548.035400390625, "learning_rate": 9.512926421749305e-06, - "loss": 10.0, + "loss": 7.2617, "step": 637 }, { "epoch": 2.2785714285714285, - "grad_norm": 51.585853576660156, + "grad_norm": 727.9420776367188, "learning_rate": 9.510239402621402e-06, - "loss": 13.3594, + "loss": 18.2734, "step": 638 }, { "epoch": 2.282142857142857, - "grad_norm": 62.27290725708008, + "grad_norm": 76.42401885986328, "learning_rate": 9.50754537381334e-06, - "loss": 13.2031, + "loss": 11.3789, "step": 639 }, { "epoch": 2.2857142857142856, - "grad_norm": 86.95469665527344, + "grad_norm": 909.6198120117188, "learning_rate": 9.504844339512096e-06, - "loss": 15.2344, + "loss": 23.9414, "step": 640 }, { "epoch": 2.289285714285714, - "grad_norm": 65.23404693603516, + "grad_norm": 138.76686096191406, "learning_rate": 9.502136303915539e-06, - "loss": 13.1094, + "loss": 11.4219, "step": 641 }, { "epoch": 2.2928571428571427, - "grad_norm": 69.41129302978516, + "grad_norm": 614.742431640625, "learning_rate": 9.499421271232416e-06, - "loss": 12.2422, + "loss": 20.6445, "step": 642 }, { "epoch": 2.2964285714285713, - "grad_norm": 51.78654098510742, + "grad_norm": 880.5191040039062, "learning_rate": 9.496699245682351e-06, - "loss": 10.1914, + "loss": 12.002, "step": 643 }, { "epoch": 2.3, - "grad_norm": 61.27387619018555, + "grad_norm": 419.2158508300781, "learning_rate": 9.493970231495836e-06, - "loss": 11.6719, + "loss": 17.7383, "step": 644 }, { "epoch": 2.3035714285714284, - "grad_norm": 50.43906021118164, + "grad_norm": 785.4339599609375, "learning_rate": 9.49123423291422e-06, - "loss": 11.6406, + "loss": 12.7695, "step": 645 }, { "epoch": 2.307142857142857, - "grad_norm": 74.56135559082031, + "grad_norm": 178.99386596679688, "learning_rate": 9.488491254189718e-06, - "loss": 9.8945, + "loss": 30.3477, "step": 646 }, { "epoch": 2.310714285714286, - "grad_norm": 63.766048431396484, + "grad_norm": 62.72074508666992, "learning_rate": 9.48574129958538e-06, - "loss": 12.7969, + "loss": 12.1465, "step": 647 }, { "epoch": 2.314285714285714, - "grad_norm": 71.37628173828125, + "grad_norm": 76.31533813476562, "learning_rate": 9.482984373375105e-06, - "loss": 13.0859, + "loss": 11.2227, "step": 648 }, { "epoch": 2.317857142857143, - "grad_norm": 56.700477600097656, + "grad_norm": 714.2598876953125, "learning_rate": 9.480220479843627e-06, - "loss": 13.5859, + "loss": 14.9805, "step": 649 }, { "epoch": 2.3214285714285716, - "grad_norm": 63.44389724731445, + "grad_norm": 484.34716796875, "learning_rate": 9.477449623286505e-06, - "loss": 11.9219, + "loss": 19.0547, "step": 650 }, { "epoch": 2.325, - "grad_norm": 58.36225128173828, + "grad_norm": 10.13248062133789, "learning_rate": 9.474671808010126e-06, - "loss": 13.3438, + "loss": 11.0801, "step": 651 }, { "epoch": 2.3285714285714287, - "grad_norm": 62.89229202270508, + "grad_norm": 158.13528442382812, "learning_rate": 9.471887038331686e-06, - "loss": 13.8281, + "loss": 9.6191, "step": 652 }, { "epoch": 2.3321428571428573, - "grad_norm": 61.50202560424805, + "grad_norm": 169.62599182128906, "learning_rate": 9.469095318579188e-06, - "loss": 12.4648, + "loss": 10.2852, "step": 653 }, { "epoch": 2.335714285714286, - "grad_norm": 78.13320922851562, + "grad_norm": 1452.0989990234375, "learning_rate": 9.466296653091446e-06, - "loss": 14.0586, + "loss": 38.3086, "step": 654 }, { "epoch": 2.3392857142857144, - "grad_norm": 63.878074645996094, + "grad_norm": 690.172607421875, "learning_rate": 9.463491046218058e-06, - "loss": 11.0898, + "loss": 11.4004, "step": 655 }, { "epoch": 2.342857142857143, - "grad_norm": 70.62857818603516, + "grad_norm": 62.51376724243164, "learning_rate": 9.460678502319419e-06, - "loss": 11.3203, + "loss": 6.7637, "step": 656 }, { "epoch": 2.3464285714285715, - "grad_norm": 71.47483825683594, + "grad_norm": 115.6410903930664, "learning_rate": 9.457859025766696e-06, - "loss": 14.1562, + "loss": 11.6172, "step": 657 }, { "epoch": 2.35, - "grad_norm": 64.00738525390625, + "grad_norm": 1679.875244140625, "learning_rate": 9.45503262094184e-06, - "loss": 14.7031, + "loss": 27.3242, "step": 658 }, { "epoch": 2.3535714285714286, - "grad_norm": 51.51399612426758, + "grad_norm": 25.936969757080078, "learning_rate": 9.452199292237564e-06, - "loss": 12.0938, + "loss": 11.2422, "step": 659 }, { "epoch": 2.357142857142857, - "grad_norm": 70.18224334716797, + "grad_norm": 67.57929229736328, "learning_rate": 9.449359044057344e-06, - "loss": 17.3047, + "loss": 14.9844, "step": 660 }, { "epoch": 2.3607142857142858, - "grad_norm": 58.24589920043945, + "grad_norm": 98.71033477783203, "learning_rate": 9.446511880815408e-06, - "loss": 13.4922, + "loss": 25.9883, "step": 661 }, { "epoch": 2.3642857142857143, - "grad_norm": 60.900814056396484, + "grad_norm": 406.01446533203125, "learning_rate": 9.443657806936735e-06, - "loss": 14.6172, + "loss": 24.3672, "step": 662 }, { "epoch": 2.367857142857143, - "grad_norm": 73.62637329101562, + "grad_norm": 665.1796875, "learning_rate": 9.440796826857038e-06, - "loss": 14.7188, + "loss": 22.3477, "step": 663 }, { "epoch": 2.3714285714285714, - "grad_norm": 62.59147262573242, + "grad_norm": 722.241943359375, "learning_rate": 9.437928945022772e-06, - "loss": 14.9531, + "loss": 37.8047, "step": 664 }, { "epoch": 2.375, - "grad_norm": 67.35678100585938, + "grad_norm": 43.080970764160156, "learning_rate": 9.43505416589111e-06, - "loss": 15.5703, + "loss": 15.5391, "step": 665 }, { "epoch": 2.3785714285714286, - "grad_norm": 73.3077392578125, + "grad_norm": 39.54164123535156, "learning_rate": 9.432172493929949e-06, - "loss": 10.0664, + "loss": 6.1631, "step": 666 }, { "epoch": 2.382142857142857, - "grad_norm": 58.64065170288086, + "grad_norm": 50.358055114746094, "learning_rate": 9.4292839336179e-06, - "loss": 9.4883, + "loss": 6.4199, "step": 667 }, { "epoch": 2.3857142857142857, - "grad_norm": 84.39055633544922, + "grad_norm": 224.4847412109375, "learning_rate": 9.426388489444276e-06, - "loss": 14.75, + "loss": 15.4004, "step": 668 }, { "epoch": 2.3892857142857142, - "grad_norm": 76.28087615966797, + "grad_norm": 622.2835693359375, "learning_rate": 9.423486165909091e-06, - "loss": 9.5195, + "loss": 9.0664, "step": 669 }, { "epoch": 2.392857142857143, - "grad_norm": 54.03121566772461, + "grad_norm": 429.79052734375, "learning_rate": 9.420576967523049e-06, - "loss": 10.6758, + "loss": 48.0586, "step": 670 }, { "epoch": 2.3964285714285714, - "grad_norm": 57.280662536621094, + "grad_norm": 74.68653869628906, "learning_rate": 9.417660898807542e-06, - "loss": 13.7969, + "loss": 11.2754, "step": 671 }, { "epoch": 2.4, - "grad_norm": 53.454158782958984, + "grad_norm": 348.8187255859375, "learning_rate": 9.414737964294636e-06, - "loss": 11.5391, + "loss": 10.7266, "step": 672 }, { "epoch": 2.4035714285714285, - "grad_norm": 57.953346252441406, + "grad_norm": 146.97410583496094, "learning_rate": 9.411808168527068e-06, - "loss": 12.7422, + "loss": 22.4805, "step": 673 }, { "epoch": 2.407142857142857, - "grad_norm": 77.46489715576172, + "grad_norm": 277.505126953125, "learning_rate": 9.408871516058241e-06, - "loss": 20.2422, + "loss": 19.9883, "step": 674 }, { "epoch": 2.4107142857142856, - "grad_norm": 63.72200393676758, + "grad_norm": 32.917274475097656, "learning_rate": 9.405928011452211e-06, - "loss": 15.1094, + "loss": 13.9531, "step": 675 }, { "epoch": 2.414285714285714, - "grad_norm": 59.873390197753906, + "grad_norm": 958.6089477539062, "learning_rate": 9.40297765928369e-06, - "loss": 11.3672, + "loss": 18.2578, "step": 676 }, { "epoch": 2.4178571428571427, - "grad_norm": 71.02816009521484, + "grad_norm": 45.07780456542969, "learning_rate": 9.400020464138025e-06, - "loss": 12.6172, + "loss": 10.3359, "step": 677 }, { "epoch": 2.4214285714285713, - "grad_norm": 89.26254272460938, + "grad_norm": 471.2696838378906, "learning_rate": 9.3970564306112e-06, - "loss": 18.7891, + "loss": 21.418, "step": 678 }, { "epoch": 2.425, - "grad_norm": 56.43541717529297, + "grad_norm": 724.981201171875, "learning_rate": 9.394085563309827e-06, - "loss": 11.2461, + "loss": 24.3281, "step": 679 }, { "epoch": 2.4285714285714284, - "grad_norm": 53.902061462402344, + "grad_norm": 292.7646789550781, "learning_rate": 9.391107866851143e-06, - "loss": 10.9062, + "loss": 9.8301, "step": 680 }, { "epoch": 2.432142857142857, - "grad_norm": 54.39091491699219, + "grad_norm": 1413.863525390625, "learning_rate": 9.388123345862994e-06, - "loss": 9.918, + "loss": 16.9531, "step": 681 }, { "epoch": 2.435714285714286, - "grad_norm": 59.809814453125, + "grad_norm": 994.0634155273438, "learning_rate": 9.385132004983834e-06, - "loss": 10.8555, + "loss": 13.9766, "step": 682 }, { "epoch": 2.439285714285714, - "grad_norm": 58.610904693603516, + "grad_norm": 305.55462646484375, "learning_rate": 9.382133848862716e-06, - "loss": 14.1953, + "loss": 14.8984, "step": 683 }, { "epoch": 2.442857142857143, - "grad_norm": 47.29315948486328, + "grad_norm": 24.77017593383789, "learning_rate": 9.379128882159283e-06, - "loss": 10.5, + "loss": 8.2578, "step": 684 }, { "epoch": 2.4464285714285716, - "grad_norm": 57.30778121948242, + "grad_norm": 492.9799499511719, "learning_rate": 9.376117109543769e-06, - "loss": 12.1914, + "loss": 11.9453, "step": 685 }, { "epoch": 2.45, - "grad_norm": 53.31981658935547, + "grad_norm": 662.0299072265625, "learning_rate": 9.37309853569698e-06, - "loss": 13.3203, + "loss": 13.7031, "step": 686 }, { "epoch": 2.4535714285714287, - "grad_norm": 64.80961608886719, + "grad_norm": 6.7692131996154785, "learning_rate": 9.370073165310292e-06, - "loss": 14.6719, + "loss": 12.0469, "step": 687 }, { "epoch": 2.4571428571428573, - "grad_norm": 63.99576187133789, + "grad_norm": 260.8575744628906, "learning_rate": 9.36704100308565e-06, - "loss": 11.0508, + "loss": 8.5137, "step": 688 }, { "epoch": 2.460714285714286, - "grad_norm": 61.00963592529297, + "grad_norm": 254.4297332763672, "learning_rate": 9.364002053735546e-06, - "loss": 14.3281, + "loss": 12.1367, "step": 689 }, { "epoch": 2.4642857142857144, - "grad_norm": 63.51106262207031, + "grad_norm": 154.5642852783203, "learning_rate": 9.360956321983028e-06, - "loss": 11.7891, + "loss": 9.2422, "step": 690 }, { "epoch": 2.467857142857143, - "grad_norm": 69.38475799560547, + "grad_norm": 33.188560485839844, "learning_rate": 9.35790381256168e-06, - "loss": 10.6953, + "loss": 7.7285, "step": 691 }, { "epoch": 2.4714285714285715, - "grad_norm": 52.3367919921875, + "grad_norm": 177.5060577392578, "learning_rate": 9.354844530215621e-06, - "loss": 13.0156, + "loss": 10.7422, "step": 692 }, { "epoch": 2.475, - "grad_norm": 100.41969299316406, + "grad_norm": 238.21212768554688, "learning_rate": 9.351778479699499e-06, - "loss": 18.2383, + "loss": 18.832, "step": 693 }, { "epoch": 2.4785714285714286, - "grad_norm": 54.77377700805664, + "grad_norm": 65.51685333251953, "learning_rate": 9.348705665778479e-06, - "loss": 13.0234, + "loss": 13.4297, "step": 694 }, { "epoch": 2.482142857142857, - "grad_norm": 60.49354934692383, + "grad_norm": 39.57070541381836, "learning_rate": 9.345626093228233e-06, - "loss": 13.6094, + "loss": 12.2637, "step": 695 }, { "epoch": 2.4857142857142858, - "grad_norm": 70.97604370117188, + "grad_norm": 726.024169921875, "learning_rate": 9.342539766834945e-06, - "loss": 13.0234, + "loss": 23.8262, "step": 696 }, { "epoch": 2.4892857142857143, - "grad_norm": 89.12297058105469, + "grad_norm": 20.046306610107422, "learning_rate": 9.339446691395292e-06, - "loss": 12.6836, + "loss": 8.4043, "step": 697 }, { "epoch": 2.492857142857143, - "grad_norm": 60.37810134887695, + "grad_norm": 28.2738037109375, "learning_rate": 9.336346871716438e-06, - "loss": 15.2969, + "loss": 13.9492, "step": 698 }, { "epoch": 2.4964285714285714, - "grad_norm": 63.56589126586914, + "grad_norm": 214.42767333984375, "learning_rate": 9.33324031261603e-06, - "loss": 13.4297, + "loss": 12.8906, "step": 699 }, { "epoch": 2.5, - "grad_norm": 65.12931060791016, + "grad_norm": 65.22955322265625, "learning_rate": 9.330127018922195e-06, - "loss": 14.3516, + "loss": 25.6074, "step": 700 }, { "epoch": 2.5035714285714286, - "grad_norm": 52.17057800292969, + "grad_norm": 596.093994140625, "learning_rate": 9.327006995473515e-06, - "loss": 10.4844, + "loss": 11.5215, "step": 701 }, { "epoch": 2.507142857142857, - "grad_norm": 50.48122024536133, + "grad_norm": 1500.9698486328125, "learning_rate": 9.323880247119041e-06, - "loss": 9.9609, + "loss": 18.2676, "step": 702 }, { "epoch": 2.5107142857142857, - "grad_norm": 53.59724044799805, + "grad_norm": 8.157247543334961, "learning_rate": 9.320746778718274e-06, - "loss": 13.6406, + "loss": 11.5957, "step": 703 }, { "epoch": 2.5142857142857142, - "grad_norm": 57.976585388183594, + "grad_norm": 1026.89013671875, "learning_rate": 9.317606595141156e-06, - "loss": 15.6875, + "loss": 23.2891, "step": 704 }, { "epoch": 2.517857142857143, - "grad_norm": 63.404048919677734, + "grad_norm": 462.71966552734375, "learning_rate": 9.314459701268065e-06, - "loss": 10.7031, + "loss": 8.1875, "step": 705 }, { "epoch": 2.5214285714285714, - "grad_norm": 47.50910568237305, + "grad_norm": 632.51513671875, "learning_rate": 9.311306101989814e-06, - "loss": 9.7852, + "loss": 22.5508, "step": 706 }, { "epoch": 2.525, - "grad_norm": 62.52707290649414, + "grad_norm": 20.896238327026367, "learning_rate": 9.30814580220763e-06, - "loss": 11.75, + "loss": 9.1504, "step": 707 }, { "epoch": 2.5285714285714285, - "grad_norm": 67.43305206298828, + "grad_norm": 152.9181365966797, "learning_rate": 9.304978806833158e-06, - "loss": 9.3633, + "loss": 7.0156, "step": 708 }, { "epoch": 2.532142857142857, - "grad_norm": 59.435020446777344, + "grad_norm": 19.61768913269043, "learning_rate": 9.30180512078845e-06, - "loss": 13.0859, + "loss": 10.7852, "step": 709 }, { "epoch": 2.5357142857142856, - "grad_norm": 54.168094635009766, + "grad_norm": 133.75831604003906, "learning_rate": 9.298624749005953e-06, - "loss": 11.4688, + "loss": 11.1719, "step": 710 }, { "epoch": 2.539285714285714, - "grad_norm": 64.08353424072266, + "grad_norm": 157.88388061523438, "learning_rate": 9.295437696428504e-06, - "loss": 13.7305, + "loss": 26.0645, "step": 711 }, { "epoch": 2.5428571428571427, - "grad_norm": 53.95362091064453, + "grad_norm": 51.52932357788086, "learning_rate": 9.292243968009332e-06, - "loss": 11.8086, + "loss": 9.9746, "step": 712 }, { "epoch": 2.5464285714285713, - "grad_norm": 48.95252227783203, + "grad_norm": 22.628358840942383, "learning_rate": 9.289043568712029e-06, - "loss": 11.1562, + "loss": 9.8086, "step": 713 }, { "epoch": 2.55, - "grad_norm": 49.33997344970703, + "grad_norm": 834.8486938476562, "learning_rate": 9.285836503510562e-06, - "loss": 12.0977, + "loss": 19.9609, "step": 714 }, { "epoch": 2.553571428571429, - "grad_norm": 67.77393341064453, + "grad_norm": 818.1465454101562, "learning_rate": 9.282622777389258e-06, - "loss": 16.8945, + "loss": 21.4512, "step": 715 }, { "epoch": 2.557142857142857, - "grad_norm": 66.34770965576172, + "grad_norm": 45.97016906738281, "learning_rate": 9.279402395342794e-06, - "loss": 13.8633, + "loss": 11.8711, "step": 716 }, { "epoch": 2.560714285714286, - "grad_norm": 46.717464447021484, + "grad_norm": 56.74850845336914, "learning_rate": 9.276175362376191e-06, - "loss": 10.9766, + "loss": 9.9531, "step": 717 }, { "epoch": 2.564285714285714, - "grad_norm": 66.04427337646484, + "grad_norm": 1224.8909912109375, "learning_rate": 9.27294168350481e-06, - "loss": 14.2227, + "loss": 29.6836, "step": 718 }, { "epoch": 2.567857142857143, - "grad_norm": 78.49992370605469, + "grad_norm": 12.08031177520752, "learning_rate": 9.269701363754335e-06, - "loss": 15.3867, + "loss": 14.3438, "step": 719 }, { "epoch": 2.571428571428571, - "grad_norm": 53.65013122558594, + "grad_norm": 573.0210571289062, "learning_rate": 9.266454408160779e-06, - "loss": 12.0859, + "loss": 21.0859, "step": 720 }, { "epoch": 2.575, - "grad_norm": 54.41189193725586, + "grad_norm": 21.096519470214844, "learning_rate": 9.263200821770462e-06, - "loss": 11.582, + "loss": 8.5117, "step": 721 }, { "epoch": 2.5785714285714287, - "grad_norm": 51.12994384765625, + "grad_norm": 24.024465560913086, "learning_rate": 9.25994060964001e-06, - "loss": 11.7031, + "loss": 9.1602, "step": 722 }, { "epoch": 2.5821428571428573, - "grad_norm": 55.230430603027344, + "grad_norm": 12.084073066711426, "learning_rate": 9.25667377683635e-06, - "loss": 12.8008, + "loss": 12.4062, "step": 723 }, { "epoch": 2.585714285714286, - "grad_norm": 57.75759506225586, + "grad_norm": 185.01287841796875, "learning_rate": 9.253400328436699e-06, - "loss": 9.3711, + "loss": 19.1445, "step": 724 }, { "epoch": 2.5892857142857144, - "grad_norm": 59.28636932373047, + "grad_norm": 315.2637023925781, "learning_rate": 9.250120269528546e-06, - "loss": 13.5078, + "loss": 23.4141, "step": 725 }, { "epoch": 2.592857142857143, - "grad_norm": 55.35805130004883, + "grad_norm": 1217.78369140625, "learning_rate": 9.246833605209669e-06, - "loss": 11.0625, + "loss": 16.6309, "step": 726 }, { "epoch": 2.5964285714285715, - "grad_norm": 56.72629165649414, + "grad_norm": 15.977570533752441, "learning_rate": 9.243540340588097e-06, - "loss": 9.7891, + "loss": 8.4395, "step": 727 }, { "epoch": 2.6, - "grad_norm": 59.329376220703125, + "grad_norm": 22.744813919067383, "learning_rate": 9.24024048078213e-06, - "loss": 13.5312, + "loss": 12.0586, "step": 728 }, { "epoch": 2.6035714285714286, - "grad_norm": 53.99064636230469, + "grad_norm": 92.36138916015625, "learning_rate": 9.236934030920309e-06, - "loss": 11.9102, + "loss": 9.6309, "step": 729 }, { "epoch": 2.607142857142857, - "grad_norm": 54.379417419433594, + "grad_norm": 4.717357158660889, "learning_rate": 9.233620996141421e-06, - "loss": 10.0625, + "loss": 7.6641, "step": 730 }, { "epoch": 2.6107142857142858, - "grad_norm": 62.70623016357422, + "grad_norm": 1127.933349609375, "learning_rate": 9.230301381594487e-06, - "loss": 11.1641, + "loss": 21.4336, "step": 731 }, { "epoch": 2.6142857142857143, - "grad_norm": 52.003902435302734, + "grad_norm": 477.7636413574219, "learning_rate": 9.226975192438752e-06, - "loss": 10.8281, + "loss": 12.1211, "step": 732 }, { "epoch": 2.617857142857143, - "grad_norm": 48.416202545166016, + "grad_norm": 8.769879341125488, "learning_rate": 9.22364243384368e-06, - "loss": 10.7891, + "loss": 9.6875, "step": 733 }, { "epoch": 2.6214285714285714, - "grad_norm": 59.7373161315918, + "grad_norm": 207.09194946289062, "learning_rate": 9.220303110988947e-06, - "loss": 11.5508, + "loss": 9.3047, "step": 734 }, { "epoch": 2.625, - "grad_norm": 56.96145248413086, + "grad_norm": 904.828857421875, "learning_rate": 9.21695722906443e-06, - "loss": 10.4141, + "loss": 12.3555, "step": 735 }, { "epoch": 2.6285714285714286, - "grad_norm": 73.68313598632812, + "grad_norm": 27.428268432617188, "learning_rate": 9.213604793270196e-06, - "loss": 18.1602, + "loss": 16.5176, "step": 736 }, { "epoch": 2.632142857142857, - "grad_norm": 60.83184051513672, + "grad_norm": 8.852267265319824, "learning_rate": 9.210245808816505e-06, - "loss": 10.2305, + "loss": 6.7168, "step": 737 }, { "epoch": 2.6357142857142857, - "grad_norm": 59.33787155151367, + "grad_norm": 58.678401947021484, "learning_rate": 9.20688028092379e-06, - "loss": 10.8906, + "loss": 9.8242, "step": 738 }, { "epoch": 2.6392857142857142, - "grad_norm": 74.77108001708984, + "grad_norm": 1124.845947265625, "learning_rate": 9.203508214822652e-06, - "loss": 13.0312, + "loss": 23.7539, "step": 739 }, { "epoch": 2.642857142857143, - "grad_norm": 62.380706787109375, + "grad_norm": 644.85986328125, "learning_rate": 9.200129615753858e-06, - "loss": 11.2617, + "loss": 28.459, "step": 740 }, { "epoch": 2.6464285714285714, - "grad_norm": 65.22488403320312, + "grad_norm": 232.95516967773438, "learning_rate": 9.196744488968327e-06, - "loss": 12.0078, + "loss": 8.7656, "step": 741 }, { "epoch": 2.65, - "grad_norm": 62.31855010986328, + "grad_norm": 62.42345428466797, "learning_rate": 9.193352839727122e-06, - "loss": 13.6953, + "loss": 11.8438, "step": 742 }, { "epoch": 2.6535714285714285, - "grad_norm": 58.02761459350586, + "grad_norm": 177.1497039794922, "learning_rate": 9.18995467330144e-06, - "loss": 13.0078, + "loss": 11.1875, "step": 743 }, { "epoch": 2.657142857142857, - "grad_norm": 49.560829162597656, + "grad_norm": 706.6404418945312, "learning_rate": 9.186549994972618e-06, - "loss": 10.6641, + "loss": 11.0625, "step": 744 }, { "epoch": 2.6607142857142856, - "grad_norm": 55.47771453857422, + "grad_norm": 28.481101989746094, "learning_rate": 9.1831388100321e-06, - "loss": 12.1367, + "loss": 9.9531, "step": 745 }, { "epoch": 2.664285714285714, - "grad_norm": 53.42022705078125, + "grad_norm": 1024.6275634765625, "learning_rate": 9.179721123781448e-06, - "loss": 11.8359, + "loss": 22.2617, "step": 746 }, { "epoch": 2.6678571428571427, - "grad_norm": 53.15449905395508, + "grad_norm": 31.688383102416992, "learning_rate": 9.176296941532332e-06, - "loss": 13.6328, + "loss": 12.9258, "step": 747 }, { "epoch": 2.6714285714285713, - "grad_norm": 62.184326171875, + "grad_norm": 1345.9156494140625, "learning_rate": 9.172866268606514e-06, - "loss": 10.3164, + "loss": 45.5625, "step": 748 }, { "epoch": 2.675, - "grad_norm": 69.34342193603516, + "grad_norm": 942.7173461914062, "learning_rate": 9.169429110335842e-06, - "loss": 14.1094, + "loss": 22.8457, "step": 749 }, { "epoch": 2.678571428571429, - "grad_norm": 56.424896240234375, + "grad_norm": 1128.6046142578125, "learning_rate": 9.165985472062245e-06, - "loss": 11.7031, + "loss": 19.6836, "step": 750 }, { "epoch": 2.682142857142857, - "grad_norm": 63.94963455200195, + "grad_norm": 956.705322265625, "learning_rate": 9.162535359137726e-06, - "loss": 11.7148, + "loss": 13.3477, "step": 751 }, { "epoch": 2.685714285714286, - "grad_norm": 72.2540512084961, + "grad_norm": 304.337890625, "learning_rate": 9.159078776924347e-06, - "loss": 11.4492, + "loss": 22.9219, "step": 752 }, { "epoch": 2.689285714285714, - "grad_norm": 56.853919982910156, + "grad_norm": 256.0356750488281, "learning_rate": 9.155615730794223e-06, - "loss": 12.4688, + "loss": 13.4141, "step": 753 }, { "epoch": 2.692857142857143, - "grad_norm": 51.40186309814453, + "grad_norm": 79.6223373413086, "learning_rate": 9.152146226129519e-06, - "loss": 11.625, + "loss": 23.8398, "step": 754 }, { "epoch": 2.696428571428571, - "grad_norm": 46.99700164794922, + "grad_norm": 6.170767784118652, "learning_rate": 9.148670268322439e-06, - "loss": 10.5391, + "loss": 8.1777, "step": 755 }, { "epoch": 2.7, - "grad_norm": 53.19974136352539, + "grad_norm": 78.38643646240234, "learning_rate": 9.145187862775208e-06, - "loss": 12.3984, + "loss": 9.5195, "step": 756 }, { "epoch": 2.7035714285714287, - "grad_norm": 61.41981506347656, + "grad_norm": 1634.88916015625, "learning_rate": 9.141699014900084e-06, - "loss": 13.5742, + "loss": 33.1426, "step": 757 }, { "epoch": 2.7071428571428573, - "grad_norm": 64.33472442626953, + "grad_norm": 1171.101318359375, "learning_rate": 9.138203730119326e-06, - "loss": 10.9453, + "loss": 26.3672, "step": 758 }, { "epoch": 2.710714285714286, - "grad_norm": 57.124114990234375, + "grad_norm": 171.91444396972656, "learning_rate": 9.1347020138652e-06, - "loss": 12.1289, + "loss": 9.5508, "step": 759 }, { "epoch": 2.7142857142857144, - "grad_norm": 52.198486328125, + "grad_norm": 5.476727485656738, "learning_rate": 9.131193871579975e-06, - "loss": 9.5312, + "loss": 7.0469, "step": 760 }, { "epoch": 2.717857142857143, - "grad_norm": 61.45719528198242, + "grad_norm": 131.83688354492188, "learning_rate": 9.1276793087159e-06, - "loss": 13.0781, + "loss": 11.0332, "step": 761 }, { "epoch": 2.7214285714285715, - "grad_norm": 48.156272888183594, + "grad_norm": 22.963092803955078, "learning_rate": 9.1241583307352e-06, - "loss": 11.3594, + "loss": 9.498, "step": 762 }, { "epoch": 2.725, - "grad_norm": 69.11106872558594, + "grad_norm": 1097.086669921875, "learning_rate": 9.120630943110078e-06, - "loss": 14.0781, + "loss": 41.4922, "step": 763 }, { "epoch": 2.7285714285714286, - "grad_norm": 58.01288604736328, + "grad_norm": 7.798305511474609, "learning_rate": 9.117097151322697e-06, - "loss": 11.2422, + "loss": 8.0234, "step": 764 }, { "epoch": 2.732142857142857, - "grad_norm": 60.284427642822266, + "grad_norm": 235.2790069580078, "learning_rate": 9.113556960865168e-06, - "loss": 12.1641, + "loss": 10.0957, "step": 765 }, { "epoch": 2.7357142857142858, - "grad_norm": 50.124717712402344, + "grad_norm": 337.505859375, "learning_rate": 9.110010377239552e-06, - "loss": 10.1406, + "loss": 8.4375, "step": 766 }, { "epoch": 2.7392857142857143, - "grad_norm": 59.82373046875, + "grad_norm": 811.5652465820312, "learning_rate": 9.10645740595784e-06, - "loss": 11.0195, + "loss": 13.1504, "step": 767 }, { "epoch": 2.742857142857143, - "grad_norm": 75.27725982666016, + "grad_norm": 323.7221374511719, "learning_rate": 9.102898052541959e-06, - "loss": 15.5547, + "loss": 14.8047, "step": 768 }, { "epoch": 2.7464285714285714, - "grad_norm": 52.63650131225586, + "grad_norm": 39.39506912231445, "learning_rate": 9.099332322523748e-06, - "loss": 9.3828, + "loss": 6.5234, "step": 769 }, { "epoch": 2.75, - "grad_norm": 59.20087814331055, + "grad_norm": 14.85326099395752, "learning_rate": 9.09576022144496e-06, - "loss": 13.8203, + "loss": 12.9648, "step": 770 }, { "epoch": 2.7535714285714286, - "grad_norm": 55.335445404052734, + "grad_norm": 896.6591186523438, "learning_rate": 9.092181754857247e-06, - "loss": 12.0117, + "loss": 14.0156, "step": 771 }, { "epoch": 2.757142857142857, - "grad_norm": 65.19805145263672, + "grad_norm": 1303.8359375, "learning_rate": 9.088596928322158e-06, - "loss": 14.6797, + "loss": 20.8652, "step": 772 }, { "epoch": 2.7607142857142857, - "grad_norm": 50.827369689941406, + "grad_norm": 249.16659545898438, "learning_rate": 9.085005747411121e-06, - "loss": 12.9102, + "loss": 24.209, "step": 773 }, { "epoch": 2.7642857142857142, - "grad_norm": 57.56480026245117, + "grad_norm": 7.906393527984619, "learning_rate": 9.081408217705446e-06, - "loss": 13.0977, + "loss": 9.4453, "step": 774 }, { "epoch": 2.767857142857143, - "grad_norm": 51.521976470947266, + "grad_norm": 580.0657348632812, "learning_rate": 9.077804344796302e-06, - "loss": 11.9648, + "loss": 14.0586, "step": 775 }, { "epoch": 2.7714285714285714, - "grad_norm": 84.47215270996094, + "grad_norm": 284.5125732421875, "learning_rate": 9.074194134284726e-06, - "loss": 18.4297, + "loss": 19.0859, "step": 776 }, { "epoch": 2.775, - "grad_norm": 67.8520278930664, + "grad_norm": 22.102306365966797, "learning_rate": 9.070577591781598e-06, - "loss": 13.5859, + "loss": 13.6895, "step": 777 }, { "epoch": 2.7785714285714285, - "grad_norm": 55.66756820678711, + "grad_norm": 1187.8800048828125, "learning_rate": 9.066954722907639e-06, - "loss": 11.7891, + "loss": 24.9609, "step": 778 }, { "epoch": 2.782142857142857, - "grad_norm": 52.24456024169922, + "grad_norm": 10.090874671936035, "learning_rate": 9.063325533293409e-06, - "loss": 11.2031, + "loss": 10.2969, "step": 779 }, { "epoch": 2.7857142857142856, - "grad_norm": 63.32294845581055, + "grad_norm": 233.37644958496094, "learning_rate": 9.059690028579285e-06, - "loss": 13.0391, + "loss": 23.0117, "step": 780 }, { "epoch": 2.789285714285714, - "grad_norm": 57.61387252807617, + "grad_norm": 224.49395751953125, "learning_rate": 9.056048214415456e-06, - "loss": 13.0156, + "loss": 12.2578, "step": 781 }, { "epoch": 2.7928571428571427, - "grad_norm": 51.528228759765625, + "grad_norm": 216.9644317626953, "learning_rate": 9.052400096461928e-06, - "loss": 11.207, + "loss": 10.2188, "step": 782 }, { "epoch": 2.7964285714285713, - "grad_norm": 62.744815826416016, + "grad_norm": 221.7606964111328, "learning_rate": 9.048745680388493e-06, - "loss": 11.2109, + "loss": 7.918, "step": 783 }, { "epoch": 2.8, - "grad_norm": 47.136600494384766, + "grad_norm": 10.024370193481445, "learning_rate": 9.045084971874738e-06, - "loss": 9.8555, + "loss": 8.0723, "step": 784 }, { "epoch": 2.803571428571429, - "grad_norm": 58.11830139160156, + "grad_norm": 34.50477981567383, "learning_rate": 9.041417976610028e-06, - "loss": 9.5703, + "loss": 8.2344, "step": 785 }, { "epoch": 2.807142857142857, - "grad_norm": 55.830387115478516, + "grad_norm": 615.3500366210938, "learning_rate": 9.037744700293497e-06, - "loss": 9.6875, + "loss": 8.7744, "step": 786 }, { "epoch": 2.810714285714286, - "grad_norm": 64.70860290527344, + "grad_norm": 28.558334350585938, "learning_rate": 9.03406514863404e-06, - "loss": 11.3828, + "loss": 9.3809, "step": 787 }, { "epoch": 2.814285714285714, - "grad_norm": 49.32529067993164, + "grad_norm": 1122.27490234375, "learning_rate": 9.030379327350311e-06, - "loss": 9.5352, + "loss": 20.9883, "step": 788 }, { "epoch": 2.817857142857143, - "grad_norm": 57.78901672363281, + "grad_norm": 228.27020263671875, "learning_rate": 9.026687242170701e-06, - "loss": 13.0977, + "loss": 11.3848, "step": 789 }, { "epoch": 2.821428571428571, - "grad_norm": 61.277198791503906, + "grad_norm": 488.15350341796875, "learning_rate": 9.022988898833342e-06, - "loss": 11.9531, + "loss": 10.668, "step": 790 }, { "epoch": 2.825, - "grad_norm": 57.05833435058594, + "grad_norm": 1190.7825927734375, "learning_rate": 9.019284303086086e-06, - "loss": 13.5078, + "loss": 18.207, "step": 791 }, { "epoch": 2.8285714285714287, - "grad_norm": 62.57944869995117, + "grad_norm": 355.278564453125, "learning_rate": 9.01557346068651e-06, - "loss": 12.9297, + "loss": 13.2051, "step": 792 }, { "epoch": 2.8321428571428573, - "grad_norm": 54.83690643310547, + "grad_norm": 689.2097778320312, "learning_rate": 9.011856377401891e-06, - "loss": 11.7891, + "loss": 12.3242, "step": 793 }, { "epoch": 2.835714285714286, - "grad_norm": 64.10013580322266, + "grad_norm": 17.987226486206055, "learning_rate": 9.00813305900921e-06, - "loss": 13.7188, + "loss": 11.543, "step": 794 }, { "epoch": 2.8392857142857144, - "grad_norm": 57.04679489135742, + "grad_norm": 732.38720703125, "learning_rate": 9.004403511295141e-06, - "loss": 11.6406, + "loss": 19.8984, "step": 795 }, { "epoch": 2.842857142857143, - "grad_norm": 53.89971160888672, + "grad_norm": 17.91159439086914, "learning_rate": 9.000667740056033e-06, - "loss": 11.4688, + "loss": 7.6172, "step": 796 }, { "epoch": 2.8464285714285715, - "grad_norm": 56.299171447753906, + "grad_norm": 11.518808364868164, "learning_rate": 8.996925751097911e-06, - "loss": 10.5781, + "loss": 7.6406, "step": 797 }, { "epoch": 2.85, - "grad_norm": 61.95994567871094, + "grad_norm": 302.9195861816406, "learning_rate": 8.993177550236464e-06, - "loss": 13.0469, + "loss": 12.7461, "step": 798 }, { "epoch": 2.8535714285714286, - "grad_norm": 69.69711303710938, + "grad_norm": 862.9013061523438, "learning_rate": 8.989423143297036e-06, - "loss": 14.418, + "loss": 19.2734, "step": 799 }, { "epoch": 2.857142857142857, - "grad_norm": 56.747093200683594, + "grad_norm": 15.42437744140625, "learning_rate": 8.985662536114614e-06, - "loss": 12.2109, + "loss": 11.2969, "step": 800 }, { "epoch": 2.8607142857142858, - "grad_norm": 61.4937629699707, + "grad_norm": 702.300537109375, "learning_rate": 8.981895734533818e-06, - "loss": 12.5781, + "loss": 14.6562, "step": 801 }, { "epoch": 2.8642857142857143, - "grad_norm": 57.091339111328125, + "grad_norm": 461.8436279296875, "learning_rate": 8.978122744408905e-06, - "loss": 13.4883, + "loss": 13.2734, "step": 802 }, { "epoch": 2.867857142857143, - "grad_norm": 55.88637161254883, + "grad_norm": 53.064266204833984, "learning_rate": 8.974343571603743e-06, - "loss": 11.7461, + "loss": 11.0098, "step": 803 }, { "epoch": 2.8714285714285714, - "grad_norm": 51.81855010986328, + "grad_norm": 109.18843078613281, "learning_rate": 8.970558221991806e-06, - "loss": 11.7617, + "loss": 10.4902, "step": 804 }, { "epoch": 2.875, - "grad_norm": 54.484405517578125, + "grad_norm": 1200.982421875, "learning_rate": 8.966766701456177e-06, - "loss": 10.4844, + "loss": 38.1211, "step": 805 }, { "epoch": 2.8785714285714286, - "grad_norm": 63.39767837524414, + "grad_norm": 85.99945831298828, "learning_rate": 8.962969015889522e-06, - "loss": 10.4375, + "loss": 7.8281, "step": 806 }, { "epoch": 2.882142857142857, - "grad_norm": 58.7789192199707, + "grad_norm": 113.17164611816406, "learning_rate": 8.959165171194091e-06, - "loss": 9.8828, + "loss": 10.3359, "step": 807 }, { "epoch": 2.8857142857142857, - "grad_norm": 63.376155853271484, + "grad_norm": 54.82890319824219, "learning_rate": 8.955355173281709e-06, - "loss": 8.7422, + "loss": 5.0996, "step": 808 }, { "epoch": 2.8892857142857142, - "grad_norm": 49.106380462646484, + "grad_norm": 65.0638198852539, "learning_rate": 8.951539028073757e-06, - "loss": 10.0469, + "loss": 8.1758, "step": 809 }, { "epoch": 2.892857142857143, - "grad_norm": 57.181522369384766, + "grad_norm": 686.844970703125, "learning_rate": 8.947716741501178e-06, - "loss": 10.8945, + "loss": 12.6074, "step": 810 }, { "epoch": 2.8964285714285714, - "grad_norm": 101.49259948730469, + "grad_norm": 9.142158508300781, "learning_rate": 8.943888319504456e-06, - "loss": 15.3828, + "loss": 16.3555, "step": 811 }, { "epoch": 2.9, - "grad_norm": 60.89630126953125, + "grad_norm": 760.6893920898438, "learning_rate": 8.94005376803361e-06, - "loss": 11.7031, + "loss": 16.0664, "step": 812 }, { "epoch": 2.9035714285714285, - "grad_norm": 65.40675354003906, + "grad_norm": 377.252685546875, "learning_rate": 8.936213093048187e-06, - "loss": 11.0938, + "loss": 10.959, "step": 813 }, { "epoch": 2.907142857142857, - "grad_norm": 63.66600036621094, + "grad_norm": 87.01111602783203, "learning_rate": 8.93236630051725e-06, - "loss": 15.3242, + "loss": 14.3672, "step": 814 }, { "epoch": 2.9107142857142856, - "grad_norm": 66.3242416381836, + "grad_norm": 33.42525863647461, "learning_rate": 8.92851339641937e-06, - "loss": 12.8984, + "loss": 10.3027, "step": 815 }, { "epoch": 2.914285714285714, - "grad_norm": 62.32695388793945, + "grad_norm": 581.3942260742188, "learning_rate": 8.924654386742613e-06, - "loss": 11.7891, + "loss": 21.6211, "step": 816 }, { "epoch": 2.9178571428571427, - "grad_norm": 51.0781135559082, + "grad_norm": 104.0300064086914, "learning_rate": 8.92078927748454e-06, - "loss": 12.3203, + "loss": 10.1582, "step": 817 }, { "epoch": 2.9214285714285713, - "grad_norm": 71.10233306884766, + "grad_norm": 812.6210327148438, "learning_rate": 8.91691807465219e-06, - "loss": 11.8945, + "loss": 30.1836, "step": 818 }, { "epoch": 2.925, - "grad_norm": 70.24767303466797, + "grad_norm": 23.633996963500977, "learning_rate": 8.91304078426207e-06, - "loss": 11.5938, + "loss": 9.8047, "step": 819 }, { "epoch": 2.928571428571429, - "grad_norm": 47.404422760009766, + "grad_norm": 54.0675048828125, "learning_rate": 8.90915741234015e-06, - "loss": 9.9023, + "loss": 7.498, "step": 820 }, { "epoch": 2.932142857142857, - "grad_norm": 46.73117446899414, + "grad_norm": 576.3934936523438, "learning_rate": 8.905267964921852e-06, - "loss": 8.875, + "loss": 9.043, "step": 821 }, { "epoch": 2.935714285714286, - "grad_norm": 62.030296325683594, + "grad_norm": 102.47286987304688, "learning_rate": 8.901372448052036e-06, - "loss": 12.7578, + "loss": 13.8086, "step": 822 }, { "epoch": 2.939285714285714, - "grad_norm": 53.38199996948242, + "grad_norm": 19.27668571472168, "learning_rate": 8.897470867785002e-06, - "loss": 11.4219, + "loss": 21.3867, "step": 823 }, { "epoch": 2.942857142857143, - "grad_norm": 49.46811294555664, + "grad_norm": 4.520827293395996, "learning_rate": 8.89356323018447e-06, - "loss": 10.6016, + "loss": 9.207, "step": 824 }, { "epoch": 2.946428571428571, - "grad_norm": 94.73876190185547, + "grad_norm": 213.93333435058594, "learning_rate": 8.889649541323575e-06, - "loss": 15.3047, + "loss": 16.7695, "step": 825 }, { "epoch": 2.95, - "grad_norm": 65.01333618164062, + "grad_norm": 52.276058197021484, "learning_rate": 8.885729807284855e-06, - "loss": 9.7188, + "loss": 6.916, "step": 826 }, { "epoch": 2.9535714285714287, - "grad_norm": 56.35231399536133, + "grad_norm": 112.12921142578125, "learning_rate": 8.881804034160244e-06, - "loss": 10.8125, + "loss": 9.248, "step": 827 }, { "epoch": 2.9571428571428573, - "grad_norm": 60.141719818115234, + "grad_norm": 330.92926025390625, "learning_rate": 8.877872228051061e-06, - "loss": 10.8438, + "loss": 9.5469, "step": 828 }, { "epoch": 2.960714285714286, - "grad_norm": 60.45473861694336, + "grad_norm": 151.19442749023438, "learning_rate": 8.873934395068006e-06, - "loss": 11.2539, + "loss": 10.457, "step": 829 }, { "epoch": 2.9642857142857144, - "grad_norm": 72.13605499267578, + "grad_norm": 141.4802703857422, "learning_rate": 8.869990541331137e-06, - "loss": 12.8125, + "loss": 13.457, "step": 830 }, { "epoch": 2.967857142857143, - "grad_norm": 61.62421417236328, + "grad_norm": 25.79639434814453, "learning_rate": 8.86604067296988e-06, - "loss": 12.7812, + "loss": 10.9727, "step": 831 }, { "epoch": 2.9714285714285715, - "grad_norm": 55.35321807861328, + "grad_norm": 207.466796875, "learning_rate": 8.862084796122998e-06, - "loss": 10.6211, + "loss": 9.6562, "step": 832 }, { "epoch": 2.975, - "grad_norm": 57.74436569213867, + "grad_norm": 116.33453369140625, "learning_rate": 8.858122916938601e-06, - "loss": 11.9141, + "loss": 10.418, "step": 833 }, { "epoch": 2.9785714285714286, - "grad_norm": 51.90534591674805, + "grad_norm": 145.76332092285156, "learning_rate": 8.854155041574121e-06, - "loss": 10.3477, + "loss": 19.8164, "step": 834 }, { "epoch": 2.982142857142857, - "grad_norm": 54.02871322631836, + "grad_norm": 952.1488647460938, "learning_rate": 8.850181176196316e-06, - "loss": 14.8516, + "loss": 33.7891, "step": 835 }, { "epoch": 2.9857142857142858, - "grad_norm": 52.91661834716797, + "grad_norm": 740.1598510742188, "learning_rate": 8.846201326981245e-06, - "loss": 10.8125, + "loss": 16.0547, "step": 836 }, { "epoch": 2.9892857142857143, - "grad_norm": 56.72195053100586, + "grad_norm": 1072.1513671875, "learning_rate": 8.842215500114274e-06, - "loss": 15.0938, + "loss": 22.5742, "step": 837 }, { "epoch": 2.992857142857143, - "grad_norm": 56.804229736328125, + "grad_norm": 621.56689453125, "learning_rate": 8.838223701790057e-06, - "loss": 13.2227, + "loss": 28.4922, "step": 838 }, { "epoch": 2.9964285714285714, - "grad_norm": 57.880767822265625, + "grad_norm": 34.00000762939453, "learning_rate": 8.834225938212528e-06, - "loss": 10.0664, + "loss": 8.5645, "step": 839 }, { "epoch": 3.0, - "grad_norm": 50.92356491088867, + "grad_norm": 8.153974533081055, "learning_rate": 8.83022221559489e-06, - "loss": 12.6055, + "loss": 11.7539, "step": 840 }, { "epoch": 3.0, - "eval_loss": 11.804313659667969, - "eval_mse": 11.803122182773812, - "eval_runtime": 11.2816, - "eval_samples_per_second": 251.736, - "eval_steps_per_second": 1.33, - "eval_target_0_mse": 21.369306183423156, - "eval_target_1_mse": 11.972847315979143, - "eval_target_2_mse": 7.081936090942864, - "eval_target_3_mse": 6.788399140750078, + "eval_loss": 14.04477596282959, + "eval_mse": 14.04690981281001, + "eval_runtime": 11.331, + "eval_samples_per_second": 250.639, + "eval_steps_per_second": 1.324, + "eval_target_0_mse": 37.027201126758776, + "eval_target_1_mse": 10.425861282454203, + "eval_target_2_mse": 5.588847796433989, + "eval_target_3_mse": 3.1457290455930806, "step": 840 }, { "epoch": 3.0035714285714286, - "grad_norm": 61.716922760009766, + "grad_norm": 1060.4013671875, "learning_rate": 8.826212540159615e-06, - "loss": 11.2109, + "loss": 21.4492, "step": 841 }, { "epoch": 3.007142857142857, - "grad_norm": 52.246131896972656, + "grad_norm": 8.906164169311523, "learning_rate": 8.822196918138416e-06, - "loss": 8.9648, + "loss": 6.3105, "step": 842 }, { "epoch": 3.0107142857142857, - "grad_norm": 47.40808868408203, + "grad_norm": 29.82748031616211, "learning_rate": 8.818175355772259e-06, - "loss": 10.3281, + "loss": 10.1836, "step": 843 }, { "epoch": 3.0142857142857142, - "grad_norm": 63.48115539550781, + "grad_norm": 862.1499633789062, "learning_rate": 8.814147859311333e-06, - "loss": 11.7695, + "loss": 26.457, "step": 844 }, { "epoch": 3.017857142857143, - "grad_norm": 50.6243896484375, + "grad_norm": 187.60560607910156, "learning_rate": 8.810114435015055e-06, - "loss": 11.4609, + "loss": 11.2617, "step": 845 }, { "epoch": 3.0214285714285714, - "grad_norm": 57.48344802856445, + "grad_norm": 47.3878173828125, "learning_rate": 8.806075089152051e-06, - "loss": 10.3984, + "loss": 9.2695, "step": 846 }, { "epoch": 3.025, - "grad_norm": 73.98190307617188, + "grad_norm": 35.140419006347656, "learning_rate": 8.802029828000157e-06, - "loss": 14.6953, + "loss": 12.2773, "step": 847 }, { "epoch": 3.0285714285714285, - "grad_norm": 54.0810546875, + "grad_norm": 13.609644889831543, "learning_rate": 8.797978657846391e-06, - "loss": 8.5352, + "loss": 6.0879, "step": 848 }, { "epoch": 3.032142857142857, - "grad_norm": 63.839168548583984, + "grad_norm": 558.426025390625, "learning_rate": 8.793921584986967e-06, - "loss": 13.4453, + "loss": 14.1797, "step": 849 }, { "epoch": 3.0357142857142856, - "grad_norm": 53.01897430419922, + "grad_norm": 155.2855682373047, "learning_rate": 8.789858615727266e-06, - "loss": 12.2188, + "loss": 11.2148, "step": 850 }, { "epoch": 3.039285714285714, - "grad_norm": 57.063232421875, + "grad_norm": 20.08562469482422, "learning_rate": 8.785789756381833e-06, - "loss": 12.0508, + "loss": 12.2656, "step": 851 }, { "epoch": 3.0428571428571427, - "grad_norm": 58.180667877197266, + "grad_norm": 903.527587890625, "learning_rate": 8.781715013274369e-06, - "loss": 12.6133, + "loss": 26.4746, "step": 852 }, { "epoch": 3.0464285714285713, - "grad_norm": 69.91610717773438, + "grad_norm": 292.310791015625, "learning_rate": 8.777634392737719e-06, - "loss": 13.4531, + "loss": 15.2285, "step": 853 }, { "epoch": 3.05, - "grad_norm": 51.08747863769531, + "grad_norm": 3.9534356594085693, "learning_rate": 8.773547901113862e-06, - "loss": 9.7266, + "loss": 6.7773, "step": 854 }, { "epoch": 3.0535714285714284, - "grad_norm": 55.04694747924805, + "grad_norm": 503.546142578125, "learning_rate": 8.7694555447539e-06, - "loss": 10.6797, + "loss": 9.1777, "step": 855 }, { "epoch": 3.057142857142857, - "grad_norm": 69.81141662597656, + "grad_norm": 2.652756690979004, "learning_rate": 8.765357330018056e-06, - "loss": 14.1094, + "loss": 12.4883, "step": 856 }, { "epoch": 3.0607142857142855, - "grad_norm": 48.88008117675781, + "grad_norm": 28.016494750976562, "learning_rate": 8.761253263275651e-06, - "loss": 11.5977, + "loss": 11.0898, "step": 857 }, { "epoch": 3.064285714285714, - "grad_norm": 57.722740173339844, + "grad_norm": 79.17037963867188, "learning_rate": 8.757143350905102e-06, - "loss": 13.875, + "loss": 11.8984, "step": 858 }, { "epoch": 3.067857142857143, - "grad_norm": 79.07816314697266, + "grad_norm": 14.248964309692383, "learning_rate": 8.753027599293918e-06, - "loss": 15.2031, + "loss": 16.2109, "step": 859 }, { "epoch": 3.0714285714285716, - "grad_norm": 54.78011703491211, + "grad_norm": 73.52734375, "learning_rate": 8.748906014838672e-06, - "loss": 9.4102, + "loss": 8.7344, "step": 860 }, { "epoch": 3.075, - "grad_norm": 57.08549499511719, + "grad_norm": 242.61814880371094, "learning_rate": 8.744778603945013e-06, - "loss": 12.0312, + "loss": 11.4609, "step": 861 }, { "epoch": 3.0785714285714287, - "grad_norm": 58.506080627441406, + "grad_norm": 233.428955078125, "learning_rate": 8.740645373027635e-06, - "loss": 11.8828, + "loss": 12.2051, "step": 862 }, { "epoch": 3.0821428571428573, - "grad_norm": 48.351402282714844, + "grad_norm": 3.5437819957733154, "learning_rate": 8.736506328510288e-06, - "loss": 10.5508, + "loss": 9.4004, "step": 863 }, { "epoch": 3.085714285714286, - "grad_norm": 89.6833267211914, + "grad_norm": 145.75753784179688, "learning_rate": 8.732361476825752e-06, - "loss": 18.0352, + "loss": 19.5254, "step": 864 }, { "epoch": 3.0892857142857144, - "grad_norm": 76.75377655029297, + "grad_norm": 862.6920776367188, "learning_rate": 8.728210824415829e-06, - "loss": 11.6172, + "loss": 18.7188, "step": 865 }, { "epoch": 3.092857142857143, - "grad_norm": 76.32678985595703, + "grad_norm": 720.8951416015625, "learning_rate": 8.724054377731342e-06, - "loss": 11.5703, + "loss": 17.1465, "step": 866 }, { "epoch": 3.0964285714285715, - "grad_norm": 45.76746368408203, + "grad_norm": 995.8477172851562, "learning_rate": 8.719892143232117e-06, - "loss": 8.3398, + "loss": 14.3125, "step": 867 }, { "epoch": 3.1, - "grad_norm": 55.83579635620117, + "grad_norm": 23.956830978393555, "learning_rate": 8.715724127386971e-06, - "loss": 11.0703, + "loss": 10.4414, "step": 868 }, { "epoch": 3.1035714285714286, - "grad_norm": 57.32278823852539, + "grad_norm": 158.61915588378906, "learning_rate": 8.711550336673717e-06, - "loss": 8.8359, + "loss": 7.6211, "step": 869 }, { "epoch": 3.107142857142857, - "grad_norm": 60.34767532348633, + "grad_norm": 3.968193769454956, "learning_rate": 8.707370777579134e-06, - "loss": 15.9766, + "loss": 13.9316, "step": 870 }, { "epoch": 3.1107142857142858, - "grad_norm": 59.71306228637695, + "grad_norm": 112.79329681396484, "learning_rate": 8.703185456598969e-06, - "loss": 9.9219, + "loss": 7.2852, "step": 871 }, { "epoch": 3.1142857142857143, - "grad_norm": 67.74508666992188, + "grad_norm": 478.969970703125, "learning_rate": 8.698994380237921e-06, - "loss": 13.0586, + "loss": 15.7148, "step": 872 }, { "epoch": 3.117857142857143, - "grad_norm": 59.645111083984375, + "grad_norm": 88.99514770507812, "learning_rate": 8.69479755500964e-06, - "loss": 12.9922, + "loss": 12.8086, "step": 873 }, { "epoch": 3.1214285714285714, - "grad_norm": 74.82654571533203, + "grad_norm": 35.099342346191406, "learning_rate": 8.690594987436705e-06, - "loss": 12.3203, + "loss": 11.3672, "step": 874 }, { "epoch": 3.125, - "grad_norm": 44.0617561340332, + "grad_norm": 9.768417358398438, "learning_rate": 8.68638668405062e-06, - "loss": 9.9531, + "loss": 8.043, "step": 875 }, { "epoch": 3.1285714285714286, - "grad_norm": 69.92530822753906, + "grad_norm": 1250.6070556640625, "learning_rate": 8.68217265139181e-06, - "loss": 10.4023, + "loss": 17.3926, "step": 876 }, { "epoch": 3.132142857142857, - "grad_norm": 54.81882095336914, + "grad_norm": 91.5658187866211, "learning_rate": 8.677952896009598e-06, - "loss": 11.25, + "loss": 10.8848, "step": 877 }, { "epoch": 3.1357142857142857, - "grad_norm": 62.66746520996094, + "grad_norm": 82.53955078125, "learning_rate": 8.6737274244622e-06, - "loss": 13.3008, + "loss": 12.2188, "step": 878 }, { "epoch": 3.1392857142857142, - "grad_norm": 49.4044303894043, + "grad_norm": 18.108356475830078, "learning_rate": 8.669496243316719e-06, - "loss": 9.3281, + "loss": 7.541, "step": 879 }, { "epoch": 3.142857142857143, - "grad_norm": 54.23263168334961, + "grad_norm": 868.3704833984375, "learning_rate": 8.665259359149132e-06, - "loss": 9.7695, + "loss": 13.8789, "step": 880 }, { "epoch": 3.1464285714285714, - "grad_norm": 53.548397064208984, + "grad_norm": 881.9405517578125, "learning_rate": 8.66101677854428e-06, - "loss": 11.5781, + "loss": 12.8164, "step": 881 }, { "epoch": 3.15, - "grad_norm": 79.01927185058594, + "grad_norm": 44.632755279541016, "learning_rate": 8.656768508095853e-06, - "loss": 10.6953, + "loss": 8.4844, "step": 882 }, { "epoch": 3.1535714285714285, - "grad_norm": 64.95639038085938, + "grad_norm": 43.91999816894531, "learning_rate": 8.652514554406388e-06, - "loss": 12.5938, + "loss": 12.7578, "step": 883 }, { "epoch": 3.157142857142857, - "grad_norm": 66.95855712890625, + "grad_norm": 133.06935119628906, "learning_rate": 8.648254924087256e-06, - "loss": 10.0508, + "loss": 8.9004, "step": 884 }, { "epoch": 3.1607142857142856, - "grad_norm": 57.97194290161133, + "grad_norm": 586.483642578125, "learning_rate": 8.643989623758642e-06, - "loss": 12.7734, + "loss": 13.3438, "step": 885 }, { "epoch": 3.164285714285714, - "grad_norm": 71.04072570800781, + "grad_norm": 12.335368156433105, "learning_rate": 8.639718660049556e-06, - "loss": 11.8477, + "loss": 11.1289, "step": 886 }, { "epoch": 3.1678571428571427, - "grad_norm": 53.709163665771484, + "grad_norm": 912.59326171875, "learning_rate": 8.635442039597798e-06, - "loss": 9.7891, + "loss": 26.2773, "step": 887 }, { "epoch": 3.1714285714285713, - "grad_norm": 59.920902252197266, + "grad_norm": 212.30970764160156, "learning_rate": 8.631159769049965e-06, - "loss": 12.1914, + "loss": 13.75, "step": 888 }, { "epoch": 3.175, - "grad_norm": 57.052677154541016, + "grad_norm": 31.099889755249023, "learning_rate": 8.626871855061438e-06, - "loss": 11.1328, + "loss": 10.1641, "step": 889 }, { "epoch": 3.1785714285714284, - "grad_norm": 71.82422637939453, + "grad_norm": 5.724249362945557, "learning_rate": 8.622578304296364e-06, - "loss": 10.6641, + "loss": 7.4453, "step": 890 }, { "epoch": 3.182142857142857, - "grad_norm": 50.676265716552734, + "grad_norm": 331.943359375, "learning_rate": 8.618279123427652e-06, - "loss": 9.9922, + "loss": 19.0898, "step": 891 }, { "epoch": 3.185714285714286, - "grad_norm": 55.771263122558594, + "grad_norm": 151.6820068359375, "learning_rate": 8.613974319136959e-06, - "loss": 10.5508, + "loss": 9.8828, "step": 892 }, { "epoch": 3.189285714285714, - "grad_norm": 57.160682678222656, + "grad_norm": 145.13418579101562, "learning_rate": 8.609663898114686e-06, - "loss": 10.2188, + "loss": 9.2422, "step": 893 }, { "epoch": 3.192857142857143, - "grad_norm": 53.87666702270508, + "grad_norm": 26.56485939025879, "learning_rate": 8.605347867059963e-06, - "loss": 10.332, + "loss": 8.2051, "step": 894 }, { "epoch": 3.1964285714285716, - "grad_norm": 84.08065795898438, + "grad_norm": 172.54075622558594, "learning_rate": 8.601026232680634e-06, - "loss": 11.2109, + "loss": 12.6484, "step": 895 }, { "epoch": 3.2, - "grad_norm": 50.920570373535156, + "grad_norm": 28.70050621032715, "learning_rate": 8.596699001693257e-06, - "loss": 10.2188, + "loss": 19.7109, "step": 896 }, { "epoch": 3.2035714285714287, - "grad_norm": 53.21914291381836, + "grad_norm": 10.769079208374023, "learning_rate": 8.592366180823084e-06, - "loss": 11.4062, + "loss": 10.7383, "step": 897 }, { "epoch": 3.2071428571428573, - "grad_norm": 64.84420013427734, + "grad_norm": 27.06294822692871, "learning_rate": 8.58802777680406e-06, - "loss": 8.7734, + "loss": 6.6953, "step": 898 }, { "epoch": 3.210714285714286, - "grad_norm": 69.38402557373047, + "grad_norm": 114.60929107666016, "learning_rate": 8.5836837963788e-06, - "loss": 12.4453, + "loss": 9.5273, "step": 899 }, { "epoch": 3.2142857142857144, - "grad_norm": 54.65081024169922, + "grad_norm": 902.9102783203125, "learning_rate": 8.579334246298593e-06, - "loss": 13.7383, + "loss": 22.8262, "step": 900 }, { "epoch": 3.217857142857143, - "grad_norm": 75.01324462890625, + "grad_norm": 459.0619812011719, "learning_rate": 8.574979133323378e-06, - "loss": 13.0039, + "loss": 20.7812, "step": 901 }, { "epoch": 3.2214285714285715, - "grad_norm": 51.74589157104492, + "grad_norm": 173.65907287597656, "learning_rate": 8.570618464221741e-06, - "loss": 10.4062, + "loss": 9.875, "step": 902 }, { "epoch": 3.225, - "grad_norm": 59.98271942138672, + "grad_norm": 116.93803405761719, "learning_rate": 8.56625224577091e-06, - "loss": 9.7578, + "loss": 7.3145, "step": 903 }, { "epoch": 3.2285714285714286, - "grad_norm": 68.33460235595703, + "grad_norm": 13.215533256530762, "learning_rate": 8.561880484756726e-06, - "loss": 13.0, + "loss": 13.2148, "step": 904 }, { "epoch": 3.232142857142857, - "grad_norm": 65.04283905029297, + "grad_norm": 54.05717086791992, "learning_rate": 8.557503187973652e-06, - "loss": 13.3125, + "loss": 12.9453, "step": 905 }, { "epoch": 3.2357142857142858, - "grad_norm": 62.329402923583984, + "grad_norm": 38.71870422363281, "learning_rate": 8.553120362224754e-06, - "loss": 9.8711, + "loss": 8.3438, "step": 906 }, { "epoch": 3.2392857142857143, - "grad_norm": 61.37017059326172, + "grad_norm": 154.98536682128906, "learning_rate": 8.548732014321688e-06, - "loss": 13.1719, + "loss": 11.3945, "step": 907 }, { "epoch": 3.242857142857143, - "grad_norm": 63.00102996826172, + "grad_norm": 380.5295104980469, "learning_rate": 8.544338151084697e-06, - "loss": 11.3711, + "loss": 22.7266, "step": 908 }, { "epoch": 3.2464285714285714, - "grad_norm": 57.68163299560547, + "grad_norm": 623.4511108398438, "learning_rate": 8.539938779342589e-06, - "loss": 13.1641, + "loss": 14.6348, "step": 909 }, { "epoch": 3.25, - "grad_norm": 65.98783111572266, + "grad_norm": 6.113479137420654, "learning_rate": 8.535533905932739e-06, - "loss": 10.0352, + "loss": 10.0312, "step": 910 }, { "epoch": 3.2535714285714286, - "grad_norm": 64.00474548339844, + "grad_norm": 676.7217407226562, "learning_rate": 8.531123537701069e-06, - "loss": 12.4375, + "loss": 22.7656, "step": 911 }, { "epoch": 3.257142857142857, - "grad_norm": 64.85053253173828, + "grad_norm": 42.548553466796875, "learning_rate": 8.526707681502045e-06, - "loss": 13.5352, + "loss": 12.957, "step": 912 }, { "epoch": 3.2607142857142857, - "grad_norm": 76.9067611694336, + "grad_norm": 89.60271453857422, "learning_rate": 8.522286344198658e-06, - "loss": 13.3125, + "loss": 14.6719, "step": 913 }, { "epoch": 3.2642857142857142, - "grad_norm": 58.1256103515625, + "grad_norm": 3.2589569091796875, "learning_rate": 8.517859532662418e-06, - "loss": 10.8984, + "loss": 8.1836, "step": 914 }, { "epoch": 3.267857142857143, - "grad_norm": 87.43299865722656, + "grad_norm": 389.6732177734375, "learning_rate": 8.513427253773347e-06, - "loss": 13.8125, + "loss": 27.3281, "step": 915 }, { "epoch": 3.2714285714285714, - "grad_norm": 71.21187591552734, + "grad_norm": 356.5793151855469, "learning_rate": 8.508989514419959e-06, - "loss": 13.7461, + "loss": 24.0918, "step": 916 }, { "epoch": 3.275, - "grad_norm": 70.53509521484375, + "grad_norm": 51.3709831237793, "learning_rate": 8.504546321499255e-06, - "loss": 10.0117, + "loss": 7.7871, "step": 917 }, { "epoch": 3.2785714285714285, - "grad_norm": 58.6937255859375, + "grad_norm": 1140.8553466796875, "learning_rate": 8.500097681916717e-06, - "loss": 11.2617, + "loss": 18.7051, "step": 918 }, { "epoch": 3.282142857142857, - "grad_norm": 54.55370330810547, + "grad_norm": 15.594528198242188, "learning_rate": 8.495643602586287e-06, - "loss": 12.582, + "loss": 12.2148, "step": 919 }, { "epoch": 3.2857142857142856, - "grad_norm": 49.861576080322266, + "grad_norm": 575.7767333984375, "learning_rate": 8.491184090430365e-06, - "loss": 11.6484, + "loss": 18.0234, "step": 920 }, { "epoch": 3.289285714285714, - "grad_norm": 59.6472053527832, + "grad_norm": 14.777253150939941, "learning_rate": 8.48671915237979e-06, - "loss": 14.6484, + "loss": 14.1602, "step": 921 }, { "epoch": 3.2928571428571427, - "grad_norm": 60.83682632446289, + "grad_norm": 10.773479461669922, "learning_rate": 8.482248795373835e-06, - "loss": 13.5, + "loss": 13.3359, "step": 922 }, { "epoch": 3.2964285714285713, - "grad_norm": 57.03144836425781, + "grad_norm": 33.10726547241211, "learning_rate": 8.477773026360199e-06, - "loss": 10.5, + "loss": 9.5488, "step": 923 }, { "epoch": 3.3, - "grad_norm": 61.104976654052734, + "grad_norm": 217.79910278320312, "learning_rate": 8.473291852294986e-06, - "loss": 12.8086, + "loss": 11.6172, "step": 924 }, { "epoch": 3.3035714285714284, - "grad_norm": 49.17302322387695, + "grad_norm": 30.169370651245117, "learning_rate": 8.46880528014271e-06, - "loss": 10.6367, + "loss": 9.8105, "step": 925 }, { "epoch": 3.307142857142857, - "grad_norm": 59.66930389404297, + "grad_norm": 873.532958984375, "learning_rate": 8.46431331687626e-06, - "loss": 9.9062, + "loss": 17.1797, "step": 926 }, { "epoch": 3.310714285714286, - "grad_norm": 48.67955780029297, + "grad_norm": 188.3071746826172, "learning_rate": 8.459815969476917e-06, - "loss": 10.7266, + "loss": 22.707, "step": 927 }, { "epoch": 3.314285714285714, - "grad_norm": 64.18270874023438, + "grad_norm": 247.83331298828125, "learning_rate": 8.455313244934324e-06, - "loss": 11.6953, + "loss": 8.8457, "step": 928 }, { "epoch": 3.317857142857143, - "grad_norm": 59.15361785888672, + "grad_norm": 4.631284236907959, "learning_rate": 8.450805150246481e-06, - "loss": 11.1406, + "loss": 8.9336, "step": 929 }, { "epoch": 3.3214285714285716, - "grad_norm": 72.82840728759766, + "grad_norm": 809.8936767578125, "learning_rate": 8.446291692419735e-06, - "loss": 14.1094, + "loss": 21.9219, "step": 930 }, { "epoch": 3.325, - "grad_norm": 58.35105895996094, + "grad_norm": 114.71652221679688, "learning_rate": 8.44177287846877e-06, - "loss": 12.1641, + "loss": 11.543, "step": 931 }, { "epoch": 3.3285714285714287, - "grad_norm": 49.20502471923828, + "grad_norm": 527.4972534179688, "learning_rate": 8.437248715416591e-06, - "loss": 10.7891, + "loss": 11.8828, "step": 932 }, { "epoch": 3.3321428571428573, - "grad_norm": 50.299110412597656, + "grad_norm": 36.173316955566406, "learning_rate": 8.432719210294518e-06, - "loss": 9.0586, + "loss": 9.7266, "step": 933 }, { "epoch": 3.335714285714286, - "grad_norm": 53.89004135131836, + "grad_norm": 1156.1875, "learning_rate": 8.428184370142171e-06, - "loss": 10.0586, + "loss": 15.043, "step": 934 }, { "epoch": 3.3392857142857144, - "grad_norm": 49.437896728515625, + "grad_norm": 695.1661376953125, "learning_rate": 8.423644202007468e-06, - "loss": 8.9102, + "loss": 17.0566, "step": 935 }, { "epoch": 3.342857142857143, - "grad_norm": 54.54572296142578, + "grad_norm": 848.0780639648438, "learning_rate": 8.4190987129466e-06, - "loss": 10.5781, + "loss": 18.1602, "step": 936 }, { "epoch": 3.3464285714285715, - "grad_norm": 63.20669937133789, + "grad_norm": 556.0953369140625, "learning_rate": 8.414547910024035e-06, - "loss": 12.9062, + "loss": 12.3398, "step": 937 }, { "epoch": 3.35, - "grad_norm": 72.62944793701172, + "grad_norm": 14.218914031982422, "learning_rate": 8.409991800312493e-06, - "loss": 18.2109, + "loss": 18.7031, "step": 938 }, { "epoch": 3.3535714285714286, - "grad_norm": 66.18962097167969, + "grad_norm": 8.672175407409668, "learning_rate": 8.405430390892945e-06, - "loss": 13.3438, + "loss": 13.3359, "step": 939 }, { "epoch": 3.357142857142857, - "grad_norm": 48.773704528808594, + "grad_norm": 60.74304962158203, "learning_rate": 8.400863688854598e-06, - "loss": 10.1484, + "loss": 11.1523, "step": 940 }, { "epoch": 3.3607142857142858, - "grad_norm": 61.589839935302734, + "grad_norm": 10.700437545776367, "learning_rate": 8.396291701294884e-06, - "loss": 9.8008, + "loss": 19.6484, "step": 941 }, { "epoch": 3.3642857142857143, - "grad_norm": 54.251365661621094, + "grad_norm": 7.014707565307617, "learning_rate": 8.391714435319452e-06, - "loss": 11.5547, + "loss": 10.6016, "step": 942 }, { "epoch": 3.367857142857143, - "grad_norm": 53.24906921386719, + "grad_norm": 83.1421890258789, "learning_rate": 8.387131898042152e-06, - "loss": 10.5977, + "loss": 11.5078, "step": 943 }, { "epoch": 3.3714285714285714, - "grad_norm": 49.24538803100586, + "grad_norm": 3.7153990268707275, "learning_rate": 8.382544096585028e-06, - "loss": 10.3008, + "loss": 8.8164, "step": 944 }, { "epoch": 3.375, - "grad_norm": 71.83541107177734, + "grad_norm": 20.438093185424805, "learning_rate": 8.377951038078303e-06, - "loss": 13.7031, + "loss": 13.2441, "step": 945 }, { "epoch": 3.3785714285714286, - "grad_norm": 63.056732177734375, + "grad_norm": 10.489644050598145, "learning_rate": 8.373352729660373e-06, - "loss": 9.5, + "loss": 7.8145, "step": 946 }, { "epoch": 3.382142857142857, - "grad_norm": 47.53358459472656, + "grad_norm": 482.2951354980469, "learning_rate": 8.368749178477793e-06, - "loss": 8.5625, + "loss": 6.6426, "step": 947 }, { "epoch": 3.3857142857142857, - "grad_norm": 49.33970642089844, + "grad_norm": 45.90773391723633, "learning_rate": 8.364140391685265e-06, - "loss": 9.6914, + "loss": 8.6484, "step": 948 }, { "epoch": 3.3892857142857142, - "grad_norm": 57.252235412597656, + "grad_norm": 18.86627769470215, "learning_rate": 8.359526376445631e-06, - "loss": 10.3984, + "loss": 10.1758, "step": 949 }, { "epoch": 3.392857142857143, - "grad_norm": 61.70408630371094, + "grad_norm": 236.72622680664062, "learning_rate": 8.35490713992985e-06, - "loss": 10.7344, + "loss": 9.5117, "step": 950 }, { "epoch": 3.3964285714285714, - "grad_norm": 50.67820739746094, + "grad_norm": 393.3250427246094, "learning_rate": 8.350282689317011e-06, - "loss": 10.4844, + "loss": 10.8047, "step": 951 }, { "epoch": 3.4, - "grad_norm": 69.9441146850586, + "grad_norm": 18.175302505493164, "learning_rate": 8.345653031794292e-06, - "loss": 9.4297, + "loss": 7.0, "step": 952 }, { "epoch": 3.4035714285714285, - "grad_norm": 54.81508255004883, + "grad_norm": 9.046562194824219, "learning_rate": 8.34101817455697e-06, - "loss": 11.6562, + "loss": 9.8945, "step": 953 }, { "epoch": 3.407142857142857, - "grad_norm": 56.592376708984375, + "grad_norm": 4.956536293029785, "learning_rate": 8.336378124808404e-06, - "loss": 11.6016, + "loss": 11.0156, "step": 954 }, { "epoch": 3.4107142857142856, - "grad_norm": 51.458560943603516, + "grad_norm": 921.3717651367188, "learning_rate": 8.331732889760021e-06, - "loss": 9.2656, + "loss": 11.3584, "step": 955 }, { "epoch": 3.414285714285714, - "grad_norm": 58.0858039855957, + "grad_norm": 169.79583740234375, "learning_rate": 8.327082476631307e-06, - "loss": 13.4805, + "loss": 12.4844, "step": 956 }, { "epoch": 3.4178571428571427, - "grad_norm": 59.04414367675781, + "grad_norm": 476.6055908203125, "learning_rate": 8.322426892649796e-06, - "loss": 10.7852, + "loss": 11.7363, "step": 957 }, { "epoch": 3.4214285714285713, - "grad_norm": 72.73804473876953, + "grad_norm": 475.1752014160156, "learning_rate": 8.317766145051057e-06, - "loss": 14.9414, + "loss": 17.3672, "step": 958 }, { "epoch": 3.425, - "grad_norm": 60.15581130981445, + "grad_norm": 749.2841796875, "learning_rate": 8.313100241078689e-06, - "loss": 13.5664, + "loss": 20.3984, "step": 959 }, { "epoch": 3.4285714285714284, - "grad_norm": 54.238922119140625, + "grad_norm": 482.0047302246094, "learning_rate": 8.308429187984298e-06, - "loss": 11.6172, + "loss": 14.1641, "step": 960 }, { "epoch": 3.432142857142857, - "grad_norm": 50.23872756958008, + "grad_norm": 889.7870483398438, "learning_rate": 8.303752993027499e-06, - "loss": 11.6562, + "loss": 18.8672, "step": 961 }, { "epoch": 3.435714285714286, - "grad_norm": 62.61098098754883, + "grad_norm": 7.586583137512207, "learning_rate": 8.299071663475892e-06, - "loss": 12.0977, + "loss": 12.1953, "step": 962 }, { "epoch": 3.439285714285714, - "grad_norm": 52.95948791503906, + "grad_norm": 3.7064661979675293, "learning_rate": 8.294385206605063e-06, - "loss": 11.6406, + "loss": 10.9844, "step": 963 }, { "epoch": 3.442857142857143, - "grad_norm": 53.552223205566406, + "grad_norm": 987.1384887695312, "learning_rate": 8.289693629698564e-06, - "loss": 11.1602, + "loss": 14.2051, "step": 964 }, { "epoch": 3.4464285714285716, - "grad_norm": 49.03020095825195, + "grad_norm": 185.29505920410156, "learning_rate": 8.284996940047904e-06, - "loss": 10.2578, + "loss": 22.2305, "step": 965 }, { "epoch": 3.45, - "grad_norm": 49.62080001831055, + "grad_norm": 83.57319641113281, "learning_rate": 8.280295144952537e-06, - "loss": 11.625, + "loss": 9.9766, "step": 966 }, { "epoch": 3.4535714285714287, - "grad_norm": 51.1426887512207, + "grad_norm": 191.1047821044922, "learning_rate": 8.275588251719857e-06, - "loss": 11.6797, + "loss": 22.5312, "step": 967 }, { "epoch": 3.4571428571428573, - "grad_norm": 50.2422981262207, + "grad_norm": 8.655171394348145, "learning_rate": 8.270876267665173e-06, - "loss": 10.2656, + "loss": 8.0039, "step": 968 }, { "epoch": 3.460714285714286, - "grad_norm": 62.235801696777344, + "grad_norm": 742.8677368164062, "learning_rate": 8.266159200111713e-06, - "loss": 10.3242, + "loss": 14.7539, "step": 969 }, { "epoch": 3.4642857142857144, - "grad_norm": 77.22599029541016, + "grad_norm": 551.7919921875, "learning_rate": 8.261437056390607e-06, - "loss": 9.4141, + "loss": 8.873, "step": 970 }, { "epoch": 3.467857142857143, - "grad_norm": 56.37822341918945, + "grad_norm": 39.37831115722656, "learning_rate": 8.256709843840864e-06, - "loss": 10.668, + "loss": 8.793, "step": 971 }, { "epoch": 3.4714285714285715, - "grad_norm": 58.33448791503906, + "grad_norm": 52.861724853515625, "learning_rate": 8.251977569809383e-06, - "loss": 12.9336, + "loss": 11.3242, "step": 972 }, { "epoch": 3.475, - "grad_norm": 54.02076721191406, + "grad_norm": 7.87725305557251, "learning_rate": 8.247240241650918e-06, - "loss": 11.4609, + "loss": 10.2383, "step": 973 }, { "epoch": 3.4785714285714286, - "grad_norm": 45.95112991333008, + "grad_norm": 124.37787628173828, "learning_rate": 8.242497866728089e-06, - "loss": 11.1523, + "loss": 9.832, "step": 974 }, { "epoch": 3.482142857142857, - "grad_norm": 58.23402404785156, + "grad_norm": 834.4370727539062, "learning_rate": 8.237750452411353e-06, - "loss": 11.8242, + "loss": 19.168, "step": 975 }, { "epoch": 3.4857142857142858, - "grad_norm": 62.99897766113281, + "grad_norm": 4.823376178741455, "learning_rate": 8.232998006078998e-06, - "loss": 12.2344, + "loss": 11.7578, "step": 976 }, { "epoch": 3.4892857142857143, - "grad_norm": 62.29439163208008, + "grad_norm": 67.53034210205078, "learning_rate": 8.228240535117138e-06, - "loss": 10.8477, + "loss": 9.1602, "step": 977 }, { "epoch": 3.492857142857143, - "grad_norm": 78.26791381835938, + "grad_norm": 22.93580436706543, "learning_rate": 8.223478046919693e-06, - "loss": 13.2969, + "loss": 15.793, "step": 978 }, { "epoch": 3.4964285714285714, - "grad_norm": 48.845821380615234, + "grad_norm": 1092.704833984375, "learning_rate": 8.218710548888377e-06, - "loss": 9.9609, + "loss": 14.8633, "step": 979 }, { "epoch": 3.5, - "grad_norm": 55.41098403930664, + "grad_norm": 113.19414520263672, "learning_rate": 8.213938048432697e-06, - "loss": 10.6484, + "loss": 9.748, "step": 980 }, { "epoch": 3.5035714285714286, - "grad_norm": 56.77787780761719, + "grad_norm": 372.60546875, "learning_rate": 8.209160552969932e-06, - "loss": 12.5469, + "loss": 21.918, "step": 981 }, { "epoch": 3.507142857142857, - "grad_norm": 55.87371826171875, + "grad_norm": 31.02596664428711, "learning_rate": 8.204378069925121e-06, - "loss": 11.2969, + "loss": 10.4961, "step": 982 }, { "epoch": 3.5107142857142857, - "grad_norm": 65.93899536132812, + "grad_norm": 961.9480590820312, "learning_rate": 8.19959060673106e-06, - "loss": 9.8828, + "loss": 13.1133, "step": 983 }, { "epoch": 3.5142857142857142, - "grad_norm": 51.37303161621094, + "grad_norm": 273.9033203125, "learning_rate": 8.19479817082828e-06, - "loss": 11.6875, + "loss": 10.8008, "step": 984 }, { "epoch": 3.517857142857143, - "grad_norm": 61.17251968383789, + "grad_norm": 627.7311401367188, "learning_rate": 8.190000769665044e-06, - "loss": 13.5547, + "loss": 23.2031, "step": 985 }, { "epoch": 3.5214285714285714, - "grad_norm": 66.72117614746094, + "grad_norm": 340.4833679199219, "learning_rate": 8.18519841069733e-06, - "loss": 11.625, + "loss": 29.7363, "step": 986 }, { "epoch": 3.525, - "grad_norm": 59.966548919677734, + "grad_norm": 880.2069091796875, "learning_rate": 8.18039110138882e-06, - "loss": 10.9219, + "loss": 28.1484, "step": 987 }, { "epoch": 3.5285714285714285, - "grad_norm": 70.21217346191406, + "grad_norm": 62.54664611816406, "learning_rate": 8.175578849210894e-06, - "loss": 13.5742, + "loss": 24.4297, "step": 988 }, { "epoch": 3.532142857142857, - "grad_norm": 52.24333572387695, + "grad_norm": 7.543524742126465, "learning_rate": 8.17076166164261e-06, - "loss": 10.4727, + "loss": 8.9453, "step": 989 }, { "epoch": 3.5357142857142856, - "grad_norm": 68.34994506835938, + "grad_norm": 569.4793701171875, "learning_rate": 8.165939546170701e-06, - "loss": 12.4688, + "loss": 13.3594, "step": 990 }, { "epoch": 3.539285714285714, - "grad_norm": 62.22126007080078, + "grad_norm": 10.071966171264648, "learning_rate": 8.16111251028955e-06, - "loss": 11.8672, + "loss": 9.9766, "step": 991 }, { "epoch": 3.5428571428571427, - "grad_norm": 48.92422866821289, + "grad_norm": 809.0988159179688, "learning_rate": 8.156280561501196e-06, - "loss": 9.4336, + "loss": 13.5957, "step": 992 }, { "epoch": 3.5464285714285713, - "grad_norm": 63.854679107666016, + "grad_norm": 638.0767822265625, "learning_rate": 8.15144370731531e-06, - "loss": 14.8008, + "loss": 26.3477, "step": 993 }, { "epoch": 3.55, - "grad_norm": 48.53741455078125, + "grad_norm": 883.3140869140625, "learning_rate": 8.146601955249187e-06, - "loss": 11.2422, + "loss": 14.3906, "step": 994 }, { "epoch": 3.553571428571429, - "grad_norm": 45.18637466430664, + "grad_norm": 84.4407730102539, "learning_rate": 8.141755312827737e-06, - "loss": 9.1602, + "loss": 8.8594, "step": 995 }, { "epoch": 3.557142857142857, - "grad_norm": 61.46656799316406, + "grad_norm": 659.7808227539062, "learning_rate": 8.136903787583464e-06, - "loss": 9.1055, + "loss": 15.6016, "step": 996 }, { "epoch": 3.560714285714286, - "grad_norm": 66.11101531982422, + "grad_norm": 104.7785415649414, "learning_rate": 8.132047387056465e-06, - "loss": 12.5273, + "loss": 12.2148, "step": 997 }, { "epoch": 3.564285714285714, - "grad_norm": 58.45448303222656, + "grad_norm": 4.2260589599609375, "learning_rate": 8.127186118794415e-06, - "loss": 10.6406, + "loss": 10.1719, "step": 998 }, { "epoch": 3.567857142857143, - "grad_norm": 72.37398529052734, + "grad_norm": 91.42001342773438, "learning_rate": 8.122319990352552e-06, - "loss": 16.1953, + "loss": 17.4453, "step": 999 }, { "epoch": 3.571428571428571, - "grad_norm": 57.454341888427734, + "grad_norm": 2.9067442417144775, "learning_rate": 8.117449009293668e-06, - "loss": 10.3242, + "loss": 8.7734, "step": 1000 }, { "epoch": 3.575, - "grad_norm": 56.87932205200195, + "grad_norm": 1386.6539306640625, "learning_rate": 8.112573183188099e-06, - "loss": 11.7578, + "loss": 32.3594, "step": 1001 }, { "epoch": 3.5785714285714287, - "grad_norm": 54.90412521362305, + "grad_norm": 499.6116027832031, "learning_rate": 8.107692519613705e-06, - "loss": 12.7305, + "loss": 11.4199, "step": 1002 }, { "epoch": 3.5821428571428573, - "grad_norm": 57.4184684753418, + "grad_norm": 44.448883056640625, "learning_rate": 8.102807026155875e-06, - "loss": 9.7344, + "loss": 7.5156, "step": 1003 }, { "epoch": 3.585714285714286, - "grad_norm": 59.88007736206055, + "grad_norm": 69.64434814453125, "learning_rate": 8.097916710407491e-06, - "loss": 8.7188, + "loss": 7.1211, "step": 1004 }, { "epoch": 3.5892857142857144, - "grad_norm": 44.92019271850586, + "grad_norm": 459.9666442871094, "learning_rate": 8.093021579968942e-06, - "loss": 11.3828, + "loss": 10.8027, "step": 1005 }, { "epoch": 3.592857142857143, - "grad_norm": 56.39665985107422, + "grad_norm": 100.0321273803711, "learning_rate": 8.08812164244809e-06, - "loss": 10.7383, + "loss": 9.1914, "step": 1006 }, { "epoch": 3.5964285714285715, - "grad_norm": 47.203243255615234, + "grad_norm": 117.02057647705078, "learning_rate": 8.083216905460275e-06, - "loss": 9.3438, + "loss": 8.4883, "step": 1007 }, { "epoch": 3.6, - "grad_norm": 50.5592041015625, + "grad_norm": 99.24940490722656, "learning_rate": 8.078307376628292e-06, - "loss": 11.0664, + "loss": 12.3008, "step": 1008 }, { "epoch": 3.6035714285714286, - "grad_norm": 58.80060958862305, + "grad_norm": 10.374593734741211, "learning_rate": 8.073393063582386e-06, - "loss": 11.8945, + "loss": 11.8516, "step": 1009 }, { "epoch": 3.607142857142857, - "grad_norm": 63.57283401489258, + "grad_norm": 86.23297119140625, "learning_rate": 8.068473973960238e-06, - "loss": 11.668, + "loss": 11.2539, "step": 1010 }, { "epoch": 3.6107142857142858, - "grad_norm": 49.10694885253906, + "grad_norm": 60.51142501831055, "learning_rate": 8.063550115406948e-06, - "loss": 9.668, + "loss": 8.7148, "step": 1011 }, { "epoch": 3.6142857142857143, - "grad_norm": 61.258705139160156, + "grad_norm": 606.1629638671875, "learning_rate": 8.058621495575032e-06, - "loss": 10.4492, + "loss": 20.3281, "step": 1012 }, { "epoch": 3.617857142857143, - "grad_norm": 59.529014587402344, + "grad_norm": 18.165761947631836, "learning_rate": 8.053688122124405e-06, - "loss": 12.8125, + "loss": 13.4199, "step": 1013 }, { "epoch": 3.6214285714285714, - "grad_norm": 61.07832717895508, + "grad_norm": 199.468017578125, "learning_rate": 8.04875000272237e-06, - "loss": 13.0938, + "loss": 14.3594, "step": 1014 }, { "epoch": 3.625, - "grad_norm": 44.99586486816406, + "grad_norm": 546.4374389648438, "learning_rate": 8.043807145043604e-06, - "loss": 10.6641, + "loss": 12.3594, "step": 1015 }, { "epoch": 3.6285714285714286, - "grad_norm": 57.972225189208984, + "grad_norm": 55.288818359375, "learning_rate": 8.038859556770152e-06, - "loss": 11.3281, + "loss": 11.0898, "step": 1016 }, { "epoch": 3.632142857142857, - "grad_norm": 59.8886833190918, + "grad_norm": 323.950439453125, "learning_rate": 8.033907245591403e-06, - "loss": 15.6016, + "loss": 16.6016, "step": 1017 }, { "epoch": 3.6357142857142857, - "grad_norm": 49.968528747558594, + "grad_norm": 471.6402587890625, "learning_rate": 8.0289502192041e-06, - "loss": 10.3359, + "loss": 10.7598, "step": 1018 }, { "epoch": 3.6392857142857142, - "grad_norm": 49.99464416503906, + "grad_norm": 544.1854248046875, "learning_rate": 8.023988485312301e-06, - "loss": 9.1797, + "loss": 17.9102, "step": 1019 }, { "epoch": 3.642857142857143, - "grad_norm": 68.33613586425781, + "grad_norm": 235.39627075195312, "learning_rate": 8.019022051627387e-06, - "loss": 9.3281, + "loss": 7.4824, "step": 1020 }, { "epoch": 3.6464285714285714, - "grad_norm": 51.205970764160156, + "grad_norm": 214.94268798828125, "learning_rate": 8.014050925868042e-06, - "loss": 13.1328, + "loss": 26.252, "step": 1021 }, { "epoch": 3.65, - "grad_norm": 54.03866958618164, + "grad_norm": 36.6672248840332, "learning_rate": 8.009075115760243e-06, - "loss": 11.9844, + "loss": 11.4805, "step": 1022 }, { "epoch": 3.6535714285714285, - "grad_norm": 52.82984161376953, + "grad_norm": 11.905939102172852, "learning_rate": 8.004094629037242e-06, - "loss": 11.2656, + "loss": 12.125, "step": 1023 }, { "epoch": 3.657142857142857, - "grad_norm": 59.98958206176758, + "grad_norm": 8.61720085144043, "learning_rate": 7.99910947343957e-06, - "loss": 10.3125, + "loss": 10.6895, "step": 1024 }, { "epoch": 3.6607142857142856, - "grad_norm": 72.0011215209961, + "grad_norm": 15.917816162109375, "learning_rate": 7.994119656715002e-06, - "loss": 11.1641, + "loss": 8.7969, "step": 1025 }, { "epoch": 3.664285714285714, - "grad_norm": 50.26795196533203, + "grad_norm": 4.934392929077148, "learning_rate": 7.989125186618566e-06, - "loss": 11.5664, + "loss": 10.5938, "step": 1026 }, { "epoch": 3.6678571428571427, - "grad_norm": 81.0321273803711, + "grad_norm": 804.5480346679688, "learning_rate": 7.984126070912519e-06, - "loss": 16.0547, + "loss": 24.9844, "step": 1027 }, { "epoch": 3.6714285714285713, - "grad_norm": 50.2989387512207, + "grad_norm": 409.5172424316406, "learning_rate": 7.979122317366337e-06, - "loss": 10.9297, + "loss": 32.6758, "step": 1028 }, { "epoch": 3.675, - "grad_norm": 65.38188171386719, + "grad_norm": 51.35375213623047, "learning_rate": 7.974113933756708e-06, - "loss": 9.1406, + "loss": 6.9062, "step": 1029 }, { "epoch": 3.678571428571429, - "grad_norm": 53.547447204589844, + "grad_norm": 328.65753173828125, "learning_rate": 7.969100927867508e-06, - "loss": 10.7227, + "loss": 11.0781, "step": 1030 }, { "epoch": 3.682142857142857, - "grad_norm": 48.515594482421875, + "grad_norm": 860.552978515625, "learning_rate": 7.964083307489806e-06, - "loss": 10.8125, + "loss": 13.4844, "step": 1031 }, { "epoch": 3.685714285714286, - "grad_norm": 51.797630310058594, + "grad_norm": 912.0567016601562, "learning_rate": 7.95906108042184e-06, - "loss": 9.9375, + "loss": 25.0586, "step": 1032 }, { "epoch": 3.689285714285714, - "grad_norm": 57.82164764404297, + "grad_norm": 22.97993278503418, "learning_rate": 7.954034254469e-06, - "loss": 13.543, + "loss": 12.5859, "step": 1033 }, { "epoch": 3.692857142857143, - "grad_norm": 50.72506332397461, + "grad_norm": 18.007932662963867, "learning_rate": 7.949002837443836e-06, - "loss": 9.7422, + "loss": 9.1465, "step": 1034 }, { "epoch": 3.696428571428571, - "grad_norm": 64.70691680908203, + "grad_norm": 754.9473876953125, "learning_rate": 7.943966837166024e-06, - "loss": 14.1875, + "loss": 16.707, "step": 1035 }, { "epoch": 3.7, - "grad_norm": 49.269203186035156, + "grad_norm": 21.95349884033203, "learning_rate": 7.938926261462366e-06, - "loss": 9.9844, + "loss": 7.668, "step": 1036 }, { "epoch": 3.7035714285714287, - "grad_norm": 58.002342224121094, + "grad_norm": 141.89166259765625, "learning_rate": 7.933881118166776e-06, - "loss": 13.1016, + "loss": 11.5938, "step": 1037 }, { "epoch": 3.7071428571428573, - "grad_norm": 54.95801544189453, + "grad_norm": 87.89644622802734, "learning_rate": 7.928831415120265e-06, - "loss": 9.543, + "loss": 8.6504, "step": 1038 }, { "epoch": 3.710714285714286, - "grad_norm": 45.00877380371094, + "grad_norm": 5.912858963012695, "learning_rate": 7.923777160170933e-06, - "loss": 10.1992, + "loss": 9.5117, "step": 1039 }, { "epoch": 3.7142857142857144, - "grad_norm": 66.58634948730469, + "grad_norm": 740.97607421875, "learning_rate": 7.918718361173951e-06, - "loss": 10.7344, + "loss": 17.7676, "step": 1040 }, { "epoch": 3.717857142857143, - "grad_norm": 65.20586395263672, + "grad_norm": 28.342830657958984, "learning_rate": 7.913655025991555e-06, - "loss": 13.5586, + "loss": 12.8848, "step": 1041 }, { "epoch": 3.7214285714285715, - "grad_norm": 56.41404342651367, + "grad_norm": 309.2786865234375, "learning_rate": 7.90858716249303e-06, - "loss": 11.9219, + "loss": 12.1328, "step": 1042 }, { "epoch": 3.725, - "grad_norm": 59.09754943847656, + "grad_norm": 995.6663818359375, "learning_rate": 7.903514778554699e-06, - "loss": 12.7266, + "loss": 31.0, "step": 1043 }, { "epoch": 3.7285714285714286, - "grad_norm": 62.203033447265625, + "grad_norm": 609.7417602539062, "learning_rate": 7.898437882059913e-06, - "loss": 11.9805, + "loss": 21.2344, "step": 1044 }, { "epoch": 3.732142857142857, - "grad_norm": 58.468116760253906, + "grad_norm": 33.553321838378906, "learning_rate": 7.89335648089903e-06, - "loss": 11.6328, + "loss": 13.1172, "step": 1045 }, { "epoch": 3.7357142857142858, - "grad_norm": 52.00740051269531, + "grad_norm": 11.374946594238281, "learning_rate": 7.888270582969415e-06, - "loss": 9.8867, + "loss": 7.6211, "step": 1046 }, { "epoch": 3.7392857142857143, - "grad_norm": 55.239463806152344, + "grad_norm": 10.470023155212402, "learning_rate": 7.883180196175419e-06, - "loss": 11.2383, + "loss": 12.2539, "step": 1047 }, { "epoch": 3.742857142857143, - "grad_norm": 46.84687042236328, + "grad_norm": 81.49393463134766, "learning_rate": 7.87808532842837e-06, - "loss": 8.418, + "loss": 6.125, "step": 1048 }, { "epoch": 3.7464285714285714, - "grad_norm": 60.47352981567383, + "grad_norm": 280.1875915527344, "learning_rate": 7.872985987646557e-06, - "loss": 8.9453, + "loss": 19.5371, "step": 1049 }, { "epoch": 3.75, - "grad_norm": 72.8636703491211, + "grad_norm": 4.67822265625, "learning_rate": 7.86788218175523e-06, - "loss": 9.1836, + "loss": 7.5723, "step": 1050 }, { "epoch": 3.7535714285714286, - "grad_norm": 46.293487548828125, + "grad_norm": 4.025252342224121, "learning_rate": 7.862773918686571e-06, - "loss": 10.5781, + "loss": 10.8711, "step": 1051 }, { "epoch": 3.757142857142857, - "grad_norm": 51.664833068847656, + "grad_norm": 10.94587230682373, "learning_rate": 7.857661206379687e-06, - "loss": 11.2656, + "loss": 9.0664, "step": 1052 }, { "epoch": 3.7607142857142857, - "grad_norm": 52.2459831237793, + "grad_norm": 56.944549560546875, "learning_rate": 7.852544052780609e-06, - "loss": 8.8828, + "loss": 8.1172, "step": 1053 }, { "epoch": 3.7642857142857142, - "grad_norm": 73.31340026855469, + "grad_norm": 630.40869140625, "learning_rate": 7.84742246584226e-06, - "loss": 12.2188, + "loss": 13.8398, "step": 1054 }, { "epoch": 3.767857142857143, - "grad_norm": 60.54641342163086, + "grad_norm": 168.45777893066406, "learning_rate": 7.842296453524462e-06, - "loss": 12.4883, + "loss": 12.9492, "step": 1055 }, { "epoch": 3.7714285714285714, - "grad_norm": 53.18296813964844, + "grad_norm": 17.002954483032227, "learning_rate": 7.83716602379391e-06, - "loss": 11.1289, + "loss": 10.5488, "step": 1056 }, { "epoch": 3.775, - "grad_norm": 56.81544494628906, + "grad_norm": 979.5777587890625, "learning_rate": 7.832031184624165e-06, - "loss": 10.7734, + "loss": 15.4375, "step": 1057 }, { "epoch": 3.7785714285714285, - "grad_norm": 58.932708740234375, + "grad_norm": 20.45827865600586, "learning_rate": 7.826891943995641e-06, - "loss": 9.7891, + "loss": 6.459, "step": 1058 }, { "epoch": 3.782142857142857, - "grad_norm": 56.69194030761719, + "grad_norm": 35.96401596069336, "learning_rate": 7.821748309895596e-06, - "loss": 13.2188, + "loss": 13.9531, "step": 1059 }, { "epoch": 3.7857142857142856, - "grad_norm": 55.169700622558594, + "grad_norm": 497.7569274902344, "learning_rate": 7.81660029031811e-06, - "loss": 10.0508, + "loss": 11.2012, "step": 1060 }, { "epoch": 3.789285714285714, - "grad_norm": 47.481502532958984, + "grad_norm": 3.9989068508148193, "learning_rate": 7.811447893264087e-06, - "loss": 11.2031, + "loss": 10.293, "step": 1061 }, { "epoch": 3.7928571428571427, - "grad_norm": 53.623287200927734, + "grad_norm": 1048.1282958984375, "learning_rate": 7.806291126741222e-06, - "loss": 10.7266, + "loss": 24.0742, "step": 1062 }, { "epoch": 3.7964285714285713, - "grad_norm": 49.86418151855469, + "grad_norm": 574.0073852539062, "learning_rate": 7.801129998764014e-06, - "loss": 11.1055, + "loss": 12.3281, "step": 1063 }, { "epoch": 3.8, - "grad_norm": 48.68032455444336, + "grad_norm": 9.482410430908203, "learning_rate": 7.795964517353734e-06, - "loss": 11.5586, + "loss": 10.3164, "step": 1064 }, { "epoch": 3.803571428571429, - "grad_norm": 63.076438903808594, + "grad_norm": 141.97933959960938, "learning_rate": 7.790794690538422e-06, - "loss": 12.4102, + "loss": 12.5332, "step": 1065 }, { "epoch": 3.807142857142857, - "grad_norm": 67.44180297851562, + "grad_norm": 173.35134887695312, "learning_rate": 7.785620526352862e-06, - "loss": 13.1094, + "loss": 12.8848, "step": 1066 }, { "epoch": 3.810714285714286, - "grad_norm": 66.94548034667969, + "grad_norm": 308.3785705566406, "learning_rate": 7.780442032838594e-06, - "loss": 10.1758, + "loss": 8.6074, "step": 1067 }, { "epoch": 3.814285714285714, - "grad_norm": 53.19947052001953, + "grad_norm": 120.33564758300781, "learning_rate": 7.775259218043876e-06, - "loss": 10.4961, + "loss": 9.6445, "step": 1068 }, { "epoch": 3.817857142857143, - "grad_norm": 55.1075439453125, + "grad_norm": 429.77447509765625, "learning_rate": 7.770072090023684e-06, - "loss": 9.6836, + "loss": 12.7832, "step": 1069 }, { "epoch": 3.821428571428571, - "grad_norm": 49.61224365234375, + "grad_norm": 13.572346687316895, "learning_rate": 7.764880656839698e-06, - "loss": 11.8438, + "loss": 10.3047, "step": 1070 }, { "epoch": 3.825, - "grad_norm": 57.61943435668945, + "grad_norm": 27.865951538085938, "learning_rate": 7.759684926560292e-06, - "loss": 12.4297, + "loss": 11.4082, "step": 1071 }, { "epoch": 3.8285714285714287, - "grad_norm": 69.04647827148438, + "grad_norm": 544.4581909179688, "learning_rate": 7.754484907260513e-06, - "loss": 16.543, + "loss": 19.2109, "step": 1072 }, { "epoch": 3.8321428571428573, - "grad_norm": 51.13750457763672, + "grad_norm": 731.4468994140625, "learning_rate": 7.74928060702208e-06, - "loss": 10.6484, + "loss": 13.9062, "step": 1073 }, { "epoch": 3.835714285714286, - "grad_norm": 55.081966400146484, + "grad_norm": 67.02845001220703, "learning_rate": 7.744072033933356e-06, - "loss": 10.3281, + "loss": 10.2969, "step": 1074 }, { "epoch": 3.8392857142857144, - "grad_norm": 53.780731201171875, + "grad_norm": 966.3527221679688, "learning_rate": 7.738859196089358e-06, - "loss": 10.8359, + "loss": 15.5625, "step": 1075 }, { "epoch": 3.842857142857143, - "grad_norm": 51.56159210205078, + "grad_norm": 512.1155395507812, "learning_rate": 7.733642101591719e-06, - "loss": 8.9102, + "loss": 18.4141, "step": 1076 }, { "epoch": 3.8464285714285715, - "grad_norm": 66.81848907470703, + "grad_norm": 377.2500305175781, "learning_rate": 7.728420758548692e-06, - "loss": 11.0312, + "loss": 12.3125, "step": 1077 }, { "epoch": 3.85, - "grad_norm": 67.34381866455078, + "grad_norm": 65.57722473144531, "learning_rate": 7.723195175075136e-06, - "loss": 10.2344, + "loss": 9.4648, "step": 1078 }, { "epoch": 3.8535714285714286, - "grad_norm": 54.97749328613281, + "grad_norm": 27.969879150390625, "learning_rate": 7.717965359292496e-06, - "loss": 10.0586, + "loss": 11.3867, "step": 1079 }, { "epoch": 3.857142857142857, - "grad_norm": 50.68669128417969, + "grad_norm": 469.3667297363281, "learning_rate": 7.712731319328798e-06, - "loss": 10.5117, + "loss": 12.4453, "step": 1080 }, { "epoch": 3.8607142857142858, - "grad_norm": 55.58878707885742, + "grad_norm": 148.2136688232422, "learning_rate": 7.70749306331863e-06, - "loss": 12.7383, + "loss": 14.1426, "step": 1081 }, { "epoch": 3.8642857142857143, - "grad_norm": 47.89833450317383, + "grad_norm": 457.3583068847656, "learning_rate": 7.702250599403133e-06, - "loss": 11.625, + "loss": 11.5938, "step": 1082 }, { "epoch": 3.867857142857143, - "grad_norm": 53.37619400024414, + "grad_norm": 15.347333908081055, "learning_rate": 7.697003935729991e-06, - "loss": 10.0703, + "loss": 10.6562, "step": 1083 }, { "epoch": 3.8714285714285714, - "grad_norm": 45.76573944091797, + "grad_norm": 36.11317443847656, "learning_rate": 7.691753080453413e-06, - "loss": 9.5898, + "loss": 9.75, "step": 1084 }, { "epoch": 3.875, - "grad_norm": 54.7925910949707, + "grad_norm": 958.0936889648438, "learning_rate": 7.686498041734121e-06, - "loss": 10.4258, + "loss": 15.3887, "step": 1085 }, { "epoch": 3.8785714285714286, - "grad_norm": 54.30693817138672, + "grad_norm": 68.66741180419922, "learning_rate": 7.681238827739338e-06, - "loss": 12.2852, + "loss": 38.6523, "step": 1086 }, { "epoch": 3.882142857142857, - "grad_norm": 58.483360290527344, + "grad_norm": 107.61131286621094, "learning_rate": 7.675975446642784e-06, - "loss": 11.8828, + "loss": 12.3594, "step": 1087 }, { "epoch": 3.8857142857142857, - "grad_norm": 53.79195785522461, + "grad_norm": 8.555182456970215, "learning_rate": 7.670707906624644e-06, - "loss": 11.6875, + "loss": 9.6484, "step": 1088 }, { "epoch": 3.8892857142857142, - "grad_norm": 50.097381591796875, + "grad_norm": 96.56568908691406, "learning_rate": 7.665436215871576e-06, - "loss": 12.9922, + "loss": 11.0703, "step": 1089 }, { "epoch": 3.892857142857143, - "grad_norm": 53.380680084228516, + "grad_norm": 912.547607421875, "learning_rate": 7.660160382576683e-06, - "loss": 11.0742, + "loss": 17.8203, "step": 1090 }, { "epoch": 3.8964285714285714, - "grad_norm": 60.27989959716797, + "grad_norm": 35.095184326171875, "learning_rate": 7.65488041493951e-06, - "loss": 7.8789, + "loss": 6.8672, "step": 1091 }, { "epoch": 3.9, - "grad_norm": 51.71400451660156, + "grad_norm": 14.187033653259277, "learning_rate": 7.649596321166024e-06, - "loss": 9.7734, + "loss": 8.4023, "step": 1092 }, { "epoch": 3.9035714285714285, - "grad_norm": 53.004608154296875, + "grad_norm": 74.7503662109375, "learning_rate": 7.64430810946861e-06, - "loss": 11.2656, + "loss": 10.8281, "step": 1093 }, { "epoch": 3.907142857142857, - "grad_norm": 47.91337585449219, + "grad_norm": 13.023059844970703, "learning_rate": 7.639015788066046e-06, - "loss": 9.7227, + "loss": 8.0234, "step": 1094 }, { "epoch": 3.9107142857142856, - "grad_norm": 59.21824264526367, + "grad_norm": 52.7878303527832, "learning_rate": 7.633719365183505e-06, - "loss": 11.0117, + "loss": 9.7812, "step": 1095 }, { "epoch": 3.914285714285714, - "grad_norm": 52.356201171875, + "grad_norm": 1745.4278564453125, "learning_rate": 7.628418849052523e-06, - "loss": 11.6406, + "loss": 22.4648, "step": 1096 }, { "epoch": 3.9178571428571427, - "grad_norm": 48.493438720703125, + "grad_norm": 161.92263793945312, "learning_rate": 7.623114247911012e-06, - "loss": 10.2383, + "loss": 9.418, "step": 1097 }, { "epoch": 3.9214285714285713, - "grad_norm": 45.397037506103516, + "grad_norm": 6.775206089019775, "learning_rate": 7.617805570003223e-06, - "loss": 8.8945, + "loss": 7.2109, "step": 1098 }, { "epoch": 3.925, - "grad_norm": 43.46955108642578, + "grad_norm": 70.00099182128906, "learning_rate": 7.612492823579744e-06, - "loss": 10.2695, + "loss": 10.3242, "step": 1099 }, { "epoch": 3.928571428571429, - "grad_norm": 66.8531265258789, + "grad_norm": 29.310821533203125, "learning_rate": 7.607176016897491e-06, - "loss": 9.6055, + "loss": 8.2051, "step": 1100 }, { "epoch": 3.932142857142857, - "grad_norm": 46.657997131347656, + "grad_norm": 703.8792724609375, "learning_rate": 7.601855158219684e-06, - "loss": 9.2461, + "loss": 11.6172, "step": 1101 }, { "epoch": 3.935714285714286, - "grad_norm": 56.82691192626953, + "grad_norm": 105.47693634033203, "learning_rate": 7.596530255815846e-06, - "loss": 13.8516, + "loss": 13.9824, "step": 1102 }, { "epoch": 3.939285714285714, - "grad_norm": 49.10109329223633, + "grad_norm": 14.817994117736816, "learning_rate": 7.59120131796178e-06, - "loss": 8.957, + "loss": 8.082, "step": 1103 }, { "epoch": 3.942857142857143, - "grad_norm": 50.4098014831543, + "grad_norm": 145.88282775878906, "learning_rate": 7.585868352939564e-06, - "loss": 10.2891, + "loss": 8.8828, "step": 1104 }, { "epoch": 3.946428571428571, - "grad_norm": 63.67052459716797, + "grad_norm": 18.56246566772461, "learning_rate": 7.580531369037534e-06, - "loss": 10.7539, + "loss": 9.752, "step": 1105 }, { "epoch": 3.95, - "grad_norm": 58.474849700927734, + "grad_norm": 521.6914672851562, "learning_rate": 7.575190374550272e-06, - "loss": 12.4688, + "loss": 14.9395, "step": 1106 }, { "epoch": 3.9535714285714287, - "grad_norm": 57.26238250732422, + "grad_norm": 749.9774780273438, "learning_rate": 7.569845377778593e-06, - "loss": 10.1172, + "loss": 16.8242, "step": 1107 }, { "epoch": 3.9571428571428573, - "grad_norm": 77.57918548583984, + "grad_norm": 30.110071182250977, "learning_rate": 7.564496387029532e-06, - "loss": 8.6953, + "loss": 6.2695, "step": 1108 }, { "epoch": 3.960714285714286, - "grad_norm": 50.982540130615234, + "grad_norm": 502.418212890625, "learning_rate": 7.559143410616331e-06, - "loss": 11.3398, + "loss": 17.5801, "step": 1109 }, { "epoch": 3.9642857142857144, - "grad_norm": 68.15931701660156, + "grad_norm": 39.874000549316406, "learning_rate": 7.553786456858429e-06, - "loss": 11.8711, + "loss": 12.709, "step": 1110 }, { "epoch": 3.967857142857143, - "grad_norm": 44.718326568603516, + "grad_norm": 18.087642669677734, "learning_rate": 7.548425534081442e-06, - "loss": 9.1875, + "loss": 7.584, "step": 1111 }, { "epoch": 3.9714285714285715, - "grad_norm": 68.8305435180664, + "grad_norm": 1192.2974853515625, "learning_rate": 7.543060650617159e-06, - "loss": 12.2773, + "loss": 16.7812, "step": 1112 }, { "epoch": 3.975, - "grad_norm": 73.23046112060547, + "grad_norm": 121.82898712158203, "learning_rate": 7.537691814803522e-06, - "loss": 12.957, + "loss": 13.5918, "step": 1113 }, { "epoch": 3.9785714285714286, - "grad_norm": 51.342010498046875, + "grad_norm": 124.85108184814453, "learning_rate": 7.532319034984614e-06, - "loss": 13.4375, + "loss": 12.1797, "step": 1114 }, { "epoch": 3.982142857142857, - "grad_norm": 60.578426361083984, + "grad_norm": 38.0688362121582, "learning_rate": 7.526942319510655e-06, - "loss": 12.2969, + "loss": 10.0234, "step": 1115 }, { "epoch": 3.9857142857142858, - "grad_norm": 64.4384765625, + "grad_norm": 33.61159133911133, "learning_rate": 7.521561676737972e-06, - "loss": 12.9609, + "loss": 12.3945, "step": 1116 }, { "epoch": 3.9892857142857143, - "grad_norm": 63.59252166748047, + "grad_norm": 57.50529479980469, "learning_rate": 7.516177115029002e-06, - "loss": 14.2305, + "loss": 13.5195, "step": 1117 }, { "epoch": 3.992857142857143, - "grad_norm": 50.300716400146484, + "grad_norm": 11.313979148864746, "learning_rate": 7.510788642752269e-06, - "loss": 9.6875, + "loss": 10.0059, "step": 1118 }, { "epoch": 3.9964285714285714, - "grad_norm": 60.57063674926758, + "grad_norm": 1066.2344970703125, "learning_rate": 7.505396268282381e-06, - "loss": 11.2891, + "loss": 16.8867, "step": 1119 }, { "epoch": 4.0, - "grad_norm": 61.74922561645508, + "grad_norm": 639.6029663085938, "learning_rate": 7.500000000000001e-06, - "loss": 11.1562, + "loss": 12.293, "step": 1120 }, { "epoch": 4.0, - "eval_loss": 10.969278335571289, - "eval_mse": 10.966898889045924, - "eval_runtime": 11.0312, - "eval_samples_per_second": 257.452, - "eval_steps_per_second": 1.36, - "eval_target_0_mse": 20.208798511705073, - "eval_target_1_mse": 11.175192857125714, - "eval_target_2_mse": 6.365514511742435, - "eval_target_3_mse": 6.118089675610478, + "eval_loss": 13.046168327331543, + "eval_mse": 13.044220056770122, + "eval_runtime": 11.3336, + "eval_samples_per_second": 250.583, + "eval_steps_per_second": 1.324, + "eval_target_0_mse": 36.9398919499106, + "eval_target_1_mse": 8.881332675855948, + "eval_target_2_mse": 4.122082263530947, + "eval_target_3_mse": 2.233573337782996, "step": 1120 }, { "epoch": 4.003571428571429, - "grad_norm": 55.5285530090332, + "grad_norm": 11.331307411193848, "learning_rate": 7.494599846291853e-06, - "loss": 11.7812, + "loss": 11.5254, "step": 1121 }, { "epoch": 4.007142857142857, - "grad_norm": 46.621826171875, + "grad_norm": 949.3154907226562, "learning_rate": 7.489195815550692e-06, - "loss": 10.168, + "loss": 16.1367, "step": 1122 }, { "epoch": 4.010714285714286, - "grad_norm": 60.47281265258789, + "grad_norm": 25.53479766845703, "learning_rate": 7.483787916175307e-06, - "loss": 11.9844, + "loss": 12.6172, "step": 1123 }, { "epoch": 4.014285714285714, - "grad_norm": 70.85700988769531, + "grad_norm": 707.5961303710938, "learning_rate": 7.478376156570489e-06, - "loss": 12.8281, + "loss": 23.5, "step": 1124 }, { "epoch": 4.017857142857143, - "grad_norm": 59.2254638671875, + "grad_norm": 6.141911029815674, "learning_rate": 7.472960545147038e-06, - "loss": 8.7969, + "loss": 6.3086, "step": 1125 }, { "epoch": 4.021428571428571, - "grad_norm": 64.9921646118164, + "grad_norm": 319.1160888671875, "learning_rate": 7.467541090321735e-06, - "loss": 11.2656, + "loss": 24.5, "step": 1126 }, { "epoch": 4.025, - "grad_norm": 62.048789978027344, + "grad_norm": 40.97224044799805, "learning_rate": 7.462117800517337e-06, - "loss": 12.0625, + "loss": 15.252, "step": 1127 }, { "epoch": 4.0285714285714285, - "grad_norm": 57.300636291503906, + "grad_norm": 917.6477661132812, "learning_rate": 7.456690684162557e-06, - "loss": 11.4609, + "loss": 29.1758, "step": 1128 }, { "epoch": 4.0321428571428575, - "grad_norm": 54.24581527709961, + "grad_norm": 64.43729400634766, "learning_rate": 7.451259749692061e-06, - "loss": 10.0078, + "loss": 9.0859, "step": 1129 }, { "epoch": 4.035714285714286, - "grad_norm": 56.99355697631836, + "grad_norm": 50.33824157714844, "learning_rate": 7.445825005546448e-06, - "loss": 11.3867, + "loss": 9.7207, "step": 1130 }, { "epoch": 4.039285714285715, - "grad_norm": 64.51433563232422, + "grad_norm": 996.8959350585938, "learning_rate": 7.44038646017223e-06, - "loss": 11.4922, + "loss": 17.375, "step": 1131 }, { "epoch": 4.042857142857143, - "grad_norm": 52.136871337890625, + "grad_norm": 10.948749542236328, "learning_rate": 7.434944122021837e-06, - "loss": 11.1484, + "loss": 11.0156, "step": 1132 }, { "epoch": 4.046428571428572, - "grad_norm": 56.38874816894531, + "grad_norm": 22.893110275268555, "learning_rate": 7.4294979995535875e-06, - "loss": 9.8359, + "loss": 8.5664, "step": 1133 }, { "epoch": 4.05, - "grad_norm": 56.005104064941406, + "grad_norm": 28.63909149169922, "learning_rate": 7.424048101231687e-06, - "loss": 11.4727, + "loss": 9.6016, "step": 1134 }, { "epoch": 4.053571428571429, - "grad_norm": 70.1040267944336, + "grad_norm": 282.063232421875, "learning_rate": 7.4185944355261996e-06, - "loss": 9.1094, + "loss": 7.5176, "step": 1135 }, { "epoch": 4.057142857142857, - "grad_norm": 45.29258346557617, + "grad_norm": 6.367667198181152, "learning_rate": 7.413137010913055e-06, - "loss": 9.9609, + "loss": 10.0469, "step": 1136 }, { "epoch": 4.060714285714286, - "grad_norm": 47.330814361572266, + "grad_norm": 206.21275329589844, "learning_rate": 7.407675835874019e-06, - "loss": 10.8008, + "loss": 11.5273, "step": 1137 }, { "epoch": 4.064285714285714, - "grad_norm": 46.4870491027832, + "grad_norm": 87.36446380615234, "learning_rate": 7.4022109188966895e-06, - "loss": 11.1719, + "loss": 11.6738, "step": 1138 }, { "epoch": 4.067857142857143, - "grad_norm": 55.703086853027344, + "grad_norm": 30.88331413269043, "learning_rate": 7.396742268474475e-06, - "loss": 12.3867, + "loss": 13.2617, "step": 1139 }, { "epoch": 4.071428571428571, - "grad_norm": 52.2574577331543, + "grad_norm": 755.0950927734375, "learning_rate": 7.391269893106592e-06, - "loss": 9.832, + "loss": 12.2012, "step": 1140 }, { "epoch": 4.075, - "grad_norm": 57.37445068359375, + "grad_norm": 6.358447074890137, "learning_rate": 7.3857938012980425e-06, - "loss": 8.7148, + "loss": 8.7227, "step": 1141 }, { "epoch": 4.078571428571428, - "grad_norm": 52.484378814697266, + "grad_norm": 11.017739295959473, "learning_rate": 7.3803140015596065e-06, - "loss": 10.3008, + "loss": 9.9375, "step": 1142 }, { "epoch": 4.082142857142857, - "grad_norm": 47.229984283447266, + "grad_norm": 1083.0701904296875, "learning_rate": 7.374830502407827e-06, - "loss": 10.2148, + "loss": 15.2344, "step": 1143 }, { "epoch": 4.085714285714285, - "grad_norm": 47.75303649902344, + "grad_norm": 339.28814697265625, "learning_rate": 7.369343312364994e-06, - "loss": 9.5625, + "loss": 21.293, "step": 1144 }, { "epoch": 4.089285714285714, - "grad_norm": 67.51705169677734, + "grad_norm": 50.724647521972656, "learning_rate": 7.363852439959136e-06, - "loss": 9.7812, + "loss": 7.0801, "step": 1145 }, { "epoch": 4.0928571428571425, - "grad_norm": 64.51493072509766, + "grad_norm": 34.80635452270508, "learning_rate": 7.358357893724003e-06, - "loss": 13.7344, + "loss": 13.8828, "step": 1146 }, { "epoch": 4.0964285714285715, - "grad_norm": 55.955833435058594, + "grad_norm": 657.4717407226562, "learning_rate": 7.352859682199058e-06, - "loss": 10.9609, + "loss": 14.2061, "step": 1147 }, { "epoch": 4.1, - "grad_norm": 45.38529586791992, + "grad_norm": 409.73321533203125, "learning_rate": 7.347357813929455e-06, - "loss": 10.2695, + "loss": 10.4375, "step": 1148 }, { "epoch": 4.103571428571429, - "grad_norm": 44.22454833984375, + "grad_norm": 61.84675598144531, "learning_rate": 7.341852297466036e-06, - "loss": 9.2148, + "loss": 7.9883, "step": 1149 }, { "epoch": 4.107142857142857, - "grad_norm": 70.82363891601562, + "grad_norm": 524.6834716796875, "learning_rate": 7.336343141365311e-06, - "loss": 16.1055, + "loss": 19.3242, "step": 1150 }, { "epoch": 4.110714285714286, - "grad_norm": 66.38835144042969, + "grad_norm": 919.642578125, "learning_rate": 7.3308303541894465e-06, - "loss": 13.0234, + "loss": 17.0312, "step": 1151 }, { "epoch": 4.114285714285714, - "grad_norm": 56.455604553222656, + "grad_norm": 310.1175537109375, "learning_rate": 7.3253139445062535e-06, - "loss": 9.6094, + "loss": 7.832, "step": 1152 }, { "epoch": 4.117857142857143, - "grad_norm": 46.04097366333008, + "grad_norm": 287.7452087402344, "learning_rate": 7.319793920889171e-06, - "loss": 10.3789, + "loss": 8.9941, "step": 1153 }, { "epoch": 4.121428571428571, - "grad_norm": 67.42240142822266, + "grad_norm": 826.5342407226562, "learning_rate": 7.314270291917256e-06, - "loss": 11.0352, + "loss": 18.3516, "step": 1154 }, { "epoch": 4.125, - "grad_norm": 57.54825210571289, + "grad_norm": 15.265548706054688, "learning_rate": 7.308743066175172e-06, - "loss": 12.1172, + "loss": 12.2305, "step": 1155 }, { "epoch": 4.128571428571428, - "grad_norm": 49.48421096801758, + "grad_norm": 89.75617980957031, "learning_rate": 7.303212252253163e-06, - "loss": 11.1133, + "loss": 34.9961, "step": 1156 }, { "epoch": 4.132142857142857, - "grad_norm": 47.61314392089844, + "grad_norm": 19.17425537109375, "learning_rate": 7.297677858747059e-06, - "loss": 11.207, + "loss": 11.1992, "step": 1157 }, { "epoch": 4.135714285714286, - "grad_norm": 44.782066345214844, + "grad_norm": 75.50289916992188, "learning_rate": 7.29213989425825e-06, - "loss": 8.6367, + "loss": 6.9492, "step": 1158 }, { "epoch": 4.139285714285714, - "grad_norm": 50.94352340698242, + "grad_norm": 14.794899940490723, "learning_rate": 7.286598367393678e-06, - "loss": 11.8125, + "loss": 12.2812, "step": 1159 }, { "epoch": 4.142857142857143, - "grad_norm": 52.17855453491211, + "grad_norm": 134.30154418945312, "learning_rate": 7.281053286765816e-06, - "loss": 11.5234, + "loss": 11.9648, "step": 1160 }, { "epoch": 4.146428571428571, - "grad_norm": 60.46622848510742, + "grad_norm": 11.307595252990723, "learning_rate": 7.275504660992665e-06, - "loss": 11.2109, + "loss": 11.3906, "step": 1161 }, { "epoch": 4.15, - "grad_norm": 46.69803237915039, + "grad_norm": 112.6237564086914, "learning_rate": 7.269952498697734e-06, - "loss": 10.2578, + "loss": 10.625, "step": 1162 }, { "epoch": 4.1535714285714285, - "grad_norm": 53.10807800292969, + "grad_norm": 377.7585754394531, "learning_rate": 7.264396808510031e-06, - "loss": 10.7539, + "loss": 9.6719, "step": 1163 }, { "epoch": 4.1571428571428575, - "grad_norm": 61.72574996948242, + "grad_norm": 8.96403694152832, "learning_rate": 7.258837599064043e-06, - "loss": 12.0391, + "loss": 10.5703, "step": 1164 }, { "epoch": 4.160714285714286, - "grad_norm": 45.25117874145508, + "grad_norm": 9.523026466369629, "learning_rate": 7.253274878999728e-06, - "loss": 11.3008, + "loss": 10.0059, "step": 1165 }, { "epoch": 4.164285714285715, - "grad_norm": 49.409278869628906, + "grad_norm": 46.72370147705078, "learning_rate": 7.247708656962498e-06, - "loss": 11.5859, + "loss": 11.875, "step": 1166 }, { "epoch": 4.167857142857143, - "grad_norm": 46.97398376464844, + "grad_norm": 380.0696716308594, "learning_rate": 7.242138941603216e-06, - "loss": 7.1094, + "loss": 6.3047, "step": 1167 }, { "epoch": 4.171428571428572, - "grad_norm": 76.52069854736328, + "grad_norm": 21.193872451782227, "learning_rate": 7.236565741578163e-06, - "loss": 11.8906, + "loss": 10.7695, "step": 1168 }, { "epoch": 4.175, - "grad_norm": 58.366790771484375, + "grad_norm": 19.794429779052734, "learning_rate": 7.2309890655490446e-06, - "loss": 9.0117, + "loss": 8.916, "step": 1169 }, { "epoch": 4.178571428571429, - "grad_norm": 45.68241500854492, + "grad_norm": 629.47705078125, "learning_rate": 7.225408922182962e-06, - "loss": 9.6172, + "loss": 11.0527, "step": 1170 }, { "epoch": 4.182142857142857, - "grad_norm": 51.13352584838867, + "grad_norm": 12.269636154174805, "learning_rate": 7.219825320152411e-06, - "loss": 10.6445, + "loss": 11.1914, "step": 1171 }, { "epoch": 4.185714285714286, - "grad_norm": 69.04093170166016, + "grad_norm": 8.116941452026367, "learning_rate": 7.214238268135258e-06, - "loss": 11.1133, + "loss": 10.0039, "step": 1172 }, { "epoch": 4.189285714285714, - "grad_norm": 60.07635498046875, + "grad_norm": 353.8351135253906, "learning_rate": 7.2086477748147345e-06, - "loss": 12.8359, + "loss": 23.2578, "step": 1173 }, { "epoch": 4.192857142857143, - "grad_norm": 64.1850814819336, + "grad_norm": 12.85792350769043, "learning_rate": 7.203053848879419e-06, - "loss": 11.8945, + "loss": 11.5625, "step": 1174 }, { "epoch": 4.196428571428571, - "grad_norm": 42.3048210144043, + "grad_norm": 43.6623649597168, "learning_rate": 7.197456499023226e-06, - "loss": 9.4375, + "loss": 9.3398, "step": 1175 }, { "epoch": 4.2, - "grad_norm": 61.62110900878906, + "grad_norm": 894.333984375, "learning_rate": 7.191855733945388e-06, - "loss": 11.668, + "loss": 19.7266, "step": 1176 }, { "epoch": 4.203571428571428, - "grad_norm": 47.813899993896484, + "grad_norm": 137.5155029296875, "learning_rate": 7.186251562350449e-06, - "loss": 9.7578, + "loss": 10.4414, "step": 1177 }, { "epoch": 4.207142857142857, - "grad_norm": 61.196903228759766, + "grad_norm": 133.12570190429688, "learning_rate": 7.180643992948247e-06, - "loss": 11.3789, + "loss": 11.8438, "step": 1178 }, { "epoch": 4.210714285714285, - "grad_norm": 56.878868103027344, + "grad_norm": 26.76664161682129, "learning_rate": 7.1750330344539e-06, - "loss": 9.0742, + "loss": 7.2734, "step": 1179 }, { "epoch": 4.214285714285714, - "grad_norm": 67.1504898071289, + "grad_norm": 868.9444580078125, "learning_rate": 7.169418695587791e-06, - "loss": 11.3281, + "loss": 13.5312, "step": 1180 }, { "epoch": 4.2178571428571425, - "grad_norm": 50.292842864990234, + "grad_norm": 15.997787475585938, "learning_rate": 7.163800985075561e-06, - "loss": 12.1172, + "loss": 12.5, "step": 1181 }, { "epoch": 4.2214285714285715, - "grad_norm": 63.77092742919922, + "grad_norm": 252.82699584960938, "learning_rate": 7.158179911648087e-06, - "loss": 8.1094, + "loss": 7.1348, "step": 1182 }, { "epoch": 4.225, - "grad_norm": 80.61720275878906, + "grad_norm": 14.11526107788086, "learning_rate": 7.1525554840414765e-06, - "loss": 10.5781, + "loss": 12.5938, "step": 1183 }, { "epoch": 4.228571428571429, - "grad_norm": 57.41088104248047, + "grad_norm": 292.5434875488281, "learning_rate": 7.146927710997047e-06, - "loss": 9.3008, + "loss": 8.0234, "step": 1184 }, { "epoch": 4.232142857142857, - "grad_norm": 60.05517578125, + "grad_norm": 40.31282043457031, "learning_rate": 7.1412966012613135e-06, - "loss": 11.6914, + "loss": 11.2598, "step": 1185 }, { "epoch": 4.235714285714286, - "grad_norm": 51.733856201171875, + "grad_norm": 1064.87158203125, "learning_rate": 7.135662163585984e-06, - "loss": 10.9336, + "loss": 15.3398, "step": 1186 }, { "epoch": 4.239285714285714, - "grad_norm": 54.662811279296875, + "grad_norm": 25.58346939086914, "learning_rate": 7.1300244067279335e-06, - "loss": 12.8594, + "loss": 13.0469, "step": 1187 }, { "epoch": 4.242857142857143, - "grad_norm": 59.054107666015625, + "grad_norm": 289.4027404785156, "learning_rate": 7.124383339449193e-06, - "loss": 15.625, + "loss": 16.5215, "step": 1188 }, { "epoch": 4.246428571428572, - "grad_norm": 67.9957275390625, + "grad_norm": 131.808837890625, "learning_rate": 7.118738970516944e-06, - "loss": 13.0352, + "loss": 13.4531, "step": 1189 }, { "epoch": 4.25, - "grad_norm": 61.39967346191406, + "grad_norm": 14.855598449707031, "learning_rate": 7.113091308703498e-06, - "loss": 13.168, + "loss": 13.4648, "step": 1190 }, { "epoch": 4.253571428571428, - "grad_norm": 45.41973114013672, + "grad_norm": 69.07725524902344, "learning_rate": 7.107440362786282e-06, - "loss": 9.1484, + "loss": 9.9883, "step": 1191 }, { "epoch": 4.257142857142857, - "grad_norm": 56.73194122314453, + "grad_norm": 29.45139503479004, "learning_rate": 7.101786141547829e-06, - "loss": 8.668, + "loss": 7.4062, "step": 1192 }, { "epoch": 4.260714285714286, - "grad_norm": 55.76304626464844, + "grad_norm": 1113.4259033203125, "learning_rate": 7.09612865377576e-06, - "loss": 11.1523, + "loss": 16.5664, "step": 1193 }, { "epoch": 4.264285714285714, - "grad_norm": 48.00605010986328, + "grad_norm": 39.21089172363281, "learning_rate": 7.090467908262777e-06, - "loss": 10.0117, + "loss": 10.7148, "step": 1194 }, { "epoch": 4.267857142857143, - "grad_norm": 53.62165451049805, + "grad_norm": 704.34814453125, "learning_rate": 7.084803913806642e-06, - "loss": 9.9297, + "loss": 12.0859, "step": 1195 }, { "epoch": 4.271428571428571, - "grad_norm": 48.24089050292969, + "grad_norm": 1007.8427734375, "learning_rate": 7.079136679210165e-06, - "loss": 10.3477, + "loss": 18.7969, "step": 1196 }, { "epoch": 4.275, - "grad_norm": 66.03018951416016, + "grad_norm": 210.5769500732422, "learning_rate": 7.073466213281196e-06, - "loss": 8.0078, + "loss": 5.9688, "step": 1197 }, { "epoch": 4.2785714285714285, - "grad_norm": 66.65876007080078, + "grad_norm": 65.44849395751953, "learning_rate": 7.067792524832604e-06, - "loss": 12.9336, + "loss": 14.1836, "step": 1198 }, { "epoch": 4.2821428571428575, - "grad_norm": 62.34231948852539, + "grad_norm": 47.49427795410156, "learning_rate": 7.062115622682267e-06, - "loss": 9.4844, + "loss": 9.0, "step": 1199 }, { "epoch": 4.285714285714286, - "grad_norm": 47.663143157958984, + "grad_norm": 51.151390075683594, "learning_rate": 7.056435515653059e-06, - "loss": 9.1172, + "loss": 7.5742, "step": 1200 }, { "epoch": 4.289285714285715, - "grad_norm": 46.71085739135742, + "grad_norm": 969.561767578125, "learning_rate": 7.050752212572831e-06, - "loss": 10.6953, + "loss": 13.2559, "step": 1201 }, { "epoch": 4.292857142857143, - "grad_norm": 70.4585189819336, + "grad_norm": 46.736392974853516, "learning_rate": 7.045065722274407e-06, - "loss": 12.0156, + "loss": 14.1055, "step": 1202 }, { "epoch": 4.296428571428572, - "grad_norm": 51.536556243896484, + "grad_norm": 845.5842895507812, "learning_rate": 7.039376053595559e-06, - "loss": 10.0273, + "loss": 13.9141, "step": 1203 }, { "epoch": 4.3, - "grad_norm": 51.845401763916016, + "grad_norm": 981.0889892578125, "learning_rate": 7.033683215379002e-06, - "loss": 13.625, + "loss": 21.5312, "step": 1204 }, { "epoch": 4.303571428571429, - "grad_norm": 41.78042221069336, + "grad_norm": 15.28213119506836, "learning_rate": 7.027987216472376e-06, - "loss": 8.6055, + "loss": 8.2129, "step": 1205 }, { "epoch": 4.307142857142857, - "grad_norm": 44.98314666748047, + "grad_norm": 24.408987045288086, "learning_rate": 7.022288065728233e-06, - "loss": 9.582, + "loss": 8.9414, "step": 1206 }, { "epoch": 4.310714285714286, - "grad_norm": 58.86567687988281, + "grad_norm": 58.789222717285156, "learning_rate": 7.016585772004026e-06, - "loss": 10.0312, + "loss": 8.8516, "step": 1207 }, { "epoch": 4.314285714285714, - "grad_norm": 59.64255142211914, + "grad_norm": 29.37164878845215, "learning_rate": 7.010880344162087e-06, - "loss": 10.0156, + "loss": 10.4688, "step": 1208 }, { "epoch": 4.317857142857143, - "grad_norm": 47.514400482177734, + "grad_norm": 12.697324752807617, "learning_rate": 7.005171791069626e-06, - "loss": 10.1719, + "loss": 10.3945, "step": 1209 }, { "epoch": 4.321428571428571, - "grad_norm": 45.80529022216797, + "grad_norm": 25.562158584594727, "learning_rate": 6.999460121598704e-06, - "loss": 8.4727, + "loss": 7.4258, "step": 1210 }, { "epoch": 4.325, - "grad_norm": 67.92034912109375, + "grad_norm": 183.7058563232422, "learning_rate": 6.993745344626232e-06, - "loss": 9.707, + "loss": 8.8652, "step": 1211 }, { "epoch": 4.328571428571428, - "grad_norm": 59.824092864990234, + "grad_norm": 5.832786560058594, "learning_rate": 6.988027469033943e-06, - "loss": 10.4141, + "loss": 9.6992, "step": 1212 }, { "epoch": 4.332142857142857, - "grad_norm": 46.942298889160156, + "grad_norm": 56.1806755065918, "learning_rate": 6.9823065037083885e-06, - "loss": 9.2383, + "loss": 9.9062, "step": 1213 }, { "epoch": 4.335714285714285, - "grad_norm": 65.91862487792969, + "grad_norm": 205.55335998535156, "learning_rate": 6.976582457540926e-06, - "loss": 13.3047, + "loss": 13.1016, "step": 1214 }, { "epoch": 4.339285714285714, - "grad_norm": 62.025333404541016, + "grad_norm": 70.0198974609375, "learning_rate": 6.970855339427698e-06, - "loss": 12.75, + "loss": 10.3828, "step": 1215 }, { "epoch": 4.3428571428571425, - "grad_norm": 45.97697448730469, + "grad_norm": 378.94500732421875, "learning_rate": 6.965125158269619e-06, - "loss": 9.1797, + "loss": 9.3809, "step": 1216 }, { "epoch": 4.3464285714285715, - "grad_norm": 63.53715896606445, + "grad_norm": 90.15065002441406, "learning_rate": 6.959391922972368e-06, - "loss": 9.9219, + "loss": 17.8047, "step": 1217 }, { "epoch": 4.35, - "grad_norm": 50.1829719543457, + "grad_norm": 143.60110473632812, "learning_rate": 6.953655642446368e-06, - "loss": 11.5195, + "loss": 12.6465, "step": 1218 }, { "epoch": 4.353571428571429, - "grad_norm": 64.87712097167969, + "grad_norm": 28.561920166015625, "learning_rate": 6.94791632560678e-06, - "loss": 11.2891, + "loss": 13.332, "step": 1219 }, { "epoch": 4.357142857142857, - "grad_norm": 48.15956115722656, + "grad_norm": 110.9373550415039, "learning_rate": 6.942173981373474e-06, - "loss": 11.9219, + "loss": 11.5234, "step": 1220 }, { "epoch": 4.360714285714286, - "grad_norm": 51.6088981628418, + "grad_norm": 25.13768196105957, "learning_rate": 6.9364286186710335e-06, - "loss": 10.5938, + "loss": 11.1289, "step": 1221 }, { "epoch": 4.364285714285714, - "grad_norm": 70.58609771728516, + "grad_norm": 151.27137756347656, "learning_rate": 6.930680246428732e-06, - "loss": 13.4844, + "loss": 15.7969, "step": 1222 }, { "epoch": 4.367857142857143, - "grad_norm": 53.69131088256836, + "grad_norm": 226.3632354736328, "learning_rate": 6.924928873580518e-06, - "loss": 9.957, + "loss": 9.9375, "step": 1223 }, { "epoch": 4.371428571428572, - "grad_norm": 49.677127838134766, + "grad_norm": 5.776775360107422, "learning_rate": 6.919174509065003e-06, - "loss": 9.7891, + "loss": 10.75, "step": 1224 }, { "epoch": 4.375, - "grad_norm": 46.05371856689453, + "grad_norm": 598.9332885742188, "learning_rate": 6.913417161825449e-06, - "loss": 10.2773, + "loss": 10.7783, "step": 1225 }, { "epoch": 4.378571428571428, - "grad_norm": 47.70090866088867, + "grad_norm": 182.47251892089844, "learning_rate": 6.907656840809758e-06, - "loss": 9.3086, + "loss": 10.3984, "step": 1226 }, { "epoch": 4.382142857142857, - "grad_norm": 62.83681106567383, + "grad_norm": 87.76947021484375, "learning_rate": 6.901893554970446e-06, - "loss": 8.5117, + "loss": 6.8633, "step": 1227 }, { "epoch": 4.385714285714286, - "grad_norm": 45.37397384643555, + "grad_norm": 19.170286178588867, "learning_rate": 6.896127313264643e-06, - "loss": 9.3789, + "loss": 11.1797, "step": 1228 }, { "epoch": 4.389285714285714, - "grad_norm": 56.94482421875, + "grad_norm": 44.07053756713867, "learning_rate": 6.89035812465407e-06, - "loss": 9.6211, + "loss": 10.2852, "step": 1229 }, { "epoch": 4.392857142857143, - "grad_norm": 58.994407653808594, + "grad_norm": 16.56456756591797, "learning_rate": 6.8845859981050265e-06, - "loss": 9.7734, + "loss": 8.5195, "step": 1230 }, { "epoch": 4.396428571428571, - "grad_norm": 44.161659240722656, + "grad_norm": 6.421798229217529, "learning_rate": 6.878810942588384e-06, - "loss": 9.7031, + "loss": 7.6836, "step": 1231 }, { "epoch": 4.4, - "grad_norm": 57.68800735473633, + "grad_norm": 23.67146110534668, "learning_rate": 6.873032967079562e-06, - "loss": 9.4609, + "loss": 8.5371, "step": 1232 }, { "epoch": 4.4035714285714285, - "grad_norm": 53.67744827270508, + "grad_norm": 27.297672271728516, "learning_rate": 6.867252080558514e-06, - "loss": 12.7969, + "loss": 11.9062, "step": 1233 }, { "epoch": 4.4071428571428575, - "grad_norm": 43.71571350097656, + "grad_norm": 18.766313552856445, "learning_rate": 6.8614682920097265e-06, - "loss": 8.3047, + "loss": 7.5566, "step": 1234 }, { "epoch": 4.410714285714286, - "grad_norm": 49.43186569213867, + "grad_norm": 788.2775268554688, "learning_rate": 6.85568161042219e-06, - "loss": 10.5117, + "loss": 13.9766, "step": 1235 }, { "epoch": 4.414285714285715, - "grad_norm": 67.0520248413086, + "grad_norm": 299.6541442871094, "learning_rate": 6.8498920447893955e-06, - "loss": 12.0078, + "loss": 11.2109, "step": 1236 }, { "epoch": 4.417857142857143, - "grad_norm": 51.61349868774414, + "grad_norm": 24.054889678955078, "learning_rate": 6.844099604109311e-06, - "loss": 10.0, + "loss": 10.3398, "step": 1237 }, { "epoch": 4.421428571428572, - "grad_norm": 54.946998596191406, + "grad_norm": 196.21609497070312, "learning_rate": 6.838304297384377e-06, - "loss": 10.3281, + "loss": 11.9414, "step": 1238 }, { "epoch": 4.425, - "grad_norm": 59.87200164794922, + "grad_norm": 9.773138999938965, "learning_rate": 6.832506133621487e-06, - "loss": 16.7656, + "loss": 15.4863, "step": 1239 }, { "epoch": 4.428571428571429, - "grad_norm": 47.18049240112305, + "grad_norm": 160.42391967773438, "learning_rate": 6.8267051218319766e-06, - "loss": 8.5703, + "loss": 8.2031, "step": 1240 }, { "epoch": 4.432142857142857, - "grad_norm": 46.36000061035156, + "grad_norm": 31.924097061157227, "learning_rate": 6.820901271031604e-06, - "loss": 9.2812, + "loss": 8.3047, "step": 1241 }, { "epoch": 4.435714285714286, - "grad_norm": 46.93675231933594, + "grad_norm": 22.626821517944336, "learning_rate": 6.8150945902405415e-06, - "loss": 10.7344, + "loss": 10.0508, "step": 1242 }, { "epoch": 4.439285714285714, - "grad_norm": 58.69475173950195, + "grad_norm": 138.541259765625, "learning_rate": 6.809285088483361e-06, - "loss": 10.5, + "loss": 9.752, "step": 1243 }, { "epoch": 4.442857142857143, - "grad_norm": 55.6963005065918, + "grad_norm": 27.272762298583984, "learning_rate": 6.8034727747890195e-06, - "loss": 10.0508, + "loss": 8.959, "step": 1244 }, { "epoch": 4.446428571428571, - "grad_norm": 53.460018157958984, + "grad_norm": 3.7862722873687744, "learning_rate": 6.797657658190838e-06, - "loss": 8.2422, + "loss": 7.8594, "step": 1245 }, { "epoch": 4.45, - "grad_norm": 52.176509857177734, + "grad_norm": 1252.1180419921875, "learning_rate": 6.7918397477265e-06, - "loss": 12.6602, + "loss": 32.6523, "step": 1246 }, { "epoch": 4.453571428571428, - "grad_norm": 43.547237396240234, + "grad_norm": 13.66934585571289, "learning_rate": 6.786019052438033e-06, - "loss": 9.7188, + "loss": 9.1133, "step": 1247 }, { "epoch": 4.457142857142857, - "grad_norm": 49.920433044433594, + "grad_norm": 59.5120735168457, "learning_rate": 6.780195581371785e-06, - "loss": 8.3125, + "loss": 8.1406, "step": 1248 }, { "epoch": 4.460714285714285, - "grad_norm": 52.410396575927734, + "grad_norm": 13.643356323242188, "learning_rate": 6.7743693435784195e-06, - "loss": 13.0195, + "loss": 12.0117, "step": 1249 }, { "epoch": 4.464285714285714, - "grad_norm": 65.32703399658203, + "grad_norm": 27.058303833007812, "learning_rate": 6.768540348112908e-06, - "loss": 14.0391, + "loss": 12.3809, "step": 1250 }, { "epoch": 4.4678571428571425, - "grad_norm": 48.972904205322266, + "grad_norm": 840.5648193359375, "learning_rate": 6.762708604034498e-06, - "loss": 12.5, + "loss": 15.125, "step": 1251 }, { "epoch": 4.4714285714285715, - "grad_norm": 52.25868225097656, + "grad_norm": 893.9486083984375, "learning_rate": 6.7568741204067145e-06, - "loss": 9.9062, + "loss": 18.3672, "step": 1252 }, { "epoch": 4.475, - "grad_norm": 42.98270797729492, + "grad_norm": 226.90716552734375, "learning_rate": 6.751036906297338e-06, - "loss": 9.9258, + "loss": 10.8438, "step": 1253 }, { "epoch": 4.478571428571429, - "grad_norm": 55.96113204956055, + "grad_norm": 28.33758544921875, "learning_rate": 6.745196970778394e-06, - "loss": 12.4375, + "loss": 12.1133, "step": 1254 }, { "epoch": 4.482142857142857, - "grad_norm": 47.135284423828125, + "grad_norm": 5.575850009918213, "learning_rate": 6.739354322926136e-06, - "loss": 9.1602, + "loss": 9.6152, "step": 1255 }, { "epoch": 4.485714285714286, - "grad_norm": 54.09988021850586, + "grad_norm": 95.85755920410156, "learning_rate": 6.733508971821037e-06, - "loss": 9.7617, + "loss": 8.873, "step": 1256 }, { "epoch": 4.489285714285714, - "grad_norm": 56.34666442871094, + "grad_norm": 18.643903732299805, "learning_rate": 6.727660926547763e-06, - "loss": 10.5352, + "loss": 11.6875, "step": 1257 }, { "epoch": 4.492857142857143, - "grad_norm": 44.73397445678711, + "grad_norm": 1130.993896484375, "learning_rate": 6.721810196195176e-06, - "loss": 8.1602, + "loss": 11.6113, "step": 1258 }, { "epoch": 4.496428571428572, - "grad_norm": 49.70079803466797, + "grad_norm": 69.48484802246094, "learning_rate": 6.715956789856306e-06, - "loss": 9.9062, + "loss": 11.4219, "step": 1259 }, { "epoch": 4.5, - "grad_norm": 57.20534133911133, + "grad_norm": 32.96428680419922, "learning_rate": 6.710100716628345e-06, - "loss": 9.2852, + "loss": 6.8496, "step": 1260 }, { "epoch": 4.503571428571428, - "grad_norm": 72.92005920410156, + "grad_norm": 71.4104995727539, "learning_rate": 6.704241985612625e-06, - "loss": 11.6875, + "loss": 13.4531, "step": 1261 }, { "epoch": 4.507142857142857, - "grad_norm": 56.88804626464844, + "grad_norm": 302.7184143066406, "learning_rate": 6.698380605914614e-06, - "loss": 11.5703, + "loss": 11.0898, "step": 1262 }, { "epoch": 4.510714285714286, - "grad_norm": 47.69300842285156, + "grad_norm": 963.237548828125, "learning_rate": 6.692516586643895e-06, - "loss": 9.0078, + "loss": 15.4766, "step": 1263 }, { "epoch": 4.514285714285714, - "grad_norm": 45.924537658691406, + "grad_norm": 21.416893005371094, "learning_rate": 6.686649936914151e-06, - "loss": 9.043, + "loss": 7.8809, "step": 1264 }, { "epoch": 4.517857142857143, - "grad_norm": 43.5368766784668, + "grad_norm": 15.644923210144043, "learning_rate": 6.680780665843155e-06, - "loss": 9.3945, + "loss": 9.8633, "step": 1265 }, { "epoch": 4.521428571428571, - "grad_norm": 63.852359771728516, + "grad_norm": 704.9505615234375, "learning_rate": 6.6749087825527535e-06, - "loss": 9.043, + "loss": 10.1719, "step": 1266 }, { "epoch": 4.525, - "grad_norm": 59.51030349731445, + "grad_norm": 86.38751983642578, "learning_rate": 6.669034296168855e-06, - "loss": 10.3555, + "loss": 8.4805, "step": 1267 }, { "epoch": 4.5285714285714285, - "grad_norm": 62.62824630737305, + "grad_norm": 10.91057014465332, "learning_rate": 6.6631572158214105e-06, - "loss": 9.0898, + "loss": 7.2754, "step": 1268 }, { "epoch": 4.5321428571428575, - "grad_norm": 60.62123489379883, + "grad_norm": 1.7746273279190063, "learning_rate": 6.657277550644404e-06, - "loss": 9.8359, + "loss": 8.0645, "step": 1269 }, { "epoch": 4.535714285714286, - "grad_norm": 61.775962829589844, + "grad_norm": 6.527629375457764, "learning_rate": 6.651395309775837e-06, - "loss": 11.6211, + "loss": 10.0195, "step": 1270 }, { "epoch": 4.539285714285715, - "grad_norm": 49.226749420166016, + "grad_norm": 10.498749732971191, "learning_rate": 6.645510502357712e-06, - "loss": 12.918, + "loss": 13.2637, "step": 1271 }, { "epoch": 4.542857142857143, - "grad_norm": 52.22493362426758, + "grad_norm": 1040.6558837890625, "learning_rate": 6.639623137536023e-06, - "loss": 11.8203, + "loss": 20.8809, "step": 1272 }, { "epoch": 4.546428571428572, - "grad_norm": 48.03239440917969, + "grad_norm": 941.9212036132812, "learning_rate": 6.6337332244607376e-06, - "loss": 12.4023, + "loss": 14.8789, "step": 1273 }, { "epoch": 4.55, - "grad_norm": 83.61151123046875, + "grad_norm": 676.2642211914062, "learning_rate": 6.627840772285784e-06, - "loss": 14.0, + "loss": 24.4375, "step": 1274 }, { "epoch": 4.553571428571429, - "grad_norm": 65.64878845214844, + "grad_norm": 37.2834587097168, "learning_rate": 6.621945790169037e-06, - "loss": 11.1602, + "loss": 9.9199, "step": 1275 }, { "epoch": 4.557142857142857, - "grad_norm": 43.84993362426758, + "grad_norm": 4.13286828994751, "learning_rate": 6.616048287272301e-06, - "loss": 9.6055, + "loss": 8.541, "step": 1276 }, { "epoch": 4.560714285714286, - "grad_norm": 57.05831527709961, + "grad_norm": 56.673709869384766, "learning_rate": 6.610148272761301e-06, - "loss": 9.4922, + "loss": 9.1094, "step": 1277 }, { "epoch": 4.564285714285714, - "grad_norm": 55.946990966796875, + "grad_norm": 613.0344848632812, "learning_rate": 6.604245755805665e-06, - "loss": 10.8086, + "loss": 12.1133, "step": 1278 }, { "epoch": 4.567857142857143, - "grad_norm": 53.55848693847656, + "grad_norm": 13.159357070922852, "learning_rate": 6.598340745578908e-06, - "loss": 8.4922, + "loss": 6.6562, "step": 1279 }, { "epoch": 4.571428571428571, - "grad_norm": 58.75669479370117, + "grad_norm": 413.35357666015625, "learning_rate": 6.592433251258423e-06, - "loss": 9.6758, + "loss": 18.3535, "step": 1280 }, { "epoch": 4.575, - "grad_norm": 62.64598083496094, + "grad_norm": 361.80316162109375, "learning_rate": 6.586523282025462e-06, - "loss": 13.1914, + "loss": 15.0762, "step": 1281 }, { "epoch": 4.578571428571428, - "grad_norm": 48.03810501098633, + "grad_norm": 417.19775390625, "learning_rate": 6.5806108470651235e-06, - "loss": 9.1133, + "loss": 10.0996, "step": 1282 }, { "epoch": 4.582142857142857, - "grad_norm": 58.10173797607422, + "grad_norm": 878.2360229492188, "learning_rate": 6.574695955566337e-06, - "loss": 10.7617, + "loss": 18.4805, "step": 1283 }, { "epoch": 4.585714285714285, - "grad_norm": 54.2088508605957, + "grad_norm": 33.37519454956055, "learning_rate": 6.568778616721853e-06, - "loss": 10.3984, + "loss": 9.4531, "step": 1284 }, { "epoch": 4.589285714285714, - "grad_norm": 51.699485778808594, + "grad_norm": 30.391077041625977, "learning_rate": 6.562858839728224e-06, - "loss": 10.7461, + "loss": 10.5625, "step": 1285 }, { "epoch": 4.5928571428571425, - "grad_norm": 54.285789489746094, + "grad_norm": 1016.7225341796875, "learning_rate": 6.556936633785788e-06, - "loss": 10.6523, + "loss": 17.8398, "step": 1286 }, { "epoch": 4.5964285714285715, - "grad_norm": 47.19926834106445, + "grad_norm": 135.2120819091797, "learning_rate": 6.551012008098668e-06, - "loss": 10.6094, + "loss": 11.5898, "step": 1287 }, { "epoch": 4.6, - "grad_norm": 59.04658508300781, + "grad_norm": 215.13153076171875, "learning_rate": 6.545084971874738e-06, - "loss": 8.8242, + "loss": 7.3516, "step": 1288 }, { "epoch": 4.603571428571429, - "grad_norm": 47.80223083496094, + "grad_norm": 17.506567001342773, "learning_rate": 6.5391555343256205e-06, - "loss": 9.6133, + "loss": 8.5508, "step": 1289 }, { "epoch": 4.607142857142857, - "grad_norm": 49.57460021972656, + "grad_norm": 299.6906433105469, "learning_rate": 6.5332237046666725e-06, - "loss": 11.4023, + "loss": 11.4453, "step": 1290 }, { "epoch": 4.610714285714286, - "grad_norm": 49.131839752197266, + "grad_norm": 606.7481079101562, "learning_rate": 6.527289492116967e-06, - "loss": 11.2891, + "loss": 14.1758, "step": 1291 }, { "epoch": 4.614285714285714, - "grad_norm": 62.12271499633789, + "grad_norm": 1022.994384765625, "learning_rate": 6.521352905899283e-06, - "loss": 12.2344, + "loss": 20.5859, "step": 1292 }, { "epoch": 4.617857142857143, - "grad_norm": 64.56568908691406, + "grad_norm": 927.5538940429688, "learning_rate": 6.515413955240083e-06, - "loss": 9.5039, + "loss": 14.5332, "step": 1293 }, { "epoch": 4.621428571428572, - "grad_norm": 58.898284912109375, + "grad_norm": 24.565195083618164, "learning_rate": 6.509472649369511e-06, - "loss": 8.8828, + "loss": 20.5059, "step": 1294 }, { "epoch": 4.625, - "grad_norm": 64.50312805175781, + "grad_norm": 109.92662811279297, "learning_rate": 6.503528997521365e-06, - "loss": 11.3867, + "loss": 11.5137, "step": 1295 }, { "epoch": 4.628571428571428, - "grad_norm": 40.43440246582031, + "grad_norm": 142.665771484375, "learning_rate": 6.497583008933097e-06, - "loss": 9.6562, + "loss": 8.6406, "step": 1296 }, { "epoch": 4.632142857142857, - "grad_norm": 65.96137237548828, + "grad_norm": 15.442841529846191, "learning_rate": 6.491634692845781e-06, - "loss": 10.2109, + "loss": 10.6406, "step": 1297 }, { "epoch": 4.635714285714286, - "grad_norm": 47.61362075805664, + "grad_norm": 24.01895523071289, "learning_rate": 6.485684058504116e-06, - "loss": 9.8359, + "loss": 9.4805, "step": 1298 }, { "epoch": 4.639285714285714, - "grad_norm": 40.44854736328125, + "grad_norm": 32.134788513183594, "learning_rate": 6.4797311151564e-06, - "loss": 8.8398, + "loss": 9.3564, "step": 1299 }, { "epoch": 4.642857142857143, - "grad_norm": 59.11066818237305, + "grad_norm": 1272.6064453125, "learning_rate": 6.473775872054522e-06, - "loss": 14.2656, + "loss": 23.5664, "step": 1300 }, { "epoch": 4.646428571428571, - "grad_norm": 51.60243225097656, + "grad_norm": 69.99929809570312, "learning_rate": 6.467818338453943e-06, - "loss": 9.418, + "loss": 7.7559, "step": 1301 }, { "epoch": 4.65, - "grad_norm": 57.277366638183594, + "grad_norm": 76.45451354980469, "learning_rate": 6.461858523613684e-06, - "loss": 10.4766, + "loss": 9.6875, "step": 1302 }, { "epoch": 4.6535714285714285, - "grad_norm": 59.93546676635742, + "grad_norm": 209.65948486328125, "learning_rate": 6.455896436796314e-06, - "loss": 10.3711, + "loss": 9.1562, "step": 1303 }, { "epoch": 4.6571428571428575, - "grad_norm": 51.89094543457031, + "grad_norm": 15.170083045959473, "learning_rate": 6.449932087267932e-06, - "loss": 11.9883, + "loss": 13.2305, "step": 1304 }, { "epoch": 4.660714285714286, - "grad_norm": 61.61795425415039, + "grad_norm": 43.66115951538086, "learning_rate": 6.44396548429815e-06, - "loss": 14.3438, + "loss": 13.7617, "step": 1305 }, { "epoch": 4.664285714285715, - "grad_norm": 57.79254913330078, + "grad_norm": 444.7977294921875, "learning_rate": 6.437996637160086e-06, - "loss": 10.5898, + "loss": 12.9414, "step": 1306 }, { "epoch": 4.667857142857143, - "grad_norm": 63.701969146728516, + "grad_norm": 16.973979949951172, "learning_rate": 6.432025555130348e-06, - "loss": 16.7109, + "loss": 18.5859, "step": 1307 }, { "epoch": 4.671428571428572, - "grad_norm": 65.87297058105469, + "grad_norm": 371.7062072753906, "learning_rate": 6.426052247489012e-06, - "loss": 15.7305, + "loss": 22.6133, "step": 1308 }, { "epoch": 4.675, - "grad_norm": 69.40196990966797, + "grad_norm": 1099.746826171875, "learning_rate": 6.420076723519615e-06, - "loss": 10.6328, + "loss": 17.7715, "step": 1309 }, { "epoch": 4.678571428571429, - "grad_norm": 47.2768440246582, + "grad_norm": 950.2506713867188, "learning_rate": 6.414098992509138e-06, - "loss": 9.8984, + "loss": 12.2188, "step": 1310 }, { "epoch": 4.682142857142857, - "grad_norm": 50.008426666259766, + "grad_norm": 6.615330219268799, "learning_rate": 6.408119063747995e-06, - "loss": 9.8555, + "loss": 11.7852, "step": 1311 }, { "epoch": 4.685714285714286, - "grad_norm": 56.49504470825195, + "grad_norm": 20.505027770996094, "learning_rate": 6.402136946530014e-06, - "loss": 13.0117, + "loss": 13.2402, "step": 1312 }, { "epoch": 4.689285714285714, - "grad_norm": 55.99534606933594, + "grad_norm": 576.5913696289062, "learning_rate": 6.396152650152424e-06, - "loss": 10.6445, + "loss": 25.168, "step": 1313 }, { "epoch": 4.692857142857143, - "grad_norm": 51.67205810546875, + "grad_norm": 934.6709594726562, "learning_rate": 6.390166183915839e-06, - "loss": 11.7578, + "loss": 18.8867, "step": 1314 }, { "epoch": 4.696428571428571, - "grad_norm": 50.0031623840332, + "grad_norm": 238.27606201171875, "learning_rate": 6.384177557124247e-06, - "loss": 9.875, + "loss": 10.3398, "step": 1315 }, { "epoch": 4.7, - "grad_norm": 68.18054962158203, + "grad_norm": 739.79638671875, "learning_rate": 6.378186779084996e-06, - "loss": 11.5547, + "loss": 18.1133, "step": 1316 }, { "epoch": 4.703571428571428, - "grad_norm": 55.29550552368164, + "grad_norm": 173.82505798339844, "learning_rate": 6.372193859108775e-06, - "loss": 9.8359, + "loss": 8.6367, "step": 1317 }, { "epoch": 4.707142857142857, - "grad_norm": 50.897743225097656, + "grad_norm": 15.470370292663574, "learning_rate": 6.3661988065096015e-06, - "loss": 10.7031, + "loss": 10.625, "step": 1318 }, { "epoch": 4.710714285714285, - "grad_norm": 53.641693115234375, + "grad_norm": 19.305116653442383, "learning_rate": 6.360201630604808e-06, - "loss": 9.8594, + "loss": 8.0723, "step": 1319 }, { "epoch": 4.714285714285714, - "grad_norm": 48.848331451416016, + "grad_norm": 12.34910774230957, "learning_rate": 6.354202340715027e-06, - "loss": 10.0117, + "loss": 8.3984, "step": 1320 }, { "epoch": 4.7178571428571425, - "grad_norm": 42.65821838378906, + "grad_norm": 20.64888572692871, "learning_rate": 6.348200946164178e-06, - "loss": 10.8711, + "loss": 11.2344, "step": 1321 }, { "epoch": 4.7214285714285715, - "grad_norm": 53.63444137573242, + "grad_norm": 18.275678634643555, "learning_rate": 6.342197456279449e-06, - "loss": 12.5508, + "loss": 11.8184, "step": 1322 }, { "epoch": 4.725, - "grad_norm": 50.96023178100586, + "grad_norm": 120.3302001953125, "learning_rate": 6.336191880391285e-06, - "loss": 10.1094, + "loss": 22.6484, "step": 1323 }, { "epoch": 4.728571428571429, - "grad_norm": 49.673240661621094, + "grad_norm": 148.49880981445312, "learning_rate": 6.330184227833376e-06, - "loss": 9.4219, + "loss": 9.7969, "step": 1324 }, { "epoch": 4.732142857142857, - "grad_norm": 46.969505310058594, + "grad_norm": 76.86311340332031, "learning_rate": 6.324174507942636e-06, - "loss": 10.6875, + "loss": 9.3867, "step": 1325 }, { "epoch": 4.735714285714286, - "grad_norm": 49.311283111572266, + "grad_norm": 29.04791259765625, "learning_rate": 6.318162730059194e-06, - "loss": 9.4922, + "loss": 10.3555, "step": 1326 }, { "epoch": 4.739285714285714, - "grad_norm": 39.998844146728516, + "grad_norm": 931.30517578125, "learning_rate": 6.312148903526375e-06, - "loss": 8.793, + "loss": 13.3965, "step": 1327 }, { "epoch": 4.742857142857143, - "grad_norm": 64.28884887695312, + "grad_norm": 995.64892578125, "learning_rate": 6.306133037690693e-06, - "loss": 13.6953, + "loss": 19.5498, "step": 1328 }, { "epoch": 4.746428571428572, - "grad_norm": 62.66605758666992, + "grad_norm": 1019.3848266601562, "learning_rate": 6.300115141901824e-06, - "loss": 10.8438, + "loss": 15.373, "step": 1329 }, { "epoch": 4.75, - "grad_norm": 49.88985824584961, + "grad_norm": 447.8642883300781, "learning_rate": 6.294095225512604e-06, - "loss": 10.2852, + "loss": 12.4688, "step": 1330 }, { "epoch": 4.753571428571428, - "grad_norm": 46.77223587036133, + "grad_norm": 27.5798397064209, "learning_rate": 6.288073297879009e-06, - "loss": 12.125, + "loss": 12.6211, "step": 1331 }, { "epoch": 4.757142857142857, - "grad_norm": 41.762386322021484, + "grad_norm": 12.861761093139648, "learning_rate": 6.282049368360143e-06, - "loss": 9.1328, + "loss": 7.7559, "step": 1332 }, { "epoch": 4.760714285714286, - "grad_norm": 59.693565368652344, + "grad_norm": 23.206750869750977, "learning_rate": 6.276023446318214e-06, - "loss": 10.3203, + "loss": 8.4609, "step": 1333 }, { "epoch": 4.764285714285714, - "grad_norm": 50.270286560058594, + "grad_norm": 17.547515869140625, "learning_rate": 6.269995541118531e-06, - "loss": 8.3984, + "loss": 8.541, "step": 1334 }, { "epoch": 4.767857142857143, - "grad_norm": 64.99529266357422, + "grad_norm": 29.556751251220703, "learning_rate": 6.2639656621294874e-06, - "loss": 10.7383, + "loss": 10.0469, "step": 1335 }, { "epoch": 4.771428571428571, - "grad_norm": 50.455928802490234, + "grad_norm": 14.196871757507324, "learning_rate": 6.257933818722544e-06, - "loss": 11.5117, + "loss": 11.2949, "step": 1336 }, { "epoch": 4.775, - "grad_norm": 43.47975158691406, + "grad_norm": 16.749099731445312, "learning_rate": 6.251900020272208e-06, - "loss": 9.6055, + "loss": 10.1211, "step": 1337 }, { "epoch": 4.7785714285714285, - "grad_norm": 52.40178298950195, + "grad_norm": 683.4146728515625, "learning_rate": 6.245864276156033e-06, - "loss": 13.2031, + "loss": 14.6816, "step": 1338 }, { "epoch": 4.7821428571428575, - "grad_norm": 49.76625061035156, + "grad_norm": 33.0611686706543, "learning_rate": 6.239826595754591e-06, - "loss": 9.5312, + "loss": 8.2617, "step": 1339 }, { "epoch": 4.785714285714286, - "grad_norm": 46.83122253417969, + "grad_norm": 50.403743743896484, "learning_rate": 6.233786988451468e-06, - "loss": 10.957, + "loss": 9.6074, "step": 1340 }, { "epoch": 4.789285714285715, - "grad_norm": 67.38544464111328, + "grad_norm": 878.5269165039062, "learning_rate": 6.227745463633243e-06, - "loss": 9.1211, + "loss": 15.8848, "step": 1341 }, { "epoch": 4.792857142857143, - "grad_norm": 65.58389282226562, + "grad_norm": 51.168521881103516, "learning_rate": 6.2217020306894705e-06, - "loss": 10.9727, + "loss": 11.5781, "step": 1342 }, { "epoch": 4.796428571428572, - "grad_norm": 65.12335205078125, + "grad_norm": 672.624267578125, "learning_rate": 6.215656699012678e-06, - "loss": 14.1133, + "loss": 19.0508, "step": 1343 }, { "epoch": 4.8, - "grad_norm": 52.61879348754883, + "grad_norm": 25.60259437561035, "learning_rate": 6.209609477998339e-06, - "loss": 9.4141, + "loss": 6.8984, "step": 1344 }, { "epoch": 4.803571428571429, - "grad_norm": 62.85334777832031, + "grad_norm": 50.14417266845703, "learning_rate": 6.2035603770448664e-06, - "loss": 12.6992, + "loss": 13.4492, "step": 1345 }, { "epoch": 4.807142857142857, - "grad_norm": 53.62458038330078, + "grad_norm": 233.7950897216797, "learning_rate": 6.19750940555359e-06, - "loss": 11.3281, + "loss": 10.0957, "step": 1346 }, { "epoch": 4.810714285714286, - "grad_norm": 46.73116683959961, + "grad_norm": 14.730423927307129, "learning_rate": 6.191456572928753e-06, - "loss": 12.5078, + "loss": 11.8887, "step": 1347 }, { "epoch": 4.814285714285714, - "grad_norm": 57.724239349365234, + "grad_norm": 419.175048828125, "learning_rate": 6.185401888577488e-06, - "loss": 9.7344, + "loss": 15.4492, "step": 1348 }, { "epoch": 4.817857142857143, - "grad_norm": 50.35357666015625, + "grad_norm": 1509.5657958984375, "learning_rate": 6.179345361909806e-06, - "loss": 11.1406, + "loss": 27.1641, "step": 1349 }, { "epoch": 4.821428571428571, - "grad_norm": 53.995460510253906, + "grad_norm": 634.7535400390625, "learning_rate": 6.173287002338577e-06, - "loss": 12.0273, + "loss": 14.2891, "step": 1350 }, { "epoch": 4.825, - "grad_norm": 44.59127426147461, + "grad_norm": 5.157732963562012, "learning_rate": 6.1672268192795285e-06, - "loss": 10.582, + "loss": 9.4375, "step": 1351 }, { "epoch": 4.828571428571428, - "grad_norm": 52.928810119628906, + "grad_norm": 115.95891571044922, "learning_rate": 6.161164822151213e-06, - "loss": 11.0742, + "loss": 12.6562, "step": 1352 }, { "epoch": 4.832142857142857, - "grad_norm": 51.79356002807617, + "grad_norm": 180.70037841796875, "learning_rate": 6.155101020375011e-06, - "loss": 10.6875, + "loss": 9.8516, "step": 1353 }, { "epoch": 4.835714285714285, - "grad_norm": 65.07655334472656, + "grad_norm": 12.303606033325195, "learning_rate": 6.1490354233750986e-06, - "loss": 9.2031, + "loss": 8.9414, "step": 1354 }, { "epoch": 4.839285714285714, - "grad_norm": 50.324588775634766, + "grad_norm": 170.04798889160156, "learning_rate": 6.1429680405784485e-06, - "loss": 8.5039, + "loss": 7.8438, "step": 1355 }, { "epoch": 4.8428571428571425, - "grad_norm": 43.83511734008789, + "grad_norm": 8.051681518554688, "learning_rate": 6.136898881414807e-06, - "loss": 8.2109, + "loss": 7.9102, "step": 1356 }, { "epoch": 4.8464285714285715, - "grad_norm": 70.43878936767578, + "grad_norm": 39.9486198425293, "learning_rate": 6.1308279553166824e-06, - "loss": 12.1289, + "loss": 13.9453, "step": 1357 }, { "epoch": 4.85, - "grad_norm": 50.814876556396484, + "grad_norm": 420.51544189453125, "learning_rate": 6.124755271719326e-06, - "loss": 10.5703, + "loss": 20.1855, "step": 1358 }, { "epoch": 4.853571428571429, - "grad_norm": 45.590110778808594, + "grad_norm": 52.3118782043457, "learning_rate": 6.1186808400607224e-06, - "loss": 11.7539, + "loss": 12.9961, "step": 1359 }, { "epoch": 4.857142857142857, - "grad_norm": 61.39267349243164, + "grad_norm": 15.181742668151855, "learning_rate": 6.112604669781572e-06, - "loss": 7.7617, + "loss": 6.3066, "step": 1360 }, { "epoch": 4.860714285714286, - "grad_norm": 48.24943161010742, + "grad_norm": 666.5877075195312, "learning_rate": 6.106526770325283e-06, - "loss": 10.5234, + "loss": 19.0361, "step": 1361 }, { "epoch": 4.864285714285714, - "grad_norm": 52.94794464111328, + "grad_norm": 635.4930419921875, "learning_rate": 6.100447151137939e-06, - "loss": 10.7695, + "loss": 15.2969, "step": 1362 }, { "epoch": 4.867857142857143, - "grad_norm": 57.50790023803711, + "grad_norm": 40.683956146240234, "learning_rate": 6.094365821668307e-06, - "loss": 9.4023, + "loss": 9.2266, "step": 1363 }, { "epoch": 4.871428571428572, - "grad_norm": 56.274497985839844, + "grad_norm": 487.99267578125, "learning_rate": 6.088282791367812e-06, - "loss": 10.957, + "loss": 19.7422, "step": 1364 }, { "epoch": 4.875, - "grad_norm": 46.01288986206055, + "grad_norm": 110.34877014160156, "learning_rate": 6.0821980696905145e-06, - "loss": 7.4648, + "loss": 7.0, "step": 1365 }, { "epoch": 4.878571428571428, - "grad_norm": 53.07455062866211, + "grad_norm": 936.3322143554688, "learning_rate": 6.076111666093111e-06, - "loss": 10.5312, + "loss": 15.002, "step": 1366 }, { "epoch": 4.882142857142857, - "grad_norm": 60.106597900390625, + "grad_norm": 263.1244812011719, "learning_rate": 6.070023590034907e-06, - "loss": 8.918, + "loss": 7.0059, "step": 1367 }, { "epoch": 4.885714285714286, - "grad_norm": 50.45497512817383, + "grad_norm": 26.00920295715332, "learning_rate": 6.063933850977811e-06, - "loss": 11.3906, + "loss": 13.1172, "step": 1368 }, { "epoch": 4.889285714285714, - "grad_norm": 50.476654052734375, + "grad_norm": 10.988875389099121, "learning_rate": 6.057842458386315e-06, - "loss": 11.0977, + "loss": 11.6094, "step": 1369 }, { "epoch": 4.892857142857143, - "grad_norm": 66.29315185546875, + "grad_norm": 238.52651977539062, "learning_rate": 6.05174942172748e-06, - "loss": 13.0977, + "loss": 14.293, "step": 1370 }, { "epoch": 4.896428571428571, - "grad_norm": 63.846435546875, + "grad_norm": 21.533695220947266, "learning_rate": 6.045654750470921e-06, - "loss": 10.9102, + "loss": 11.0078, "step": 1371 }, { "epoch": 4.9, - "grad_norm": 76.87985229492188, + "grad_norm": 164.6018524169922, "learning_rate": 6.039558454088796e-06, - "loss": 14.4688, + "loss": 17.7461, "step": 1372 }, { "epoch": 4.9035714285714285, - "grad_norm": 62.00615310668945, + "grad_norm": 504.3652038574219, "learning_rate": 6.033460542055791e-06, - "loss": 8.3594, + "loss": 8.9199, "step": 1373 }, { "epoch": 4.9071428571428575, - "grad_norm": 48.8815803527832, + "grad_norm": 92.22815704345703, "learning_rate": 6.027361023849096e-06, - "loss": 10.1562, + "loss": 9.543, "step": 1374 }, { "epoch": 4.910714285714286, - "grad_norm": 51.10958480834961, + "grad_norm": 30.355009078979492, "learning_rate": 6.0212599089484026e-06, - "loss": 11.418, + "loss": 12.1797, "step": 1375 }, { "epoch": 4.914285714285715, - "grad_norm": 52.675838470458984, + "grad_norm": 84.4201431274414, "learning_rate": 6.015157206835881e-06, - "loss": 11.7656, + "loss": 24.4629, "step": 1376 }, { "epoch": 4.917857142857143, - "grad_norm": 77.63253784179688, + "grad_norm": 100.91366577148438, "learning_rate": 6.009052926996173e-06, - "loss": 11.457, + "loss": 11.6914, "step": 1377 }, { "epoch": 4.921428571428572, - "grad_norm": 66.52314758300781, + "grad_norm": 149.8079833984375, "learning_rate": 6.002947078916365e-06, - "loss": 12.1406, + "loss": 12.0781, "step": 1378 }, { "epoch": 4.925, - "grad_norm": 83.61681365966797, + "grad_norm": 7.013453483581543, "learning_rate": 5.996839672085986e-06, - "loss": 12.832, + "loss": 13.5781, "step": 1379 }, { "epoch": 4.928571428571429, - "grad_norm": 50.57850646972656, + "grad_norm": 931.711181640625, "learning_rate": 5.990730715996989e-06, - "loss": 9.4766, + "loss": 14.6387, "step": 1380 }, { "epoch": 4.932142857142857, - "grad_norm": 55.62222671508789, + "grad_norm": 16.921546936035156, "learning_rate": 5.984620220143728e-06, - "loss": 7.8594, + "loss": 6.5586, "step": 1381 }, { "epoch": 4.935714285714286, - "grad_norm": 44.322479248046875, + "grad_norm": 609.4995727539062, "learning_rate": 5.978508194022958e-06, - "loss": 9.7617, + "loss": 16.4473, "step": 1382 }, { "epoch": 4.939285714285714, - "grad_norm": 44.98233413696289, + "grad_norm": 45.558650970458984, "learning_rate": 5.972394647133807e-06, - "loss": 9.2969, + "loss": 10.4609, "step": 1383 }, { "epoch": 4.942857142857143, - "grad_norm": 49.74446487426758, + "grad_norm": 836.6251831054688, "learning_rate": 5.9662795889777666e-06, - "loss": 11.6602, + "loss": 21.0703, "step": 1384 }, { "epoch": 4.946428571428571, - "grad_norm": 59.07768249511719, + "grad_norm": 621.3798828125, "learning_rate": 5.960163029058682e-06, - "loss": 10.5664, + "loss": 19.9668, "step": 1385 }, { "epoch": 4.95, - "grad_norm": 59.64643096923828, + "grad_norm": 593.7202758789062, "learning_rate": 5.954044976882725e-06, - "loss": 12.7109, + "loss": 24.3281, "step": 1386 }, { "epoch": 4.953571428571428, - "grad_norm": 47.893436431884766, + "grad_norm": 211.3679656982422, "learning_rate": 5.947925441958393e-06, - "loss": 9.7461, + "loss": 11.3789, "step": 1387 }, { "epoch": 4.957142857142857, - "grad_norm": 45.64666748046875, + "grad_norm": 950.444580078125, "learning_rate": 5.941804433796485e-06, - "loss": 12.1797, + "loss": 17.0742, "step": 1388 }, { "epoch": 4.960714285714285, - "grad_norm": 49.69574737548828, + "grad_norm": 1063.264404296875, "learning_rate": 5.935681961910091e-06, - "loss": 9.6836, + "loss": 23.6523, "step": 1389 }, { "epoch": 4.964285714285714, - "grad_norm": 62.57014846801758, + "grad_norm": 1163.8995361328125, "learning_rate": 5.929558035814574e-06, - "loss": 8.0547, + "loss": 11.1094, "step": 1390 }, { "epoch": 4.9678571428571425, - "grad_norm": 48.326416015625, + "grad_norm": 921.125244140625, "learning_rate": 5.9234326650275575e-06, - "loss": 8.1367, + "loss": 16.8809, "step": 1391 }, { "epoch": 4.9714285714285715, - "grad_norm": 52.0089225769043, + "grad_norm": 1190.478271484375, "learning_rate": 5.917305859068912e-06, - "loss": 10.4648, + "loss": 31.0117, "step": 1392 }, { "epoch": 4.975, - "grad_norm": 66.37007904052734, + "grad_norm": 15.346173286437988, "learning_rate": 5.911177627460739e-06, - "loss": 12.7461, + "loss": 13.5, "step": 1393 }, { "epoch": 4.978571428571429, - "grad_norm": 48.63053894042969, + "grad_norm": 6.141760349273682, "learning_rate": 5.90504797972735e-06, - "loss": 8.3398, + "loss": 8.3789, "step": 1394 }, { "epoch": 4.982142857142857, - "grad_norm": 50.93167495727539, + "grad_norm": 47.35612487792969, "learning_rate": 5.8989169253952635e-06, - "loss": 9.5391, + "loss": 19.9688, "step": 1395 }, { "epoch": 4.985714285714286, - "grad_norm": 48.74834442138672, + "grad_norm": 438.0937194824219, "learning_rate": 5.892784473993184e-06, - "loss": 8.8359, + "loss": 8.8164, "step": 1396 }, { "epoch": 4.989285714285714, - "grad_norm": 61.9733772277832, + "grad_norm": 774.62548828125, "learning_rate": 5.886650635051984e-06, - "loss": 9.8672, + "loss": 27.5234, "step": 1397 }, { "epoch": 4.992857142857143, - "grad_norm": 56.48850631713867, + "grad_norm": 18.66183090209961, "learning_rate": 5.880515418104692e-06, - "loss": 9.1172, + "loss": 8.1816, "step": 1398 }, { "epoch": 4.996428571428572, - "grad_norm": 63.641761779785156, + "grad_norm": 45.379554748535156, "learning_rate": 5.874378832686482e-06, - "loss": 10.8555, + "loss": 12.5547, "step": 1399 }, { "epoch": 5.0, - "grad_norm": 71.40581512451172, + "grad_norm": 48.245792388916016, "learning_rate": 5.8682408883346535e-06, - "loss": 8.6172, + "loss": 7.3027, "step": 1400 }, { "epoch": 5.0, - "eval_loss": 10.425792694091797, - "eval_mse": 10.425014430444078, - "eval_runtime": 11.4326, - "eval_samples_per_second": 248.412, - "eval_steps_per_second": 1.312, - "eval_target_0_mse": 19.45627803066311, - "eval_target_1_mse": 10.667289320666134, - "eval_target_2_mse": 5.904172274142496, - "eval_target_3_mse": 5.672318096304573, + "eval_loss": 12.821561813354492, + "eval_mse": 12.813887278345906, + "eval_runtime": 11.2347, + "eval_samples_per_second": 252.789, + "eval_steps_per_second": 1.335, + "eval_target_0_mse": 37.298402229629396, + "eval_target_1_mse": 8.786984447900409, + "eval_target_2_mse": 3.4468194696625187, + "eval_target_3_mse": 1.7233429661913056, "step": 1400 }, { "epoch": 5.003571428571429, - "grad_norm": 61.77046203613281, + "grad_norm": 44.83823776245117, "learning_rate": 5.862101594588614e-06, - "loss": 12.0156, + "loss": 11.8652, "step": 1401 }, { "epoch": 5.007142857142857, - "grad_norm": 46.45106506347656, + "grad_norm": 28.90280532836914, "learning_rate": 5.855960960989877e-06, - "loss": 8.7969, + "loss": 9.3457, "step": 1402 }, { "epoch": 5.010714285714286, - "grad_norm": 48.226932525634766, + "grad_norm": 163.46551513671875, "learning_rate": 5.849818997082026e-06, - "loss": 9.3281, + "loss": 9.0371, "step": 1403 }, { "epoch": 5.014285714285714, - "grad_norm": 56.541805267333984, + "grad_norm": 168.88137817382812, "learning_rate": 5.843675712410724e-06, - "loss": 11.2852, + "loss": 13.7344, "step": 1404 }, { "epoch": 5.017857142857143, - "grad_norm": 51.48737335205078, + "grad_norm": 22.01162338256836, "learning_rate": 5.837531116523683e-06, - "loss": 9.5742, + "loss": 12.5195, "step": 1405 }, { "epoch": 5.021428571428571, - "grad_norm": 48.6540412902832, + "grad_norm": 827.182861328125, "learning_rate": 5.8313852189706465e-06, - "loss": 11.1602, + "loss": 18.0938, "step": 1406 }, { "epoch": 5.025, - "grad_norm": 43.87437438964844, + "grad_norm": 148.2345428466797, "learning_rate": 5.825238029303388e-06, - "loss": 9.9805, + "loss": 9.502, "step": 1407 }, { "epoch": 5.0285714285714285, - "grad_norm": 50.88685989379883, + "grad_norm": 264.02655029296875, "learning_rate": 5.819089557075689e-06, - "loss": 10.0352, + "loss": 11.9141, "step": 1408 }, { "epoch": 5.0321428571428575, - "grad_norm": 61.2413330078125, + "grad_norm": 49.746437072753906, "learning_rate": 5.81293981184332e-06, - "loss": 9.9102, + "loss": 10.1602, "step": 1409 }, { "epoch": 5.035714285714286, - "grad_norm": 61.18778991699219, + "grad_norm": 102.76546478271484, "learning_rate": 5.806788803164034e-06, - "loss": 11.4336, + "loss": 14.3477, "step": 1410 }, { "epoch": 5.039285714285715, - "grad_norm": 47.16850662231445, + "grad_norm": 18.95713996887207, "learning_rate": 5.800636540597544e-06, - "loss": 9.4219, + "loss": 8.9902, "step": 1411 }, { "epoch": 5.042857142857143, - "grad_norm": 55.2095832824707, + "grad_norm": 40.84389877319336, "learning_rate": 5.7944830337055165e-06, - "loss": 9.5312, + "loss": 9.1113, "step": 1412 }, { "epoch": 5.046428571428572, - "grad_norm": 64.62085723876953, + "grad_norm": 24.670841217041016, "learning_rate": 5.7883282920515486e-06, - "loss": 13.5117, + "loss": 18.6914, "step": 1413 }, { "epoch": 5.05, - "grad_norm": 46.53950500488281, + "grad_norm": 1300.047607421875, "learning_rate": 5.782172325201155e-06, - "loss": 9.8984, + "loss": 17.6172, "step": 1414 }, { "epoch": 5.053571428571429, - "grad_norm": 51.75440979003906, + "grad_norm": 652.6658935546875, "learning_rate": 5.776015142721758e-06, - "loss": 11.7812, + "loss": 15.8789, "step": 1415 }, { "epoch": 5.057142857142857, - "grad_norm": 52.13917922973633, + "grad_norm": 128.73934936523438, "learning_rate": 5.769856754182668e-06, - "loss": 12.7773, + "loss": 11.3457, "step": 1416 }, { "epoch": 5.060714285714286, - "grad_norm": 63.815086364746094, + "grad_norm": 1273.5050048828125, "learning_rate": 5.76369716915507e-06, - "loss": 10.2461, + "loss": 19.0781, "step": 1417 }, { "epoch": 5.064285714285714, - "grad_norm": 42.831329345703125, + "grad_norm": 115.05062103271484, "learning_rate": 5.7575363972120066e-06, - "loss": 10.2617, + "loss": 9.0547, "step": 1418 }, { "epoch": 5.067857142857143, - "grad_norm": 62.040767669677734, + "grad_norm": 24.620689392089844, "learning_rate": 5.751374447928368e-06, - "loss": 9.3594, + "loss": 8.2715, "step": 1419 }, { "epoch": 5.071428571428571, - "grad_norm": 55.77035903930664, + "grad_norm": 548.8239135742188, "learning_rate": 5.745211330880872e-06, - "loss": 8.5039, + "loss": 9.4375, "step": 1420 }, { "epoch": 5.075, - "grad_norm": 46.840877532958984, + "grad_norm": 240.1739044189453, "learning_rate": 5.7390470556480545e-06, - "loss": 8.8047, + "loss": 9.25, "step": 1421 }, { "epoch": 5.078571428571428, - "grad_norm": 47.686187744140625, + "grad_norm": 23.776155471801758, "learning_rate": 5.732881631810245e-06, - "loss": 10.3555, + "loss": 10.1953, "step": 1422 }, { "epoch": 5.082142857142857, - "grad_norm": 44.46207809448242, + "grad_norm": 25.179594039916992, "learning_rate": 5.726715068949564e-06, - "loss": 8.7656, + "loss": 8.582, "step": 1423 }, { "epoch": 5.085714285714285, - "grad_norm": 58.9624137878418, + "grad_norm": 20.422372817993164, "learning_rate": 5.720547376649901e-06, - "loss": 10.1914, + "loss": 11.168, "step": 1424 }, { "epoch": 5.089285714285714, - "grad_norm": 66.60795593261719, + "grad_norm": 119.45560455322266, "learning_rate": 5.7143785644969005e-06, - "loss": 12.5938, + "loss": 12.9082, "step": 1425 }, { "epoch": 5.0928571428571425, - "grad_norm": 50.26286697387695, + "grad_norm": 43.48995590209961, "learning_rate": 5.708208642077946e-06, - "loss": 10.1797, + "loss": 10.3359, "step": 1426 }, { "epoch": 5.0964285714285715, - "grad_norm": 61.3529167175293, + "grad_norm": 24.987916946411133, "learning_rate": 5.702037618982148e-06, - "loss": 8.6289, + "loss": 7.8516, "step": 1427 }, { "epoch": 5.1, - "grad_norm": 56.32416915893555, + "grad_norm": 30.34014129638672, "learning_rate": 5.695865504800328e-06, - "loss": 10.6641, + "loss": 10.7227, "step": 1428 }, { "epoch": 5.103571428571429, - "grad_norm": 42.827964782714844, + "grad_norm": 599.5990600585938, "learning_rate": 5.689692309125001e-06, - "loss": 10.0547, + "loss": 12.5469, "step": 1429 }, { "epoch": 5.107142857142857, - "grad_norm": 57.3599853515625, + "grad_norm": 9.29161262512207, "learning_rate": 5.683518041550368e-06, - "loss": 12.7812, + "loss": 13.6797, "step": 1430 }, { "epoch": 5.110714285714286, - "grad_norm": 66.59650421142578, + "grad_norm": 31.183700561523438, "learning_rate": 5.677342711672289e-06, - "loss": 9.5508, + "loss": 8.1582, "step": 1431 }, { "epoch": 5.114285714285714, - "grad_norm": 60.28845977783203, + "grad_norm": 272.7343444824219, "learning_rate": 5.671166329088278e-06, - "loss": 9.4688, + "loss": 10.1875, "step": 1432 }, { "epoch": 5.117857142857143, - "grad_norm": 60.34199142456055, + "grad_norm": 635.6747436523438, "learning_rate": 5.664988903397488e-06, - "loss": 15.1016, + "loss": 15.043, "step": 1433 }, { "epoch": 5.121428571428571, - "grad_norm": 49.97574234008789, + "grad_norm": 727.6406860351562, "learning_rate": 5.658810444200689e-06, - "loss": 9.9883, + "loss": 13.8125, "step": 1434 }, { "epoch": 5.125, - "grad_norm": 53.60847473144531, + "grad_norm": 16.665416717529297, "learning_rate": 5.65263096110026e-06, - "loss": 8.9297, + "loss": 7.7734, "step": 1435 }, { "epoch": 5.128571428571428, - "grad_norm": 47.538719177246094, + "grad_norm": 34.416751861572266, "learning_rate": 5.646450463700167e-06, - "loss": 9.0781, + "loss": 8.8809, "step": 1436 }, { "epoch": 5.132142857142857, - "grad_norm": 60.0999870300293, + "grad_norm": 537.0762329101562, "learning_rate": 5.640268961605959e-06, - "loss": 14.1406, + "loss": 23.3887, "step": 1437 }, { "epoch": 5.135714285714286, - "grad_norm": 51.829551696777344, + "grad_norm": 33.67753982543945, "learning_rate": 5.634086464424743e-06, - "loss": 8.6172, + "loss": 8.5, "step": 1438 }, { "epoch": 5.139285714285714, - "grad_norm": 62.63954162597656, + "grad_norm": 1174.77587890625, "learning_rate": 5.627902981765169e-06, - "loss": 10.8047, + "loss": 19.5352, "step": 1439 }, { "epoch": 5.142857142857143, - "grad_norm": 51.463531494140625, + "grad_norm": 33.077003479003906, "learning_rate": 5.621718523237427e-06, - "loss": 10.5312, + "loss": 11.3125, "step": 1440 }, { "epoch": 5.146428571428571, - "grad_norm": 51.038238525390625, + "grad_norm": 629.9578857421875, "learning_rate": 5.615533098453215e-06, - "loss": 10.8203, + "loss": 19.0469, "step": 1441 }, { "epoch": 5.15, - "grad_norm": 49.482017517089844, + "grad_norm": 51.19455337524414, "learning_rate": 5.609346717025738e-06, - "loss": 10.3047, + "loss": 10.3945, "step": 1442 }, { "epoch": 5.1535714285714285, - "grad_norm": 43.1971435546875, + "grad_norm": 164.8997344970703, "learning_rate": 5.603159388569685e-06, - "loss": 9.0586, + "loss": 9.3027, "step": 1443 }, { "epoch": 5.1571428571428575, - "grad_norm": 52.37653732299805, + "grad_norm": 7.159567356109619, "learning_rate": 5.596971122701221e-06, - "loss": 10.3711, + "loss": 9.8965, "step": 1444 }, { "epoch": 5.160714285714286, - "grad_norm": 50.22847366333008, + "grad_norm": 18.345705032348633, "learning_rate": 5.590781929037965e-06, - "loss": 10.6719, + "loss": 10.8867, "step": 1445 }, { "epoch": 5.164285714285715, - "grad_norm": 46.709083557128906, + "grad_norm": 13.760417938232422, "learning_rate": 5.584591817198974e-06, - "loss": 9.8711, + "loss": 8.5781, "step": 1446 }, { "epoch": 5.167857142857143, - "grad_norm": 50.84846878051758, + "grad_norm": 506.4337158203125, "learning_rate": 5.5784007968047395e-06, - "loss": 9.3867, + "loss": 13.7637, "step": 1447 }, { "epoch": 5.171428571428572, - "grad_norm": 68.21871948242188, + "grad_norm": 119.91104888916016, "learning_rate": 5.57220887747716e-06, - "loss": 12.5195, + "loss": 12.9883, "step": 1448 }, { "epoch": 5.175, - "grad_norm": 50.064842224121094, + "grad_norm": 934.4440307617188, "learning_rate": 5.566016068839535e-06, - "loss": 11.7305, + "loss": 16.2461, "step": 1449 }, { "epoch": 5.178571428571429, - "grad_norm": 47.84496307373047, + "grad_norm": 487.7801208496094, "learning_rate": 5.559822380516539e-06, - "loss": 9.8398, + "loss": 11.3711, "step": 1450 }, { "epoch": 5.182142857142857, - "grad_norm": 49.03466033935547, + "grad_norm": 947.4435424804688, "learning_rate": 5.553627822134224e-06, - "loss": 8.3945, + "loss": 12.2754, "step": 1451 }, { "epoch": 5.185714285714286, - "grad_norm": 40.05208969116211, + "grad_norm": 147.27304077148438, "learning_rate": 5.547432403319986e-06, - "loss": 8.0117, + "loss": 8.6289, "step": 1452 }, { "epoch": 5.189285714285714, - "grad_norm": 64.5248031616211, + "grad_norm": 606.2604370117188, "learning_rate": 5.541236133702561e-06, - "loss": 11.9023, + "loss": 14.2734, "step": 1453 }, { "epoch": 5.192857142857143, - "grad_norm": 56.71283721923828, + "grad_norm": 212.02029418945312, "learning_rate": 5.535039022912007e-06, - "loss": 9.8711, + "loss": 10.3477, "step": 1454 }, { "epoch": 5.196428571428571, - "grad_norm": 49.616485595703125, + "grad_norm": 7.403648376464844, "learning_rate": 5.52884108057969e-06, - "loss": 11.4844, + "loss": 12.0586, "step": 1455 }, { "epoch": 5.2, - "grad_norm": 47.92969512939453, + "grad_norm": 15.888338088989258, "learning_rate": 5.522642316338268e-06, - "loss": 9.3477, + "loss": 8.5586, "step": 1456 }, { "epoch": 5.203571428571428, - "grad_norm": 71.21198272705078, + "grad_norm": 20.11236000061035, "learning_rate": 5.516442739821676e-06, - "loss": 11.6211, + "loss": 14.1992, "step": 1457 }, { "epoch": 5.207142857142857, - "grad_norm": 49.89578628540039, + "grad_norm": 233.37579345703125, "learning_rate": 5.510242360665114e-06, - "loss": 12.25, + "loss": 11.957, "step": 1458 }, { "epoch": 5.210714285714285, - "grad_norm": 46.242618560791016, + "grad_norm": 855.3073120117188, "learning_rate": 5.504041188505022e-06, - "loss": 10.1367, + "loss": 15.6406, "step": 1459 }, { "epoch": 5.214285714285714, - "grad_norm": 51.743221282958984, + "grad_norm": 912.884033203125, "learning_rate": 5.497839232979084e-06, - "loss": 10.5547, + "loss": 26.2227, "step": 1460 }, { "epoch": 5.2178571428571425, - "grad_norm": 40.873104095458984, + "grad_norm": 324.8852233886719, "learning_rate": 5.49163650372619e-06, - "loss": 10.3438, + "loss": 10.7344, "step": 1461 }, { "epoch": 5.2214285714285715, - "grad_norm": 53.273216247558594, + "grad_norm": 31.601716995239258, "learning_rate": 5.485433010386442e-06, - "loss": 12.5508, + "loss": 12.7578, "step": 1462 }, { "epoch": 5.225, - "grad_norm": 50.31330490112305, + "grad_norm": 293.6587829589844, "learning_rate": 5.4792287626011206e-06, - "loss": 9.5586, + "loss": 10.9629, "step": 1463 }, { "epoch": 5.228571428571429, - "grad_norm": 55.11276626586914, + "grad_norm": 163.7696533203125, "learning_rate": 5.473023770012686e-06, - "loss": 13.1406, + "loss": 25.0898, "step": 1464 }, { "epoch": 5.232142857142857, - "grad_norm": 46.86997985839844, + "grad_norm": 33.129878997802734, "learning_rate": 5.466818042264754e-06, - "loss": 8.832, + "loss": 9.3633, "step": 1465 }, { "epoch": 5.235714285714286, - "grad_norm": 66.86492156982422, + "grad_norm": 263.1462707519531, "learning_rate": 5.4606115890020815e-06, - "loss": 13.8281, + "loss": 17.1523, "step": 1466 }, { "epoch": 5.239285714285714, - "grad_norm": 56.324764251708984, + "grad_norm": 12.158010482788086, "learning_rate": 5.454404419870554e-06, - "loss": 10.9844, + "loss": 10.7715, "step": 1467 }, { "epoch": 5.242857142857143, - "grad_norm": 41.85927200317383, + "grad_norm": 748.7096557617188, "learning_rate": 5.448196544517168e-06, - "loss": 9.7539, + "loss": 12.0527, "step": 1468 }, { "epoch": 5.246428571428572, - "grad_norm": 44.55553436279297, + "grad_norm": 16.85055160522461, "learning_rate": 5.441987972590022e-06, - "loss": 8.7969, + "loss": 9.4336, "step": 1469 }, { "epoch": 5.25, - "grad_norm": 43.25236511230469, + "grad_norm": 11.414520263671875, "learning_rate": 5.435778713738292e-06, - "loss": 9.1953, + "loss": 8.2197, "step": 1470 }, { "epoch": 5.253571428571428, - "grad_norm": 57.2674674987793, + "grad_norm": 1023.72998046875, "learning_rate": 5.429568777612224e-06, - "loss": 9.5391, + "loss": 16.2793, "step": 1471 }, { "epoch": 5.257142857142857, - "grad_norm": 53.939605712890625, + "grad_norm": 1034.4869384765625, "learning_rate": 5.423358173863117e-06, - "loss": 12.3672, + "loss": 16.7969, "step": 1472 }, { "epoch": 5.260714285714286, - "grad_norm": 52.211334228515625, + "grad_norm": 68.47333526611328, "learning_rate": 5.417146912143306e-06, - "loss": 11.832, + "loss": 13.3398, "step": 1473 }, { "epoch": 5.264285714285714, - "grad_norm": 61.0552978515625, + "grad_norm": 29.654827117919922, "learning_rate": 5.4109350021061526e-06, - "loss": 8.8906, + "loss": 8.043, "step": 1474 }, { "epoch": 5.267857142857143, - "grad_norm": 46.14200973510742, + "grad_norm": 13.925447463989258, "learning_rate": 5.404722453406017e-06, - "loss": 11.0859, + "loss": 11.7578, "step": 1475 }, { "epoch": 5.271428571428571, - "grad_norm": 43.68145751953125, + "grad_norm": 386.12542724609375, "learning_rate": 5.398509275698263e-06, - "loss": 9.2031, + "loss": 8.8164, "step": 1476 }, { "epoch": 5.275, - "grad_norm": 50.1629753112793, + "grad_norm": 697.7108154296875, "learning_rate": 5.392295478639226e-06, - "loss": 10.4258, + "loss": 18.5469, "step": 1477 }, { "epoch": 5.2785714285714285, - "grad_norm": 62.4405632019043, + "grad_norm": 87.77157592773438, "learning_rate": 5.386081071886204e-06, - "loss": 12.918, + "loss": 15.2812, "step": 1478 }, { "epoch": 5.2821428571428575, - "grad_norm": 46.21096420288086, + "grad_norm": 10.980712890625, "learning_rate": 5.3798660650974435e-06, - "loss": 10.6992, + "loss": 10.793, "step": 1479 }, { "epoch": 5.285714285714286, - "grad_norm": 67.28675842285156, + "grad_norm": 554.4993896484375, "learning_rate": 5.373650467932122e-06, - "loss": 11.2148, + "loss": 19.8359, "step": 1480 }, { "epoch": 5.289285714285715, - "grad_norm": 50.80919647216797, + "grad_norm": 35.791259765625, "learning_rate": 5.3674342900503385e-06, - "loss": 10.1289, + "loss": 11.2891, "step": 1481 }, { "epoch": 5.292857142857143, - "grad_norm": 70.84877014160156, + "grad_norm": 28.697444915771484, "learning_rate": 5.361217541113093e-06, - "loss": 10.8086, + "loss": 9.8281, "step": 1482 }, { "epoch": 5.296428571428572, - "grad_norm": 58.110572814941406, + "grad_norm": 19.857362747192383, "learning_rate": 5.355000230782268e-06, - "loss": 8.9219, + "loss": 7.8379, "step": 1483 }, { "epoch": 5.3, - "grad_norm": 49.03927993774414, + "grad_norm": 4.231168270111084, "learning_rate": 5.348782368720627e-06, - "loss": 10.4258, + "loss": 10.0488, "step": 1484 }, { "epoch": 5.303571428571429, - "grad_norm": 60.53662872314453, + "grad_norm": 274.4617919921875, "learning_rate": 5.342563964591783e-06, - "loss": 9.1602, + "loss": 9.5762, "step": 1485 }, { "epoch": 5.307142857142857, - "grad_norm": 55.279563903808594, + "grad_norm": 1106.4814453125, "learning_rate": 5.336345028060199e-06, - "loss": 11.082, + "loss": 23.2207, "step": 1486 }, { "epoch": 5.310714285714286, - "grad_norm": 51.93131637573242, + "grad_norm": 219.03839111328125, "learning_rate": 5.330125568791158e-06, - "loss": 10.1602, + "loss": 10.9453, "step": 1487 }, { "epoch": 5.314285714285714, - "grad_norm": 44.670597076416016, + "grad_norm": 9.353899002075195, "learning_rate": 5.323905596450759e-06, - "loss": 9.6758, + "loss": 9.9746, "step": 1488 }, { "epoch": 5.317857142857143, - "grad_norm": 44.989810943603516, + "grad_norm": 353.3575134277344, "learning_rate": 5.317685120705901e-06, - "loss": 8.707, + "loss": 7.9375, "step": 1489 }, { "epoch": 5.321428571428571, - "grad_norm": 54.01971435546875, + "grad_norm": 5.976604461669922, "learning_rate": 5.311464151224261e-06, - "loss": 9.6328, + "loss": 8.1426, "step": 1490 }, { "epoch": 5.325, - "grad_norm": 44.238006591796875, + "grad_norm": 20.916521072387695, "learning_rate": 5.3052426976742855e-06, - "loss": 7.4414, + "loss": 6.4121, "step": 1491 }, { "epoch": 5.328571428571428, - "grad_norm": 43.295413970947266, + "grad_norm": 20.69138526916504, "learning_rate": 5.299020769725172e-06, - "loss": 9.0508, + "loss": 8.4531, "step": 1492 }, { "epoch": 5.332142857142857, - "grad_norm": 53.48015594482422, + "grad_norm": 12.1712646484375, "learning_rate": 5.292798377046856e-06, - "loss": 12.9453, + "loss": 12.0156, "step": 1493 }, { "epoch": 5.335714285714285, - "grad_norm": 40.519371032714844, + "grad_norm": 17.256973266601562, "learning_rate": 5.286575529309997e-06, - "loss": 9.6172, + "loss": 9.293, "step": 1494 }, { "epoch": 5.339285714285714, - "grad_norm": 52.57669448852539, + "grad_norm": 99.86624145507812, "learning_rate": 5.2803522361859596e-06, - "loss": 9.1875, + "loss": 8.0547, "step": 1495 }, { "epoch": 5.3428571428571425, - "grad_norm": 60.91816711425781, + "grad_norm": 386.5679016113281, "learning_rate": 5.274128507346801e-06, - "loss": 14.0859, + "loss": 15.0859, "step": 1496 }, { "epoch": 5.3464285714285715, - "grad_norm": 43.630104064941406, + "grad_norm": 937.0828857421875, "learning_rate": 5.267904352465255e-06, - "loss": 9.0312, + "loss": 17.0234, "step": 1497 }, { "epoch": 5.35, - "grad_norm": 55.279903411865234, + "grad_norm": 684.9830322265625, "learning_rate": 5.2616797812147205e-06, - "loss": 9.1797, + "loss": 10.1836, "step": 1498 }, { "epoch": 5.353571428571429, - "grad_norm": 51.7821044921875, + "grad_norm": 724.1044311523438, "learning_rate": 5.255454803269239e-06, - "loss": 10.3477, + "loss": 19.0449, "step": 1499 }, { "epoch": 5.357142857142857, - "grad_norm": 54.818336486816406, + "grad_norm": 36.806373596191406, "learning_rate": 5.249229428303486e-06, - "loss": 10.4453, + "loss": 8.9238, "step": 1500 }, { "epoch": 5.360714285714286, - "grad_norm": 80.15299224853516, + "grad_norm": 772.7474365234375, "learning_rate": 5.243003665992758e-06, - "loss": 9.5586, + "loss": 10.3145, "step": 1501 }, { "epoch": 5.364285714285714, - "grad_norm": 40.238136291503906, + "grad_norm": 20.04298973083496, "learning_rate": 5.2367775260129465e-06, - "loss": 8.4531, + "loss": 9.6836, "step": 1502 }, { "epoch": 5.367857142857143, - "grad_norm": 45.691246032714844, + "grad_norm": 786.3037109375, "learning_rate": 5.230551018040534e-06, - "loss": 7.5898, + "loss": 9.8477, "step": 1503 }, { "epoch": 5.371428571428572, - "grad_norm": 52.65000534057617, + "grad_norm": 23.6844482421875, "learning_rate": 5.224324151752575e-06, - "loss": 13.7656, + "loss": 14.2129, "step": 1504 }, { "epoch": 5.375, - "grad_norm": 44.38646697998047, + "grad_norm": 18.63768196105957, "learning_rate": 5.218096936826681e-06, - "loss": 7.9766, + "loss": 6.4395, "step": 1505 }, { "epoch": 5.378571428571428, - "grad_norm": 49.75492477416992, + "grad_norm": 912.8204345703125, "learning_rate": 5.211869382941003e-06, - "loss": 9.6133, + "loss": 12.6836, "step": 1506 }, { "epoch": 5.382142857142857, - "grad_norm": 62.82442855834961, + "grad_norm": 12.265811920166016, "learning_rate": 5.205641499774221e-06, - "loss": 11.3711, + "loss": 9.0664, "step": 1507 }, { "epoch": 5.385714285714286, - "grad_norm": 56.329341888427734, + "grad_norm": 106.89302825927734, "learning_rate": 5.199413297005525e-06, - "loss": 8.1523, + "loss": 8.8027, "step": 1508 }, { "epoch": 5.389285714285714, - "grad_norm": 82.18895721435547, + "grad_norm": 23.39969825744629, "learning_rate": 5.1931847843146045e-06, - "loss": 11.0352, + "loss": 11.0938, "step": 1509 }, { "epoch": 5.392857142857143, - "grad_norm": 62.04426574707031, + "grad_norm": 13.232510566711426, "learning_rate": 5.18695597138163e-06, - "loss": 10.9922, + "loss": 9.7324, "step": 1510 }, { "epoch": 5.396428571428571, - "grad_norm": 44.087440490722656, + "grad_norm": 77.86470031738281, "learning_rate": 5.1807268678872335e-06, - "loss": 7.6055, + "loss": 7.2891, "step": 1511 }, { "epoch": 5.4, - "grad_norm": 46.83291244506836, + "grad_norm": 11.412009239196777, "learning_rate": 5.174497483512506e-06, - "loss": 7.9531, + "loss": 7.8691, "step": 1512 }, { "epoch": 5.4035714285714285, - "grad_norm": 41.434478759765625, + "grad_norm": 59.7557487487793, "learning_rate": 5.168267827938971e-06, - "loss": 8.9805, + "loss": 9.2871, "step": 1513 }, { "epoch": 5.4071428571428575, - "grad_norm": 47.25886917114258, + "grad_norm": 66.27079010009766, "learning_rate": 5.162037910848573e-06, - "loss": 9.6211, + "loss": 9.582, "step": 1514 }, { "epoch": 5.410714285714286, - "grad_norm": 54.92447280883789, + "grad_norm": 117.49435424804688, "learning_rate": 5.155807741923666e-06, - "loss": 10.1602, + "loss": 9.0977, "step": 1515 }, { "epoch": 5.414285714285715, - "grad_norm": 45.74653244018555, + "grad_norm": 86.09494018554688, "learning_rate": 5.1495773308469935e-06, - "loss": 9.9531, + "loss": 19.1836, "step": 1516 }, { "epoch": 5.417857142857143, - "grad_norm": 44.95054244995117, + "grad_norm": 115.76094818115234, "learning_rate": 5.143346687301673e-06, - "loss": 9.6055, + "loss": 10.9375, "step": 1517 }, { "epoch": 5.421428571428572, - "grad_norm": 57.562110900878906, + "grad_norm": 889.227783203125, "learning_rate": 5.137115820971189e-06, - "loss": 10.1367, + "loss": 13.2617, "step": 1518 }, { "epoch": 5.425, - "grad_norm": 42.89727020263672, + "grad_norm": 11.57211685180664, "learning_rate": 5.130884741539367e-06, - "loss": 8.9883, + "loss": 7.7305, "step": 1519 }, { "epoch": 5.428571428571429, - "grad_norm": 56.32749938964844, + "grad_norm": 1291.8759765625, "learning_rate": 5.1246534586903655e-06, - "loss": 9.8945, + "loss": 22.6934, "step": 1520 }, { "epoch": 5.432142857142857, - "grad_norm": 48.47429656982422, + "grad_norm": 169.2554168701172, "learning_rate": 5.1184219821086586e-06, - "loss": 9.4375, + "loss": 9.6445, "step": 1521 }, { "epoch": 5.435714285714286, - "grad_norm": 61.43561935424805, + "grad_norm": 46.72488021850586, "learning_rate": 5.112190321479026e-06, - "loss": 13.0273, + "loss": 13.1387, "step": 1522 }, { "epoch": 5.439285714285714, - "grad_norm": 60.05105209350586, + "grad_norm": 469.0018615722656, "learning_rate": 5.105958486486524e-06, - "loss": 10.5234, + "loss": 12.6289, "step": 1523 }, { "epoch": 5.442857142857143, - "grad_norm": 57.06263732910156, + "grad_norm": 1694.40478515625, "learning_rate": 5.099726486816491e-06, - "loss": 9.0586, + "loss": 22.3242, "step": 1524 }, { "epoch": 5.446428571428571, - "grad_norm": 46.29328155517578, + "grad_norm": 8.174033164978027, "learning_rate": 5.093494332154511e-06, - "loss": 9.1367, + "loss": 8.4355, "step": 1525 }, { "epoch": 5.45, - "grad_norm": 48.091636657714844, + "grad_norm": 173.8683319091797, "learning_rate": 5.087262032186418e-06, - "loss": 10.5508, + "loss": 9.2695, "step": 1526 }, { "epoch": 5.453571428571428, - "grad_norm": 47.872344970703125, + "grad_norm": 64.46499633789062, "learning_rate": 5.081029596598265e-06, - "loss": 8.9453, + "loss": 7.5254, "step": 1527 }, { "epoch": 5.457142857142857, - "grad_norm": 59.55905532836914, + "grad_norm": 19.618080139160156, "learning_rate": 5.074797035076319e-06, - "loss": 12.4844, + "loss": 13.1133, "step": 1528 }, { "epoch": 5.460714285714285, - "grad_norm": 49.584659576416016, + "grad_norm": 61.752864837646484, "learning_rate": 5.068564357307041e-06, - "loss": 8.4102, + "loss": 6.582, "step": 1529 }, { "epoch": 5.464285714285714, - "grad_norm": 44.33107376098633, + "grad_norm": 335.2889099121094, "learning_rate": 5.062331572977076e-06, - "loss": 8.3164, + "loss": 9.7441, "step": 1530 }, { "epoch": 5.4678571428571425, - "grad_norm": 49.156551361083984, + "grad_norm": 231.91978454589844, "learning_rate": 5.05609869177323e-06, - "loss": 10.1406, + "loss": 11.4219, "step": 1531 }, { "epoch": 5.4714285714285715, - "grad_norm": 42.27621078491211, + "grad_norm": 6.915616512298584, "learning_rate": 5.049865723382463e-06, - "loss": 8.2617, + "loss": 6.7539, "step": 1532 }, { "epoch": 5.475, - "grad_norm": 63.84312438964844, + "grad_norm": 83.70397186279297, "learning_rate": 5.04363267749187e-06, - "loss": 8.7578, + "loss": 8.9258, "step": 1533 }, { "epoch": 5.478571428571429, - "grad_norm": 47.772029876708984, + "grad_norm": 968.3523559570312, "learning_rate": 5.037399563788665e-06, - "loss": 10.0352, + "loss": 13.6016, "step": 1534 }, { "epoch": 5.482142857142857, - "grad_norm": 63.39606475830078, + "grad_norm": 7.430887222290039, "learning_rate": 5.0311663919601685e-06, - "loss": 8.4609, + "loss": 7.4414, "step": 1535 }, { "epoch": 5.485714285714286, - "grad_norm": 46.278472900390625, + "grad_norm": 239.97549438476562, "learning_rate": 5.024933171693791e-06, - "loss": 9.2734, + "loss": 9.7168, "step": 1536 }, { "epoch": 5.489285714285714, - "grad_norm": 47.596397399902344, + "grad_norm": 20.717365264892578, "learning_rate": 5.018699912677018e-06, - "loss": 11.3945, + "loss": 12.6836, "step": 1537 }, { "epoch": 5.492857142857143, - "grad_norm": 54.852237701416016, + "grad_norm": 311.75018310546875, "learning_rate": 5.012466624597396e-06, - "loss": 9.9297, + "loss": 9.3711, "step": 1538 }, { "epoch": 5.496428571428572, - "grad_norm": 41.27146911621094, + "grad_norm": 16.799301147460938, "learning_rate": 5.006233317142517e-06, - "loss": 8.1406, + "loss": 6.5762, "step": 1539 }, { "epoch": 5.5, - "grad_norm": 51.483154296875, + "grad_norm": 1052.4896240234375, "learning_rate": 5e-06, - "loss": 8.9648, + "loss": 11.8027, "step": 1540 }, { "epoch": 5.503571428571428, - "grad_norm": 66.07180786132812, + "grad_norm": 58.30836486816406, "learning_rate": 4.993766682857484e-06, - "loss": 11.1133, + "loss": 12.125, "step": 1541 }, { "epoch": 5.507142857142857, - "grad_norm": 57.18376159667969, + "grad_norm": 1074.6864013671875, "learning_rate": 4.987533375402605e-06, - "loss": 11.5234, + "loss": 22.8496, "step": 1542 }, { "epoch": 5.510714285714286, - "grad_norm": 45.31563186645508, + "grad_norm": 36.5207633972168, "learning_rate": 4.981300087322984e-06, - "loss": 9.6797, + "loss": 9.959, "step": 1543 }, { "epoch": 5.514285714285714, - "grad_norm": 54.22509002685547, + "grad_norm": 879.3292846679688, "learning_rate": 4.9750668283062104e-06, - "loss": 11.2812, + "loss": 21.3418, "step": 1544 }, { "epoch": 5.517857142857143, - "grad_norm": 58.84791564941406, + "grad_norm": 1374.795166015625, "learning_rate": 4.968833608039832e-06, - "loss": 10.25, + "loss": 22.2188, "step": 1545 }, { "epoch": 5.521428571428571, - "grad_norm": 42.53422164916992, + "grad_norm": 17.173152923583984, "learning_rate": 4.962600436211336e-06, - "loss": 9.4609, + "loss": 9.4883, "step": 1546 }, { "epoch": 5.525, - "grad_norm": 52.54304122924805, + "grad_norm": 476.46685791015625, "learning_rate": 4.956367322508131e-06, - "loss": 10.2617, + "loss": 9.6562, "step": 1547 }, { "epoch": 5.5285714285714285, - "grad_norm": 57.8118896484375, + "grad_norm": 903.869140625, "learning_rate": 4.950134276617538e-06, - "loss": 10.4258, + "loss": 39.4668, "step": 1548 }, { "epoch": 5.5321428571428575, - "grad_norm": 53.34760284423828, + "grad_norm": 777.5332641601562, "learning_rate": 4.943901308226771e-06, - "loss": 8.8984, + "loss": 12.4043, "step": 1549 }, { "epoch": 5.535714285714286, - "grad_norm": 46.492427825927734, + "grad_norm": 40.261505126953125, "learning_rate": 4.937668427022925e-06, - "loss": 7.8047, + "loss": 7.082, "step": 1550 }, { "epoch": 5.539285714285715, - "grad_norm": 47.09682846069336, + "grad_norm": 34.86526870727539, "learning_rate": 4.9314356426929604e-06, - "loss": 9.293, + "loss": 8.6055, "step": 1551 }, { "epoch": 5.542857142857143, - "grad_norm": 47.27290725708008, + "grad_norm": 73.35187530517578, "learning_rate": 4.9252029649236835e-06, - "loss": 8.5039, + "loss": 20.668, "step": 1552 }, { "epoch": 5.546428571428572, - "grad_norm": 54.840370178222656, + "grad_norm": 31.295278549194336, "learning_rate": 4.918970403401737e-06, - "loss": 10.4141, + "loss": 11.0137, "step": 1553 }, { "epoch": 5.55, - "grad_norm": 49.712764739990234, + "grad_norm": 32.626487731933594, "learning_rate": 4.9127379678135825e-06, - "loss": 11.9688, + "loss": 12.2383, "step": 1554 }, { "epoch": 5.553571428571429, - "grad_norm": 67.43892669677734, + "grad_norm": 24.92441749572754, "learning_rate": 4.90650566784549e-06, - "loss": 11.3281, + "loss": 14.4297, "step": 1555 }, { "epoch": 5.557142857142857, - "grad_norm": 43.10005187988281, + "grad_norm": 34.26100158691406, "learning_rate": 4.900273513183511e-06, - "loss": 9.5312, + "loss": 9.8984, "step": 1556 }, { "epoch": 5.560714285714286, - "grad_norm": 43.7419319152832, + "grad_norm": 393.4625244140625, "learning_rate": 4.894041513513477e-06, - "loss": 9.3828, + "loss": 10.0117, "step": 1557 }, { "epoch": 5.564285714285714, - "grad_norm": 55.84998321533203, + "grad_norm": 703.3402709960938, "learning_rate": 4.887809678520976e-06, - "loss": 9.8906, + "loss": 13.9766, "step": 1558 }, { "epoch": 5.567857142857143, - "grad_norm": 57.79484176635742, + "grad_norm": 69.5191421508789, "learning_rate": 4.881578017891343e-06, - "loss": 12.1328, + "loss": 12.5723, "step": 1559 }, { "epoch": 5.571428571428571, - "grad_norm": 45.063438415527344, + "grad_norm": 209.46437072753906, "learning_rate": 4.875346541309637e-06, - "loss": 8.668, + "loss": 8.3086, "step": 1560 }, { "epoch": 5.575, - "grad_norm": 47.72773742675781, + "grad_norm": 47.12582015991211, "learning_rate": 4.869115258460636e-06, - "loss": 7.9961, + "loss": 8.125, "step": 1561 }, { "epoch": 5.578571428571428, - "grad_norm": 68.70746612548828, + "grad_norm": 163.08970642089844, "learning_rate": 4.862884179028813e-06, - "loss": 15.1719, + "loss": 19.3828, "step": 1562 }, { "epoch": 5.582142857142857, - "grad_norm": 41.9263916015625, + "grad_norm": 61.89958572387695, "learning_rate": 4.856653312698329e-06, - "loss": 9.4062, + "loss": 8.5547, "step": 1563 }, { "epoch": 5.585714285714285, - "grad_norm": 48.97158432006836, + "grad_norm": 858.2366943359375, "learning_rate": 4.850422669153009e-06, - "loss": 11.3125, + "loss": 15.4961, "step": 1564 }, { "epoch": 5.589285714285714, - "grad_norm": 46.03926467895508, + "grad_norm": 147.41696166992188, "learning_rate": 4.844192258076335e-06, - "loss": 10.1797, + "loss": 9.8105, "step": 1565 }, { "epoch": 5.5928571428571425, - "grad_norm": 43.022544860839844, + "grad_norm": 10.361417770385742, "learning_rate": 4.8379620891514284e-06, - "loss": 8.6992, + "loss": 9.3125, "step": 1566 }, { "epoch": 5.5964285714285715, - "grad_norm": 48.49531173706055, + "grad_norm": 62.1901741027832, "learning_rate": 4.831732172061032e-06, - "loss": 11.4414, + "loss": 13.4453, "step": 1567 }, { "epoch": 5.6, - "grad_norm": 56.68817138671875, + "grad_norm": 69.14346313476562, "learning_rate": 4.825502516487497e-06, - "loss": 10.6836, + "loss": 9.1641, "step": 1568 }, { "epoch": 5.603571428571429, - "grad_norm": 44.11182403564453, + "grad_norm": 15.477385520935059, "learning_rate": 4.819273132112769e-06, - "loss": 10.0391, + "loss": 10.2109, "step": 1569 }, { "epoch": 5.607142857142857, - "grad_norm": 51.59687423706055, + "grad_norm": 23.553468704223633, "learning_rate": 4.8130440286183725e-06, - "loss": 11.7422, + "loss": 12.2188, "step": 1570 }, { "epoch": 5.610714285714286, - "grad_norm": 67.98545837402344, + "grad_norm": 21.612619400024414, "learning_rate": 4.806815215685397e-06, - "loss": 8.6602, + "loss": 8.002, "step": 1571 }, { "epoch": 5.614285714285714, - "grad_norm": 59.83100509643555, + "grad_norm": 128.38638305664062, "learning_rate": 4.800586702994477e-06, - "loss": 8.6641, + "loss": 8.6973, "step": 1572 }, { "epoch": 5.617857142857143, - "grad_norm": 56.82401657104492, + "grad_norm": 503.44024658203125, "learning_rate": 4.794358500225782e-06, - "loss": 7.5586, + "loss": 15.2012, "step": 1573 }, { "epoch": 5.621428571428572, - "grad_norm": 62.64628982543945, + "grad_norm": 74.5972671508789, "learning_rate": 4.788130617058999e-06, - "loss": 9.0234, + "loss": 8.4023, "step": 1574 }, { "epoch": 5.625, - "grad_norm": 49.86322021484375, + "grad_norm": 7.920133590698242, "learning_rate": 4.781903063173321e-06, - "loss": 8.1562, + "loss": 7.8672, "step": 1575 }, { "epoch": 5.628571428571428, - "grad_norm": 50.653228759765625, + "grad_norm": 902.6047973632812, "learning_rate": 4.775675848247427e-06, - "loss": 10.1172, + "loss": 14.4766, "step": 1576 }, { "epoch": 5.632142857142857, - "grad_norm": 57.38638687133789, + "grad_norm": 4.3526082038879395, "learning_rate": 4.769448981959468e-06, - "loss": 10.4297, + "loss": 9.8125, "step": 1577 }, { "epoch": 5.635714285714286, - "grad_norm": 55.42776107788086, + "grad_norm": 795.8441772460938, "learning_rate": 4.763222473987056e-06, - "loss": 10.1562, + "loss": 18.3164, "step": 1578 }, { "epoch": 5.639285714285714, - "grad_norm": 44.86182403564453, + "grad_norm": 55.906410217285156, "learning_rate": 4.756996334007245e-06, - "loss": 11.1758, + "loss": 13.9258, "step": 1579 }, { "epoch": 5.642857142857143, - "grad_norm": 43.44905090332031, + "grad_norm": 11.89001178741455, "learning_rate": 4.750770571696514e-06, - "loss": 9.0859, + "loss": 8.7773, "step": 1580 }, { "epoch": 5.646428571428571, - "grad_norm": 65.25932312011719, + "grad_norm": 114.11042785644531, "learning_rate": 4.744545196730762e-06, - "loss": 11.0898, + "loss": 24.1699, "step": 1581 }, { "epoch": 5.65, - "grad_norm": 78.29357147216797, + "grad_norm": 34.300804138183594, "learning_rate": 4.738320218785281e-06, - "loss": 8.7383, + "loss": 17.4902, "step": 1582 }, { "epoch": 5.6535714285714285, - "grad_norm": 45.47819519042969, + "grad_norm": 49.4405517578125, "learning_rate": 4.732095647534745e-06, - "loss": 9.5508, + "loss": 10.4004, "step": 1583 }, { "epoch": 5.6571428571428575, - "grad_norm": 66.7427749633789, + "grad_norm": 9.965595245361328, "learning_rate": 4.7258714926532e-06, - "loss": 7.1094, + "loss": 4.4336, "step": 1584 }, { "epoch": 5.660714285714286, - "grad_norm": 45.906490325927734, + "grad_norm": 21.811878204345703, "learning_rate": 4.719647763814041e-06, - "loss": 10.1641, + "loss": 10.9961, "step": 1585 }, { "epoch": 5.664285714285715, - "grad_norm": 61.99280548095703, + "grad_norm": 595.4080200195312, "learning_rate": 4.713424470690004e-06, - "loss": 8.7656, + "loss": 9.0293, "step": 1586 }, { "epoch": 5.667857142857143, - "grad_norm": 53.49858474731445, + "grad_norm": 224.91940307617188, "learning_rate": 4.707201622953145e-06, - "loss": 14.1836, + "loss": 14.2891, "step": 1587 }, { "epoch": 5.671428571428572, - "grad_norm": 56.845767974853516, + "grad_norm": 170.0563507080078, "learning_rate": 4.700979230274829e-06, - "loss": 12.9414, + "loss": 13.2441, "step": 1588 }, { "epoch": 5.675, - "grad_norm": 55.17892074584961, + "grad_norm": 130.1929168701172, "learning_rate": 4.694757302325715e-06, - "loss": 10.5625, + "loss": 11.6348, "step": 1589 }, { "epoch": 5.678571428571429, - "grad_norm": 49.47640609741211, + "grad_norm": 529.76318359375, "learning_rate": 4.6885358487757396e-06, - "loss": 8.4375, + "loss": 8.5547, "step": 1590 }, { "epoch": 5.682142857142857, - "grad_norm": 49.72962188720703, + "grad_norm": 435.2521667480469, "learning_rate": 4.6823148792940995e-06, - "loss": 9.3125, + "loss": 10.9062, "step": 1591 }, { "epoch": 5.685714285714286, - "grad_norm": 57.43544006347656, + "grad_norm": 95.29248809814453, "learning_rate": 4.676094403549241e-06, - "loss": 9.707, + "loss": 9.0215, "step": 1592 }, { "epoch": 5.689285714285714, - "grad_norm": 52.89817810058594, + "grad_norm": 230.93331909179688, "learning_rate": 4.669874431208843e-06, - "loss": 9.5977, + "loss": 11.5898, "step": 1593 }, { "epoch": 5.692857142857143, - "grad_norm": 68.98974609375, + "grad_norm": 312.2042236328125, "learning_rate": 4.663654971939802e-06, - "loss": 11.5078, + "loss": 13.9961, "step": 1594 }, { "epoch": 5.696428571428571, - "grad_norm": 41.092891693115234, + "grad_norm": 211.26416015625, "learning_rate": 4.657436035408217e-06, - "loss": 9.0039, + "loss": 9.8652, "step": 1595 }, { "epoch": 5.7, - "grad_norm": 51.25163269042969, + "grad_norm": 1174.9051513671875, "learning_rate": 4.651217631279374e-06, - "loss": 11.0117, + "loss": 29.5938, "step": 1596 }, { "epoch": 5.703571428571428, - "grad_norm": 70.30717468261719, + "grad_norm": 37.70885467529297, "learning_rate": 4.644999769217731e-06, - "loss": 9.4844, + "loss": 7.3438, "step": 1597 }, { "epoch": 5.707142857142857, - "grad_norm": 46.608253479003906, + "grad_norm": 12.210695266723633, "learning_rate": 4.638782458886908e-06, - "loss": 10.043, + "loss": 9.8574, "step": 1598 }, { "epoch": 5.710714285714285, - "grad_norm": 45.631919860839844, + "grad_norm": 12.32148265838623, "learning_rate": 4.632565709949662e-06, - "loss": 7.9102, + "loss": 7.3398, "step": 1599 }, { "epoch": 5.714285714285714, - "grad_norm": 45.90546417236328, + "grad_norm": 49.41553497314453, "learning_rate": 4.626349532067879e-06, - "loss": 10.8203, + "loss": 10.5039, "step": 1600 }, { "epoch": 5.7178571428571425, - "grad_norm": 77.22392272949219, + "grad_norm": 79.93115234375, "learning_rate": 4.620133934902559e-06, - "loss": 13.8242, + "loss": 14.4316, "step": 1601 }, { "epoch": 5.7214285714285715, - "grad_norm": 69.01750946044922, + "grad_norm": 28.128820419311523, "learning_rate": 4.613918928113797e-06, - "loss": 10.0039, + "loss": 11.6992, "step": 1602 }, { "epoch": 5.725, - "grad_norm": 52.218528747558594, + "grad_norm": 181.10804748535156, "learning_rate": 4.6077045213607765e-06, - "loss": 11.332, + "loss": 12.9531, "step": 1603 }, { "epoch": 5.728571428571429, - "grad_norm": 51.50844192504883, + "grad_norm": 24.486251831054688, "learning_rate": 4.601490724301738e-06, - "loss": 9.6758, + "loss": 10.25, "step": 1604 }, { "epoch": 5.732142857142857, - "grad_norm": 50.681121826171875, + "grad_norm": 419.62939453125, "learning_rate": 4.595277546593984e-06, - "loss": 9.7969, + "loss": 12.5078, "step": 1605 }, { "epoch": 5.735714285714286, - "grad_norm": 66.9765853881836, + "grad_norm": 8.538434982299805, "learning_rate": 4.589064997893849e-06, - "loss": 8.2227, + "loss": 6.5156, "step": 1606 }, { "epoch": 5.739285714285714, - "grad_norm": 51.3770751953125, + "grad_norm": 1395.2288818359375, "learning_rate": 4.5828530878566954e-06, - "loss": 10.0508, + "loss": 21.1953, "step": 1607 }, { "epoch": 5.742857142857143, - "grad_norm": 55.04964065551758, + "grad_norm": 23.833202362060547, "learning_rate": 4.576641826136884e-06, - "loss": 11.375, + "loss": 13.1875, "step": 1608 }, { "epoch": 5.746428571428572, - "grad_norm": 72.69183349609375, + "grad_norm": 937.6629638671875, "learning_rate": 4.570431222387777e-06, - "loss": 11.3828, + "loss": 25.8711, "step": 1609 }, { "epoch": 5.75, - "grad_norm": 43.491641998291016, + "grad_norm": 74.26826477050781, "learning_rate": 4.564221286261709e-06, - "loss": 9.0703, + "loss": 9.1211, "step": 1610 }, { "epoch": 5.753571428571428, - "grad_norm": 42.81809616088867, + "grad_norm": 427.6198425292969, "learning_rate": 4.55801202740998e-06, - "loss": 9.668, + "loss": 10.6484, "step": 1611 }, { "epoch": 5.757142857142857, - "grad_norm": 62.632633209228516, + "grad_norm": 622.4078369140625, "learning_rate": 4.551803455482833e-06, - "loss": 9.4375, + "loss": 11.9883, "step": 1612 }, { "epoch": 5.760714285714286, - "grad_norm": 49.77617263793945, + "grad_norm": 16.274507522583008, "learning_rate": 4.545595580129448e-06, - "loss": 11.2539, + "loss": 11.0586, "step": 1613 }, { "epoch": 5.764285714285714, - "grad_norm": 68.17145538330078, + "grad_norm": 19.57277488708496, "learning_rate": 4.539388410997919e-06, - "loss": 13.4062, + "loss": 15.6797, "step": 1614 }, { "epoch": 5.767857142857143, - "grad_norm": 78.18854522705078, + "grad_norm": 45.75452423095703, "learning_rate": 4.533181957735247e-06, - "loss": 12.9023, + "loss": 14.7383, "step": 1615 }, { "epoch": 5.771428571428571, - "grad_norm": 48.676513671875, + "grad_norm": 73.1038818359375, "learning_rate": 4.526976229987315e-06, - "loss": 11.0039, + "loss": 11.166, "step": 1616 }, { "epoch": 5.775, - "grad_norm": 47.9576416015625, + "grad_norm": 11.152532577514648, "learning_rate": 4.52077123739888e-06, - "loss": 13.0938, + "loss": 12.6719, "step": 1617 }, { "epoch": 5.7785714285714285, - "grad_norm": 60.74048614501953, + "grad_norm": 1135.13525390625, "learning_rate": 4.51456698961356e-06, - "loss": 12.9609, + "loss": 18.6641, "step": 1618 }, { "epoch": 5.7821428571428575, - "grad_norm": 51.42037582397461, + "grad_norm": 15.524484634399414, "learning_rate": 4.508363496273811e-06, - "loss": 11.0352, + "loss": 11.8574, "step": 1619 }, { "epoch": 5.785714285714286, - "grad_norm": 48.507843017578125, + "grad_norm": 1278.5338134765625, "learning_rate": 4.502160767020918e-06, - "loss": 9.25, + "loss": 15.3438, "step": 1620 }, { "epoch": 5.789285714285715, - "grad_norm": 53.146568298339844, + "grad_norm": 459.5232238769531, "learning_rate": 4.4959588114949785e-06, - "loss": 10.9727, + "loss": 22.9062, "step": 1621 }, { "epoch": 5.792857142857143, - "grad_norm": 54.156978607177734, + "grad_norm": 412.8844299316406, "learning_rate": 4.489757639334888e-06, - "loss": 10.4102, + "loss": 17.6445, "step": 1622 }, { "epoch": 5.796428571428572, - "grad_norm": 50.71610641479492, + "grad_norm": 29.87220001220703, "learning_rate": 4.483557260178326e-06, - "loss": 7.8477, + "loss": 6.0508, "step": 1623 }, { "epoch": 5.8, - "grad_norm": 59.174896240234375, + "grad_norm": 488.8746032714844, "learning_rate": 4.477357683661734e-06, - "loss": 10.8516, + "loss": 26.543, "step": 1624 }, { "epoch": 5.803571428571429, - "grad_norm": 61.135223388671875, + "grad_norm": 478.629638671875, "learning_rate": 4.471158919420312e-06, - "loss": 10.4688, + "loss": 16.9453, "step": 1625 }, { "epoch": 5.807142857142857, - "grad_norm": 46.15522003173828, + "grad_norm": 341.7420349121094, "learning_rate": 4.464960977087995e-06, - "loss": 11.3516, + "loss": 13.3086, "step": 1626 }, { "epoch": 5.810714285714286, - "grad_norm": 48.68561553955078, + "grad_norm": 395.3283386230469, "learning_rate": 4.458763866297441e-06, - "loss": 9.0781, + "loss": 11.3594, "step": 1627 }, { "epoch": 5.814285714285714, - "grad_norm": 72.04107666015625, + "grad_norm": 14.895715713500977, "learning_rate": 4.452567596680016e-06, - "loss": 10.8594, + "loss": 12.4531, "step": 1628 }, { "epoch": 5.817857142857143, - "grad_norm": 45.58473587036133, + "grad_norm": 62.358055114746094, "learning_rate": 4.4463721778657774e-06, - "loss": 8.832, + "loss": 9.8984, "step": 1629 }, { "epoch": 5.821428571428571, - "grad_norm": 53.87794876098633, + "grad_norm": 43.44416427612305, "learning_rate": 4.4401776194834615e-06, - "loss": 10.7578, + "loss": 12.3594, "step": 1630 }, { "epoch": 5.825, - "grad_norm": 59.67512893676758, + "grad_norm": 19.440317153930664, "learning_rate": 4.4339839311604675e-06, - "loss": 12.4531, + "loss": 14.9785, "step": 1631 }, { "epoch": 5.828571428571428, - "grad_norm": 56.13530731201172, + "grad_norm": 33.76012420654297, "learning_rate": 4.427791122522841e-06, - "loss": 10.6328, + "loss": 11.5625, "step": 1632 }, { "epoch": 5.832142857142857, - "grad_norm": 48.171260833740234, + "grad_norm": 144.4760284423828, "learning_rate": 4.421599203195262e-06, - "loss": 8.375, + "loss": 7.9492, "step": 1633 }, { "epoch": 5.835714285714285, - "grad_norm": 52.101871490478516, + "grad_norm": 831.3182983398438, "learning_rate": 4.415408182801027e-06, - "loss": 11.0625, + "loss": 14.6484, "step": 1634 }, { "epoch": 5.839285714285714, - "grad_norm": 49.68294906616211, + "grad_norm": 18.30744743347168, "learning_rate": 4.409218070962036e-06, - "loss": 10.4062, + "loss": 10.5547, "step": 1635 }, { "epoch": 5.8428571428571425, - "grad_norm": 63.56822204589844, + "grad_norm": 959.8087158203125, "learning_rate": 4.4030288772987795e-06, - "loss": 11.6602, + "loss": 19.7051, "step": 1636 }, { "epoch": 5.8464285714285715, - "grad_norm": 66.45170593261719, + "grad_norm": 736.7271728515625, "learning_rate": 4.396840611430316e-06, - "loss": 7.9883, + "loss": 10.4473, "step": 1637 }, { "epoch": 5.85, - "grad_norm": 44.060523986816406, + "grad_norm": 39.32826614379883, "learning_rate": 4.390653282974264e-06, - "loss": 9.7148, + "loss": 10.5898, "step": 1638 }, { "epoch": 5.853571428571429, - "grad_norm": 54.552249908447266, + "grad_norm": 1419.6434326171875, "learning_rate": 4.384466901546786e-06, - "loss": 11.7266, + "loss": 30.8203, "step": 1639 }, { "epoch": 5.857142857142857, - "grad_norm": 57.99196243286133, + "grad_norm": 14.474321365356445, "learning_rate": 4.3782814767625755e-06, - "loss": 12.168, + "loss": 14.377, "step": 1640 }, { "epoch": 5.860714285714286, - "grad_norm": 53.15438461303711, + "grad_norm": 71.71196746826172, "learning_rate": 4.372097018234832e-06, - "loss": 11.4453, + "loss": 13.0781, "step": 1641 }, { "epoch": 5.864285714285714, - "grad_norm": 62.88151550292969, + "grad_norm": 194.26754760742188, "learning_rate": 4.3659135355752595e-06, - "loss": 10.5859, + "loss": 9.2012, "step": 1642 }, { "epoch": 5.867857142857143, - "grad_norm": 57.074710845947266, + "grad_norm": 463.9566650390625, "learning_rate": 4.359731038394042e-06, - "loss": 9.3984, + "loss": 10.4453, "step": 1643 }, { "epoch": 5.871428571428572, - "grad_norm": 53.16033935546875, + "grad_norm": 6.869324684143066, "learning_rate": 4.353549536299835e-06, - "loss": 10.375, + "loss": 11.7188, "step": 1644 }, { "epoch": 5.875, - "grad_norm": 48.00330352783203, + "grad_norm": 121.85261535644531, "learning_rate": 4.347369038899744e-06, - "loss": 11.0391, + "loss": 12.3125, "step": 1645 }, { "epoch": 5.878571428571428, - "grad_norm": 47.857723236083984, + "grad_norm": 24.91994285583496, "learning_rate": 4.341189555799313e-06, - "loss": 11.9805, + "loss": 12.1523, "step": 1646 }, { "epoch": 5.882142857142857, - "grad_norm": 51.96940231323242, + "grad_norm": 9.41640567779541, "learning_rate": 4.335011096602514e-06, - "loss": 8.0547, + "loss": 7.9824, "step": 1647 }, { "epoch": 5.885714285714286, - "grad_norm": 51.88654708862305, + "grad_norm": 88.92093658447266, "learning_rate": 4.3288336709117246e-06, - "loss": 12.207, + "loss": 12.9492, "step": 1648 }, { "epoch": 5.889285714285714, - "grad_norm": 60.50304412841797, + "grad_norm": 21.915687561035156, "learning_rate": 4.322657288327714e-06, - "loss": 12.5898, + "loss": 13.7891, "step": 1649 }, { "epoch": 5.892857142857143, - "grad_norm": 54.898170471191406, + "grad_norm": 144.4425506591797, "learning_rate": 4.316481958449634e-06, - "loss": 10.5234, + "loss": 11.6074, "step": 1650 }, { "epoch": 5.896428571428571, - "grad_norm": 48.29661560058594, + "grad_norm": 457.72357177734375, "learning_rate": 4.310307690875e-06, - "loss": 9.832, + "loss": 19.7285, "step": 1651 }, { "epoch": 5.9, - "grad_norm": 58.824851989746094, + "grad_norm": 34.79280090332031, "learning_rate": 4.304134495199675e-06, - "loss": 8.7969, + "loss": 8.9219, "step": 1652 }, { "epoch": 5.9035714285714285, - "grad_norm": 66.70083618164062, + "grad_norm": 68.95867156982422, "learning_rate": 4.297962381017855e-06, - "loss": 10.1016, + "loss": 12.3555, "step": 1653 }, { "epoch": 5.9071428571428575, - "grad_norm": 38.39802169799805, + "grad_norm": 248.98231506347656, "learning_rate": 4.291791357922056e-06, - "loss": 8.4336, + "loss": 9.5156, "step": 1654 }, { "epoch": 5.910714285714286, - "grad_norm": 56.887184143066406, + "grad_norm": 9.59241771697998, "learning_rate": 4.285621435503101e-06, - "loss": 12.8633, + "loss": 14.7988, "step": 1655 }, { "epoch": 5.914285714285715, - "grad_norm": 49.34559631347656, + "grad_norm": 33.046573638916016, "learning_rate": 4.279452623350101e-06, - "loss": 10.6211, + "loss": 10.7969, "step": 1656 }, { "epoch": 5.917857142857143, - "grad_norm": 71.9808120727539, + "grad_norm": 50.89940643310547, "learning_rate": 4.273284931050438e-06, - "loss": 14.4102, + "loss": 16.2109, "step": 1657 }, { "epoch": 5.921428571428572, - "grad_norm": 55.4949836730957, + "grad_norm": 62.67816162109375, "learning_rate": 4.267118368189757e-06, - "loss": 8.0742, + "loss": 6.8301, "step": 1658 }, { "epoch": 5.925, - "grad_norm": 42.78428649902344, + "grad_norm": 78.69082641601562, "learning_rate": 4.260952944351947e-06, - "loss": 7.7891, + "loss": 7.1172, "step": 1659 }, { "epoch": 5.928571428571429, - "grad_norm": 56.24702072143555, + "grad_norm": 987.6006469726562, "learning_rate": 4.254788669119127e-06, - "loss": 12.9141, + "loss": 18.7051, "step": 1660 }, { "epoch": 5.932142857142857, - "grad_norm": 66.3494873046875, + "grad_norm": 42.4428596496582, "learning_rate": 4.248625552071632e-06, - "loss": 13.9961, + "loss": 15.0605, "step": 1661 }, { "epoch": 5.935714285714286, - "grad_norm": 52.807647705078125, + "grad_norm": 57.38206481933594, "learning_rate": 4.2424636027879926e-06, - "loss": 11.0195, + "loss": 10.7656, "step": 1662 }, { "epoch": 5.939285714285714, - "grad_norm": 45.50996398925781, + "grad_norm": 274.8497314453125, "learning_rate": 4.236302830844931e-06, - "loss": 9.6797, + "loss": 9.0508, "step": 1663 }, { "epoch": 5.942857142857143, - "grad_norm": 47.0439567565918, + "grad_norm": 285.47479248046875, "learning_rate": 4.230143245817332e-06, - "loss": 9.1875, + "loss": 8.8125, "step": 1664 }, { "epoch": 5.946428571428571, - "grad_norm": 60.722129821777344, + "grad_norm": 16.227373123168945, "learning_rate": 4.223984857278242e-06, - "loss": 10.1562, + "loss": 11.3906, "step": 1665 }, { "epoch": 5.95, - "grad_norm": 49.956451416015625, + "grad_norm": 892.3443603515625, "learning_rate": 4.217827674798845e-06, - "loss": 8.0273, + "loss": 9.9668, "step": 1666 }, { "epoch": 5.953571428571428, - "grad_norm": 51.216156005859375, + "grad_norm": 14.958064079284668, "learning_rate": 4.211671707948452e-06, - "loss": 11.2773, + "loss": 11.4453, "step": 1667 }, { "epoch": 5.957142857142857, - "grad_norm": 65.91729736328125, + "grad_norm": 206.19345092773438, "learning_rate": 4.205516966294484e-06, - "loss": 8.3711, + "loss": 8.2754, "step": 1668 }, { "epoch": 5.960714285714285, - "grad_norm": 49.82816696166992, + "grad_norm": 21.125017166137695, "learning_rate": 4.1993634594024555e-06, - "loss": 9.8672, + "loss": 9.1309, "step": 1669 }, { "epoch": 5.964285714285714, - "grad_norm": 53.48155212402344, + "grad_norm": 46.416133880615234, "learning_rate": 4.193211196835967e-06, - "loss": 9.0, + "loss": 9.1328, "step": 1670 }, { "epoch": 5.9678571428571425, - "grad_norm": 39.58634567260742, + "grad_norm": 19.207155227661133, "learning_rate": 4.187060188156681e-06, - "loss": 8.3672, + "loss": 9.0195, "step": 1671 }, { "epoch": 5.9714285714285715, - "grad_norm": 72.48980712890625, + "grad_norm": 262.2999267578125, "learning_rate": 4.180910442924312e-06, - "loss": 11.1875, + "loss": 12.7773, "step": 1672 }, { "epoch": 5.975, - "grad_norm": 61.04351043701172, + "grad_norm": 29.84653663635254, "learning_rate": 4.174761970696612e-06, - "loss": 10.0352, + "loss": 10.293, "step": 1673 }, { "epoch": 5.978571428571429, - "grad_norm": 48.407623291015625, + "grad_norm": 452.22467041015625, "learning_rate": 4.1686147810293534e-06, - "loss": 10.3984, + "loss": 11.9766, "step": 1674 }, { "epoch": 5.982142857142857, - "grad_norm": 48.5159912109375, + "grad_norm": 84.66542053222656, "learning_rate": 4.162468883476319e-06, - "loss": 8.7891, + "loss": 9.2188, "step": 1675 }, { "epoch": 5.985714285714286, - "grad_norm": 48.72674560546875, + "grad_norm": 192.14540100097656, "learning_rate": 4.156324287589276e-06, - "loss": 11.2773, + "loss": 14.3516, "step": 1676 }, { "epoch": 5.989285714285714, - "grad_norm": 50.07181167602539, + "grad_norm": 747.48046875, "learning_rate": 4.150181002917974e-06, - "loss": 8.5391, + "loss": 10.6387, "step": 1677 }, { "epoch": 5.992857142857143, - "grad_norm": 58.37034225463867, + "grad_norm": 144.89596557617188, "learning_rate": 4.144039039010125e-06, - "loss": 9.9414, + "loss": 9.5195, "step": 1678 }, { "epoch": 5.996428571428572, - "grad_norm": 54.81343078613281, + "grad_norm": 170.58621215820312, "learning_rate": 4.137898405411387e-06, - "loss": 8.1289, + "loss": 17.7617, "step": 1679 }, { "epoch": 6.0, - "grad_norm": 73.3040542602539, + "grad_norm": 771.9425048828125, "learning_rate": 4.131759111665349e-06, - "loss": 10.1992, + "loss": 12.8594, "step": 1680 }, { "epoch": 6.0, - "eval_loss": 10.075176239013672, - "eval_mse": 10.075117181055136, - "eval_runtime": 11.3353, - "eval_samples_per_second": 250.545, - "eval_steps_per_second": 1.323, - "eval_target_0_mse": 18.99395581441415, - "eval_target_1_mse": 10.335927969775117, - "eval_target_2_mse": 5.593903802101472, - "eval_target_3_mse": 5.376681137929808, + "eval_loss": 12.752420425415039, + "eval_mse": 12.750003891887825, + "eval_runtime": 10.9859, + "eval_samples_per_second": 258.513, + "eval_steps_per_second": 1.365, + "eval_target_0_mse": 37.742021404614626, + "eval_target_1_mse": 8.728842796295323, + "eval_target_2_mse": 3.009326004155817, + "eval_target_3_mse": 1.5198253624855385, "step": 1680 }, { "epoch": 6.003571428571429, - "grad_norm": 76.41683197021484, + "grad_norm": 490.85784912109375, "learning_rate": 4.125621167313519e-06, - "loss": 8.9297, + "loss": 10.2109, "step": 1681 }, { "epoch": 6.007142857142857, - "grad_norm": 50.83246994018555, + "grad_norm": 801.1961059570312, "learning_rate": 4.119484581895309e-06, - "loss": 9.1602, + "loss": 12.0195, "step": 1682 }, { "epoch": 6.010714285714286, - "grad_norm": 54.864261627197266, + "grad_norm": 29.213481903076172, "learning_rate": 4.113349364948018e-06, - "loss": 10.4102, + "loss": 11.9648, "step": 1683 }, { "epoch": 6.014285714285714, - "grad_norm": 57.45695495605469, + "grad_norm": 105.80555725097656, "learning_rate": 4.107215526006818e-06, - "loss": 10.8125, + "loss": 22.6816, "step": 1684 }, { "epoch": 6.017857142857143, - "grad_norm": 51.833526611328125, + "grad_norm": 961.22314453125, "learning_rate": 4.101083074604737e-06, - "loss": 9.7891, + "loss": 12.7949, "step": 1685 }, { "epoch": 6.021428571428571, - "grad_norm": 50.87210464477539, + "grad_norm": 49.2587776184082, "learning_rate": 4.094952020272651e-06, - "loss": 8.8164, + "loss": 10.5508, "step": 1686 }, { "epoch": 6.025, - "grad_norm": 58.539634704589844, + "grad_norm": 939.3512573242188, "learning_rate": 4.088822372539263e-06, - "loss": 12.1055, + "loss": 18.0664, "step": 1687 }, { "epoch": 6.0285714285714285, - "grad_norm": 49.209163665771484, + "grad_norm": 374.60931396484375, "learning_rate": 4.0826941409310885e-06, - "loss": 9.5625, + "loss": 10.4609, "step": 1688 }, { "epoch": 6.0321428571428575, - "grad_norm": 51.57401657104492, + "grad_norm": 38.29633331298828, "learning_rate": 4.076567334972443e-06, - "loss": 9.168, + "loss": 9.5762, "step": 1689 }, { "epoch": 6.035714285714286, - "grad_norm": 49.37749099731445, + "grad_norm": 184.16127014160156, "learning_rate": 4.070441964185428e-06, - "loss": 10.0312, + "loss": 9.8828, "step": 1690 }, { "epoch": 6.039285714285715, - "grad_norm": 43.40364074707031, + "grad_norm": 487.64471435546875, "learning_rate": 4.06431803808991e-06, - "loss": 7.4219, + "loss": 8.0098, "step": 1691 }, { "epoch": 6.042857142857143, - "grad_norm": 54.65069580078125, + "grad_norm": 63.00151062011719, "learning_rate": 4.058195566203516e-06, - "loss": 10.1719, + "loss": 12.1387, "step": 1692 }, { "epoch": 6.046428571428572, - "grad_norm": 49.99138641357422, + "grad_norm": 782.02685546875, "learning_rate": 4.052074558041608e-06, - "loss": 8.7578, + "loss": 10.207, "step": 1693 }, { "epoch": 6.05, - "grad_norm": 51.0986213684082, + "grad_norm": 10.825554847717285, "learning_rate": 4.045955023117276e-06, - "loss": 11.3984, + "loss": 12.4453, "step": 1694 }, { "epoch": 6.053571428571429, - "grad_norm": 45.37326431274414, + "grad_norm": 53.48950958251953, "learning_rate": 4.0398369709413195e-06, - "loss": 6.9648, + "loss": 6.6641, "step": 1695 }, { "epoch": 6.057142857142857, - "grad_norm": 49.3985595703125, + "grad_norm": 63.65537643432617, "learning_rate": 4.033720411022235e-06, - "loss": 11.4531, + "loss": 10.8984, "step": 1696 }, { "epoch": 6.060714285714286, - "grad_norm": 62.27033615112305, + "grad_norm": 9.590255737304688, "learning_rate": 4.0276053528661955e-06, - "loss": 9.875, + "loss": 11.5586, "step": 1697 }, { "epoch": 6.064285714285714, - "grad_norm": 45.84238052368164, + "grad_norm": 187.78184509277344, "learning_rate": 4.021491805977043e-06, - "loss": 9.4492, + "loss": 34.4512, "step": 1698 }, { "epoch": 6.067857142857143, - "grad_norm": 48.25836944580078, + "grad_norm": 20.09391212463379, "learning_rate": 4.0153797798562725e-06, - "loss": 10.6055, + "loss": 9.998, "step": 1699 }, { "epoch": 6.071428571428571, - "grad_norm": 55.16362762451172, + "grad_norm": 17.72443199157715, "learning_rate": 4.009269284003014e-06, - "loss": 10.1875, + "loss": 12.1836, "step": 1700 }, { "epoch": 6.075, - "grad_norm": 55.9172248840332, + "grad_norm": 65.73104095458984, "learning_rate": 4.003160327914015e-06, - "loss": 10.0938, + "loss": 11.4023, "step": 1701 }, { "epoch": 6.078571428571428, - "grad_norm": 43.185508728027344, + "grad_norm": 9.207090377807617, "learning_rate": 3.997052921083637e-06, - "loss": 10.8047, + "loss": 10.918, "step": 1702 }, { "epoch": 6.082142857142857, - "grad_norm": 48.58912658691406, + "grad_norm": 142.95005798339844, "learning_rate": 3.990947073003829e-06, - "loss": 9.4023, + "loss": 9.5293, "step": 1703 }, { "epoch": 6.085714285714285, - "grad_norm": 49.210086822509766, + "grad_norm": 655.189453125, "learning_rate": 3.98484279316412e-06, - "loss": 11.293, + "loss": 22.0078, "step": 1704 }, { "epoch": 6.089285714285714, - "grad_norm": 57.406002044677734, + "grad_norm": 119.37258911132812, "learning_rate": 3.978740091051599e-06, - "loss": 9.0117, + "loss": 8.377, "step": 1705 }, { "epoch": 6.0928571428571425, - "grad_norm": 55.95344161987305, + "grad_norm": 17.568416595458984, "learning_rate": 3.9726389761509055e-06, - "loss": 8.5547, + "loss": 8.6211, "step": 1706 }, { "epoch": 6.0964285714285715, - "grad_norm": 52.71607208251953, + "grad_norm": 680.8194580078125, "learning_rate": 3.96653945794421e-06, - "loss": 8.8008, + "loss": 15.4238, "step": 1707 }, { "epoch": 6.1, - "grad_norm": 51.33906936645508, + "grad_norm": 767.8720703125, "learning_rate": 3.960441545911205e-06, - "loss": 9.3672, + "loss": 19.9844, "step": 1708 }, { "epoch": 6.103571428571429, - "grad_norm": 46.3787727355957, + "grad_norm": 144.58682250976562, "learning_rate": 3.954345249529081e-06, - "loss": 11.7266, + "loss": 9.6387, "step": 1709 }, { "epoch": 6.107142857142857, - "grad_norm": 40.00098419189453, + "grad_norm": 7.60906982421875, "learning_rate": 3.948250578272522e-06, - "loss": 8.6328, + "loss": 7.3965, "step": 1710 }, { "epoch": 6.110714285714286, - "grad_norm": 67.38056945800781, + "grad_norm": 668.8328247070312, "learning_rate": 3.9421575416136866e-06, - "loss": 14.3203, + "loss": 18.1953, "step": 1711 }, { "epoch": 6.114285714285714, - "grad_norm": 61.05616760253906, + "grad_norm": 38.15952682495117, "learning_rate": 3.936066149022191e-06, - "loss": 10.625, + "loss": 9.5, "step": 1712 }, { "epoch": 6.117857142857143, - "grad_norm": 50.53459930419922, + "grad_norm": 198.3947296142578, "learning_rate": 3.929976409965094e-06, - "loss": 10.0469, + "loss": 11.0977, "step": 1713 }, { "epoch": 6.121428571428571, - "grad_norm": 53.52139663696289, + "grad_norm": 14.970895767211914, "learning_rate": 3.923888333906891e-06, - "loss": 10.707, + "loss": 11.2715, "step": 1714 }, { "epoch": 6.125, - "grad_norm": 49.115806579589844, + "grad_norm": 644.01953125, "learning_rate": 3.917801930309486e-06, - "loss": 9.1914, + "loss": 14.5801, "step": 1715 }, { "epoch": 6.128571428571428, - "grad_norm": 44.22719955444336, + "grad_norm": 66.63221740722656, "learning_rate": 3.911717208632189e-06, - "loss": 8.4492, + "loss": 8.8125, "step": 1716 }, { "epoch": 6.132142857142857, - "grad_norm": 74.1977767944336, + "grad_norm": 9.975343704223633, "learning_rate": 3.9056341783316935e-06, - "loss": 8.3516, + "loss": 6.1992, "step": 1717 }, { "epoch": 6.135714285714286, - "grad_norm": 48.96635055541992, + "grad_norm": 5.0466766357421875, "learning_rate": 3.8995528488620635e-06, - "loss": 9.1328, + "loss": 9.834, "step": 1718 }, { "epoch": 6.139285714285714, - "grad_norm": 50.21746063232422, + "grad_norm": 181.25189208984375, "learning_rate": 3.8934732296747205e-06, - "loss": 10.6602, + "loss": 13.7461, "step": 1719 }, { "epoch": 6.142857142857143, - "grad_norm": 43.338382720947266, + "grad_norm": 161.98471069335938, "learning_rate": 3.887395330218429e-06, - "loss": 9.7031, + "loss": 9.5391, "step": 1720 }, { "epoch": 6.146428571428571, - "grad_norm": 44.57133483886719, + "grad_norm": 363.58087158203125, "learning_rate": 3.88131915993928e-06, - "loss": 9.3906, + "loss": 11.5508, "step": 1721 }, { "epoch": 6.15, - "grad_norm": 44.74058532714844, + "grad_norm": 20.980865478515625, "learning_rate": 3.875244728280676e-06, - "loss": 9.0273, + "loss": 8.459, "step": 1722 }, { "epoch": 6.1535714285714285, - "grad_norm": 60.828041076660156, + "grad_norm": 15.99710750579834, "learning_rate": 3.869172044683319e-06, - "loss": 8.9141, + "loss": 9.2188, "step": 1723 }, { "epoch": 6.1571428571428575, - "grad_norm": 62.143890380859375, + "grad_norm": 16.499717712402344, "learning_rate": 3.863101118585193e-06, - "loss": 8.3359, + "loss": 8.0723, "step": 1724 }, { "epoch": 6.160714285714286, - "grad_norm": 47.39149475097656, + "grad_norm": 22.781593322753906, "learning_rate": 3.857031959421553e-06, - "loss": 10.1445, + "loss": 22.6094, "step": 1725 }, { "epoch": 6.164285714285715, - "grad_norm": 45.78790283203125, + "grad_norm": 133.22808837890625, "learning_rate": 3.850964576624904e-06, - "loss": 7.668, + "loss": 6.9785, "step": 1726 }, { "epoch": 6.167857142857143, - "grad_norm": 51.39608383178711, + "grad_norm": 369.3150634765625, "learning_rate": 3.844898979624992e-06, - "loss": 11.832, + "loss": 13.9883, "step": 1727 }, { "epoch": 6.171428571428572, - "grad_norm": 56.623592376708984, + "grad_norm": 1038.81103515625, "learning_rate": 3.8388351778487884e-06, - "loss": 9.6211, + "loss": 14.5508, "step": 1728 }, { "epoch": 6.175, - "grad_norm": 46.049320220947266, + "grad_norm": 11.300472259521484, "learning_rate": 3.832773180720475e-06, - "loss": 9.0508, + "loss": 8.875, "step": 1729 }, { "epoch": 6.178571428571429, - "grad_norm": 58.23458480834961, + "grad_norm": 25.081615447998047, "learning_rate": 3.826712997661426e-06, - "loss": 11.9688, + "loss": 12.9219, "step": 1730 }, { "epoch": 6.182142857142857, - "grad_norm": 55.182701110839844, + "grad_norm": 109.65489196777344, "learning_rate": 3.820654638090197e-06, - "loss": 10.9883, + "loss": 12.8945, "step": 1731 }, { "epoch": 6.185714285714286, - "grad_norm": 45.907630920410156, + "grad_norm": 21.512109756469727, "learning_rate": 3.8145981114225135e-06, - "loss": 10.2773, + "loss": 10.582, "step": 1732 }, { "epoch": 6.189285714285714, - "grad_norm": 50.76328659057617, + "grad_norm": 239.8883056640625, "learning_rate": 3.808543427071249e-06, - "loss": 9.9336, + "loss": 10.7266, "step": 1733 }, { "epoch": 6.192857142857143, - "grad_norm": 53.89619827270508, + "grad_norm": 956.52099609375, "learning_rate": 3.8024905944464118e-06, - "loss": 11.0938, + "loss": 19.5234, "step": 1734 }, { "epoch": 6.196428571428571, - "grad_norm": 54.912513732910156, + "grad_norm": 365.6389465332031, "learning_rate": 3.7964396229551365e-06, - "loss": 9.6719, + "loss": 19.2695, "step": 1735 }, { "epoch": 6.2, - "grad_norm": 59.877376556396484, + "grad_norm": 219.60227966308594, "learning_rate": 3.790390522001662e-06, - "loss": 10.418, + "loss": 11.1113, "step": 1736 }, { "epoch": 6.203571428571428, - "grad_norm": 46.845550537109375, + "grad_norm": 652.0458374023438, "learning_rate": 3.7843433009873222e-06, - "loss": 10.168, + "loss": 13.252, "step": 1737 }, { "epoch": 6.207142857142857, - "grad_norm": 39.68058395385742, + "grad_norm": 39.6609001159668, "learning_rate": 3.778297969310529e-06, - "loss": 9.2812, + "loss": 10.2969, "step": 1738 }, { "epoch": 6.210714285714285, - "grad_norm": 49.07656478881836, + "grad_norm": 367.79119873046875, "learning_rate": 3.7722545363667573e-06, - "loss": 9.6875, + "loss": 17.0625, "step": 1739 }, { "epoch": 6.214285714285714, - "grad_norm": 67.2649917602539, + "grad_norm": 200.66477966308594, "learning_rate": 3.7662130115485317e-06, - "loss": 9.6328, + "loss": 9.8984, "step": 1740 }, { "epoch": 6.2178571428571425, - "grad_norm": 47.03213119506836, + "grad_norm": 21.775047302246094, "learning_rate": 3.7601734042454093e-06, - "loss": 10.5586, + "loss": 11.7344, "step": 1741 }, { "epoch": 6.2214285714285715, - "grad_norm": 50.91373062133789, + "grad_norm": 907.8807373046875, "learning_rate": 3.754135723843968e-06, - "loss": 8.5742, + "loss": 12.0273, "step": 1742 }, { "epoch": 6.225, - "grad_norm": 56.941951751708984, + "grad_norm": 338.03070068359375, "learning_rate": 3.748099979727792e-06, - "loss": 10.3633, + "loss": 12.2578, "step": 1743 }, { "epoch": 6.228571428571429, - "grad_norm": 47.65779495239258, + "grad_norm": 32.333927154541016, "learning_rate": 3.7420661812774577e-06, - "loss": 11.5586, + "loss": 10.7656, "step": 1744 }, { "epoch": 6.232142857142857, - "grad_norm": 54.094120025634766, + "grad_norm": 657.7205200195312, "learning_rate": 3.736034337870512e-06, - "loss": 9.8398, + "loss": 10.834, "step": 1745 }, { "epoch": 6.235714285714286, - "grad_norm": 40.20479202270508, + "grad_norm": 22.726911544799805, "learning_rate": 3.7300044588814692e-06, - "loss": 6.8984, + "loss": 7.2207, "step": 1746 }, { "epoch": 6.239285714285714, - "grad_norm": 61.97272872924805, + "grad_norm": 152.84201049804688, "learning_rate": 3.723976553681787e-06, - "loss": 10.1602, + "loss": 8.6035, "step": 1747 }, { "epoch": 6.242857142857143, - "grad_norm": 59.61983871459961, + "grad_norm": 70.90827941894531, "learning_rate": 3.7179506316398584e-06, - "loss": 9.2656, + "loss": 10.2148, "step": 1748 }, { "epoch": 6.246428571428572, - "grad_norm": 44.060604095458984, + "grad_norm": 29.570253372192383, "learning_rate": 3.7119267021209903e-06, - "loss": 9.9961, + "loss": 10.6973, "step": 1749 }, { "epoch": 6.25, - "grad_norm": 49.06633377075195, + "grad_norm": 57.819698333740234, "learning_rate": 3.705904774487396e-06, - "loss": 10.9336, + "loss": 9.6973, "step": 1750 }, { "epoch": 6.253571428571428, - "grad_norm": 49.38844680786133, + "grad_norm": 93.63714599609375, "learning_rate": 3.6998848580981765e-06, - "loss": 8.8008, + "loss": 9.5469, "step": 1751 }, { "epoch": 6.257142857142857, - "grad_norm": 61.709964752197266, + "grad_norm": 309.0652160644531, "learning_rate": 3.6938669623093086e-06, - "loss": 9.4844, + "loss": 10.8086, "step": 1752 }, { "epoch": 6.260714285714286, - "grad_norm": 57.79066467285156, + "grad_norm": 381.38226318359375, "learning_rate": 3.6878510964736248e-06, - "loss": 11.0703, + "loss": 12.8867, "step": 1753 }, { "epoch": 6.264285714285714, - "grad_norm": 43.46262741088867, + "grad_norm": 20.11421012878418, "learning_rate": 3.6818372699408067e-06, - "loss": 9.3906, + "loss": 10.0625, "step": 1754 }, { "epoch": 6.267857142857143, - "grad_norm": 47.41386032104492, + "grad_norm": 31.051742553710938, "learning_rate": 3.6758254920573638e-06, - "loss": 9.2227, + "loss": 7.1992, "step": 1755 }, { "epoch": 6.271428571428571, - "grad_norm": 59.2305908203125, + "grad_norm": 53.27566909790039, "learning_rate": 3.669815772166625e-06, - "loss": 11.4805, + "loss": 15.7188, "step": 1756 }, { "epoch": 6.275, - "grad_norm": 58.47336196899414, + "grad_norm": 114.33683013916016, "learning_rate": 3.663808119608716e-06, - "loss": 7.8945, + "loss": 6.5703, "step": 1757 }, { "epoch": 6.2785714285714285, - "grad_norm": 41.33405685424805, + "grad_norm": 33.98986053466797, "learning_rate": 3.6578025437205535e-06, - "loss": 8.7188, + "loss": 9.5703, "step": 1758 }, { "epoch": 6.2821428571428575, - "grad_norm": 67.05522155761719, + "grad_norm": 13.689750671386719, "learning_rate": 3.651799053835824e-06, - "loss": 9.5938, + "loss": 11.4297, "step": 1759 }, { "epoch": 6.285714285714286, - "grad_norm": 53.32649612426758, + "grad_norm": 59.812217712402344, "learning_rate": 3.6457976592849753e-06, - "loss": 10.6055, + "loss": 10.4316, "step": 1760 }, { "epoch": 6.289285714285715, - "grad_norm": 41.22390365600586, + "grad_norm": 18.5739803314209, "learning_rate": 3.6397983693951944e-06, - "loss": 9.418, + "loss": 8.5723, "step": 1761 }, { "epoch": 6.292857142857143, - "grad_norm": 52.49165725708008, + "grad_norm": 9.770771980285645, "learning_rate": 3.6338011934904006e-06, - "loss": 11.8242, + "loss": 12.8848, "step": 1762 }, { "epoch": 6.296428571428572, - "grad_norm": 47.30814743041992, + "grad_norm": 34.42852783203125, "learning_rate": 3.6278061408912257e-06, - "loss": 9.1367, + "loss": 10.2344, "step": 1763 }, { "epoch": 6.3, - "grad_norm": 49.25723648071289, + "grad_norm": 27.329360961914062, "learning_rate": 3.6218132209150047e-06, - "loss": 7.6758, + "loss": 7.4531, "step": 1764 }, { "epoch": 6.303571428571429, - "grad_norm": 50.525604248046875, + "grad_norm": 201.6879119873047, "learning_rate": 3.6158224428757538e-06, - "loss": 11.3633, + "loss": 24.7871, "step": 1765 }, { "epoch": 6.307142857142857, - "grad_norm": 50.78407669067383, + "grad_norm": 9.310190200805664, "learning_rate": 3.609833816084163e-06, - "loss": 10.2852, + "loss": 12.0234, "step": 1766 }, { "epoch": 6.310714285714286, - "grad_norm": 59.88170623779297, + "grad_norm": 987.639404296875, "learning_rate": 3.6038473498475774e-06, - "loss": 9.9531, + "loss": 13.1523, "step": 1767 }, { "epoch": 6.314285714285714, - "grad_norm": 48.58623504638672, + "grad_norm": 138.0100555419922, "learning_rate": 3.5978630534699873e-06, - "loss": 9.9453, + "loss": 11.3633, "step": 1768 }, { "epoch": 6.317857142857143, - "grad_norm": 41.02545166015625, + "grad_norm": 476.808837890625, "learning_rate": 3.5918809362520056e-06, - "loss": 9.1289, + "loss": 13.6133, "step": 1769 }, { "epoch": 6.321428571428571, - "grad_norm": 54.59299087524414, + "grad_norm": 18.03131675720215, "learning_rate": 3.585901007490863e-06, - "loss": 12.082, + "loss": 12.875, "step": 1770 }, { "epoch": 6.325, - "grad_norm": 43.45494079589844, + "grad_norm": 279.54296875, "learning_rate": 3.579923276480387e-06, - "loss": 9.2109, + "loss": 8.9043, "step": 1771 }, { "epoch": 6.328571428571428, - "grad_norm": 50.15151596069336, + "grad_norm": 215.88291931152344, "learning_rate": 3.57394775251099e-06, - "loss": 8.75, + "loss": 9.4258, "step": 1772 }, { "epoch": 6.332142857142857, - "grad_norm": 50.33037567138672, + "grad_norm": 21.568098068237305, "learning_rate": 3.5679744448696534e-06, - "loss": 10.6094, + "loss": 10.5488, "step": 1773 }, { "epoch": 6.335714285714285, - "grad_norm": 43.77446365356445, + "grad_norm": 180.8748321533203, "learning_rate": 3.562003362839914e-06, - "loss": 8.1484, + "loss": 8.9648, "step": 1774 }, { "epoch": 6.339285714285714, - "grad_norm": 62.063499450683594, + "grad_norm": 826.9417114257812, "learning_rate": 3.556034515701852e-06, - "loss": 12.7812, + "loss": 14.9023, "step": 1775 }, { "epoch": 6.3428571428571425, - "grad_norm": 52.96371841430664, + "grad_norm": 598.0648193359375, "learning_rate": 3.550067912732069e-06, - "loss": 12.2227, + "loss": 26.0078, "step": 1776 }, { "epoch": 6.3464285714285715, - "grad_norm": 50.546905517578125, + "grad_norm": 53.15715789794922, "learning_rate": 3.544103563203687e-06, - "loss": 9.7188, + "loss": 10.3828, "step": 1777 }, { "epoch": 6.35, - "grad_norm": 62.89564895629883, + "grad_norm": 129.30056762695312, "learning_rate": 3.538141476386317e-06, - "loss": 9.9688, + "loss": 12.5625, "step": 1778 }, { "epoch": 6.353571428571429, - "grad_norm": 46.9800910949707, + "grad_norm": 69.8838882446289, "learning_rate": 3.5321816615460585e-06, - "loss": 9.4336, + "loss": 11.3047, "step": 1779 }, { "epoch": 6.357142857142857, - "grad_norm": 59.78700256347656, + "grad_norm": 25.29172134399414, "learning_rate": 3.526224127945479e-06, - "loss": 9.6133, + "loss": 9.4258, "step": 1780 }, { "epoch": 6.360714285714286, - "grad_norm": 73.3747329711914, + "grad_norm": 1400.075439453125, "learning_rate": 3.520268884843602e-06, - "loss": 12.2773, + "loss": 24.4531, "step": 1781 }, { "epoch": 6.364285714285714, - "grad_norm": 38.920501708984375, + "grad_norm": 12.825567245483398, "learning_rate": 3.5143159414958854e-06, - "loss": 8.1484, + "loss": 9.2539, "step": 1782 }, { "epoch": 6.367857142857143, - "grad_norm": 41.59416961669922, + "grad_norm": 11.501030921936035, "learning_rate": 3.50836530715422e-06, - "loss": 7.1992, + "loss": 7.1094, "step": 1783 }, { "epoch": 6.371428571428572, - "grad_norm": 54.04392623901367, + "grad_norm": 1180.2333984375, "learning_rate": 3.502416991066904e-06, - "loss": 9.3945, + "loss": 22.1953, "step": 1784 }, { "epoch": 6.375, - "grad_norm": 60.764312744140625, + "grad_norm": 1088.2928466796875, "learning_rate": 3.4964710024786354e-06, - "loss": 9.6328, + "loss": 19.1016, "step": 1785 }, { "epoch": 6.378571428571428, - "grad_norm": 54.28053665161133, + "grad_norm": 96.76593017578125, "learning_rate": 3.4905273506304904e-06, - "loss": 9.7734, + "loss": 10.1484, "step": 1786 }, { "epoch": 6.382142857142857, - "grad_norm": 65.46214294433594, + "grad_norm": 374.8851623535156, "learning_rate": 3.484586044759918e-06, - "loss": 13.6953, + "loss": 17.3281, "step": 1787 }, { "epoch": 6.385714285714286, - "grad_norm": 50.05048370361328, + "grad_norm": 22.29779624938965, "learning_rate": 3.478647094100719e-06, - "loss": 9.2812, + "loss": 9.3145, "step": 1788 }, { "epoch": 6.389285714285714, - "grad_norm": 58.779998779296875, + "grad_norm": 16.57585334777832, "learning_rate": 3.4727105078830347e-06, - "loss": 8.2891, + "loss": 6.25, "step": 1789 }, { "epoch": 6.392857142857143, - "grad_norm": 55.121925354003906, + "grad_norm": 639.3925170898438, "learning_rate": 3.4667762953333296e-06, - "loss": 12.8516, + "loss": 18.8906, "step": 1790 }, { "epoch": 6.396428571428571, - "grad_norm": 65.17803192138672, + "grad_norm": 34.11528015136719, "learning_rate": 3.460844465674381e-06, - "loss": 18.8789, + "loss": 20.9297, "step": 1791 }, { "epoch": 6.4, - "grad_norm": 51.2833251953125, + "grad_norm": 1630.653564453125, "learning_rate": 3.4549150281252635e-06, - "loss": 9.5938, + "loss": 35.5156, "step": 1792 }, { "epoch": 6.4035714285714285, - "grad_norm": 80.84515380859375, + "grad_norm": 38.077152252197266, "learning_rate": 3.4489879919013338e-06, - "loss": 17.0859, + "loss": 21.1445, "step": 1793 }, { "epoch": 6.4071428571428575, - "grad_norm": 47.337791442871094, + "grad_norm": 140.9590301513672, "learning_rate": 3.443063366214212e-06, - "loss": 11.1328, + "loss": 11.0078, "step": 1794 }, { "epoch": 6.410714285714286, - "grad_norm": 41.62042236328125, + "grad_norm": 97.01585388183594, "learning_rate": 3.4371411602717785e-06, - "loss": 8.5664, + "loss": 9.2578, "step": 1795 }, { "epoch": 6.414285714285715, - "grad_norm": 67.5714340209961, + "grad_norm": 30.308340072631836, "learning_rate": 3.4312213832781487e-06, - "loss": 11.0039, + "loss": 11.3438, "step": 1796 }, { "epoch": 6.417857142857143, - "grad_norm": 67.93173217773438, + "grad_norm": 780.4686279296875, "learning_rate": 3.425304044433666e-06, - "loss": 14.7578, + "loss": 21.1172, "step": 1797 }, { "epoch": 6.421428571428572, - "grad_norm": 46.988712310791016, + "grad_norm": 1103.8399658203125, "learning_rate": 3.41938915293488e-06, - "loss": 9.4219, + "loss": 21.8906, "step": 1798 }, { "epoch": 6.425, - "grad_norm": 47.787208557128906, + "grad_norm": 15.716828346252441, "learning_rate": 3.4134767179745404e-06, - "loss": 9.2539, + "loss": 7.9258, "step": 1799 }, { "epoch": 6.428571428571429, - "grad_norm": 59.736324310302734, + "grad_norm": 744.6246948242188, "learning_rate": 3.4075667487415785e-06, - "loss": 11.543, + "loss": 16.3281, "step": 1800 }, { "epoch": 6.432142857142857, - "grad_norm": 47.66754913330078, + "grad_norm": 59.65317153930664, "learning_rate": 3.4016592544210937e-06, - "loss": 9.9883, + "loss": 10.6895, "step": 1801 }, { "epoch": 6.435714285714286, - "grad_norm": 49.02814865112305, + "grad_norm": 823.3919677734375, "learning_rate": 3.3957542441943375e-06, - "loss": 9.0586, + "loss": 11.1484, "step": 1802 }, { "epoch": 6.439285714285714, - "grad_norm": 51.15004348754883, + "grad_norm": 7.740328311920166, "learning_rate": 3.389851727238701e-06, - "loss": 7.8711, + "loss": 6.0859, "step": 1803 }, { "epoch": 6.442857142857143, - "grad_norm": 43.851661682128906, + "grad_norm": 851.7941284179688, "learning_rate": 3.383951712727701e-06, - "loss": 9.9922, + "loss": 14.5469, "step": 1804 }, { "epoch": 6.446428571428571, - "grad_norm": 51.12315368652344, + "grad_norm": 8.271879196166992, "learning_rate": 3.3780542098309653e-06, - "loss": 10.5469, + "loss": 10.1113, "step": 1805 }, { "epoch": 6.45, - "grad_norm": 46.6807746887207, + "grad_norm": 91.08444213867188, "learning_rate": 3.372159227714218e-06, - "loss": 9.293, + "loss": 9.5117, "step": 1806 }, { "epoch": 6.453571428571428, - "grad_norm": 65.33263397216797, + "grad_norm": 1336.1544189453125, "learning_rate": 3.366266775539264e-06, - "loss": 11.6211, + "loss": 18.7656, "step": 1807 }, { "epoch": 6.457142857142857, - "grad_norm": 42.80998611450195, + "grad_norm": 218.9738006591797, "learning_rate": 3.3603768624639786e-06, - "loss": 9.4688, + "loss": 9.125, "step": 1808 }, { "epoch": 6.460714285714285, - "grad_norm": 58.90462875366211, + "grad_norm": 11.207719802856445, "learning_rate": 3.3544894976422904e-06, - "loss": 10.9102, + "loss": 11.3438, "step": 1809 }, { "epoch": 6.464285714285714, - "grad_norm": 46.461429595947266, + "grad_norm": 25.200098037719727, "learning_rate": 3.3486046902241663e-06, - "loss": 8.4297, + "loss": 8.3223, "step": 1810 }, { "epoch": 6.4678571428571425, - "grad_norm": 42.41583251953125, + "grad_norm": 58.32366943359375, "learning_rate": 3.342722449355598e-06, - "loss": 8.1641, + "loss": 7.1406, "step": 1811 }, { "epoch": 6.4714285714285715, - "grad_norm": 62.66763687133789, + "grad_norm": 88.71317291259766, "learning_rate": 3.336842784178591e-06, - "loss": 13.6172, + "loss": 16.791, "step": 1812 }, { "epoch": 6.475, - "grad_norm": 53.854305267333984, + "grad_norm": 923.6730346679688, "learning_rate": 3.330965703831146e-06, - "loss": 9.7344, + "loss": 16.4414, "step": 1813 }, { "epoch": 6.478571428571429, - "grad_norm": 54.623504638671875, + "grad_norm": 941.4225463867188, "learning_rate": 3.325091217447248e-06, - "loss": 12.3633, + "loss": 16.4688, "step": 1814 }, { "epoch": 6.482142857142857, - "grad_norm": 42.16908264160156, + "grad_norm": 152.6041259765625, "learning_rate": 3.3192193341568476e-06, - "loss": 8.8633, + "loss": 10.1289, "step": 1815 }, { "epoch": 6.485714285714286, - "grad_norm": 54.690284729003906, + "grad_norm": 551.3446044921875, "learning_rate": 3.3133500630858507e-06, - "loss": 11.332, + "loss": 18.3398, "step": 1816 }, { "epoch": 6.489285714285714, - "grad_norm": 52.28733825683594, + "grad_norm": 10.687947273254395, "learning_rate": 3.307483413356106e-06, - "loss": 8.3164, + "loss": 7.834, "step": 1817 }, { "epoch": 6.492857142857143, - "grad_norm": 41.34231948852539, + "grad_norm": 1005.6011962890625, "learning_rate": 3.3016193940853857e-06, - "loss": 9.6641, + "loss": 11.9531, "step": 1818 }, { "epoch": 6.496428571428572, - "grad_norm": 55.064151763916016, + "grad_norm": 46.814178466796875, "learning_rate": 3.295758014387375e-06, - "loss": 9.6328, + "loss": 11.0703, "step": 1819 }, { "epoch": 6.5, - "grad_norm": 49.355709075927734, + "grad_norm": 70.38685607910156, "learning_rate": 3.289899283371657e-06, - "loss": 11.1328, + "loss": 12.4121, "step": 1820 }, { "epoch": 6.503571428571428, - "grad_norm": 52.56207275390625, + "grad_norm": 89.73088073730469, "learning_rate": 3.2840432101436945e-06, - "loss": 7.9102, + "loss": 7.3574, "step": 1821 }, { "epoch": 6.507142857142857, - "grad_norm": 40.73325729370117, + "grad_norm": 1042.9569091796875, "learning_rate": 3.2781898038048242e-06, - "loss": 8.5703, + "loss": 16.0664, "step": 1822 }, { "epoch": 6.510714285714286, - "grad_norm": 53.947166442871094, + "grad_norm": 1126.654052734375, "learning_rate": 3.2723390734522374e-06, - "loss": 10.0352, + "loss": 15.8906, "step": 1823 }, { "epoch": 6.514285714285714, - "grad_norm": 48.53689193725586, + "grad_norm": 207.09506225585938, "learning_rate": 3.266491028178964e-06, - "loss": 8.6602, + "loss": 8.5215, "step": 1824 }, { "epoch": 6.517857142857143, - "grad_norm": 41.46699523925781, + "grad_norm": 215.46087646484375, "learning_rate": 3.2606456770738636e-06, - "loss": 10.0156, + "loss": 10.0117, "step": 1825 }, { "epoch": 6.521428571428571, - "grad_norm": 47.30232238769531, + "grad_norm": 773.8084716796875, "learning_rate": 3.2548030292216067e-06, - "loss": 10.0312, + "loss": 13.832, "step": 1826 }, { "epoch": 6.525, - "grad_norm": 62.916542053222656, + "grad_norm": 263.3454284667969, "learning_rate": 3.248963093702663e-06, - "loss": 9.0508, + "loss": 8.8379, "step": 1827 }, { "epoch": 6.5285714285714285, - "grad_norm": 51.91960525512695, + "grad_norm": 902.4046630859375, "learning_rate": 3.2431258795932863e-06, - "loss": 10.9102, + "loss": 16.4766, "step": 1828 }, { "epoch": 6.5321428571428575, - "grad_norm": 49.59056854248047, + "grad_norm": 13.694159507751465, "learning_rate": 3.237291395965503e-06, - "loss": 8.6133, + "loss": 8.0273, "step": 1829 }, { "epoch": 6.535714285714286, - "grad_norm": 55.930274963378906, + "grad_norm": 1058.738037109375, "learning_rate": 3.231459651887093e-06, - "loss": 9.6562, + "loss": 22.5664, "step": 1830 }, { "epoch": 6.539285714285715, - "grad_norm": 62.24993133544922, + "grad_norm": 40.12822341918945, "learning_rate": 3.22563065642158e-06, - "loss": 10.1641, + "loss": 11.6289, "step": 1831 }, { "epoch": 6.542857142857143, - "grad_norm": 54.71732711791992, + "grad_norm": 7.515157222747803, "learning_rate": 3.219804418628216e-06, - "loss": 10.6172, + "loss": 10.5449, "step": 1832 }, { "epoch": 6.546428571428572, - "grad_norm": 48.60704040527344, + "grad_norm": 277.3721923828125, "learning_rate": 3.2139809475619675e-06, - "loss": 8.668, + "loss": 9.2383, "step": 1833 }, { "epoch": 6.55, - "grad_norm": 45.13418197631836, + "grad_norm": 15.44127368927002, "learning_rate": 3.2081602522734987e-06, - "loss": 7.9336, + "loss": 7.1309, "step": 1834 }, { "epoch": 6.553571428571429, - "grad_norm": 55.4351806640625, + "grad_norm": 75.16183471679688, "learning_rate": 3.2023423418091625e-06, - "loss": 12.1133, + "loss": 11.6406, "step": 1835 }, { "epoch": 6.557142857142857, - "grad_norm": 51.31172561645508, + "grad_norm": 253.98077392578125, "learning_rate": 3.1965272252109817e-06, - "loss": 9.4961, + "loss": 9.125, "step": 1836 }, { "epoch": 6.560714285714286, - "grad_norm": 45.25824737548828, + "grad_norm": 836.0795288085938, "learning_rate": 3.1907149115166403e-06, - "loss": 8.7852, + "loss": 11.8867, "step": 1837 }, { "epoch": 6.564285714285714, - "grad_norm": 50.234886169433594, + "grad_norm": 19.762052536010742, "learning_rate": 3.18490540975946e-06, - "loss": 8.3984, + "loss": 6.9062, "step": 1838 }, { "epoch": 6.567857142857143, - "grad_norm": 48.83291244506836, + "grad_norm": 18.149869918823242, "learning_rate": 3.179098728968398e-06, - "loss": 10.5977, + "loss": 12.0547, "step": 1839 }, { "epoch": 6.571428571428571, - "grad_norm": 50.49949264526367, + "grad_norm": 191.10641479492188, "learning_rate": 3.173294878168025e-06, - "loss": 10.0625, + "loss": 10.4453, "step": 1840 }, { "epoch": 6.575, - "grad_norm": 62.109519958496094, + "grad_norm": 409.2178649902344, "learning_rate": 3.167493866378514e-06, - "loss": 9.7891, + "loss": 10.9453, "step": 1841 }, { "epoch": 6.578571428571428, - "grad_norm": 40.08619689941406, + "grad_norm": 187.94374084472656, "learning_rate": 3.161695702615625e-06, - "loss": 9.293, + "loss": 10.0234, "step": 1842 }, { "epoch": 6.582142857142857, - "grad_norm": 42.30028533935547, + "grad_norm": 17.126375198364258, "learning_rate": 3.1559003958906907e-06, - "loss": 9.4727, + "loss": 10.4336, "step": 1843 }, { "epoch": 6.585714285714285, - "grad_norm": 52.644264221191406, + "grad_norm": 150.32273864746094, "learning_rate": 3.150107955210606e-06, - "loss": 9.9414, + "loss": 9.1562, "step": 1844 }, { "epoch": 6.589285714285714, - "grad_norm": 43.016849517822266, + "grad_norm": 35.58272933959961, "learning_rate": 3.1443183895778105e-06, - "loss": 8.7266, + "loss": 8.9609, "step": 1845 }, { "epoch": 6.5928571428571425, - "grad_norm": 49.94986343383789, + "grad_norm": 16.50701904296875, "learning_rate": 3.1385317079902743e-06, - "loss": 11.5508, + "loss": 13.6406, "step": 1846 }, { "epoch": 6.5964285714285715, - "grad_norm": 48.18743133544922, + "grad_norm": 27.22591781616211, "learning_rate": 3.1327479194414867e-06, - "loss": 9.9141, + "loss": 9.7969, "step": 1847 }, { "epoch": 6.6, - "grad_norm": 48.377628326416016, + "grad_norm": 1284.10205078125, "learning_rate": 3.12696703292044e-06, - "loss": 10.418, + "loss": 16.8438, "step": 1848 }, { "epoch": 6.603571428571429, - "grad_norm": 45.022621154785156, + "grad_norm": 28.03920555114746, "learning_rate": 3.1211890574116172e-06, - "loss": 11.0742, + "loss": 12.6367, "step": 1849 }, { "epoch": 6.607142857142857, - "grad_norm": 49.71916580200195, + "grad_norm": 127.08976745605469, "learning_rate": 3.1154140018949743e-06, - "loss": 10.3711, + "loss": 12.4199, "step": 1850 }, { "epoch": 6.610714285714286, - "grad_norm": 42.09014129638672, + "grad_norm": 682.7449340820312, "learning_rate": 3.109641875345932e-06, - "loss": 10.7695, + "loss": 13.6094, "step": 1851 }, { "epoch": 6.614285714285714, - "grad_norm": 46.908382415771484, + "grad_norm": 169.46310424804688, "learning_rate": 3.1038726867353587e-06, - "loss": 8.3242, + "loss": 9.3223, "step": 1852 }, { "epoch": 6.617857142857143, - "grad_norm": 63.779232025146484, + "grad_norm": 859.537353515625, "learning_rate": 3.0981064450295555e-06, - "loss": 11.4102, + "loss": 22.4531, "step": 1853 }, { "epoch": 6.621428571428572, - "grad_norm": 46.99143600463867, + "grad_norm": 31.62457847595215, "learning_rate": 3.092343159190244e-06, - "loss": 11.043, + "loss": 12.2881, "step": 1854 }, { "epoch": 6.625, - "grad_norm": 63.027008056640625, + "grad_norm": 97.884521484375, "learning_rate": 3.0865828381745515e-06, - "loss": 10.418, + "loss": 11.7695, "step": 1855 }, { "epoch": 6.628571428571428, - "grad_norm": 44.806819915771484, + "grad_norm": 131.51268005371094, "learning_rate": 3.0808254909349987e-06, - "loss": 10.0156, + "loss": 9.6289, "step": 1856 }, { "epoch": 6.632142857142857, - "grad_norm": 55.84218978881836, + "grad_norm": 130.49493408203125, "learning_rate": 3.0750711264194834e-06, - "loss": 10.4141, + "loss": 11.9141, "step": 1857 }, { "epoch": 6.635714285714286, - "grad_norm": 57.5880012512207, + "grad_norm": 29.710914611816406, "learning_rate": 3.0693197535712695e-06, - "loss": 7.0273, + "loss": 6.126, "step": 1858 }, { "epoch": 6.639285714285714, - "grad_norm": 58.116241455078125, + "grad_norm": 171.5111846923828, "learning_rate": 3.063571381328967e-06, - "loss": 9.3242, + "loss": 9.7812, "step": 1859 }, { "epoch": 6.642857142857143, - "grad_norm": 47.00972366333008, + "grad_norm": 289.0292053222656, "learning_rate": 3.057826018626527e-06, - "loss": 9.3125, + "loss": 12.1523, "step": 1860 }, { "epoch": 6.646428571428571, - "grad_norm": 43.64870834350586, + "grad_norm": 17.55817222595215, "learning_rate": 3.0520836743932213e-06, - "loss": 8.4414, + "loss": 8.0059, "step": 1861 }, { "epoch": 6.65, - "grad_norm": 49.987091064453125, + "grad_norm": 1084.5736083984375, "learning_rate": 3.0463443575536324e-06, - "loss": 10.5898, + "loss": 14.2109, "step": 1862 }, { "epoch": 6.6535714285714285, - "grad_norm": 51.58626174926758, + "grad_norm": 209.27081298828125, "learning_rate": 3.0406080770276337e-06, - "loss": 12.1328, + "loss": 12.6406, "step": 1863 }, { "epoch": 6.6571428571428575, - "grad_norm": 43.749080657958984, + "grad_norm": 797.3853759765625, "learning_rate": 3.0348748417303826e-06, - "loss": 9.7031, + "loss": 12.3477, "step": 1864 }, { "epoch": 6.660714285714286, - "grad_norm": 62.39121627807617, + "grad_norm": 181.34507751464844, "learning_rate": 3.029144660572304e-06, - "loss": 9.3594, + "loss": 8.082, "step": 1865 }, { "epoch": 6.664285714285715, - "grad_norm": 41.9468994140625, + "grad_norm": 104.76295471191406, "learning_rate": 3.023417542459076e-06, - "loss": 8.0586, + "loss": 9.0605, "step": 1866 }, { "epoch": 6.667857142857143, - "grad_norm": 67.46638488769531, + "grad_norm": 896.0699462890625, "learning_rate": 3.0176934962916127e-06, - "loss": 12.0938, + "loss": 19.1309, "step": 1867 }, { "epoch": 6.671428571428572, - "grad_norm": 56.73579788208008, + "grad_norm": 65.90125274658203, "learning_rate": 3.0119725309660595e-06, - "loss": 8.0117, + "loss": 7.9922, "step": 1868 }, { "epoch": 6.675, - "grad_norm": 48.761253356933594, + "grad_norm": 84.0003890991211, "learning_rate": 3.0062546553737692e-06, - "loss": 9.9746, + "loss": 10.4902, "step": 1869 }, { "epoch": 6.678571428571429, - "grad_norm": 45.86736297607422, + "grad_norm": 690.745361328125, "learning_rate": 3.000539878401296e-06, - "loss": 11.125, + "loss": 17.5, "step": 1870 }, { "epoch": 6.682142857142857, - "grad_norm": 48.47173309326172, + "grad_norm": 420.60595703125, "learning_rate": 2.994828208930375e-06, - "loss": 8.6289, + "loss": 9.2871, "step": 1871 }, { "epoch": 6.685714285714286, - "grad_norm": 60.0002555847168, + "grad_norm": 159.31552124023438, "learning_rate": 2.989119655837913e-06, - "loss": 8.75, + "loss": 9.4219, "step": 1872 }, { "epoch": 6.689285714285714, - "grad_norm": 42.57590103149414, + "grad_norm": 44.5473747253418, "learning_rate": 2.9834142279959754e-06, - "loss": 8.7852, + "loss": 8.3809, "step": 1873 }, { "epoch": 6.692857142857143, - "grad_norm": 38.2464599609375, + "grad_norm": 155.84474182128906, "learning_rate": 2.9777119342717686e-06, - "loss": 7.9883, + "loss": 9.1758, "step": 1874 }, { "epoch": 6.696428571428571, - "grad_norm": 46.831398010253906, + "grad_norm": 158.33494567871094, "learning_rate": 2.9720127835276257e-06, - "loss": 8.5156, + "loss": 20.3984, "step": 1875 }, { "epoch": 6.7, - "grad_norm": 49.2427864074707, + "grad_norm": 1010.9134521484375, "learning_rate": 2.966316784621e-06, - "loss": 11.8203, + "loss": 14.9375, "step": 1876 }, { "epoch": 6.703571428571428, - "grad_norm": 51.12537384033203, + "grad_norm": 22.079877853393555, "learning_rate": 2.960623946404443e-06, - "loss": 7.3906, + "loss": 6.9375, "step": 1877 }, { "epoch": 6.707142857142857, - "grad_norm": 54.67777633666992, + "grad_norm": 4.840839385986328, "learning_rate": 2.9549342777255955e-06, - "loss": 11.168, + "loss": 10.1797, "step": 1878 }, { "epoch": 6.710714285714285, - "grad_norm": 68.72468566894531, + "grad_norm": 10.660902976989746, "learning_rate": 2.949247787427171e-06, - "loss": 12.0156, + "loss": 13.4453, "step": 1879 }, { "epoch": 6.714285714285714, - "grad_norm": 61.74354934692383, + "grad_norm": 1039.3839111328125, "learning_rate": 2.9435644843469434e-06, - "loss": 13.9727, + "loss": 21.9297, "step": 1880 }, { "epoch": 6.7178571428571425, - "grad_norm": 55.475059509277344, + "grad_norm": 156.9852294921875, "learning_rate": 2.9378843773177346e-06, - "loss": 10.0, + "loss": 7.9004, "step": 1881 }, { "epoch": 6.7214285714285715, - "grad_norm": 51.46151351928711, + "grad_norm": 16.691944122314453, "learning_rate": 2.932207475167398e-06, - "loss": 7.9375, + "loss": 7.5273, "step": 1882 }, { "epoch": 6.725, - "grad_norm": 46.63974380493164, + "grad_norm": 1070.3870849609375, "learning_rate": 2.926533786718806e-06, - "loss": 10.4531, + "loss": 12.7969, "step": 1883 }, { "epoch": 6.728571428571429, - "grad_norm": 59.20750045776367, + "grad_norm": 14.556982040405273, "learning_rate": 2.9208633207898372e-06, - "loss": 12.2266, + "loss": 15.2832, "step": 1884 }, { "epoch": 6.732142857142857, - "grad_norm": 64.58248138427734, + "grad_norm": 12.16686725616455, "learning_rate": 2.9151960861933616e-06, - "loss": 14.3086, + "loss": 14.6074, "step": 1885 }, { "epoch": 6.735714285714286, - "grad_norm": 49.47846984863281, + "grad_norm": 661.9479370117188, "learning_rate": 2.9095320917372256e-06, - "loss": 11.4922, + "loss": 27.543, "step": 1886 }, { "epoch": 6.739285714285714, - "grad_norm": 71.73543548583984, + "grad_norm": 12.990812301635742, "learning_rate": 2.9038713462242417e-06, - "loss": 12.8477, + "loss": 14.3672, "step": 1887 }, { "epoch": 6.742857142857143, - "grad_norm": 57.15528869628906, + "grad_norm": 790.8316650390625, "learning_rate": 2.8982138584521734e-06, - "loss": 10.0547, + "loss": 17.9531, "step": 1888 }, { "epoch": 6.746428571428572, - "grad_norm": 54.0948371887207, + "grad_norm": 68.20613861083984, "learning_rate": 2.89255963721372e-06, - "loss": 9.957, + "loss": 10.25, "step": 1889 }, { "epoch": 6.75, - "grad_norm": 55.53627014160156, + "grad_norm": 538.1288452148438, "learning_rate": 2.886908691296504e-06, - "loss": 8.0156, + "loss": 7.5078, "step": 1890 }, { "epoch": 6.753571428571428, - "grad_norm": 48.981449127197266, + "grad_norm": 396.48443603515625, "learning_rate": 2.8812610294830568e-06, - "loss": 8.7695, + "loss": 19.9648, "step": 1891 }, { "epoch": 6.757142857142857, - "grad_norm": 46.38783264160156, + "grad_norm": 74.33063507080078, "learning_rate": 2.8756166605508085e-06, - "loss": 10.7695, + "loss": 12.0859, "step": 1892 }, { "epoch": 6.760714285714286, - "grad_norm": 54.05794143676758, + "grad_norm": 504.9187316894531, "learning_rate": 2.8699755932720703e-06, - "loss": 10.0898, + "loss": 11.0859, "step": 1893 }, { "epoch": 6.764285714285714, - "grad_norm": 61.12387466430664, + "grad_norm": 23.60403823852539, "learning_rate": 2.8643378364140186e-06, - "loss": 10.9258, + "loss": 13.7188, "step": 1894 }, { "epoch": 6.767857142857143, - "grad_norm": 50.96097183227539, + "grad_norm": 797.1279907226562, "learning_rate": 2.8587033987386857e-06, - "loss": 10.7852, + "loss": 15.1797, "step": 1895 }, { "epoch": 6.771428571428571, - "grad_norm": 44.759761810302734, + "grad_norm": 28.334415435791016, "learning_rate": 2.853072289002954e-06, - "loss": 8.9297, + "loss": 7.3633, "step": 1896 }, { "epoch": 6.775, - "grad_norm": 57.9931755065918, + "grad_norm": 60.57717514038086, "learning_rate": 2.8474445159585235e-06, - "loss": 10.2734, + "loss": 8.2793, "step": 1897 }, { "epoch": 6.7785714285714285, - "grad_norm": 55.666141510009766, + "grad_norm": 29.93793296813965, "learning_rate": 2.841820088351912e-06, - "loss": 9.6406, + "loss": 11.9453, "step": 1898 }, { "epoch": 6.7821428571428575, - "grad_norm": 43.167755126953125, + "grad_norm": 21.02042579650879, "learning_rate": 2.8361990149244404e-06, - "loss": 8.5938, + "loss": 8.2422, "step": 1899 }, { "epoch": 6.785714285714286, - "grad_norm": 45.30558776855469, + "grad_norm": 33.1290397644043, "learning_rate": 2.83058130441221e-06, - "loss": 8.2578, + "loss": 8.3164, "step": 1900 }, { "epoch": 6.789285714285715, - "grad_norm": 51.0146484375, + "grad_norm": 84.35741424560547, "learning_rate": 2.8249669655461007e-06, - "loss": 11.8555, + "loss": 12.3398, "step": 1901 }, { "epoch": 6.792857142857143, - "grad_norm": 56.199066162109375, + "grad_norm": 660.461181640625, "learning_rate": 2.8193560070517535e-06, - "loss": 12.4375, + "loss": 14.6289, "step": 1902 }, { "epoch": 6.796428571428572, - "grad_norm": 50.12849426269531, + "grad_norm": 72.39306640625, "learning_rate": 2.813748437649551e-06, - "loss": 9.3242, + "loss": 8.4707, "step": 1903 }, { "epoch": 6.8, - "grad_norm": 42.75407028198242, + "grad_norm": 22.2253360748291, "learning_rate": 2.8081442660546126e-06, - "loss": 8.2148, + "loss": 9.4766, "step": 1904 }, { "epoch": 6.803571428571429, - "grad_norm": 54.81376647949219, + "grad_norm": 89.17495727539062, "learning_rate": 2.8025435009767748e-06, - "loss": 11.4414, + "loss": 11.7656, "step": 1905 }, { "epoch": 6.807142857142857, - "grad_norm": 51.74576950073242, + "grad_norm": 12.424627304077148, "learning_rate": 2.7969461511205807e-06, - "loss": 11.2734, + "loss": 10.751, "step": 1906 }, { "epoch": 6.810714285714286, - "grad_norm": 36.827911376953125, + "grad_norm": 149.8108367919922, "learning_rate": 2.7913522251852663e-06, - "loss": 8.3633, + "loss": 9.5664, "step": 1907 }, { "epoch": 6.814285714285714, - "grad_norm": 49.43166732788086, + "grad_norm": 12.577292442321777, "learning_rate": 2.7857617318647434e-06, - "loss": 9.7578, + "loss": 8.7246, "step": 1908 }, { "epoch": 6.817857142857143, - "grad_norm": 50.876888275146484, + "grad_norm": 448.13800048828125, "learning_rate": 2.7801746798475905e-06, - "loss": 11.0625, + "loss": 11.877, "step": 1909 }, { "epoch": 6.821428571428571, - "grad_norm": 60.221397399902344, + "grad_norm": 50.4626350402832, "learning_rate": 2.774591077817038e-06, - "loss": 10.3477, + "loss": 10.9961, "step": 1910 }, { "epoch": 6.825, - "grad_norm": 61.643985748291016, + "grad_norm": 396.29931640625, "learning_rate": 2.7690109344509563e-06, - "loss": 10.5312, + "loss": 13.3984, "step": 1911 }, { "epoch": 6.828571428571428, - "grad_norm": 50.18294143676758, + "grad_norm": 21.18560791015625, "learning_rate": 2.7634342584218364e-06, - "loss": 8.1367, + "loss": 8.4883, "step": 1912 }, { "epoch": 6.832142857142857, - "grad_norm": 47.47311782836914, + "grad_norm": 10.817098617553711, "learning_rate": 2.757861058396785e-06, - "loss": 10.0078, + "loss": 11.6777, "step": 1913 }, { "epoch": 6.835714285714285, - "grad_norm": 48.29475784301758, + "grad_norm": 35.891883850097656, "learning_rate": 2.752291343037501e-06, - "loss": 10.9102, + "loss": 12.5234, "step": 1914 }, { "epoch": 6.839285714285714, - "grad_norm": 53.10966110229492, + "grad_norm": 986.3176879882812, "learning_rate": 2.746725121000273e-06, - "loss": 7.8828, + "loss": 13.8516, "step": 1915 }, { "epoch": 6.8428571428571425, - "grad_norm": 52.99269485473633, + "grad_norm": 215.3768310546875, "learning_rate": 2.7411624009359592e-06, - "loss": 9.5312, + "loss": 21.2539, "step": 1916 }, { "epoch": 6.8464285714285715, - "grad_norm": 57.08993911743164, + "grad_norm": 155.8316650390625, "learning_rate": 2.7356031914899704e-06, - "loss": 10.293, + "loss": 12.2461, "step": 1917 }, { "epoch": 6.85, - "grad_norm": 50.306514739990234, + "grad_norm": 81.75945281982422, "learning_rate": 2.7300475013022666e-06, - "loss": 10.3633, + "loss": 9.8594, "step": 1918 }, { "epoch": 6.853571428571429, - "grad_norm": 46.976810455322266, + "grad_norm": 124.7396011352539, "learning_rate": 2.7244953390073358e-06, - "loss": 8.0352, + "loss": 8.543, "step": 1919 }, { "epoch": 6.857142857142857, - "grad_norm": 48.30802917480469, + "grad_norm": 36.188411712646484, "learning_rate": 2.718946713234185e-06, - "loss": 10.6016, + "loss": 22.7578, "step": 1920 }, { "epoch": 6.860714285714286, - "grad_norm": 38.41388702392578, + "grad_norm": 26.3350830078125, "learning_rate": 2.7134016326063233e-06, - "loss": 9.1758, + "loss": 9.3672, "step": 1921 }, { "epoch": 6.864285714285714, - "grad_norm": 59.04668426513672, + "grad_norm": 691.7966918945312, "learning_rate": 2.7078601057417497e-06, - "loss": 10.7852, + "loss": 13.3594, "step": 1922 }, { "epoch": 6.867857142857143, - "grad_norm": 59.74335479736328, + "grad_norm": 357.6175537109375, "learning_rate": 2.702322141252941e-06, - "loss": 12.9688, + "loss": 15.0234, "step": 1923 }, { "epoch": 6.871428571428572, - "grad_norm": 44.1722412109375, + "grad_norm": 13.438132286071777, "learning_rate": 2.6967877477468394e-06, - "loss": 7.7344, + "loss": 8.5762, "step": 1924 }, { "epoch": 6.875, - "grad_norm": 58.35015869140625, + "grad_norm": 21.086524963378906, "learning_rate": 2.6912569338248317e-06, - "loss": 8.8711, + "loss": 7.6445, "step": 1925 }, { "epoch": 6.878571428571428, - "grad_norm": 42.75580596923828, + "grad_norm": 158.30734252929688, "learning_rate": 2.685729708082745e-06, - "loss": 7.5586, + "loss": 7.7109, "step": 1926 }, { "epoch": 6.882142857142857, - "grad_norm": 38.24165344238281, + "grad_norm": 173.49929809570312, "learning_rate": 2.6802060791108304e-06, - "loss": 7.209, + "loss": 7.5781, "step": 1927 }, { "epoch": 6.885714285714286, - "grad_norm": 42.917667388916016, + "grad_norm": 1132.27490234375, "learning_rate": 2.674686055493748e-06, - "loss": 8.1992, + "loss": 15.5586, "step": 1928 }, { "epoch": 6.889285714285714, - "grad_norm": 72.39764404296875, + "grad_norm": 732.4373779296875, "learning_rate": 2.6691696458105543e-06, - "loss": 10.0586, + "loss": 10.2988, "step": 1929 }, { "epoch": 6.892857142857143, - "grad_norm": 43.038875579833984, + "grad_norm": 31.82263946533203, "learning_rate": 2.66365685863469e-06, - "loss": 9.6055, + "loss": 10.1367, "step": 1930 }, { "epoch": 6.896428571428571, - "grad_norm": 40.63591003417969, + "grad_norm": 17.871963500976562, "learning_rate": 2.6581477025339645e-06, - "loss": 10.8945, + "loss": 11.1113, "step": 1931 }, { "epoch": 6.9, - "grad_norm": 44.74380874633789, + "grad_norm": 496.0389099121094, "learning_rate": 2.6526421860705474e-06, - "loss": 8.3711, + "loss": 9.8672, "step": 1932 }, { "epoch": 6.9035714285714285, - "grad_norm": 55.73988342285156, + "grad_norm": 40.55010223388672, "learning_rate": 2.6471403178009447e-06, - "loss": 9.5664, + "loss": 11.4883, "step": 1933 }, { "epoch": 6.9071428571428575, - "grad_norm": 52.501861572265625, + "grad_norm": 98.62919616699219, "learning_rate": 2.6416421062759984e-06, - "loss": 11.7617, + "loss": 11.957, "step": 1934 }, { "epoch": 6.910714285714286, - "grad_norm": 44.91633224487305, + "grad_norm": 13.243093490600586, "learning_rate": 2.6361475600408657e-06, - "loss": 11.6523, + "loss": 13.3848, "step": 1935 }, { "epoch": 6.914285714285715, - "grad_norm": 60.55117416381836, + "grad_norm": 74.8520736694336, "learning_rate": 2.6306566876350072e-06, - "loss": 11.2266, + "loss": 12.8535, "step": 1936 }, { "epoch": 6.917857142857143, - "grad_norm": 52.51578140258789, + "grad_norm": 780.4949340820312, "learning_rate": 2.625169497592174e-06, - "loss": 8.9492, + "loss": 11.8711, "step": 1937 }, { "epoch": 6.921428571428572, - "grad_norm": 53.95813751220703, + "grad_norm": 38.522926330566406, "learning_rate": 2.619685998440393e-06, - "loss": 9.9375, + "loss": 10.0078, "step": 1938 }, { "epoch": 6.925, - "grad_norm": 47.066410064697266, + "grad_norm": 910.493408203125, "learning_rate": 2.614206198701958e-06, - "loss": 9.5703, + "loss": 12.6328, "step": 1939 }, { "epoch": 6.928571428571429, - "grad_norm": 44.24982452392578, + "grad_norm": 5.024480819702148, "learning_rate": 2.608730106893411e-06, - "loss": 9.6953, + "loss": 9.8438, "step": 1940 }, { "epoch": 6.932142857142857, - "grad_norm": 69.46598815917969, + "grad_norm": 39.948951721191406, "learning_rate": 2.603257731525527e-06, - "loss": 10.0273, + "loss": 12.3125, "step": 1941 }, { "epoch": 6.935714285714286, - "grad_norm": 47.415733337402344, + "grad_norm": 13.37286376953125, "learning_rate": 2.5977890811033135e-06, - "loss": 8.2969, + "loss": 10.7812, "step": 1942 }, { "epoch": 6.939285714285714, - "grad_norm": 58.071083068847656, + "grad_norm": 139.82080078125, "learning_rate": 2.5923241641259823e-06, - "loss": 11.1055, + "loss": 11.9023, "step": 1943 }, { "epoch": 6.942857142857143, - "grad_norm": 56.389015197753906, + "grad_norm": 1196.23876953125, "learning_rate": 2.5868629890869467e-06, - "loss": 10.1328, + "loss": 15.1582, "step": 1944 }, { "epoch": 6.946428571428571, - "grad_norm": 54.16038513183594, + "grad_norm": 70.02942657470703, "learning_rate": 2.5814055644738013e-06, - "loss": 10.9336, + "loss": 9.4785, "step": 1945 }, { "epoch": 6.95, - "grad_norm": 53.483848571777344, + "grad_norm": 294.8551940917969, "learning_rate": 2.5759518987683154e-06, - "loss": 10.8672, + "loss": 13.0742, "step": 1946 }, { "epoch": 6.953571428571428, - "grad_norm": 53.74272537231445, + "grad_norm": 369.0929870605469, "learning_rate": 2.570502000446412e-06, - "loss": 7.3594, + "loss": 7.9297, "step": 1947 }, { "epoch": 6.957142857142857, - "grad_norm": 54.762210845947266, + "grad_norm": 15.928335189819336, "learning_rate": 2.5650558779781635e-06, - "loss": 8.4453, + "loss": 9.793, "step": 1948 }, { "epoch": 6.960714285714285, - "grad_norm": 42.34233474731445, + "grad_norm": 43.134986877441406, "learning_rate": 2.559613539827772e-06, - "loss": 7.8672, + "loss": 8.6289, "step": 1949 }, { "epoch": 6.964285714285714, - "grad_norm": 57.62413787841797, + "grad_norm": 50.296302795410156, "learning_rate": 2.5541749944535554e-06, - "loss": 9.4375, + "loss": 8.4102, "step": 1950 }, { "epoch": 6.9678571428571425, - "grad_norm": 51.0986328125, + "grad_norm": 84.61518096923828, "learning_rate": 2.5487402503079396e-06, - "loss": 11.3828, + "loss": 13.5156, "step": 1951 }, { "epoch": 6.9714285714285715, - "grad_norm": 46.97901153564453, + "grad_norm": 638.52978515625, "learning_rate": 2.543309315837444e-06, - "loss": 11.0156, + "loss": 14.9648, "step": 1952 }, { "epoch": 6.975, - "grad_norm": 40.150028228759766, + "grad_norm": 108.73402404785156, "learning_rate": 2.5378821994826654e-06, - "loss": 8.582, + "loss": 9.2031, "step": 1953 }, { "epoch": 6.978571428571429, - "grad_norm": 49.147300720214844, + "grad_norm": 21.477113723754883, "learning_rate": 2.532458909678266e-06, - "loss": 9.0, + "loss": 10.8398, "step": 1954 }, { "epoch": 6.982142857142857, - "grad_norm": 49.78001022338867, + "grad_norm": 29.363492965698242, "learning_rate": 2.527039454852963e-06, - "loss": 8.8008, + "loss": 7.1543, "step": 1955 }, { "epoch": 6.985714285714286, - "grad_norm": 61.43871307373047, + "grad_norm": 412.04266357421875, "learning_rate": 2.521623843429512e-06, - "loss": 9.2148, + "loss": 8.4922, "step": 1956 }, { "epoch": 6.989285714285714, - "grad_norm": 59.51691436767578, + "grad_norm": 15.031450271606445, "learning_rate": 2.516212083824697e-06, - "loss": 11.6367, + "loss": 14.1016, "step": 1957 }, { "epoch": 6.992857142857143, - "grad_norm": 59.463077545166016, + "grad_norm": 11.867350578308105, "learning_rate": 2.5108041844493104e-06, - "loss": 8.6758, + "loss": 7.9707, "step": 1958 }, { "epoch": 6.996428571428572, - "grad_norm": 64.82918548583984, + "grad_norm": 28.373775482177734, "learning_rate": 2.5054001537081502e-06, - "loss": 9.4258, + "loss": 10.1465, "step": 1959 }, { "epoch": 7.0, - "grad_norm": 48.44807434082031, + "grad_norm": 19.854145050048828, "learning_rate": 2.5000000000000015e-06, - "loss": 7.875, + "loss": 6.7754, "step": 1960 }, { "epoch": 7.0, - "eval_loss": 9.870070457458496, - "eval_mse": 9.862997678440072, - "eval_runtime": 10.8424, - "eval_samples_per_second": 261.935, - "eval_steps_per_second": 1.383, - "eval_target_0_mse": 18.703786858691405, - "eval_target_1_mse": 10.126804512994605, - "eval_target_2_mse": 5.4208129552185085, - "eval_target_3_mse": 5.200586386855764, + "eval_loss": 12.729208946228027, + "eval_mse": 12.723724467950099, + "eval_runtime": 11.5844, + "eval_samples_per_second": 245.158, + "eval_steps_per_second": 1.295, + "eval_target_0_mse": 37.93679830794139, + "eval_target_1_mse": 8.871865007110888, + "eval_target_2_mse": 2.7380220206874495, + "eval_target_3_mse": 1.3482125360606807, "step": 1960 }, { "epoch": 7.003571428571429, - "grad_norm": 63.351436614990234, + "grad_norm": 26.615774154663086, "learning_rate": 2.494603731717622e-06, - "loss": 12.3906, + "loss": 13.2773, "step": 1961 }, { "epoch": 7.007142857142857, - "grad_norm": 48.5108757019043, + "grad_norm": 883.41064453125, "learning_rate": 2.4892113572477324e-06, - "loss": 8.9336, + "loss": 18.0098, "step": 1962 }, { "epoch": 7.010714285714286, - "grad_norm": 42.428409576416016, + "grad_norm": 1090.418212890625, "learning_rate": 2.483822884971e-06, - "loss": 10.2695, + "loss": 13.7969, "step": 1963 }, { "epoch": 7.014285714285714, - "grad_norm": 40.74365234375, + "grad_norm": 84.444580078125, "learning_rate": 2.4784383232620297e-06, - "loss": 9.1445, + "loss": 9.4297, "step": 1964 }, { "epoch": 7.017857142857143, - "grad_norm": 44.30000686645508, + "grad_norm": 14.660280227661133, "learning_rate": 2.473057680489348e-06, - "loss": 10.2773, + "loss": 11.3613, "step": 1965 }, { "epoch": 7.021428571428571, - "grad_norm": 47.44732666015625, + "grad_norm": 25.334270477294922, "learning_rate": 2.467680965015387e-06, - "loss": 8.5781, + "loss": 9.3555, "step": 1966 }, { "epoch": 7.025, - "grad_norm": 55.073936462402344, + "grad_norm": 176.8497314453125, "learning_rate": 2.462308185196481e-06, - "loss": 10.0195, + "loss": 9.3945, "step": 1967 }, { "epoch": 7.0285714285714285, - "grad_norm": 58.727638244628906, + "grad_norm": 437.9122009277344, "learning_rate": 2.4569393493828433e-06, - "loss": 10.5781, + "loss": 19.9023, "step": 1968 }, { "epoch": 7.0321428571428575, - "grad_norm": 42.37130355834961, + "grad_norm": 539.8684692382812, "learning_rate": 2.4515744659185598e-06, - "loss": 8.1172, + "loss": 8.793, "step": 1969 }, { "epoch": 7.035714285714286, - "grad_norm": 62.23876953125, + "grad_norm": 809.5782470703125, "learning_rate": 2.4462135431415736e-06, - "loss": 9.9883, + "loss": 13.918, "step": 1970 }, { "epoch": 7.039285714285715, - "grad_norm": 53.30149459838867, + "grad_norm": 432.6263427734375, "learning_rate": 2.4408565893836705e-06, - "loss": 10.9727, + "loss": 12.6367, "step": 1971 }, { "epoch": 7.042857142857143, - "grad_norm": 46.96200942993164, + "grad_norm": 905.0035400390625, "learning_rate": 2.43550361297047e-06, - "loss": 9.8203, + "loss": 27.6484, "step": 1972 }, { "epoch": 7.046428571428572, - "grad_norm": 54.683563232421875, + "grad_norm": 92.1772689819336, "learning_rate": 2.4301546222214106e-06, - "loss": 9.7461, + "loss": 12.8789, "step": 1973 }, { "epoch": 7.05, - "grad_norm": 52.28224182128906, + "grad_norm": 168.8818359375, "learning_rate": 2.424809625449729e-06, - "loss": 12.1328, + "loss": 12.8125, "step": 1974 }, { "epoch": 7.053571428571429, - "grad_norm": 47.982505798339844, + "grad_norm": 222.2382354736328, "learning_rate": 2.4194686309624664e-06, - "loss": 8.4141, + "loss": 10.4473, "step": 1975 }, { "epoch": 7.057142857142857, - "grad_norm": 46.61050796508789, + "grad_norm": 260.4507141113281, "learning_rate": 2.4141316470604362e-06, - "loss": 8.6172, + "loss": 20.6289, "step": 1976 }, { "epoch": 7.060714285714286, - "grad_norm": 54.53724670410156, + "grad_norm": 622.87109375, "learning_rate": 2.4087986820382205e-06, - "loss": 8.6523, + "loss": 8.8535, "step": 1977 }, { "epoch": 7.064285714285714, - "grad_norm": 46.894859313964844, + "grad_norm": 91.58675384521484, "learning_rate": 2.403469744184154e-06, - "loss": 6.8516, + "loss": 7.25, "step": 1978 }, { "epoch": 7.067857142857143, - "grad_norm": 64.6785888671875, + "grad_norm": 912.1692504882812, "learning_rate": 2.3981448417803154e-06, - "loss": 8.5703, + "loss": 11.5469, "step": 1979 }, { "epoch": 7.071428571428571, - "grad_norm": 45.66244888305664, + "grad_norm": 1151.236083984375, "learning_rate": 2.39282398310251e-06, - "loss": 9.2852, + "loss": 17.0625, "step": 1980 }, { "epoch": 7.075, - "grad_norm": 48.743621826171875, + "grad_norm": 789.4208984375, "learning_rate": 2.387507176420256e-06, - "loss": 9.4414, + "loss": 13.9453, "step": 1981 }, { "epoch": 7.078571428571428, - "grad_norm": 52.41585159301758, + "grad_norm": 41.62275314331055, "learning_rate": 2.382194429996778e-06, - "loss": 11.668, + "loss": 11.9297, "step": 1982 }, { "epoch": 7.082142857142857, - "grad_norm": 51.873958587646484, + "grad_norm": 10.293851852416992, "learning_rate": 2.376885752088988e-06, - "loss": 10.7656, + "loss": 11.9531, "step": 1983 }, { "epoch": 7.085714285714285, - "grad_norm": 48.01889419555664, + "grad_norm": 547.8676147460938, "learning_rate": 2.371581150947476e-06, - "loss": 8.9688, + "loss": 16.9395, "step": 1984 }, { "epoch": 7.089285714285714, - "grad_norm": 59.357669830322266, + "grad_norm": 207.9541015625, "learning_rate": 2.366280634816496e-06, - "loss": 10.2109, + "loss": 12.4688, "step": 1985 }, { "epoch": 7.0928571428571425, - "grad_norm": 45.54422378540039, + "grad_norm": 429.0995178222656, "learning_rate": 2.3609842119339533e-06, - "loss": 8.6797, + "loss": 10.0215, "step": 1986 }, { "epoch": 7.0964285714285715, - "grad_norm": 46.90761184692383, + "grad_norm": 143.94677734375, "learning_rate": 2.3556918905313897e-06, - "loss": 9.9453, + "loss": 11.8594, "step": 1987 }, { "epoch": 7.1, - "grad_norm": 58.20370101928711, + "grad_norm": 713.7937622070312, "learning_rate": 2.3504036788339763e-06, - "loss": 12.4883, + "loss": 16.6875, "step": 1988 }, { "epoch": 7.103571428571429, - "grad_norm": 48.187137603759766, + "grad_norm": 111.3718490600586, "learning_rate": 2.3451195850604913e-06, - "loss": 11.0117, + "loss": 11.6836, "step": 1989 }, { "epoch": 7.107142857142857, - "grad_norm": 66.80733489990234, + "grad_norm": 57.836063385009766, "learning_rate": 2.339839617423318e-06, - "loss": 7.0469, + "loss": 5.3438, "step": 1990 }, { "epoch": 7.110714285714286, - "grad_norm": 51.86183547973633, + "grad_norm": 8.621505737304688, "learning_rate": 2.3345637841284254e-06, - "loss": 8.457, + "loss": 7.4062, "step": 1991 }, { "epoch": 7.114285714285714, - "grad_norm": 50.80962371826172, + "grad_norm": 567.2972412109375, "learning_rate": 2.3292920933753566e-06, - "loss": 11.2031, + "loss": 14.7969, "step": 1992 }, { "epoch": 7.117857142857143, - "grad_norm": 58.8770866394043, + "grad_norm": 956.7918090820312, "learning_rate": 2.324024553357217e-06, - "loss": 11.6719, + "loss": 21.4609, "step": 1993 }, { "epoch": 7.121428571428571, - "grad_norm": 42.8383674621582, + "grad_norm": 27.522008895874023, "learning_rate": 2.3187611722606616e-06, - "loss": 8.9922, + "loss": 9.0488, "step": 1994 }, { "epoch": 7.125, - "grad_norm": 50.996673583984375, + "grad_norm": 117.21835327148438, "learning_rate": 2.3135019582658803e-06, - "loss": 8.9961, + "loss": 8.7266, "step": 1995 }, { "epoch": 7.128571428571428, - "grad_norm": 45.923030853271484, + "grad_norm": 18.674222946166992, "learning_rate": 2.3082469195465893e-06, - "loss": 10.0234, + "loss": 11.5859, "step": 1996 }, { "epoch": 7.132142857142857, - "grad_norm": 51.73891067504883, + "grad_norm": 204.33265686035156, "learning_rate": 2.30299606427001e-06, - "loss": 8.7812, + "loss": 9.6328, "step": 1997 }, { "epoch": 7.135714285714286, - "grad_norm": 41.77059555053711, + "grad_norm": 95.39877319335938, "learning_rate": 2.297749400596868e-06, - "loss": 7.6445, + "loss": 7.9043, "step": 1998 }, { "epoch": 7.139285714285714, - "grad_norm": 53.344905853271484, + "grad_norm": 76.26758575439453, "learning_rate": 2.2925069366813718e-06, - "loss": 9.207, + "loss": 7.7051, "step": 1999 }, { "epoch": 7.142857142857143, - "grad_norm": 49.7736930847168, + "grad_norm": 302.4071350097656, "learning_rate": 2.2872686806712037e-06, - "loss": 9.1406, + "loss": 21.957, "step": 2000 }, { "epoch": 7.146428571428571, - "grad_norm": 57.1895866394043, + "grad_norm": 848.7487182617188, "learning_rate": 2.2820346407075044e-06, - "loss": 11.5547, + "loss": 13.3184, "step": 2001 }, { "epoch": 7.15, - "grad_norm": 41.262596130371094, + "grad_norm": 228.38870239257812, "learning_rate": 2.2768048249248648e-06, - "loss": 8.3516, + "loss": 8.0703, "step": 2002 }, { "epoch": 7.1535714285714285, - "grad_norm": 45.489158630371094, + "grad_norm": 938.7359008789062, "learning_rate": 2.2715792414513084e-06, - "loss": 9.8086, + "loss": 30.5215, "step": 2003 }, { "epoch": 7.1571428571428575, - "grad_norm": 56.30449676513672, + "grad_norm": 514.1884155273438, "learning_rate": 2.2663578984082826e-06, - "loss": 10.3438, + "loss": 13.0254, "step": 2004 }, { "epoch": 7.160714285714286, - "grad_norm": 51.64695358276367, + "grad_norm": 996.7538452148438, "learning_rate": 2.261140803910644e-06, - "loss": 10.4062, + "loss": 14.7012, "step": 2005 }, { "epoch": 7.164285714285715, - "grad_norm": 63.39439392089844, + "grad_norm": 9.876635551452637, "learning_rate": 2.2559279660666444e-06, - "loss": 8.4805, + "loss": 6.8223, "step": 2006 }, { "epoch": 7.167857142857143, - "grad_norm": 42.560768127441406, + "grad_norm": 158.0379180908203, "learning_rate": 2.2507193929779223e-06, - "loss": 8.332, + "loss": 7.4531, "step": 2007 }, { "epoch": 7.171428571428572, - "grad_norm": 57.913089752197266, + "grad_norm": 1365.9599609375, "learning_rate": 2.245515092739488e-06, - "loss": 11.0703, + "loss": 21.5117, "step": 2008 }, { "epoch": 7.175, - "grad_norm": 46.91106033325195, + "grad_norm": 698.214111328125, "learning_rate": 2.2403150734397095e-06, - "loss": 10.4922, + "loss": 15.5156, "step": 2009 }, { "epoch": 7.178571428571429, - "grad_norm": 50.82299041748047, + "grad_norm": 46.15066146850586, "learning_rate": 2.235119343160303e-06, - "loss": 10.418, + "loss": 11.416, "step": 2010 }, { "epoch": 7.182142857142857, - "grad_norm": 53.172096252441406, + "grad_norm": 60.94255447387695, "learning_rate": 2.2299279099763176e-06, - "loss": 9.6992, + "loss": 9.3945, "step": 2011 }, { "epoch": 7.185714285714286, - "grad_norm": 56.6114501953125, + "grad_norm": 15.648303985595703, "learning_rate": 2.224740781956126e-06, - "loss": 8.2188, + "loss": 8.623, "step": 2012 }, { "epoch": 7.189285714285714, - "grad_norm": 58.05879211425781, + "grad_norm": 500.7865905761719, "learning_rate": 2.219557967161408e-06, - "loss": 12.043, + "loss": 13.4902, "step": 2013 }, { "epoch": 7.192857142857143, - "grad_norm": 48.3296012878418, + "grad_norm": 186.95626831054688, "learning_rate": 2.214379473647139e-06, - "loss": 10.0938, + "loss": 10.7578, "step": 2014 }, { "epoch": 7.196428571428571, - "grad_norm": 53.80365753173828, + "grad_norm": 22.915145874023438, "learning_rate": 2.2092053094615813e-06, - "loss": 10.5195, + "loss": 11.5098, "step": 2015 }, { "epoch": 7.2, - "grad_norm": 47.50804138183594, + "grad_norm": 24.84552574157715, "learning_rate": 2.204035482646267e-06, - "loss": 9.6523, + "loss": 11.1523, "step": 2016 }, { "epoch": 7.203571428571428, - "grad_norm": 40.840606689453125, + "grad_norm": 125.68289184570312, "learning_rate": 2.1988700012359865e-06, - "loss": 8.8242, + "loss": 9.8711, "step": 2017 }, { "epoch": 7.207142857142857, - "grad_norm": 51.00700759887695, + "grad_norm": 8.792232513427734, "learning_rate": 2.1937088732587785e-06, - "loss": 10.082, + "loss": 10.6016, "step": 2018 }, { "epoch": 7.210714285714285, - "grad_norm": 54.85242462158203, + "grad_norm": 59.142024993896484, "learning_rate": 2.1885521067359156e-06, - "loss": 9.1055, + "loss": 10.7812, "step": 2019 }, { "epoch": 7.214285714285714, - "grad_norm": 52.243682861328125, + "grad_norm": 34.948387145996094, "learning_rate": 2.1833997096818897e-06, - "loss": 9.8242, + "loss": 9.792, "step": 2020 }, { "epoch": 7.2178571428571425, - "grad_norm": 57.335819244384766, + "grad_norm": 1052.2783203125, "learning_rate": 2.178251690104406e-06, - "loss": 10.1445, + "loss": 15.5742, "step": 2021 }, { "epoch": 7.2214285714285715, - "grad_norm": 40.44204330444336, + "grad_norm": 24.019699096679688, "learning_rate": 2.17310805600436e-06, - "loss": 7.0664, + "loss": 6.1191, "step": 2022 }, { "epoch": 7.225, - "grad_norm": 48.1353874206543, + "grad_norm": 21.972909927368164, "learning_rate": 2.1679688153758373e-06, - "loss": 9.0586, + "loss": 8.8184, "step": 2023 }, { "epoch": 7.228571428571429, - "grad_norm": 50.90115737915039, + "grad_norm": 159.1180419921875, "learning_rate": 2.162833976206092e-06, - "loss": 8.5469, + "loss": 8.332, "step": 2024 }, { "epoch": 7.232142857142857, - "grad_norm": 59.126644134521484, + "grad_norm": 8.876053810119629, "learning_rate": 2.157703546475539e-06, - "loss": 8.7031, + "loss": 7.5234, "step": 2025 }, { "epoch": 7.235714285714286, - "grad_norm": 54.20848083496094, + "grad_norm": 88.6999282836914, "learning_rate": 2.1525775341577404e-06, - "loss": 13.4258, + "loss": 15.4883, "step": 2026 }, { "epoch": 7.239285714285714, - "grad_norm": 57.22539520263672, + "grad_norm": 64.13837432861328, "learning_rate": 2.1474559472193923e-06, - "loss": 10.0234, + "loss": 9.9121, "step": 2027 }, { "epoch": 7.242857142857143, - "grad_norm": 50.83561325073242, + "grad_norm": 925.4891357421875, "learning_rate": 2.1423387936203125e-06, - "loss": 10.957, + "loss": 17.5234, "step": 2028 }, { "epoch": 7.246428571428572, - "grad_norm": 44.79703140258789, + "grad_norm": 81.99842834472656, "learning_rate": 2.13722608131343e-06, - "loss": 9.1289, + "loss": 9.9844, "step": 2029 }, { "epoch": 7.25, - "grad_norm": 43.77326202392578, + "grad_norm": 12.005550384521484, "learning_rate": 2.132117818244771e-06, - "loss": 8.5586, + "loss": 7.6172, "step": 2030 }, { "epoch": 7.253571428571428, - "grad_norm": 42.20839309692383, + "grad_norm": 10.164466857910156, "learning_rate": 2.1270140123534437e-06, - "loss": 7.582, + "loss": 6.2891, "step": 2031 }, { "epoch": 7.257142857142857, - "grad_norm": 56.11486053466797, + "grad_norm": 233.4937744140625, "learning_rate": 2.1219146715716332e-06, - "loss": 8.4805, + "loss": 7.9648, "step": 2032 }, { "epoch": 7.260714285714286, - "grad_norm": 83.53744506835938, + "grad_norm": 98.5975112915039, "learning_rate": 2.116819803824584e-06, - "loss": 10.293, + "loss": 14.4043, "step": 2033 }, { "epoch": 7.264285714285714, - "grad_norm": 47.66694641113281, + "grad_norm": 665.5526123046875, "learning_rate": 2.1117294170305876e-06, - "loss": 9.1602, + "loss": 18.0, "step": 2034 }, { "epoch": 7.267857142857143, - "grad_norm": 49.74073028564453, + "grad_norm": 23.631071090698242, "learning_rate": 2.1066435191009717e-06, - "loss": 12.2539, + "loss": 13.1836, "step": 2035 }, { "epoch": 7.271428571428571, - "grad_norm": 53.32902526855469, + "grad_norm": 262.7103271484375, "learning_rate": 2.1015621179400893e-06, - "loss": 10.5156, + "loss": 12.5, "step": 2036 }, { "epoch": 7.275, - "grad_norm": 45.40413284301758, + "grad_norm": 577.6365966796875, "learning_rate": 2.096485221445301e-06, - "loss": 9.25, + "loss": 11.3086, "step": 2037 }, { "epoch": 7.2785714285714285, - "grad_norm": 45.199954986572266, + "grad_norm": 153.36549377441406, "learning_rate": 2.0914128375069724e-06, - "loss": 7.3203, + "loss": 6.3398, "step": 2038 }, { "epoch": 7.2821428571428575, - "grad_norm": 46.47087097167969, + "grad_norm": 31.084224700927734, "learning_rate": 2.086344974008448e-06, - "loss": 8.7031, + "loss": 8.8867, "step": 2039 }, { "epoch": 7.285714285714286, - "grad_norm": 62.2796630859375, + "grad_norm": 738.989990234375, "learning_rate": 2.081281638826052e-06, - "loss": 11.1367, + "loss": 25.2109, "step": 2040 }, { "epoch": 7.289285714285715, - "grad_norm": 45.831199645996094, + "grad_norm": 125.4146499633789, "learning_rate": 2.0762228398290696e-06, - "loss": 8.4023, + "loss": 7.5332, "step": 2041 }, { "epoch": 7.292857142857143, - "grad_norm": 49.61810302734375, + "grad_norm": 388.1852722167969, "learning_rate": 2.071168584879736e-06, - "loss": 9.1992, + "loss": 11.5625, "step": 2042 }, { "epoch": 7.296428571428572, - "grad_norm": 50.0167350769043, + "grad_norm": 115.833251953125, "learning_rate": 2.0661188818332257e-06, - "loss": 7.875, + "loss": 7.877, "step": 2043 }, { "epoch": 7.3, - "grad_norm": 44.27167892456055, + "grad_norm": 36.304927825927734, "learning_rate": 2.061073738537635e-06, - "loss": 9.4531, + "loss": 9.5371, "step": 2044 }, { "epoch": 7.303571428571429, - "grad_norm": 50.59127426147461, + "grad_norm": 639.1361694335938, "learning_rate": 2.056033162833977e-06, - "loss": 10.5781, + "loss": 14.1133, "step": 2045 }, { "epoch": 7.307142857142857, - "grad_norm": 48.18633270263672, + "grad_norm": 638.9813842773438, "learning_rate": 2.050997162556166e-06, - "loss": 9.707, + "loss": 15.0801, "step": 2046 }, { "epoch": 7.310714285714286, - "grad_norm": 57.87864303588867, + "grad_norm": 35.78220748901367, "learning_rate": 2.0459657455310013e-06, - "loss": 9.6172, + "loss": 10.3516, "step": 2047 }, { "epoch": 7.314285714285714, - "grad_norm": 43.35893249511719, + "grad_norm": 15.11691665649414, "learning_rate": 2.0409389195781627e-06, - "loss": 10.1094, + "loss": 9.6992, "step": 2048 }, { "epoch": 7.317857142857143, - "grad_norm": 57.04454803466797, + "grad_norm": 65.2090835571289, "learning_rate": 2.035916692510195e-06, - "loss": 10.6953, + "loss": 10.4805, "step": 2049 }, { "epoch": 7.321428571428571, - "grad_norm": 45.509063720703125, + "grad_norm": 77.36616516113281, "learning_rate": 2.030899072132493e-06, - "loss": 8.6406, + "loss": 9.0762, "step": 2050 }, { "epoch": 7.325, - "grad_norm": 51.84846115112305, + "grad_norm": 5.754606246948242, "learning_rate": 2.0258860662432946e-06, - "loss": 12.1406, + "loss": 13.6211, "step": 2051 }, { "epoch": 7.328571428571428, - "grad_norm": 44.216407775878906, + "grad_norm": 160.74359130859375, "learning_rate": 2.0208776826336617e-06, - "loss": 9.3438, + "loss": 11.8281, "step": 2052 }, { "epoch": 7.332142857142857, - "grad_norm": 64.21390533447266, + "grad_norm": 49.02817153930664, "learning_rate": 2.0158739290874822e-06, - "loss": 8.3047, + "loss": 8.4414, "step": 2053 }, { "epoch": 7.335714285714285, - "grad_norm": 56.0959358215332, + "grad_norm": 26.19078254699707, "learning_rate": 2.0108748133814347e-06, - "loss": 8.3203, + "loss": 7.7988, "step": 2054 }, { "epoch": 7.339285714285714, - "grad_norm": 57.921268463134766, + "grad_norm": 233.6194305419922, "learning_rate": 2.0058803432849987e-06, - "loss": 8.7148, + "loss": 8.7598, "step": 2055 }, { "epoch": 7.3428571428571425, - "grad_norm": 43.13523483276367, + "grad_norm": 122.44649505615234, "learning_rate": 2.0008905265604316e-06, - "loss": 9.3594, + "loss": 10.7656, "step": 2056 }, { "epoch": 7.3464285714285715, - "grad_norm": 49.41657257080078, + "grad_norm": 104.89433288574219, "learning_rate": 1.9959053709627576e-06, - "loss": 11.3164, + "loss": 11.9082, "step": 2057 }, { "epoch": 7.35, - "grad_norm": 53.06032180786133, + "grad_norm": 16.9726505279541, "learning_rate": 1.990924884239758e-06, - "loss": 7.7734, + "loss": 8.2012, "step": 2058 }, { "epoch": 7.353571428571429, - "grad_norm": 53.045806884765625, + "grad_norm": 128.5762939453125, "learning_rate": 1.9859490741319574e-06, - "loss": 8.6992, + "loss": 9.0039, "step": 2059 }, { "epoch": 7.357142857142857, - "grad_norm": 45.809844970703125, + "grad_norm": 898.1134643554688, "learning_rate": 1.980977948372612e-06, - "loss": 8.2578, + "loss": 11.9297, "step": 2060 }, { "epoch": 7.360714285714286, - "grad_norm": 48.863380432128906, + "grad_norm": 41.402130126953125, "learning_rate": 1.9760115146877e-06, - "loss": 8.6445, + "loss": 6.959, "step": 2061 }, { "epoch": 7.364285714285714, - "grad_norm": 47.282196044921875, + "grad_norm": 27.58127784729004, "learning_rate": 1.971049780795901e-06, - "loss": 9.2578, + "loss": 8.5137, "step": 2062 }, { "epoch": 7.367857142857143, - "grad_norm": 56.848567962646484, + "grad_norm": 1107.053955078125, "learning_rate": 1.9660927544085967e-06, - "loss": 8.5508, + "loss": 11.3906, "step": 2063 }, { "epoch": 7.371428571428572, - "grad_norm": 49.91100311279297, + "grad_norm": 6.424514293670654, "learning_rate": 1.9611404432298505e-06, - "loss": 7.7656, + "loss": 7.4473, "step": 2064 }, { "epoch": 7.375, - "grad_norm": 56.56538772583008, + "grad_norm": 1510.45263671875, "learning_rate": 1.956192854956397e-06, - "loss": 9.668, + "loss": 44.9961, "step": 2065 }, { "epoch": 7.378571428571428, - "grad_norm": 55.09542465209961, + "grad_norm": 57.634605407714844, "learning_rate": 1.9512499972776303e-06, - "loss": 8.7734, + "loss": 11.0312, "step": 2066 }, { "epoch": 7.382142857142857, - "grad_norm": 56.163917541503906, + "grad_norm": 33.63594055175781, "learning_rate": 1.9463118778755946e-06, - "loss": 9.3281, + "loss": 11.3047, "step": 2067 }, { "epoch": 7.385714285714286, - "grad_norm": 47.06993103027344, + "grad_norm": 251.27027893066406, "learning_rate": 1.941378504424968e-06, - "loss": 10.6328, + "loss": 12.5312, "step": 2068 }, { "epoch": 7.389285714285714, - "grad_norm": 56.117271423339844, + "grad_norm": 32.23177719116211, "learning_rate": 1.9364498845930534e-06, - "loss": 7.6406, + "loss": 7.6055, "step": 2069 }, { "epoch": 7.392857142857143, - "grad_norm": 46.762413024902344, + "grad_norm": 31.434009552001953, "learning_rate": 1.9315260260397638e-06, - "loss": 8.9492, + "loss": 9.4414, "step": 2070 }, { "epoch": 7.396428571428571, - "grad_norm": 44.66630554199219, + "grad_norm": 18.753795623779297, "learning_rate": 1.9266069364176144e-06, - "loss": 8.5547, + "loss": 9.0098, "step": 2071 }, { "epoch": 7.4, - "grad_norm": 47.76073455810547, + "grad_norm": 824.9378662109375, "learning_rate": 1.9216926233717087e-06, - "loss": 9.2188, + "loss": 13.9141, "step": 2072 }, { "epoch": 7.4035714285714285, - "grad_norm": 51.81515884399414, + "grad_norm": 1172.97021484375, "learning_rate": 1.9167830945397263e-06, - "loss": 10.6797, + "loss": 21.8945, "step": 2073 }, { "epoch": 7.4071428571428575, - "grad_norm": 43.56839370727539, + "grad_norm": 760.8384399414062, "learning_rate": 1.911878357551911e-06, - "loss": 8.1445, + "loss": 15.9434, "step": 2074 }, { "epoch": 7.410714285714286, - "grad_norm": 45.4836540222168, + "grad_norm": 68.5196304321289, "learning_rate": 1.9069784200310592e-06, - "loss": 10.6133, + "loss": 9.4629, "step": 2075 }, { "epoch": 7.414285714285715, - "grad_norm": 58.65183639526367, + "grad_norm": 28.114192962646484, "learning_rate": 1.902083289592509e-06, - "loss": 9.7461, + "loss": 8.918, "step": 2076 }, { "epoch": 7.417857142857143, - "grad_norm": 58.53528594970703, + "grad_norm": 60.8482666015625, "learning_rate": 1.8971929738441275e-06, - "loss": 15.9922, + "loss": 17.7969, "step": 2077 }, { "epoch": 7.421428571428572, - "grad_norm": 49.79013442993164, + "grad_norm": 46.012916564941406, "learning_rate": 1.892307480386295e-06, - "loss": 11.3164, + "loss": 10.6348, "step": 2078 }, { "epoch": 7.425, - "grad_norm": 47.56153869628906, + "grad_norm": 341.8997497558594, "learning_rate": 1.887426816811903e-06, - "loss": 8.3516, + "loss": 8.4395, "step": 2079 }, { "epoch": 7.428571428571429, - "grad_norm": 49.558624267578125, + "grad_norm": 71.5573959350586, "learning_rate": 1.8825509907063328e-06, - "loss": 8.1094, + "loss": 8.0938, "step": 2080 }, { "epoch": 7.432142857142857, - "grad_norm": 51.29070281982422, + "grad_norm": 677.493408203125, "learning_rate": 1.877680009647449e-06, - "loss": 11.9102, + "loss": 20.2305, "step": 2081 }, { "epoch": 7.435714285714286, - "grad_norm": 53.199119567871094, + "grad_norm": 759.8582763671875, "learning_rate": 1.8728138812055863e-06, - "loss": 9.1133, + "loss": 18.3086, "step": 2082 }, { "epoch": 7.439285714285714, - "grad_norm": 64.47129821777344, + "grad_norm": 333.6936340332031, "learning_rate": 1.867952612943536e-06, - "loss": 11.4453, + "loss": 12.2188, "step": 2083 }, { "epoch": 7.442857142857143, - "grad_norm": 53.561676025390625, + "grad_norm": 25.52960777282715, "learning_rate": 1.8630962124165376e-06, - "loss": 14.4023, + "loss": 16.0762, "step": 2084 }, { "epoch": 7.446428571428571, - "grad_norm": 51.50230407714844, + "grad_norm": 22.25140380859375, "learning_rate": 1.8582446871722637e-06, - "loss": 13.0977, + "loss": 15.9746, "step": 2085 }, { "epoch": 7.45, - "grad_norm": 52.827606201171875, + "grad_norm": 124.75507354736328, "learning_rate": 1.8533980447508138e-06, - "loss": 9.4531, + "loss": 10.0234, "step": 2086 }, { "epoch": 7.453571428571428, - "grad_norm": 55.584537506103516, + "grad_norm": 72.1204605102539, "learning_rate": 1.8485562926846917e-06, - "loss": 9.2109, + "loss": 9.7441, "step": 2087 }, { "epoch": 7.457142857142857, - "grad_norm": 43.79344940185547, + "grad_norm": 35.36233139038086, "learning_rate": 1.843719438498806e-06, - "loss": 9.2539, + "loss": 9.6953, "step": 2088 }, { "epoch": 7.460714285714285, - "grad_norm": 54.40840148925781, + "grad_norm": 17.136817932128906, "learning_rate": 1.838887489710452e-06, - "loss": 11.6016, + "loss": 12.4141, "step": 2089 }, { "epoch": 7.464285714285714, - "grad_norm": 41.03867721557617, + "grad_norm": 103.83271026611328, "learning_rate": 1.8340604538293017e-06, - "loss": 7.8008, + "loss": 6.7197, "step": 2090 }, { "epoch": 7.4678571428571425, - "grad_norm": 54.62271499633789, + "grad_norm": 178.6735076904297, "learning_rate": 1.8292383383573898e-06, - "loss": 10.3125, + "loss": 11.1504, "step": 2091 }, { "epoch": 7.4714285714285715, - "grad_norm": 46.10812759399414, + "grad_norm": 789.8245239257812, "learning_rate": 1.8244211507891064e-06, - "loss": 10.793, + "loss": 14.2363, "step": 2092 }, { "epoch": 7.475, - "grad_norm": 50.24759292602539, + "grad_norm": 13.563408851623535, "learning_rate": 1.8196088986111798e-06, - "loss": 8.5469, + "loss": 9.043, "step": 2093 }, { "epoch": 7.478571428571429, - "grad_norm": 43.41456604003906, + "grad_norm": 14.514375686645508, "learning_rate": 1.8148015893026727e-06, - "loss": 8.6484, + "loss": 8.752, "step": 2094 }, { "epoch": 7.482142857142857, - "grad_norm": 49.78636169433594, + "grad_norm": 28.239742279052734, "learning_rate": 1.809999230334958e-06, - "loss": 8.6055, + "loss": 9.0195, "step": 2095 }, { "epoch": 7.485714285714286, - "grad_norm": 54.846473693847656, + "grad_norm": 135.65805053710938, "learning_rate": 1.8052018291717216e-06, - "loss": 6.625, + "loss": 5.6406, "step": 2096 }, { "epoch": 7.489285714285714, - "grad_norm": 48.348060607910156, + "grad_norm": 30.534257888793945, "learning_rate": 1.8004093932689415e-06, - "loss": 11.5742, + "loss": 12.2617, "step": 2097 }, { "epoch": 7.492857142857143, - "grad_norm": 63.018035888671875, + "grad_norm": 116.61524963378906, "learning_rate": 1.7956219300748796e-06, - "loss": 12.3906, + "loss": 14.8477, "step": 2098 }, { "epoch": 7.496428571428572, - "grad_norm": 36.80154037475586, + "grad_norm": 100.53321075439453, "learning_rate": 1.7908394470300694e-06, - "loss": 8.3711, + "loss": 9.7578, "step": 2099 }, { "epoch": 7.5, - "grad_norm": 50.149986267089844, + "grad_norm": 49.59261703491211, "learning_rate": 1.7860619515673034e-06, - "loss": 10.1602, + "loss": 10.1797, "step": 2100 }, { "epoch": 7.503571428571428, - "grad_norm": 52.04157257080078, + "grad_norm": 434.8240051269531, "learning_rate": 1.7812894511116236e-06, - "loss": 9.8633, + "loss": 13.4453, "step": 2101 }, { "epoch": 7.507142857142857, - "grad_norm": 54.08245086669922, + "grad_norm": 21.013368606567383, "learning_rate": 1.7765219530803101e-06, - "loss": 9.3828, + "loss": 9.6836, "step": 2102 }, { "epoch": 7.510714285714286, - "grad_norm": 53.15264129638672, + "grad_norm": 103.0265884399414, "learning_rate": 1.7717594648828633e-06, - "loss": 8.3789, + "loss": 9.582, "step": 2103 }, { "epoch": 7.514285714285714, - "grad_norm": 50.72862243652344, + "grad_norm": 16.0716609954834, "learning_rate": 1.7670019939210025e-06, - "loss": 10.2891, + "loss": 11.9922, "step": 2104 }, { "epoch": 7.517857142857143, - "grad_norm": 63.247283935546875, + "grad_norm": 104.5992202758789, "learning_rate": 1.7622495475886486e-06, - "loss": 13.1836, + "loss": 15.0039, "step": 2105 }, { "epoch": 7.521428571428571, - "grad_norm": 52.4545783996582, + "grad_norm": 603.4968872070312, "learning_rate": 1.7575021332719117e-06, - "loss": 10.0234, + "loss": 13.5312, "step": 2106 }, { "epoch": 7.525, - "grad_norm": 45.21814727783203, + "grad_norm": 14.599879264831543, "learning_rate": 1.7527597583490825e-06, - "loss": 8.8398, + "loss": 9.5059, "step": 2107 }, { "epoch": 7.5285714285714285, - "grad_norm": 47.38611602783203, + "grad_norm": 509.5621032714844, "learning_rate": 1.748022430190619e-06, - "loss": 10.4922, + "loss": 13.625, "step": 2108 }, { "epoch": 7.5321428571428575, - "grad_norm": 53.635040283203125, + "grad_norm": 172.2183074951172, "learning_rate": 1.7432901561591365e-06, - "loss": 8.9727, + "loss": 9.2129, "step": 2109 }, { "epoch": 7.535714285714286, - "grad_norm": 41.75260543823242, + "grad_norm": 17.159360885620117, "learning_rate": 1.7385629436093958e-06, - "loss": 8.543, + "loss": 7.8535, "step": 2110 }, { "epoch": 7.539285714285715, - "grad_norm": 46.87645721435547, + "grad_norm": 833.8939819335938, "learning_rate": 1.7338407998882878e-06, - "loss": 10.0117, + "loss": 16.3203, "step": 2111 }, { "epoch": 7.542857142857143, - "grad_norm": 45.71580123901367, + "grad_norm": 9.838553428649902, "learning_rate": 1.7291237323348287e-06, - "loss": 9.4492, + "loss": 9.2617, "step": 2112 }, { "epoch": 7.546428571428572, - "grad_norm": 55.01573944091797, + "grad_norm": 10.59164047241211, "learning_rate": 1.724411748280146e-06, - "loss": 11.6719, + "loss": 12.6836, "step": 2113 }, { "epoch": 7.55, - "grad_norm": 55.98125076293945, + "grad_norm": 739.369384765625, "learning_rate": 1.7197048550474643e-06, - "loss": 7.3555, + "loss": 8.1094, "step": 2114 }, { "epoch": 7.553571428571429, - "grad_norm": 39.23920822143555, + "grad_norm": 295.9716796875, "learning_rate": 1.7150030599520984e-06, - "loss": 8.3125, + "loss": 7.9746, "step": 2115 }, { "epoch": 7.557142857142857, - "grad_norm": 48.62102508544922, + "grad_norm": 57.862571716308594, "learning_rate": 1.7103063703014372e-06, - "loss": 11.0938, + "loss": 12.8867, "step": 2116 }, { "epoch": 7.560714285714286, - "grad_norm": 50.21400833129883, + "grad_norm": 87.3856430053711, "learning_rate": 1.705614793394938e-06, - "loss": 9.8711, + "loss": 9.4727, "step": 2117 }, { "epoch": 7.564285714285714, - "grad_norm": 52.85794448852539, + "grad_norm": 42.17456817626953, "learning_rate": 1.7009283365241086e-06, - "loss": 7.9922, + "loss": 6.9824, "step": 2118 }, { "epoch": 7.567857142857143, - "grad_norm": 50.42074203491211, + "grad_norm": 1212.111328125, "learning_rate": 1.6962470069725046e-06, - "loss": 9.0078, + "loss": 12.4219, "step": 2119 }, { "epoch": 7.571428571428571, - "grad_norm": 58.07779312133789, + "grad_norm": 1067.46875, "learning_rate": 1.6915708120157042e-06, - "loss": 12.3125, + "loss": 18.3965, "step": 2120 }, { "epoch": 7.575, - "grad_norm": 49.82625961303711, + "grad_norm": 591.1502685546875, "learning_rate": 1.6868997589213138e-06, - "loss": 11.0469, + "loss": 12.4219, "step": 2121 }, { "epoch": 7.578571428571428, - "grad_norm": 50.90169143676758, + "grad_norm": 26.44688606262207, "learning_rate": 1.6822338549489447e-06, - "loss": 10.0586, + "loss": 11.7734, "step": 2122 }, { "epoch": 7.582142857142857, - "grad_norm": 48.816619873046875, + "grad_norm": 91.77942657470703, "learning_rate": 1.6775731073502066e-06, - "loss": 10.0273, + "loss": 21.0566, "step": 2123 }, { "epoch": 7.585714285714285, - "grad_norm": 64.08169555664062, + "grad_norm": 275.1936340332031, "learning_rate": 1.6729175233686957e-06, - "loss": 11.9609, + "loss": 14.8047, "step": 2124 }, { "epoch": 7.589285714285714, - "grad_norm": 46.999237060546875, + "grad_norm": 1057.817626953125, "learning_rate": 1.6682671102399806e-06, - "loss": 9.4922, + "loss": 13.3594, "step": 2125 }, { "epoch": 7.5928571428571425, - "grad_norm": 58.80659103393555, + "grad_norm": 973.9456787109375, "learning_rate": 1.6636218751915973e-06, - "loss": 10.1094, + "loss": 15.5938, "step": 2126 }, { "epoch": 7.5964285714285715, - "grad_norm": 47.602760314941406, + "grad_norm": 589.4563598632812, "learning_rate": 1.6589818254430323e-06, - "loss": 9.7969, + "loss": 11.6113, "step": 2127 }, { "epoch": 7.6, - "grad_norm": 56.82249450683594, + "grad_norm": 28.651220321655273, "learning_rate": 1.6543469682057105e-06, - "loss": 12.7305, + "loss": 14.373, "step": 2128 }, { "epoch": 7.603571428571429, - "grad_norm": 54.81881332397461, + "grad_norm": 77.83538055419922, "learning_rate": 1.6497173106829912e-06, - "loss": 9.7539, + "loss": 10.5527, "step": 2129 }, { "epoch": 7.607142857142857, - "grad_norm": 57.32584762573242, + "grad_norm": 38.56644058227539, "learning_rate": 1.6450928600701505e-06, - "loss": 7.6484, + "loss": 6.6016, "step": 2130 }, { "epoch": 7.610714285714286, - "grad_norm": 53.000640869140625, + "grad_norm": 39.71200942993164, "learning_rate": 1.6404736235543705e-06, - "loss": 11.5859, + "loss": 12.4453, "step": 2131 }, { "epoch": 7.614285714285714, - "grad_norm": 45.20490264892578, + "grad_norm": 36.246829986572266, "learning_rate": 1.6358596083147342e-06, - "loss": 8.9492, + "loss": 8.7031, "step": 2132 }, { "epoch": 7.617857142857143, - "grad_norm": 42.30289077758789, + "grad_norm": 613.4480590820312, "learning_rate": 1.6312508215222085e-06, - "loss": 8.3594, + "loss": 17.1875, "step": 2133 }, { "epoch": 7.621428571428572, - "grad_norm": 56.27920913696289, + "grad_norm": 37.32528305053711, "learning_rate": 1.6266472703396286e-06, - "loss": 9.3047, + "loss": 12.6523, "step": 2134 }, { "epoch": 7.625, - "grad_norm": 58.00720977783203, + "grad_norm": 31.81368637084961, "learning_rate": 1.6220489619216988e-06, - "loss": 10.1172, + "loss": 10.4824, "step": 2135 }, { "epoch": 7.628571428571428, - "grad_norm": 68.54242706298828, + "grad_norm": 32.60926055908203, "learning_rate": 1.617455903414974e-06, - "loss": 11.9023, + "loss": 14.6699, "step": 2136 }, { "epoch": 7.632142857142857, - "grad_norm": 73.51898193359375, + "grad_norm": 895.7269287109375, "learning_rate": 1.612868101957849e-06, - "loss": 13.3594, + "loss": 23.8281, "step": 2137 }, { "epoch": 7.635714285714286, - "grad_norm": 53.65211868286133, + "grad_norm": 404.4788513183594, "learning_rate": 1.6082855646805485e-06, - "loss": 11.9102, + "loss": 14.5781, "step": 2138 }, { "epoch": 7.639285714285714, - "grad_norm": 50.65534973144531, + "grad_norm": 26.31083869934082, "learning_rate": 1.6037082987051162e-06, - "loss": 10.707, + "loss": 9.9492, "step": 2139 }, { "epoch": 7.642857142857143, - "grad_norm": 54.172454833984375, + "grad_norm": 778.1361694335938, "learning_rate": 1.5991363111454023e-06, - "loss": 8.7422, + "loss": 12.1133, "step": 2140 }, { "epoch": 7.646428571428571, - "grad_norm": 42.59855270385742, + "grad_norm": 792.2111206054688, "learning_rate": 1.5945696091070551e-06, - "loss": 9.6445, + "loss": 15.4375, "step": 2141 }, { "epoch": 7.65, - "grad_norm": 47.974937438964844, + "grad_norm": 1149.3280029296875, "learning_rate": 1.5900081996875083e-06, - "loss": 10.4258, + "loss": 11.9004, "step": 2142 }, { "epoch": 7.6535714285714285, - "grad_norm": 43.38620376586914, + "grad_norm": 158.06861877441406, "learning_rate": 1.5854520899759656e-06, - "loss": 9.4414, + "loss": 10.2773, "step": 2143 }, { "epoch": 7.6571428571428575, - "grad_norm": 49.58203125, + "grad_norm": 229.0572509765625, "learning_rate": 1.5809012870533996e-06, - "loss": 11.0234, + "loss": 12.1113, "step": 2144 }, { "epoch": 7.660714285714286, - "grad_norm": 55.872398376464844, + "grad_norm": 179.10079956054688, "learning_rate": 1.5763557979925326e-06, - "loss": 11.1055, + "loss": 23.168, "step": 2145 }, { "epoch": 7.664285714285715, - "grad_norm": 65.92535400390625, + "grad_norm": 168.8335418701172, "learning_rate": 1.571815629857829e-06, - "loss": 10.7148, + "loss": 13.918, "step": 2146 }, { "epoch": 7.667857142857143, - "grad_norm": 47.230224609375, + "grad_norm": 49.97970962524414, "learning_rate": 1.567280789705483e-06, - "loss": 8.3125, + "loss": 8.6406, "step": 2147 }, { "epoch": 7.671428571428572, - "grad_norm": 44.411800384521484, + "grad_norm": 73.37007141113281, "learning_rate": 1.5627512845834092e-06, - "loss": 9.3711, + "loss": 10.2441, "step": 2148 }, { "epoch": 7.675, - "grad_norm": 40.1270751953125, + "grad_norm": 9.804173469543457, "learning_rate": 1.5582271215312294e-06, - "loss": 8.2305, + "loss": 8.1523, "step": 2149 }, { "epoch": 7.678571428571429, - "grad_norm": 46.97914123535156, + "grad_norm": 24.992942810058594, "learning_rate": 1.553708307580265e-06, - "loss": 10.0312, + "loss": 9.8867, "step": 2150 }, { "epoch": 7.682142857142857, - "grad_norm": 50.5496940612793, + "grad_norm": 40.950775146484375, "learning_rate": 1.5491948497535199e-06, - "loss": 8.4375, + "loss": 8.9062, "step": 2151 }, { "epoch": 7.685714285714286, - "grad_norm": 55.00469970703125, + "grad_norm": 48.35763931274414, "learning_rate": 1.544686755065677e-06, - "loss": 11.7422, + "loss": 13.8516, "step": 2152 }, { "epoch": 7.689285714285714, - "grad_norm": 54.16765213012695, + "grad_norm": 8.335076332092285, "learning_rate": 1.5401840305230837e-06, - "loss": 11.7773, + "loss": 12.7578, "step": 2153 }, { "epoch": 7.692857142857143, - "grad_norm": 79.04151153564453, + "grad_norm": 375.4282531738281, "learning_rate": 1.535686683123741e-06, - "loss": 8.6211, + "loss": 8.6621, "step": 2154 }, { "epoch": 7.696428571428571, - "grad_norm": 39.10385513305664, + "grad_norm": 4.7262187004089355, "learning_rate": 1.5311947198572918e-06, - "loss": 8.0938, + "loss": 8.5625, "step": 2155 }, { "epoch": 7.7, - "grad_norm": 46.52937316894531, + "grad_norm": 51.89335632324219, "learning_rate": 1.5267081477050132e-06, - "loss": 9.957, + "loss": 11.1172, "step": 2156 }, { "epoch": 7.703571428571428, - "grad_norm": 46.13748550415039, + "grad_norm": 28.112377166748047, "learning_rate": 1.5222269736398016e-06, - "loss": 7.8945, + "loss": 6.6172, "step": 2157 }, { "epoch": 7.707142857142857, - "grad_norm": 54.79770278930664, + "grad_norm": 940.6259765625, "learning_rate": 1.5177512046261667e-06, - "loss": 8.8203, + "loss": 12.6914, "step": 2158 }, { "epoch": 7.710714285714285, - "grad_norm": 44.437767028808594, + "grad_norm": 50.013877868652344, "learning_rate": 1.5132808476202126e-06, - "loss": 10.375, + "loss": 23.6797, "step": 2159 }, { "epoch": 7.714285714285714, - "grad_norm": 47.97459411621094, + "grad_norm": 44.95922088623047, "learning_rate": 1.5088159095696365e-06, - "loss": 8.2109, + "loss": 8.0039, "step": 2160 }, { "epoch": 7.7178571428571425, - "grad_norm": 52.299041748046875, + "grad_norm": 30.19487190246582, "learning_rate": 1.5043563974137132e-06, - "loss": 9.2148, + "loss": 9.0625, "step": 2161 }, { "epoch": 7.7214285714285715, - "grad_norm": 53.46441650390625, + "grad_norm": 655.800048828125, "learning_rate": 1.4999023180832834e-06, - "loss": 8.8281, + "loss": 10.4941, "step": 2162 }, { "epoch": 7.725, - "grad_norm": 41.899288177490234, + "grad_norm": 137.64759826660156, "learning_rate": 1.4954536785007456e-06, - "loss": 10.625, + "loss": 12.1367, "step": 2163 }, { "epoch": 7.728571428571429, - "grad_norm": 54.87217330932617, + "grad_norm": 10.540565490722656, "learning_rate": 1.4910104855800429e-06, - "loss": 9.3242, + "loss": 10.2734, "step": 2164 }, { "epoch": 7.732142857142857, - "grad_norm": 44.63528823852539, + "grad_norm": 47.29835891723633, "learning_rate": 1.4865727462266543e-06, - "loss": 8.9727, + "loss": 9.332, "step": 2165 }, { "epoch": 7.735714285714286, - "grad_norm": 51.13568878173828, + "grad_norm": 43.28983688354492, "learning_rate": 1.4821404673375838e-06, - "loss": 11.3477, + "loss": 12.6426, "step": 2166 }, { "epoch": 7.739285714285714, - "grad_norm": 38.19542694091797, + "grad_norm": 848.6436157226562, "learning_rate": 1.4777136558013443e-06, - "loss": 7.4414, + "loss": 12.0859, "step": 2167 }, { "epoch": 7.742857142857143, - "grad_norm": 44.01172637939453, + "grad_norm": 22.936410903930664, "learning_rate": 1.4732923184979563e-06, - "loss": 10.0898, + "loss": 11.0, "step": 2168 }, { "epoch": 7.746428571428572, - "grad_norm": 57.4007453918457, + "grad_norm": 119.09423828125, "learning_rate": 1.468876462298932e-06, - "loss": 12.5898, + "loss": 17.1465, "step": 2169 }, { "epoch": 7.75, - "grad_norm": 53.57429504394531, + "grad_norm": 17.285741806030273, "learning_rate": 1.4644660940672628e-06, - "loss": 12.5352, + "loss": 14.4062, "step": 2170 }, { "epoch": 7.753571428571428, - "grad_norm": 44.302085876464844, + "grad_norm": 391.1519470214844, "learning_rate": 1.4600612206574127e-06, - "loss": 9.8008, + "loss": 11.6875, "step": 2171 }, { "epoch": 7.757142857142857, - "grad_norm": 54.732120513916016, + "grad_norm": 11.35551643371582, "learning_rate": 1.455661848915305e-06, - "loss": 12.793, + "loss": 12.0898, "step": 2172 }, { "epoch": 7.760714285714286, - "grad_norm": 64.85467529296875, + "grad_norm": 744.14501953125, "learning_rate": 1.4512679856783124e-06, - "loss": 12.3711, + "loss": 15.7266, "step": 2173 }, { "epoch": 7.764285714285714, - "grad_norm": 42.8780632019043, + "grad_norm": 27.014747619628906, "learning_rate": 1.446879637775247e-06, - "loss": 8.8789, + "loss": 9.1172, "step": 2174 }, { "epoch": 7.767857142857143, - "grad_norm": 48.226173400878906, + "grad_norm": 66.20259094238281, "learning_rate": 1.4424968120263506e-06, - "loss": 8.2188, + "loss": 9.1992, "step": 2175 }, { "epoch": 7.771428571428571, - "grad_norm": 50.61428451538086, + "grad_norm": 32.182090759277344, "learning_rate": 1.438119515243277e-06, - "loss": 10.3828, + "loss": 11.9961, "step": 2176 }, { "epoch": 7.775, - "grad_norm": 47.67099380493164, + "grad_norm": 61.885475158691406, "learning_rate": 1.433747754229093e-06, - "loss": 8.7812, + "loss": 8.7988, "step": 2177 }, { "epoch": 7.7785714285714285, - "grad_norm": 63.100040435791016, + "grad_norm": 8.589547157287598, "learning_rate": 1.4293815357782592e-06, - "loss": 11.4023, + "loss": 13.9141, "step": 2178 }, { "epoch": 7.7821428571428575, - "grad_norm": 41.516056060791016, + "grad_norm": 67.8105697631836, "learning_rate": 1.4250208666766235e-06, - "loss": 7.6484, + "loss": 7.2734, "step": 2179 }, { "epoch": 7.785714285714286, - "grad_norm": 71.62000274658203, + "grad_norm": 129.96343994140625, "learning_rate": 1.4206657537014078e-06, - "loss": 11.4805, + "loss": 14.1602, "step": 2180 }, { "epoch": 7.789285714285715, - "grad_norm": 61.394508361816406, + "grad_norm": 36.827510833740234, "learning_rate": 1.4163162036211997e-06, - "loss": 13.668, + "loss": 16.457, "step": 2181 }, { "epoch": 7.792857142857143, - "grad_norm": 49.03635787963867, + "grad_norm": 287.0084533691406, "learning_rate": 1.4119722231959405e-06, - "loss": 9.8281, + "loss": 11.3867, "step": 2182 }, { "epoch": 7.796428571428572, - "grad_norm": 47.140350341796875, + "grad_norm": 29.76845932006836, "learning_rate": 1.4076338191769173e-06, - "loss": 12.1797, + "loss": 14.7363, "step": 2183 }, { "epoch": 7.8, - "grad_norm": 53.06507110595703, + "grad_norm": 27.240421295166016, "learning_rate": 1.4033009983067454e-06, - "loss": 11.8555, + "loss": 11.9551, "step": 2184 }, { "epoch": 7.803571428571429, - "grad_norm": 51.229148864746094, + "grad_norm": 19.561988830566406, "learning_rate": 1.3989737673193682e-06, - "loss": 10.1094, + "loss": 9.9219, "step": 2185 }, { "epoch": 7.807142857142857, - "grad_norm": 52.79569625854492, + "grad_norm": 13.297532081604004, "learning_rate": 1.3946521329400397e-06, - "loss": 8.2656, + "loss": 7.4512, "step": 2186 }, { "epoch": 7.810714285714286, - "grad_norm": 56.461402893066406, + "grad_norm": 29.169296264648438, "learning_rate": 1.390336101885315e-06, - "loss": 9.1172, + "loss": 8.4141, "step": 2187 }, { "epoch": 7.814285714285714, - "grad_norm": 51.53401565551758, + "grad_norm": 467.0444030761719, "learning_rate": 1.3860256808630429e-06, - "loss": 8.6133, + "loss": 11.2578, "step": 2188 }, { "epoch": 7.817857142857143, - "grad_norm": 74.28408813476562, + "grad_norm": 494.41644287109375, "learning_rate": 1.3817208765723505e-06, - "loss": 10.7852, + "loss": 19.0625, "step": 2189 }, { "epoch": 7.821428571428571, - "grad_norm": 48.30155944824219, + "grad_norm": 433.3891906738281, "learning_rate": 1.3774216957036368e-06, - "loss": 10.0078, + "loss": 12.4648, "step": 2190 }, { "epoch": 7.825, - "grad_norm": 40.89067459106445, + "grad_norm": 30.494304656982422, "learning_rate": 1.373128144938563e-06, - "loss": 8.6133, + "loss": 10.168, "step": 2191 }, { "epoch": 7.828571428571428, - "grad_norm": 44.52857971191406, + "grad_norm": 484.8500671386719, "learning_rate": 1.3688402309500353e-06, - "loss": 11.3984, + "loss": 13.291, "step": 2192 }, { "epoch": 7.832142857142857, - "grad_norm": 50.323307037353516, + "grad_norm": 701.5294799804688, "learning_rate": 1.364557960402204e-06, - "loss": 8.8789, + "loss": 9.7031, "step": 2193 }, { "epoch": 7.835714285714285, - "grad_norm": 54.12724685668945, + "grad_norm": 61.370948791503906, "learning_rate": 1.360281339950446e-06, - "loss": 8.8867, + "loss": 9.4492, "step": 2194 }, { "epoch": 7.839285714285714, - "grad_norm": 61.6383056640625, + "grad_norm": 70.79173278808594, "learning_rate": 1.3560103762413584e-06, - "loss": 9.9375, + "loss": 11.2344, "step": 2195 }, { "epoch": 7.8428571428571425, - "grad_norm": 46.13237762451172, + "grad_norm": 349.13360595703125, "learning_rate": 1.351745075912746e-06, - "loss": 10.4883, + "loss": 11.8828, "step": 2196 }, { "epoch": 7.8464285714285715, - "grad_norm": 45.86848449707031, + "grad_norm": 727.4525756835938, "learning_rate": 1.3474854455936126e-06, - "loss": 8.3359, + "loss": 9.5293, "step": 2197 }, { "epoch": 7.85, - "grad_norm": 45.4126091003418, + "grad_norm": 11.452990531921387, "learning_rate": 1.3432314919041478e-06, - "loss": 8.168, + "loss": 8.0801, "step": 2198 }, { "epoch": 7.853571428571429, - "grad_norm": 63.56608581542969, + "grad_norm": 95.29249572753906, "learning_rate": 1.3389832214557224e-06, - "loss": 8.3945, + "loss": 8.0234, "step": 2199 }, { "epoch": 7.857142857142857, - "grad_norm": 42.0566291809082, + "grad_norm": 21.28777313232422, "learning_rate": 1.3347406408508695e-06, - "loss": 8.5312, + "loss": 8.7461, "step": 2200 }, { "epoch": 7.860714285714286, - "grad_norm": 43.70015335083008, + "grad_norm": 11.470367431640625, "learning_rate": 1.3305037566832836e-06, - "loss": 9.4805, + "loss": 8.9883, "step": 2201 }, { "epoch": 7.864285714285714, - "grad_norm": 51.26001739501953, + "grad_norm": 135.63116455078125, "learning_rate": 1.326272575537803e-06, - "loss": 10.4648, + "loss": 11.4531, "step": 2202 }, { "epoch": 7.867857142857143, - "grad_norm": 39.645450592041016, + "grad_norm": 24.682212829589844, "learning_rate": 1.3220471039904048e-06, - "loss": 9.7695, + "loss": 10.0059, "step": 2203 }, { "epoch": 7.871428571428572, - "grad_norm": 39.46467208862305, + "grad_norm": 629.7945556640625, "learning_rate": 1.317827348608191e-06, - "loss": 9.4531, + "loss": 11.6777, "step": 2204 }, { "epoch": 7.875, - "grad_norm": 66.77433776855469, + "grad_norm": 52.056331634521484, "learning_rate": 1.3136133159493803e-06, - "loss": 11.2578, + "loss": 14.5273, "step": 2205 }, { "epoch": 7.878571428571428, - "grad_norm": 59.179710388183594, + "grad_norm": 263.1520080566406, "learning_rate": 1.3094050125632973e-06, - "loss": 6.957, + "loss": 7.543, "step": 2206 }, { "epoch": 7.882142857142857, - "grad_norm": 45.01536178588867, + "grad_norm": 857.3849487304688, "learning_rate": 1.3052024449903621e-06, - "loss": 8.3359, + "loss": 17.8867, "step": 2207 }, { "epoch": 7.885714285714286, - "grad_norm": 60.65529251098633, + "grad_norm": 26.991296768188477, "learning_rate": 1.3010056197620813e-06, - "loss": 10.5625, + "loss": 12.0352, "step": 2208 }, { "epoch": 7.889285714285714, - "grad_norm": 41.40438461303711, + "grad_norm": 15.98957633972168, "learning_rate": 1.2968145434010343e-06, - "loss": 8.9492, + "loss": 9.1211, "step": 2209 }, { "epoch": 7.892857142857143, - "grad_norm": 49.663143157958984, + "grad_norm": 77.5733413696289, "learning_rate": 1.2926292224208664e-06, - "loss": 8.7188, + "loss": 7.5312, "step": 2210 }, { "epoch": 7.896428571428571, - "grad_norm": 45.31591796875, + "grad_norm": 466.4745788574219, "learning_rate": 1.2884496633262827e-06, - "loss": 9.2148, + "loss": 10.7344, "step": 2211 }, { "epoch": 7.9, - "grad_norm": 51.56908416748047, + "grad_norm": 191.3324737548828, "learning_rate": 1.2842758726130283e-06, - "loss": 9.9531, + "loss": 9.5859, "step": 2212 }, { "epoch": 7.9035714285714285, - "grad_norm": 40.810672760009766, + "grad_norm": 1015.260498046875, "learning_rate": 1.2801078567678849e-06, - "loss": 8.3984, + "loss": 12.8633, "step": 2213 }, { "epoch": 7.9071428571428575, - "grad_norm": 46.010765075683594, + "grad_norm": 21.77794075012207, "learning_rate": 1.27594562226866e-06, - "loss": 9.1797, + "loss": 11.2344, "step": 2214 }, { "epoch": 7.910714285714286, - "grad_norm": 49.37642288208008, + "grad_norm": 11.342094421386719, "learning_rate": 1.2717891755841722e-06, - "loss": 8.4648, + "loss": 7.9766, "step": 2215 }, { "epoch": 7.914285714285715, - "grad_norm": 59.04348373413086, + "grad_norm": 454.5245056152344, "learning_rate": 1.2676385231742493e-06, - "loss": 8.9922, + "loss": 10.6035, "step": 2216 }, { "epoch": 7.917857142857143, - "grad_norm": 44.77436828613281, + "grad_norm": 29.300264358520508, "learning_rate": 1.2634936714897118e-06, - "loss": 8.9648, + "loss": 8.0273, "step": 2217 }, { "epoch": 7.921428571428572, - "grad_norm": 85.55245208740234, + "grad_norm": 407.42291259765625, "learning_rate": 1.259354626972365e-06, - "loss": 14.1016, + "loss": 16.5664, "step": 2218 }, { "epoch": 7.925, - "grad_norm": 43.664981842041016, + "grad_norm": 118.63423919677734, "learning_rate": 1.2552213960549891e-06, - "loss": 9.2383, + "loss": 11.0898, "step": 2219 }, { "epoch": 7.928571428571429, - "grad_norm": 60.717384338378906, + "grad_norm": 1143.6077880859375, "learning_rate": 1.2510939851613285e-06, - "loss": 13.168, + "loss": 19.7656, "step": 2220 }, { "epoch": 7.932142857142857, - "grad_norm": 50.000709533691406, + "grad_norm": 38.37111282348633, "learning_rate": 1.2469724007060834e-06, - "loss": 10.6523, + "loss": 10.9043, "step": 2221 }, { "epoch": 7.935714285714286, - "grad_norm": 60.827117919921875, + "grad_norm": 51.58177947998047, "learning_rate": 1.242856649094899e-06, - "loss": 10.8008, + "loss": 11.9062, "step": 2222 }, { "epoch": 7.939285714285714, - "grad_norm": 59.33256149291992, + "grad_norm": 296.65118408203125, "learning_rate": 1.2387467367243517e-06, - "loss": 10.9844, + "loss": 16.0098, "step": 2223 }, { "epoch": 7.942857142857143, - "grad_norm": 50.271339416503906, + "grad_norm": 75.54273223876953, "learning_rate": 1.234642669981946e-06, - "loss": 11.2344, + "loss": 12.0996, "step": 2224 }, { "epoch": 7.946428571428571, - "grad_norm": 48.663673400878906, + "grad_norm": 193.32766723632812, "learning_rate": 1.230544455246101e-06, - "loss": 10.6836, + "loss": 12.9414, "step": 2225 }, { "epoch": 7.95, - "grad_norm": 48.9289665222168, + "grad_norm": 22.567956924438477, "learning_rate": 1.22645209888614e-06, - "loss": 6.9375, + "loss": 4.834, "step": 2226 }, { "epoch": 7.953571428571428, - "grad_norm": 48.65119171142578, + "grad_norm": 67.93697357177734, "learning_rate": 1.2223656072622825e-06, - "loss": 9.8477, + "loss": 9.5762, "step": 2227 }, { "epoch": 7.957142857142857, - "grad_norm": 47.305381774902344, + "grad_norm": 692.3645629882812, "learning_rate": 1.218284986725632e-06, - "loss": 11.4102, + "loss": 14.5195, "step": 2228 }, { "epoch": 7.960714285714285, - "grad_norm": 56.3883056640625, + "grad_norm": 140.8734130859375, "learning_rate": 1.214210243618168e-06, - "loss": 7.168, + "loss": 5.8516, "step": 2229 }, { "epoch": 7.964285714285714, - "grad_norm": 60.344547271728516, + "grad_norm": 14.51502799987793, "learning_rate": 1.2101413842727345e-06, - "loss": 12.168, + "loss": 14.707, "step": 2230 }, { "epoch": 7.9678571428571425, - "grad_norm": 49.11228942871094, + "grad_norm": 425.96087646484375, "learning_rate": 1.2060784150130345e-06, - "loss": 9.9258, + "loss": 11.6562, "step": 2231 }, { "epoch": 7.9714285714285715, - "grad_norm": 44.14506530761719, + "grad_norm": 271.9992980957031, "learning_rate": 1.2020213421536103e-06, - "loss": 10.1445, + "loss": 21.8945, "step": 2232 }, { "epoch": 7.975, - "grad_norm": 55.067630767822266, + "grad_norm": 197.14913940429688, "learning_rate": 1.1979701719998454e-06, - "loss": 10.3359, + "loss": 11.4453, "step": 2233 }, { "epoch": 7.978571428571429, - "grad_norm": 50.78660202026367, + "grad_norm": 47.02999496459961, "learning_rate": 1.1939249108479495e-06, - "loss": 10.5781, + "loss": 11.6914, "step": 2234 }, { "epoch": 7.982142857142857, - "grad_norm": 62.30586242675781, + "grad_norm": 820.3775024414062, "learning_rate": 1.1898855649849462e-06, - "loss": 8.9219, + "loss": 11.7051, "step": 2235 }, { "epoch": 7.985714285714286, - "grad_norm": 42.162620544433594, + "grad_norm": 53.7216911315918, "learning_rate": 1.1858521406886674e-06, - "loss": 9.5547, + "loss": 11.5938, "step": 2236 }, { "epoch": 7.989285714285714, - "grad_norm": 45.15355682373047, + "grad_norm": 24.744171142578125, "learning_rate": 1.181824644227741e-06, - "loss": 9.5508, + "loss": 10.043, "step": 2237 }, { "epoch": 7.992857142857143, - "grad_norm": 52.23591232299805, + "grad_norm": 10.535294532775879, "learning_rate": 1.1778030818615827e-06, - "loss": 13.2188, + "loss": 13.875, "step": 2238 }, { "epoch": 7.996428571428572, - "grad_norm": 51.371524810791016, + "grad_norm": 649.682373046875, "learning_rate": 1.1737874598403865e-06, - "loss": 7.3281, + "loss": 16.2344, "step": 2239 }, { "epoch": 8.0, - "grad_norm": 62.015445709228516, + "grad_norm": 15.296875, "learning_rate": 1.1697777844051105e-06, - "loss": 8.0508, + "loss": 9.9688, "step": 2240 }, { "epoch": 8.0, - "eval_loss": 9.751144409179688, - "eval_mse": 9.75637457119561, - "eval_runtime": 11.4773, - "eval_samples_per_second": 247.445, - "eval_steps_per_second": 1.307, - "eval_target_0_mse": 18.567112625932936, - "eval_target_1_mse": 10.026862013927142, - "eval_target_2_mse": 5.3199836458254675, - "eval_target_3_mse": 5.111539999096893, + "eval_loss": 12.677087783813477, + "eval_mse": 12.68152892974625, + "eval_runtime": 10.6896, + "eval_samples_per_second": 265.678, + "eval_steps_per_second": 1.403, + "eval_target_0_mse": 37.92038089340444, + "eval_target_1_mse": 8.923100966621506, + "eval_target_2_mse": 2.611133973793757, + "eval_target_3_mse": 1.2714998851652874, "step": 2240 }, { "epoch": 8.003571428571428, - "grad_norm": 54.52159118652344, + "grad_norm": 81.8472900390625, "learning_rate": 1.1657740617874742e-06, - "loss": 10.2969, + "loss": 10.5527, "step": 2241 }, { "epoch": 8.007142857142858, - "grad_norm": 41.751834869384766, + "grad_norm": 91.3329849243164, "learning_rate": 1.1617762982099446e-06, - "loss": 8.8594, + "loss": 9.1895, "step": 2242 }, { "epoch": 8.010714285714286, - "grad_norm": 42.54962158203125, + "grad_norm": 172.80715942382812, "learning_rate": 1.1577844998857275e-06, - "loss": 8.9297, + "loss": 8.4189, "step": 2243 }, { "epoch": 8.014285714285714, - "grad_norm": 46.9176139831543, + "grad_norm": 28.94157600402832, "learning_rate": 1.1537986730187567e-06, - "loss": 8.332, + "loss": 8.918, "step": 2244 }, { "epoch": 8.017857142857142, - "grad_norm": 66.1787338256836, + "grad_norm": 64.19886779785156, "learning_rate": 1.1498188238036862e-06, - "loss": 8.707, + "loss": 9.1035, "step": 2245 }, { "epoch": 8.021428571428572, - "grad_norm": 54.309234619140625, + "grad_norm": 445.63720703125, "learning_rate": 1.145844958425879e-06, - "loss": 11.0547, + "loss": 12.3203, "step": 2246 }, { "epoch": 8.025, - "grad_norm": 46.58029556274414, + "grad_norm": 114.98033142089844, "learning_rate": 1.1418770830614012e-06, - "loss": 8.5781, + "loss": 10.4316, "step": 2247 }, { "epoch": 8.028571428571428, - "grad_norm": 50.20857620239258, + "grad_norm": 321.1734619140625, "learning_rate": 1.137915203877003e-06, - "loss": 8.9766, + "loss": 9.4062, "step": 2248 }, { "epoch": 8.032142857142857, - "grad_norm": 45.50188446044922, + "grad_norm": 805.3211669921875, "learning_rate": 1.133959327030122e-06, - "loss": 10.5664, + "loss": 13.957, "step": 2249 }, { "epoch": 8.035714285714286, - "grad_norm": 53.620243072509766, + "grad_norm": 857.7301635742188, "learning_rate": 1.1300094586688632e-06, - "loss": 8.8086, + "loss": 18.2949, "step": 2250 }, { "epoch": 8.039285714285715, - "grad_norm": 43.65388107299805, + "grad_norm": 17.41173553466797, "learning_rate": 1.1260656049319957e-06, - "loss": 8.8008, + "loss": 7.7793, "step": 2251 }, { "epoch": 8.042857142857143, - "grad_norm": 54.071571350097656, + "grad_norm": 906.4207153320312, "learning_rate": 1.1221277719489387e-06, - "loss": 12.168, + "loss": 17.7734, "step": 2252 }, { "epoch": 8.04642857142857, - "grad_norm": 42.783390045166016, + "grad_norm": 936.3721313476562, "learning_rate": 1.1181959658397567e-06, - "loss": 8.4727, + "loss": 15.0352, "step": 2253 }, { "epoch": 8.05, - "grad_norm": 55.40306854248047, + "grad_norm": 260.1036071777344, "learning_rate": 1.1142701927151456e-06, - "loss": 8.0703, + "loss": 8.6113, "step": 2254 }, { "epoch": 8.053571428571429, - "grad_norm": 47.35654067993164, + "grad_norm": 721.5177612304688, "learning_rate": 1.1103504586764264e-06, - "loss": 8.9023, + "loss": 10.875, "step": 2255 }, { "epoch": 8.057142857142857, - "grad_norm": 53.88459396362305, + "grad_norm": 1062.04541015625, "learning_rate": 1.1064367698155303e-06, - "loss": 9.5273, + "loss": 13.9766, "step": 2256 }, { "epoch": 8.060714285714285, - "grad_norm": 54.38326644897461, + "grad_norm": 19.954477310180664, "learning_rate": 1.1025291322149988e-06, - "loss": 12.3047, + "loss": 26.5898, "step": 2257 }, { "epoch": 8.064285714285715, - "grad_norm": 46.14596176147461, + "grad_norm": 126.15492248535156, "learning_rate": 1.0986275519479657e-06, - "loss": 9.625, + "loss": 11.1445, "step": 2258 }, { "epoch": 8.067857142857143, - "grad_norm": 40.147708892822266, + "grad_norm": 299.19232177734375, "learning_rate": 1.094732035078151e-06, - "loss": 7.1797, + "loss": 8.0977, "step": 2259 }, { "epoch": 8.071428571428571, - "grad_norm": 72.8932113647461, + "grad_norm": 312.84808349609375, "learning_rate": 1.0908425876598512e-06, - "loss": 15.5273, + "loss": 18.5938, "step": 2260 }, { "epoch": 8.075, - "grad_norm": 50.24802780151367, + "grad_norm": 8.641109466552734, "learning_rate": 1.0869592157379305e-06, - "loss": 11.1328, + "loss": 11.3574, "step": 2261 }, { "epoch": 8.07857142857143, - "grad_norm": 49.929927825927734, + "grad_norm": 223.38613891601562, "learning_rate": 1.0830819253478104e-06, - "loss": 9.8789, + "loss": 11.5664, "step": 2262 }, { "epoch": 8.082142857142857, - "grad_norm": 62.99164581298828, + "grad_norm": 20.205596923828125, "learning_rate": 1.0792107225154597e-06, - "loss": 10.2891, + "loss": 11.0039, "step": 2263 }, { "epoch": 8.085714285714285, - "grad_norm": 46.1276741027832, + "grad_norm": 897.216796875, "learning_rate": 1.0753456132573886e-06, - "loss": 8.8516, + "loss": 14.5449, "step": 2264 }, { "epoch": 8.089285714285714, - "grad_norm": 56.310401916503906, + "grad_norm": 1146.6693115234375, "learning_rate": 1.0714866035806326e-06, - "loss": 10.8281, + "loss": 15.6719, "step": 2265 }, { "epoch": 8.092857142857143, - "grad_norm": 55.184486389160156, + "grad_norm": 57.854766845703125, "learning_rate": 1.0676336994827513e-06, - "loss": 10.457, + "loss": 12.8203, "step": 2266 }, { "epoch": 8.096428571428572, - "grad_norm": 45.34183120727539, + "grad_norm": 947.1398315429688, "learning_rate": 1.0637869069518137e-06, - "loss": 8.957, + "loss": 14.5547, "step": 2267 }, { "epoch": 8.1, - "grad_norm": 45.12693405151367, + "grad_norm": 31.039485931396484, "learning_rate": 1.0599462319663906e-06, - "loss": 10.2383, + "loss": 11.7031, "step": 2268 }, { "epoch": 8.103571428571428, - "grad_norm": 43.74140548706055, + "grad_norm": 99.4459457397461, "learning_rate": 1.0561116804955451e-06, - "loss": 10.3828, + "loss": 10.3164, "step": 2269 }, { "epoch": 8.107142857142858, - "grad_norm": 43.84679412841797, + "grad_norm": 69.95419311523438, "learning_rate": 1.0522832584988235e-06, - "loss": 7.9922, + "loss": 8.8887, "step": 2270 }, { "epoch": 8.110714285714286, - "grad_norm": 65.1592025756836, + "grad_norm": 332.4451599121094, "learning_rate": 1.0484609719262445e-06, - "loss": 11.0273, + "loss": 14.0488, "step": 2271 }, { "epoch": 8.114285714285714, - "grad_norm": 49.027469635009766, + "grad_norm": 652.4242553710938, "learning_rate": 1.044644826718295e-06, - "loss": 9.5938, + "loss": 12.9688, "step": 2272 }, { "epoch": 8.117857142857142, - "grad_norm": 45.61960220336914, + "grad_norm": 1089.4532470703125, "learning_rate": 1.0408348288059112e-06, - "loss": 9.9609, + "loss": 13.6094, "step": 2273 }, { "epoch": 8.121428571428572, - "grad_norm": 59.95853042602539, + "grad_norm": 75.78922271728516, "learning_rate": 1.0370309841104803e-06, - "loss": 10.0234, + "loss": 11.5273, "step": 2274 }, { "epoch": 8.125, - "grad_norm": 49.66108322143555, + "grad_norm": 36.85566711425781, "learning_rate": 1.0332332985438248e-06, - "loss": 9.4336, + "loss": 10.9219, "step": 2275 }, { "epoch": 8.128571428571428, - "grad_norm": 53.57130813598633, + "grad_norm": 114.21883392333984, "learning_rate": 1.029441778008195e-06, - "loss": 9.2734, + "loss": 9.7266, "step": 2276 }, { "epoch": 8.132142857142858, - "grad_norm": 44.972286224365234, + "grad_norm": 432.5933532714844, "learning_rate": 1.0256564283962588e-06, - "loss": 8.0156, + "loss": 17.707, "step": 2277 }, { "epoch": 8.135714285714286, - "grad_norm": 50.950191497802734, + "grad_norm": 47.15464782714844, "learning_rate": 1.0218772555910955e-06, - "loss": 8.75, + "loss": 9.6016, "step": 2278 }, { "epoch": 8.139285714285714, - "grad_norm": 45.7244987487793, + "grad_norm": 59.05503463745117, "learning_rate": 1.0181042654661815e-06, - "loss": 8.3906, + "loss": 10.1602, "step": 2279 }, { "epoch": 8.142857142857142, - "grad_norm": 45.52471923828125, + "grad_norm": 114.25577545166016, "learning_rate": 1.0143374638853892e-06, - "loss": 7.9062, + "loss": 7.2812, "step": 2280 }, { "epoch": 8.146428571428572, - "grad_norm": 46.226261138916016, + "grad_norm": 12.783981323242188, "learning_rate": 1.0105768567029656e-06, - "loss": 6.9531, + "loss": 5.9434, "step": 2281 }, { "epoch": 8.15, - "grad_norm": 50.3105354309082, + "grad_norm": 126.70967102050781, "learning_rate": 1.006822449763537e-06, - "loss": 10.7734, + "loss": 11.5566, "step": 2282 }, { "epoch": 8.153571428571428, - "grad_norm": 51.856346130371094, + "grad_norm": 362.0102844238281, "learning_rate": 1.0030742489020907e-06, - "loss": 9.6523, + "loss": 16.0195, "step": 2283 }, { "epoch": 8.157142857142857, - "grad_norm": 42.801719665527344, + "grad_norm": 22.726974487304688, "learning_rate": 9.993322599439692e-07, - "loss": 9.0625, + "loss": 9.6641, "step": 2284 }, { "epoch": 8.160714285714286, - "grad_norm": 78.59752655029297, + "grad_norm": 888.2879028320312, "learning_rate": 9.955964887048608e-07, - "loss": 13.9023, + "loss": 35.7266, "step": 2285 }, { "epoch": 8.164285714285715, - "grad_norm": 40.64580535888672, + "grad_norm": 30.060731887817383, "learning_rate": 9.918669409907904e-07, - "loss": 8.3906, + "loss": 8.7715, "step": 2286 }, { "epoch": 8.167857142857143, - "grad_norm": 60.77827453613281, + "grad_norm": 943.718017578125, "learning_rate": 9.881436225981107e-07, - "loss": 10.8594, + "loss": 15.9102, "step": 2287 }, { "epoch": 8.17142857142857, - "grad_norm": 42.34695053100586, + "grad_norm": 34.74462127685547, "learning_rate": 9.844265393134927e-07, - "loss": 9.6484, + "loss": 22.0352, "step": 2288 }, { "epoch": 8.175, - "grad_norm": 48.17861557006836, + "grad_norm": 18.543550491333008, "learning_rate": 9.807156969139136e-07, - "loss": 10.75, + "loss": 10.2109, "step": 2289 }, { "epoch": 8.178571428571429, - "grad_norm": 51.748756408691406, + "grad_norm": 17.379348754882812, "learning_rate": 9.770111011666582e-07, - "loss": 9.6133, + "loss": 9.5156, "step": 2290 }, { "epoch": 8.182142857142857, - "grad_norm": 57.86015701293945, + "grad_norm": 57.862186431884766, "learning_rate": 9.733127578292983e-07, - "loss": 9.6406, + "loss": 9.8945, "step": 2291 }, { "epoch": 8.185714285714285, - "grad_norm": 43.73434066772461, + "grad_norm": 348.26898193359375, "learning_rate": 9.696206726496893e-07, - "loss": 10.0508, + "loss": 12.9238, "step": 2292 }, { "epoch": 8.189285714285715, - "grad_norm": 63.11996841430664, + "grad_norm": 898.9811401367188, "learning_rate": 9.6593485136596e-07, - "loss": 15.3086, + "loss": 24.9395, "step": 2293 }, { "epoch": 8.192857142857143, - "grad_norm": 42.94272232055664, + "grad_norm": 135.75938415527344, "learning_rate": 9.622552997065043e-07, - "loss": 8.5156, + "loss": 9.5488, "step": 2294 }, { "epoch": 8.196428571428571, - "grad_norm": 46.4530143737793, + "grad_norm": 47.732177734375, "learning_rate": 9.585820233899739e-07, - "loss": 8.8789, + "loss": 8.8867, "step": 2295 }, { "epoch": 8.2, - "grad_norm": 54.99473190307617, + "grad_norm": 455.2298278808594, "learning_rate": 9.549150281252633e-07, - "loss": 10.2188, + "loss": 12.9141, "step": 2296 }, { "epoch": 8.20357142857143, - "grad_norm": 46.78672790527344, + "grad_norm": 424.3283996582031, "learning_rate": 9.512543196115081e-07, - "loss": 9.5352, + "loss": 10.1738, "step": 2297 }, { "epoch": 8.207142857142857, - "grad_norm": 51.798709869384766, + "grad_norm": 747.6268920898438, "learning_rate": 9.47599903538074e-07, - "loss": 9.4961, + "loss": 11.625, "step": 2298 }, { "epoch": 8.210714285714285, - "grad_norm": 48.752140045166016, + "grad_norm": 28.509944915771484, "learning_rate": 9.439517855845448e-07, - "loss": 10.2617, + "loss": 10.2441, "step": 2299 }, { "epoch": 8.214285714285714, - "grad_norm": 43.20792007446289, + "grad_norm": 291.29718017578125, "learning_rate": 9.403099714207175e-07, - "loss": 8.7773, + "loss": 9.4316, "step": 2300 }, { "epoch": 8.217857142857143, - "grad_norm": 48.20103454589844, + "grad_norm": 43.737892150878906, "learning_rate": 9.366744667065914e-07, - "loss": 9.3711, + "loss": 9.8828, "step": 2301 }, { "epoch": 8.221428571428572, - "grad_norm": 49.659759521484375, + "grad_norm": 41.84266662597656, "learning_rate": 9.330452770923604e-07, - "loss": 11.5039, + "loss": 14.3105, "step": 2302 }, { "epoch": 8.225, - "grad_norm": 45.6573600769043, + "grad_norm": 998.8926391601562, "learning_rate": 9.294224082184045e-07, - "loss": 8.9141, + "loss": 12.0312, "step": 2303 }, { "epoch": 8.228571428571428, - "grad_norm": 66.55377197265625, + "grad_norm": 595.5511474609375, "learning_rate": 9.258058657152763e-07, - "loss": 8.0273, + "loss": 8.7852, "step": 2304 }, { "epoch": 8.232142857142858, - "grad_norm": 40.21379089355469, + "grad_norm": 713.6099853515625, "learning_rate": 9.221956552036992e-07, - "loss": 8.2305, + "loss": 18.4238, "step": 2305 }, { "epoch": 8.235714285714286, - "grad_norm": 57.204185485839844, + "grad_norm": 67.6283950805664, "learning_rate": 9.185917822945567e-07, - "loss": 11.0078, + "loss": 12.2578, "step": 2306 }, { "epoch": 8.239285714285714, - "grad_norm": 53.14253234863281, + "grad_norm": 19.464740753173828, "learning_rate": 9.1499425258888e-07, - "loss": 10.5781, + "loss": 9.8242, "step": 2307 }, { "epoch": 8.242857142857142, - "grad_norm": 42.4513053894043, + "grad_norm": 508.0703125, "learning_rate": 9.114030716778433e-07, - "loss": 8.3281, + "loss": 10.0059, "step": 2308 }, { "epoch": 8.246428571428572, - "grad_norm": 50.158077239990234, + "grad_norm": 45.65583038330078, "learning_rate": 9.07818245142753e-07, - "loss": 10.1445, + "loss": 11.4492, "step": 2309 }, { "epoch": 8.25, - "grad_norm": 50.9121208190918, + "grad_norm": 181.98976135253906, "learning_rate": 9.042397785550405e-07, - "loss": 10.5781, + "loss": 10.4805, "step": 2310 }, { "epoch": 8.253571428571428, - "grad_norm": 43.185882568359375, + "grad_norm": 228.0778045654297, "learning_rate": 9.006676774762535e-07, - "loss": 10.1406, + "loss": 12.0156, "step": 2311 }, { "epoch": 8.257142857142856, - "grad_norm": 51.9337158203125, + "grad_norm": 855.9116821289062, "learning_rate": 8.971019474580428e-07, - "loss": 9.2656, + "loss": 13.2754, "step": 2312 }, { "epoch": 8.260714285714286, - "grad_norm": 42.733848571777344, + "grad_norm": 26.817773818969727, "learning_rate": 8.935425940421616e-07, - "loss": 9.4336, + "loss": 19.5137, "step": 2313 }, { "epoch": 8.264285714285714, - "grad_norm": 59.904605865478516, + "grad_norm": 8.808467864990234, "learning_rate": 8.899896227604509e-07, - "loss": 8.5586, + "loss": 7.2461, "step": 2314 }, { "epoch": 8.267857142857142, - "grad_norm": 49.21571350097656, + "grad_norm": 266.4861755371094, "learning_rate": 8.864430391348333e-07, - "loss": 9.3398, + "loss": 9.375, "step": 2315 }, { "epoch": 8.271428571428572, - "grad_norm": 54.32213592529297, + "grad_norm": 343.8249206542969, "learning_rate": 8.82902848677304e-07, - "loss": 11.7695, + "loss": 13.75, "step": 2316 }, { "epoch": 8.275, - "grad_norm": 50.426971435546875, + "grad_norm": 689.6817626953125, "learning_rate": 8.793690568899216e-07, - "loss": 10.8594, + "loss": 39.7266, "step": 2317 }, { "epoch": 8.278571428571428, - "grad_norm": 54.00626754760742, + "grad_norm": 9.111383438110352, "learning_rate": 8.758416692648008e-07, - "loss": 8.2422, + "loss": 5.9238, "step": 2318 }, { "epoch": 8.282142857142857, - "grad_norm": 49.664920806884766, + "grad_norm": 114.14273834228516, "learning_rate": 8.72320691284102e-07, - "loss": 10.3203, + "loss": 10.4316, "step": 2319 }, { "epoch": 8.285714285714286, - "grad_norm": 47.41107940673828, + "grad_norm": 79.61054229736328, "learning_rate": 8.688061284200266e-07, - "loss": 9.9492, + "loss": 13.1641, "step": 2320 }, { "epoch": 8.289285714285715, - "grad_norm": 59.96736145019531, + "grad_norm": 18.635147094726562, "learning_rate": 8.65297986134801e-07, - "loss": 13.2109, + "loss": 13.373, "step": 2321 }, { "epoch": 8.292857142857143, - "grad_norm": 46.115760803222656, + "grad_norm": 41.0429801940918, "learning_rate": 8.617962698806764e-07, - "loss": 9.5703, + "loss": 11.4004, "step": 2322 }, { "epoch": 8.29642857142857, - "grad_norm": 66.26699829101562, + "grad_norm": 831.9913940429688, "learning_rate": 8.58300985099918e-07, - "loss": 13.668, + "loss": 25.4609, "step": 2323 }, { "epoch": 8.3, - "grad_norm": 42.293235778808594, + "grad_norm": 806.58642578125, "learning_rate": 8.54812137224792e-07, - "loss": 8.1719, + "loss": 12.3379, "step": 2324 }, { "epoch": 8.303571428571429, - "grad_norm": 50.119998931884766, + "grad_norm": 203.81053161621094, "learning_rate": 8.513297316775626e-07, - "loss": 10.7188, + "loss": 12.6758, "step": 2325 }, { "epoch": 8.307142857142857, - "grad_norm": 48.238311767578125, + "grad_norm": 85.69949340820312, "learning_rate": 8.478537738704811e-07, - "loss": 10.5078, + "loss": 14.4453, "step": 2326 }, { "epoch": 8.310714285714285, - "grad_norm": 49.420265197753906, + "grad_norm": 11.037935256958008, "learning_rate": 8.44384269205778e-07, - "loss": 10.2109, + "loss": 10.5156, "step": 2327 }, { "epoch": 8.314285714285715, - "grad_norm": 53.57512664794922, + "grad_norm": 65.76129150390625, "learning_rate": 8.409212230756564e-07, - "loss": 10.7305, + "loss": 10.457, "step": 2328 }, { "epoch": 8.317857142857143, - "grad_norm": 50.125892639160156, + "grad_norm": 30.44970703125, "learning_rate": 8.374646408622755e-07, - "loss": 7.9922, + "loss": 8.0898, "step": 2329 }, { "epoch": 8.321428571428571, - "grad_norm": 64.34918212890625, + "grad_norm": 56.250701904296875, "learning_rate": 8.340145279377559e-07, - "loss": 10.3789, + "loss": 10.8359, "step": 2330 }, { "epoch": 8.325, - "grad_norm": 74.88823699951172, + "grad_norm": 1165.6737060546875, "learning_rate": 8.305708896641596e-07, - "loss": 9.9023, + "loss": 15.8906, "step": 2331 }, { "epoch": 8.32857142857143, - "grad_norm": 60.54659652709961, + "grad_norm": 700.8257446289062, "learning_rate": 8.271337313934869e-07, - "loss": 12.5859, + "loss": 20.0234, "step": 2332 }, { "epoch": 8.332142857142857, - "grad_norm": 40.12471389770508, + "grad_norm": 93.58720397949219, "learning_rate": 8.237030584676681e-07, - "loss": 8.0078, + "loss": 8.791, "step": 2333 }, { "epoch": 8.335714285714285, - "grad_norm": 50.93900680541992, + "grad_norm": 12.291764259338379, "learning_rate": 8.202788762185515e-07, - "loss": 6.4922, + "loss": 6.0996, "step": 2334 }, { "epoch": 8.339285714285714, - "grad_norm": 46.12246322631836, + "grad_norm": 641.84765625, "learning_rate": 8.168611899679013e-07, - "loss": 9.8047, + "loss": 12.8867, "step": 2335 }, { "epoch": 8.342857142857143, - "grad_norm": 70.07340240478516, + "grad_norm": 42.02069854736328, "learning_rate": 8.134500050273841e-07, - "loss": 12.75, + "loss": 14.5176, "step": 2336 }, { "epoch": 8.346428571428572, - "grad_norm": 56.815223693847656, + "grad_norm": 679.4635009765625, "learning_rate": 8.100453266985603e-07, - "loss": 9.9414, + "loss": 12.1523, "step": 2337 }, { "epoch": 8.35, - "grad_norm": 50.06435012817383, + "grad_norm": 540.5589599609375, "learning_rate": 8.066471602728804e-07, - "loss": 9.0898, + "loss": 12.9121, "step": 2338 }, { "epoch": 8.353571428571428, - "grad_norm": 47.14213943481445, + "grad_norm": 54.06745529174805, "learning_rate": 8.032555110316748e-07, - "loss": 8.8203, + "loss": 8.6328, "step": 2339 }, { "epoch": 8.357142857142858, - "grad_norm": 49.86742401123047, + "grad_norm": 753.2927856445312, "learning_rate": 7.99870384246143e-07, - "loss": 9.6367, + "loss": 13.2578, "step": 2340 }, { "epoch": 8.360714285714286, - "grad_norm": 65.29377746582031, + "grad_norm": 108.71765899658203, "learning_rate": 7.964917851773496e-07, - "loss": 10.1484, + "loss": 12.3633, "step": 2341 }, { "epoch": 8.364285714285714, - "grad_norm": 46.56884765625, + "grad_norm": 76.73164367675781, "learning_rate": 7.931197190762119e-07, - "loss": 9.082, + "loss": 10.4766, "step": 2342 }, { "epoch": 8.367857142857144, - "grad_norm": 48.13380813598633, + "grad_norm": 94.44290161132812, "learning_rate": 7.89754191183496e-07, - "loss": 7.9219, + "loss": 7.6641, "step": 2343 }, { "epoch": 8.371428571428572, - "grad_norm": 56.2253303527832, + "grad_norm": 38.370567321777344, "learning_rate": 7.863952067298042e-07, - "loss": 11.0938, + "loss": 11.5312, "step": 2344 }, { "epoch": 8.375, - "grad_norm": 51.04689407348633, + "grad_norm": 1249.184814453125, "learning_rate": 7.830427709355726e-07, - "loss": 12.9102, + "loss": 24.123, "step": 2345 }, { "epoch": 8.378571428571428, - "grad_norm": 71.5767593383789, + "grad_norm": 52.99514389038086, "learning_rate": 7.796968890110546e-07, - "loss": 13.6172, + "loss": 12.2461, "step": 2346 }, { "epoch": 8.382142857142856, - "grad_norm": 47.59012222290039, + "grad_norm": 15.680898666381836, "learning_rate": 7.763575661563211e-07, - "loss": 10.0234, + "loss": 11.7578, "step": 2347 }, { "epoch": 8.385714285714286, - "grad_norm": 61.89570236206055, + "grad_norm": 123.9773178100586, "learning_rate": 7.730248075612501e-07, - "loss": 11.0938, + "loss": 14.2031, "step": 2348 }, { "epoch": 8.389285714285714, - "grad_norm": 45.37525939941406, + "grad_norm": 48.7369270324707, "learning_rate": 7.696986184055144e-07, - "loss": 9.5938, + "loss": 8.9277, "step": 2349 }, { "epoch": 8.392857142857142, - "grad_norm": 62.53132247924805, + "grad_norm": 113.82145690917969, "learning_rate": 7.663790038585794e-07, - "loss": 11.4766, + "loss": 12.3828, "step": 2350 }, { "epoch": 8.396428571428572, - "grad_norm": 43.96501541137695, + "grad_norm": 10.118380546569824, "learning_rate": 7.630659690796916e-07, - "loss": 8.3594, + "loss": 7.918, "step": 2351 }, { "epoch": 8.4, - "grad_norm": 42.35443878173828, + "grad_norm": 45.27591323852539, "learning_rate": 7.597595192178702e-07, - "loss": 8.3555, + "loss": 9.127, "step": 2352 }, { "epoch": 8.403571428571428, - "grad_norm": 67.2877197265625, + "grad_norm": 165.45486450195312, "learning_rate": 7.564596594119034e-07, - "loss": 9.6406, + "loss": 9.6055, "step": 2353 }, { "epoch": 8.407142857142857, - "grad_norm": 45.320533752441406, + "grad_norm": 62.9382209777832, "learning_rate": 7.531663947903334e-07, - "loss": 10.0, + "loss": 10.5254, "step": 2354 }, { "epoch": 8.410714285714286, - "grad_norm": 42.77421951293945, + "grad_norm": 41.94437026977539, "learning_rate": 7.498797304714545e-07, - "loss": 8.1523, + "loss": 8.6465, "step": 2355 }, { "epoch": 8.414285714285715, - "grad_norm": 50.28700256347656, + "grad_norm": 1070.3056640625, "learning_rate": 7.465996715633028e-07, - "loss": 11.6328, + "loss": 24.0918, "step": 2356 }, { "epoch": 8.417857142857143, - "grad_norm": 45.969356536865234, + "grad_norm": 12.466291427612305, "learning_rate": 7.433262231636495e-07, - "loss": 8.7344, + "loss": 7.5742, "step": 2357 }, { "epoch": 8.42142857142857, - "grad_norm": 45.9980354309082, + "grad_norm": 39.8794059753418, "learning_rate": 7.4005939035999e-07, - "loss": 8.1719, + "loss": 7.4023, "step": 2358 }, { "epoch": 8.425, - "grad_norm": 40.971466064453125, + "grad_norm": 98.59400177001953, "learning_rate": 7.367991782295392e-07, - "loss": 9.5742, + "loss": 10.627, "step": 2359 }, { "epoch": 8.428571428571429, - "grad_norm": 44.0042610168457, + "grad_norm": 81.03642272949219, "learning_rate": 7.33545591839222e-07, - "loss": 8.793, + "loss": 10.6328, "step": 2360 }, { "epoch": 8.432142857142857, - "grad_norm": 54.009002685546875, + "grad_norm": 62.00534439086914, "learning_rate": 7.302986362456665e-07, - "loss": 11.6133, + "loss": 11.2422, "step": 2361 }, { "epoch": 8.435714285714285, - "grad_norm": 58.54909133911133, + "grad_norm": 861.3793334960938, "learning_rate": 7.270583164951928e-07, - "loss": 8.8555, + "loss": 12.8223, "step": 2362 }, { "epoch": 8.439285714285715, - "grad_norm": 48.46526336669922, + "grad_norm": 136.4292449951172, "learning_rate": 7.238246376238111e-07, - "loss": 7.5234, + "loss": 7.5508, "step": 2363 }, { "epoch": 8.442857142857143, - "grad_norm": 48.5256462097168, + "grad_norm": 64.97135162353516, "learning_rate": 7.205976046572083e-07, - "loss": 10.8438, + "loss": 11.6484, "step": 2364 }, { "epoch": 8.446428571428571, - "grad_norm": 65.98784637451172, + "grad_norm": 1074.9632568359375, "learning_rate": 7.173772226107434e-07, - "loss": 10.6719, + "loss": 15.0391, "step": 2365 }, { "epoch": 8.45, - "grad_norm": 53.93112564086914, + "grad_norm": 227.36944580078125, "learning_rate": 7.141634964894389e-07, - "loss": 10.0156, + "loss": 14.4336, "step": 2366 }, { "epoch": 8.45357142857143, - "grad_norm": 63.64051818847656, + "grad_norm": 558.7694702148438, "learning_rate": 7.109564312879713e-07, - "loss": 13.4453, + "loss": 19.252, "step": 2367 }, { "epoch": 8.457142857142857, - "grad_norm": 58.71651840209961, + "grad_norm": 896.8192138671875, "learning_rate": 7.077560319906696e-07, - "loss": 10.3125, + "loss": 13.6133, "step": 2368 }, { "epoch": 8.460714285714285, - "grad_norm": 56.33332443237305, + "grad_norm": 29.29842758178711, "learning_rate": 7.04562303571496e-07, - "loss": 9.3359, + "loss": 11.6406, "step": 2369 }, { "epoch": 8.464285714285714, - "grad_norm": 43.81982421875, + "grad_norm": 574.2129516601562, "learning_rate": 7.013752509940486e-07, - "loss": 9.8203, + "loss": 12.3516, "step": 2370 }, { "epoch": 8.467857142857143, - "grad_norm": 60.958404541015625, + "grad_norm": 59.9177360534668, "learning_rate": 6.981948792115511e-07, - "loss": 13.207, + "loss": 13.5586, "step": 2371 }, { "epoch": 8.471428571428572, - "grad_norm": 66.20423126220703, + "grad_norm": 227.34176635742188, "learning_rate": 6.950211931668421e-07, - "loss": 10.1719, + "loss": 9.1621, "step": 2372 }, { "epoch": 8.475, - "grad_norm": 47.49088668823242, + "grad_norm": 6.8260064125061035, "learning_rate": 6.918541977923709e-07, - "loss": 10.8906, + "loss": 11.9219, "step": 2373 }, { "epoch": 8.478571428571428, - "grad_norm": 40.73087692260742, + "grad_norm": 10.117810249328613, "learning_rate": 6.88693898010187e-07, - "loss": 8.6953, + "loss": 8.6855, "step": 2374 }, { "epoch": 8.482142857142858, - "grad_norm": 61.85139083862305, + "grad_norm": 954.2267456054688, "learning_rate": 6.855402987319348e-07, - "loss": 10.1172, + "loss": 18.5742, "step": 2375 }, { "epoch": 8.485714285714286, - "grad_norm": 51.53121566772461, + "grad_norm": 138.2713623046875, "learning_rate": 6.82393404858846e-07, - "loss": 10.8789, + "loss": 10.2285, "step": 2376 }, { "epoch": 8.489285714285714, - "grad_norm": 53.55644989013672, + "grad_norm": 13.895771026611328, "learning_rate": 6.792532212817271e-07, - "loss": 9.6523, + "loss": 10.1816, "step": 2377 }, { "epoch": 8.492857142857144, - "grad_norm": 57.04782485961914, + "grad_norm": 9.540787696838379, "learning_rate": 6.761197528809593e-07, - "loss": 8.875, + "loss": 8.918, "step": 2378 }, { "epoch": 8.496428571428572, - "grad_norm": 48.07637405395508, + "grad_norm": 319.09393310546875, "learning_rate": 6.72993004526486e-07, - "loss": 10.5664, + "loss": 13.0234, "step": 2379 }, { "epoch": 8.5, - "grad_norm": 67.14698028564453, + "grad_norm": 28.125244140625, "learning_rate": 6.698729810778065e-07, - "loss": 9.2969, + "loss": 9.6641, "step": 2380 }, { "epoch": 8.503571428571428, - "grad_norm": 49.043270111083984, + "grad_norm": 705.2562866210938, "learning_rate": 6.667596873839694e-07, - "loss": 8.7188, + "loss": 9.4336, "step": 2381 }, { "epoch": 8.507142857142856, - "grad_norm": 46.73062515258789, + "grad_norm": 841.333740234375, "learning_rate": 6.636531282835629e-07, - "loss": 9.207, + "loss": 12.457, "step": 2382 }, { "epoch": 8.510714285714286, - "grad_norm": 46.422725677490234, + "grad_norm": 21.660137176513672, "learning_rate": 6.605533086047089e-07, - "loss": 8.4922, + "loss": 9.3262, "step": 2383 }, { "epoch": 8.514285714285714, - "grad_norm": 55.6292839050293, + "grad_norm": 245.42245483398438, "learning_rate": 6.574602331650559e-07, - "loss": 12.2109, + "loss": 13.9141, "step": 2384 }, { "epoch": 8.517857142857142, - "grad_norm": 40.571006774902344, + "grad_norm": 103.29685974121094, "learning_rate": 6.543739067717681e-07, - "loss": 8.0664, + "loss": 8.8984, "step": 2385 }, { "epoch": 8.521428571428572, - "grad_norm": 54.79071044921875, + "grad_norm": 43.62729263305664, "learning_rate": 6.512943342215234e-07, - "loss": 9.2695, + "loss": 8.5957, "step": 2386 }, { "epoch": 8.525, - "grad_norm": 46.33033752441406, + "grad_norm": 190.54379272460938, "learning_rate": 6.482215203005016e-07, - "loss": 10.4844, + "loss": 12.541, "step": 2387 }, { "epoch": 8.528571428571428, - "grad_norm": 51.75707244873047, + "grad_norm": 241.35244750976562, "learning_rate": 6.451554697843798e-07, - "loss": 10.5898, + "loss": 11.4453, "step": 2388 }, { "epoch": 8.532142857142857, - "grad_norm": 54.257930755615234, + "grad_norm": 214.12347412109375, "learning_rate": 6.420961874383213e-07, - "loss": 9.9297, + "loss": 9.4062, "step": 2389 }, { "epoch": 8.535714285714286, - "grad_norm": 48.07026290893555, + "grad_norm": 67.37564086914062, "learning_rate": 6.390436780169735e-07, - "loss": 8.0391, + "loss": 7.8828, "step": 2390 }, { "epoch": 8.539285714285715, - "grad_norm": 52.937320709228516, + "grad_norm": 57.543800354003906, "learning_rate": 6.359979462644545e-07, - "loss": 8.957, + "loss": 11.2617, "step": 2391 }, { "epoch": 8.542857142857143, - "grad_norm": 52.81268310546875, + "grad_norm": 22.337646484375, "learning_rate": 6.329589969143518e-07, - "loss": 10.543, + "loss": 13.0234, "step": 2392 }, { "epoch": 8.54642857142857, - "grad_norm": 40.393184661865234, + "grad_norm": 368.0253601074219, "learning_rate": 6.299268346897086e-07, - "loss": 8.5664, + "loss": 10.1406, "step": 2393 }, { "epoch": 8.55, - "grad_norm": 62.099727630615234, + "grad_norm": 359.2391052246094, "learning_rate": 6.269014643030214e-07, - "loss": 12.0156, + "loss": 15.7344, "step": 2394 }, { "epoch": 8.553571428571429, - "grad_norm": 45.39421463012695, + "grad_norm": 22.452653884887695, "learning_rate": 6.238828904562316e-07, - "loss": 9.7148, + "loss": 9.6133, "step": 2395 }, { "epoch": 8.557142857142857, - "grad_norm": 45.35146713256836, + "grad_norm": 583.5474243164062, "learning_rate": 6.208711178407173e-07, - "loss": 8.3984, + "loss": 9.9199, "step": 2396 }, { "epoch": 8.560714285714285, - "grad_norm": 39.198909759521484, + "grad_norm": 54.376617431640625, "learning_rate": 6.178661511372858e-07, - "loss": 7.6367, + "loss": 7.2344, "step": 2397 }, { "epoch": 8.564285714285715, - "grad_norm": 61.58844757080078, + "grad_norm": 98.9025650024414, "learning_rate": 6.148679950161673e-07, - "loss": 9.1797, + "loss": 10.1562, "step": 2398 }, { "epoch": 8.567857142857143, - "grad_norm": 53.05892562866211, + "grad_norm": 54.834957122802734, "learning_rate": 6.118766541370063e-07, - "loss": 13.7969, + "loss": 17.2754, "step": 2399 }, { "epoch": 8.571428571428571, - "grad_norm": 54.340816497802734, + "grad_norm": 602.9647216796875, "learning_rate": 6.088921331488568e-07, - "loss": 12.9883, + "loss": 16.0391, "step": 2400 }, { "epoch": 8.575, - "grad_norm": 40.27375411987305, + "grad_norm": 10.308239936828613, "learning_rate": 6.059144366901737e-07, - "loss": 7.4961, + "loss": 7.0117, "step": 2401 }, { "epoch": 8.57857142857143, - "grad_norm": 46.46129608154297, + "grad_norm": 196.94036865234375, "learning_rate": 6.029435693888019e-07, - "loss": 9.4609, + "loss": 10.0352, "step": 2402 }, { "epoch": 8.582142857142857, - "grad_norm": 50.3598518371582, + "grad_norm": 1102.83984375, "learning_rate": 5.999795358619765e-07, - "loss": 9.1484, + "loss": 19.5469, "step": 2403 }, { "epoch": 8.585714285714285, - "grad_norm": 38.87482452392578, + "grad_norm": 25.647859573364258, "learning_rate": 5.9702234071631e-07, - "loss": 7.8008, + "loss": 7.8594, "step": 2404 }, { "epoch": 8.589285714285714, - "grad_norm": 72.28646850585938, + "grad_norm": 14.972293853759766, "learning_rate": 5.94071988547788e-07, - "loss": 13.5352, + "loss": 14.7773, "step": 2405 }, { "epoch": 8.592857142857143, - "grad_norm": 112.76983642578125, + "grad_norm": 54.22498321533203, "learning_rate": 5.911284839417597e-07, - "loss": 15.5703, + "loss": 31.2656, "step": 2406 }, { "epoch": 8.596428571428572, - "grad_norm": 47.89336395263672, + "grad_norm": 27.908143997192383, "learning_rate": 5.88191831472933e-07, - "loss": 8.6328, + "loss": 20.7773, "step": 2407 }, { "epoch": 8.6, - "grad_norm": 39.20845031738281, + "grad_norm": 276.22955322265625, "learning_rate": 5.852620357053651e-07, - "loss": 9.1211, + "loss": 10.8555, "step": 2408 }, { "epoch": 8.603571428571428, - "grad_norm": 56.622947692871094, + "grad_norm": 787.24853515625, "learning_rate": 5.823391011924595e-07, - "loss": 10.1367, + "loss": 12.6836, "step": 2409 }, { "epoch": 8.607142857142858, - "grad_norm": 39.68412399291992, + "grad_norm": 15.195537567138672, "learning_rate": 5.794230324769518e-07, - "loss": 8.5664, + "loss": 8.8086, "step": 2410 }, { "epoch": 8.610714285714286, - "grad_norm": 48.43465805053711, + "grad_norm": 28.00503158569336, "learning_rate": 5.765138340909105e-07, - "loss": 9.1602, + "loss": 11.9902, "step": 2411 }, { "epoch": 8.614285714285714, - "grad_norm": 55.87361145019531, + "grad_norm": 180.44029235839844, "learning_rate": 5.736115105557249e-07, - "loss": 10.3281, + "loss": 12.707, "step": 2412 }, { "epoch": 8.617857142857144, - "grad_norm": 39.29885482788086, + "grad_norm": 12.707915306091309, "learning_rate": 5.707160663821009e-07, - "loss": 8.125, + "loss": 8.4531, "step": 2413 }, { "epoch": 8.621428571428572, - "grad_norm": 54.68297576904297, + "grad_norm": 8.940814018249512, "learning_rate": 5.678275060700517e-07, - "loss": 10.8633, + "loss": 11.4883, "step": 2414 }, { "epoch": 8.625, - "grad_norm": 51.944332122802734, + "grad_norm": 72.0645980834961, "learning_rate": 5.649458341088915e-07, - "loss": 8.5898, + "loss": 7.3105, "step": 2415 }, { "epoch": 8.628571428571428, - "grad_norm": 50.578155517578125, + "grad_norm": 13.0477933883667, "learning_rate": 5.620710549772295e-07, - "loss": 9.6406, + "loss": 10.8008, "step": 2416 }, { "epoch": 8.632142857142856, - "grad_norm": 51.4583740234375, + "grad_norm": 32.3780403137207, "learning_rate": 5.592031731429631e-07, - "loss": 8.8711, + "loss": 8.2148, "step": 2417 }, { "epoch": 8.635714285714286, - "grad_norm": 47.97775650024414, + "grad_norm": 801.3470458984375, "learning_rate": 5.563421930632674e-07, - "loss": 9.1992, + "loss": 17.4219, "step": 2418 }, { "epoch": 8.639285714285714, - "grad_norm": 64.49330139160156, + "grad_norm": 48.91053009033203, "learning_rate": 5.534881191845931e-07, - "loss": 11.9609, + "loss": 12.5781, "step": 2419 }, { "epoch": 8.642857142857142, - "grad_norm": 60.94121170043945, + "grad_norm": 33.87311553955078, "learning_rate": 5.506409559426573e-07, - "loss": 9.9648, + "loss": 8.3555, "step": 2420 }, { "epoch": 8.646428571428572, - "grad_norm": 45.92329025268555, + "grad_norm": 280.30401611328125, "learning_rate": 5.47800707762437e-07, - "loss": 8.6602, + "loss": 10.3203, "step": 2421 }, { "epoch": 8.65, - "grad_norm": 48.250328063964844, + "grad_norm": 750.8541259765625, "learning_rate": 5.449673790581611e-07, - "loss": 9.3594, + "loss": 12.8828, "step": 2422 }, { "epoch": 8.653571428571428, - "grad_norm": 48.30483627319336, + "grad_norm": 12.970129013061523, "learning_rate": 5.42140974233305e-07, - "loss": 10.7227, + "loss": 12.2148, "step": 2423 }, { "epoch": 8.657142857142857, - "grad_norm": 48.42847442626953, + "grad_norm": 29.23980140686035, "learning_rate": 5.393214976805833e-07, - "loss": 10.6055, + "loss": 13.0332, "step": 2424 }, { "epoch": 8.660714285714286, - "grad_norm": 62.04526138305664, + "grad_norm": 114.9522476196289, "learning_rate": 5.365089537819435e-07, - "loss": 9.3711, + "loss": 9.6875, "step": 2425 }, { "epoch": 8.664285714285715, - "grad_norm": 64.88182067871094, + "grad_norm": 11.862305641174316, "learning_rate": 5.337033469085562e-07, - "loss": 14.9805, + "loss": 17.0664, "step": 2426 }, { "epoch": 8.667857142857143, - "grad_norm": 50.4766845703125, + "grad_norm": 57.975406646728516, "learning_rate": 5.30904681420813e-07, - "loss": 8.6953, + "loss": 10.7031, "step": 2427 }, { "epoch": 8.67142857142857, - "grad_norm": 48.91423797607422, + "grad_norm": 46.36818313598633, "learning_rate": 5.281129616683167e-07, - "loss": 8.5742, + "loss": 8.334, "step": 2428 }, { "epoch": 8.675, - "grad_norm": 43.79597091674805, + "grad_norm": 1208.3858642578125, "learning_rate": 5.253281919898751e-07, - "loss": 9.3398, + "loss": 21.248, "step": 2429 }, { "epoch": 8.678571428571429, - "grad_norm": 50.256996154785156, + "grad_norm": 919.2557983398438, "learning_rate": 5.225503767134954e-07, - "loss": 10.8203, + "loss": 16.0469, "step": 2430 }, { "epoch": 8.682142857142857, - "grad_norm": 59.977264404296875, + "grad_norm": 133.0084228515625, "learning_rate": 5.197795201563744e-07, - "loss": 13.6445, + "loss": 15.4238, "step": 2431 }, { "epoch": 8.685714285714285, - "grad_norm": 49.42707443237305, + "grad_norm": 911.8910522460938, "learning_rate": 5.17015626624896e-07, - "loss": 9.0, + "loss": 13.4941, "step": 2432 }, { "epoch": 8.689285714285715, - "grad_norm": 65.9214859008789, + "grad_norm": 148.4451904296875, "learning_rate": 5.142587004146216e-07, - "loss": 10.0234, + "loss": 8.543, "step": 2433 }, { "epoch": 8.692857142857143, - "grad_norm": 52.784114837646484, + "grad_norm": 464.9112854003906, "learning_rate": 5.115087458102841e-07, - "loss": 7.8438, + "loss": 9.9297, "step": 2434 }, { "epoch": 8.696428571428571, - "grad_norm": 51.3972053527832, + "grad_norm": 14.94640827178955, "learning_rate": 5.087657670857799e-07, - "loss": 11.4375, + "loss": 10.3828, "step": 2435 }, { "epoch": 8.7, - "grad_norm": 46.40514373779297, + "grad_norm": 465.3719787597656, "learning_rate": 5.06029768504166e-07, - "loss": 11.2578, + "loss": 13.2773, "step": 2436 }, { "epoch": 8.70357142857143, - "grad_norm": 52.82453155517578, + "grad_norm": 20.6884765625, "learning_rate": 5.0330075431765e-07, - "loss": 9.6328, + "loss": 11.1484, "step": 2437 }, { "epoch": 8.707142857142857, - "grad_norm": 53.92339324951172, + "grad_norm": 184.6486053466797, "learning_rate": 5.005787287675851e-07, - "loss": 11.8359, + "loss": 22.8809, "step": 2438 }, { "epoch": 8.710714285714285, - "grad_norm": 50.849098205566406, + "grad_norm": 19.681598663330078, "learning_rate": 4.978636960844618e-07, - "loss": 7.8281, + "loss": 7.1992, "step": 2439 }, { "epoch": 8.714285714285714, - "grad_norm": 43.19618606567383, + "grad_norm": 93.22776794433594, "learning_rate": 4.951556604879049e-07, - "loss": 9.2773, + "loss": 10.0625, "step": 2440 }, { "epoch": 8.717857142857143, - "grad_norm": 49.05924606323242, + "grad_norm": 910.7708129882812, "learning_rate": 4.924546261866609e-07, - "loss": 11.2734, + "loss": 15.3281, "step": 2441 }, { "epoch": 8.721428571428572, - "grad_norm": 45.482826232910156, + "grad_norm": 84.64936828613281, "learning_rate": 4.897605973785996e-07, - "loss": 8.7695, + "loss": 9.1445, "step": 2442 }, { "epoch": 8.725, - "grad_norm": 47.116485595703125, + "grad_norm": 106.28523254394531, "learning_rate": 4.87073578250698e-07, - "loss": 9.9609, + "loss": 12.1445, "step": 2443 }, { "epoch": 8.728571428571428, - "grad_norm": 64.68405151367188, + "grad_norm": 34.985023498535156, "learning_rate": 4.843935729790422e-07, - "loss": 10.7969, + "loss": 22.8555, "step": 2444 }, { "epoch": 8.732142857142858, - "grad_norm": 49.90243911743164, + "grad_norm": 12.875163078308105, "learning_rate": 4.817205857288176e-07, - "loss": 10.4492, + "loss": 12.209, "step": 2445 }, { "epoch": 8.735714285714286, - "grad_norm": 40.94037628173828, + "grad_norm": 6.5134148597717285, "learning_rate": 4.790546206542995e-07, - "loss": 7.3281, + "loss": 7.4023, "step": 2446 }, { "epoch": 8.739285714285714, - "grad_norm": 46.76800537109375, + "grad_norm": 558.9470825195312, "learning_rate": 4.7639568189885464e-07, - "loss": 8.4805, + "loss": 9.6953, "step": 2447 }, { "epoch": 8.742857142857144, - "grad_norm": 52.823848724365234, + "grad_norm": 143.85272216796875, "learning_rate": 4.737437735949263e-07, - "loss": 11.6562, + "loss": 13.2578, "step": 2448 }, { "epoch": 8.746428571428572, - "grad_norm": 70.87256622314453, + "grad_norm": 69.50762176513672, "learning_rate": 4.710988998640298e-07, - "loss": 8.0547, + "loss": 7.2109, "step": 2449 }, { "epoch": 8.75, - "grad_norm": 55.778106689453125, + "grad_norm": 43.12874221801758, "learning_rate": 4.6846106481675035e-07, - "loss": 8.7227, + "loss": 9.623, "step": 2450 }, { "epoch": 8.753571428571428, - "grad_norm": 53.882808685302734, + "grad_norm": 57.11558151245117, "learning_rate": 4.658302725527325e-07, - "loss": 9.0039, + "loss": 7.7168, "step": 2451 }, { "epoch": 8.757142857142856, - "grad_norm": 54.96946334838867, + "grad_norm": 136.1124725341797, "learning_rate": 4.632065271606756e-07, - "loss": 10.3398, + "loss": 10.4297, "step": 2452 }, { "epoch": 8.760714285714286, - "grad_norm": 44.43289566040039, + "grad_norm": 174.9008331298828, "learning_rate": 4.6058983271832724e-07, - "loss": 8.0781, + "loss": 8.7832, "step": 2453 }, { "epoch": 8.764285714285714, - "grad_norm": 46.3505859375, + "grad_norm": 653.7610473632812, "learning_rate": 4.57980193292476e-07, - "loss": 10.2773, + "loss": 20.5703, "step": 2454 }, { "epoch": 8.767857142857142, - "grad_norm": 42.9606819152832, + "grad_norm": 604.0109252929688, "learning_rate": 4.5537761293894535e-07, - "loss": 10.6836, + "loss": 13.1328, "step": 2455 }, { "epoch": 8.771428571428572, - "grad_norm": 59.98224639892578, + "grad_norm": 8.009281158447266, "learning_rate": 4.5278209570258914e-07, - "loss": 8.6523, + "loss": 10.0547, "step": 2456 }, { "epoch": 8.775, - "grad_norm": 46.981685638427734, + "grad_norm": 6.546170711517334, "learning_rate": 4.501936456172845e-07, - "loss": 8.8281, + "loss": 8.5703, "step": 2457 }, { "epoch": 8.778571428571428, - "grad_norm": 55.56366729736328, + "grad_norm": 776.0973510742188, "learning_rate": 4.4761226670592074e-07, - "loss": 11.2852, + "loss": 15.0527, "step": 2458 }, { "epoch": 8.782142857142857, - "grad_norm": 56.40507888793945, + "grad_norm": 11.404443740844727, "learning_rate": 4.450379629804019e-07, - "loss": 10.3242, + "loss": 10.5, "step": 2459 }, { "epoch": 8.785714285714286, - "grad_norm": 63.77943801879883, + "grad_norm": 308.001708984375, "learning_rate": 4.4247073844163434e-07, - "loss": 10.918, + "loss": 11.8789, "step": 2460 }, { "epoch": 8.789285714285715, - "grad_norm": 61.42527389526367, + "grad_norm": 29.64400863647461, "learning_rate": 4.39910597079522e-07, - "loss": 7.332, + "loss": 6.459, "step": 2461 }, { "epoch": 8.792857142857143, - "grad_norm": 59.541114807128906, + "grad_norm": 64.83880615234375, "learning_rate": 4.3735754287296097e-07, - "loss": 11.0859, + "loss": 11.7578, "step": 2462 }, { "epoch": 8.79642857142857, - "grad_norm": 36.464088439941406, + "grad_norm": 294.67413330078125, "learning_rate": 4.3481157978983167e-07, - "loss": 7.832, + "loss": 9.1191, "step": 2463 }, { "epoch": 8.8, - "grad_norm": 56.44016647338867, + "grad_norm": 154.94874572753906, "learning_rate": 4.322727117869951e-07, - "loss": 8.1094, + "loss": 7.2891, "step": 2464 }, { "epoch": 8.803571428571429, - "grad_norm": 46.23824691772461, + "grad_norm": 92.73117065429688, "learning_rate": 4.29740942810285e-07, - "loss": 8.1211, + "loss": 8.0742, "step": 2465 }, { "epoch": 8.807142857142857, - "grad_norm": 54.77274703979492, + "grad_norm": 762.2399291992188, "learning_rate": 4.2721627679449983e-07, - "loss": 8.7773, + "loss": 9.0352, "step": 2466 }, { "epoch": 8.810714285714285, - "grad_norm": 45.37139129638672, + "grad_norm": 118.10548400878906, "learning_rate": 4.2469871766340096e-07, - "loss": 9.6016, + "loss": 8.125, "step": 2467 }, { "epoch": 8.814285714285715, - "grad_norm": 42.83951950073242, + "grad_norm": 96.16119384765625, "learning_rate": 4.221882693297047e-07, - "loss": 8.4688, + "loss": 8.0938, "step": 2468 }, { "epoch": 8.817857142857143, - "grad_norm": 42.5964469909668, + "grad_norm": 211.6372528076172, "learning_rate": 4.1968493569507494e-07, - "loss": 9.7188, + "loss": 11.0352, "step": 2469 }, { "epoch": 8.821428571428571, - "grad_norm": 59.95365524291992, + "grad_norm": 6.466601848602295, "learning_rate": 4.171887206501191e-07, - "loss": 8.1406, + "loss": 7.8164, "step": 2470 }, { "epoch": 8.825, - "grad_norm": 53.62880325317383, + "grad_norm": 32.10063171386719, "learning_rate": 4.146996280743798e-07, - "loss": 11.1289, + "loss": 10.8906, "step": 2471 }, { "epoch": 8.82857142857143, - "grad_norm": 62.7945442199707, + "grad_norm": 505.868896484375, "learning_rate": 4.122176618363305e-07, - "loss": 9.0195, + "loss": 10.0664, "step": 2472 }, { "epoch": 8.832142857142857, - "grad_norm": 60.71460723876953, + "grad_norm": 89.0667495727539, "learning_rate": 4.0974282579337166e-07, - "loss": 9.8359, + "loss": 10.627, "step": 2473 }, { "epoch": 8.835714285714285, - "grad_norm": 42.633480072021484, + "grad_norm": 30.454980850219727, "learning_rate": 4.0727512379181653e-07, - "loss": 7.7617, + "loss": 8.6777, "step": 2474 }, { "epoch": 8.839285714285714, - "grad_norm": 51.47814178466797, + "grad_norm": 19.218618392944336, "learning_rate": 4.0481455966689673e-07, - "loss": 10.25, + "loss": 9.6484, "step": 2475 }, { "epoch": 8.842857142857143, - "grad_norm": 38.30928421020508, + "grad_norm": 19.977582931518555, "learning_rate": 4.0236113724274716e-07, - "loss": 8.2227, + "loss": 9.0098, "step": 2476 }, { "epoch": 8.846428571428572, - "grad_norm": 68.03428649902344, + "grad_norm": 89.82562255859375, "learning_rate": 3.9991486033240377e-07, - "loss": 11.0352, + "loss": 11.9941, "step": 2477 }, { "epoch": 8.85, - "grad_norm": 58.012454986572266, + "grad_norm": 16.9561767578125, "learning_rate": 3.9747573273779816e-07, - "loss": 12.3516, + "loss": 12.5195, "step": 2478 }, { "epoch": 8.853571428571428, - "grad_norm": 45.58708953857422, + "grad_norm": 95.6129379272461, "learning_rate": 3.9504375824975015e-07, - "loss": 8.6758, + "loss": 8.623, "step": 2479 }, { "epoch": 8.857142857142858, - "grad_norm": 46.21669006347656, + "grad_norm": 78.0531997680664, "learning_rate": 3.9261894064796136e-07, - "loss": 7.8086, + "loss": 6.6172, "step": 2480 }, { "epoch": 8.860714285714286, - "grad_norm": 51.47406005859375, + "grad_norm": 19.798473358154297, "learning_rate": 3.902012837010133e-07, - "loss": 9.8203, + "loss": 9.4336, "step": 2481 }, { "epoch": 8.864285714285714, - "grad_norm": 52.84143829345703, + "grad_norm": 669.5113525390625, "learning_rate": 3.877907911663542e-07, - "loss": 11.0039, + "loss": 20.8848, "step": 2482 }, { "epoch": 8.867857142857144, - "grad_norm": 45.360084533691406, + "grad_norm": 93.54376220703125, "learning_rate": 3.853874667903007e-07, - "loss": 9.2266, + "loss": 10.1562, "step": 2483 }, { "epoch": 8.871428571428572, - "grad_norm": 50.123023986816406, + "grad_norm": 55.845882415771484, "learning_rate": 3.8299131430802826e-07, - "loss": 10.2031, + "loss": 11.4336, "step": 2484 }, { "epoch": 8.875, - "grad_norm": 44.718753814697266, + "grad_norm": 71.59416961669922, "learning_rate": 3.8060233744356634e-07, - "loss": 9.8516, + "loss": 10.583, "step": 2485 }, { "epoch": 8.878571428571428, - "grad_norm": 41.18428421020508, + "grad_norm": 13.114646911621094, "learning_rate": 3.782205399097916e-07, - "loss": 8.5391, + "loss": 9.6406, "step": 2486 }, { "epoch": 8.882142857142856, - "grad_norm": 46.17993927001953, + "grad_norm": 52.71579360961914, "learning_rate": 3.7584592540842347e-07, - "loss": 8.0742, + "loss": 6.3105, "step": 2487 }, { "epoch": 8.885714285714286, - "grad_norm": 50.75123977661133, + "grad_norm": 800.2318115234375, "learning_rate": 3.734784976300165e-07, - "loss": 9.8477, + "loss": 17.5078, "step": 2488 }, { "epoch": 8.889285714285714, - "grad_norm": 62.55894088745117, + "grad_norm": 88.66197204589844, "learning_rate": 3.7111826025395704e-07, - "loss": 6.5195, + "loss": 5.9355, "step": 2489 }, { "epoch": 8.892857142857142, - "grad_norm": 62.54826354980469, + "grad_norm": 585.4476928710938, "learning_rate": 3.687652169484568e-07, - "loss": 8.0117, + "loss": 8.7344, "step": 2490 }, { "epoch": 8.896428571428572, - "grad_norm": 44.3135871887207, + "grad_norm": 39.110130310058594, "learning_rate": 3.6641937137054384e-07, - "loss": 8.5703, + "loss": 9.0449, "step": 2491 }, { "epoch": 8.9, - "grad_norm": 43.17268371582031, + "grad_norm": 23.840309143066406, "learning_rate": 3.6408072716606346e-07, - "loss": 10.7031, + "loss": 11.8945, "step": 2492 }, { "epoch": 8.903571428571428, - "grad_norm": 46.37319564819336, + "grad_norm": 251.6735076904297, "learning_rate": 3.6174928796966603e-07, - "loss": 9.6289, + "loss": 10.7344, "step": 2493 }, { "epoch": 8.907142857142857, - "grad_norm": 45.425071716308594, + "grad_norm": 162.06234741210938, "learning_rate": 3.5942505740480583e-07, - "loss": 9.832, + "loss": 12.8047, "step": 2494 }, { "epoch": 8.910714285714286, - "grad_norm": 42.32011032104492, + "grad_norm": 900.5184326171875, "learning_rate": 3.5710803908373226e-07, - "loss": 8.5156, + "loss": 14.0732, "step": 2495 }, { "epoch": 8.914285714285715, - "grad_norm": 58.855777740478516, + "grad_norm": 967.1931762695312, "learning_rate": 3.5479823660748703e-07, - "loss": 7.5781, + "loss": 11.0801, "step": 2496 }, { "epoch": 8.917857142857143, - "grad_norm": 53.78789520263672, + "grad_norm": 14.505859375, "learning_rate": 3.5249565356589633e-07, - "loss": 7.5781, + "loss": 7.5293, "step": 2497 }, { "epoch": 8.92142857142857, - "grad_norm": 44.720130920410156, + "grad_norm": 377.90484619140625, "learning_rate": 3.5020029353756703e-07, - "loss": 9.9922, + "loss": 13.3906, "step": 2498 }, { "epoch": 8.925, - "grad_norm": 44.32720184326172, + "grad_norm": 124.92404174804688, "learning_rate": 3.479121600898777e-07, - "loss": 10.6484, + "loss": 11.1621, "step": 2499 }, { "epoch": 8.928571428571429, - "grad_norm": 55.57945251464844, + "grad_norm": 39.82273483276367, "learning_rate": 3.4563125677897936e-07, - "loss": 9.5469, + "loss": 9.6816, "step": 2500 }, { "epoch": 8.932142857142857, - "grad_norm": 44.34351348876953, + "grad_norm": 40.38886642456055, "learning_rate": 3.4335758714978296e-07, - "loss": 8.082, + "loss": 7.7852, "step": 2501 }, { "epoch": 8.935714285714285, - "grad_norm": 46.56104278564453, + "grad_norm": 20.37328338623047, "learning_rate": 3.4109115473595855e-07, - "loss": 9.3242, + "loss": 9.8594, "step": 2502 }, { "epoch": 8.939285714285715, - "grad_norm": 39.67842102050781, + "grad_norm": 52.380043029785156, "learning_rate": 3.3883196305992906e-07, - "loss": 7.1328, + "loss": 7.7129, "step": 2503 }, { "epoch": 8.942857142857143, - "grad_norm": 44.03419494628906, + "grad_norm": 24.132450103759766, "learning_rate": 3.365800156328619e-07, - "loss": 7.9492, + "loss": 7.7734, "step": 2504 }, { "epoch": 8.946428571428571, - "grad_norm": 41.56732940673828, + "grad_norm": 25.800006866455078, "learning_rate": 3.343353159546675e-07, - "loss": 9.5625, + "loss": 10.3281, "step": 2505 }, { "epoch": 8.95, - "grad_norm": 40.29929733276367, + "grad_norm": 8.744038581848145, "learning_rate": 3.320978675139919e-07, - "loss": 7.9727, + "loss": 7.3789, "step": 2506 }, { "epoch": 8.95357142857143, - "grad_norm": 40.09354782104492, + "grad_norm": 1170.8529052734375, "learning_rate": 3.2986767378821006e-07, - "loss": 8.7734, + "loss": 19.0889, "step": 2507 }, { "epoch": 8.957142857142857, - "grad_norm": 46.98265075683594, + "grad_norm": 51.611263275146484, "learning_rate": 3.276447382434228e-07, - "loss": 10.6172, + "loss": 14.8242, "step": 2508 }, { "epoch": 8.960714285714285, - "grad_norm": 42.58259963989258, + "grad_norm": 11.201155662536621, "learning_rate": 3.2542906433445156e-07, - "loss": 8.4727, + "loss": 8.7344, "step": 2509 }, { "epoch": 8.964285714285714, - "grad_norm": 53.640567779541016, + "grad_norm": 66.88361358642578, "learning_rate": 3.2322065550483005e-07, - "loss": 11.9766, + "loss": 13.0449, "step": 2510 }, { "epoch": 8.967857142857143, - "grad_norm": 52.04290008544922, + "grad_norm": 314.9934387207031, "learning_rate": 3.210195151868017e-07, - "loss": 8.2383, + "loss": 11.0371, "step": 2511 }, { "epoch": 8.971428571428572, - "grad_norm": 52.985050201416016, + "grad_norm": 9.268099784851074, "learning_rate": 3.18825646801314e-07, - "loss": 10.1484, + "loss": 10.9688, "step": 2512 }, { "epoch": 8.975, - "grad_norm": 41.34638214111328, + "grad_norm": 18.042388916015625, "learning_rate": 3.166390537580122e-07, - "loss": 7.293, + "loss": 5.6523, "step": 2513 }, { "epoch": 8.978571428571428, - "grad_norm": 42.72998046875, + "grad_norm": 18.932697296142578, "learning_rate": 3.14459739455234e-07, - "loss": 8.1172, + "loss": 7.7031, "step": 2514 }, { "epoch": 8.982142857142858, - "grad_norm": 40.73484802246094, + "grad_norm": 47.11307144165039, "learning_rate": 3.122877072800046e-07, - "loss": 8.3398, + "loss": 8.2344, "step": 2515 }, { "epoch": 8.985714285714286, - "grad_norm": 58.4935302734375, + "grad_norm": 295.3365173339844, "learning_rate": 3.101229606080319e-07, - "loss": 7.9609, + "loss": 21.4922, "step": 2516 }, { "epoch": 8.989285714285714, - "grad_norm": 52.428558349609375, + "grad_norm": 13.117593765258789, "learning_rate": 3.079655028037015e-07, - "loss": 8.9375, + "loss": 7.5371, "step": 2517 }, { "epoch": 8.992857142857144, - "grad_norm": 50.213985443115234, + "grad_norm": 535.00732421875, "learning_rate": 3.0581533722006953e-07, - "loss": 12.6875, + "loss": 13.4922, "step": 2518 }, { "epoch": 8.996428571428572, - "grad_norm": 41.41427993774414, + "grad_norm": 958.2476196289062, "learning_rate": 3.0367246719886054e-07, - "loss": 7.0977, + "loss": 12.2305, "step": 2519 }, { "epoch": 9.0, - "grad_norm": 48.054168701171875, + "grad_norm": 1008.5611572265625, "learning_rate": 3.015368960704584e-07, - "loss": 8.0195, + "loss": 11.9023, "step": 2520 }, { "epoch": 9.0, - "eval_loss": 9.71439266204834, - "eval_mse": 9.7161248810923, - "eval_runtime": 11.5864, - "eval_samples_per_second": 245.115, - "eval_steps_per_second": 1.295, - "eval_target_0_mse": 18.510245775039927, - "eval_target_1_mse": 9.987437885081768, - "eval_target_2_mse": 5.2931354023841255, - "eval_target_3_mse": 5.073680461863381, + "eval_loss": 12.644394874572754, + "eval_mse": 12.638826460616018, + "eval_runtime": 10.8858, + "eval_samples_per_second": 260.889, + "eval_steps_per_second": 1.378, + "eval_target_0_mse": 37.87047207774002, + "eval_target_1_mse": 8.894393744909102, + "eval_target_2_mse": 2.5619325220937483, + "eval_target_3_mse": 1.2285074977212156, "step": 2520 }, { "epoch": 9.003571428571428, - "grad_norm": 52.31313705444336, + "grad_norm": 601.1650390625, "learning_rate": 2.9940862715390483e-07, - "loss": 9.6133, + "loss": 11.3379, "step": 2521 }, { "epoch": 9.007142857142858, - "grad_norm": 46.9532356262207, + "grad_norm": 28.371047973632812, "learning_rate": 2.972876637568922e-07, - "loss": 10.0273, + "loss": 10.2715, "step": 2522 }, { "epoch": 9.010714285714286, - "grad_norm": 62.473297119140625, + "grad_norm": 454.1785583496094, "learning_rate": 2.9517400917575987e-07, - "loss": 13.7812, + "loss": 16.707, "step": 2523 }, { "epoch": 9.014285714285714, - "grad_norm": 47.23248291015625, + "grad_norm": 58.796085357666016, "learning_rate": 2.930676666954846e-07, - "loss": 9.0352, + "loss": 9.3545, "step": 2524 }, { "epoch": 9.017857142857142, - "grad_norm": 49.81340026855469, + "grad_norm": 1427.8272705078125, "learning_rate": 2.909686395896827e-07, - "loss": 10.2266, + "loss": 24.5977, "step": 2525 }, { "epoch": 9.021428571428572, - "grad_norm": 62.12689208984375, + "grad_norm": 40.29719543457031, "learning_rate": 2.8887693112060025e-07, - "loss": 10.0078, + "loss": 9.3096, "step": 2526 }, { "epoch": 9.025, - "grad_norm": 65.71543884277344, + "grad_norm": 1031.97314453125, "learning_rate": 2.867925445391079e-07, - "loss": 13.5039, + "loss": 18.8555, "step": 2527 }, { "epoch": 9.028571428571428, - "grad_norm": 44.61238098144531, + "grad_norm": 13.338691711425781, "learning_rate": 2.847154830846971e-07, - "loss": 10.0195, + "loss": 10.1797, "step": 2528 }, { "epoch": 9.032142857142857, - "grad_norm": 39.20927429199219, + "grad_norm": 287.8734130859375, "learning_rate": 2.8264574998547677e-07, - "loss": 7.5195, + "loss": 7.0469, "step": 2529 }, { "epoch": 9.035714285714286, - "grad_norm": 45.33678436279297, + "grad_norm": 274.1270446777344, "learning_rate": 2.8058334845816214e-07, - "loss": 8.2656, + "loss": 10.1406, "step": 2530 }, { "epoch": 9.039285714285715, - "grad_norm": 75.27999877929688, + "grad_norm": 1371.7564697265625, "learning_rate": 2.785282817080781e-07, - "loss": 11.1484, + "loss": 18.748, "step": 2531 }, { "epoch": 9.042857142857143, - "grad_norm": 75.00166320800781, + "grad_norm": 45.59075927734375, "learning_rate": 2.7648055292914754e-07, - "loss": 9.4219, + "loss": 10.9766, "step": 2532 }, { "epoch": 9.04642857142857, - "grad_norm": 45.58433151245117, + "grad_norm": 66.10751342773438, "learning_rate": 2.744401653038903e-07, - "loss": 9.1641, + "loss": 9.1602, "step": 2533 }, { "epoch": 9.05, - "grad_norm": 51.655757904052734, + "grad_norm": 920.0810546875, "learning_rate": 2.724071220034158e-07, - "loss": 9.418, + "loss": 13.4316, "step": 2534 }, { "epoch": 9.053571428571429, - "grad_norm": 51.13249969482422, + "grad_norm": 844.2461547851562, "learning_rate": 2.703814261874199e-07, - "loss": 10.4531, + "loss": 14.2949, "step": 2535 }, { "epoch": 9.057142857142857, - "grad_norm": 41.28540802001953, + "grad_norm": 68.35519409179688, "learning_rate": 2.6836308100417874e-07, - "loss": 8.832, + "loss": 9.1113, "step": 2536 }, { "epoch": 9.060714285714285, - "grad_norm": 50.0010871887207, + "grad_norm": 790.4164428710938, "learning_rate": 2.6635208959054524e-07, - "loss": 8.9258, + "loss": 13.0703, "step": 2537 }, { "epoch": 9.064285714285715, - "grad_norm": 48.82689666748047, + "grad_norm": 1069.7969970703125, "learning_rate": 2.6434845507194106e-07, - "loss": 11.7461, + "loss": 17.1133, "step": 2538 }, { "epoch": 9.067857142857143, - "grad_norm": 49.200225830078125, + "grad_norm": 6.733631610870361, "learning_rate": 2.6235218056235633e-07, - "loss": 12.2539, + "loss": 12.9492, "step": 2539 }, { "epoch": 9.071428571428571, - "grad_norm": 50.13022232055664, + "grad_norm": 19.122325897216797, "learning_rate": 2.6036326916434153e-07, - "loss": 7.6484, + "loss": 7.5566, "step": 2540 }, { "epoch": 9.075, - "grad_norm": 43.01034164428711, + "grad_norm": 1098.03466796875, "learning_rate": 2.583817239690034e-07, - "loss": 9.3633, + "loss": 14.2373, "step": 2541 }, { "epoch": 9.07857142857143, - "grad_norm": 54.12001419067383, + "grad_norm": 564.9404296875, "learning_rate": 2.564075480560013e-07, - "loss": 9.1211, + "loss": 11.2891, "step": 2542 }, { "epoch": 9.082142857142857, - "grad_norm": 48.57778549194336, + "grad_norm": 360.08343505859375, "learning_rate": 2.544407444935404e-07, - "loss": 8.7383, + "loss": 8.5625, "step": 2543 }, { "epoch": 9.085714285714285, - "grad_norm": 66.57128143310547, + "grad_norm": 477.360595703125, "learning_rate": 2.524813163383683e-07, - "loss": 8.1445, + "loss": 7.7129, "step": 2544 }, { "epoch": 9.089285714285714, - "grad_norm": 52.538856506347656, + "grad_norm": 110.55958557128906, "learning_rate": 2.5052926663577006e-07, - "loss": 11.2852, + "loss": 10.2773, "step": 2545 }, { "epoch": 9.092857142857143, - "grad_norm": 51.35068893432617, + "grad_norm": 19.09769630432129, "learning_rate": 2.485845984195645e-07, - "loss": 8.2656, + "loss": 8.5605, "step": 2546 }, { "epoch": 9.096428571428572, - "grad_norm": 43.49100875854492, + "grad_norm": 79.77286529541016, "learning_rate": 2.4664731471209515e-07, - "loss": 8.543, + "loss": 6.5918, "step": 2547 }, { "epoch": 9.1, - "grad_norm": 47.865020751953125, + "grad_norm": 98.95609283447266, "learning_rate": 2.447174185242324e-07, - "loss": 9.5352, + "loss": 11.1035, "step": 2548 }, { "epoch": 9.103571428571428, - "grad_norm": 50.348758697509766, + "grad_norm": 45.28377914428711, "learning_rate": 2.4279491285536304e-07, - "loss": 9.1992, + "loss": 11.1152, "step": 2549 }, { "epoch": 9.107142857142858, - "grad_norm": 56.40528869628906, + "grad_norm": 33.59952163696289, "learning_rate": 2.4087980069338825e-07, - "loss": 10.4883, + "loss": 12.5039, "step": 2550 }, { "epoch": 9.110714285714286, - "grad_norm": 55.5882568359375, + "grad_norm": 267.3100891113281, "learning_rate": 2.389720850147181e-07, - "loss": 10.1406, + "loss": 11.1289, "step": 2551 }, { "epoch": 9.114285714285714, - "grad_norm": 56.1557731628418, + "grad_norm": 874.5932006835938, "learning_rate": 2.3707176878426886e-07, - "loss": 9.4414, + "loss": 12.8945, "step": 2552 }, { "epoch": 9.117857142857142, - "grad_norm": 44.639530181884766, + "grad_norm": 14.682652473449707, "learning_rate": 2.3517885495545456e-07, - "loss": 7.2695, + "loss": 7.9092, "step": 2553 }, { "epoch": 9.121428571428572, - "grad_norm": 51.20123291015625, + "grad_norm": 694.3779296875, "learning_rate": 2.3329334647018696e-07, - "loss": 10.5156, + "loss": 12.5176, "step": 2554 }, { "epoch": 9.125, - "grad_norm": 51.325965881347656, + "grad_norm": 42.64570999145508, "learning_rate": 2.314152462588659e-07, - "loss": 8.7539, + "loss": 9.4102, "step": 2555 }, { "epoch": 9.128571428571428, - "grad_norm": 64.40094757080078, + "grad_norm": 102.675048828125, "learning_rate": 2.2954455724037873e-07, - "loss": 15.1758, + "loss": 16.9961, "step": 2556 }, { "epoch": 9.132142857142858, - "grad_norm": 41.444000244140625, + "grad_norm": 54.40444564819336, "learning_rate": 2.276812823220964e-07, - "loss": 9.0312, + "loss": 11.1484, "step": 2557 }, { "epoch": 9.135714285714286, - "grad_norm": 55.89292526245117, + "grad_norm": 456.921875, "learning_rate": 2.2582542439986422e-07, - "loss": 11.2461, + "loss": 15.7031, "step": 2558 }, { "epoch": 9.139285714285714, - "grad_norm": 45.91322326660156, + "grad_norm": 45.67864990234375, "learning_rate": 2.2397698635800157e-07, - "loss": 8.6289, + "loss": 7.9473, "step": 2559 }, { "epoch": 9.142857142857142, - "grad_norm": 48.67218780517578, + "grad_norm": 901.0260620117188, "learning_rate": 2.2213597106929608e-07, - "loss": 7.7383, + "loss": 13.6094, "step": 2560 }, { "epoch": 9.146428571428572, - "grad_norm": 39.61361312866211, + "grad_norm": 19.360647201538086, "learning_rate": 2.2030238139499948e-07, - "loss": 7.7695, + "loss": 8.0625, "step": 2561 }, { "epoch": 9.15, - "grad_norm": 43.84793472290039, + "grad_norm": 104.29110717773438, "learning_rate": 2.1847622018482283e-07, - "loss": 9.0156, + "loss": 10.5664, "step": 2562 }, { "epoch": 9.153571428571428, - "grad_norm": 51.2229118347168, + "grad_norm": 422.1026916503906, "learning_rate": 2.1665749027693028e-07, - "loss": 11.3828, + "loss": 14.2656, "step": 2563 }, { "epoch": 9.157142857142857, - "grad_norm": 60.32124710083008, + "grad_norm": 378.78668212890625, "learning_rate": 2.1484619449793854e-07, - "loss": 14.7188, + "loss": 18.4512, "step": 2564 }, { "epoch": 9.160714285714286, - "grad_norm": 54.86122512817383, + "grad_norm": 186.12266540527344, "learning_rate": 2.1304233566290967e-07, - "loss": 13.8789, + "loss": 16.6836, "step": 2565 }, { "epoch": 9.164285714285715, - "grad_norm": 53.004676818847656, + "grad_norm": 67.48603820800781, "learning_rate": 2.1124591657534776e-07, - "loss": 9.2305, + "loss": 10.0215, "step": 2566 }, { "epoch": 9.167857142857143, - "grad_norm": 57.1533203125, + "grad_norm": 646.451416015625, "learning_rate": 2.094569400271934e-07, - "loss": 8.8555, + "loss": 19.2168, "step": 2567 }, { "epoch": 9.17142857142857, - "grad_norm": 40.81446838378906, + "grad_norm": 122.65337371826172, "learning_rate": 2.0767540879882143e-07, - "loss": 8.1875, + "loss": 17.5625, "step": 2568 }, { "epoch": 9.175, - "grad_norm": 54.36885452270508, + "grad_norm": 11.131246566772461, "learning_rate": 2.0590132565903475e-07, - "loss": 10.6133, + "loss": 11.7207, "step": 2569 }, { "epoch": 9.178571428571429, - "grad_norm": 53.11064529418945, + "grad_norm": 406.8164978027344, "learning_rate": 2.041346933650612e-07, - "loss": 8.9414, + "loss": 10.3203, "step": 2570 }, { "epoch": 9.182142857142857, - "grad_norm": 48.494049072265625, + "grad_norm": 27.56963348388672, "learning_rate": 2.0237551466254668e-07, - "loss": 9.7578, + "loss": 11.2461, "step": 2571 }, { "epoch": 9.185714285714285, - "grad_norm": 62.527862548828125, + "grad_norm": 780.7577514648438, "learning_rate": 2.006237922855553e-07, - "loss": 11.1289, + "loss": 14.9102, "step": 2572 }, { "epoch": 9.189285714285715, - "grad_norm": 66.7544174194336, + "grad_norm": 25.701181411743164, "learning_rate": 1.9887952895656204e-07, - "loss": 11.1523, + "loss": 10.0352, "step": 2573 }, { "epoch": 9.192857142857143, - "grad_norm": 53.17273712158203, + "grad_norm": 26.210281372070312, "learning_rate": 1.9714272738644957e-07, - "loss": 9.8047, + "loss": 9.4883, "step": 2574 }, { "epoch": 9.196428571428571, - "grad_norm": 59.31477355957031, + "grad_norm": 103.19001007080078, "learning_rate": 1.9541339027450256e-07, - "loss": 7.6016, + "loss": 7.9238, "step": 2575 }, { "epoch": 9.2, - "grad_norm": 45.505794525146484, + "grad_norm": 42.669063568115234, "learning_rate": 1.9369152030840553e-07, - "loss": 7.457, + "loss": 7.2148, "step": 2576 }, { "epoch": 9.20357142857143, - "grad_norm": 67.21479034423828, + "grad_norm": 117.47333526611328, "learning_rate": 1.9197712016423843e-07, - "loss": 10.6523, + "loss": 11.0352, "step": 2577 }, { "epoch": 9.207142857142857, - "grad_norm": 49.70498275756836, + "grad_norm": 79.52140045166016, "learning_rate": 1.9027019250647038e-07, - "loss": 8.9336, + "loss": 8.2715, "step": 2578 }, { "epoch": 9.210714285714285, - "grad_norm": 42.3458366394043, + "grad_norm": 44.69135284423828, "learning_rate": 1.8857073998795827e-07, - "loss": 8.9648, + "loss": 9.7656, "step": 2579 }, { "epoch": 9.214285714285714, - "grad_norm": 39.68901824951172, + "grad_norm": 29.51445960998535, "learning_rate": 1.8687876524993987e-07, - "loss": 7.8906, + "loss": 7.6328, "step": 2580 }, { "epoch": 9.217857142857143, - "grad_norm": 49.26222229003906, + "grad_norm": 411.6716613769531, "learning_rate": 1.851942709220328e-07, - "loss": 11.7148, + "loss": 12.1426, "step": 2581 }, { "epoch": 9.221428571428572, - "grad_norm": 46.08369445800781, + "grad_norm": 68.69324493408203, "learning_rate": 1.8351725962222733e-07, - "loss": 9.5938, + "loss": 8.8066, "step": 2582 }, { "epoch": 9.225, - "grad_norm": 60.363792419433594, + "grad_norm": 120.73213958740234, "learning_rate": 1.8184773395688527e-07, - "loss": 8.3242, + "loss": 8.0801, "step": 2583 }, { "epoch": 9.228571428571428, - "grad_norm": 41.97392272949219, + "grad_norm": 12.6747465133667, "learning_rate": 1.801856965207338e-07, - "loss": 7.8984, + "loss": 8.543, "step": 2584 }, { "epoch": 9.232142857142858, - "grad_norm": 45.65060806274414, + "grad_norm": 23.303619384765625, "learning_rate": 1.785311498968617e-07, - "loss": 9.6875, + "loss": 21.9043, "step": 2585 }, { "epoch": 9.235714285714286, - "grad_norm": 54.93303680419922, + "grad_norm": 407.95123291015625, "learning_rate": 1.7688409665671702e-07, - "loss": 9.2305, + "loss": 11.6367, "step": 2586 }, { "epoch": 9.239285714285714, - "grad_norm": 54.656620025634766, + "grad_norm": 256.31695556640625, "learning_rate": 1.75244539360101e-07, - "loss": 9.3203, + "loss": 10.4336, "step": 2587 }, { "epoch": 9.242857142857142, - "grad_norm": 51.809417724609375, + "grad_norm": 534.0101928710938, "learning_rate": 1.7361248055516366e-07, - "loss": 11.0391, + "loss": 17.9688, "step": 2588 }, { "epoch": 9.246428571428572, - "grad_norm": 44.058319091796875, + "grad_norm": 738.508056640625, "learning_rate": 1.7198792277840327e-07, - "loss": 10.4297, + "loss": 13.082, "step": 2589 }, { "epoch": 9.25, - "grad_norm": 60.2453727722168, + "grad_norm": 582.6513671875, "learning_rate": 1.7037086855465902e-07, - "loss": 11.4531, + "loss": 13.9395, "step": 2590 }, { "epoch": 9.253571428571428, - "grad_norm": 56.208839416503906, + "grad_norm": 13.430068969726562, "learning_rate": 1.687613203971089e-07, - "loss": 7.8047, + "loss": 7.6562, "step": 2591 }, { "epoch": 9.257142857142856, - "grad_norm": 46.5457649230957, + "grad_norm": 19.621938705444336, "learning_rate": 1.6715928080726417e-07, - "loss": 8.2578, + "loss": 9.6445, "step": 2592 }, { "epoch": 9.260714285714286, - "grad_norm": 49.52289581298828, + "grad_norm": 14.469520568847656, "learning_rate": 1.6556475227496816e-07, - "loss": 9.7422, + "loss": 10.6035, "step": 2593 }, { "epoch": 9.264285714285714, - "grad_norm": 51.55187225341797, + "grad_norm": 237.3565673828125, "learning_rate": 1.6397773727838906e-07, - "loss": 8.3516, + "loss": 20.748, "step": 2594 }, { "epoch": 9.267857142857142, - "grad_norm": 44.417972564697266, + "grad_norm": 20.379545211791992, "learning_rate": 1.6239823828401945e-07, - "loss": 8.8945, + "loss": 7.6309, "step": 2595 }, { "epoch": 9.271428571428572, - "grad_norm": 47.2684211730957, + "grad_norm": 119.93964385986328, "learning_rate": 1.6082625774666793e-07, - "loss": 9.8516, + "loss": 12.9102, "step": 2596 }, { "epoch": 9.275, - "grad_norm": 52.92212677001953, + "grad_norm": 56.96625900268555, "learning_rate": 1.5926179810946185e-07, - "loss": 8.3047, + "loss": 7.748, "step": 2597 }, { "epoch": 9.278571428571428, - "grad_norm": 43.995887756347656, + "grad_norm": 31.226198196411133, "learning_rate": 1.5770486180383627e-07, - "loss": 8.9141, + "loss": 8.3848, "step": 2598 }, { "epoch": 9.282142857142857, - "grad_norm": 46.74053192138672, + "grad_norm": 872.9312744140625, "learning_rate": 1.5615545124953668e-07, - "loss": 9.2305, + "loss": 16.7227, "step": 2599 }, { "epoch": 9.285714285714286, - "grad_norm": 49.92512893676758, + "grad_norm": 303.4358215332031, "learning_rate": 1.5461356885461077e-07, - "loss": 9.5, + "loss": 11.7344, "step": 2600 }, { "epoch": 9.289285714285715, - "grad_norm": 62.0781135559082, + "grad_norm": 106.43690490722656, "learning_rate": 1.530792170154055e-07, - "loss": 11.3828, + "loss": 13.207, "step": 2601 }, { "epoch": 9.292857142857143, - "grad_norm": 48.70119857788086, + "grad_norm": 64.59324645996094, "learning_rate": 1.5155239811656562e-07, - "loss": 13.2031, + "loss": 14.4434, "step": 2602 }, { "epoch": 9.29642857142857, - "grad_norm": 69.69535827636719, + "grad_norm": 78.14595794677734, "learning_rate": 1.5003311453102853e-07, - "loss": 16.1328, + "loss": 16.3398, "step": 2603 }, { "epoch": 9.3, - "grad_norm": 42.78739929199219, + "grad_norm": 291.5131530761719, "learning_rate": 1.4852136862001766e-07, - "loss": 8.0469, + "loss": 8.5781, "step": 2604 }, { "epoch": 9.303571428571429, - "grad_norm": 47.093631744384766, + "grad_norm": 43.01646041870117, "learning_rate": 1.4701716273304524e-07, - "loss": 9.5938, + "loss": 10.0195, "step": 2605 }, { "epoch": 9.307142857142857, - "grad_norm": 71.8746109008789, + "grad_norm": 42.90623474121094, "learning_rate": 1.455204992079029e-07, - "loss": 12.0156, + "loss": 13.9453, "step": 2606 }, { "epoch": 9.310714285714285, - "grad_norm": 65.96199798583984, + "grad_norm": 148.1778564453125, "learning_rate": 1.4403138037066056e-07, - "loss": 16.1562, + "loss": 19.3984, "step": 2607 }, { "epoch": 9.314285714285715, - "grad_norm": 45.223690032958984, + "grad_norm": 115.09904479980469, "learning_rate": 1.4254980853566248e-07, - "loss": 10.7383, + "loss": 12.4219, "step": 2608 }, { "epoch": 9.317857142857143, - "grad_norm": 50.430240631103516, + "grad_norm": 18.398578643798828, "learning_rate": 1.4107578600552396e-07, - "loss": 8.1055, + "loss": 10.0039, "step": 2609 }, { "epoch": 9.321428571428571, - "grad_norm": 58.25412368774414, + "grad_norm": 47.547183990478516, "learning_rate": 1.3960931507112752e-07, - "loss": 8.5391, + "loss": 9.3203, "step": 2610 }, { "epoch": 9.325, - "grad_norm": 52.56269073486328, + "grad_norm": 19.388530731201172, "learning_rate": 1.3815039801161723e-07, - "loss": 8.25, + "loss": 7.0156, "step": 2611 }, { "epoch": 9.32857142857143, - "grad_norm": 50.150760650634766, + "grad_norm": 103.86668395996094, "learning_rate": 1.3669903709439936e-07, - "loss": 8.6914, + "loss": 9.0625, "step": 2612 }, { "epoch": 9.332142857142857, - "grad_norm": 43.38456344604492, + "grad_norm": 40.38218307495117, "learning_rate": 1.3525523457513622e-07, - "loss": 8.8008, + "loss": 9.8359, "step": 2613 }, { "epoch": 9.335714285714285, - "grad_norm": 57.97886276245117, + "grad_norm": 87.08971405029297, "learning_rate": 1.338189926977429e-07, - "loss": 11.3242, + "loss": 12.2148, "step": 2614 }, { "epoch": 9.339285714285714, - "grad_norm": 45.65130615234375, + "grad_norm": 19.815153121948242, "learning_rate": 1.3239031369438327e-07, - "loss": 8.6562, + "loss": 9.8945, "step": 2615 }, { "epoch": 9.342857142857143, - "grad_norm": 48.37057876586914, + "grad_norm": 299.9122619628906, "learning_rate": 1.3096919978546842e-07, - "loss": 8.2227, + "loss": 19.793, "step": 2616 }, { "epoch": 9.346428571428572, - "grad_norm": 55.03896713256836, + "grad_norm": 979.6494140625, "learning_rate": 1.2955565317965101e-07, - "loss": 9.918, + "loss": 15.8457, "step": 2617 }, { "epoch": 9.35, - "grad_norm": 44.16893005371094, + "grad_norm": 21.795263290405273, "learning_rate": 1.2814967607382433e-07, - "loss": 10.0859, + "loss": 8.7578, "step": 2618 }, { "epoch": 9.353571428571428, - "grad_norm": 41.1052131652832, + "grad_norm": 49.82844924926758, "learning_rate": 1.2675127065311433e-07, - "loss": 7.5664, + "loss": 8.2578, "step": 2619 }, { "epoch": 9.357142857142858, - "grad_norm": 45.17938232421875, + "grad_norm": 35.53227996826172, "learning_rate": 1.253604390908819e-07, - "loss": 7.7734, + "loss": 8.7559, "step": 2620 }, { "epoch": 9.360714285714286, - "grad_norm": 45.48064041137695, + "grad_norm": 9.328468322753906, "learning_rate": 1.2397718354871692e-07, - "loss": 7.4648, + "loss": 8.4219, "step": 2621 }, { "epoch": 9.364285714285714, - "grad_norm": 57.67876434326172, + "grad_norm": 307.857421875, "learning_rate": 1.226015061764335e-07, - "loss": 9.4688, + "loss": 12.1875, "step": 2622 }, { "epoch": 9.367857142857144, - "grad_norm": 45.98436737060547, + "grad_norm": 7.677233695983887, "learning_rate": 1.2123340911206816e-07, - "loss": 9.457, + "loss": 10.7949, "step": 2623 }, { "epoch": 9.371428571428572, - "grad_norm": 55.42686080932617, + "grad_norm": 727.48974609375, "learning_rate": 1.1987289448187777e-07, - "loss": 10.9727, + "loss": 20.4375, "step": 2624 }, { "epoch": 9.375, - "grad_norm": 40.23261260986328, + "grad_norm": 176.03182983398438, "learning_rate": 1.185199644003332e-07, - "loss": 8.1445, + "loss": 18.9453, "step": 2625 }, { "epoch": 9.378571428571428, - "grad_norm": 47.140541076660156, + "grad_norm": 1026.282470703125, "learning_rate": 1.1717462097011856e-07, - "loss": 8.4102, + "loss": 15.668, "step": 2626 }, { "epoch": 9.382142857142856, - "grad_norm": 54.08537673950195, + "grad_norm": 6.1504011154174805, "learning_rate": 1.1583686628212576e-07, - "loss": 9.9023, + "loss": 9.4629, "step": 2627 }, { "epoch": 9.385714285714286, - "grad_norm": 64.20650482177734, + "grad_norm": 50.256629943847656, "learning_rate": 1.1450670241545392e-07, - "loss": 14.3516, + "loss": 17.8359, "step": 2628 }, { "epoch": 9.389285714285714, - "grad_norm": 71.7461166381836, + "grad_norm": 53.85953903198242, "learning_rate": 1.1318413143740436e-07, - "loss": 11.2969, + "loss": 12.6328, "step": 2629 }, { "epoch": 9.392857142857142, - "grad_norm": 54.68277359008789, + "grad_norm": 247.9139862060547, "learning_rate": 1.1186915540347732e-07, - "loss": 10.6016, + "loss": 12.6836, "step": 2630 }, { "epoch": 9.396428571428572, - "grad_norm": 54.686702728271484, + "grad_norm": 52.78617858886719, "learning_rate": 1.105617763573702e-07, - "loss": 10.0469, + "loss": 10.6621, "step": 2631 }, { "epoch": 9.4, - "grad_norm": 51.340660095214844, + "grad_norm": 177.4543914794922, "learning_rate": 1.0926199633097156e-07, - "loss": 9.4727, + "loss": 10.8711, "step": 2632 }, { "epoch": 9.403571428571428, - "grad_norm": 59.048954010009766, + "grad_norm": 51.078453063964844, "learning_rate": 1.0796981734436218e-07, - "loss": 9.1289, + "loss": 7.9219, "step": 2633 }, { "epoch": 9.407142857142857, - "grad_norm": 52.21156692504883, + "grad_norm": 376.3416442871094, "learning_rate": 1.0668524140580783e-07, - "loss": 9.8359, + "loss": 10.6289, "step": 2634 }, { "epoch": 9.410714285714286, - "grad_norm": 56.45337677001953, + "grad_norm": 228.23077392578125, "learning_rate": 1.0540827051175817e-07, - "loss": 10.9297, + "loss": 12.2031, "step": 2635 }, { "epoch": 9.414285714285715, - "grad_norm": 42.32210159301758, + "grad_norm": 12.981549263000488, "learning_rate": 1.041389066468429e-07, - "loss": 7.6797, + "loss": 6.3555, "step": 2636 }, { "epoch": 9.417857142857143, - "grad_norm": 49.21497344970703, + "grad_norm": 145.0294189453125, "learning_rate": 1.028771517838706e-07, - "loss": 9.9727, + "loss": 10.7734, "step": 2637 }, { "epoch": 9.42142857142857, - "grad_norm": 59.92080307006836, + "grad_norm": 19.851991653442383, "learning_rate": 1.0162300788382263e-07, - "loss": 11.2891, + "loss": 12.5039, "step": 2638 }, { "epoch": 9.425, - "grad_norm": 46.173397064208984, + "grad_norm": 197.9223175048828, "learning_rate": 1.0037647689585207e-07, - "loss": 10.0469, + "loss": 9.6328, "step": 2639 }, { "epoch": 9.428571428571429, - "grad_norm": 66.59366607666016, + "grad_norm": 26.130573272705078, "learning_rate": 9.913756075728088e-08, - "loss": 11.375, + "loss": 13.4102, "step": 2640 }, { "epoch": 9.432142857142857, - "grad_norm": 44.588260650634766, + "grad_norm": 8.797689437866211, "learning_rate": 9.79062613935955e-08, - "loss": 8.6914, + "loss": 8.1797, "step": 2641 }, { "epoch": 9.435714285714285, - "grad_norm": 55.457008361816406, + "grad_norm": 18.51912498474121, "learning_rate": 9.66825807184446e-08, - "loss": 11.5312, + "loss": 13.4688, "step": 2642 }, { "epoch": 9.439285714285715, - "grad_norm": 47.6248893737793, + "grad_norm": 131.27268981933594, "learning_rate": 9.546652063363748e-08, - "loss": 11.1016, + "loss": 12.4629, "step": 2643 }, { "epoch": 9.442857142857143, - "grad_norm": 48.543174743652344, + "grad_norm": 6.392375946044922, "learning_rate": 9.42580830291373e-08, - "loss": 8.4453, + "loss": 7.6641, "step": 2644 }, { "epoch": 9.446428571428571, - "grad_norm": 62.327301025390625, + "grad_norm": 120.46961975097656, "learning_rate": 9.305726978306173e-08, - "loss": 9.8008, + "loss": 9.9102, "step": 2645 }, { "epoch": 9.45, - "grad_norm": 54.224609375, + "grad_norm": 129.64651489257812, "learning_rate": 9.186408276168012e-08, - "loss": 10.7227, + "loss": 9.9492, "step": 2646 }, { "epoch": 9.45357142857143, - "grad_norm": 48.37324142456055, + "grad_norm": 25.1585750579834, "learning_rate": 9.0678523819408e-08, - "loss": 8.8438, + "loss": 7.9277, "step": 2647 }, { "epoch": 9.457142857142857, - "grad_norm": 46.61825180053711, + "grad_norm": 30.826345443725586, "learning_rate": 8.950059479880591e-08, - "loss": 8.3242, + "loss": 8.7812, "step": 2648 }, { "epoch": 9.460714285714285, - "grad_norm": 48.924442291259766, + "grad_norm": 531.9413452148438, "learning_rate": 8.833029753057554e-08, - "loss": 8.3008, + "loss": 11.8887, "step": 2649 }, { "epoch": 9.464285714285714, - "grad_norm": 48.71123123168945, + "grad_norm": 13.570749282836914, "learning_rate": 8.716763383355863e-08, - "loss": 8.5938, + "loss": 9.8301, "step": 2650 }, { "epoch": 9.467857142857143, - "grad_norm": 49.053810119628906, + "grad_norm": 14.452192306518555, "learning_rate": 8.601260551473312e-08, - "loss": 9.7305, + "loss": 10.8984, "step": 2651 }, { "epoch": 9.471428571428572, - "grad_norm": 44.017818450927734, + "grad_norm": 68.92324829101562, "learning_rate": 8.486521436920914e-08, - "loss": 8.8516, + "loss": 9.8281, "step": 2652 }, { "epoch": 9.475, - "grad_norm": 58.97662353515625, + "grad_norm": 172.47352600097656, "learning_rate": 8.372546218022747e-08, - "loss": 10.5938, + "loss": 12.293, "step": 2653 }, { "epoch": 9.478571428571428, - "grad_norm": 43.7421875, + "grad_norm": 470.9335632324219, "learning_rate": 8.25933507191573e-08, - "loss": 9.0703, + "loss": 24.2617, "step": 2654 }, { "epoch": 9.482142857142858, - "grad_norm": 56.42229080200195, + "grad_norm": 63.77068328857422, "learning_rate": 8.14688817454934e-08, - "loss": 11.5508, + "loss": 14.8379, "step": 2655 }, { "epoch": 9.485714285714286, - "grad_norm": 45.261314392089844, + "grad_norm": 576.7171630859375, "learning_rate": 8.035205700685167e-08, - "loss": 9.4297, + "loss": 12.6172, "step": 2656 }, { "epoch": 9.489285714285714, - "grad_norm": 52.597633361816406, + "grad_norm": 445.4435119628906, "learning_rate": 7.924287823896815e-08, - "loss": 9.0625, + "loss": 10.1172, "step": 2657 }, { "epoch": 9.492857142857144, - "grad_norm": 64.90435028076172, + "grad_norm": 1033.304443359375, "learning_rate": 7.8141347165695e-08, - "loss": 10.4883, + "loss": 23.5234, "step": 2658 }, { "epoch": 9.496428571428572, - "grad_norm": 44.80603790283203, + "grad_norm": 172.67877197265625, "learning_rate": 7.704746549899944e-08, - "loss": 9.3125, + "loss": 10.5, "step": 2659 }, { "epoch": 9.5, - "grad_norm": 46.91279983520508, + "grad_norm": 648.22119140625, "learning_rate": 7.59612349389599e-08, - "loss": 10.0898, + "loss": 18.4219, "step": 2660 }, { "epoch": 9.503571428571428, - "grad_norm": 57.484188079833984, + "grad_norm": 33.81277847290039, "learning_rate": 7.488265717376375e-08, - "loss": 11.1211, + "loss": 23.8438, "step": 2661 }, { "epoch": 9.507142857142856, - "grad_norm": 38.80112838745117, + "grad_norm": 23.624717712402344, "learning_rate": 7.381173387970397e-08, - "loss": 6.9766, + "loss": 5.9746, "step": 2662 }, { "epoch": 9.510714285714286, - "grad_norm": 54.68086624145508, + "grad_norm": 64.74032592773438, "learning_rate": 7.274846672117863e-08, - "loss": 6.7812, + "loss": 5.9785, "step": 2663 }, { "epoch": 9.514285714285714, - "grad_norm": 65.97528839111328, + "grad_norm": 63.603763580322266, "learning_rate": 7.169285735068531e-08, - "loss": 9.7734, + "loss": 10.3379, "step": 2664 }, { "epoch": 9.517857142857142, - "grad_norm": 58.50335693359375, + "grad_norm": 291.0750732421875, "learning_rate": 7.064490740882057e-08, - "loss": 12.5234, + "loss": 14.4648, "step": 2665 }, { "epoch": 9.521428571428572, - "grad_norm": 56.798152923583984, + "grad_norm": 13.829938888549805, "learning_rate": 6.960461852427824e-08, - "loss": 11.3281, + "loss": 12.7754, "step": 2666 }, { "epoch": 9.525, - "grad_norm": 66.69861602783203, + "grad_norm": 447.98687744140625, "learning_rate": 6.857199231384282e-08, - "loss": 8.1641, + "loss": 9.6562, "step": 2667 }, { "epoch": 9.528571428571428, - "grad_norm": 64.5541000366211, + "grad_norm": 21.332080841064453, "learning_rate": 6.75470303823933e-08, - "loss": 9.9883, + "loss": 11.8613, "step": 2668 }, { "epoch": 9.532142857142857, - "grad_norm": 47.318031311035156, + "grad_norm": 15.24606704711914, "learning_rate": 6.652973432289322e-08, - "loss": 8.918, + "loss": 10.2109, "step": 2669 }, { "epoch": 9.535714285714286, - "grad_norm": 47.27144241333008, + "grad_norm": 25.225955963134766, "learning_rate": 6.552010571639456e-08, - "loss": 10.6133, + "loss": 11.168, "step": 2670 }, { "epoch": 9.539285714285715, - "grad_norm": 46.30425262451172, + "grad_norm": 889.0580444335938, "learning_rate": 6.451814613203212e-08, - "loss": 11.2305, + "loss": 16.1172, "step": 2671 }, { "epoch": 9.542857142857143, - "grad_norm": 47.45296859741211, + "grad_norm": 83.2572021484375, "learning_rate": 6.352385712702191e-08, - "loss": 9.5703, + "loss": 10.3125, "step": 2672 }, { "epoch": 9.54642857142857, - "grad_norm": 41.24318313598633, + "grad_norm": 43.52366256713867, "learning_rate": 6.253724024665786e-08, - "loss": 8.0273, + "loss": 9.0898, "step": 2673 }, { "epoch": 9.55, - "grad_norm": 69.98167419433594, + "grad_norm": 914.7973022460938, "learning_rate": 6.15582970243117e-08, - "loss": 12.1562, + "loss": 20.1348, "step": 2674 }, { "epoch": 9.553571428571429, - "grad_norm": 45.183258056640625, + "grad_norm": 92.93348693847656, "learning_rate": 6.058702898142643e-08, - "loss": 7.8711, + "loss": 7.9141, "step": 2675 }, { "epoch": 9.557142857142857, - "grad_norm": 48.14056396484375, + "grad_norm": 151.25503540039062, "learning_rate": 5.96234376275201e-08, - "loss": 8.8516, + "loss": 9.3184, "step": 2676 }, { "epoch": 9.560714285714285, - "grad_norm": 41.75016403198242, + "grad_norm": 93.13277435302734, "learning_rate": 5.866752446017532e-08, - "loss": 8.4062, + "loss": 8.5996, "step": 2677 }, { "epoch": 9.564285714285715, - "grad_norm": 43.00480270385742, + "grad_norm": 6.104280948638916, "learning_rate": 5.7719290965045914e-08, - "loss": 8.7539, + "loss": 8.3887, "step": 2678 }, { "epoch": 9.567857142857143, - "grad_norm": 47.847660064697266, + "grad_norm": 719.3890380859375, "learning_rate": 5.677873861584693e-08, - "loss": 10.3086, + "loss": 12.1484, "step": 2679 }, { "epoch": 9.571428571428571, - "grad_norm": 47.77372741699219, + "grad_norm": 28.060171127319336, "learning_rate": 5.584586887435739e-08, - "loss": 8.3125, + "loss": 8.7871, "step": 2680 }, { "epoch": 9.575, - "grad_norm": 40.46303176879883, + "grad_norm": 78.66769409179688, "learning_rate": 5.492068319041588e-08, - "loss": 8.6875, + "loss": 8.9355, "step": 2681 }, { "epoch": 9.57857142857143, - "grad_norm": 58.699073791503906, + "grad_norm": 966.652099609375, "learning_rate": 5.400318300191831e-08, - "loss": 13.1836, + "loss": 18.873, "step": 2682 }, { "epoch": 9.582142857142857, - "grad_norm": 41.49596405029297, + "grad_norm": 389.268310546875, "learning_rate": 5.3093369734816824e-08, - "loss": 8.2188, + "loss": 7.7461, "step": 2683 }, { "epoch": 9.585714285714285, - "grad_norm": 43.31467819213867, + "grad_norm": 689.6474609375, "learning_rate": 5.219124480311533e-08, - "loss": 8.6016, + "loss": 10.0, "step": 2684 }, { "epoch": 9.589285714285714, - "grad_norm": 42.46400451660156, + "grad_norm": 374.2611389160156, "learning_rate": 5.129680960887007e-08, - "loss": 9.1523, + "loss": 9.9766, "step": 2685 }, { "epoch": 9.592857142857143, - "grad_norm": 44.5244140625, + "grad_norm": 63.27288818359375, "learning_rate": 5.041006554218519e-08, - "loss": 11.3867, + "loss": 11.4648, "step": 2686 }, { "epoch": 9.596428571428572, - "grad_norm": 47.11686706542969, + "grad_norm": 15.13329029083252, "learning_rate": 4.9531013981212736e-08, - "loss": 10.2891, + "loss": 11.1523, "step": 2687 }, { "epoch": 9.6, - "grad_norm": 60.13150405883789, + "grad_norm": 53.27433776855469, "learning_rate": 4.865965629214819e-08, - "loss": 7.4336, + "loss": 18.1582, "step": 2688 }, { "epoch": 9.603571428571428, - "grad_norm": 43.29692840576172, + "grad_norm": 23.770097732543945, "learning_rate": 4.779599382922995e-08, - "loss": 7.2109, + "loss": 6.4834, "step": 2689 }, { "epoch": 9.607142857142858, - "grad_norm": 49.5322380065918, + "grad_norm": 168.4155731201172, "learning_rate": 4.694002793473596e-08, - "loss": 9.9922, + "loss": 11.0957, "step": 2690 }, { "epoch": 9.610714285714286, - "grad_norm": 43.944698333740234, + "grad_norm": 703.230224609375, "learning_rate": 4.6091759938984296e-08, - "loss": 10.6133, + "loss": 13.7188, "step": 2691 }, { "epoch": 9.614285714285714, - "grad_norm": 46.738006591796875, + "grad_norm": 34.7758903503418, "learning_rate": 4.52511911603265e-08, - "loss": 8.1797, + "loss": 9.4023, "step": 2692 }, { "epoch": 9.617857142857144, - "grad_norm": 52.92332458496094, + "grad_norm": 153.79379272460938, "learning_rate": 4.44183229051498e-08, - "loss": 9.5195, + "loss": 9.543, "step": 2693 }, { "epoch": 9.621428571428572, - "grad_norm": 46.51583480834961, + "grad_norm": 526.1259765625, "learning_rate": 4.3593156467873765e-08, - "loss": 9.75, + "loss": 13.2891, "step": 2694 }, { "epoch": 9.625, - "grad_norm": 43.23051452636719, + "grad_norm": 1047.293701171875, "learning_rate": 4.2775693130948094e-08, - "loss": 8.0469, + "loss": 19.6855, "step": 2695 }, { "epoch": 9.628571428571428, - "grad_norm": 49.2890625, + "grad_norm": 683.8466186523438, "learning_rate": 4.196593416484873e-08, - "loss": 10.5781, + "loss": 13.668, "step": 2696 }, { "epoch": 9.632142857142856, - "grad_norm": 46.83113479614258, + "grad_norm": 108.80736541748047, "learning_rate": 4.1163880828080094e-08, - "loss": 9.3711, + "loss": 11.4688, "step": 2697 }, { "epoch": 9.635714285714286, - "grad_norm": 52.18455505371094, + "grad_norm": 43.39937210083008, "learning_rate": 4.036953436716895e-08, - "loss": 10.6016, + "loss": 13.2969, "step": 2698 }, { "epoch": 9.639285714285714, - "grad_norm": 57.80366134643555, + "grad_norm": 368.0429382324219, "learning_rate": 3.9582896016665536e-08, - "loss": 7.7109, + "loss": 7.3828, "step": 2699 }, { "epoch": 9.642857142857142, - "grad_norm": 45.886043548583984, + "grad_norm": 127.71127319335938, "learning_rate": 3.8803966999139686e-08, - "loss": 10.0234, + "loss": 10.4844, "step": 2700 }, { "epoch": 9.646428571428572, - "grad_norm": 48.43354415893555, + "grad_norm": 11.83320426940918, "learning_rate": 3.8032748525179684e-08, - "loss": 9.2461, + "loss": 9.752, "step": 2701 }, { "epoch": 9.65, - "grad_norm": 62.92259979248047, + "grad_norm": 466.3165588378906, "learning_rate": 3.726924179339009e-08, - "loss": 8.8438, + "loss": 11.918, "step": 2702 }, { "epoch": 9.653571428571428, - "grad_norm": 50.8563346862793, + "grad_norm": 137.58657836914062, "learning_rate": 3.6513447990390585e-08, - "loss": 8.168, + "loss": 10.8242, "step": 2703 }, { "epoch": 9.657142857142857, - "grad_norm": 71.1656265258789, + "grad_norm": 30.145854949951172, "learning_rate": 3.576536829081323e-08, - "loss": 8.7852, + "loss": 7.4766, "step": 2704 }, { "epoch": 9.660714285714286, - "grad_norm": 42.460025787353516, + "grad_norm": 28.093984603881836, "learning_rate": 3.50250038573019e-08, - "loss": 9.0234, + "loss": 8.418, "step": 2705 }, { "epoch": 9.664285714285715, - "grad_norm": 66.4333267211914, + "grad_norm": 14.471092224121094, "learning_rate": 3.429235584050894e-08, - "loss": 11.5664, + "loss": 13.3477, "step": 2706 }, { "epoch": 9.667857142857143, - "grad_norm": 43.62616729736328, + "grad_norm": 29.60715675354004, "learning_rate": 3.3567425379094074e-08, - "loss": 9.5586, + "loss": 11.1133, "step": 2707 }, { "epoch": 9.67142857142857, - "grad_norm": 47.59396743774414, + "grad_norm": 44.96870422363281, "learning_rate": 3.285021359972218e-08, - "loss": 10.7266, + "loss": 12.6133, "step": 2708 }, { "epoch": 9.675, - "grad_norm": 45.34231185913086, + "grad_norm": 37.37948989868164, "learning_rate": 3.214072161706272e-08, - "loss": 7.2852, + "loss": 7.6641, "step": 2709 }, { "epoch": 9.678571428571429, - "grad_norm": 47.26887512207031, + "grad_norm": 131.27911376953125, "learning_rate": 3.143895053378698e-08, - "loss": 9.3398, + "loss": 11.0312, "step": 2710 }, { "epoch": 9.682142857142857, - "grad_norm": 77.75263214111328, + "grad_norm": 41.74882888793945, "learning_rate": 3.074490144056752e-08, - "loss": 15.5195, + "loss": 20.7422, "step": 2711 }, { "epoch": 9.685714285714285, - "grad_norm": 52.08372116088867, + "grad_norm": 39.21331787109375, "learning_rate": 3.005857541607371e-08, - "loss": 12.1953, + "loss": 13.4629, "step": 2712 }, { "epoch": 9.689285714285715, - "grad_norm": 44.56758117675781, + "grad_norm": 256.6762390136719, "learning_rate": 2.937997352697397e-08, - "loss": 7.4727, + "loss": 9.543, "step": 2713 }, { "epoch": 9.692857142857143, - "grad_norm": 50.627052307128906, + "grad_norm": 183.64016723632812, "learning_rate": 2.8709096827930773e-08, - "loss": 9.2109, + "loss": 9.4551, "step": 2714 }, { "epoch": 9.696428571428571, - "grad_norm": 45.916114807128906, + "grad_norm": 585.6771240234375, "learning_rate": 2.8045946361601185e-08, - "loss": 10.1055, + "loss": 13.416, "step": 2715 }, { "epoch": 9.7, - "grad_norm": 53.467140197753906, + "grad_norm": 34.66190719604492, "learning_rate": 2.7390523158633552e-08, - "loss": 8.8516, + "loss": 9.2207, "step": 2716 }, { "epoch": 9.70357142857143, - "grad_norm": 78.76753234863281, + "grad_norm": 384.3739318847656, "learning_rate": 2.674282823766694e-08, - "loss": 10.082, + "loss": 17.3828, "step": 2717 }, { "epoch": 9.707142857142857, - "grad_norm": 42.52434539794922, + "grad_norm": 492.29119873046875, "learning_rate": 2.6102862605330016e-08, - "loss": 9.9492, + "loss": 11.6992, "step": 2718 }, { "epoch": 9.710714285714285, - "grad_norm": 68.2753677368164, + "grad_norm": 83.62376403808594, "learning_rate": 2.547062725623828e-08, - "loss": 12.0977, + "loss": 13.5469, "step": 2719 }, { "epoch": 9.714285714285714, - "grad_norm": 55.541847229003906, + "grad_norm": 18.454347610473633, "learning_rate": 2.4846123172992953e-08, - "loss": 7.6445, + "loss": 5.0996, "step": 2720 }, { "epoch": 9.717857142857143, - "grad_norm": 54.283836364746094, + "grad_norm": 77.77191162109375, "learning_rate": 2.4229351326179872e-08, - "loss": 8.8555, + "loss": 7.7188, "step": 2721 }, { "epoch": 9.721428571428572, - "grad_norm": 48.64120101928711, + "grad_norm": 764.4256591796875, "learning_rate": 2.3620312674367818e-08, - "loss": 10.7539, + "loss": 16.2012, "step": 2722 }, { "epoch": 9.725, - "grad_norm": 42.51774978637695, + "grad_norm": 29.506166458129883, "learning_rate": 2.301900816410574e-08, - "loss": 9.793, + "loss": 10.3477, "step": 2723 }, { "epoch": 9.728571428571428, - "grad_norm": 49.30501937866211, + "grad_norm": 69.50286102294922, "learning_rate": 2.242543872992442e-08, - "loss": 12.082, + "loss": 13.3633, "step": 2724 }, { "epoch": 9.732142857142858, - "grad_norm": 46.5907096862793, + "grad_norm": 708.2453002929688, "learning_rate": 2.1839605294330935e-08, - "loss": 8.4297, + "loss": 13.5488, "step": 2725 }, { "epoch": 9.735714285714286, - "grad_norm": 49.04130554199219, + "grad_norm": 21.822134017944336, "learning_rate": 2.1261508767810856e-08, - "loss": 8.3164, + "loss": 9.0566, "step": 2726 }, { "epoch": 9.739285714285714, - "grad_norm": 65.96072387695312, + "grad_norm": 702.4413452148438, "learning_rate": 2.0691150048823827e-08, - "loss": 9.4688, + "loss": 18.5, "step": 2727 }, { "epoch": 9.742857142857144, - "grad_norm": 45.70920181274414, + "grad_norm": 730.063232421875, "learning_rate": 2.012853002380466e-08, - "loss": 8.957, + "loss": 11.0312, "step": 2728 }, { "epoch": 9.746428571428572, - "grad_norm": 46.55236053466797, + "grad_norm": 904.7362670898438, "learning_rate": 1.957364956716168e-08, - "loss": 8.9258, + "loss": 11.7344, "step": 2729 }, { "epoch": 9.75, - "grad_norm": 45.06005096435547, + "grad_norm": 998.8553466796875, "learning_rate": 1.9026509541272276e-08, - "loss": 10.4336, + "loss": 14.3867, "step": 2730 }, { "epoch": 9.753571428571428, - "grad_norm": 51.41786193847656, + "grad_norm": 311.33599853515625, "learning_rate": 1.848711079648624e-08, - "loss": 8.8711, + "loss": 33.2637, "step": 2731 }, { "epoch": 9.757142857142856, - "grad_norm": 49.3633918762207, + "grad_norm": 98.9394760131836, "learning_rate": 1.7955454171120766e-08, - "loss": 10.1719, + "loss": 10.6211, "step": 2732 }, { "epoch": 9.760714285714286, - "grad_norm": 51.72982406616211, + "grad_norm": 5.450427532196045, "learning_rate": 1.7431540491459897e-08, - "loss": 11.8008, + "loss": 12.2129, "step": 2733 }, { "epoch": 9.764285714285714, - "grad_norm": 53.53840637207031, + "grad_norm": 15.301919937133789, "learning_rate": 1.6915370571756185e-08, - "loss": 8.9883, + "loss": 8.3145, "step": 2734 }, { "epoch": 9.767857142857142, - "grad_norm": 37.313270568847656, + "grad_norm": 192.17227172851562, "learning_rate": 1.640694521422459e-08, - "loss": 7.4297, + "loss": 7.5039, "step": 2735 }, { "epoch": 9.771428571428572, - "grad_norm": 62.532405853271484, + "grad_norm": 42.99203109741211, "learning_rate": 1.590626520904526e-08, - "loss": 11.7891, + "loss": 11.8203, "step": 2736 }, { "epoch": 9.775, - "grad_norm": 50.08003616333008, + "grad_norm": 82.31640625, "learning_rate": 1.541333133436018e-08, - "loss": 10.1133, + "loss": 8.2451, "step": 2737 }, { "epoch": 9.778571428571428, - "grad_norm": 55.42369842529297, + "grad_norm": 41.21693801879883, "learning_rate": 1.4928144356272102e-08, - "loss": 10.4961, + "loss": 12.5176, "step": 2738 }, { "epoch": 9.782142857142857, - "grad_norm": 43.01629638671875, + "grad_norm": 6.813210487365723, "learning_rate": 1.4450705028844491e-08, - "loss": 10.1367, + "loss": 9.2246, "step": 2739 }, { "epoch": 9.785714285714286, - "grad_norm": 50.327056884765625, + "grad_norm": 79.45853424072266, "learning_rate": 1.3981014094099354e-08, - "loss": 9.5508, + "loss": 10.4033, "step": 2740 }, { "epoch": 9.789285714285715, - "grad_norm": 46.070465087890625, + "grad_norm": 825.2599487304688, "learning_rate": 1.3519072282016653e-08, - "loss": 9.25, + "loss": 13.8359, "step": 2741 }, { "epoch": 9.792857142857143, - "grad_norm": 53.54471969604492, + "grad_norm": 84.781494140625, "learning_rate": 1.3064880310531548e-08, - "loss": 9.4375, + "loss": 10.0977, "step": 2742 }, { "epoch": 9.79642857142857, - "grad_norm": 52.997833251953125, + "grad_norm": 460.5569763183594, "learning_rate": 1.2618438885537154e-08, - "loss": 6.7539, + "loss": 7.5586, "step": 2743 }, { "epoch": 9.8, - "grad_norm": 54.83833312988281, + "grad_norm": 14.744489669799805, "learning_rate": 1.2179748700879013e-08, - "loss": 9.5703, + "loss": 9.6797, "step": 2744 }, { "epoch": 9.803571428571429, - "grad_norm": 82.47637939453125, + "grad_norm": 151.6591339111328, "learning_rate": 1.174881043835563e-08, - "loss": 12.1016, + "loss": 15.4766, "step": 2745 }, { "epoch": 9.807142857142857, - "grad_norm": 70.2025375366211, + "grad_norm": 34.987083435058594, "learning_rate": 1.132562476771959e-08, - "loss": 9.8398, + "loss": 12.8789, "step": 2746 }, { "epoch": 9.810714285714285, - "grad_norm": 50.70209503173828, + "grad_norm": 206.6593780517578, "learning_rate": 1.0910192346672566e-08, - "loss": 9.6094, + "loss": 10.9102, "step": 2747 }, { "epoch": 9.814285714285715, - "grad_norm": 47.2009162902832, + "grad_norm": 22.132347106933594, "learning_rate": 1.0502513820868088e-08, - "loss": 8.582, + "loss": 9.5566, "step": 2748 }, { "epoch": 9.817857142857143, - "grad_norm": 45.879512786865234, + "grad_norm": 142.94093322753906, "learning_rate": 1.010258982390766e-08, - "loss": 9.5391, + "loss": 11.082, "step": 2749 }, { "epoch": 9.821428571428571, - "grad_norm": 49.87171173095703, + "grad_norm": 492.0877990722656, "learning_rate": 9.710420977340763e-09, - "loss": 11.0625, + "loss": 13.624, "step": 2750 }, { "epoch": 9.825, - "grad_norm": 60.61910629272461, + "grad_norm": 919.2109985351562, "learning_rate": 9.3260078906654e-09, - "loss": 12.8164, + "loss": 17.5059, "step": 2751 }, { "epoch": 9.82857142857143, - "grad_norm": 53.086788177490234, + "grad_norm": 41.226173400878906, "learning_rate": 8.949351161324227e-09, - "loss": 9.4023, + "loss": 9.0605, "step": 2752 }, { "epoch": 9.832142857142857, - "grad_norm": 69.24258422851562, + "grad_norm": 90.96417999267578, "learning_rate": 8.580451374706755e-09, - "loss": 12.9727, + "loss": 16.9961, "step": 2753 }, { "epoch": 9.835714285714285, - "grad_norm": 47.484046936035156, + "grad_norm": 29.792098999023438, "learning_rate": 8.219309104145478e-09, - "loss": 9.5391, + "loss": 10.8906, "step": 2754 }, { "epoch": 9.839285714285714, - "grad_norm": 48.15837478637695, + "grad_norm": 598.4071655273438, "learning_rate": 7.865924910916977e-09, - "loss": 8.5469, + "loss": 11.5664, "step": 2755 }, { "epoch": 9.842857142857143, - "grad_norm": 45.919464111328125, + "grad_norm": 45.272438049316406, "learning_rate": 7.520299344241366e-09, - "loss": 7.8359, + "loss": 9.9512, "step": 2756 }, { "epoch": 9.846428571428572, - "grad_norm": 39.049957275390625, + "grad_norm": 82.02886199951172, "learning_rate": 7.182432941278405e-09, - "loss": 8.6602, + "loss": 10.7969, "step": 2757 }, { "epoch": 9.85, - "grad_norm": 49.58108139038086, + "grad_norm": 205.98793029785156, "learning_rate": 6.852326227130835e-09, - "loss": 8.1953, + "loss": 9.8066, "step": 2758 }, { "epoch": 9.853571428571428, - "grad_norm": 53.30030059814453, + "grad_norm": 795.8370361328125, "learning_rate": 6.529979714839929e-09, - "loss": 7.9023, + "loss": 13.8164, "step": 2759 }, { "epoch": 9.857142857142858, - "grad_norm": 45.68861770629883, + "grad_norm": 96.83676147460938, "learning_rate": 6.215393905388278e-09, - "loss": 8.6367, + "loss": 9.9414, "step": 2760 }, { "epoch": 9.860714285714286, - "grad_norm": 51.08280563354492, + "grad_norm": 573.808837890625, "learning_rate": 5.908569287694788e-09, - "loss": 10.4297, + "loss": 13.3594, "step": 2761 }, { "epoch": 9.864285714285714, - "grad_norm": 44.68046951293945, + "grad_norm": 229.8131866455078, "learning_rate": 5.609506338617454e-09, - "loss": 10.4336, + "loss": 11.5859, "step": 2762 }, { "epoch": 9.867857142857144, - "grad_norm": 45.83384704589844, + "grad_norm": 23.866348266601562, "learning_rate": 5.318205522951148e-09, - "loss": 8.9648, + "loss": 8.8984, "step": 2763 }, { "epoch": 9.871428571428572, - "grad_norm": 48.99489212036133, + "grad_norm": 38.91511154174805, "learning_rate": 5.034667293427053e-09, - "loss": 11.1016, + "loss": 12.4531, "step": 2764 }, { "epoch": 9.875, - "grad_norm": 48.72683334350586, + "grad_norm": 42.043243408203125, "learning_rate": 4.758892090711009e-09, - "loss": 9.2227, + "loss": 7.8008, "step": 2765 }, { "epoch": 9.878571428571428, - "grad_norm": 69.18177795410156, + "grad_norm": 153.63693237304688, "learning_rate": 4.490880343405724e-09, - "loss": 8.543, + "loss": 8.5723, "step": 2766 }, { "epoch": 9.882142857142856, - "grad_norm": 49.234066009521484, + "grad_norm": 104.24876403808594, "learning_rate": 4.230632468046892e-09, - "loss": 10.0625, + "loss": 10.8008, "step": 2767 }, { "epoch": 9.885714285714286, - "grad_norm": 43.813560485839844, + "grad_norm": 18.909093856811523, "learning_rate": 3.978148869103748e-09, - "loss": 10.9141, + "loss": 11.293, "step": 2768 }, { "epoch": 9.889285714285714, - "grad_norm": 39.27349090576172, + "grad_norm": 40.45103073120117, "learning_rate": 3.7334299389790715e-09, - "loss": 7.3906, + "loss": 6.4141, "step": 2769 }, { "epoch": 9.892857142857142, - "grad_norm": 45.50040817260742, + "grad_norm": 50.96184158325195, "learning_rate": 3.496476058006959e-09, - "loss": 7.8047, + "loss": 6.7988, "step": 2770 }, { "epoch": 9.896428571428572, - "grad_norm": 53.36546325683594, + "grad_norm": 31.988611221313477, "learning_rate": 3.267287594455604e-09, - "loss": 9.5, + "loss": 9.9453, "step": 2771 }, { "epoch": 9.9, - "grad_norm": 51.97216796875, + "grad_norm": 104.5152816772461, "learning_rate": 3.0458649045211897e-09, - "loss": 9.3047, + "loss": 8.9922, "step": 2772 }, { "epoch": 9.903571428571428, - "grad_norm": 48.812896728515625, + "grad_norm": 1051.7105712890625, "learning_rate": 2.8322083323334417e-09, - "loss": 8.3047, + "loss": 15.4336, "step": 2773 }, { "epoch": 9.907142857142857, - "grad_norm": 46.57819747924805, + "grad_norm": 880.1781616210938, "learning_rate": 2.626318209951184e-09, - "loss": 8.7031, + "loss": 12.8867, "step": 2774 }, { "epoch": 9.910714285714286, - "grad_norm": 47.38971710205078, + "grad_norm": 53.895416259765625, "learning_rate": 2.4281948573617875e-09, - "loss": 9.7891, + "loss": 10.5742, "step": 2775 }, { "epoch": 9.914285714285715, - "grad_norm": 51.00689697265625, + "grad_norm": 479.3047180175781, "learning_rate": 2.237838582483387e-09, - "loss": 10.3516, + "loss": 11.8594, "step": 2776 }, { "epoch": 9.917857142857143, - "grad_norm": 44.66899490356445, + "grad_norm": 80.28567504882812, "learning_rate": 2.055249681161553e-09, - "loss": 9.2422, + "loss": 9.2383, "step": 2777 }, { "epoch": 9.92142857142857, - "grad_norm": 41.58612060546875, + "grad_norm": 873.905029296875, "learning_rate": 1.880428437170956e-09, - "loss": 8.3047, + "loss": 15.8164, "step": 2778 }, { "epoch": 9.925, - "grad_norm": 49.87557601928711, + "grad_norm": 1071.1861572265625, "learning_rate": 1.7133751222137007e-09, - "loss": 9.293, + "loss": 19.5039, "step": 2779 }, { "epoch": 9.928571428571429, - "grad_norm": 53.11792755126953, + "grad_norm": 906.8634033203125, "learning_rate": 1.5540899959187727e-09, - "loss": 9.8555, + "loss": 16.957, "step": 2780 }, { "epoch": 9.932142857142857, - "grad_norm": 57.1493034362793, + "grad_norm": 1230.556396484375, "learning_rate": 1.4025733058420366e-09, - "loss": 12.5312, + "loss": 19.5508, "step": 2781 }, { "epoch": 9.935714285714285, - "grad_norm": 54.42870330810547, + "grad_norm": 1067.6336669921875, "learning_rate": 1.2588252874673469e-09, - "loss": 9.5156, + "loss": 10.791, "step": 2782 }, { "epoch": 9.939285714285715, - "grad_norm": 50.301666259765625, + "grad_norm": 107.23429107666016, "learning_rate": 1.122846164202107e-09, - "loss": 9.3867, + "loss": 12.8789, "step": 2783 }, { "epoch": 9.942857142857143, - "grad_norm": 57.258506774902344, + "grad_norm": 74.28633880615234, "learning_rate": 9.946361473822664e-10, - "loss": 8.7461, + "loss": 8.9023, "step": 2784 }, { "epoch": 9.946428571428571, - "grad_norm": 47.07687759399414, + "grad_norm": 41.50947189331055, "learning_rate": 8.741954362678773e-10, - "loss": 9.6758, + "loss": 9.4434, "step": 2785 }, { "epoch": 9.95, - "grad_norm": 46.212806701660156, + "grad_norm": 114.92245483398438, "learning_rate": 7.615242180436521e-10, - "loss": 7.7227, + "loss": 6.5195, "step": 2786 }, { "epoch": 9.95357142857143, - "grad_norm": 53.371849060058594, + "grad_norm": 769.7374267578125, "learning_rate": 6.566226678206278e-10, - "loss": 9.9375, + "loss": 12.4316, "step": 2787 }, { "epoch": 9.957142857142857, - "grad_norm": 53.20670700073242, + "grad_norm": 73.86483001708984, "learning_rate": 5.594909486328348e-10, - "loss": 9.9141, + "loss": 10.3613, "step": 2788 }, { "epoch": 9.960714285714285, - "grad_norm": 50.10818099975586, + "grad_norm": 120.37371826171875, "learning_rate": 4.701292114400735e-10, - "loss": 9.8008, + "loss": 11.0703, "step": 2789 }, { "epoch": 9.964285714285714, - "grad_norm": 49.338558197021484, + "grad_norm": 16.35982322692871, "learning_rate": 3.885375951256931e-10, - "loss": 9.6523, + "loss": 8.1211, "step": 2790 }, { "epoch": 9.967857142857143, - "grad_norm": 40.599735260009766, + "grad_norm": 7.273563861846924, "learning_rate": 3.147162264971471e-10, - "loss": 9.2344, + "loss": 9.9492, "step": 2791 }, { "epoch": 9.971428571428572, - "grad_norm": 51.326839447021484, + "grad_norm": 790.1627197265625, "learning_rate": 2.486652202848827e-10, - "loss": 9.1211, + "loss": 12.7715, "step": 2792 }, { "epoch": 9.975, - "grad_norm": 66.67410278320312, + "grad_norm": 160.63015747070312, "learning_rate": 1.903846791434516e-10, - "loss": 9.793, + "loss": 9.1328, "step": 2793 }, { "epoch": 9.978571428571428, - "grad_norm": 46.092044830322266, + "grad_norm": 126.76197814941406, "learning_rate": 1.398746936509543e-10, - "loss": 9.5977, + "loss": 9.7637, "step": 2794 }, { "epoch": 9.982142857142858, - "grad_norm": 54.907958984375, + "grad_norm": 609.9268188476562, "learning_rate": 9.713534230904043e-11, - "loss": 9.7188, + "loss": 10.6133, "step": 2795 }, { "epoch": 9.985714285714286, - "grad_norm": 70.20608520507812, + "grad_norm": 30.234859466552734, "learning_rate": 6.216669154068822e-11, - "loss": 9.9062, + "loss": 21.7363, "step": 2796 }, { "epoch": 9.989285714285714, - "grad_norm": 49.69427490234375, + "grad_norm": 362.4040222167969, "learning_rate": 3.49687956946454e-11, - "loss": 10.793, + "loss": 13.2031, "step": 2797 }, { "epoch": 9.992857142857144, - "grad_norm": 53.355491638183594, + "grad_norm": 6.9988694190979, "learning_rate": 1.5541697039878067e-11, - "loss": 9.3398, + "loss": 9.623, "step": 2798 }, { "epoch": 9.996428571428572, - "grad_norm": 60.452571868896484, + "grad_norm": 1324.203369140625, "learning_rate": 3.885425769456496e-12, - "loss": 9.2852, + "loss": 21.7363, "step": 2799 }, { "epoch": 10.0, - "grad_norm": 52.8779296875, + "grad_norm": 1103.317626953125, "learning_rate": 0.0, - "loss": 11.4102, + "loss": 19.6836, "step": 2800 }, { "epoch": 10.0, - "eval_loss": 9.709858894348145, - "eval_mse": 9.709769039260202, - "eval_runtime": 11.5145, - "eval_samples_per_second": 246.646, - "eval_steps_per_second": 1.303, - "eval_target_0_mse": 18.50349469366631, - "eval_target_1_mse": 9.9816600312731, - "eval_target_2_mse": 5.286762892990152, - "eval_target_3_mse": 5.067158539111244, + "eval_loss": 12.646404266357422, + "eval_mse": 12.642837952482898, + "eval_runtime": 11.3765, + "eval_samples_per_second": 249.636, + "eval_steps_per_second": 1.319, + "eval_target_0_mse": 37.8820684668978, + "eval_target_1_mse": 8.898875766105508, + "eval_target_2_mse": 2.5619000897171813, + "eval_target_3_mse": 1.2285074872111057, "step": 2800 }, { "epoch": 10.0, "step": 2800, "total_flos": 1.1526867051872256e+19, - "train_loss": 12.104970703125, - "train_runtime": 3086.6057, - "train_samples_per_second": 174.146, - "train_steps_per_second": 0.907 + "train_loss": 183.57804966517858, + "train_runtime": 3247.1414, + "train_samples_per_second": 165.536, + "train_steps_per_second": 0.862 } ], "logging_steps": 1.0,