diff --git "a/checkpoint-500/trainer_state.json" "b/checkpoint-500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-500/trainer_state.json" @@ -0,0 +1,4734 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.07189072609633357, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "ce_ib": 64.96094512939453, + "ce_orig": 0.7435811758041382, + "epoch": 0, + "kl_loss": 3856.220703125, + "loss_ib": 192.84352111816406, + "step": 0 + }, + { + "ce_ib": 65.86748504638672, + "ce_orig": 1.261900782585144, + "epoch": 0.00014378145219266715, + "kl_loss": 3989.227294921875, + "loss_ib": 199.4943084716797, + "step": 1 + }, + { + "ce_ib": 63.70602798461914, + "ce_orig": 1.069283127784729, + "epoch": 0.0002875629043853343, + "kl_loss": 3018.861572265625, + "loss_ib": 150.9749298095703, + "step": 2 + }, + { + "ce_ib": 65.56930541992188, + "ce_orig": 0.9985544085502625, + "epoch": 0.00043134435657800146, + "kl_loss": 3440.156494140625, + "loss_ib": 172.04061889648438, + "step": 3 + }, + { + "ce_ib": 62.74740982055664, + "ce_orig": 0.7562570571899414, + "epoch": 0.0005751258087706686, + "kl_loss": 3999.942626953125, + "loss_ib": 200.02850341796875, + "step": 4 + }, + { + "epoch": 0.0007189072609633358, + "grad_norm": Infinity, + "learning_rate": 0.0, + "loss": 183.2429, + "step": 5 + }, + { + "ce_ib": 64.26500701904297, + "ce_orig": 0.704367995262146, + "epoch": 0.0007189072609633358, + "kl_loss": 3585.45947265625, + "loss_ib": 179.3050994873047, + "step": 5 + }, + { + "ce_ib": 64.85440826416016, + "ce_orig": 0.990234375, + "epoch": 0.0008626887131560029, + "kl_loss": 3754.7421875, + "loss_ib": 187.76953125, + "step": 6 + }, + { + "ce_ib": 64.96634674072266, + "ce_orig": 0.9086857438087463, + "epoch": 0.00100647016534867, + "kl_loss": 3929.58203125, + "loss_ib": 196.51158142089844, + "step": 7 + }, + { + "ce_ib": 64.07415771484375, + "ce_orig": 0.7150144577026367, + "epoch": 0.0011502516175413372, + "kl_loss": 3740.4150390625, + "loss_ib": 187.0527801513672, + "step": 8 + }, + { + "ce_ib": 65.45182800292969, + "ce_orig": 0.8336902856826782, + "epoch": 0.0012940330697340044, + "kl_loss": 3728.460205078125, + "loss_ib": 186.4557342529297, + "step": 9 + }, + { + "epoch": 0.0014378145219266715, + "grad_norm": 2721.89794921875, + "learning_rate": 0.0, + "loss": 185.5215, + "step": 10 + }, + { + "ce_ib": 64.31118774414062, + "ce_orig": 0.773049533367157, + "epoch": 0.0014378145219266715, + "kl_loss": 3938.441650390625, + "loss_ib": 196.95423889160156, + "step": 10 + }, + { + "ce_ib": 68.13079071044922, + "ce_orig": 1.7455261945724487, + "epoch": 0.0015815959741193387, + "kl_loss": 3521.255615234375, + "loss_ib": 176.0968475341797, + "step": 11 + }, + { + "ce_ib": 65.10643005371094, + "ce_orig": 1.115777850151062, + "epoch": 0.0017253774263120058, + "kl_loss": 3872.32763671875, + "loss_ib": 193.64894104003906, + "step": 12 + }, + { + "ce_ib": 66.39643859863281, + "ce_orig": 0.9639286994934082, + "epoch": 0.001869158878504673, + "kl_loss": 3771.317626953125, + "loss_ib": 188.59909057617188, + "step": 13 + }, + { + "ce_ib": 63.847900390625, + "ce_orig": 0.6167153716087341, + "epoch": 0.00201294033069734, + "kl_loss": 3233.497314453125, + "loss_ib": 161.706787109375, + "step": 14 + }, + { + "epoch": 0.002156721782890007, + "grad_norm": 2457.46435546875, + "learning_rate": 3.9936102236421723e-07, + "loss": 181.2678, + "step": 15 + }, + { + "ce_ib": 64.9281997680664, + "ce_orig": 1.2498202323913574, + "epoch": 0.002156721782890007, + "kl_loss": 3831.1611328125, + "loss_ib": 191.59051513671875, + "step": 15 + }, + { + "ce_ib": 65.2757797241211, + "ce_orig": 1.0968470573425293, + "epoch": 0.0023005032350826744, + "kl_loss": 3685.48193359375, + "loss_ib": 184.30674743652344, + "step": 16 + }, + { + "ce_ib": 64.68568420410156, + "ce_orig": 0.8841207027435303, + "epoch": 0.0024442846872753414, + "kl_loss": 3744.38134765625, + "loss_ib": 187.2514190673828, + "step": 17 + }, + { + "ce_ib": 64.82100677490234, + "ce_orig": 1.1195234060287476, + "epoch": 0.0025880661394680087, + "kl_loss": 3526.25927734375, + "loss_ib": 176.3453826904297, + "step": 18 + }, + { + "ce_ib": 63.829200744628906, + "ce_orig": 0.9802423119544983, + "epoch": 0.0027318475916606757, + "kl_loss": 3915.511474609375, + "loss_ib": 195.80747985839844, + "step": 19 + }, + { + "epoch": 0.002875629043853343, + "grad_norm": 2735.018310546875, + "learning_rate": 7.987220447284345e-07, + "loss": 187.5199, + "step": 20 + }, + { + "ce_ib": 64.30339050292969, + "ce_orig": 0.713705837726593, + "epoch": 0.002875629043853343, + "kl_loss": 3551.751220703125, + "loss_ib": 177.61972045898438, + "step": 20 + }, + { + "ce_ib": 65.2120132446289, + "ce_orig": 1.1765546798706055, + "epoch": 0.00301941049604601, + "kl_loss": 2725.1201171875, + "loss_ib": 136.2886199951172, + "step": 21 + }, + { + "ce_ib": 64.87537384033203, + "ce_orig": 1.1634544134140015, + "epoch": 0.0031631919482386773, + "kl_loss": 3579.1513671875, + "loss_ib": 178.99000549316406, + "step": 22 + }, + { + "ce_ib": 63.88631820678711, + "ce_orig": 0.9461633563041687, + "epoch": 0.0033069734004313443, + "kl_loss": 3845.448974609375, + "loss_ib": 192.3043975830078, + "step": 23 + }, + { + "ce_ib": 64.70732116699219, + "ce_orig": 1.03489351272583, + "epoch": 0.0034507548526240116, + "kl_loss": 3712.78076171875, + "loss_ib": 185.67138671875, + "step": 24 + }, + { + "epoch": 0.0035945363048166786, + "grad_norm": 2554.739501953125, + "learning_rate": 1.1980830670926517e-06, + "loss": 181.5605, + "step": 25 + }, + { + "ce_ib": 60.482200622558594, + "ce_orig": 0.3934582769870758, + "epoch": 0.0035945363048166786, + "kl_loss": 3043.28076171875, + "loss_ib": 152.19427490234375, + "step": 25 + }, + { + "ce_ib": 63.36570739746094, + "ce_orig": 0.7366315126419067, + "epoch": 0.003738317757009346, + "kl_loss": 3766.66796875, + "loss_ib": 188.36508178710938, + "step": 26 + }, + { + "ce_ib": 64.73159790039062, + "ce_orig": 0.8274144530296326, + "epoch": 0.003882099209202013, + "kl_loss": 4114.853515625, + "loss_ib": 205.77505493164062, + "step": 27 + }, + { + "ce_ib": 64.48138427734375, + "ce_orig": 0.9502752423286438, + "epoch": 0.00402588066139468, + "kl_loss": 3778.4033203125, + "loss_ib": 188.95240783691406, + "step": 28 + }, + { + "ce_ib": 65.42862701416016, + "ce_orig": 1.2197273969650269, + "epoch": 0.004169662113587347, + "kl_loss": 3769.447509765625, + "loss_ib": 188.50509643554688, + "step": 29 + }, + { + "epoch": 0.004313443565780014, + "grad_norm": 2604.34765625, + "learning_rate": 1.597444089456869e-06, + "loss": 185.5906, + "step": 30 + }, + { + "ce_ib": 66.01839447021484, + "ce_orig": 1.2358959913253784, + "epoch": 0.004313443565780014, + "kl_loss": 3728.17578125, + "loss_ib": 186.44180297851562, + "step": 30 + }, + { + "ce_ib": 63.86208724975586, + "ce_orig": 0.9959704875946045, + "epoch": 0.004457225017972682, + "kl_loss": 4007.545654296875, + "loss_ib": 200.4092254638672, + "step": 31 + }, + { + "ce_ib": 63.890628814697266, + "ce_orig": 0.785792887210846, + "epoch": 0.004601006470165349, + "kl_loss": 3334.66552734375, + "loss_ib": 166.76522827148438, + "step": 32 + }, + { + "ce_ib": 63.21049118041992, + "ce_orig": 0.9940950274467468, + "epoch": 0.004744787922358016, + "kl_loss": 3633.696533203125, + "loss_ib": 181.7164306640625, + "step": 33 + }, + { + "ce_ib": 65.66783905029297, + "ce_orig": 1.3076696395874023, + "epoch": 0.004888569374550683, + "kl_loss": 3813.81396484375, + "loss_ib": 190.72354125976562, + "step": 34 + }, + { + "epoch": 0.0050323508267433505, + "grad_norm": 2616.953369140625, + "learning_rate": 1.9968051118210863e-06, + "loss": 187.4055, + "step": 35 + }, + { + "ce_ib": 63.93610763549805, + "ce_orig": 1.0693968534469604, + "epoch": 0.0050323508267433505, + "kl_loss": 2865.432373046875, + "loss_ib": 143.3035888671875, + "step": 35 + }, + { + "ce_ib": 63.83279037475586, + "ce_orig": 0.9967127442359924, + "epoch": 0.0051761322789360175, + "kl_loss": 3682.119384765625, + "loss_ib": 184.1378936767578, + "step": 36 + }, + { + "ce_ib": 63.78058624267578, + "ce_orig": 0.7548370957374573, + "epoch": 0.005319913731128684, + "kl_loss": 3521.068359375, + "loss_ib": 176.08531188964844, + "step": 37 + }, + { + "ce_ib": 63.56543731689453, + "ce_orig": 0.9782358407974243, + "epoch": 0.005463695183321351, + "kl_loss": 3663.02587890625, + "loss_ib": 183.18309020996094, + "step": 38 + }, + { + "ce_ib": 65.04608917236328, + "ce_orig": 0.8212652802467346, + "epoch": 0.005607476635514018, + "kl_loss": 4082.810546875, + "loss_ib": 204.1730499267578, + "step": 39 + }, + { + "epoch": 0.005751258087706686, + "grad_norm": 2647.227294921875, + "learning_rate": 2.3961661341853035e-06, + "loss": 181.9243, + "step": 40 + }, + { + "ce_ib": 65.51580047607422, + "ce_orig": 1.1804547309875488, + "epoch": 0.005751258087706686, + "kl_loss": 3949.86279296875, + "loss_ib": 197.52590942382812, + "step": 40 + }, + { + "ce_ib": 63.80126190185547, + "ce_orig": 0.7081286907196045, + "epoch": 0.005895039539899353, + "kl_loss": 3912.12548828125, + "loss_ib": 195.63816833496094, + "step": 41 + }, + { + "ce_ib": 62.75702667236328, + "ce_orig": 0.7087532877922058, + "epoch": 0.00603882099209202, + "kl_loss": 3891.41259765625, + "loss_ib": 194.60202026367188, + "step": 42 + }, + { + "ce_ib": 64.6529541015625, + "ce_orig": 1.2302581071853638, + "epoch": 0.006182602444284687, + "kl_loss": 3733.06787109375, + "loss_ib": 186.6857147216797, + "step": 43 + }, + { + "ce_ib": 63.076133728027344, + "ce_orig": 0.8998420238494873, + "epoch": 0.006326383896477355, + "kl_loss": 3209.076904296875, + "loss_ib": 160.4853973388672, + "step": 44 + }, + { + "epoch": 0.006470165348670022, + "grad_norm": 2475.28173828125, + "learning_rate": 2.7955271565495207e-06, + "loss": 182.2709, + "step": 45 + }, + { + "ce_ib": 61.6526985168457, + "ce_orig": 0.6503346562385559, + "epoch": 0.006470165348670022, + "kl_loss": 3745.544921875, + "loss_ib": 187.30807495117188, + "step": 45 + }, + { + "ce_ib": 63.45009231567383, + "ce_orig": 1.2053508758544922, + "epoch": 0.0066139468008626886, + "kl_loss": 3523.22900390625, + "loss_ib": 176.19317626953125, + "step": 46 + }, + { + "ce_ib": 64.10655212402344, + "ce_orig": 1.1375271081924438, + "epoch": 0.0067577282530553555, + "kl_loss": 3449.62353515625, + "loss_ib": 172.5132293701172, + "step": 47 + }, + { + "ce_ib": 61.43892288208008, + "ce_orig": 0.6051114201545715, + "epoch": 0.006901509705248023, + "kl_loss": 2918.55615234375, + "loss_ib": 145.9585418701172, + "step": 48 + }, + { + "ce_ib": 62.07788848876953, + "ce_orig": 0.6533306837081909, + "epoch": 0.00704529115744069, + "kl_loss": 3852.226318359375, + "loss_ib": 192.64236450195312, + "step": 49 + }, + { + "epoch": 0.007189072609633357, + "grad_norm": 2666.057861328125, + "learning_rate": 3.194888178913738e-06, + "loss": 178.7684, + "step": 50 + }, + { + "ce_ib": 63.88691329956055, + "ce_orig": 0.9801137447357178, + "epoch": 0.007189072609633357, + "kl_loss": 3421.68798828125, + "loss_ib": 171.11634826660156, + "step": 50 + }, + { + "ce_ib": 63.39102554321289, + "ce_orig": 1.208335041999817, + "epoch": 0.007332854061826024, + "kl_loss": 3816.6484375, + "loss_ib": 190.86412048339844, + "step": 51 + }, + { + "ce_ib": 62.79481506347656, + "ce_orig": 1.1687792539596558, + "epoch": 0.007476635514018692, + "kl_loss": 3562.60595703125, + "loss_ib": 178.1616973876953, + "step": 52 + }, + { + "ce_ib": 59.30780792236328, + "ce_orig": 0.48160627484321594, + "epoch": 0.007620416966211359, + "kl_loss": 2727.423828125, + "loss_ib": 136.40084838867188, + "step": 53 + }, + { + "ce_ib": 60.45918655395508, + "ce_orig": 0.729110598564148, + "epoch": 0.007764198418404026, + "kl_loss": 3612.10888671875, + "loss_ib": 180.6356658935547, + "step": 54 + }, + { + "epoch": 0.007907979870596693, + "grad_norm": 2656.139404296875, + "learning_rate": 3.5942492012779555e-06, + "loss": 179.8713, + "step": 55 + }, + { + "ce_ib": 62.10517883300781, + "ce_orig": 0.8718740344047546, + "epoch": 0.007907979870596693, + "kl_loss": 3526.4736328125, + "loss_ib": 176.354736328125, + "step": 55 + }, + { + "ce_ib": 61.65966796875, + "ce_orig": 0.8692609071731567, + "epoch": 0.00805176132278936, + "kl_loss": 3907.55126953125, + "loss_ib": 195.4084014892578, + "step": 56 + }, + { + "ce_ib": 62.87982940673828, + "ce_orig": 1.1107903718948364, + "epoch": 0.008195542774982027, + "kl_loss": 3505.02880859375, + "loss_ib": 175.2828826904297, + "step": 57 + }, + { + "ce_ib": 61.1724853515625, + "ce_orig": 0.7401551008224487, + "epoch": 0.008339324227174694, + "kl_loss": 3798.70947265625, + "loss_ib": 189.966064453125, + "step": 58 + }, + { + "ce_ib": 64.25030517578125, + "ce_orig": 1.37394380569458, + "epoch": 0.008483105679367362, + "kl_loss": 3642.968994140625, + "loss_ib": 182.1805877685547, + "step": 59 + }, + { + "epoch": 0.008626887131560028, + "grad_norm": 2610.953369140625, + "learning_rate": 3.993610223642173e-06, + "loss": 181.3146, + "step": 60 + }, + { + "ce_ib": 61.306488037109375, + "ce_orig": 0.8149375915527344, + "epoch": 0.008626887131560028, + "kl_loss": 3716.3310546875, + "loss_ib": 185.8472137451172, + "step": 60 + }, + { + "ce_ib": 63.48124313354492, + "ce_orig": 1.404058575630188, + "epoch": 0.008770668583752696, + "kl_loss": 3322.540283203125, + "loss_ib": 166.15875244140625, + "step": 61 + }, + { + "ce_ib": 63.29791259765625, + "ce_orig": 1.1193113327026367, + "epoch": 0.008914450035945364, + "kl_loss": 3468.67529296875, + "loss_ib": 173.46542358398438, + "step": 62 + }, + { + "ce_ib": 62.13097381591797, + "ce_orig": 1.177595615386963, + "epoch": 0.00905823148813803, + "kl_loss": 3650.87646484375, + "loss_ib": 182.57489013671875, + "step": 63 + }, + { + "ce_ib": 60.98870849609375, + "ce_orig": 0.9005176424980164, + "epoch": 0.009202012940330698, + "kl_loss": 3419.352783203125, + "loss_ib": 170.99813842773438, + "step": 64 + }, + { + "epoch": 0.009345794392523364, + "grad_norm": 2418.697998046875, + "learning_rate": 4.39297124600639e-06, + "loss": 176.6997, + "step": 65 + }, + { + "ce_ib": 60.742515563964844, + "ce_orig": 0.8558191657066345, + "epoch": 0.009345794392523364, + "kl_loss": 3522.680908203125, + "loss_ib": 176.16441345214844, + "step": 65 + }, + { + "ce_ib": 61.35293960571289, + "ce_orig": 0.6822745203971863, + "epoch": 0.009489575844716032, + "kl_loss": 3174.692138671875, + "loss_ib": 158.76528930664062, + "step": 66 + }, + { + "ce_ib": 60.16307067871094, + "ce_orig": 0.6927408576011658, + "epoch": 0.0096333572969087, + "kl_loss": 3434.4345703125, + "loss_ib": 171.75181579589844, + "step": 67 + }, + { + "ce_ib": 60.551544189453125, + "ce_orig": 0.7352694272994995, + "epoch": 0.009777138749101365, + "kl_loss": 3096.6083984375, + "loss_ib": 154.86070251464844, + "step": 68 + }, + { + "ce_ib": 61.049774169921875, + "ce_orig": 0.8262593150138855, + "epoch": 0.009920920201294033, + "kl_loss": 3612.48095703125, + "loss_ib": 180.6545867919922, + "step": 69 + }, + { + "epoch": 0.010064701653486701, + "grad_norm": 2490.9189453125, + "learning_rate": 4.792332268370607e-06, + "loss": 168.4689, + "step": 70 + }, + { + "ce_ib": 59.70656204223633, + "ce_orig": 0.7031822204589844, + "epoch": 0.010064701653486701, + "kl_loss": 3356.534423828125, + "loss_ib": 167.85658264160156, + "step": 70 + }, + { + "ce_ib": 58.52351760864258, + "ce_orig": 0.8787212371826172, + "epoch": 0.010208483105679367, + "kl_loss": 3540.501953125, + "loss_ib": 177.0543670654297, + "step": 71 + }, + { + "ce_ib": 60.51292419433594, + "ce_orig": 0.872455894947052, + "epoch": 0.010352264557872035, + "kl_loss": 3272.396240234375, + "loss_ib": 163.6500701904297, + "step": 72 + }, + { + "ce_ib": 59.571720123291016, + "ce_orig": 0.9494105577468872, + "epoch": 0.010496046010064701, + "kl_loss": 3453.132568359375, + "loss_ib": 172.68641662597656, + "step": 73 + }, + { + "ce_ib": 60.829315185546875, + "ce_orig": 0.9209774136543274, + "epoch": 0.010639827462257369, + "kl_loss": 3232.62890625, + "loss_ib": 161.661865234375, + "step": 74 + }, + { + "epoch": 0.010783608914450037, + "grad_norm": 2442.193359375, + "learning_rate": 5.191693290734825e-06, + "loss": 169.7118, + "step": 75 + }, + { + "ce_ib": 58.91569519042969, + "ce_orig": 0.6644178628921509, + "epoch": 0.010783608914450037, + "kl_loss": 3597.986572265625, + "loss_ib": 179.9287872314453, + "step": 75 + }, + { + "ce_ib": 59.93379592895508, + "ce_orig": 0.6643899083137512, + "epoch": 0.010927390366642703, + "kl_loss": 3446.6064453125, + "loss_ib": 172.36029052734375, + "step": 76 + }, + { + "ce_ib": 61.500518798828125, + "ce_orig": 1.1189907789230347, + "epoch": 0.01107117181883537, + "kl_loss": 3529.22216796875, + "loss_ib": 176.4918670654297, + "step": 77 + }, + { + "ce_ib": 59.83492660522461, + "ce_orig": 0.6313321590423584, + "epoch": 0.011214953271028037, + "kl_loss": 3552.268798828125, + "loss_ib": 177.6433563232422, + "step": 78 + }, + { + "ce_ib": 59.81180953979492, + "ce_orig": 1.0850353240966797, + "epoch": 0.011358734723220704, + "kl_loss": 3466.029296875, + "loss_ib": 173.3313751220703, + "step": 79 + }, + { + "epoch": 0.011502516175413372, + "grad_norm": 2574.475341796875, + "learning_rate": 5.591054313099041e-06, + "loss": 170.7064, + "step": 80 + }, + { + "ce_ib": 62.353084564208984, + "ce_orig": 1.3420498371124268, + "epoch": 0.011502516175413372, + "kl_loss": 3235.225341796875, + "loss_ib": 161.79244995117188, + "step": 80 + }, + { + "ce_ib": 60.79109573364258, + "ce_orig": 1.3365955352783203, + "epoch": 0.011646297627606038, + "kl_loss": 3334.964599609375, + "loss_ib": 166.77862548828125, + "step": 81 + }, + { + "ce_ib": 60.66354751586914, + "ce_orig": 0.7855740785598755, + "epoch": 0.011790079079798706, + "kl_loss": 3520.52783203125, + "loss_ib": 176.05673217773438, + "step": 82 + }, + { + "ce_ib": 59.765869140625, + "ce_orig": 0.8332124352455139, + "epoch": 0.011933860531991374, + "kl_loss": 3197.080078125, + "loss_ib": 159.88389587402344, + "step": 83 + }, + { + "ce_ib": 60.493858337402344, + "ce_orig": 1.5375986099243164, + "epoch": 0.01207764198418404, + "kl_loss": 3220.095703125, + "loss_ib": 161.0350341796875, + "step": 84 + }, + { + "epoch": 0.012221423436376708, + "grad_norm": 2434.75390625, + "learning_rate": 5.990415335463259e-06, + "loss": 165.5288, + "step": 85 + }, + { + "ce_ib": 57.47407150268555, + "ce_orig": 0.8656359910964966, + "epoch": 0.012221423436376708, + "kl_loss": 3253.564453125, + "loss_ib": 162.7069549560547, + "step": 85 + }, + { + "ce_ib": 59.44199752807617, + "ce_orig": 0.8775683045387268, + "epoch": 0.012365204888569374, + "kl_loss": 3335.959716796875, + "loss_ib": 166.8277130126953, + "step": 86 + }, + { + "ce_ib": 59.450687408447266, + "ce_orig": 0.7520811557769775, + "epoch": 0.012508986340762042, + "kl_loss": 3219.025634765625, + "loss_ib": 160.9810028076172, + "step": 87 + }, + { + "ce_ib": 58.30461120605469, + "ce_orig": 0.9134323000907898, + "epoch": 0.01265276779295471, + "kl_loss": 2271.015625, + "loss_ib": 113.5799331665039, + "step": 88 + }, + { + "ce_ib": 58.684146881103516, + "ce_orig": 1.100233793258667, + "epoch": 0.012796549245147375, + "kl_loss": 3356.873779296875, + "loss_ib": 167.87303161621094, + "step": 89 + }, + { + "epoch": 0.012940330697340043, + "grad_norm": 2395.843994140625, + "learning_rate": 6.389776357827476e-06, + "loss": 153.9006, + "step": 90 + }, + { + "ce_ib": 57.74800491333008, + "ce_orig": 1.038017988204956, + "epoch": 0.012940330697340043, + "kl_loss": 3335.373046875, + "loss_ib": 166.7975311279297, + "step": 90 + }, + { + "ce_ib": 57.7064323425293, + "ce_orig": 0.921914279460907, + "epoch": 0.013084112149532711, + "kl_loss": 3284.214111328125, + "loss_ib": 164.23956298828125, + "step": 91 + }, + { + "ce_ib": 57.82907485961914, + "ce_orig": 0.9391928911209106, + "epoch": 0.013227893601725377, + "kl_loss": 3100.417724609375, + "loss_ib": 155.0498046875, + "step": 92 + }, + { + "ce_ib": 58.58163833618164, + "ce_orig": 1.0686829090118408, + "epoch": 0.013371675053918045, + "kl_loss": 3164.060546875, + "loss_ib": 158.23233032226562, + "step": 93 + }, + { + "ce_ib": 56.14845657348633, + "ce_orig": 0.4495549201965332, + "epoch": 0.013515456506110711, + "kl_loss": 2954.31982421875, + "loss_ib": 147.7440643310547, + "step": 94 + }, + { + "epoch": 0.013659237958303379, + "grad_norm": 2321.239013671875, + "learning_rate": 6.789137380191693e-06, + "loss": 156.8229, + "step": 95 + }, + { + "ce_ib": 57.893184661865234, + "ce_orig": 1.1613727807998657, + "epoch": 0.013659237958303379, + "kl_loss": 3191.733154296875, + "loss_ib": 159.6156005859375, + "step": 95 + }, + { + "ce_ib": 58.80827713012695, + "ce_orig": 1.103546380996704, + "epoch": 0.013803019410496047, + "kl_loss": 2989.6494140625, + "loss_ib": 149.5118865966797, + "step": 96 + }, + { + "ce_ib": 57.372257232666016, + "ce_orig": 0.7489521503448486, + "epoch": 0.013946800862688713, + "kl_loss": 3076.787109375, + "loss_ib": 153.8680419921875, + "step": 97 + }, + { + "ce_ib": 58.20832061767578, + "ce_orig": 1.0948829650878906, + "epoch": 0.01409058231488138, + "kl_loss": 3079.382080078125, + "loss_ib": 153.9982147216797, + "step": 98 + }, + { + "ce_ib": 57.56401443481445, + "ce_orig": 1.252577543258667, + "epoch": 0.014234363767074048, + "kl_loss": 2953.885498046875, + "loss_ib": 147.7230682373047, + "step": 99 + }, + { + "epoch": 0.014378145219266714, + "grad_norm": 2354.333251953125, + "learning_rate": 7.188498402555911e-06, + "loss": 151.9125, + "step": 100 + }, + { + "ce_ib": 57.552894592285156, + "ce_orig": 1.0784616470336914, + "epoch": 0.014378145219266714, + "kl_loss": 3071.802734375, + "loss_ib": 153.61891174316406, + "step": 100 + }, + { + "ce_ib": 56.86176681518555, + "ce_orig": 0.6961782574653625, + "epoch": 0.014521926671459382, + "kl_loss": 3063.923095703125, + "loss_ib": 153.22459411621094, + "step": 101 + }, + { + "ce_ib": 54.33999252319336, + "ce_orig": 0.5625413656234741, + "epoch": 0.014665708123652048, + "kl_loss": 2982.339111328125, + "loss_ib": 149.14413452148438, + "step": 102 + }, + { + "ce_ib": 55.64839553833008, + "ce_orig": 0.7663992047309875, + "epoch": 0.014809489575844716, + "kl_loss": 3048.7392578125, + "loss_ib": 152.46478271484375, + "step": 103 + }, + { + "ce_ib": 58.38852310180664, + "ce_orig": 1.1540484428405762, + "epoch": 0.014953271028037384, + "kl_loss": 2707.0400390625, + "loss_ib": 135.38121032714844, + "step": 104 + }, + { + "epoch": 0.01509705248023005, + "grad_norm": 2202.033447265625, + "learning_rate": 7.5878594249201285e-06, + "loss": 148.0272, + "step": 105 + }, + { + "ce_ib": 58.33769607543945, + "ce_orig": 1.282652735710144, + "epoch": 0.01509705248023005, + "kl_loss": 2638.0634765625, + "loss_ib": 131.93235778808594, + "step": 105 + }, + { + "ce_ib": 57.01485061645508, + "ce_orig": 1.1682567596435547, + "epoch": 0.015240833932422718, + "kl_loss": 2910.34326171875, + "loss_ib": 145.54566955566406, + "step": 106 + }, + { + "ce_ib": 56.478248596191406, + "ce_orig": 1.093648910522461, + "epoch": 0.015384615384615385, + "kl_loss": 2749.603759765625, + "loss_ib": 137.5084228515625, + "step": 107 + }, + { + "ce_ib": 55.969581604003906, + "ce_orig": 0.8221930861473083, + "epoch": 0.015528396836808052, + "kl_loss": 2742.017822265625, + "loss_ib": 137.12889099121094, + "step": 108 + }, + { + "ce_ib": 55.39336013793945, + "ce_orig": 1.138152837753296, + "epoch": 0.015672178289000718, + "kl_loss": 2064.01513671875, + "loss_ib": 103.22845458984375, + "step": 109 + }, + { + "epoch": 0.015815959741193385, + "grad_norm": 1960.8631591796875, + "learning_rate": 7.987220447284345e-06, + "loss": 139.8337, + "step": 110 + }, + { + "ce_ib": 57.7611198425293, + "ce_orig": 0.8048346042633057, + "epoch": 0.015815959741193385, + "kl_loss": 2746.037353515625, + "loss_ib": 137.33074951171875, + "step": 110 + }, + { + "ce_ib": 54.72801208496094, + "ce_orig": 0.9340922236442566, + "epoch": 0.015959741193386053, + "kl_loss": 2660.56982421875, + "loss_ib": 133.0558624267578, + "step": 111 + }, + { + "ce_ib": 56.28373336791992, + "ce_orig": 1.2209872007369995, + "epoch": 0.01610352264557872, + "kl_loss": 2672.66015625, + "loss_ib": 133.66114807128906, + "step": 112 + }, + { + "ce_ib": 55.83729553222656, + "ce_orig": 1.1345103979110718, + "epoch": 0.01624730409777139, + "kl_loss": 2844.884765625, + "loss_ib": 142.27215576171875, + "step": 113 + }, + { + "ce_ib": 55.07097625732422, + "ce_orig": 1.2232747077941895, + "epoch": 0.016391085549964053, + "kl_loss": 2053.98974609375, + "loss_ib": 102.7270278930664, + "step": 114 + }, + { + "epoch": 0.01653486700215672, + "grad_norm": 2072.072998046875, + "learning_rate": 8.386581469648563e-06, + "loss": 135.2724, + "step": 115 + }, + { + "ce_ib": 56.160423278808594, + "ce_orig": 0.8365716338157654, + "epoch": 0.01653486700215672, + "kl_loss": 2660.091796875, + "loss_ib": 133.0326690673828, + "step": 115 + }, + { + "ce_ib": 54.47496795654297, + "ce_orig": 1.0809355974197388, + "epoch": 0.01667864845434939, + "kl_loss": 2690.080078125, + "loss_ib": 134.53123474121094, + "step": 116 + }, + { + "ce_ib": 53.27537155151367, + "ce_orig": 1.1820772886276245, + "epoch": 0.016822429906542057, + "kl_loss": 2644.28173828125, + "loss_ib": 132.24072265625, + "step": 117 + }, + { + "ce_ib": 54.39961242675781, + "ce_orig": 1.2899582386016846, + "epoch": 0.016966211358734724, + "kl_loss": 2602.2080078125, + "loss_ib": 130.13760375976562, + "step": 118 + }, + { + "ce_ib": 52.184234619140625, + "ce_orig": 0.8353484869003296, + "epoch": 0.01710999281092739, + "kl_loss": 2481.124267578125, + "loss_ib": 124.0822982788086, + "step": 119 + }, + { + "epoch": 0.017253774263120056, + "grad_norm": 2074.370361328125, + "learning_rate": 8.78594249201278e-06, + "loss": 132.3383, + "step": 120 + }, + { + "ce_ib": 53.626792907714844, + "ce_orig": 0.689016580581665, + "epoch": 0.017253774263120056, + "kl_loss": 2638.05810546875, + "loss_ib": 131.9297332763672, + "step": 120 + }, + { + "ce_ib": 52.17591857910156, + "ce_orig": 0.7896418571472168, + "epoch": 0.017397555715312724, + "kl_loss": 1895.728515625, + "loss_ib": 94.81251525878906, + "step": 121 + }, + { + "ce_ib": 52.534664154052734, + "ce_orig": 0.7579004764556885, + "epoch": 0.017541337167505392, + "kl_loss": 2552.994873046875, + "loss_ib": 127.67601776123047, + "step": 122 + }, + { + "ce_ib": 52.31755065917969, + "ce_orig": 0.9803644418716431, + "epoch": 0.01768511861969806, + "kl_loss": 2607.78662109375, + "loss_ib": 130.41549682617188, + "step": 123 + }, + { + "ce_ib": 53.674556732177734, + "ce_orig": 0.8895677328109741, + "epoch": 0.017828900071890728, + "kl_loss": 2425.487548828125, + "loss_ib": 121.30122375488281, + "step": 124 + }, + { + "epoch": 0.017972681524083392, + "grad_norm": 1952.8394775390625, + "learning_rate": 9.185303514376996e-06, + "loss": 122.1088, + "step": 125 + }, + { + "ce_ib": 53.564842224121094, + "ce_orig": 1.256234049797058, + "epoch": 0.017972681524083392, + "kl_loss": 2441.9111328125, + "loss_ib": 122.1223373413086, + "step": 125 + }, + { + "ce_ib": 52.68410110473633, + "ce_orig": 1.2045953273773193, + "epoch": 0.01811646297627606, + "kl_loss": 2293.639404296875, + "loss_ib": 114.70831298828125, + "step": 126 + }, + { + "ce_ib": 54.133914947509766, + "ce_orig": 1.514359712600708, + "epoch": 0.018260244428468728, + "kl_loss": 2311.011962890625, + "loss_ib": 115.5776596069336, + "step": 127 + }, + { + "ce_ib": 52.44422149658203, + "ce_orig": 0.9751385450363159, + "epoch": 0.018404025880661395, + "kl_loss": 2449.00048828125, + "loss_ib": 122.47624969482422, + "step": 128 + }, + { + "ce_ib": 50.06406021118164, + "ce_orig": 0.7257946729660034, + "epoch": 0.018547807332854063, + "kl_loss": 2270.14794921875, + "loss_ib": 113.53243255615234, + "step": 129 + }, + { + "epoch": 0.018691588785046728, + "grad_norm": 1946.055419921875, + "learning_rate": 9.584664536741214e-06, + "loss": 119.3412, + "step": 130 + }, + { + "ce_ib": 53.00954055786133, + "ce_orig": 1.0300298929214478, + "epoch": 0.018691588785046728, + "kl_loss": 2245.056640625, + "loss_ib": 112.27933502197266, + "step": 130 + }, + { + "ce_ib": 50.96159362792969, + "ce_orig": 0.8871611952781677, + "epoch": 0.018835370237239395, + "kl_loss": 2287.427734375, + "loss_ib": 114.39686584472656, + "step": 131 + }, + { + "ce_ib": 52.4853515625, + "ce_orig": 1.039488673210144, + "epoch": 0.018979151689432063, + "kl_loss": 2313.669921875, + "loss_ib": 115.7097396850586, + "step": 132 + }, + { + "ce_ib": 51.21004104614258, + "ce_orig": 0.845194935798645, + "epoch": 0.01912293314162473, + "kl_loss": 2127.791259765625, + "loss_ib": 106.41517639160156, + "step": 133 + }, + { + "ce_ib": 52.13531494140625, + "ce_orig": 1.1372461318969727, + "epoch": 0.0192667145938174, + "kl_loss": 2183.52783203125, + "loss_ib": 109.20245361328125, + "step": 134 + }, + { + "epoch": 0.019410496046010063, + "grad_norm": 1868.0174560546875, + "learning_rate": 9.984025559105432e-06, + "loss": 111.8021, + "step": 135 + }, + { + "ce_ib": 50.40250015258789, + "ce_orig": 0.5799722075462341, + "epoch": 0.019410496046010063, + "kl_loss": 2218.98046875, + "loss_ib": 110.9742202758789, + "step": 135 + }, + { + "ce_ib": 49.68068313598633, + "ce_orig": 0.9591949582099915, + "epoch": 0.01955427749820273, + "kl_loss": 2150.03564453125, + "loss_ib": 107.52662658691406, + "step": 136 + }, + { + "ce_ib": 49.44940185546875, + "ce_orig": 0.8390080332756042, + "epoch": 0.0196980589503954, + "kl_loss": 1845.8321533203125, + "loss_ib": 92.31633758544922, + "step": 137 + }, + { + "ce_ib": 52.21055221557617, + "ce_orig": 1.4367856979370117, + "epoch": 0.019841840402588067, + "kl_loss": 2051.351806640625, + "loss_ib": 102.59370422363281, + "step": 138 + }, + { + "ce_ib": 50.15422058105469, + "ce_orig": 1.2788512706756592, + "epoch": 0.019985621854780734, + "kl_loss": 2027.796875, + "loss_ib": 101.41492462158203, + "step": 139 + }, + { + "epoch": 0.020129403306973402, + "grad_norm": 1761.1328125, + "learning_rate": 1.038338658146965e-05, + "loss": 102.0922, + "step": 140 + }, + { + "ce_ib": 50.65812301635742, + "ce_orig": 1.2148778438568115, + "epoch": 0.020129403306973402, + "kl_loss": 1903.0653076171875, + "loss_ib": 95.17859649658203, + "step": 140 + }, + { + "ce_ib": 49.68356704711914, + "ce_orig": 1.0471272468566895, + "epoch": 0.020273184759166066, + "kl_loss": 1945.1756591796875, + "loss_ib": 97.28362274169922, + "step": 141 + }, + { + "ce_ib": 47.467594146728516, + "ce_orig": 0.6498449444770813, + "epoch": 0.020416966211358734, + "kl_loss": 1910.508544921875, + "loss_ib": 95.54916381835938, + "step": 142 + }, + { + "ce_ib": 47.57759094238281, + "ce_orig": 0.8552805781364441, + "epoch": 0.020560747663551402, + "kl_loss": 1907.689697265625, + "loss_ib": 95.40827941894531, + "step": 143 + }, + { + "ce_ib": 50.04168701171875, + "ce_orig": 1.1924092769622803, + "epoch": 0.02070452911574407, + "kl_loss": 1690.5194091796875, + "loss_ib": 84.55098724365234, + "step": 144 + }, + { + "epoch": 0.020848310567936738, + "grad_norm": 1661.7161865234375, + "learning_rate": 1.0782747603833867e-05, + "loss": 95.0661, + "step": 145 + }, + { + "ce_ib": 48.34769821166992, + "ce_orig": 1.0676274299621582, + "epoch": 0.020848310567936738, + "kl_loss": 1760.777099609375, + "loss_ib": 88.06303405761719, + "step": 145 + }, + { + "ce_ib": 47.0106201171875, + "ce_orig": 1.2353283166885376, + "epoch": 0.020992092020129402, + "kl_loss": 1798.0538330078125, + "loss_ib": 89.92620086669922, + "step": 146 + }, + { + "ce_ib": 48.50465393066406, + "ce_orig": 1.0733487606048584, + "epoch": 0.02113587347232207, + "kl_loss": 1734.2774658203125, + "loss_ib": 86.73812866210938, + "step": 147 + }, + { + "ce_ib": 45.04286193847656, + "ce_orig": 0.8353049159049988, + "epoch": 0.021279654924514738, + "kl_loss": 1773.991943359375, + "loss_ib": 88.72212219238281, + "step": 148 + }, + { + "ce_ib": 47.320526123046875, + "ce_orig": 1.219022512435913, + "epoch": 0.021423436376707405, + "kl_loss": 1750.5101318359375, + "loss_ib": 87.54916381835938, + "step": 149 + }, + { + "epoch": 0.021567217828900073, + "grad_norm": 1564.1060791015625, + "learning_rate": 1.1182108626198083e-05, + "loss": 87.3973, + "step": 150 + }, + { + "ce_ib": 47.08059310913086, + "ce_orig": 0.8302248120307922, + "epoch": 0.021567217828900073, + "kl_loss": 1589.67822265625, + "loss_ib": 79.50745391845703, + "step": 150 + }, + { + "ce_ib": 47.74165725708008, + "ce_orig": 0.7171430587768555, + "epoch": 0.021710999281092738, + "kl_loss": 1636.097412109375, + "loss_ib": 81.82874298095703, + "step": 151 + }, + { + "ce_ib": 46.43238830566406, + "ce_orig": 0.7868078947067261, + "epoch": 0.021854780733285405, + "kl_loss": 1594.2950439453125, + "loss_ib": 79.73796844482422, + "step": 152 + }, + { + "ce_ib": 44.30559539794922, + "ce_orig": 1.0339301824569702, + "epoch": 0.021998562185478073, + "kl_loss": 1573.078857421875, + "loss_ib": 78.67610168457031, + "step": 153 + }, + { + "ce_ib": 44.82426834106445, + "ce_orig": 1.0889431238174438, + "epoch": 0.02214234363767074, + "kl_loss": 1503.557861328125, + "loss_ib": 75.20030975341797, + "step": 154 + }, + { + "epoch": 0.02228612508986341, + "grad_norm": 1438.786376953125, + "learning_rate": 1.1581469648562302e-05, + "loss": 77.0769, + "step": 155 + }, + { + "ce_ib": 46.833580017089844, + "ce_orig": 1.3186697959899902, + "epoch": 0.02228612508986341, + "kl_loss": 1401.23486328125, + "loss_ib": 70.08516693115234, + "step": 155 + }, + { + "ce_ib": 46.787254333496094, + "ce_orig": 0.9530097246170044, + "epoch": 0.022429906542056073, + "kl_loss": 1483.8673095703125, + "loss_ib": 74.21675872802734, + "step": 156 + }, + { + "ce_ib": 46.81201171875, + "ce_orig": 1.1565759181976318, + "epoch": 0.02257368799424874, + "kl_loss": 1341.5926513671875, + "loss_ib": 67.10304260253906, + "step": 157 + }, + { + "ce_ib": 45.95816421508789, + "ce_orig": 1.2070651054382324, + "epoch": 0.02271746944644141, + "kl_loss": 1238.02880859375, + "loss_ib": 61.92441940307617, + "step": 158 + }, + { + "ce_ib": 43.655330657958984, + "ce_orig": 0.7560437321662903, + "epoch": 0.022861250898634077, + "kl_loss": 1213.171875, + "loss_ib": 60.680419921875, + "step": 159 + }, + { + "epoch": 0.023005032350826744, + "grad_norm": 1232.082763671875, + "learning_rate": 1.1980830670926518e-05, + "loss": 66.2022, + "step": 160 + }, + { + "ce_ib": 46.1483268737793, + "ce_orig": 1.2238253355026245, + "epoch": 0.023005032350826744, + "kl_loss": 1228.1053466796875, + "loss_ib": 61.428340911865234, + "step": 160 + }, + { + "ce_ib": 47.45783996582031, + "ce_orig": 1.587011694908142, + "epoch": 0.023148813803019412, + "kl_loss": 1157.474365234375, + "loss_ib": 57.8974494934082, + "step": 161 + }, + { + "ce_ib": 45.567657470703125, + "ce_orig": 1.3450465202331543, + "epoch": 0.023292595255212076, + "kl_loss": 1117.0738525390625, + "loss_ib": 55.8764762878418, + "step": 162 + }, + { + "ce_ib": 44.280250549316406, + "ce_orig": 1.2833943367004395, + "epoch": 0.023436376707404744, + "kl_loss": 1064.94677734375, + "loss_ib": 53.26947784423828, + "step": 163 + }, + { + "ce_ib": 43.552276611328125, + "ce_orig": 0.7282137870788574, + "epoch": 0.023580158159597412, + "kl_loss": 1048.7855224609375, + "loss_ib": 52.461055755615234, + "step": 164 + }, + { + "epoch": 0.02372393961179008, + "grad_norm": 1095.414794921875, + "learning_rate": 1.2380191693290735e-05, + "loss": 56.1672, + "step": 165 + }, + { + "ce_ib": 43.77597427368164, + "ce_orig": 1.015390157699585, + "epoch": 0.02372393961179008, + "kl_loss": 996.73828125, + "loss_ib": 49.858802795410156, + "step": 165 + }, + { + "ce_ib": 43.67955017089844, + "ce_orig": 1.0842498540878296, + "epoch": 0.023867721063982748, + "kl_loss": 947.7979736328125, + "loss_ib": 47.411739349365234, + "step": 166 + }, + { + "ce_ib": 42.23929214477539, + "ce_orig": 0.9063572287559509, + "epoch": 0.024011502516175412, + "kl_loss": 897.7328491210938, + "loss_ib": 44.90776062011719, + "step": 167 + }, + { + "ce_ib": 45.15175247192383, + "ce_orig": 1.2839014530181885, + "epoch": 0.02415528396836808, + "kl_loss": 786.1021728515625, + "loss_ib": 39.32768630981445, + "step": 168 + }, + { + "ce_ib": 43.47494125366211, + "ce_orig": 0.9718037247657776, + "epoch": 0.024299065420560748, + "kl_loss": 860.2787475585938, + "loss_ib": 43.035675048828125, + "step": 169 + }, + { + "epoch": 0.024442846872753415, + "grad_norm": 1042.965087890625, + "learning_rate": 1.2779552715654951e-05, + "loss": 47.1098, + "step": 170 + }, + { + "ce_ib": 43.44833755493164, + "ce_orig": 1.3617149591445923, + "epoch": 0.024442846872753415, + "kl_loss": 803.04443359375, + "loss_ib": 40.173946380615234, + "step": 170 + }, + { + "ce_ib": 43.578792572021484, + "ce_orig": 0.9456126093864441, + "epoch": 0.024586628324946083, + "kl_loss": 699.483642578125, + "loss_ib": 34.9959716796875, + "step": 171 + }, + { + "ce_ib": 47.23983383178711, + "ce_orig": 1.8550169467926025, + "epoch": 0.024730409777138748, + "kl_loss": 675.0830688476562, + "loss_ib": 33.777774810791016, + "step": 172 + }, + { + "ce_ib": 43.85140609741211, + "ce_orig": 1.138585090637207, + "epoch": 0.024874191229331415, + "kl_loss": 659.050537109375, + "loss_ib": 32.97445297241211, + "step": 173 + }, + { + "ce_ib": 41.722373962402344, + "ce_orig": 1.243048906326294, + "epoch": 0.025017972681524083, + "kl_loss": 590.1033325195312, + "loss_ib": 29.526029586791992, + "step": 174 + }, + { + "epoch": 0.02516175413371675, + "grad_norm": 766.4675903320312, + "learning_rate": 1.3178913738019169e-05, + "loss": 35.4625, + "step": 175 + }, + { + "ce_ib": 44.18523406982422, + "ce_orig": 1.255010962486267, + "epoch": 0.02516175413371675, + "kl_loss": 592.2353515625, + "loss_ib": 29.633859634399414, + "step": 175 + }, + { + "ce_ib": 46.14919662475586, + "ce_orig": 0.9867228865623474, + "epoch": 0.02530553558590942, + "kl_loss": 509.3572692871094, + "loss_ib": 25.490938186645508, + "step": 176 + }, + { + "ce_ib": 46.58631134033203, + "ce_orig": 1.1491925716400146, + "epoch": 0.025449317038102087, + "kl_loss": 459.72491455078125, + "loss_ib": 23.009538650512695, + "step": 177 + }, + { + "ce_ib": 40.569705963134766, + "ce_orig": 0.7460018396377563, + "epoch": 0.02559309849029475, + "kl_loss": 352.4381103515625, + "loss_ib": 17.64219093322754, + "step": 178 + }, + { + "ce_ib": 46.841224670410156, + "ce_orig": 1.1470024585723877, + "epoch": 0.02573687994248742, + "kl_loss": 389.03765869140625, + "loss_ib": 19.475303649902344, + "step": 179 + }, + { + "epoch": 0.025880661394680086, + "grad_norm": 621.7545776367188, + "learning_rate": 1.3578274760383387e-05, + "loss": 24.304, + "step": 180 + }, + { + "ce_ib": 47.13188552856445, + "ce_orig": 1.330838680267334, + "epoch": 0.025880661394680086, + "kl_loss": 379.19989013671875, + "loss_ib": 18.98356056213379, + "step": 180 + }, + { + "ce_ib": 52.806861877441406, + "ce_orig": 2.114457130432129, + "epoch": 0.026024442846872754, + "kl_loss": 284.24114990234375, + "loss_ib": 14.2384614944458, + "step": 181 + }, + { + "ce_ib": 64.3398208618164, + "ce_orig": 0.9137269854545593, + "epoch": 0.026168224299065422, + "kl_loss": 240.51211547851562, + "loss_ib": 12.05777645111084, + "step": 182 + }, + { + "ce_ib": 62.500736236572266, + "ce_orig": 1.4484410285949707, + "epoch": 0.026312005751258086, + "kl_loss": 223.3756103515625, + "loss_ib": 11.200030326843262, + "step": 183 + }, + { + "ce_ib": 70.71409606933594, + "ce_orig": 1.7095311880111694, + "epoch": 0.026455787203450754, + "kl_loss": 207.35211181640625, + "loss_ib": 10.402962684631348, + "step": 184 + }, + { + "epoch": 0.026599568655643422, + "grad_norm": 365.3516845703125, + "learning_rate": 1.3977635782747606e-05, + "loss": 14.3906, + "step": 185 + }, + { + "ce_ib": 63.17552947998047, + "ce_orig": 2.531080484390259, + "epoch": 0.026599568655643422, + "kl_loss": 165.71923828125, + "loss_ib": 8.317549705505371, + "step": 185 + }, + { + "ce_ib": 84.16175842285156, + "ce_orig": 2.0548126697540283, + "epoch": 0.02674335010783609, + "kl_loss": 140.68856811523438, + "loss_ib": 7.07650899887085, + "step": 186 + }, + { + "ce_ib": 77.55998992919922, + "ce_orig": 1.809746503829956, + "epoch": 0.026887131560028758, + "kl_loss": 121.14314270019531, + "loss_ib": 6.095937252044678, + "step": 187 + }, + { + "ce_ib": 77.81259155273438, + "ce_orig": 1.9566165208816528, + "epoch": 0.027030913012221422, + "kl_loss": 101.40348815917969, + "loss_ib": 5.109080791473389, + "step": 188 + }, + { + "ce_ib": 69.3860092163086, + "ce_orig": 1.703246831893921, + "epoch": 0.02717469446441409, + "kl_loss": 93.15800476074219, + "loss_ib": 4.692593574523926, + "step": 189 + }, + { + "epoch": 0.027318475916606758, + "grad_norm": 189.81011962890625, + "learning_rate": 1.4376996805111822e-05, + "loss": 8.0904, + "step": 190 + }, + { + "ce_ib": 60.081260681152344, + "ce_orig": 1.8992141485214233, + "epoch": 0.027318475916606758, + "kl_loss": 100.07512664794922, + "loss_ib": 5.033797264099121, + "step": 190 + }, + { + "ce_ib": 58.84217834472656, + "ce_orig": 1.018802285194397, + "epoch": 0.027462257368799425, + "kl_loss": 72.03784942626953, + "loss_ib": 3.6313138008117676, + "step": 191 + }, + { + "ce_ib": 58.045005798339844, + "ce_orig": 1.0986676216125488, + "epoch": 0.027606038820992093, + "kl_loss": 78.45074462890625, + "loss_ib": 3.9515597820281982, + "step": 192 + }, + { + "ce_ib": 53.310672760009766, + "ce_orig": 1.2137010097503662, + "epoch": 0.027749820273184757, + "kl_loss": 56.91468811035156, + "loss_ib": 2.872389793395996, + "step": 193 + }, + { + "ce_ib": 47.84365463256836, + "ce_orig": 1.141069769859314, + "epoch": 0.027893601725377425, + "kl_loss": 49.71630096435547, + "loss_ib": 2.5097367763519287, + "step": 194 + }, + { + "epoch": 0.028037383177570093, + "grad_norm": 102.58991241455078, + "learning_rate": 1.477635782747604e-05, + "loss": 4.9077, + "step": 195 + }, + { + "ce_ib": 47.62435531616211, + "ce_orig": 1.2880587577819824, + "epoch": 0.028037383177570093, + "kl_loss": 51.41084289550781, + "loss_ib": 2.5943543910980225, + "step": 195 + }, + { + "ce_ib": 45.36109161376953, + "ce_orig": 0.8635546565055847, + "epoch": 0.02818116462976276, + "kl_loss": 44.00239181518555, + "loss_ib": 2.2228000164031982, + "step": 196 + }, + { + "ce_ib": 41.63899230957031, + "ce_orig": 1.1706253290176392, + "epoch": 0.02832494608195543, + "kl_loss": 40.91154479980469, + "loss_ib": 2.066396713256836, + "step": 197 + }, + { + "ce_ib": 40.076194763183594, + "ce_orig": 0.9194042682647705, + "epoch": 0.028468727534148097, + "kl_loss": 38.050926208496094, + "loss_ib": 1.9225844144821167, + "step": 198 + }, + { + "ce_ib": 35.384429931640625, + "ce_orig": 0.8929119110107422, + "epoch": 0.02861250898634076, + "kl_loss": 40.8455924987793, + "loss_ib": 2.059971809387207, + "step": 199 + }, + { + "epoch": 0.02875629043853343, + "grad_norm": 53.01378631591797, + "learning_rate": 1.5175718849840257e-05, + "loss": 3.3711, + "step": 200 + }, + { + "ce_ib": 37.04123306274414, + "ce_orig": 0.9671800136566162, + "epoch": 0.02875629043853343, + "kl_loss": 35.982879638671875, + "loss_ib": 1.8176645040512085, + "step": 200 + }, + { + "ce_ib": 37.61642837524414, + "ce_orig": 1.3750962018966675, + "epoch": 0.028900071890726096, + "kl_loss": 30.0595703125, + "loss_ib": 1.5217866897583008, + "step": 201 + }, + { + "ce_ib": 40.238006591796875, + "ce_orig": 1.7766847610473633, + "epoch": 0.029043853342918764, + "kl_loss": 26.227649688720703, + "loss_ib": 1.3315014839172363, + "step": 202 + }, + { + "ce_ib": 38.055755615234375, + "ce_orig": 1.213000774383545, + "epoch": 0.029187634795111432, + "kl_loss": 29.874595642089844, + "loss_ib": 1.512757658958435, + "step": 203 + }, + { + "ce_ib": 32.915287017822266, + "ce_orig": 0.86496502161026, + "epoch": 0.029331416247304096, + "kl_loss": 30.330623626708984, + "loss_ib": 1.5329889059066772, + "step": 204 + }, + { + "epoch": 0.029475197699496764, + "grad_norm": 23.908042907714844, + "learning_rate": 1.5575079872204475e-05, + "loss": 2.7418, + "step": 205 + }, + { + "ce_ib": 35.01918411254883, + "ce_orig": 1.2721844911575317, + "epoch": 0.029475197699496764, + "kl_loss": 26.31290054321289, + "loss_ib": 1.333154559135437, + "step": 205 + }, + { + "ce_ib": 32.87287139892578, + "ce_orig": 1.1400614976882935, + "epoch": 0.029618979151689432, + "kl_loss": 26.177352905273438, + "loss_ib": 1.3253041505813599, + "step": 206 + }, + { + "ce_ib": 35.85771942138672, + "ce_orig": 1.3444130420684814, + "epoch": 0.0297627606038821, + "kl_loss": 34.98625183105469, + "loss_ib": 1.7672414779663086, + "step": 207 + }, + { + "ce_ib": 31.547334671020508, + "ce_orig": 0.9652504920959473, + "epoch": 0.029906542056074768, + "kl_loss": 20.096208572387695, + "loss_ib": 1.0205841064453125, + "step": 208 + }, + { + "ce_ib": 32.527896881103516, + "ce_orig": 1.0664165019989014, + "epoch": 0.030050323508267432, + "kl_loss": 20.749666213989258, + "loss_ib": 1.053747296333313, + "step": 209 + }, + { + "epoch": 0.0301941049604601, + "grad_norm": 22.29879379272461, + "learning_rate": 1.597444089456869e-05, + "loss": 2.3639, + "step": 210 + }, + { + "ce_ib": 32.71118927001953, + "ce_orig": 1.2022026777267456, + "epoch": 0.0301941049604601, + "kl_loss": 20.01553726196289, + "loss_ib": 1.0171325206756592, + "step": 210 + }, + { + "ce_ib": 30.730520248413086, + "ce_orig": 0.8476402163505554, + "epoch": 0.030337886412652768, + "kl_loss": 17.856157302856445, + "loss_ib": 0.9081730842590332, + "step": 211 + }, + { + "ce_ib": 29.36618995666504, + "ce_orig": 0.7845667004585266, + "epoch": 0.030481667864845435, + "kl_loss": 18.906429290771484, + "loss_ib": 0.9600045084953308, + "step": 212 + }, + { + "ce_ib": 27.0706844329834, + "ce_orig": 0.3370642066001892, + "epoch": 0.030625449317038103, + "kl_loss": 18.621246337890625, + "loss_ib": 0.9445976614952087, + "step": 213 + }, + { + "ce_ib": 34.2156867980957, + "ce_orig": 0.6062073111534119, + "epoch": 0.03076923076923077, + "kl_loss": 19.93109893798828, + "loss_ib": 1.0136628150939941, + "step": 214 + }, + { + "epoch": 0.030913012221423435, + "grad_norm": 16.176197052001953, + "learning_rate": 1.6373801916932906e-05, + "loss": 1.8299, + "step": 215 + }, + { + "ce_ib": 28.275493621826172, + "ce_orig": 1.0786405801773071, + "epoch": 0.030913012221423435, + "kl_loss": 18.11065673828125, + "loss_ib": 0.919670581817627, + "step": 215 + }, + { + "ce_ib": 32.22303009033203, + "ce_orig": 0.8485822081565857, + "epoch": 0.031056793673616103, + "kl_loss": 19.91845703125, + "loss_ib": 1.012034296989441, + "step": 216 + }, + { + "ce_ib": 27.742530822753906, + "ce_orig": 0.8794713020324707, + "epoch": 0.03120057512580877, + "kl_loss": 14.383018493652344, + "loss_ib": 0.7330222129821777, + "step": 217 + }, + { + "ce_ib": 31.794010162353516, + "ce_orig": 0.6954683661460876, + "epoch": 0.031344356578001435, + "kl_loss": 16.9901180267334, + "loss_ib": 0.86540287733078, + "step": 218 + }, + { + "ce_ib": 31.032941818237305, + "ce_orig": 1.1841130256652832, + "epoch": 0.0314881380301941, + "kl_loss": 13.406427383422852, + "loss_ib": 0.6858378648757935, + "step": 219 + }, + { + "epoch": 0.03163191948238677, + "grad_norm": 14.094661712646484, + "learning_rate": 1.6773162939297126e-05, + "loss": 1.8049, + "step": 220 + }, + { + "ce_ib": 32.347557067871094, + "ce_orig": 0.7493203282356262, + "epoch": 0.03163191948238677, + "kl_loss": 18.79709815979004, + "loss_ib": 0.9560286402702332, + "step": 220 + }, + { + "ce_ib": 31.1737060546875, + "ce_orig": 1.0758991241455078, + "epoch": 0.03177570093457944, + "kl_loss": 20.572391510009766, + "loss_ib": 1.0442065000534058, + "step": 221 + }, + { + "ce_ib": 30.046796798706055, + "ce_orig": 0.6758080124855042, + "epoch": 0.031919482386772106, + "kl_loss": 13.740577697753906, + "loss_ib": 0.7020522952079773, + "step": 222 + }, + { + "ce_ib": 30.532987594604492, + "ce_orig": 1.0658819675445557, + "epoch": 0.032063263838964774, + "kl_loss": 11.958425521850586, + "loss_ib": 0.6131877899169922, + "step": 223 + }, + { + "ce_ib": 31.667829513549805, + "ce_orig": 1.1956491470336914, + "epoch": 0.03220704529115744, + "kl_loss": 12.607803344726562, + "loss_ib": 0.6462240815162659, + "step": 224 + }, + { + "epoch": 0.03235082674335011, + "grad_norm": 5.280292987823486, + "learning_rate": 1.7172523961661345e-05, + "loss": 1.6789, + "step": 225 + }, + { + "ce_ib": 31.769432067871094, + "ce_orig": 1.1676932573318481, + "epoch": 0.03235082674335011, + "kl_loss": 10.932788848876953, + "loss_ib": 0.5625241994857788, + "step": 225 + }, + { + "ce_ib": 30.539987564086914, + "ce_orig": 1.3033503293991089, + "epoch": 0.03249460819554278, + "kl_loss": 12.105158805847168, + "loss_ib": 0.6205279231071472, + "step": 226 + }, + { + "ce_ib": 28.660764694213867, + "ce_orig": 0.6191550493240356, + "epoch": 0.032638389647735445, + "kl_loss": 9.103292465209961, + "loss_ib": 0.46949502825737, + "step": 227 + }, + { + "ce_ib": 29.167789459228516, + "ce_orig": 0.7975085973739624, + "epoch": 0.032782171099928106, + "kl_loss": 10.556224822998047, + "loss_ib": 0.5423951148986816, + "step": 228 + }, + { + "ce_ib": 29.07682991027832, + "ce_orig": 1.0861161947250366, + "epoch": 0.032925952552120774, + "kl_loss": 9.905879974365234, + "loss_ib": 0.5098324418067932, + "step": 229 + }, + { + "epoch": 0.03306973400431344, + "grad_norm": 5.249032974243164, + "learning_rate": 1.757188498402556e-05, + "loss": 1.5289, + "step": 230 + }, + { + "ce_ib": 28.152671813964844, + "ce_orig": 0.8374654054641724, + "epoch": 0.03306973400431344, + "kl_loss": 10.608142852783203, + "loss_ib": 0.544483482837677, + "step": 230 + }, + { + "ce_ib": 26.917634963989258, + "ce_orig": 1.0145775079727173, + "epoch": 0.03321351545650611, + "kl_loss": 12.240139961242676, + "loss_ib": 0.6254658102989197, + "step": 231 + }, + { + "ce_ib": 29.13026237487793, + "ce_orig": 0.8860113024711609, + "epoch": 0.03335729690869878, + "kl_loss": 10.390130043029785, + "loss_ib": 0.5340716242790222, + "step": 232 + }, + { + "ce_ib": 27.842693328857422, + "ce_orig": 1.1651474237442017, + "epoch": 0.033501078360891445, + "kl_loss": 9.602649688720703, + "loss_ib": 0.49405384063720703, + "step": 233 + }, + { + "ce_ib": 24.930410385131836, + "ce_orig": 0.6166008710861206, + "epoch": 0.03364485981308411, + "kl_loss": 10.271495819091797, + "loss_ib": 0.5260400176048279, + "step": 234 + }, + { + "epoch": 0.03378864126527678, + "grad_norm": 5.078007698059082, + "learning_rate": 1.7971246006389777e-05, + "loss": 1.4736, + "step": 235 + }, + { + "ce_ib": 27.262588500976562, + "ce_orig": 1.0158146619796753, + "epoch": 0.03378864126527678, + "kl_loss": 10.76059627532959, + "loss_ib": 0.5516611337661743, + "step": 235 + }, + { + "ce_ib": 28.12372398376465, + "ce_orig": 0.9383306503295898, + "epoch": 0.03393242271746945, + "kl_loss": 9.931174278259277, + "loss_ib": 0.5106205940246582, + "step": 236 + }, + { + "ce_ib": 27.61408233642578, + "ce_orig": 1.3611609935760498, + "epoch": 0.034076204169662117, + "kl_loss": 11.292994499206543, + "loss_ib": 0.5784568190574646, + "step": 237 + }, + { + "ce_ib": 25.42107582092285, + "ce_orig": 0.8392754197120667, + "epoch": 0.03421998562185478, + "kl_loss": 10.153192520141602, + "loss_ib": 0.5203701853752136, + "step": 238 + }, + { + "ce_ib": 26.299903869628906, + "ce_orig": 0.8138580918312073, + "epoch": 0.034363767074047445, + "kl_loss": 10.076581001281738, + "loss_ib": 0.5169789791107178, + "step": 239 + }, + { + "epoch": 0.03450754852624011, + "grad_norm": 1.6015193462371826, + "learning_rate": 1.8370607028753993e-05, + "loss": 1.371, + "step": 240 + }, + { + "ce_ib": 25.303178787231445, + "ce_orig": 0.9828527569770813, + "epoch": 0.03450754852624011, + "kl_loss": 9.301372528076172, + "loss_ib": 0.4777202308177948, + "step": 240 + }, + { + "ce_ib": 17.445037841796875, + "ce_orig": 0.447143018245697, + "epoch": 0.03465132997843278, + "kl_loss": 7.682253837585449, + "loss_ib": 0.39283522963523865, + "step": 241 + }, + { + "ce_ib": 25.544435501098633, + "ce_orig": 0.531434953212738, + "epoch": 0.03479511143062545, + "kl_loss": 9.930435180664062, + "loss_ib": 0.5092939734458923, + "step": 242 + }, + { + "ce_ib": 26.459491729736328, + "ce_orig": 1.5432560443878174, + "epoch": 0.034938892882818116, + "kl_loss": 10.38063907623291, + "loss_ib": 0.5322617292404175, + "step": 243 + }, + { + "ce_ib": 22.71752166748047, + "ce_orig": 0.8145064115524292, + "epoch": 0.035082674335010784, + "kl_loss": 9.922914505004883, + "loss_ib": 0.5075044631958008, + "step": 244 + }, + { + "epoch": 0.03522645578720345, + "grad_norm": 3.806140184402466, + "learning_rate": 1.8769968051118212e-05, + "loss": 1.4145, + "step": 245 + }, + { + "ce_ib": 26.927804946899414, + "ce_orig": 0.44644680619239807, + "epoch": 0.03522645578720345, + "kl_loss": 9.268022537231445, + "loss_ib": 0.4768650233745575, + "step": 245 + }, + { + "ce_ib": 25.422399520874023, + "ce_orig": 1.1885180473327637, + "epoch": 0.03537023723939612, + "kl_loss": 8.077836990356445, + "loss_ib": 0.41660305857658386, + "step": 246 + }, + { + "ce_ib": 25.16580581665039, + "ce_orig": 0.6401370763778687, + "epoch": 0.03551401869158879, + "kl_loss": 8.176619529724121, + "loss_ib": 0.4214138984680176, + "step": 247 + }, + { + "ce_ib": 21.501310348510742, + "ce_orig": 0.7412813901901245, + "epoch": 0.035657800143781455, + "kl_loss": 10.056595802307129, + "loss_ib": 0.5135805010795593, + "step": 248 + }, + { + "ce_ib": 24.469106674194336, + "ce_orig": 0.9428795576095581, + "epoch": 0.035801581595974116, + "kl_loss": 8.948450088500977, + "loss_ib": 0.45965704321861267, + "step": 249 + }, + { + "epoch": 0.035945363048166784, + "grad_norm": 2.816962957382202, + "learning_rate": 1.9169329073482428e-05, + "loss": 1.397, + "step": 250 + }, + { + "ce_ib": 22.195499420166016, + "ce_orig": 0.7263768911361694, + "epoch": 0.035945363048166784, + "kl_loss": 8.741401672363281, + "loss_ib": 0.4481678605079651, + "step": 250 + }, + { + "ce_ib": 22.70380973815918, + "ce_orig": 0.6979135274887085, + "epoch": 0.03608914450035945, + "kl_loss": 9.159427642822266, + "loss_ib": 0.4693233072757721, + "step": 251 + }, + { + "ce_ib": 24.72095489501953, + "ce_orig": 0.8336602449417114, + "epoch": 0.03623292595255212, + "kl_loss": 8.704825401306152, + "loss_ib": 0.4476017653942108, + "step": 252 + }, + { + "ce_ib": 25.478559494018555, + "ce_orig": 1.2140023708343506, + "epoch": 0.03637670740474479, + "kl_loss": 8.463350296020508, + "loss_ib": 0.4359067976474762, + "step": 253 + }, + { + "ce_ib": 24.30400276184082, + "ce_orig": 1.159781813621521, + "epoch": 0.036520488856937455, + "kl_loss": 9.485654830932617, + "loss_ib": 0.4864347577095032, + "step": 254 + }, + { + "epoch": 0.03666427030913012, + "grad_norm": 2.2414402961730957, + "learning_rate": 1.9568690095846644e-05, + "loss": 1.421, + "step": 255 + }, + { + "ce_ib": 22.23937225341797, + "ce_orig": 0.5963650941848755, + "epoch": 0.03666427030913012, + "kl_loss": 8.78721809387207, + "loss_ib": 0.4504806101322174, + "step": 255 + }, + { + "ce_ib": 23.723373413085938, + "ce_orig": 1.1367262601852417, + "epoch": 0.03680805176132279, + "kl_loss": 8.774513244628906, + "loss_ib": 0.45058736205101013, + "step": 256 + }, + { + "ce_ib": 25.06109619140625, + "ce_orig": 1.4180657863616943, + "epoch": 0.03695183321351546, + "kl_loss": 8.770709991455078, + "loss_ib": 0.4510660767555237, + "step": 257 + }, + { + "ce_ib": 23.2429141998291, + "ce_orig": 1.0652941465377808, + "epoch": 0.037095614665708126, + "kl_loss": 8.671271324157715, + "loss_ib": 0.4451850354671478, + "step": 258 + }, + { + "ce_ib": 21.69077491760254, + "ce_orig": 0.3725070655345917, + "epoch": 0.03723939611790079, + "kl_loss": 8.793878555297852, + "loss_ib": 0.45053932070732117, + "step": 259 + }, + { + "epoch": 0.037383177570093455, + "grad_norm": 3.798487663269043, + "learning_rate": 1.9968051118210863e-05, + "loss": 1.4654, + "step": 260 + }, + { + "ce_ib": 23.512039184570312, + "ce_orig": 1.0731854438781738, + "epoch": 0.037383177570093455, + "kl_loss": 8.859106063842773, + "loss_ib": 0.45471134781837463, + "step": 260 + }, + { + "ce_ib": 22.52284049987793, + "ce_orig": 1.0415581464767456, + "epoch": 0.03752695902228612, + "kl_loss": 8.765377044677734, + "loss_ib": 0.4495302736759186, + "step": 261 + }, + { + "ce_ib": 24.735340118408203, + "ce_orig": 1.2017862796783447, + "epoch": 0.03767074047447879, + "kl_loss": 8.242500305175781, + "loss_ib": 0.4244926869869232, + "step": 262 + }, + { + "ce_ib": 21.546173095703125, + "ce_orig": 1.224229097366333, + "epoch": 0.03781452192667146, + "kl_loss": 8.425148963928223, + "loss_ib": 0.4320305287837982, + "step": 263 + }, + { + "ce_ib": 21.48952293395996, + "ce_orig": 0.8686205148696899, + "epoch": 0.037958303378864126, + "kl_loss": 8.037482261657715, + "loss_ib": 0.41261887550354004, + "step": 264 + }, + { + "epoch": 0.038102084831056794, + "grad_norm": 3.353695869445801, + "learning_rate": 2.0367412140575082e-05, + "loss": 1.396, + "step": 265 + }, + { + "ce_ib": 21.23736000061035, + "ce_orig": 1.0968440771102905, + "epoch": 0.038102084831056794, + "kl_loss": 8.250904083251953, + "loss_ib": 0.42316389083862305, + "step": 265 + }, + { + "ce_ib": 23.215932846069336, + "ce_orig": 1.1477916240692139, + "epoch": 0.03824586628324946, + "kl_loss": 8.169689178466797, + "loss_ib": 0.4200924336910248, + "step": 266 + }, + { + "ce_ib": 23.65303611755371, + "ce_orig": 0.9717550277709961, + "epoch": 0.03838964773544213, + "kl_loss": 7.602072715759277, + "loss_ib": 0.39193016290664673, + "step": 267 + }, + { + "ce_ib": 20.33776092529297, + "ce_orig": 0.7842280864715576, + "epoch": 0.0385334291876348, + "kl_loss": 7.547301769256592, + "loss_ib": 0.3875339925289154, + "step": 268 + }, + { + "ce_ib": 21.66446304321289, + "ce_orig": 0.9328132271766663, + "epoch": 0.038677210639827465, + "kl_loss": 7.6735310554504395, + "loss_ib": 0.39450880885124207, + "step": 269 + }, + { + "epoch": 0.038820992092020126, + "grad_norm": 3.106287717819214, + "learning_rate": 2.07667731629393e-05, + "loss": 1.361, + "step": 270 + }, + { + "ce_ib": 14.545405387878418, + "ce_orig": 0.4456147849559784, + "epoch": 0.038820992092020126, + "kl_loss": 6.566807746887207, + "loss_ib": 0.33561310172080994, + "step": 270 + }, + { + "ce_ib": 22.359256744384766, + "ce_orig": 1.0915645360946655, + "epoch": 0.038964773544212794, + "kl_loss": 7.441349029541016, + "loss_ib": 0.38324710726737976, + "step": 271 + }, + { + "ce_ib": 19.833614349365234, + "ce_orig": 0.6568068861961365, + "epoch": 0.03910855499640546, + "kl_loss": 7.507279396057129, + "loss_ib": 0.3852807879447937, + "step": 272 + }, + { + "ce_ib": 18.543352127075195, + "ce_orig": 0.6701676845550537, + "epoch": 0.03925233644859813, + "kl_loss": 7.031156539916992, + "loss_ib": 0.36082950234413147, + "step": 273 + }, + { + "ce_ib": 19.24417495727539, + "ce_orig": 0.9314641952514648, + "epoch": 0.0393961179007908, + "kl_loss": 7.164027214050293, + "loss_ib": 0.36782345175743103, + "step": 274 + }, + { + "epoch": 0.039539899352983465, + "grad_norm": 3.4644718170166016, + "learning_rate": 2.1166134185303514e-05, + "loss": 1.2343, + "step": 275 + }, + { + "ce_ib": 20.395038604736328, + "ce_orig": 0.9956320524215698, + "epoch": 0.039539899352983465, + "kl_loss": 7.304188251495361, + "loss_ib": 0.375406950712204, + "step": 275 + }, + { + "ce_ib": 22.580080032348633, + "ce_orig": 1.0877642631530762, + "epoch": 0.03968368080517613, + "kl_loss": 6.740540504455566, + "loss_ib": 0.34831708669662476, + "step": 276 + }, + { + "ce_ib": 17.001928329467773, + "ce_orig": 0.6867518424987793, + "epoch": 0.0398274622573688, + "kl_loss": 6.005027770996094, + "loss_ib": 0.3087523579597473, + "step": 277 + }, + { + "ce_ib": 20.226699829101562, + "ce_orig": 0.5907849669456482, + "epoch": 0.03997124370956147, + "kl_loss": 6.040300369262695, + "loss_ib": 0.3121283948421478, + "step": 278 + }, + { + "ce_ib": 20.84942626953125, + "ce_orig": 0.8668643832206726, + "epoch": 0.040115025161754136, + "kl_loss": 5.51943302154541, + "loss_ib": 0.2863963544368744, + "step": 279 + }, + { + "epoch": 0.040258806613946804, + "grad_norm": 2.836003541946411, + "learning_rate": 2.1565495207667734e-05, + "loss": 1.2083, + "step": 280 + }, + { + "ce_ib": 21.198511123657227, + "ce_orig": 1.0262176990509033, + "epoch": 0.040258806613946804, + "kl_loss": 5.828641414642334, + "loss_ib": 0.30203133821487427, + "step": 280 + }, + { + "ce_ib": 20.3875675201416, + "ce_orig": 1.1043504476547241, + "epoch": 0.040402588066139465, + "kl_loss": 5.128015041351318, + "loss_ib": 0.2665945291519165, + "step": 281 + }, + { + "ce_ib": 21.915267944335938, + "ce_orig": 0.9482531547546387, + "epoch": 0.04054636951833213, + "kl_loss": 4.9252400398254395, + "loss_ib": 0.2572196424007416, + "step": 282 + }, + { + "ce_ib": 20.219453811645508, + "ce_orig": 0.9774989485740662, + "epoch": 0.0406901509705248, + "kl_loss": 4.067303657531738, + "loss_ib": 0.21347491443157196, + "step": 283 + }, + { + "ce_ib": 20.512344360351562, + "ce_orig": 0.8953350186347961, + "epoch": 0.04083393242271747, + "kl_loss": 2.980595588684082, + "loss_ib": 0.15928594768047333, + "step": 284 + }, + { + "epoch": 0.040977713874910136, + "grad_norm": 3.126970052719116, + "learning_rate": 2.196485623003195e-05, + "loss": 1.0977, + "step": 285 + }, + { + "ce_ib": 22.42814826965332, + "ce_orig": 0.8264601230621338, + "epoch": 0.040977713874910136, + "kl_loss": 2.260897397994995, + "loss_ib": 0.12425895035266876, + "step": 285 + }, + { + "ce_ib": 19.28723907470703, + "ce_orig": 1.0294511318206787, + "epoch": 0.041121495327102804, + "kl_loss": 1.4155142307281494, + "loss_ib": 0.08041933178901672, + "step": 286 + }, + { + "ce_ib": 22.32074546813965, + "ce_orig": 1.2082335948944092, + "epoch": 0.04126527677929547, + "kl_loss": 1.4922515153884888, + "loss_ib": 0.08577295392751694, + "step": 287 + }, + { + "ce_ib": 21.85085105895996, + "ce_orig": 1.1711450815200806, + "epoch": 0.04140905823148814, + "kl_loss": 1.0158560276031494, + "loss_ib": 0.06171822547912598, + "step": 288 + }, + { + "ce_ib": 19.448490142822266, + "ce_orig": 0.8083485960960388, + "epoch": 0.04155283968368081, + "kl_loss": 0.8125163316726685, + "loss_ib": 0.050350066274404526, + "step": 289 + }, + { + "epoch": 0.041696621135873475, + "grad_norm": 0.41338738799095154, + "learning_rate": 2.2364217252396165e-05, + "loss": 0.9838, + "step": 290 + }, + { + "ce_ib": 19.226776123046875, + "ce_orig": 1.1908280849456787, + "epoch": 0.041696621135873475, + "kl_loss": 0.7504492998123169, + "loss_ib": 0.0471358560025692, + "step": 290 + }, + { + "ce_ib": 19.87446403503418, + "ce_orig": 1.02711021900177, + "epoch": 0.041840402588066136, + "kl_loss": 0.6533533930778503, + "loss_ib": 0.04260490462183952, + "step": 291 + }, + { + "ce_ib": 17.87228775024414, + "ce_orig": 1.0570042133331299, + "epoch": 0.041984184040258804, + "kl_loss": 0.5579368472099304, + "loss_ib": 0.036832984536886215, + "step": 292 + }, + { + "ce_ib": 16.936511993408203, + "ce_orig": 0.5302789807319641, + "epoch": 0.04212796549245147, + "kl_loss": 0.6725109219551086, + "loss_ib": 0.042093802243471146, + "step": 293 + }, + { + "ce_ib": 18.851577758789062, + "ce_orig": 1.0163416862487793, + "epoch": 0.04227174694464414, + "kl_loss": 0.5098384618759155, + "loss_ib": 0.03491771221160889, + "step": 294 + }, + { + "epoch": 0.04241552839683681, + "grad_norm": 0.21297426521778107, + "learning_rate": 2.2763578274760385e-05, + "loss": 0.9126, + "step": 295 + }, + { + "ce_ib": 15.781968116760254, + "ce_orig": 0.5711618661880493, + "epoch": 0.04241552839683681, + "kl_loss": 0.4373496174812317, + "loss_ib": 0.02975846640765667, + "step": 295 + }, + { + "ce_ib": 22.636154174804688, + "ce_orig": 1.496230959892273, + "epoch": 0.042559309849029475, + "kl_loss": 0.4643814265727997, + "loss_ib": 0.03453714773058891, + "step": 296 + }, + { + "ce_ib": 14.683899879455566, + "ce_orig": 0.6033921241760254, + "epoch": 0.04270309130122214, + "kl_loss": 0.39723098278045654, + "loss_ib": 0.027203500270843506, + "step": 297 + }, + { + "ce_ib": 17.57473373413086, + "ce_orig": 0.7110596895217896, + "epoch": 0.04284687275341481, + "kl_loss": 0.4147634506225586, + "loss_ib": 0.02952553890645504, + "step": 298 + }, + { + "ce_ib": 16.251811981201172, + "ce_orig": 0.7895355820655823, + "epoch": 0.04299065420560748, + "kl_loss": 0.4082551896572113, + "loss_ib": 0.028538664802908897, + "step": 299 + }, + { + "epoch": 0.043134435657800146, + "grad_norm": 0.1568593829870224, + "learning_rate": 2.3162939297124604e-05, + "loss": 0.9485, + "step": 300 + }, + { + "ce_ib": 18.58441162109375, + "ce_orig": 1.089181900024414, + "epoch": 0.043134435657800146, + "kl_loss": 0.4467368721961975, + "loss_ib": 0.0316290520131588, + "step": 300 + }, + { + "ce_ib": 18.67475700378418, + "ce_orig": 0.566021740436554, + "epoch": 0.043278217109992814, + "kl_loss": 0.3963052034378052, + "loss_ib": 0.029152637347579002, + "step": 301 + }, + { + "ce_ib": 16.824068069458008, + "ce_orig": 0.7251248359680176, + "epoch": 0.043421998562185475, + "kl_loss": 0.34888386726379395, + "loss_ib": 0.025856226682662964, + "step": 302 + }, + { + "ce_ib": 18.8211669921875, + "ce_orig": 0.6579341888427734, + "epoch": 0.04356578001437814, + "kl_loss": 0.3751975893974304, + "loss_ib": 0.02817046456038952, + "step": 303 + }, + { + "ce_ib": 18.44203758239746, + "ce_orig": 0.8343374729156494, + "epoch": 0.04370956146657081, + "kl_loss": 0.3514009118080139, + "loss_ib": 0.02679106593132019, + "step": 304 + }, + { + "epoch": 0.04385334291876348, + "grad_norm": 0.15141652524471283, + "learning_rate": 2.356230031948882e-05, + "loss": 0.9599, + "step": 305 + }, + { + "ce_ib": 16.4177188873291, + "ce_orig": 0.6326029896736145, + "epoch": 0.04385334291876348, + "kl_loss": 0.331454873085022, + "loss_ib": 0.02478160336613655, + "step": 305 + }, + { + "ce_ib": 21.196409225463867, + "ce_orig": 1.230805516242981, + "epoch": 0.043997124370956146, + "kl_loss": 0.40000325441360474, + "loss_ib": 0.030598366633057594, + "step": 306 + }, + { + "ce_ib": 17.367687225341797, + "ce_orig": 0.863945484161377, + "epoch": 0.044140905823148814, + "kl_loss": 0.31744185090065, + "loss_ib": 0.024555936455726624, + "step": 307 + }, + { + "ce_ib": 18.942859649658203, + "ce_orig": 1.148938536643982, + "epoch": 0.04428468727534148, + "kl_loss": 0.3378192186355591, + "loss_ib": 0.02636238932609558, + "step": 308 + }, + { + "ce_ib": 19.47002410888672, + "ce_orig": 1.0959943532943726, + "epoch": 0.04442846872753415, + "kl_loss": 0.31226494908332825, + "loss_ib": 0.025348259136080742, + "step": 309 + }, + { + "epoch": 0.04457225017972682, + "grad_norm": 0.13533012568950653, + "learning_rate": 2.3961661341853036e-05, + "loss": 1.015, + "step": 310 + }, + { + "ce_ib": 17.681161880493164, + "ce_orig": 0.9902611374855042, + "epoch": 0.04457225017972682, + "kl_loss": 0.26686644554138184, + "loss_ib": 0.02218390442430973, + "step": 310 + }, + { + "ce_ib": 16.987899780273438, + "ce_orig": 0.6236902475357056, + "epoch": 0.044716031631919485, + "kl_loss": 0.33637407422065735, + "loss_ib": 0.025312652811408043, + "step": 311 + }, + { + "ce_ib": 18.65533447265625, + "ce_orig": 0.8152080774307251, + "epoch": 0.044859813084112146, + "kl_loss": 0.35383230447769165, + "loss_ib": 0.027019282802939415, + "step": 312 + }, + { + "ce_ib": 20.103788375854492, + "ce_orig": 0.8615632653236389, + "epoch": 0.045003594536304814, + "kl_loss": 0.2777783274650574, + "loss_ib": 0.023940810933709145, + "step": 313 + }, + { + "ce_ib": 16.64844512939453, + "ce_orig": 0.9147619605064392, + "epoch": 0.04514737598849748, + "kl_loss": 0.27109894156455994, + "loss_ib": 0.0218791700899601, + "step": 314 + }, + { + "epoch": 0.04529115744069015, + "grad_norm": 0.11459668725728989, + "learning_rate": 2.4361022364217255e-05, + "loss": 0.8602, + "step": 315 + }, + { + "ce_ib": 11.063411712646484, + "ce_orig": 0.27872321009635925, + "epoch": 0.04529115744069015, + "kl_loss": 0.3863391876220703, + "loss_ib": 0.024848666042089462, + "step": 315 + }, + { + "ce_ib": 17.38253402709961, + "ce_orig": 1.05631422996521, + "epoch": 0.04543493889288282, + "kl_loss": 0.29607954621315, + "loss_ib": 0.023495245724916458, + "step": 316 + }, + { + "ce_ib": 17.782546997070312, + "ce_orig": 0.9817179441452026, + "epoch": 0.045578720345075485, + "kl_loss": 0.28195664286613464, + "loss_ib": 0.02298910729587078, + "step": 317 + }, + { + "ce_ib": 17.297069549560547, + "ce_orig": 0.6919381618499756, + "epoch": 0.04572250179726815, + "kl_loss": 0.29801255464553833, + "loss_ib": 0.023549163714051247, + "step": 318 + }, + { + "ce_ib": 17.82324981689453, + "ce_orig": 1.2199736833572388, + "epoch": 0.04586628324946082, + "kl_loss": 0.2285030335187912, + "loss_ib": 0.020336776971817017, + "step": 319 + }, + { + "epoch": 0.04601006470165349, + "grad_norm": 0.10239739716053009, + "learning_rate": 2.476038338658147e-05, + "loss": 0.7986, + "step": 320 + }, + { + "ce_ib": 16.177907943725586, + "ce_orig": 0.7963648438453674, + "epoch": 0.04601006470165349, + "kl_loss": 0.2211613953113556, + "loss_ib": 0.019147023558616638, + "step": 320 + }, + { + "ce_ib": 17.704599380493164, + "ce_orig": 0.9023761749267578, + "epoch": 0.046153846153846156, + "kl_loss": 0.28145700693130493, + "loss_ib": 0.022925151512026787, + "step": 321 + }, + { + "ce_ib": 17.584495544433594, + "ce_orig": 0.8088329434394836, + "epoch": 0.046297627606038824, + "kl_loss": 0.2326435148715973, + "loss_ib": 0.02042442373931408, + "step": 322 + }, + { + "ce_ib": 16.041574478149414, + "ce_orig": 0.7201854586601257, + "epoch": 0.046441409058231485, + "kl_loss": 0.3453258275985718, + "loss_ib": 0.02528708055615425, + "step": 323 + }, + { + "ce_ib": 17.880464553833008, + "ce_orig": 0.917682409286499, + "epoch": 0.04658519051042415, + "kl_loss": 0.2218429148197174, + "loss_ib": 0.020032377913594246, + "step": 324 + }, + { + "epoch": 0.04672897196261682, + "grad_norm": 0.12532667815685272, + "learning_rate": 2.515974440894569e-05, + "loss": 0.8682, + "step": 325 + }, + { + "ce_ib": 15.092937469482422, + "ce_orig": 0.7272103428840637, + "epoch": 0.04672897196261682, + "kl_loss": 0.31795835494995117, + "loss_ib": 0.023444388061761856, + "step": 325 + }, + { + "ce_ib": 15.842549324035645, + "ce_orig": 0.9047788977622986, + "epoch": 0.04687275341480949, + "kl_loss": 0.2592480480670929, + "loss_ib": 0.020883677527308464, + "step": 326 + }, + { + "ce_ib": 14.095027923583984, + "ce_orig": 0.7625716328620911, + "epoch": 0.047016534867002156, + "kl_loss": 0.17521969974040985, + "loss_ib": 0.015808498486876488, + "step": 327 + }, + { + "ce_ib": 16.17887306213379, + "ce_orig": 0.7734029293060303, + "epoch": 0.047160316319194824, + "kl_loss": 0.23375201225280762, + "loss_ib": 0.01977703720331192, + "step": 328 + }, + { + "ce_ib": 18.957979202270508, + "ce_orig": 0.8559271097183228, + "epoch": 0.04730409777138749, + "kl_loss": 0.33970198035240173, + "loss_ib": 0.026464087888598442, + "step": 329 + }, + { + "epoch": 0.04744787922358016, + "grad_norm": 0.09043259918689728, + "learning_rate": 2.5559105431309903e-05, + "loss": 0.7934, + "step": 330 + }, + { + "ce_ib": 14.730635643005371, + "ce_orig": 0.8875608444213867, + "epoch": 0.04744787922358016, + "kl_loss": 0.2284042239189148, + "loss_ib": 0.01878552883863449, + "step": 330 + }, + { + "ce_ib": 12.517528533935547, + "ce_orig": 0.6206594705581665, + "epoch": 0.04759166067577283, + "kl_loss": 0.21303001046180725, + "loss_ib": 0.016910264268517494, + "step": 331 + }, + { + "ce_ib": 14.753087043762207, + "ce_orig": 0.841992199420929, + "epoch": 0.047735442127965495, + "kl_loss": 0.20966459810733795, + "loss_ib": 0.017859773710370064, + "step": 332 + }, + { + "ce_ib": 14.960477828979492, + "ce_orig": 1.0839353799819946, + "epoch": 0.047879223580158156, + "kl_loss": 0.15192289650440216, + "loss_ib": 0.01507638394832611, + "step": 333 + }, + { + "ce_ib": 15.1857328414917, + "ce_orig": 0.9817880392074585, + "epoch": 0.048023005032350824, + "kl_loss": 0.19470617175102234, + "loss_ib": 0.017328176647424698, + "step": 334 + }, + { + "epoch": 0.04816678648454349, + "grad_norm": 0.10444720834493637, + "learning_rate": 2.5958466453674125e-05, + "loss": 0.8294, + "step": 335 + }, + { + "ce_ib": 15.169656753540039, + "ce_orig": 0.8038283586502075, + "epoch": 0.04816678648454349, + "kl_loss": 0.2266630232334137, + "loss_ib": 0.018917979672551155, + "step": 335 + }, + { + "ce_ib": 14.643211364746094, + "ce_orig": 0.7942169904708862, + "epoch": 0.04831056793673616, + "kl_loss": 0.22511643171310425, + "loss_ib": 0.01857742667198181, + "step": 336 + }, + { + "ce_ib": 12.96651554107666, + "ce_orig": 0.8289546370506287, + "epoch": 0.04845434938892883, + "kl_loss": 0.17975889146327972, + "loss_ib": 0.015471202321350574, + "step": 337 + }, + { + "ce_ib": 16.585371017456055, + "ce_orig": 1.165753722190857, + "epoch": 0.048598130841121495, + "kl_loss": 0.190724715590477, + "loss_ib": 0.017828920856118202, + "step": 338 + }, + { + "ce_ib": 14.718335151672363, + "ce_orig": 0.8502370119094849, + "epoch": 0.04874191229331416, + "kl_loss": 0.20640771090984344, + "loss_ib": 0.017679553478956223, + "step": 339 + }, + { + "epoch": 0.04888569374550683, + "grad_norm": 0.09888631105422974, + "learning_rate": 2.6357827476038338e-05, + "loss": 0.8718, + "step": 340 + }, + { + "ce_ib": 13.714351654052734, + "ce_orig": 1.0317654609680176, + "epoch": 0.04888569374550683, + "kl_loss": 0.19221842288970947, + "loss_ib": 0.016468096524477005, + "step": 340 + }, + { + "ce_ib": 15.859162330627441, + "ce_orig": 0.6472983956336975, + "epoch": 0.0490294751976995, + "kl_loss": 0.2313256412744522, + "loss_ib": 0.01949586346745491, + "step": 341 + }, + { + "ce_ib": 16.512859344482422, + "ce_orig": 1.1302889585494995, + "epoch": 0.049173256649892166, + "kl_loss": 0.1906917691230774, + "loss_ib": 0.0177910178899765, + "step": 342 + }, + { + "ce_ib": 16.816452026367188, + "ce_orig": 0.8666338920593262, + "epoch": 0.049317038102084834, + "kl_loss": 0.17613860964775085, + "loss_ib": 0.01721515692770481, + "step": 343 + }, + { + "ce_ib": 13.920699119567871, + "ce_orig": 0.719912588596344, + "epoch": 0.049460819554277495, + "kl_loss": 0.18724943697452545, + "loss_ib": 0.016322821378707886, + "step": 344 + }, + { + "epoch": 0.04960460100647016, + "grad_norm": 0.10115125775337219, + "learning_rate": 2.6757188498402557e-05, + "loss": 0.843, + "step": 345 + }, + { + "ce_ib": 16.091005325317383, + "ce_orig": 1.0146100521087646, + "epoch": 0.04960460100647016, + "kl_loss": 0.1512732356786728, + "loss_ib": 0.015609164722263813, + "step": 345 + }, + { + "ce_ib": 15.5990571975708, + "ce_orig": 0.7898549437522888, + "epoch": 0.04974838245866283, + "kl_loss": 0.25755369663238525, + "loss_ib": 0.020677214488387108, + "step": 346 + }, + { + "ce_ib": 17.706438064575195, + "ce_orig": 1.2053872346878052, + "epoch": 0.0498921639108555, + "kl_loss": 0.21856242418289185, + "loss_ib": 0.019781339913606644, + "step": 347 + }, + { + "ce_ib": 10.220382690429688, + "ce_orig": 0.5169559121131897, + "epoch": 0.050035945363048166, + "kl_loss": 0.2967785596847534, + "loss_ib": 0.01994911953806877, + "step": 348 + }, + { + "ce_ib": 14.920300483703613, + "ce_orig": 0.7747300267219543, + "epoch": 0.050179726815240834, + "kl_loss": 0.13559795916080475, + "loss_ib": 0.01424004789441824, + "step": 349 + }, + { + "epoch": 0.0503235082674335, + "grad_norm": 0.09113696962594986, + "learning_rate": 2.7156549520766773e-05, + "loss": 0.898, + "step": 350 + }, + { + "ce_ib": 16.00478172302246, + "ce_orig": 1.2635383605957031, + "epoch": 0.0503235082674335, + "kl_loss": 0.2005615234375, + "loss_ib": 0.018030468374490738, + "step": 350 + }, + { + "ce_ib": 16.192142486572266, + "ce_orig": 0.9619611501693726, + "epoch": 0.05046728971962617, + "kl_loss": 0.25098395347595215, + "loss_ib": 0.020645270124077797, + "step": 351 + }, + { + "ce_ib": 13.817422866821289, + "ce_orig": 0.6711569428443909, + "epoch": 0.05061107117181884, + "kl_loss": 0.21178191900253296, + "loss_ib": 0.01749780774116516, + "step": 352 + }, + { + "ce_ib": 15.532769203186035, + "ce_orig": 0.5863211750984192, + "epoch": 0.050754852624011505, + "kl_loss": 0.33604636788368225, + "loss_ib": 0.024568704888224602, + "step": 353 + }, + { + "ce_ib": 14.479625701904297, + "ce_orig": 1.0277538299560547, + "epoch": 0.05089863407620417, + "kl_loss": 0.251324862241745, + "loss_ib": 0.01980605535209179, + "step": 354 + }, + { + "epoch": 0.051042415528396834, + "grad_norm": 0.10617897659540176, + "learning_rate": 2.7555910543130992e-05, + "loss": 0.8813, + "step": 355 + }, + { + "ce_ib": 16.734331130981445, + "ce_orig": 0.9177318215370178, + "epoch": 0.051042415528396834, + "kl_loss": 0.2959950864315033, + "loss_ib": 0.023166919127106667, + "step": 355 + }, + { + "ce_ib": 14.452986717224121, + "ce_orig": 0.8091175556182861, + "epoch": 0.0511861969805895, + "kl_loss": 0.19851821660995483, + "loss_ib": 0.01715240441262722, + "step": 356 + }, + { + "ce_ib": 15.050978660583496, + "ce_orig": 1.065936803817749, + "epoch": 0.05132997843278217, + "kl_loss": 0.1562933474779129, + "loss_ib": 0.015340156853199005, + "step": 357 + }, + { + "ce_ib": 16.42909812927246, + "ce_orig": 0.6200724840164185, + "epoch": 0.05147375988497484, + "kl_loss": 0.22564369440078735, + "loss_ib": 0.0194967333227396, + "step": 358 + }, + { + "ce_ib": 15.036846160888672, + "ce_orig": 0.8413035273551941, + "epoch": 0.051617541337167505, + "kl_loss": 0.15089106559753418, + "loss_ib": 0.015062975697219372, + "step": 359 + }, + { + "epoch": 0.05176132278936017, + "grad_norm": 0.0994093269109726, + "learning_rate": 2.7955271565495212e-05, + "loss": 0.8787, + "step": 360 + }, + { + "ce_ib": 16.430448532104492, + "ce_orig": 1.058258295059204, + "epoch": 0.05176132278936017, + "kl_loss": 0.19877898693084717, + "loss_ib": 0.018154174089431763, + "step": 360 + }, + { + "ce_ib": 16.932239532470703, + "ce_orig": 1.2644236087799072, + "epoch": 0.05190510424155284, + "kl_loss": 0.19584302604198456, + "loss_ib": 0.018258271738886833, + "step": 361 + }, + { + "ce_ib": 14.099466323852539, + "ce_orig": 0.9995023608207703, + "epoch": 0.05204888569374551, + "kl_loss": 0.16139906644821167, + "loss_ib": 0.015119686722755432, + "step": 362 + }, + { + "ce_ib": 15.650708198547363, + "ce_orig": 0.9404743313789368, + "epoch": 0.052192667145938176, + "kl_loss": 0.21341916918754578, + "loss_ib": 0.018496312201023102, + "step": 363 + }, + { + "ce_ib": 16.308624267578125, + "ce_orig": 1.1696351766586304, + "epoch": 0.052336448598130844, + "kl_loss": 0.18094685673713684, + "loss_ib": 0.017201654613018036, + "step": 364 + }, + { + "epoch": 0.052480230050323505, + "grad_norm": 0.08824347704648972, + "learning_rate": 2.8354632587859424e-05, + "loss": 0.92, + "step": 365 + }, + { + "ce_ib": 16.583515167236328, + "ce_orig": 1.182395577430725, + "epoch": 0.052480230050323505, + "kl_loss": 0.19043317437171936, + "loss_ib": 0.017813416197896004, + "step": 365 + }, + { + "ce_ib": 13.93136978149414, + "ce_orig": 0.8936623930931091, + "epoch": 0.05262401150251617, + "kl_loss": 0.20374388992786407, + "loss_ib": 0.017152879387140274, + "step": 366 + }, + { + "ce_ib": 13.256582260131836, + "ce_orig": 0.6269909143447876, + "epoch": 0.05276779295470884, + "kl_loss": 0.16010135412216187, + "loss_ib": 0.014633359387516975, + "step": 367 + }, + { + "ce_ib": 12.879414558410645, + "ce_orig": 0.6315767765045166, + "epoch": 0.05291157440690151, + "kl_loss": 0.16557064652442932, + "loss_ib": 0.014718241058290005, + "step": 368 + }, + { + "ce_ib": 15.146659851074219, + "ce_orig": 1.0897746086120605, + "epoch": 0.053055355859094176, + "kl_loss": 0.16495399177074432, + "loss_ib": 0.015821030363440514, + "step": 369 + }, + { + "epoch": 0.053199137311286844, + "grad_norm": 0.09635983407497406, + "learning_rate": 2.8753993610223644e-05, + "loss": 0.933, + "step": 370 + }, + { + "ce_ib": 14.545082092285156, + "ce_orig": 0.903797447681427, + "epoch": 0.053199137311286844, + "kl_loss": 0.12444749474525452, + "loss_ib": 0.013494915328919888, + "step": 370 + }, + { + "ce_ib": 12.932695388793945, + "ce_orig": 0.6494324207305908, + "epoch": 0.05334291876347951, + "kl_loss": 0.1315731406211853, + "loss_ib": 0.013045004568994045, + "step": 371 + }, + { + "ce_ib": 15.617569923400879, + "ce_orig": 0.7816907167434692, + "epoch": 0.05348670021567218, + "kl_loss": 0.16312208771705627, + "loss_ib": 0.015964889898896217, + "step": 372 + }, + { + "ce_ib": 15.400293350219727, + "ce_orig": 1.033065915107727, + "epoch": 0.05363048166786485, + "kl_loss": 0.17930516600608826, + "loss_ib": 0.01666540466248989, + "step": 373 + }, + { + "ce_ib": 11.445207595825195, + "ce_orig": 0.5388709902763367, + "epoch": 0.053774263120057515, + "kl_loss": 0.2616915702819824, + "loss_ib": 0.0188071820884943, + "step": 374 + }, + { + "epoch": 0.05391804457225018, + "grad_norm": 0.0901573896408081, + "learning_rate": 2.915335463258786e-05, + "loss": 0.926, + "step": 375 + }, + { + "ce_ib": 13.34332275390625, + "ce_orig": 1.1417230367660522, + "epoch": 0.05391804457225018, + "kl_loss": 0.1469050794839859, + "loss_ib": 0.014016914181411266, + "step": 375 + }, + { + "ce_ib": 11.211030006408691, + "ce_orig": 0.6393249034881592, + "epoch": 0.054061826024442844, + "kl_loss": 0.13886746764183044, + "loss_ib": 0.012548888102173805, + "step": 376 + }, + { + "ce_ib": 15.887382507324219, + "ce_orig": 0.9176316261291504, + "epoch": 0.05420560747663551, + "kl_loss": 0.2912940979003906, + "loss_ib": 0.022508395835757256, + "step": 377 + }, + { + "ce_ib": 11.42358112335205, + "ce_orig": 0.8122538924217224, + "epoch": 0.05434938892882818, + "kl_loss": 0.1490350216627121, + "loss_ib": 0.013163541443645954, + "step": 378 + }, + { + "ce_ib": 14.985864639282227, + "ce_orig": 0.9277105927467346, + "epoch": 0.05449317038102085, + "kl_loss": 0.14583294093608856, + "loss_ib": 0.014784579165279865, + "step": 379 + }, + { + "epoch": 0.054636951833213515, + "grad_norm": 0.12827961146831512, + "learning_rate": 2.955271565495208e-05, + "loss": 0.9204, + "step": 380 + }, + { + "ce_ib": 13.780610084533691, + "ce_orig": 0.8345714807510376, + "epoch": 0.054636951833213515, + "kl_loss": 0.1690724641084671, + "loss_ib": 0.015343928709626198, + "step": 380 + }, + { + "ce_ib": 11.765593528747559, + "ce_orig": 0.7576747536659241, + "epoch": 0.05478073328540618, + "kl_loss": 0.15963752567768097, + "loss_ib": 0.013864672742784023, + "step": 381 + }, + { + "ce_ib": 12.893147468566895, + "ce_orig": 1.0378029346466064, + "epoch": 0.05492451473759885, + "kl_loss": 0.13528262078762054, + "loss_ib": 0.013210705481469631, + "step": 382 + }, + { + "ce_ib": 15.782855987548828, + "ce_orig": 0.8279376029968262, + "epoch": 0.05506829618979152, + "kl_loss": 0.16392827033996582, + "loss_ib": 0.016087843105196953, + "step": 383 + }, + { + "ce_ib": 12.443214416503906, + "ce_orig": 0.8166038990020752, + "epoch": 0.055212077641984186, + "kl_loss": 0.14554069936275482, + "loss_ib": 0.013498641550540924, + "step": 384 + }, + { + "epoch": 0.055355859094176854, + "grad_norm": 0.13148367404937744, + "learning_rate": 2.9952076677316295e-05, + "loss": 0.8973, + "step": 385 + }, + { + "ce_ib": 16.574996948242188, + "ce_orig": 1.3873276710510254, + "epoch": 0.055355859094176854, + "kl_loss": 0.206925630569458, + "loss_ib": 0.018633781000971794, + "step": 385 + }, + { + "ce_ib": 14.954483032226562, + "ce_orig": 1.4074153900146484, + "epoch": 0.055499640546369515, + "kl_loss": 0.1725064069032669, + "loss_ib": 0.016102561727166176, + "step": 386 + }, + { + "ce_ib": 13.222760200500488, + "ce_orig": 0.581721842288971, + "epoch": 0.05564342199856218, + "kl_loss": 0.3106327950954437, + "loss_ib": 0.02214301936328411, + "step": 387 + }, + { + "ce_ib": 12.130496978759766, + "ce_orig": 0.8030052185058594, + "epoch": 0.05578720345075485, + "kl_loss": 0.19192326068878174, + "loss_ib": 0.015661410987377167, + "step": 388 + }, + { + "ce_ib": 12.304028511047363, + "ce_orig": 0.838097095489502, + "epoch": 0.05593098490294752, + "kl_loss": 0.17265933752059937, + "loss_ib": 0.0147849814966321, + "step": 389 + }, + { + "epoch": 0.056074766355140186, + "grad_norm": 0.1129549965262413, + "learning_rate": 3.0351437699680514e-05, + "loss": 0.9147, + "step": 390 + }, + { + "ce_ib": 12.584757804870605, + "ce_orig": 0.6829859018325806, + "epoch": 0.056074766355140186, + "kl_loss": 0.15448611974716187, + "loss_ib": 0.014016685076057911, + "step": 390 + }, + { + "ce_ib": 14.888505935668945, + "ce_orig": 0.8695336580276489, + "epoch": 0.056218547807332854, + "kl_loss": 0.18638572096824646, + "loss_ib": 0.016763538122177124, + "step": 391 + }, + { + "ce_ib": 13.158818244934082, + "ce_orig": 0.724577009677887, + "epoch": 0.05636232925952552, + "kl_loss": 0.15236912667751312, + "loss_ib": 0.01419786550104618, + "step": 392 + }, + { + "ce_ib": 14.405329704284668, + "ce_orig": 0.4904825985431671, + "epoch": 0.05650611071171819, + "kl_loss": 0.21818403899669647, + "loss_ib": 0.01811186783015728, + "step": 393 + }, + { + "ce_ib": 10.370551109313965, + "ce_orig": 0.6885640621185303, + "epoch": 0.05664989216391086, + "kl_loss": 0.18041831254959106, + "loss_ib": 0.014206192456185818, + "step": 394 + }, + { + "epoch": 0.056793673616103525, + "grad_norm": 0.09922255575656891, + "learning_rate": 3.075079872204473e-05, + "loss": 0.8916, + "step": 395 + }, + { + "ce_ib": 15.597278594970703, + "ce_orig": 1.2381712198257446, + "epoch": 0.056793673616103525, + "kl_loss": 0.1798793077468872, + "loss_ib": 0.016792604699730873, + "step": 395 + }, + { + "ce_ib": 16.811328887939453, + "ce_orig": 1.2628995180130005, + "epoch": 0.05693745506829619, + "kl_loss": 0.171632319688797, + "loss_ib": 0.016987280920147896, + "step": 396 + }, + { + "ce_ib": 14.330126762390137, + "ce_orig": 0.842546284198761, + "epoch": 0.057081236520488854, + "kl_loss": 0.17398859560489655, + "loss_ib": 0.01586449332535267, + "step": 397 + }, + { + "ce_ib": 13.21159553527832, + "ce_orig": 0.8423411250114441, + "epoch": 0.05722501797268152, + "kl_loss": 0.13025188446044922, + "loss_ib": 0.013118392787873745, + "step": 398 + }, + { + "ce_ib": 12.565587043762207, + "ce_orig": 0.7189036011695862, + "epoch": 0.05736879942487419, + "kl_loss": 0.14162641763687134, + "loss_ib": 0.013364115729928017, + "step": 399 + }, + { + "epoch": 0.05751258087706686, + "grad_norm": 0.0966155007481575, + "learning_rate": 3.115015974440895e-05, + "loss": 0.9267, + "step": 400 + }, + { + "ce_ib": 11.058226585388184, + "ce_orig": 0.9342263340950012, + "epoch": 0.05751258087706686, + "kl_loss": 0.11544251441955566, + "loss_ib": 0.011301239021122456, + "step": 400 + }, + { + "ce_ib": 14.200401306152344, + "ce_orig": 1.0684270858764648, + "epoch": 0.057656362329259525, + "kl_loss": 0.14682269096374512, + "loss_ib": 0.014441335573792458, + "step": 401 + }, + { + "ce_ib": 14.785656929016113, + "ce_orig": 1.1560802459716797, + "epoch": 0.05780014378145219, + "kl_loss": 0.1572328507900238, + "loss_ib": 0.015254470519721508, + "step": 402 + }, + { + "ce_ib": 16.529001235961914, + "ce_orig": 1.4409286975860596, + "epoch": 0.05794392523364486, + "kl_loss": 0.1712377667427063, + "loss_ib": 0.016826389357447624, + "step": 403 + }, + { + "ce_ib": 8.062843322753906, + "ce_orig": 0.4845752716064453, + "epoch": 0.05808770668583753, + "kl_loss": 0.114130899310112, + "loss_ib": 0.009737967513501644, + "step": 404 + }, + { + "epoch": 0.058231488138030196, + "grad_norm": 0.10344849526882172, + "learning_rate": 3.154952076677317e-05, + "loss": 0.9143, + "step": 405 + }, + { + "ce_ib": 14.247758865356445, + "ce_orig": 0.8494449853897095, + "epoch": 0.058231488138030196, + "kl_loss": 0.14982560276985168, + "loss_ib": 0.014615160413086414, + "step": 405 + }, + { + "ce_ib": 15.090539932250977, + "ce_orig": 1.255419135093689, + "epoch": 0.058375269590222864, + "kl_loss": 0.2056526243686676, + "loss_ib": 0.017827901989221573, + "step": 406 + }, + { + "ce_ib": 13.584562301635742, + "ce_orig": 1.0737160444259644, + "epoch": 0.058519051042415525, + "kl_loss": 0.16253307461738586, + "loss_ib": 0.014918935485184193, + "step": 407 + }, + { + "ce_ib": 10.844743728637695, + "ce_orig": 0.7258655428886414, + "epoch": 0.05866283249460819, + "kl_loss": 0.2175343632698059, + "loss_ib": 0.016299089416861534, + "step": 408 + }, + { + "ce_ib": 9.838624000549316, + "ce_orig": 0.5341205596923828, + "epoch": 0.05880661394680086, + "kl_loss": 0.24159343540668488, + "loss_ib": 0.016998983919620514, + "step": 409 + }, + { + "epoch": 0.05895039539899353, + "grad_norm": 0.0828595831990242, + "learning_rate": 3.194888178913738e-05, + "loss": 0.8595, + "step": 410 + }, + { + "ce_ib": 11.394186019897461, + "ce_orig": 0.6941292881965637, + "epoch": 0.05895039539899353, + "kl_loss": 0.13403424620628357, + "loss_ib": 0.012398804537951946, + "step": 410 + }, + { + "ce_ib": 13.786474227905273, + "ce_orig": 1.0304478406906128, + "epoch": 0.059094176851186196, + "kl_loss": 0.24797815084457397, + "loss_ib": 0.01929214410483837, + "step": 411 + }, + { + "ce_ib": 12.280767440795898, + "ce_orig": 0.9082537889480591, + "epoch": 0.059237958303378864, + "kl_loss": 0.09719130396842957, + "loss_ib": 0.010999949648976326, + "step": 412 + }, + { + "ce_ib": 11.551681518554688, + "ce_orig": 0.9754782915115356, + "epoch": 0.05938173975557153, + "kl_loss": 0.1405172348022461, + "loss_ib": 0.0128017021343112, + "step": 413 + }, + { + "ce_ib": 13.329681396484375, + "ce_orig": 0.9015910625457764, + "epoch": 0.0595255212077642, + "kl_loss": 0.15253770351409912, + "loss_ib": 0.014291726052761078, + "step": 414 + }, + { + "epoch": 0.05966930265995687, + "grad_norm": 0.09882552921772003, + "learning_rate": 3.23482428115016e-05, + "loss": 0.8577, + "step": 415 + }, + { + "ce_ib": 12.369913101196289, + "ce_orig": 0.8101427555084229, + "epoch": 0.05966930265995687, + "kl_loss": 0.2113226056098938, + "loss_ib": 0.016751086339354515, + "step": 415 + }, + { + "ce_ib": 14.39426040649414, + "ce_orig": 1.3613587617874146, + "epoch": 0.059813084112149535, + "kl_loss": 0.1314837634563446, + "loss_ib": 0.013771317899227142, + "step": 416 + }, + { + "ce_ib": 12.081097602844238, + "ce_orig": 0.4347302317619324, + "epoch": 0.0599568655643422, + "kl_loss": 0.2995225489139557, + "loss_ib": 0.021016675978899002, + "step": 417 + }, + { + "ce_ib": 15.218514442443848, + "ce_orig": 1.2289142608642578, + "epoch": 0.060100647016534864, + "kl_loss": 0.26874852180480957, + "loss_ib": 0.021046683192253113, + "step": 418 + }, + { + "ce_ib": 13.258194923400879, + "ce_orig": 1.0039843320846558, + "epoch": 0.06024442846872753, + "kl_loss": 0.12897028028964996, + "loss_ib": 0.013077611103653908, + "step": 419 + }, + { + "epoch": 0.0603882099209202, + "grad_norm": 0.1349947154521942, + "learning_rate": 3.274760383386581e-05, + "loss": 0.899, + "step": 420 + }, + { + "ce_ib": 12.171891212463379, + "ce_orig": 0.9144300222396851, + "epoch": 0.0603882099209202, + "kl_loss": 0.21492531895637512, + "loss_ib": 0.01683221198618412, + "step": 420 + }, + { + "ce_ib": 10.995501518249512, + "ce_orig": 0.699188768863678, + "epoch": 0.06053199137311287, + "kl_loss": 0.12099233269691467, + "loss_ib": 0.011547367088496685, + "step": 421 + }, + { + "ce_ib": 12.991347312927246, + "ce_orig": 0.9281318187713623, + "epoch": 0.060675772825305535, + "kl_loss": 0.1598033308982849, + "loss_ib": 0.014485838823020458, + "step": 422 + }, + { + "ce_ib": 10.423280715942383, + "ce_orig": 0.9821050763130188, + "epoch": 0.0608195542774982, + "kl_loss": 0.10947795957326889, + "loss_ib": 0.010685537941753864, + "step": 423 + }, + { + "ce_ib": 11.12364387512207, + "ce_orig": 0.7817228436470032, + "epoch": 0.06096333572969087, + "kl_loss": 0.17594116926193237, + "loss_ib": 0.014358880929648876, + "step": 424 + }, + { + "epoch": 0.06110711718188354, + "grad_norm": 0.0929858386516571, + "learning_rate": 3.314696485623003e-05, + "loss": 0.7994, + "step": 425 + }, + { + "ce_ib": 12.660994529724121, + "ce_orig": 0.9210802912712097, + "epoch": 0.06110711718188354, + "kl_loss": 0.15979456901550293, + "loss_ib": 0.014320224523544312, + "step": 425 + }, + { + "ce_ib": 15.475061416625977, + "ce_orig": 1.6302592754364014, + "epoch": 0.061250898634076206, + "kl_loss": 0.1844199150800705, + "loss_ib": 0.01695852540433407, + "step": 426 + }, + { + "ce_ib": 9.331029891967773, + "ce_orig": 0.5564351081848145, + "epoch": 0.061394680086268874, + "kl_loss": 0.1388329267501831, + "loss_ib": 0.011607161723077297, + "step": 427 + }, + { + "ce_ib": 12.874106407165527, + "ce_orig": 0.9861687421798706, + "epoch": 0.06153846153846154, + "kl_loss": 0.12479162216186523, + "loss_ib": 0.012676633894443512, + "step": 428 + }, + { + "ce_ib": 7.756659507751465, + "ce_orig": 0.28384384512901306, + "epoch": 0.0616822429906542, + "kl_loss": 0.2630873918533325, + "loss_ib": 0.017032699659466743, + "step": 429 + }, + { + "epoch": 0.06182602444284687, + "grad_norm": 0.09535211324691772, + "learning_rate": 3.354632587859425e-05, + "loss": 0.9043, + "step": 430 + }, + { + "ce_ib": 10.622055053710938, + "ce_orig": 0.6629616022109985, + "epoch": 0.06182602444284687, + "kl_loss": 0.13496457040309906, + "loss_ib": 0.012059256434440613, + "step": 430 + }, + { + "ce_ib": 11.811662673950195, + "ce_orig": 0.7327677011489868, + "epoch": 0.06196980589503954, + "kl_loss": 0.14013449847698212, + "loss_ib": 0.012912556529045105, + "step": 431 + }, + { + "ce_ib": 8.620430946350098, + "ce_orig": 0.7203670144081116, + "epoch": 0.062113587347232206, + "kl_loss": 0.11925005167722702, + "loss_ib": 0.010272718034684658, + "step": 432 + }, + { + "ce_ib": 14.616909980773926, + "ce_orig": 1.4517083168029785, + "epoch": 0.062257368799424874, + "kl_loss": 0.15538114309310913, + "loss_ib": 0.01507751177996397, + "step": 433 + }, + { + "ce_ib": 9.763717651367188, + "ce_orig": 0.6260893940925598, + "epoch": 0.06240115025161754, + "kl_loss": 0.13390487432479858, + "loss_ib": 0.011577102355659008, + "step": 434 + }, + { + "epoch": 0.0625449317038102, + "grad_norm": 0.11180251836776733, + "learning_rate": 3.394568690095847e-05, + "loss": 0.9071, + "step": 435 + }, + { + "ce_ib": 12.881009101867676, + "ce_orig": 0.5546009540557861, + "epoch": 0.0625449317038102, + "kl_loss": 0.13927477598190308, + "loss_ib": 0.013404244557023048, + "step": 435 + }, + { + "ce_ib": 12.253645896911621, + "ce_orig": 0.7509746551513672, + "epoch": 0.06268871315600287, + "kl_loss": 0.16948378086090088, + "loss_ib": 0.014601011760532856, + "step": 436 + }, + { + "ce_ib": 13.29328441619873, + "ce_orig": 0.9583929777145386, + "epoch": 0.06283249460819554, + "kl_loss": 0.15764841437339783, + "loss_ib": 0.014529063366353512, + "step": 437 + }, + { + "ce_ib": 12.615095138549805, + "ce_orig": 1.1630975008010864, + "epoch": 0.0629762760603882, + "kl_loss": 0.12097503244876862, + "loss_ib": 0.012356298975646496, + "step": 438 + }, + { + "ce_ib": 11.95744514465332, + "ce_orig": 0.734953761100769, + "epoch": 0.06312005751258087, + "kl_loss": 0.13797758519649506, + "loss_ib": 0.012877601198852062, + "step": 439 + }, + { + "epoch": 0.06326383896477354, + "grad_norm": 0.09555409848690033, + "learning_rate": 3.434504792332269e-05, + "loss": 0.8284, + "step": 440 + }, + { + "ce_ib": 17.494842529296875, + "ce_orig": 1.2540117502212524, + "epoch": 0.06326383896477354, + "kl_loss": 0.20184318721294403, + "loss_ib": 0.018839580938220024, + "step": 440 + }, + { + "ce_ib": 13.520644187927246, + "ce_orig": 1.173345923423767, + "epoch": 0.06340762041696621, + "kl_loss": 0.224016010761261, + "loss_ib": 0.01796112395823002, + "step": 441 + }, + { + "ce_ib": 6.444005489349365, + "ce_orig": 0.33801600337028503, + "epoch": 0.06355140186915888, + "kl_loss": 0.24252083897590637, + "loss_ib": 0.015348044224083424, + "step": 442 + }, + { + "ce_ib": 11.194876670837402, + "ce_orig": 0.8596982359886169, + "epoch": 0.06369518332135155, + "kl_loss": 0.1475781947374344, + "loss_ib": 0.012976348400115967, + "step": 443 + }, + { + "ce_ib": 14.788161277770996, + "ce_orig": 0.9179244041442871, + "epoch": 0.06383896477354421, + "kl_loss": 0.23837195336818695, + "loss_ib": 0.019312677904963493, + "step": 444 + }, + { + "epoch": 0.06398274622573688, + "grad_norm": 0.10049393773078918, + "learning_rate": 3.47444089456869e-05, + "loss": 0.915, + "step": 445 + }, + { + "ce_ib": 12.728458404541016, + "ce_orig": 0.6808370351791382, + "epoch": 0.06398274622573688, + "kl_loss": 0.16207855939865112, + "loss_ib": 0.014468157663941383, + "step": 445 + }, + { + "ce_ib": 9.595919609069824, + "ce_orig": 0.48967745900154114, + "epoch": 0.06412652767792955, + "kl_loss": 0.12488089501857758, + "loss_ib": 0.011042005382478237, + "step": 446 + }, + { + "ce_ib": 15.164140701293945, + "ce_orig": 0.8277769684791565, + "epoch": 0.06427030913012222, + "kl_loss": 0.1784917414188385, + "loss_ib": 0.016506657004356384, + "step": 447 + }, + { + "ce_ib": 11.631290435791016, + "ce_orig": 0.786353588104248, + "epoch": 0.06441409058231488, + "kl_loss": 0.13490960001945496, + "loss_ib": 0.012561124749481678, + "step": 448 + }, + { + "ce_ib": 11.316841125488281, + "ce_orig": 0.6659090518951416, + "epoch": 0.06455787203450755, + "kl_loss": 0.1083206981420517, + "loss_ib": 0.011074455454945564, + "step": 449 + }, + { + "epoch": 0.06470165348670022, + "grad_norm": 0.10170278698205948, + "learning_rate": 3.514376996805112e-05, + "loss": 0.8419, + "step": 450 + }, + { + "ce_ib": 9.49474048614502, + "ce_orig": 0.7802785038948059, + "epoch": 0.06470165348670022, + "kl_loss": 0.1756356954574585, + "loss_ib": 0.013529154472053051, + "step": 450 + }, + { + "ce_ib": 11.215967178344727, + "ce_orig": 0.4214544892311096, + "epoch": 0.06484543493889289, + "kl_loss": 0.12542136013507843, + "loss_ib": 0.01187905203551054, + "step": 451 + }, + { + "ce_ib": 6.312502861022949, + "ce_orig": 0.3134852945804596, + "epoch": 0.06498921639108556, + "kl_loss": 0.2386016845703125, + "loss_ib": 0.015086335130035877, + "step": 452 + }, + { + "ce_ib": 13.910443305969238, + "ce_orig": 0.7964897155761719, + "epoch": 0.06513299784327822, + "kl_loss": 0.19260820746421814, + "loss_ib": 0.016585631296038628, + "step": 453 + }, + { + "ce_ib": 10.858504295349121, + "ce_orig": 0.8178758025169373, + "epoch": 0.06527677929547089, + "kl_loss": 0.1271795630455017, + "loss_ib": 0.011788229458034039, + "step": 454 + }, + { + "epoch": 0.06542056074766354, + "grad_norm": 0.1267521232366562, + "learning_rate": 3.5543130990415334e-05, + "loss": 0.8513, + "step": 455 + }, + { + "ce_ib": 11.637347221374512, + "ce_orig": 1.0193455219268799, + "epoch": 0.06542056074766354, + "kl_loss": 0.144621342420578, + "loss_ib": 0.01304974127560854, + "step": 455 + }, + { + "ce_ib": 10.80041217803955, + "ce_orig": 0.6328637599945068, + "epoch": 0.06556434219985621, + "kl_loss": 0.10495860129594803, + "loss_ib": 0.010648136027157307, + "step": 456 + }, + { + "ce_ib": 7.313602447509766, + "ce_orig": 0.42815887928009033, + "epoch": 0.06570812365204888, + "kl_loss": 0.17510683834552765, + "loss_ib": 0.01241214293986559, + "step": 457 + }, + { + "ce_ib": 13.05362606048584, + "ce_orig": 1.0646302700042725, + "epoch": 0.06585190510424155, + "kl_loss": 0.14800792932510376, + "loss_ib": 0.013927209191024303, + "step": 458 + }, + { + "ce_ib": 8.705698013305664, + "ce_orig": 0.5751362442970276, + "epoch": 0.06599568655643422, + "kl_loss": 0.19291532039642334, + "loss_ib": 0.013998615555465221, + "step": 459 + }, + { + "epoch": 0.06613946800862688, + "grad_norm": 0.14026452600955963, + "learning_rate": 3.5942492012779554e-05, + "loss": 0.8978, + "step": 460 + }, + { + "ce_ib": 12.629561424255371, + "ce_orig": 1.248939871788025, + "epoch": 0.06613946800862688, + "kl_loss": 0.1431877613067627, + "loss_ib": 0.013474169187247753, + "step": 460 + }, + { + "ce_ib": 13.466840744018555, + "ce_orig": 1.1314830780029297, + "epoch": 0.06628324946081955, + "kl_loss": 0.11893537640571594, + "loss_ib": 0.012680189684033394, + "step": 461 + }, + { + "ce_ib": 12.272945404052734, + "ce_orig": 0.5334405303001404, + "epoch": 0.06642703091301222, + "kl_loss": 0.19608467817306519, + "loss_ib": 0.015940707176923752, + "step": 462 + }, + { + "ce_ib": 11.584327697753906, + "ce_orig": 0.5882666707038879, + "epoch": 0.06657081236520489, + "kl_loss": 0.15428690612316132, + "loss_ib": 0.013506509363651276, + "step": 463 + }, + { + "ce_ib": 10.483445167541504, + "ce_orig": 0.5081559419631958, + "epoch": 0.06671459381739756, + "kl_loss": 0.23190432786941528, + "loss_ib": 0.01683693937957287, + "step": 464 + }, + { + "epoch": 0.06685837526959022, + "grad_norm": 0.11186351627111435, + "learning_rate": 3.6341853035143766e-05, + "loss": 0.977, + "step": 465 + }, + { + "ce_ib": 10.818644523620605, + "ce_orig": 0.8423200249671936, + "epoch": 0.06685837526959022, + "kl_loss": 0.12322719395160675, + "loss_ib": 0.011570681817829609, + "step": 465 + }, + { + "ce_ib": 13.477171897888184, + "ce_orig": 1.135223627090454, + "epoch": 0.06700215672178289, + "kl_loss": 0.1358594447374344, + "loss_ib": 0.013531558215618134, + "step": 466 + }, + { + "ce_ib": 12.029156684875488, + "ce_orig": 0.925537645816803, + "epoch": 0.06714593817397556, + "kl_loss": 0.1674036681652069, + "loss_ib": 0.014384761452674866, + "step": 467 + }, + { + "ce_ib": 8.591270446777344, + "ce_orig": 0.6351872086524963, + "epoch": 0.06728971962616823, + "kl_loss": 0.16200634837150574, + "loss_ib": 0.012395952828228474, + "step": 468 + }, + { + "ce_ib": 12.34648609161377, + "ce_orig": 0.8252216577529907, + "epoch": 0.0674335010783609, + "kl_loss": 0.1306806206703186, + "loss_ib": 0.012707273475825787, + "step": 469 + }, + { + "epoch": 0.06757728253055356, + "grad_norm": 0.11462409794330597, + "learning_rate": 3.6741214057507985e-05, + "loss": 0.8112, + "step": 470 + }, + { + "ce_ib": 12.145110130310059, + "ce_orig": 0.7569481730461121, + "epoch": 0.06757728253055356, + "kl_loss": 0.1333114206790924, + "loss_ib": 0.012738126330077648, + "step": 470 + }, + { + "ce_ib": 10.791728019714355, + "ce_orig": 0.8886812329292297, + "epoch": 0.06772106398274623, + "kl_loss": 0.14122334122657776, + "loss_ib": 0.012457030825316906, + "step": 471 + }, + { + "ce_ib": 11.75979232788086, + "ce_orig": 0.93720543384552, + "epoch": 0.0678648454349389, + "kl_loss": 0.08433859050273895, + "loss_ib": 0.010096825659275055, + "step": 472 + }, + { + "ce_ib": 7.816238880157471, + "ce_orig": 0.5898436903953552, + "epoch": 0.06800862688713157, + "kl_loss": 0.26155394315719604, + "loss_ib": 0.0169858168810606, + "step": 473 + }, + { + "ce_ib": 12.64213752746582, + "ce_orig": 1.2193433046340942, + "epoch": 0.06815240833932423, + "kl_loss": 0.1382063627243042, + "loss_ib": 0.013231388293206692, + "step": 474 + }, + { + "epoch": 0.0682961897915169, + "grad_norm": 0.12322834134101868, + "learning_rate": 3.714057507987221e-05, + "loss": 0.942, + "step": 475 + }, + { + "ce_ib": 12.208565711975098, + "ce_orig": 0.6283319592475891, + "epoch": 0.0682961897915169, + "kl_loss": 0.1437569409608841, + "loss_ib": 0.013292129151523113, + "step": 475 + }, + { + "ce_ib": 10.480301856994629, + "ce_orig": 0.6875295042991638, + "epoch": 0.06843997124370955, + "kl_loss": 0.171269491314888, + "loss_ib": 0.013803625479340553, + "step": 476 + }, + { + "ce_ib": 12.140584945678711, + "ce_orig": 0.686497151851654, + "epoch": 0.06858375269590222, + "kl_loss": 0.14714768528938293, + "loss_ib": 0.013427676633000374, + "step": 477 + }, + { + "ce_ib": 13.12353515625, + "ce_orig": 1.330522060394287, + "epoch": 0.06872753414809489, + "kl_loss": 0.18113256990909576, + "loss_ib": 0.015618395991623402, + "step": 478 + }, + { + "ce_ib": 12.710488319396973, + "ce_orig": 1.4100775718688965, + "epoch": 0.06887131560028756, + "kl_loss": 0.12010614573955536, + "loss_ib": 0.01236055139452219, + "step": 479 + }, + { + "epoch": 0.06901509705248023, + "grad_norm": 0.11526408791542053, + "learning_rate": 3.7539936102236424e-05, + "loss": 0.8998, + "step": 480 + }, + { + "ce_ib": 13.879955291748047, + "ce_orig": 1.1341381072998047, + "epoch": 0.06901509705248023, + "kl_loss": 0.1282053291797638, + "loss_ib": 0.013350243680179119, + "step": 480 + }, + { + "ce_ib": 8.764423370361328, + "ce_orig": 0.6200535893440247, + "epoch": 0.0691588785046729, + "kl_loss": 0.1320233792066574, + "loss_ib": 0.010983380489051342, + "step": 481 + }, + { + "ce_ib": 9.943157196044922, + "ce_orig": 0.6673835515975952, + "epoch": 0.06930265995686556, + "kl_loss": 0.10792216658592224, + "loss_ib": 0.010367686860263348, + "step": 482 + }, + { + "ce_ib": 10.92377758026123, + "ce_orig": 0.7028371095657349, + "epoch": 0.06944644140905823, + "kl_loss": 0.15012815594673157, + "loss_ib": 0.012968296185135841, + "step": 483 + }, + { + "ce_ib": 9.512238502502441, + "ce_orig": 0.5816277265548706, + "epoch": 0.0695902228612509, + "kl_loss": 0.13318368792533875, + "loss_ib": 0.011415303684771061, + "step": 484 + }, + { + "epoch": 0.06973400431344356, + "grad_norm": 0.12581641972064972, + "learning_rate": 3.793929712460064e-05, + "loss": 0.8585, + "step": 485 + }, + { + "ce_ib": 13.113508224487305, + "ce_orig": 1.0122153759002686, + "epoch": 0.06973400431344356, + "kl_loss": 0.17498700320720673, + "loss_ib": 0.01530610304325819, + "step": 485 + }, + { + "ce_ib": 11.20240592956543, + "ce_orig": 0.8718814253807068, + "epoch": 0.06987778576563623, + "kl_loss": 0.12174628674983978, + "loss_ib": 0.011688517406582832, + "step": 486 + }, + { + "ce_ib": 12.784674644470215, + "ce_orig": 0.8871896266937256, + "epoch": 0.0700215672178289, + "kl_loss": 0.16703587770462036, + "loss_ib": 0.014744131825864315, + "step": 487 + }, + { + "ce_ib": 10.580418586730957, + "ce_orig": 0.8577698469161987, + "epoch": 0.07016534867002157, + "kl_loss": 0.11169049143791199, + "loss_ib": 0.010874733328819275, + "step": 488 + }, + { + "ce_ib": 10.39923095703125, + "ce_orig": 0.6622049808502197, + "epoch": 0.07030913012221424, + "kl_loss": 0.17256517708301544, + "loss_ib": 0.013827874325215816, + "step": 489 + }, + { + "epoch": 0.0704529115744069, + "grad_norm": 0.11361895501613617, + "learning_rate": 3.8338658146964856e-05, + "loss": 0.901, + "step": 490 + }, + { + "ce_ib": 13.566216468811035, + "ce_orig": 0.9003996849060059, + "epoch": 0.0704529115744069, + "kl_loss": 0.18744704127311707, + "loss_ib": 0.016155460849404335, + "step": 490 + }, + { + "ce_ib": 11.603694915771484, + "ce_orig": 1.0972646474838257, + "epoch": 0.07059669302659957, + "kl_loss": 0.09514350444078445, + "loss_ib": 0.010559022426605225, + "step": 491 + }, + { + "ce_ib": 12.866926193237305, + "ce_orig": 1.1191866397857666, + "epoch": 0.07074047447879224, + "kl_loss": 0.12916871905326843, + "loss_ib": 0.012891898863017559, + "step": 492 + }, + { + "ce_ib": 11.685700416564941, + "ce_orig": 1.0439685583114624, + "epoch": 0.07088425593098491, + "kl_loss": 0.13916221261024475, + "loss_ib": 0.012800960801541805, + "step": 493 + }, + { + "ce_ib": 8.240974426269531, + "ce_orig": 0.6664552092552185, + "epoch": 0.07102803738317758, + "kl_loss": 0.0913599506020546, + "loss_ib": 0.008688484318554401, + "step": 494 + }, + { + "epoch": 0.07117181883537024, + "grad_norm": 0.11460109055042267, + "learning_rate": 3.8738019169329075e-05, + "loss": 0.8749, + "step": 495 + }, + { + "ce_ib": 15.161900520324707, + "ce_orig": 1.5418399572372437, + "epoch": 0.07117181883537024, + "kl_loss": 0.14499804377555847, + "loss_ib": 0.014830851927399635, + "step": 495 + }, + { + "ce_ib": 12.499137878417969, + "ce_orig": 0.7715917229652405, + "epoch": 0.07131560028756291, + "kl_loss": 0.13650982081890106, + "loss_ib": 0.013075060211122036, + "step": 496 + }, + { + "ce_ib": 10.516082763671875, + "ce_orig": 0.810151219367981, + "epoch": 0.07145938173975556, + "kl_loss": 0.1658136546611786, + "loss_ib": 0.013548724353313446, + "step": 497 + }, + { + "ce_ib": 12.26677131652832, + "ce_orig": 0.4871862828731537, + "epoch": 0.07160316319194823, + "kl_loss": 0.15727362036705017, + "loss_ib": 0.013997065834701061, + "step": 498 + }, + { + "ce_ib": 8.879415512084961, + "ce_orig": 0.5100986957550049, + "epoch": 0.0717469446441409, + "kl_loss": 0.10066086053848267, + "loss_ib": 0.009472750127315521, + "step": 499 + }, + { + "epoch": 0.07189072609633357, + "grad_norm": 0.1443518102169037, + "learning_rate": 3.913738019169329e-05, + "loss": 0.8769, + "step": 500 + } + ], + "logging_steps": 5, + "max_steps": 20865, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}