diff --git "a/checkpoint-1500/trainer_state.json" "b/checkpoint-1500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1500/trainer_state.json" @@ -0,0 +1,14134 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.21567217828900073, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "ce_ib": 64.96094512939453, + "ce_orig": 0.7435811758041382, + "epoch": 0, + "kl_loss": 3856.220703125, + "loss_ib": 192.84352111816406, + "step": 0 + }, + { + "ce_ib": 65.86748504638672, + "ce_orig": 1.261900782585144, + "epoch": 0.00014378145219266715, + "kl_loss": 3989.227294921875, + "loss_ib": 199.4943084716797, + "step": 1 + }, + { + "ce_ib": 63.70602798461914, + "ce_orig": 1.069283127784729, + "epoch": 0.0002875629043853343, + "kl_loss": 3018.861572265625, + "loss_ib": 150.9749298095703, + "step": 2 + }, + { + "ce_ib": 65.56930541992188, + "ce_orig": 0.9985544085502625, + "epoch": 0.00043134435657800146, + "kl_loss": 3440.156494140625, + "loss_ib": 172.04061889648438, + "step": 3 + }, + { + "ce_ib": 62.74740982055664, + "ce_orig": 0.7562570571899414, + "epoch": 0.0005751258087706686, + "kl_loss": 3999.942626953125, + "loss_ib": 200.02850341796875, + "step": 4 + }, + { + "epoch": 0.0007189072609633358, + "grad_norm": Infinity, + "learning_rate": 0.0, + "loss": 183.2429, + "step": 5 + }, + { + "ce_ib": 64.26500701904297, + "ce_orig": 0.704367995262146, + "epoch": 0.0007189072609633358, + "kl_loss": 3585.45947265625, + "loss_ib": 179.3050994873047, + "step": 5 + }, + { + "ce_ib": 64.85440826416016, + "ce_orig": 0.990234375, + "epoch": 0.0008626887131560029, + "kl_loss": 3754.7421875, + "loss_ib": 187.76953125, + "step": 6 + }, + { + "ce_ib": 64.96634674072266, + "ce_orig": 0.9086857438087463, + "epoch": 0.00100647016534867, + "kl_loss": 3929.58203125, + "loss_ib": 196.51158142089844, + "step": 7 + }, + { + "ce_ib": 64.07415771484375, + "ce_orig": 0.7150144577026367, + "epoch": 0.0011502516175413372, + "kl_loss": 3740.4150390625, + "loss_ib": 187.0527801513672, + "step": 8 + }, + { + "ce_ib": 65.45182800292969, + "ce_orig": 0.8336902856826782, + "epoch": 0.0012940330697340044, + "kl_loss": 3728.460205078125, + "loss_ib": 186.4557342529297, + "step": 9 + }, + { + "epoch": 0.0014378145219266715, + "grad_norm": 2721.89794921875, + "learning_rate": 0.0, + "loss": 185.5215, + "step": 10 + }, + { + "ce_ib": 64.31118774414062, + "ce_orig": 0.773049533367157, + "epoch": 0.0014378145219266715, + "kl_loss": 3938.441650390625, + "loss_ib": 196.95423889160156, + "step": 10 + }, + { + "ce_ib": 68.13079071044922, + "ce_orig": 1.7455261945724487, + "epoch": 0.0015815959741193387, + "kl_loss": 3521.255615234375, + "loss_ib": 176.0968475341797, + "step": 11 + }, + { + "ce_ib": 65.10643005371094, + "ce_orig": 1.115777850151062, + "epoch": 0.0017253774263120058, + "kl_loss": 3872.32763671875, + "loss_ib": 193.64894104003906, + "step": 12 + }, + { + "ce_ib": 66.39643859863281, + "ce_orig": 0.9639286994934082, + "epoch": 0.001869158878504673, + "kl_loss": 3771.317626953125, + "loss_ib": 188.59909057617188, + "step": 13 + }, + { + "ce_ib": 63.847900390625, + "ce_orig": 0.6167153716087341, + "epoch": 0.00201294033069734, + "kl_loss": 3233.497314453125, + "loss_ib": 161.706787109375, + "step": 14 + }, + { + "epoch": 0.002156721782890007, + "grad_norm": 2457.46435546875, + "learning_rate": 3.9936102236421723e-07, + "loss": 181.2678, + "step": 15 + }, + { + "ce_ib": 64.9281997680664, + "ce_orig": 1.2498202323913574, + "epoch": 0.002156721782890007, + "kl_loss": 3831.1611328125, + "loss_ib": 191.59051513671875, + "step": 15 + }, + { + "ce_ib": 65.2757797241211, + "ce_orig": 1.0968470573425293, + "epoch": 0.0023005032350826744, + "kl_loss": 3685.48193359375, + "loss_ib": 184.30674743652344, + "step": 16 + }, + { + "ce_ib": 64.68568420410156, + "ce_orig": 0.8841207027435303, + "epoch": 0.0024442846872753414, + "kl_loss": 3744.38134765625, + "loss_ib": 187.2514190673828, + "step": 17 + }, + { + "ce_ib": 64.82100677490234, + "ce_orig": 1.1195234060287476, + "epoch": 0.0025880661394680087, + "kl_loss": 3526.25927734375, + "loss_ib": 176.3453826904297, + "step": 18 + }, + { + "ce_ib": 63.829200744628906, + "ce_orig": 0.9802423119544983, + "epoch": 0.0027318475916606757, + "kl_loss": 3915.511474609375, + "loss_ib": 195.80747985839844, + "step": 19 + }, + { + "epoch": 0.002875629043853343, + "grad_norm": 2735.018310546875, + "learning_rate": 7.987220447284345e-07, + "loss": 187.5199, + "step": 20 + }, + { + "ce_ib": 64.30339050292969, + "ce_orig": 0.713705837726593, + "epoch": 0.002875629043853343, + "kl_loss": 3551.751220703125, + "loss_ib": 177.61972045898438, + "step": 20 + }, + { + "ce_ib": 65.2120132446289, + "ce_orig": 1.1765546798706055, + "epoch": 0.00301941049604601, + "kl_loss": 2725.1201171875, + "loss_ib": 136.2886199951172, + "step": 21 + }, + { + "ce_ib": 64.87537384033203, + "ce_orig": 1.1634544134140015, + "epoch": 0.0031631919482386773, + "kl_loss": 3579.1513671875, + "loss_ib": 178.99000549316406, + "step": 22 + }, + { + "ce_ib": 63.88631820678711, + "ce_orig": 0.9461633563041687, + "epoch": 0.0033069734004313443, + "kl_loss": 3845.448974609375, + "loss_ib": 192.3043975830078, + "step": 23 + }, + { + "ce_ib": 64.70732116699219, + "ce_orig": 1.03489351272583, + "epoch": 0.0034507548526240116, + "kl_loss": 3712.78076171875, + "loss_ib": 185.67138671875, + "step": 24 + }, + { + "epoch": 0.0035945363048166786, + "grad_norm": 2554.739501953125, + "learning_rate": 1.1980830670926517e-06, + "loss": 181.5605, + "step": 25 + }, + { + "ce_ib": 60.482200622558594, + "ce_orig": 0.3934582769870758, + "epoch": 0.0035945363048166786, + "kl_loss": 3043.28076171875, + "loss_ib": 152.19427490234375, + "step": 25 + }, + { + "ce_ib": 63.36570739746094, + "ce_orig": 0.7366315126419067, + "epoch": 0.003738317757009346, + "kl_loss": 3766.66796875, + "loss_ib": 188.36508178710938, + "step": 26 + }, + { + "ce_ib": 64.73159790039062, + "ce_orig": 0.8274144530296326, + "epoch": 0.003882099209202013, + "kl_loss": 4114.853515625, + "loss_ib": 205.77505493164062, + "step": 27 + }, + { + "ce_ib": 64.48138427734375, + "ce_orig": 0.9502752423286438, + "epoch": 0.00402588066139468, + "kl_loss": 3778.4033203125, + "loss_ib": 188.95240783691406, + "step": 28 + }, + { + "ce_ib": 65.42862701416016, + "ce_orig": 1.2197273969650269, + "epoch": 0.004169662113587347, + "kl_loss": 3769.447509765625, + "loss_ib": 188.50509643554688, + "step": 29 + }, + { + "epoch": 0.004313443565780014, + "grad_norm": 2604.34765625, + "learning_rate": 1.597444089456869e-06, + "loss": 185.5906, + "step": 30 + }, + { + "ce_ib": 66.01839447021484, + "ce_orig": 1.2358959913253784, + "epoch": 0.004313443565780014, + "kl_loss": 3728.17578125, + "loss_ib": 186.44180297851562, + "step": 30 + }, + { + "ce_ib": 63.86208724975586, + "ce_orig": 0.9959704875946045, + "epoch": 0.004457225017972682, + "kl_loss": 4007.545654296875, + "loss_ib": 200.4092254638672, + "step": 31 + }, + { + "ce_ib": 63.890628814697266, + "ce_orig": 0.785792887210846, + "epoch": 0.004601006470165349, + "kl_loss": 3334.66552734375, + "loss_ib": 166.76522827148438, + "step": 32 + }, + { + "ce_ib": 63.21049118041992, + "ce_orig": 0.9940950274467468, + "epoch": 0.004744787922358016, + "kl_loss": 3633.696533203125, + "loss_ib": 181.7164306640625, + "step": 33 + }, + { + "ce_ib": 65.66783905029297, + "ce_orig": 1.3076696395874023, + "epoch": 0.004888569374550683, + "kl_loss": 3813.81396484375, + "loss_ib": 190.72354125976562, + "step": 34 + }, + { + "epoch": 0.0050323508267433505, + "grad_norm": 2616.953369140625, + "learning_rate": 1.9968051118210863e-06, + "loss": 187.4055, + "step": 35 + }, + { + "ce_ib": 63.93610763549805, + "ce_orig": 1.0693968534469604, + "epoch": 0.0050323508267433505, + "kl_loss": 2865.432373046875, + "loss_ib": 143.3035888671875, + "step": 35 + }, + { + "ce_ib": 63.83279037475586, + "ce_orig": 0.9967127442359924, + "epoch": 0.0051761322789360175, + "kl_loss": 3682.119384765625, + "loss_ib": 184.1378936767578, + "step": 36 + }, + { + "ce_ib": 63.78058624267578, + "ce_orig": 0.7548370957374573, + "epoch": 0.005319913731128684, + "kl_loss": 3521.068359375, + "loss_ib": 176.08531188964844, + "step": 37 + }, + { + "ce_ib": 63.56543731689453, + "ce_orig": 0.9782358407974243, + "epoch": 0.005463695183321351, + "kl_loss": 3663.02587890625, + "loss_ib": 183.18309020996094, + "step": 38 + }, + { + "ce_ib": 65.04608917236328, + "ce_orig": 0.8212652802467346, + "epoch": 0.005607476635514018, + "kl_loss": 4082.810546875, + "loss_ib": 204.1730499267578, + "step": 39 + }, + { + "epoch": 0.005751258087706686, + "grad_norm": 2647.227294921875, + "learning_rate": 2.3961661341853035e-06, + "loss": 181.9243, + "step": 40 + }, + { + "ce_ib": 65.51580047607422, + "ce_orig": 1.1804547309875488, + "epoch": 0.005751258087706686, + "kl_loss": 3949.86279296875, + "loss_ib": 197.52590942382812, + "step": 40 + }, + { + "ce_ib": 63.80126190185547, + "ce_orig": 0.7081286907196045, + "epoch": 0.005895039539899353, + "kl_loss": 3912.12548828125, + "loss_ib": 195.63816833496094, + "step": 41 + }, + { + "ce_ib": 62.75702667236328, + "ce_orig": 0.7087532877922058, + "epoch": 0.00603882099209202, + "kl_loss": 3891.41259765625, + "loss_ib": 194.60202026367188, + "step": 42 + }, + { + "ce_ib": 64.6529541015625, + "ce_orig": 1.2302581071853638, + "epoch": 0.006182602444284687, + "kl_loss": 3733.06787109375, + "loss_ib": 186.6857147216797, + "step": 43 + }, + { + "ce_ib": 63.076133728027344, + "ce_orig": 0.8998420238494873, + "epoch": 0.006326383896477355, + "kl_loss": 3209.076904296875, + "loss_ib": 160.4853973388672, + "step": 44 + }, + { + "epoch": 0.006470165348670022, + "grad_norm": 2475.28173828125, + "learning_rate": 2.7955271565495207e-06, + "loss": 182.2709, + "step": 45 + }, + { + "ce_ib": 61.6526985168457, + "ce_orig": 0.6503346562385559, + "epoch": 0.006470165348670022, + "kl_loss": 3745.544921875, + "loss_ib": 187.30807495117188, + "step": 45 + }, + { + "ce_ib": 63.45009231567383, + "ce_orig": 1.2053508758544922, + "epoch": 0.0066139468008626886, + "kl_loss": 3523.22900390625, + "loss_ib": 176.19317626953125, + "step": 46 + }, + { + "ce_ib": 64.10655212402344, + "ce_orig": 1.1375271081924438, + "epoch": 0.0067577282530553555, + "kl_loss": 3449.62353515625, + "loss_ib": 172.5132293701172, + "step": 47 + }, + { + "ce_ib": 61.43892288208008, + "ce_orig": 0.6051114201545715, + "epoch": 0.006901509705248023, + "kl_loss": 2918.55615234375, + "loss_ib": 145.9585418701172, + "step": 48 + }, + { + "ce_ib": 62.07788848876953, + "ce_orig": 0.6533306837081909, + "epoch": 0.00704529115744069, + "kl_loss": 3852.226318359375, + "loss_ib": 192.64236450195312, + "step": 49 + }, + { + "epoch": 0.007189072609633357, + "grad_norm": 2666.057861328125, + "learning_rate": 3.194888178913738e-06, + "loss": 178.7684, + "step": 50 + }, + { + "ce_ib": 63.88691329956055, + "ce_orig": 0.9801137447357178, + "epoch": 0.007189072609633357, + "kl_loss": 3421.68798828125, + "loss_ib": 171.11634826660156, + "step": 50 + }, + { + "ce_ib": 63.39102554321289, + "ce_orig": 1.208335041999817, + "epoch": 0.007332854061826024, + "kl_loss": 3816.6484375, + "loss_ib": 190.86412048339844, + "step": 51 + }, + { + "ce_ib": 62.79481506347656, + "ce_orig": 1.1687792539596558, + "epoch": 0.007476635514018692, + "kl_loss": 3562.60595703125, + "loss_ib": 178.1616973876953, + "step": 52 + }, + { + "ce_ib": 59.30780792236328, + "ce_orig": 0.48160627484321594, + "epoch": 0.007620416966211359, + "kl_loss": 2727.423828125, + "loss_ib": 136.40084838867188, + "step": 53 + }, + { + "ce_ib": 60.45918655395508, + "ce_orig": 0.729110598564148, + "epoch": 0.007764198418404026, + "kl_loss": 3612.10888671875, + "loss_ib": 180.6356658935547, + "step": 54 + }, + { + "epoch": 0.007907979870596693, + "grad_norm": 2656.139404296875, + "learning_rate": 3.5942492012779555e-06, + "loss": 179.8713, + "step": 55 + }, + { + "ce_ib": 62.10517883300781, + "ce_orig": 0.8718740344047546, + "epoch": 0.007907979870596693, + "kl_loss": 3526.4736328125, + "loss_ib": 176.354736328125, + "step": 55 + }, + { + "ce_ib": 61.65966796875, + "ce_orig": 0.8692609071731567, + "epoch": 0.00805176132278936, + "kl_loss": 3907.55126953125, + "loss_ib": 195.4084014892578, + "step": 56 + }, + { + "ce_ib": 62.87982940673828, + "ce_orig": 1.1107903718948364, + "epoch": 0.008195542774982027, + "kl_loss": 3505.02880859375, + "loss_ib": 175.2828826904297, + "step": 57 + }, + { + "ce_ib": 61.1724853515625, + "ce_orig": 0.7401551008224487, + "epoch": 0.008339324227174694, + "kl_loss": 3798.70947265625, + "loss_ib": 189.966064453125, + "step": 58 + }, + { + "ce_ib": 64.25030517578125, + "ce_orig": 1.37394380569458, + "epoch": 0.008483105679367362, + "kl_loss": 3642.968994140625, + "loss_ib": 182.1805877685547, + "step": 59 + }, + { + "epoch": 0.008626887131560028, + "grad_norm": 2610.953369140625, + "learning_rate": 3.993610223642173e-06, + "loss": 181.3146, + "step": 60 + }, + { + "ce_ib": 61.306488037109375, + "ce_orig": 0.8149375915527344, + "epoch": 0.008626887131560028, + "kl_loss": 3716.3310546875, + "loss_ib": 185.8472137451172, + "step": 60 + }, + { + "ce_ib": 63.48124313354492, + "ce_orig": 1.404058575630188, + "epoch": 0.008770668583752696, + "kl_loss": 3322.540283203125, + "loss_ib": 166.15875244140625, + "step": 61 + }, + { + "ce_ib": 63.29791259765625, + "ce_orig": 1.1193113327026367, + "epoch": 0.008914450035945364, + "kl_loss": 3468.67529296875, + "loss_ib": 173.46542358398438, + "step": 62 + }, + { + "ce_ib": 62.13097381591797, + "ce_orig": 1.177595615386963, + "epoch": 0.00905823148813803, + "kl_loss": 3650.87646484375, + "loss_ib": 182.57489013671875, + "step": 63 + }, + { + "ce_ib": 60.98870849609375, + "ce_orig": 0.9005176424980164, + "epoch": 0.009202012940330698, + "kl_loss": 3419.352783203125, + "loss_ib": 170.99813842773438, + "step": 64 + }, + { + "epoch": 0.009345794392523364, + "grad_norm": 2418.697998046875, + "learning_rate": 4.39297124600639e-06, + "loss": 176.6997, + "step": 65 + }, + { + "ce_ib": 60.742515563964844, + "ce_orig": 0.8558191657066345, + "epoch": 0.009345794392523364, + "kl_loss": 3522.680908203125, + "loss_ib": 176.16441345214844, + "step": 65 + }, + { + "ce_ib": 61.35293960571289, + "ce_orig": 0.6822745203971863, + "epoch": 0.009489575844716032, + "kl_loss": 3174.692138671875, + "loss_ib": 158.76528930664062, + "step": 66 + }, + { + "ce_ib": 60.16307067871094, + "ce_orig": 0.6927408576011658, + "epoch": 0.0096333572969087, + "kl_loss": 3434.4345703125, + "loss_ib": 171.75181579589844, + "step": 67 + }, + { + "ce_ib": 60.551544189453125, + "ce_orig": 0.7352694272994995, + "epoch": 0.009777138749101365, + "kl_loss": 3096.6083984375, + "loss_ib": 154.86070251464844, + "step": 68 + }, + { + "ce_ib": 61.049774169921875, + "ce_orig": 0.8262593150138855, + "epoch": 0.009920920201294033, + "kl_loss": 3612.48095703125, + "loss_ib": 180.6545867919922, + "step": 69 + }, + { + "epoch": 0.010064701653486701, + "grad_norm": 2490.9189453125, + "learning_rate": 4.792332268370607e-06, + "loss": 168.4689, + "step": 70 + }, + { + "ce_ib": 59.70656204223633, + "ce_orig": 0.7031822204589844, + "epoch": 0.010064701653486701, + "kl_loss": 3356.534423828125, + "loss_ib": 167.85658264160156, + "step": 70 + }, + { + "ce_ib": 58.52351760864258, + "ce_orig": 0.8787212371826172, + "epoch": 0.010208483105679367, + "kl_loss": 3540.501953125, + "loss_ib": 177.0543670654297, + "step": 71 + }, + { + "ce_ib": 60.51292419433594, + "ce_orig": 0.872455894947052, + "epoch": 0.010352264557872035, + "kl_loss": 3272.396240234375, + "loss_ib": 163.6500701904297, + "step": 72 + }, + { + "ce_ib": 59.571720123291016, + "ce_orig": 0.9494105577468872, + "epoch": 0.010496046010064701, + "kl_loss": 3453.132568359375, + "loss_ib": 172.68641662597656, + "step": 73 + }, + { + "ce_ib": 60.829315185546875, + "ce_orig": 0.9209774136543274, + "epoch": 0.010639827462257369, + "kl_loss": 3232.62890625, + "loss_ib": 161.661865234375, + "step": 74 + }, + { + "epoch": 0.010783608914450037, + "grad_norm": 2442.193359375, + "learning_rate": 5.191693290734825e-06, + "loss": 169.7118, + "step": 75 + }, + { + "ce_ib": 58.91569519042969, + "ce_orig": 0.6644178628921509, + "epoch": 0.010783608914450037, + "kl_loss": 3597.986572265625, + "loss_ib": 179.9287872314453, + "step": 75 + }, + { + "ce_ib": 59.93379592895508, + "ce_orig": 0.6643899083137512, + "epoch": 0.010927390366642703, + "kl_loss": 3446.6064453125, + "loss_ib": 172.36029052734375, + "step": 76 + }, + { + "ce_ib": 61.500518798828125, + "ce_orig": 1.1189907789230347, + "epoch": 0.01107117181883537, + "kl_loss": 3529.22216796875, + "loss_ib": 176.4918670654297, + "step": 77 + }, + { + "ce_ib": 59.83492660522461, + "ce_orig": 0.6313321590423584, + "epoch": 0.011214953271028037, + "kl_loss": 3552.268798828125, + "loss_ib": 177.6433563232422, + "step": 78 + }, + { + "ce_ib": 59.81180953979492, + "ce_orig": 1.0850353240966797, + "epoch": 0.011358734723220704, + "kl_loss": 3466.029296875, + "loss_ib": 173.3313751220703, + "step": 79 + }, + { + "epoch": 0.011502516175413372, + "grad_norm": 2574.475341796875, + "learning_rate": 5.591054313099041e-06, + "loss": 170.7064, + "step": 80 + }, + { + "ce_ib": 62.353084564208984, + "ce_orig": 1.3420498371124268, + "epoch": 0.011502516175413372, + "kl_loss": 3235.225341796875, + "loss_ib": 161.79244995117188, + "step": 80 + }, + { + "ce_ib": 60.79109573364258, + "ce_orig": 1.3365955352783203, + "epoch": 0.011646297627606038, + "kl_loss": 3334.964599609375, + "loss_ib": 166.77862548828125, + "step": 81 + }, + { + "ce_ib": 60.66354751586914, + "ce_orig": 0.7855740785598755, + "epoch": 0.011790079079798706, + "kl_loss": 3520.52783203125, + "loss_ib": 176.05673217773438, + "step": 82 + }, + { + "ce_ib": 59.765869140625, + "ce_orig": 0.8332124352455139, + "epoch": 0.011933860531991374, + "kl_loss": 3197.080078125, + "loss_ib": 159.88389587402344, + "step": 83 + }, + { + "ce_ib": 60.493858337402344, + "ce_orig": 1.5375986099243164, + "epoch": 0.01207764198418404, + "kl_loss": 3220.095703125, + "loss_ib": 161.0350341796875, + "step": 84 + }, + { + "epoch": 0.012221423436376708, + "grad_norm": 2434.75390625, + "learning_rate": 5.990415335463259e-06, + "loss": 165.5288, + "step": 85 + }, + { + "ce_ib": 57.47407150268555, + "ce_orig": 0.8656359910964966, + "epoch": 0.012221423436376708, + "kl_loss": 3253.564453125, + "loss_ib": 162.7069549560547, + "step": 85 + }, + { + "ce_ib": 59.44199752807617, + "ce_orig": 0.8775683045387268, + "epoch": 0.012365204888569374, + "kl_loss": 3335.959716796875, + "loss_ib": 166.8277130126953, + "step": 86 + }, + { + "ce_ib": 59.450687408447266, + "ce_orig": 0.7520811557769775, + "epoch": 0.012508986340762042, + "kl_loss": 3219.025634765625, + "loss_ib": 160.9810028076172, + "step": 87 + }, + { + "ce_ib": 58.30461120605469, + "ce_orig": 0.9134323000907898, + "epoch": 0.01265276779295471, + "kl_loss": 2271.015625, + "loss_ib": 113.5799331665039, + "step": 88 + }, + { + "ce_ib": 58.684146881103516, + "ce_orig": 1.100233793258667, + "epoch": 0.012796549245147375, + "kl_loss": 3356.873779296875, + "loss_ib": 167.87303161621094, + "step": 89 + }, + { + "epoch": 0.012940330697340043, + "grad_norm": 2395.843994140625, + "learning_rate": 6.389776357827476e-06, + "loss": 153.9006, + "step": 90 + }, + { + "ce_ib": 57.74800491333008, + "ce_orig": 1.038017988204956, + "epoch": 0.012940330697340043, + "kl_loss": 3335.373046875, + "loss_ib": 166.7975311279297, + "step": 90 + }, + { + "ce_ib": 57.7064323425293, + "ce_orig": 0.921914279460907, + "epoch": 0.013084112149532711, + "kl_loss": 3284.214111328125, + "loss_ib": 164.23956298828125, + "step": 91 + }, + { + "ce_ib": 57.82907485961914, + "ce_orig": 0.9391928911209106, + "epoch": 0.013227893601725377, + "kl_loss": 3100.417724609375, + "loss_ib": 155.0498046875, + "step": 92 + }, + { + "ce_ib": 58.58163833618164, + "ce_orig": 1.0686829090118408, + "epoch": 0.013371675053918045, + "kl_loss": 3164.060546875, + "loss_ib": 158.23233032226562, + "step": 93 + }, + { + "ce_ib": 56.14845657348633, + "ce_orig": 0.4495549201965332, + "epoch": 0.013515456506110711, + "kl_loss": 2954.31982421875, + "loss_ib": 147.7440643310547, + "step": 94 + }, + { + "epoch": 0.013659237958303379, + "grad_norm": 2321.239013671875, + "learning_rate": 6.789137380191693e-06, + "loss": 156.8229, + "step": 95 + }, + { + "ce_ib": 57.893184661865234, + "ce_orig": 1.1613727807998657, + "epoch": 0.013659237958303379, + "kl_loss": 3191.733154296875, + "loss_ib": 159.6156005859375, + "step": 95 + }, + { + "ce_ib": 58.80827713012695, + "ce_orig": 1.103546380996704, + "epoch": 0.013803019410496047, + "kl_loss": 2989.6494140625, + "loss_ib": 149.5118865966797, + "step": 96 + }, + { + "ce_ib": 57.372257232666016, + "ce_orig": 0.7489521503448486, + "epoch": 0.013946800862688713, + "kl_loss": 3076.787109375, + "loss_ib": 153.8680419921875, + "step": 97 + }, + { + "ce_ib": 58.20832061767578, + "ce_orig": 1.0948829650878906, + "epoch": 0.01409058231488138, + "kl_loss": 3079.382080078125, + "loss_ib": 153.9982147216797, + "step": 98 + }, + { + "ce_ib": 57.56401443481445, + "ce_orig": 1.252577543258667, + "epoch": 0.014234363767074048, + "kl_loss": 2953.885498046875, + "loss_ib": 147.7230682373047, + "step": 99 + }, + { + "epoch": 0.014378145219266714, + "grad_norm": 2354.333251953125, + "learning_rate": 7.188498402555911e-06, + "loss": 151.9125, + "step": 100 + }, + { + "ce_ib": 57.552894592285156, + "ce_orig": 1.0784616470336914, + "epoch": 0.014378145219266714, + "kl_loss": 3071.802734375, + "loss_ib": 153.61891174316406, + "step": 100 + }, + { + "ce_ib": 56.86176681518555, + "ce_orig": 0.6961782574653625, + "epoch": 0.014521926671459382, + "kl_loss": 3063.923095703125, + "loss_ib": 153.22459411621094, + "step": 101 + }, + { + "ce_ib": 54.33999252319336, + "ce_orig": 0.5625413656234741, + "epoch": 0.014665708123652048, + "kl_loss": 2982.339111328125, + "loss_ib": 149.14413452148438, + "step": 102 + }, + { + "ce_ib": 55.64839553833008, + "ce_orig": 0.7663992047309875, + "epoch": 0.014809489575844716, + "kl_loss": 3048.7392578125, + "loss_ib": 152.46478271484375, + "step": 103 + }, + { + "ce_ib": 58.38852310180664, + "ce_orig": 1.1540484428405762, + "epoch": 0.014953271028037384, + "kl_loss": 2707.0400390625, + "loss_ib": 135.38121032714844, + "step": 104 + }, + { + "epoch": 0.01509705248023005, + "grad_norm": 2202.033447265625, + "learning_rate": 7.5878594249201285e-06, + "loss": 148.0272, + "step": 105 + }, + { + "ce_ib": 58.33769607543945, + "ce_orig": 1.282652735710144, + "epoch": 0.01509705248023005, + "kl_loss": 2638.0634765625, + "loss_ib": 131.93235778808594, + "step": 105 + }, + { + "ce_ib": 57.01485061645508, + "ce_orig": 1.1682567596435547, + "epoch": 0.015240833932422718, + "kl_loss": 2910.34326171875, + "loss_ib": 145.54566955566406, + "step": 106 + }, + { + "ce_ib": 56.478248596191406, + "ce_orig": 1.093648910522461, + "epoch": 0.015384615384615385, + "kl_loss": 2749.603759765625, + "loss_ib": 137.5084228515625, + "step": 107 + }, + { + "ce_ib": 55.969581604003906, + "ce_orig": 0.8221930861473083, + "epoch": 0.015528396836808052, + "kl_loss": 2742.017822265625, + "loss_ib": 137.12889099121094, + "step": 108 + }, + { + "ce_ib": 55.39336013793945, + "ce_orig": 1.138152837753296, + "epoch": 0.015672178289000718, + "kl_loss": 2064.01513671875, + "loss_ib": 103.22845458984375, + "step": 109 + }, + { + "epoch": 0.015815959741193385, + "grad_norm": 1960.8631591796875, + "learning_rate": 7.987220447284345e-06, + "loss": 139.8337, + "step": 110 + }, + { + "ce_ib": 57.7611198425293, + "ce_orig": 0.8048346042633057, + "epoch": 0.015815959741193385, + "kl_loss": 2746.037353515625, + "loss_ib": 137.33074951171875, + "step": 110 + }, + { + "ce_ib": 54.72801208496094, + "ce_orig": 0.9340922236442566, + "epoch": 0.015959741193386053, + "kl_loss": 2660.56982421875, + "loss_ib": 133.0558624267578, + "step": 111 + }, + { + "ce_ib": 56.28373336791992, + "ce_orig": 1.2209872007369995, + "epoch": 0.01610352264557872, + "kl_loss": 2672.66015625, + "loss_ib": 133.66114807128906, + "step": 112 + }, + { + "ce_ib": 55.83729553222656, + "ce_orig": 1.1345103979110718, + "epoch": 0.01624730409777139, + "kl_loss": 2844.884765625, + "loss_ib": 142.27215576171875, + "step": 113 + }, + { + "ce_ib": 55.07097625732422, + "ce_orig": 1.2232747077941895, + "epoch": 0.016391085549964053, + "kl_loss": 2053.98974609375, + "loss_ib": 102.7270278930664, + "step": 114 + }, + { + "epoch": 0.01653486700215672, + "grad_norm": 2072.072998046875, + "learning_rate": 8.386581469648563e-06, + "loss": 135.2724, + "step": 115 + }, + { + "ce_ib": 56.160423278808594, + "ce_orig": 0.8365716338157654, + "epoch": 0.01653486700215672, + "kl_loss": 2660.091796875, + "loss_ib": 133.0326690673828, + "step": 115 + }, + { + "ce_ib": 54.47496795654297, + "ce_orig": 1.0809355974197388, + "epoch": 0.01667864845434939, + "kl_loss": 2690.080078125, + "loss_ib": 134.53123474121094, + "step": 116 + }, + { + "ce_ib": 53.27537155151367, + "ce_orig": 1.1820772886276245, + "epoch": 0.016822429906542057, + "kl_loss": 2644.28173828125, + "loss_ib": 132.24072265625, + "step": 117 + }, + { + "ce_ib": 54.39961242675781, + "ce_orig": 1.2899582386016846, + "epoch": 0.016966211358734724, + "kl_loss": 2602.2080078125, + "loss_ib": 130.13760375976562, + "step": 118 + }, + { + "ce_ib": 52.184234619140625, + "ce_orig": 0.8353484869003296, + "epoch": 0.01710999281092739, + "kl_loss": 2481.124267578125, + "loss_ib": 124.0822982788086, + "step": 119 + }, + { + "epoch": 0.017253774263120056, + "grad_norm": 2074.370361328125, + "learning_rate": 8.78594249201278e-06, + "loss": 132.3383, + "step": 120 + }, + { + "ce_ib": 53.626792907714844, + "ce_orig": 0.689016580581665, + "epoch": 0.017253774263120056, + "kl_loss": 2638.05810546875, + "loss_ib": 131.9297332763672, + "step": 120 + }, + { + "ce_ib": 52.17591857910156, + "ce_orig": 0.7896418571472168, + "epoch": 0.017397555715312724, + "kl_loss": 1895.728515625, + "loss_ib": 94.81251525878906, + "step": 121 + }, + { + "ce_ib": 52.534664154052734, + "ce_orig": 0.7579004764556885, + "epoch": 0.017541337167505392, + "kl_loss": 2552.994873046875, + "loss_ib": 127.67601776123047, + "step": 122 + }, + { + "ce_ib": 52.31755065917969, + "ce_orig": 0.9803644418716431, + "epoch": 0.01768511861969806, + "kl_loss": 2607.78662109375, + "loss_ib": 130.41549682617188, + "step": 123 + }, + { + "ce_ib": 53.674556732177734, + "ce_orig": 0.8895677328109741, + "epoch": 0.017828900071890728, + "kl_loss": 2425.487548828125, + "loss_ib": 121.30122375488281, + "step": 124 + }, + { + "epoch": 0.017972681524083392, + "grad_norm": 1952.8394775390625, + "learning_rate": 9.185303514376996e-06, + "loss": 122.1088, + "step": 125 + }, + { + "ce_ib": 53.564842224121094, + "ce_orig": 1.256234049797058, + "epoch": 0.017972681524083392, + "kl_loss": 2441.9111328125, + "loss_ib": 122.1223373413086, + "step": 125 + }, + { + "ce_ib": 52.68410110473633, + "ce_orig": 1.2045953273773193, + "epoch": 0.01811646297627606, + "kl_loss": 2293.639404296875, + "loss_ib": 114.70831298828125, + "step": 126 + }, + { + "ce_ib": 54.133914947509766, + "ce_orig": 1.514359712600708, + "epoch": 0.018260244428468728, + "kl_loss": 2311.011962890625, + "loss_ib": 115.5776596069336, + "step": 127 + }, + { + "ce_ib": 52.44422149658203, + "ce_orig": 0.9751385450363159, + "epoch": 0.018404025880661395, + "kl_loss": 2449.00048828125, + "loss_ib": 122.47624969482422, + "step": 128 + }, + { + "ce_ib": 50.06406021118164, + "ce_orig": 0.7257946729660034, + "epoch": 0.018547807332854063, + "kl_loss": 2270.14794921875, + "loss_ib": 113.53243255615234, + "step": 129 + }, + { + "epoch": 0.018691588785046728, + "grad_norm": 1946.055419921875, + "learning_rate": 9.584664536741214e-06, + "loss": 119.3412, + "step": 130 + }, + { + "ce_ib": 53.00954055786133, + "ce_orig": 1.0300298929214478, + "epoch": 0.018691588785046728, + "kl_loss": 2245.056640625, + "loss_ib": 112.27933502197266, + "step": 130 + }, + { + "ce_ib": 50.96159362792969, + "ce_orig": 0.8871611952781677, + "epoch": 0.018835370237239395, + "kl_loss": 2287.427734375, + "loss_ib": 114.39686584472656, + "step": 131 + }, + { + "ce_ib": 52.4853515625, + "ce_orig": 1.039488673210144, + "epoch": 0.018979151689432063, + "kl_loss": 2313.669921875, + "loss_ib": 115.7097396850586, + "step": 132 + }, + { + "ce_ib": 51.21004104614258, + "ce_orig": 0.845194935798645, + "epoch": 0.01912293314162473, + "kl_loss": 2127.791259765625, + "loss_ib": 106.41517639160156, + "step": 133 + }, + { + "ce_ib": 52.13531494140625, + "ce_orig": 1.1372461318969727, + "epoch": 0.0192667145938174, + "kl_loss": 2183.52783203125, + "loss_ib": 109.20245361328125, + "step": 134 + }, + { + "epoch": 0.019410496046010063, + "grad_norm": 1868.0174560546875, + "learning_rate": 9.984025559105432e-06, + "loss": 111.8021, + "step": 135 + }, + { + "ce_ib": 50.40250015258789, + "ce_orig": 0.5799722075462341, + "epoch": 0.019410496046010063, + "kl_loss": 2218.98046875, + "loss_ib": 110.9742202758789, + "step": 135 + }, + { + "ce_ib": 49.68068313598633, + "ce_orig": 0.9591949582099915, + "epoch": 0.01955427749820273, + "kl_loss": 2150.03564453125, + "loss_ib": 107.52662658691406, + "step": 136 + }, + { + "ce_ib": 49.44940185546875, + "ce_orig": 0.8390080332756042, + "epoch": 0.0196980589503954, + "kl_loss": 1845.8321533203125, + "loss_ib": 92.31633758544922, + "step": 137 + }, + { + "ce_ib": 52.21055221557617, + "ce_orig": 1.4367856979370117, + "epoch": 0.019841840402588067, + "kl_loss": 2051.351806640625, + "loss_ib": 102.59370422363281, + "step": 138 + }, + { + "ce_ib": 50.15422058105469, + "ce_orig": 1.2788512706756592, + "epoch": 0.019985621854780734, + "kl_loss": 2027.796875, + "loss_ib": 101.41492462158203, + "step": 139 + }, + { + "epoch": 0.020129403306973402, + "grad_norm": 1761.1328125, + "learning_rate": 1.038338658146965e-05, + "loss": 102.0922, + "step": 140 + }, + { + "ce_ib": 50.65812301635742, + "ce_orig": 1.2148778438568115, + "epoch": 0.020129403306973402, + "kl_loss": 1903.0653076171875, + "loss_ib": 95.17859649658203, + "step": 140 + }, + { + "ce_ib": 49.68356704711914, + "ce_orig": 1.0471272468566895, + "epoch": 0.020273184759166066, + "kl_loss": 1945.1756591796875, + "loss_ib": 97.28362274169922, + "step": 141 + }, + { + "ce_ib": 47.467594146728516, + "ce_orig": 0.6498449444770813, + "epoch": 0.020416966211358734, + "kl_loss": 1910.508544921875, + "loss_ib": 95.54916381835938, + "step": 142 + }, + { + "ce_ib": 47.57759094238281, + "ce_orig": 0.8552805781364441, + "epoch": 0.020560747663551402, + "kl_loss": 1907.689697265625, + "loss_ib": 95.40827941894531, + "step": 143 + }, + { + "ce_ib": 50.04168701171875, + "ce_orig": 1.1924092769622803, + "epoch": 0.02070452911574407, + "kl_loss": 1690.5194091796875, + "loss_ib": 84.55098724365234, + "step": 144 + }, + { + "epoch": 0.020848310567936738, + "grad_norm": 1661.7161865234375, + "learning_rate": 1.0782747603833867e-05, + "loss": 95.0661, + "step": 145 + }, + { + "ce_ib": 48.34769821166992, + "ce_orig": 1.0676274299621582, + "epoch": 0.020848310567936738, + "kl_loss": 1760.777099609375, + "loss_ib": 88.06303405761719, + "step": 145 + }, + { + "ce_ib": 47.0106201171875, + "ce_orig": 1.2353283166885376, + "epoch": 0.020992092020129402, + "kl_loss": 1798.0538330078125, + "loss_ib": 89.92620086669922, + "step": 146 + }, + { + "ce_ib": 48.50465393066406, + "ce_orig": 1.0733487606048584, + "epoch": 0.02113587347232207, + "kl_loss": 1734.2774658203125, + "loss_ib": 86.73812866210938, + "step": 147 + }, + { + "ce_ib": 45.04286193847656, + "ce_orig": 0.8353049159049988, + "epoch": 0.021279654924514738, + "kl_loss": 1773.991943359375, + "loss_ib": 88.72212219238281, + "step": 148 + }, + { + "ce_ib": 47.320526123046875, + "ce_orig": 1.219022512435913, + "epoch": 0.021423436376707405, + "kl_loss": 1750.5101318359375, + "loss_ib": 87.54916381835938, + "step": 149 + }, + { + "epoch": 0.021567217828900073, + "grad_norm": 1564.1060791015625, + "learning_rate": 1.1182108626198083e-05, + "loss": 87.3973, + "step": 150 + }, + { + "ce_ib": 47.08059310913086, + "ce_orig": 0.8302248120307922, + "epoch": 0.021567217828900073, + "kl_loss": 1589.67822265625, + "loss_ib": 79.50745391845703, + "step": 150 + }, + { + "ce_ib": 47.74165725708008, + "ce_orig": 0.7171430587768555, + "epoch": 0.021710999281092738, + "kl_loss": 1636.097412109375, + "loss_ib": 81.82874298095703, + "step": 151 + }, + { + "ce_ib": 46.43238830566406, + "ce_orig": 0.7868078947067261, + "epoch": 0.021854780733285405, + "kl_loss": 1594.2950439453125, + "loss_ib": 79.73796844482422, + "step": 152 + }, + { + "ce_ib": 44.30559539794922, + "ce_orig": 1.0339301824569702, + "epoch": 0.021998562185478073, + "kl_loss": 1573.078857421875, + "loss_ib": 78.67610168457031, + "step": 153 + }, + { + "ce_ib": 44.82426834106445, + "ce_orig": 1.0889431238174438, + "epoch": 0.02214234363767074, + "kl_loss": 1503.557861328125, + "loss_ib": 75.20030975341797, + "step": 154 + }, + { + "epoch": 0.02228612508986341, + "grad_norm": 1438.786376953125, + "learning_rate": 1.1581469648562302e-05, + "loss": 77.0769, + "step": 155 + }, + { + "ce_ib": 46.833580017089844, + "ce_orig": 1.3186697959899902, + "epoch": 0.02228612508986341, + "kl_loss": 1401.23486328125, + "loss_ib": 70.08516693115234, + "step": 155 + }, + { + "ce_ib": 46.787254333496094, + "ce_orig": 0.9530097246170044, + "epoch": 0.022429906542056073, + "kl_loss": 1483.8673095703125, + "loss_ib": 74.21675872802734, + "step": 156 + }, + { + "ce_ib": 46.81201171875, + "ce_orig": 1.1565759181976318, + "epoch": 0.02257368799424874, + "kl_loss": 1341.5926513671875, + "loss_ib": 67.10304260253906, + "step": 157 + }, + { + "ce_ib": 45.95816421508789, + "ce_orig": 1.2070651054382324, + "epoch": 0.02271746944644141, + "kl_loss": 1238.02880859375, + "loss_ib": 61.92441940307617, + "step": 158 + }, + { + "ce_ib": 43.655330657958984, + "ce_orig": 0.7560437321662903, + "epoch": 0.022861250898634077, + "kl_loss": 1213.171875, + "loss_ib": 60.680419921875, + "step": 159 + }, + { + "epoch": 0.023005032350826744, + "grad_norm": 1232.082763671875, + "learning_rate": 1.1980830670926518e-05, + "loss": 66.2022, + "step": 160 + }, + { + "ce_ib": 46.1483268737793, + "ce_orig": 1.2238253355026245, + "epoch": 0.023005032350826744, + "kl_loss": 1228.1053466796875, + "loss_ib": 61.428340911865234, + "step": 160 + }, + { + "ce_ib": 47.45783996582031, + "ce_orig": 1.587011694908142, + "epoch": 0.023148813803019412, + "kl_loss": 1157.474365234375, + "loss_ib": 57.8974494934082, + "step": 161 + }, + { + "ce_ib": 45.567657470703125, + "ce_orig": 1.3450465202331543, + "epoch": 0.023292595255212076, + "kl_loss": 1117.0738525390625, + "loss_ib": 55.8764762878418, + "step": 162 + }, + { + "ce_ib": 44.280250549316406, + "ce_orig": 1.2833943367004395, + "epoch": 0.023436376707404744, + "kl_loss": 1064.94677734375, + "loss_ib": 53.26947784423828, + "step": 163 + }, + { + "ce_ib": 43.552276611328125, + "ce_orig": 0.7282137870788574, + "epoch": 0.023580158159597412, + "kl_loss": 1048.7855224609375, + "loss_ib": 52.461055755615234, + "step": 164 + }, + { + "epoch": 0.02372393961179008, + "grad_norm": 1095.414794921875, + "learning_rate": 1.2380191693290735e-05, + "loss": 56.1672, + "step": 165 + }, + { + "ce_ib": 43.77597427368164, + "ce_orig": 1.015390157699585, + "epoch": 0.02372393961179008, + "kl_loss": 996.73828125, + "loss_ib": 49.858802795410156, + "step": 165 + }, + { + "ce_ib": 43.67955017089844, + "ce_orig": 1.0842498540878296, + "epoch": 0.023867721063982748, + "kl_loss": 947.7979736328125, + "loss_ib": 47.411739349365234, + "step": 166 + }, + { + "ce_ib": 42.23929214477539, + "ce_orig": 0.9063572287559509, + "epoch": 0.024011502516175412, + "kl_loss": 897.7328491210938, + "loss_ib": 44.90776062011719, + "step": 167 + }, + { + "ce_ib": 45.15175247192383, + "ce_orig": 1.2839014530181885, + "epoch": 0.02415528396836808, + "kl_loss": 786.1021728515625, + "loss_ib": 39.32768630981445, + "step": 168 + }, + { + "ce_ib": 43.47494125366211, + "ce_orig": 0.9718037247657776, + "epoch": 0.024299065420560748, + "kl_loss": 860.2787475585938, + "loss_ib": 43.035675048828125, + "step": 169 + }, + { + "epoch": 0.024442846872753415, + "grad_norm": 1042.965087890625, + "learning_rate": 1.2779552715654951e-05, + "loss": 47.1098, + "step": 170 + }, + { + "ce_ib": 43.44833755493164, + "ce_orig": 1.3617149591445923, + "epoch": 0.024442846872753415, + "kl_loss": 803.04443359375, + "loss_ib": 40.173946380615234, + "step": 170 + }, + { + "ce_ib": 43.578792572021484, + "ce_orig": 0.9456126093864441, + "epoch": 0.024586628324946083, + "kl_loss": 699.483642578125, + "loss_ib": 34.9959716796875, + "step": 171 + }, + { + "ce_ib": 47.23983383178711, + "ce_orig": 1.8550169467926025, + "epoch": 0.024730409777138748, + "kl_loss": 675.0830688476562, + "loss_ib": 33.777774810791016, + "step": 172 + }, + { + "ce_ib": 43.85140609741211, + "ce_orig": 1.138585090637207, + "epoch": 0.024874191229331415, + "kl_loss": 659.050537109375, + "loss_ib": 32.97445297241211, + "step": 173 + }, + { + "ce_ib": 41.722373962402344, + "ce_orig": 1.243048906326294, + "epoch": 0.025017972681524083, + "kl_loss": 590.1033325195312, + "loss_ib": 29.526029586791992, + "step": 174 + }, + { + "epoch": 0.02516175413371675, + "grad_norm": 766.4675903320312, + "learning_rate": 1.3178913738019169e-05, + "loss": 35.4625, + "step": 175 + }, + { + "ce_ib": 44.18523406982422, + "ce_orig": 1.255010962486267, + "epoch": 0.02516175413371675, + "kl_loss": 592.2353515625, + "loss_ib": 29.633859634399414, + "step": 175 + }, + { + "ce_ib": 46.14919662475586, + "ce_orig": 0.9867228865623474, + "epoch": 0.02530553558590942, + "kl_loss": 509.3572692871094, + "loss_ib": 25.490938186645508, + "step": 176 + }, + { + "ce_ib": 46.58631134033203, + "ce_orig": 1.1491925716400146, + "epoch": 0.025449317038102087, + "kl_loss": 459.72491455078125, + "loss_ib": 23.009538650512695, + "step": 177 + }, + { + "ce_ib": 40.569705963134766, + "ce_orig": 0.7460018396377563, + "epoch": 0.02559309849029475, + "kl_loss": 352.4381103515625, + "loss_ib": 17.64219093322754, + "step": 178 + }, + { + "ce_ib": 46.841224670410156, + "ce_orig": 1.1470024585723877, + "epoch": 0.02573687994248742, + "kl_loss": 389.03765869140625, + "loss_ib": 19.475303649902344, + "step": 179 + }, + { + "epoch": 0.025880661394680086, + "grad_norm": 621.7545776367188, + "learning_rate": 1.3578274760383387e-05, + "loss": 24.304, + "step": 180 + }, + { + "ce_ib": 47.13188552856445, + "ce_orig": 1.330838680267334, + "epoch": 0.025880661394680086, + "kl_loss": 379.19989013671875, + "loss_ib": 18.98356056213379, + "step": 180 + }, + { + "ce_ib": 52.806861877441406, + "ce_orig": 2.114457130432129, + "epoch": 0.026024442846872754, + "kl_loss": 284.24114990234375, + "loss_ib": 14.2384614944458, + "step": 181 + }, + { + "ce_ib": 64.3398208618164, + "ce_orig": 0.9137269854545593, + "epoch": 0.026168224299065422, + "kl_loss": 240.51211547851562, + "loss_ib": 12.05777645111084, + "step": 182 + }, + { + "ce_ib": 62.500736236572266, + "ce_orig": 1.4484410285949707, + "epoch": 0.026312005751258086, + "kl_loss": 223.3756103515625, + "loss_ib": 11.200030326843262, + "step": 183 + }, + { + "ce_ib": 70.71409606933594, + "ce_orig": 1.7095311880111694, + "epoch": 0.026455787203450754, + "kl_loss": 207.35211181640625, + "loss_ib": 10.402962684631348, + "step": 184 + }, + { + "epoch": 0.026599568655643422, + "grad_norm": 365.3516845703125, + "learning_rate": 1.3977635782747606e-05, + "loss": 14.3906, + "step": 185 + }, + { + "ce_ib": 63.17552947998047, + "ce_orig": 2.531080484390259, + "epoch": 0.026599568655643422, + "kl_loss": 165.71923828125, + "loss_ib": 8.317549705505371, + "step": 185 + }, + { + "ce_ib": 84.16175842285156, + "ce_orig": 2.0548126697540283, + "epoch": 0.02674335010783609, + "kl_loss": 140.68856811523438, + "loss_ib": 7.07650899887085, + "step": 186 + }, + { + "ce_ib": 77.55998992919922, + "ce_orig": 1.809746503829956, + "epoch": 0.026887131560028758, + "kl_loss": 121.14314270019531, + "loss_ib": 6.095937252044678, + "step": 187 + }, + { + "ce_ib": 77.81259155273438, + "ce_orig": 1.9566165208816528, + "epoch": 0.027030913012221422, + "kl_loss": 101.40348815917969, + "loss_ib": 5.109080791473389, + "step": 188 + }, + { + "ce_ib": 69.3860092163086, + "ce_orig": 1.703246831893921, + "epoch": 0.02717469446441409, + "kl_loss": 93.15800476074219, + "loss_ib": 4.692593574523926, + "step": 189 + }, + { + "epoch": 0.027318475916606758, + "grad_norm": 189.81011962890625, + "learning_rate": 1.4376996805111822e-05, + "loss": 8.0904, + "step": 190 + }, + { + "ce_ib": 60.081260681152344, + "ce_orig": 1.8992141485214233, + "epoch": 0.027318475916606758, + "kl_loss": 100.07512664794922, + "loss_ib": 5.033797264099121, + "step": 190 + }, + { + "ce_ib": 58.84217834472656, + "ce_orig": 1.018802285194397, + "epoch": 0.027462257368799425, + "kl_loss": 72.03784942626953, + "loss_ib": 3.6313138008117676, + "step": 191 + }, + { + "ce_ib": 58.045005798339844, + "ce_orig": 1.0986676216125488, + "epoch": 0.027606038820992093, + "kl_loss": 78.45074462890625, + "loss_ib": 3.9515597820281982, + "step": 192 + }, + { + "ce_ib": 53.310672760009766, + "ce_orig": 1.2137010097503662, + "epoch": 0.027749820273184757, + "kl_loss": 56.91468811035156, + "loss_ib": 2.872389793395996, + "step": 193 + }, + { + "ce_ib": 47.84365463256836, + "ce_orig": 1.141069769859314, + "epoch": 0.027893601725377425, + "kl_loss": 49.71630096435547, + "loss_ib": 2.5097367763519287, + "step": 194 + }, + { + "epoch": 0.028037383177570093, + "grad_norm": 102.58991241455078, + "learning_rate": 1.477635782747604e-05, + "loss": 4.9077, + "step": 195 + }, + { + "ce_ib": 47.62435531616211, + "ce_orig": 1.2880587577819824, + "epoch": 0.028037383177570093, + "kl_loss": 51.41084289550781, + "loss_ib": 2.5943543910980225, + "step": 195 + }, + { + "ce_ib": 45.36109161376953, + "ce_orig": 0.8635546565055847, + "epoch": 0.02818116462976276, + "kl_loss": 44.00239181518555, + "loss_ib": 2.2228000164031982, + "step": 196 + }, + { + "ce_ib": 41.63899230957031, + "ce_orig": 1.1706253290176392, + "epoch": 0.02832494608195543, + "kl_loss": 40.91154479980469, + "loss_ib": 2.066396713256836, + "step": 197 + }, + { + "ce_ib": 40.076194763183594, + "ce_orig": 0.9194042682647705, + "epoch": 0.028468727534148097, + "kl_loss": 38.050926208496094, + "loss_ib": 1.9225844144821167, + "step": 198 + }, + { + "ce_ib": 35.384429931640625, + "ce_orig": 0.8929119110107422, + "epoch": 0.02861250898634076, + "kl_loss": 40.8455924987793, + "loss_ib": 2.059971809387207, + "step": 199 + }, + { + "epoch": 0.02875629043853343, + "grad_norm": 53.01378631591797, + "learning_rate": 1.5175718849840257e-05, + "loss": 3.3711, + "step": 200 + }, + { + "ce_ib": 37.04123306274414, + "ce_orig": 0.9671800136566162, + "epoch": 0.02875629043853343, + "kl_loss": 35.982879638671875, + "loss_ib": 1.8176645040512085, + "step": 200 + }, + { + "ce_ib": 37.61642837524414, + "ce_orig": 1.3750962018966675, + "epoch": 0.028900071890726096, + "kl_loss": 30.0595703125, + "loss_ib": 1.5217866897583008, + "step": 201 + }, + { + "ce_ib": 40.238006591796875, + "ce_orig": 1.7766847610473633, + "epoch": 0.029043853342918764, + "kl_loss": 26.227649688720703, + "loss_ib": 1.3315014839172363, + "step": 202 + }, + { + "ce_ib": 38.055755615234375, + "ce_orig": 1.213000774383545, + "epoch": 0.029187634795111432, + "kl_loss": 29.874595642089844, + "loss_ib": 1.512757658958435, + "step": 203 + }, + { + "ce_ib": 32.915287017822266, + "ce_orig": 0.86496502161026, + "epoch": 0.029331416247304096, + "kl_loss": 30.330623626708984, + "loss_ib": 1.5329889059066772, + "step": 204 + }, + { + "epoch": 0.029475197699496764, + "grad_norm": 23.908042907714844, + "learning_rate": 1.5575079872204475e-05, + "loss": 2.7418, + "step": 205 + }, + { + "ce_ib": 35.01918411254883, + "ce_orig": 1.2721844911575317, + "epoch": 0.029475197699496764, + "kl_loss": 26.31290054321289, + "loss_ib": 1.333154559135437, + "step": 205 + }, + { + "ce_ib": 32.87287139892578, + "ce_orig": 1.1400614976882935, + "epoch": 0.029618979151689432, + "kl_loss": 26.177352905273438, + "loss_ib": 1.3253041505813599, + "step": 206 + }, + { + "ce_ib": 35.85771942138672, + "ce_orig": 1.3444130420684814, + "epoch": 0.0297627606038821, + "kl_loss": 34.98625183105469, + "loss_ib": 1.7672414779663086, + "step": 207 + }, + { + "ce_ib": 31.547334671020508, + "ce_orig": 0.9652504920959473, + "epoch": 0.029906542056074768, + "kl_loss": 20.096208572387695, + "loss_ib": 1.0205841064453125, + "step": 208 + }, + { + "ce_ib": 32.527896881103516, + "ce_orig": 1.0664165019989014, + "epoch": 0.030050323508267432, + "kl_loss": 20.749666213989258, + "loss_ib": 1.053747296333313, + "step": 209 + }, + { + "epoch": 0.0301941049604601, + "grad_norm": 22.29879379272461, + "learning_rate": 1.597444089456869e-05, + "loss": 2.3639, + "step": 210 + }, + { + "ce_ib": 32.71118927001953, + "ce_orig": 1.2022026777267456, + "epoch": 0.0301941049604601, + "kl_loss": 20.01553726196289, + "loss_ib": 1.0171325206756592, + "step": 210 + }, + { + "ce_ib": 30.730520248413086, + "ce_orig": 0.8476402163505554, + "epoch": 0.030337886412652768, + "kl_loss": 17.856157302856445, + "loss_ib": 0.9081730842590332, + "step": 211 + }, + { + "ce_ib": 29.36618995666504, + "ce_orig": 0.7845667004585266, + "epoch": 0.030481667864845435, + "kl_loss": 18.906429290771484, + "loss_ib": 0.9600045084953308, + "step": 212 + }, + { + "ce_ib": 27.0706844329834, + "ce_orig": 0.3370642066001892, + "epoch": 0.030625449317038103, + "kl_loss": 18.621246337890625, + "loss_ib": 0.9445976614952087, + "step": 213 + }, + { + "ce_ib": 34.2156867980957, + "ce_orig": 0.6062073111534119, + "epoch": 0.03076923076923077, + "kl_loss": 19.93109893798828, + "loss_ib": 1.0136628150939941, + "step": 214 + }, + { + "epoch": 0.030913012221423435, + "grad_norm": 16.176197052001953, + "learning_rate": 1.6373801916932906e-05, + "loss": 1.8299, + "step": 215 + }, + { + "ce_ib": 28.275493621826172, + "ce_orig": 1.0786405801773071, + "epoch": 0.030913012221423435, + "kl_loss": 18.11065673828125, + "loss_ib": 0.919670581817627, + "step": 215 + }, + { + "ce_ib": 32.22303009033203, + "ce_orig": 0.8485822081565857, + "epoch": 0.031056793673616103, + "kl_loss": 19.91845703125, + "loss_ib": 1.012034296989441, + "step": 216 + }, + { + "ce_ib": 27.742530822753906, + "ce_orig": 0.8794713020324707, + "epoch": 0.03120057512580877, + "kl_loss": 14.383018493652344, + "loss_ib": 0.7330222129821777, + "step": 217 + }, + { + "ce_ib": 31.794010162353516, + "ce_orig": 0.6954683661460876, + "epoch": 0.031344356578001435, + "kl_loss": 16.9901180267334, + "loss_ib": 0.86540287733078, + "step": 218 + }, + { + "ce_ib": 31.032941818237305, + "ce_orig": 1.1841130256652832, + "epoch": 0.0314881380301941, + "kl_loss": 13.406427383422852, + "loss_ib": 0.6858378648757935, + "step": 219 + }, + { + "epoch": 0.03163191948238677, + "grad_norm": 14.094661712646484, + "learning_rate": 1.6773162939297126e-05, + "loss": 1.8049, + "step": 220 + }, + { + "ce_ib": 32.347557067871094, + "ce_orig": 0.7493203282356262, + "epoch": 0.03163191948238677, + "kl_loss": 18.79709815979004, + "loss_ib": 0.9560286402702332, + "step": 220 + }, + { + "ce_ib": 31.1737060546875, + "ce_orig": 1.0758991241455078, + "epoch": 0.03177570093457944, + "kl_loss": 20.572391510009766, + "loss_ib": 1.0442065000534058, + "step": 221 + }, + { + "ce_ib": 30.046796798706055, + "ce_orig": 0.6758080124855042, + "epoch": 0.031919482386772106, + "kl_loss": 13.740577697753906, + "loss_ib": 0.7020522952079773, + "step": 222 + }, + { + "ce_ib": 30.532987594604492, + "ce_orig": 1.0658819675445557, + "epoch": 0.032063263838964774, + "kl_loss": 11.958425521850586, + "loss_ib": 0.6131877899169922, + "step": 223 + }, + { + "ce_ib": 31.667829513549805, + "ce_orig": 1.1956491470336914, + "epoch": 0.03220704529115744, + "kl_loss": 12.607803344726562, + "loss_ib": 0.6462240815162659, + "step": 224 + }, + { + "epoch": 0.03235082674335011, + "grad_norm": 5.280292987823486, + "learning_rate": 1.7172523961661345e-05, + "loss": 1.6789, + "step": 225 + }, + { + "ce_ib": 31.769432067871094, + "ce_orig": 1.1676932573318481, + "epoch": 0.03235082674335011, + "kl_loss": 10.932788848876953, + "loss_ib": 0.5625241994857788, + "step": 225 + }, + { + "ce_ib": 30.539987564086914, + "ce_orig": 1.3033503293991089, + "epoch": 0.03249460819554278, + "kl_loss": 12.105158805847168, + "loss_ib": 0.6205279231071472, + "step": 226 + }, + { + "ce_ib": 28.660764694213867, + "ce_orig": 0.6191550493240356, + "epoch": 0.032638389647735445, + "kl_loss": 9.103292465209961, + "loss_ib": 0.46949502825737, + "step": 227 + }, + { + "ce_ib": 29.167789459228516, + "ce_orig": 0.7975085973739624, + "epoch": 0.032782171099928106, + "kl_loss": 10.556224822998047, + "loss_ib": 0.5423951148986816, + "step": 228 + }, + { + "ce_ib": 29.07682991027832, + "ce_orig": 1.0861161947250366, + "epoch": 0.032925952552120774, + "kl_loss": 9.905879974365234, + "loss_ib": 0.5098324418067932, + "step": 229 + }, + { + "epoch": 0.03306973400431344, + "grad_norm": 5.249032974243164, + "learning_rate": 1.757188498402556e-05, + "loss": 1.5289, + "step": 230 + }, + { + "ce_ib": 28.152671813964844, + "ce_orig": 0.8374654054641724, + "epoch": 0.03306973400431344, + "kl_loss": 10.608142852783203, + "loss_ib": 0.544483482837677, + "step": 230 + }, + { + "ce_ib": 26.917634963989258, + "ce_orig": 1.0145775079727173, + "epoch": 0.03321351545650611, + "kl_loss": 12.240139961242676, + "loss_ib": 0.6254658102989197, + "step": 231 + }, + { + "ce_ib": 29.13026237487793, + "ce_orig": 0.8860113024711609, + "epoch": 0.03335729690869878, + "kl_loss": 10.390130043029785, + "loss_ib": 0.5340716242790222, + "step": 232 + }, + { + "ce_ib": 27.842693328857422, + "ce_orig": 1.1651474237442017, + "epoch": 0.033501078360891445, + "kl_loss": 9.602649688720703, + "loss_ib": 0.49405384063720703, + "step": 233 + }, + { + "ce_ib": 24.930410385131836, + "ce_orig": 0.6166008710861206, + "epoch": 0.03364485981308411, + "kl_loss": 10.271495819091797, + "loss_ib": 0.5260400176048279, + "step": 234 + }, + { + "epoch": 0.03378864126527678, + "grad_norm": 5.078007698059082, + "learning_rate": 1.7971246006389777e-05, + "loss": 1.4736, + "step": 235 + }, + { + "ce_ib": 27.262588500976562, + "ce_orig": 1.0158146619796753, + "epoch": 0.03378864126527678, + "kl_loss": 10.76059627532959, + "loss_ib": 0.5516611337661743, + "step": 235 + }, + { + "ce_ib": 28.12372398376465, + "ce_orig": 0.9383306503295898, + "epoch": 0.03393242271746945, + "kl_loss": 9.931174278259277, + "loss_ib": 0.5106205940246582, + "step": 236 + }, + { + "ce_ib": 27.61408233642578, + "ce_orig": 1.3611609935760498, + "epoch": 0.034076204169662117, + "kl_loss": 11.292994499206543, + "loss_ib": 0.5784568190574646, + "step": 237 + }, + { + "ce_ib": 25.42107582092285, + "ce_orig": 0.8392754197120667, + "epoch": 0.03421998562185478, + "kl_loss": 10.153192520141602, + "loss_ib": 0.5203701853752136, + "step": 238 + }, + { + "ce_ib": 26.299903869628906, + "ce_orig": 0.8138580918312073, + "epoch": 0.034363767074047445, + "kl_loss": 10.076581001281738, + "loss_ib": 0.5169789791107178, + "step": 239 + }, + { + "epoch": 0.03450754852624011, + "grad_norm": 1.6015193462371826, + "learning_rate": 1.8370607028753993e-05, + "loss": 1.371, + "step": 240 + }, + { + "ce_ib": 25.303178787231445, + "ce_orig": 0.9828527569770813, + "epoch": 0.03450754852624011, + "kl_loss": 9.301372528076172, + "loss_ib": 0.4777202308177948, + "step": 240 + }, + { + "ce_ib": 17.445037841796875, + "ce_orig": 0.447143018245697, + "epoch": 0.03465132997843278, + "kl_loss": 7.682253837585449, + "loss_ib": 0.39283522963523865, + "step": 241 + }, + { + "ce_ib": 25.544435501098633, + "ce_orig": 0.531434953212738, + "epoch": 0.03479511143062545, + "kl_loss": 9.930435180664062, + "loss_ib": 0.5092939734458923, + "step": 242 + }, + { + "ce_ib": 26.459491729736328, + "ce_orig": 1.5432560443878174, + "epoch": 0.034938892882818116, + "kl_loss": 10.38063907623291, + "loss_ib": 0.5322617292404175, + "step": 243 + }, + { + "ce_ib": 22.71752166748047, + "ce_orig": 0.8145064115524292, + "epoch": 0.035082674335010784, + "kl_loss": 9.922914505004883, + "loss_ib": 0.5075044631958008, + "step": 244 + }, + { + "epoch": 0.03522645578720345, + "grad_norm": 3.806140184402466, + "learning_rate": 1.8769968051118212e-05, + "loss": 1.4145, + "step": 245 + }, + { + "ce_ib": 26.927804946899414, + "ce_orig": 0.44644680619239807, + "epoch": 0.03522645578720345, + "kl_loss": 9.268022537231445, + "loss_ib": 0.4768650233745575, + "step": 245 + }, + { + "ce_ib": 25.422399520874023, + "ce_orig": 1.1885180473327637, + "epoch": 0.03537023723939612, + "kl_loss": 8.077836990356445, + "loss_ib": 0.41660305857658386, + "step": 246 + }, + { + "ce_ib": 25.16580581665039, + "ce_orig": 0.6401370763778687, + "epoch": 0.03551401869158879, + "kl_loss": 8.176619529724121, + "loss_ib": 0.4214138984680176, + "step": 247 + }, + { + "ce_ib": 21.501310348510742, + "ce_orig": 0.7412813901901245, + "epoch": 0.035657800143781455, + "kl_loss": 10.056595802307129, + "loss_ib": 0.5135805010795593, + "step": 248 + }, + { + "ce_ib": 24.469106674194336, + "ce_orig": 0.9428795576095581, + "epoch": 0.035801581595974116, + "kl_loss": 8.948450088500977, + "loss_ib": 0.45965704321861267, + "step": 249 + }, + { + "epoch": 0.035945363048166784, + "grad_norm": 2.816962957382202, + "learning_rate": 1.9169329073482428e-05, + "loss": 1.397, + "step": 250 + }, + { + "ce_ib": 22.195499420166016, + "ce_orig": 0.7263768911361694, + "epoch": 0.035945363048166784, + "kl_loss": 8.741401672363281, + "loss_ib": 0.4481678605079651, + "step": 250 + }, + { + "ce_ib": 22.70380973815918, + "ce_orig": 0.6979135274887085, + "epoch": 0.03608914450035945, + "kl_loss": 9.159427642822266, + "loss_ib": 0.4693233072757721, + "step": 251 + }, + { + "ce_ib": 24.72095489501953, + "ce_orig": 0.8336602449417114, + "epoch": 0.03623292595255212, + "kl_loss": 8.704825401306152, + "loss_ib": 0.4476017653942108, + "step": 252 + }, + { + "ce_ib": 25.478559494018555, + "ce_orig": 1.2140023708343506, + "epoch": 0.03637670740474479, + "kl_loss": 8.463350296020508, + "loss_ib": 0.4359067976474762, + "step": 253 + }, + { + "ce_ib": 24.30400276184082, + "ce_orig": 1.159781813621521, + "epoch": 0.036520488856937455, + "kl_loss": 9.485654830932617, + "loss_ib": 0.4864347577095032, + "step": 254 + }, + { + "epoch": 0.03666427030913012, + "grad_norm": 2.2414402961730957, + "learning_rate": 1.9568690095846644e-05, + "loss": 1.421, + "step": 255 + }, + { + "ce_ib": 22.23937225341797, + "ce_orig": 0.5963650941848755, + "epoch": 0.03666427030913012, + "kl_loss": 8.78721809387207, + "loss_ib": 0.4504806101322174, + "step": 255 + }, + { + "ce_ib": 23.723373413085938, + "ce_orig": 1.1367262601852417, + "epoch": 0.03680805176132279, + "kl_loss": 8.774513244628906, + "loss_ib": 0.45058736205101013, + "step": 256 + }, + { + "ce_ib": 25.06109619140625, + "ce_orig": 1.4180657863616943, + "epoch": 0.03695183321351546, + "kl_loss": 8.770709991455078, + "loss_ib": 0.4510660767555237, + "step": 257 + }, + { + "ce_ib": 23.2429141998291, + "ce_orig": 1.0652941465377808, + "epoch": 0.037095614665708126, + "kl_loss": 8.671271324157715, + "loss_ib": 0.4451850354671478, + "step": 258 + }, + { + "ce_ib": 21.69077491760254, + "ce_orig": 0.3725070655345917, + "epoch": 0.03723939611790079, + "kl_loss": 8.793878555297852, + "loss_ib": 0.45053932070732117, + "step": 259 + }, + { + "epoch": 0.037383177570093455, + "grad_norm": 3.798487663269043, + "learning_rate": 1.9968051118210863e-05, + "loss": 1.4654, + "step": 260 + }, + { + "ce_ib": 23.512039184570312, + "ce_orig": 1.0731854438781738, + "epoch": 0.037383177570093455, + "kl_loss": 8.859106063842773, + "loss_ib": 0.45471134781837463, + "step": 260 + }, + { + "ce_ib": 22.52284049987793, + "ce_orig": 1.0415581464767456, + "epoch": 0.03752695902228612, + "kl_loss": 8.765377044677734, + "loss_ib": 0.4495302736759186, + "step": 261 + }, + { + "ce_ib": 24.735340118408203, + "ce_orig": 1.2017862796783447, + "epoch": 0.03767074047447879, + "kl_loss": 8.242500305175781, + "loss_ib": 0.4244926869869232, + "step": 262 + }, + { + "ce_ib": 21.546173095703125, + "ce_orig": 1.224229097366333, + "epoch": 0.03781452192667146, + "kl_loss": 8.425148963928223, + "loss_ib": 0.4320305287837982, + "step": 263 + }, + { + "ce_ib": 21.48952293395996, + "ce_orig": 0.8686205148696899, + "epoch": 0.037958303378864126, + "kl_loss": 8.037482261657715, + "loss_ib": 0.41261887550354004, + "step": 264 + }, + { + "epoch": 0.038102084831056794, + "grad_norm": 3.353695869445801, + "learning_rate": 2.0367412140575082e-05, + "loss": 1.396, + "step": 265 + }, + { + "ce_ib": 21.23736000061035, + "ce_orig": 1.0968440771102905, + "epoch": 0.038102084831056794, + "kl_loss": 8.250904083251953, + "loss_ib": 0.42316389083862305, + "step": 265 + }, + { + "ce_ib": 23.215932846069336, + "ce_orig": 1.1477916240692139, + "epoch": 0.03824586628324946, + "kl_loss": 8.169689178466797, + "loss_ib": 0.4200924336910248, + "step": 266 + }, + { + "ce_ib": 23.65303611755371, + "ce_orig": 0.9717550277709961, + "epoch": 0.03838964773544213, + "kl_loss": 7.602072715759277, + "loss_ib": 0.39193016290664673, + "step": 267 + }, + { + "ce_ib": 20.33776092529297, + "ce_orig": 0.7842280864715576, + "epoch": 0.0385334291876348, + "kl_loss": 7.547301769256592, + "loss_ib": 0.3875339925289154, + "step": 268 + }, + { + "ce_ib": 21.66446304321289, + "ce_orig": 0.9328132271766663, + "epoch": 0.038677210639827465, + "kl_loss": 7.6735310554504395, + "loss_ib": 0.39450880885124207, + "step": 269 + }, + { + "epoch": 0.038820992092020126, + "grad_norm": 3.106287717819214, + "learning_rate": 2.07667731629393e-05, + "loss": 1.361, + "step": 270 + }, + { + "ce_ib": 14.545405387878418, + "ce_orig": 0.4456147849559784, + "epoch": 0.038820992092020126, + "kl_loss": 6.566807746887207, + "loss_ib": 0.33561310172080994, + "step": 270 + }, + { + "ce_ib": 22.359256744384766, + "ce_orig": 1.0915645360946655, + "epoch": 0.038964773544212794, + "kl_loss": 7.441349029541016, + "loss_ib": 0.38324710726737976, + "step": 271 + }, + { + "ce_ib": 19.833614349365234, + "ce_orig": 0.6568068861961365, + "epoch": 0.03910855499640546, + "kl_loss": 7.507279396057129, + "loss_ib": 0.3852807879447937, + "step": 272 + }, + { + "ce_ib": 18.543352127075195, + "ce_orig": 0.6701676845550537, + "epoch": 0.03925233644859813, + "kl_loss": 7.031156539916992, + "loss_ib": 0.36082950234413147, + "step": 273 + }, + { + "ce_ib": 19.24417495727539, + "ce_orig": 0.9314641952514648, + "epoch": 0.0393961179007908, + "kl_loss": 7.164027214050293, + "loss_ib": 0.36782345175743103, + "step": 274 + }, + { + "epoch": 0.039539899352983465, + "grad_norm": 3.4644718170166016, + "learning_rate": 2.1166134185303514e-05, + "loss": 1.2343, + "step": 275 + }, + { + "ce_ib": 20.395038604736328, + "ce_orig": 0.9956320524215698, + "epoch": 0.039539899352983465, + "kl_loss": 7.304188251495361, + "loss_ib": 0.375406950712204, + "step": 275 + }, + { + "ce_ib": 22.580080032348633, + "ce_orig": 1.0877642631530762, + "epoch": 0.03968368080517613, + "kl_loss": 6.740540504455566, + "loss_ib": 0.34831708669662476, + "step": 276 + }, + { + "ce_ib": 17.001928329467773, + "ce_orig": 0.6867518424987793, + "epoch": 0.0398274622573688, + "kl_loss": 6.005027770996094, + "loss_ib": 0.3087523579597473, + "step": 277 + }, + { + "ce_ib": 20.226699829101562, + "ce_orig": 0.5907849669456482, + "epoch": 0.03997124370956147, + "kl_loss": 6.040300369262695, + "loss_ib": 0.3121283948421478, + "step": 278 + }, + { + "ce_ib": 20.84942626953125, + "ce_orig": 0.8668643832206726, + "epoch": 0.040115025161754136, + "kl_loss": 5.51943302154541, + "loss_ib": 0.2863963544368744, + "step": 279 + }, + { + "epoch": 0.040258806613946804, + "grad_norm": 2.836003541946411, + "learning_rate": 2.1565495207667734e-05, + "loss": 1.2083, + "step": 280 + }, + { + "ce_ib": 21.198511123657227, + "ce_orig": 1.0262176990509033, + "epoch": 0.040258806613946804, + "kl_loss": 5.828641414642334, + "loss_ib": 0.30203133821487427, + "step": 280 + }, + { + "ce_ib": 20.3875675201416, + "ce_orig": 1.1043504476547241, + "epoch": 0.040402588066139465, + "kl_loss": 5.128015041351318, + "loss_ib": 0.2665945291519165, + "step": 281 + }, + { + "ce_ib": 21.915267944335938, + "ce_orig": 0.9482531547546387, + "epoch": 0.04054636951833213, + "kl_loss": 4.9252400398254395, + "loss_ib": 0.2572196424007416, + "step": 282 + }, + { + "ce_ib": 20.219453811645508, + "ce_orig": 0.9774989485740662, + "epoch": 0.0406901509705248, + "kl_loss": 4.067303657531738, + "loss_ib": 0.21347491443157196, + "step": 283 + }, + { + "ce_ib": 20.512344360351562, + "ce_orig": 0.8953350186347961, + "epoch": 0.04083393242271747, + "kl_loss": 2.980595588684082, + "loss_ib": 0.15928594768047333, + "step": 284 + }, + { + "epoch": 0.040977713874910136, + "grad_norm": 3.126970052719116, + "learning_rate": 2.196485623003195e-05, + "loss": 1.0977, + "step": 285 + }, + { + "ce_ib": 22.42814826965332, + "ce_orig": 0.8264601230621338, + "epoch": 0.040977713874910136, + "kl_loss": 2.260897397994995, + "loss_ib": 0.12425895035266876, + "step": 285 + }, + { + "ce_ib": 19.28723907470703, + "ce_orig": 1.0294511318206787, + "epoch": 0.041121495327102804, + "kl_loss": 1.4155142307281494, + "loss_ib": 0.08041933178901672, + "step": 286 + }, + { + "ce_ib": 22.32074546813965, + "ce_orig": 1.2082335948944092, + "epoch": 0.04126527677929547, + "kl_loss": 1.4922515153884888, + "loss_ib": 0.08577295392751694, + "step": 287 + }, + { + "ce_ib": 21.85085105895996, + "ce_orig": 1.1711450815200806, + "epoch": 0.04140905823148814, + "kl_loss": 1.0158560276031494, + "loss_ib": 0.06171822547912598, + "step": 288 + }, + { + "ce_ib": 19.448490142822266, + "ce_orig": 0.8083485960960388, + "epoch": 0.04155283968368081, + "kl_loss": 0.8125163316726685, + "loss_ib": 0.050350066274404526, + "step": 289 + }, + { + "epoch": 0.041696621135873475, + "grad_norm": 0.41338738799095154, + "learning_rate": 2.2364217252396165e-05, + "loss": 0.9838, + "step": 290 + }, + { + "ce_ib": 19.226776123046875, + "ce_orig": 1.1908280849456787, + "epoch": 0.041696621135873475, + "kl_loss": 0.7504492998123169, + "loss_ib": 0.0471358560025692, + "step": 290 + }, + { + "ce_ib": 19.87446403503418, + "ce_orig": 1.02711021900177, + "epoch": 0.041840402588066136, + "kl_loss": 0.6533533930778503, + "loss_ib": 0.04260490462183952, + "step": 291 + }, + { + "ce_ib": 17.87228775024414, + "ce_orig": 1.0570042133331299, + "epoch": 0.041984184040258804, + "kl_loss": 0.5579368472099304, + "loss_ib": 0.036832984536886215, + "step": 292 + }, + { + "ce_ib": 16.936511993408203, + "ce_orig": 0.5302789807319641, + "epoch": 0.04212796549245147, + "kl_loss": 0.6725109219551086, + "loss_ib": 0.042093802243471146, + "step": 293 + }, + { + "ce_ib": 18.851577758789062, + "ce_orig": 1.0163416862487793, + "epoch": 0.04227174694464414, + "kl_loss": 0.5098384618759155, + "loss_ib": 0.03491771221160889, + "step": 294 + }, + { + "epoch": 0.04241552839683681, + "grad_norm": 0.21297426521778107, + "learning_rate": 2.2763578274760385e-05, + "loss": 0.9126, + "step": 295 + }, + { + "ce_ib": 15.781968116760254, + "ce_orig": 0.5711618661880493, + "epoch": 0.04241552839683681, + "kl_loss": 0.4373496174812317, + "loss_ib": 0.02975846640765667, + "step": 295 + }, + { + "ce_ib": 22.636154174804688, + "ce_orig": 1.496230959892273, + "epoch": 0.042559309849029475, + "kl_loss": 0.4643814265727997, + "loss_ib": 0.03453714773058891, + "step": 296 + }, + { + "ce_ib": 14.683899879455566, + "ce_orig": 0.6033921241760254, + "epoch": 0.04270309130122214, + "kl_loss": 0.39723098278045654, + "loss_ib": 0.027203500270843506, + "step": 297 + }, + { + "ce_ib": 17.57473373413086, + "ce_orig": 0.7110596895217896, + "epoch": 0.04284687275341481, + "kl_loss": 0.4147634506225586, + "loss_ib": 0.02952553890645504, + "step": 298 + }, + { + "ce_ib": 16.251811981201172, + "ce_orig": 0.7895355820655823, + "epoch": 0.04299065420560748, + "kl_loss": 0.4082551896572113, + "loss_ib": 0.028538664802908897, + "step": 299 + }, + { + "epoch": 0.043134435657800146, + "grad_norm": 0.1568593829870224, + "learning_rate": 2.3162939297124604e-05, + "loss": 0.9485, + "step": 300 + }, + { + "ce_ib": 18.58441162109375, + "ce_orig": 1.089181900024414, + "epoch": 0.043134435657800146, + "kl_loss": 0.4467368721961975, + "loss_ib": 0.0316290520131588, + "step": 300 + }, + { + "ce_ib": 18.67475700378418, + "ce_orig": 0.566021740436554, + "epoch": 0.043278217109992814, + "kl_loss": 0.3963052034378052, + "loss_ib": 0.029152637347579002, + "step": 301 + }, + { + "ce_ib": 16.824068069458008, + "ce_orig": 0.7251248359680176, + "epoch": 0.043421998562185475, + "kl_loss": 0.34888386726379395, + "loss_ib": 0.025856226682662964, + "step": 302 + }, + { + "ce_ib": 18.8211669921875, + "ce_orig": 0.6579341888427734, + "epoch": 0.04356578001437814, + "kl_loss": 0.3751975893974304, + "loss_ib": 0.02817046456038952, + "step": 303 + }, + { + "ce_ib": 18.44203758239746, + "ce_orig": 0.8343374729156494, + "epoch": 0.04370956146657081, + "kl_loss": 0.3514009118080139, + "loss_ib": 0.02679106593132019, + "step": 304 + }, + { + "epoch": 0.04385334291876348, + "grad_norm": 0.15141652524471283, + "learning_rate": 2.356230031948882e-05, + "loss": 0.9599, + "step": 305 + }, + { + "ce_ib": 16.4177188873291, + "ce_orig": 0.6326029896736145, + "epoch": 0.04385334291876348, + "kl_loss": 0.331454873085022, + "loss_ib": 0.02478160336613655, + "step": 305 + }, + { + "ce_ib": 21.196409225463867, + "ce_orig": 1.230805516242981, + "epoch": 0.043997124370956146, + "kl_loss": 0.40000325441360474, + "loss_ib": 0.030598366633057594, + "step": 306 + }, + { + "ce_ib": 17.367687225341797, + "ce_orig": 0.863945484161377, + "epoch": 0.044140905823148814, + "kl_loss": 0.31744185090065, + "loss_ib": 0.024555936455726624, + "step": 307 + }, + { + "ce_ib": 18.942859649658203, + "ce_orig": 1.148938536643982, + "epoch": 0.04428468727534148, + "kl_loss": 0.3378192186355591, + "loss_ib": 0.02636238932609558, + "step": 308 + }, + { + "ce_ib": 19.47002410888672, + "ce_orig": 1.0959943532943726, + "epoch": 0.04442846872753415, + "kl_loss": 0.31226494908332825, + "loss_ib": 0.025348259136080742, + "step": 309 + }, + { + "epoch": 0.04457225017972682, + "grad_norm": 0.13533012568950653, + "learning_rate": 2.3961661341853036e-05, + "loss": 1.015, + "step": 310 + }, + { + "ce_ib": 17.681161880493164, + "ce_orig": 0.9902611374855042, + "epoch": 0.04457225017972682, + "kl_loss": 0.26686644554138184, + "loss_ib": 0.02218390442430973, + "step": 310 + }, + { + "ce_ib": 16.987899780273438, + "ce_orig": 0.6236902475357056, + "epoch": 0.044716031631919485, + "kl_loss": 0.33637407422065735, + "loss_ib": 0.025312652811408043, + "step": 311 + }, + { + "ce_ib": 18.65533447265625, + "ce_orig": 0.8152080774307251, + "epoch": 0.044859813084112146, + "kl_loss": 0.35383230447769165, + "loss_ib": 0.027019282802939415, + "step": 312 + }, + { + "ce_ib": 20.103788375854492, + "ce_orig": 0.8615632653236389, + "epoch": 0.045003594536304814, + "kl_loss": 0.2777783274650574, + "loss_ib": 0.023940810933709145, + "step": 313 + }, + { + "ce_ib": 16.64844512939453, + "ce_orig": 0.9147619605064392, + "epoch": 0.04514737598849748, + "kl_loss": 0.27109894156455994, + "loss_ib": 0.0218791700899601, + "step": 314 + }, + { + "epoch": 0.04529115744069015, + "grad_norm": 0.11459668725728989, + "learning_rate": 2.4361022364217255e-05, + "loss": 0.8602, + "step": 315 + }, + { + "ce_ib": 11.063411712646484, + "ce_orig": 0.27872321009635925, + "epoch": 0.04529115744069015, + "kl_loss": 0.3863391876220703, + "loss_ib": 0.024848666042089462, + "step": 315 + }, + { + "ce_ib": 17.38253402709961, + "ce_orig": 1.05631422996521, + "epoch": 0.04543493889288282, + "kl_loss": 0.29607954621315, + "loss_ib": 0.023495245724916458, + "step": 316 + }, + { + "ce_ib": 17.782546997070312, + "ce_orig": 0.9817179441452026, + "epoch": 0.045578720345075485, + "kl_loss": 0.28195664286613464, + "loss_ib": 0.02298910729587078, + "step": 317 + }, + { + "ce_ib": 17.297069549560547, + "ce_orig": 0.6919381618499756, + "epoch": 0.04572250179726815, + "kl_loss": 0.29801255464553833, + "loss_ib": 0.023549163714051247, + "step": 318 + }, + { + "ce_ib": 17.82324981689453, + "ce_orig": 1.2199736833572388, + "epoch": 0.04586628324946082, + "kl_loss": 0.2285030335187912, + "loss_ib": 0.020336776971817017, + "step": 319 + }, + { + "epoch": 0.04601006470165349, + "grad_norm": 0.10239739716053009, + "learning_rate": 2.476038338658147e-05, + "loss": 0.7986, + "step": 320 + }, + { + "ce_ib": 16.177907943725586, + "ce_orig": 0.7963648438453674, + "epoch": 0.04601006470165349, + "kl_loss": 0.2211613953113556, + "loss_ib": 0.019147023558616638, + "step": 320 + }, + { + "ce_ib": 17.704599380493164, + "ce_orig": 0.9023761749267578, + "epoch": 0.046153846153846156, + "kl_loss": 0.28145700693130493, + "loss_ib": 0.022925151512026787, + "step": 321 + }, + { + "ce_ib": 17.584495544433594, + "ce_orig": 0.8088329434394836, + "epoch": 0.046297627606038824, + "kl_loss": 0.2326435148715973, + "loss_ib": 0.02042442373931408, + "step": 322 + }, + { + "ce_ib": 16.041574478149414, + "ce_orig": 0.7201854586601257, + "epoch": 0.046441409058231485, + "kl_loss": 0.3453258275985718, + "loss_ib": 0.02528708055615425, + "step": 323 + }, + { + "ce_ib": 17.880464553833008, + "ce_orig": 0.917682409286499, + "epoch": 0.04658519051042415, + "kl_loss": 0.2218429148197174, + "loss_ib": 0.020032377913594246, + "step": 324 + }, + { + "epoch": 0.04672897196261682, + "grad_norm": 0.12532667815685272, + "learning_rate": 2.515974440894569e-05, + "loss": 0.8682, + "step": 325 + }, + { + "ce_ib": 15.092937469482422, + "ce_orig": 0.7272103428840637, + "epoch": 0.04672897196261682, + "kl_loss": 0.31795835494995117, + "loss_ib": 0.023444388061761856, + "step": 325 + }, + { + "ce_ib": 15.842549324035645, + "ce_orig": 0.9047788977622986, + "epoch": 0.04687275341480949, + "kl_loss": 0.2592480480670929, + "loss_ib": 0.020883677527308464, + "step": 326 + }, + { + "ce_ib": 14.095027923583984, + "ce_orig": 0.7625716328620911, + "epoch": 0.047016534867002156, + "kl_loss": 0.17521969974040985, + "loss_ib": 0.015808498486876488, + "step": 327 + }, + { + "ce_ib": 16.17887306213379, + "ce_orig": 0.7734029293060303, + "epoch": 0.047160316319194824, + "kl_loss": 0.23375201225280762, + "loss_ib": 0.01977703720331192, + "step": 328 + }, + { + "ce_ib": 18.957979202270508, + "ce_orig": 0.8559271097183228, + "epoch": 0.04730409777138749, + "kl_loss": 0.33970198035240173, + "loss_ib": 0.026464087888598442, + "step": 329 + }, + { + "epoch": 0.04744787922358016, + "grad_norm": 0.09043259918689728, + "learning_rate": 2.5559105431309903e-05, + "loss": 0.7934, + "step": 330 + }, + { + "ce_ib": 14.730635643005371, + "ce_orig": 0.8875608444213867, + "epoch": 0.04744787922358016, + "kl_loss": 0.2284042239189148, + "loss_ib": 0.01878552883863449, + "step": 330 + }, + { + "ce_ib": 12.517528533935547, + "ce_orig": 0.6206594705581665, + "epoch": 0.04759166067577283, + "kl_loss": 0.21303001046180725, + "loss_ib": 0.016910264268517494, + "step": 331 + }, + { + "ce_ib": 14.753087043762207, + "ce_orig": 0.841992199420929, + "epoch": 0.047735442127965495, + "kl_loss": 0.20966459810733795, + "loss_ib": 0.017859773710370064, + "step": 332 + }, + { + "ce_ib": 14.960477828979492, + "ce_orig": 1.0839353799819946, + "epoch": 0.047879223580158156, + "kl_loss": 0.15192289650440216, + "loss_ib": 0.01507638394832611, + "step": 333 + }, + { + "ce_ib": 15.1857328414917, + "ce_orig": 0.9817880392074585, + "epoch": 0.048023005032350824, + "kl_loss": 0.19470617175102234, + "loss_ib": 0.017328176647424698, + "step": 334 + }, + { + "epoch": 0.04816678648454349, + "grad_norm": 0.10444720834493637, + "learning_rate": 2.5958466453674125e-05, + "loss": 0.8294, + "step": 335 + }, + { + "ce_ib": 15.169656753540039, + "ce_orig": 0.8038283586502075, + "epoch": 0.04816678648454349, + "kl_loss": 0.2266630232334137, + "loss_ib": 0.018917979672551155, + "step": 335 + }, + { + "ce_ib": 14.643211364746094, + "ce_orig": 0.7942169904708862, + "epoch": 0.04831056793673616, + "kl_loss": 0.22511643171310425, + "loss_ib": 0.01857742667198181, + "step": 336 + }, + { + "ce_ib": 12.96651554107666, + "ce_orig": 0.8289546370506287, + "epoch": 0.04845434938892883, + "kl_loss": 0.17975889146327972, + "loss_ib": 0.015471202321350574, + "step": 337 + }, + { + "ce_ib": 16.585371017456055, + "ce_orig": 1.165753722190857, + "epoch": 0.048598130841121495, + "kl_loss": 0.190724715590477, + "loss_ib": 0.017828920856118202, + "step": 338 + }, + { + "ce_ib": 14.718335151672363, + "ce_orig": 0.8502370119094849, + "epoch": 0.04874191229331416, + "kl_loss": 0.20640771090984344, + "loss_ib": 0.017679553478956223, + "step": 339 + }, + { + "epoch": 0.04888569374550683, + "grad_norm": 0.09888631105422974, + "learning_rate": 2.6357827476038338e-05, + "loss": 0.8718, + "step": 340 + }, + { + "ce_ib": 13.714351654052734, + "ce_orig": 1.0317654609680176, + "epoch": 0.04888569374550683, + "kl_loss": 0.19221842288970947, + "loss_ib": 0.016468096524477005, + "step": 340 + }, + { + "ce_ib": 15.859162330627441, + "ce_orig": 0.6472983956336975, + "epoch": 0.0490294751976995, + "kl_loss": 0.2313256412744522, + "loss_ib": 0.01949586346745491, + "step": 341 + }, + { + "ce_ib": 16.512859344482422, + "ce_orig": 1.1302889585494995, + "epoch": 0.049173256649892166, + "kl_loss": 0.1906917691230774, + "loss_ib": 0.0177910178899765, + "step": 342 + }, + { + "ce_ib": 16.816452026367188, + "ce_orig": 0.8666338920593262, + "epoch": 0.049317038102084834, + "kl_loss": 0.17613860964775085, + "loss_ib": 0.01721515692770481, + "step": 343 + }, + { + "ce_ib": 13.920699119567871, + "ce_orig": 0.719912588596344, + "epoch": 0.049460819554277495, + "kl_loss": 0.18724943697452545, + "loss_ib": 0.016322821378707886, + "step": 344 + }, + { + "epoch": 0.04960460100647016, + "grad_norm": 0.10115125775337219, + "learning_rate": 2.6757188498402557e-05, + "loss": 0.843, + "step": 345 + }, + { + "ce_ib": 16.091005325317383, + "ce_orig": 1.0146100521087646, + "epoch": 0.04960460100647016, + "kl_loss": 0.1512732356786728, + "loss_ib": 0.015609164722263813, + "step": 345 + }, + { + "ce_ib": 15.5990571975708, + "ce_orig": 0.7898549437522888, + "epoch": 0.04974838245866283, + "kl_loss": 0.25755369663238525, + "loss_ib": 0.020677214488387108, + "step": 346 + }, + { + "ce_ib": 17.706438064575195, + "ce_orig": 1.2053872346878052, + "epoch": 0.0498921639108555, + "kl_loss": 0.21856242418289185, + "loss_ib": 0.019781339913606644, + "step": 347 + }, + { + "ce_ib": 10.220382690429688, + "ce_orig": 0.5169559121131897, + "epoch": 0.050035945363048166, + "kl_loss": 0.2967785596847534, + "loss_ib": 0.01994911953806877, + "step": 348 + }, + { + "ce_ib": 14.920300483703613, + "ce_orig": 0.7747300267219543, + "epoch": 0.050179726815240834, + "kl_loss": 0.13559795916080475, + "loss_ib": 0.01424004789441824, + "step": 349 + }, + { + "epoch": 0.0503235082674335, + "grad_norm": 0.09113696962594986, + "learning_rate": 2.7156549520766773e-05, + "loss": 0.898, + "step": 350 + }, + { + "ce_ib": 16.00478172302246, + "ce_orig": 1.2635383605957031, + "epoch": 0.0503235082674335, + "kl_loss": 0.2005615234375, + "loss_ib": 0.018030468374490738, + "step": 350 + }, + { + "ce_ib": 16.192142486572266, + "ce_orig": 0.9619611501693726, + "epoch": 0.05046728971962617, + "kl_loss": 0.25098395347595215, + "loss_ib": 0.020645270124077797, + "step": 351 + }, + { + "ce_ib": 13.817422866821289, + "ce_orig": 0.6711569428443909, + "epoch": 0.05061107117181884, + "kl_loss": 0.21178191900253296, + "loss_ib": 0.01749780774116516, + "step": 352 + }, + { + "ce_ib": 15.532769203186035, + "ce_orig": 0.5863211750984192, + "epoch": 0.050754852624011505, + "kl_loss": 0.33604636788368225, + "loss_ib": 0.024568704888224602, + "step": 353 + }, + { + "ce_ib": 14.479625701904297, + "ce_orig": 1.0277538299560547, + "epoch": 0.05089863407620417, + "kl_loss": 0.251324862241745, + "loss_ib": 0.01980605535209179, + "step": 354 + }, + { + "epoch": 0.051042415528396834, + "grad_norm": 0.10617897659540176, + "learning_rate": 2.7555910543130992e-05, + "loss": 0.8813, + "step": 355 + }, + { + "ce_ib": 16.734331130981445, + "ce_orig": 0.9177318215370178, + "epoch": 0.051042415528396834, + "kl_loss": 0.2959950864315033, + "loss_ib": 0.023166919127106667, + "step": 355 + }, + { + "ce_ib": 14.452986717224121, + "ce_orig": 0.8091175556182861, + "epoch": 0.0511861969805895, + "kl_loss": 0.19851821660995483, + "loss_ib": 0.01715240441262722, + "step": 356 + }, + { + "ce_ib": 15.050978660583496, + "ce_orig": 1.065936803817749, + "epoch": 0.05132997843278217, + "kl_loss": 0.1562933474779129, + "loss_ib": 0.015340156853199005, + "step": 357 + }, + { + "ce_ib": 16.42909812927246, + "ce_orig": 0.6200724840164185, + "epoch": 0.05147375988497484, + "kl_loss": 0.22564369440078735, + "loss_ib": 0.0194967333227396, + "step": 358 + }, + { + "ce_ib": 15.036846160888672, + "ce_orig": 0.8413035273551941, + "epoch": 0.051617541337167505, + "kl_loss": 0.15089106559753418, + "loss_ib": 0.015062975697219372, + "step": 359 + }, + { + "epoch": 0.05176132278936017, + "grad_norm": 0.0994093269109726, + "learning_rate": 2.7955271565495212e-05, + "loss": 0.8787, + "step": 360 + }, + { + "ce_ib": 16.430448532104492, + "ce_orig": 1.058258295059204, + "epoch": 0.05176132278936017, + "kl_loss": 0.19877898693084717, + "loss_ib": 0.018154174089431763, + "step": 360 + }, + { + "ce_ib": 16.932239532470703, + "ce_orig": 1.2644236087799072, + "epoch": 0.05190510424155284, + "kl_loss": 0.19584302604198456, + "loss_ib": 0.018258271738886833, + "step": 361 + }, + { + "ce_ib": 14.099466323852539, + "ce_orig": 0.9995023608207703, + "epoch": 0.05204888569374551, + "kl_loss": 0.16139906644821167, + "loss_ib": 0.015119686722755432, + "step": 362 + }, + { + "ce_ib": 15.650708198547363, + "ce_orig": 0.9404743313789368, + "epoch": 0.052192667145938176, + "kl_loss": 0.21341916918754578, + "loss_ib": 0.018496312201023102, + "step": 363 + }, + { + "ce_ib": 16.308624267578125, + "ce_orig": 1.1696351766586304, + "epoch": 0.052336448598130844, + "kl_loss": 0.18094685673713684, + "loss_ib": 0.017201654613018036, + "step": 364 + }, + { + "epoch": 0.052480230050323505, + "grad_norm": 0.08824347704648972, + "learning_rate": 2.8354632587859424e-05, + "loss": 0.92, + "step": 365 + }, + { + "ce_ib": 16.583515167236328, + "ce_orig": 1.182395577430725, + "epoch": 0.052480230050323505, + "kl_loss": 0.19043317437171936, + "loss_ib": 0.017813416197896004, + "step": 365 + }, + { + "ce_ib": 13.93136978149414, + "ce_orig": 0.8936623930931091, + "epoch": 0.05262401150251617, + "kl_loss": 0.20374388992786407, + "loss_ib": 0.017152879387140274, + "step": 366 + }, + { + "ce_ib": 13.256582260131836, + "ce_orig": 0.6269909143447876, + "epoch": 0.05276779295470884, + "kl_loss": 0.16010135412216187, + "loss_ib": 0.014633359387516975, + "step": 367 + }, + { + "ce_ib": 12.879414558410645, + "ce_orig": 0.6315767765045166, + "epoch": 0.05291157440690151, + "kl_loss": 0.16557064652442932, + "loss_ib": 0.014718241058290005, + "step": 368 + }, + { + "ce_ib": 15.146659851074219, + "ce_orig": 1.0897746086120605, + "epoch": 0.053055355859094176, + "kl_loss": 0.16495399177074432, + "loss_ib": 0.015821030363440514, + "step": 369 + }, + { + "epoch": 0.053199137311286844, + "grad_norm": 0.09635983407497406, + "learning_rate": 2.8753993610223644e-05, + "loss": 0.933, + "step": 370 + }, + { + "ce_ib": 14.545082092285156, + "ce_orig": 0.903797447681427, + "epoch": 0.053199137311286844, + "kl_loss": 0.12444749474525452, + "loss_ib": 0.013494915328919888, + "step": 370 + }, + { + "ce_ib": 12.932695388793945, + "ce_orig": 0.6494324207305908, + "epoch": 0.05334291876347951, + "kl_loss": 0.1315731406211853, + "loss_ib": 0.013045004568994045, + "step": 371 + }, + { + "ce_ib": 15.617569923400879, + "ce_orig": 0.7816907167434692, + "epoch": 0.05348670021567218, + "kl_loss": 0.16312208771705627, + "loss_ib": 0.015964889898896217, + "step": 372 + }, + { + "ce_ib": 15.400293350219727, + "ce_orig": 1.033065915107727, + "epoch": 0.05363048166786485, + "kl_loss": 0.17930516600608826, + "loss_ib": 0.01666540466248989, + "step": 373 + }, + { + "ce_ib": 11.445207595825195, + "ce_orig": 0.5388709902763367, + "epoch": 0.053774263120057515, + "kl_loss": 0.2616915702819824, + "loss_ib": 0.0188071820884943, + "step": 374 + }, + { + "epoch": 0.05391804457225018, + "grad_norm": 0.0901573896408081, + "learning_rate": 2.915335463258786e-05, + "loss": 0.926, + "step": 375 + }, + { + "ce_ib": 13.34332275390625, + "ce_orig": 1.1417230367660522, + "epoch": 0.05391804457225018, + "kl_loss": 0.1469050794839859, + "loss_ib": 0.014016914181411266, + "step": 375 + }, + { + "ce_ib": 11.211030006408691, + "ce_orig": 0.6393249034881592, + "epoch": 0.054061826024442844, + "kl_loss": 0.13886746764183044, + "loss_ib": 0.012548888102173805, + "step": 376 + }, + { + "ce_ib": 15.887382507324219, + "ce_orig": 0.9176316261291504, + "epoch": 0.05420560747663551, + "kl_loss": 0.2912940979003906, + "loss_ib": 0.022508395835757256, + "step": 377 + }, + { + "ce_ib": 11.42358112335205, + "ce_orig": 0.8122538924217224, + "epoch": 0.05434938892882818, + "kl_loss": 0.1490350216627121, + "loss_ib": 0.013163541443645954, + "step": 378 + }, + { + "ce_ib": 14.985864639282227, + "ce_orig": 0.9277105927467346, + "epoch": 0.05449317038102085, + "kl_loss": 0.14583294093608856, + "loss_ib": 0.014784579165279865, + "step": 379 + }, + { + "epoch": 0.054636951833213515, + "grad_norm": 0.12827961146831512, + "learning_rate": 2.955271565495208e-05, + "loss": 0.9204, + "step": 380 + }, + { + "ce_ib": 13.780610084533691, + "ce_orig": 0.8345714807510376, + "epoch": 0.054636951833213515, + "kl_loss": 0.1690724641084671, + "loss_ib": 0.015343928709626198, + "step": 380 + }, + { + "ce_ib": 11.765593528747559, + "ce_orig": 0.7576747536659241, + "epoch": 0.05478073328540618, + "kl_loss": 0.15963752567768097, + "loss_ib": 0.013864672742784023, + "step": 381 + }, + { + "ce_ib": 12.893147468566895, + "ce_orig": 1.0378029346466064, + "epoch": 0.05492451473759885, + "kl_loss": 0.13528262078762054, + "loss_ib": 0.013210705481469631, + "step": 382 + }, + { + "ce_ib": 15.782855987548828, + "ce_orig": 0.8279376029968262, + "epoch": 0.05506829618979152, + "kl_loss": 0.16392827033996582, + "loss_ib": 0.016087843105196953, + "step": 383 + }, + { + "ce_ib": 12.443214416503906, + "ce_orig": 0.8166038990020752, + "epoch": 0.055212077641984186, + "kl_loss": 0.14554069936275482, + "loss_ib": 0.013498641550540924, + "step": 384 + }, + { + "epoch": 0.055355859094176854, + "grad_norm": 0.13148367404937744, + "learning_rate": 2.9952076677316295e-05, + "loss": 0.8973, + "step": 385 + }, + { + "ce_ib": 16.574996948242188, + "ce_orig": 1.3873276710510254, + "epoch": 0.055355859094176854, + "kl_loss": 0.206925630569458, + "loss_ib": 0.018633781000971794, + "step": 385 + }, + { + "ce_ib": 14.954483032226562, + "ce_orig": 1.4074153900146484, + "epoch": 0.055499640546369515, + "kl_loss": 0.1725064069032669, + "loss_ib": 0.016102561727166176, + "step": 386 + }, + { + "ce_ib": 13.222760200500488, + "ce_orig": 0.581721842288971, + "epoch": 0.05564342199856218, + "kl_loss": 0.3106327950954437, + "loss_ib": 0.02214301936328411, + "step": 387 + }, + { + "ce_ib": 12.130496978759766, + "ce_orig": 0.8030052185058594, + "epoch": 0.05578720345075485, + "kl_loss": 0.19192326068878174, + "loss_ib": 0.015661410987377167, + "step": 388 + }, + { + "ce_ib": 12.304028511047363, + "ce_orig": 0.838097095489502, + "epoch": 0.05593098490294752, + "kl_loss": 0.17265933752059937, + "loss_ib": 0.0147849814966321, + "step": 389 + }, + { + "epoch": 0.056074766355140186, + "grad_norm": 0.1129549965262413, + "learning_rate": 3.0351437699680514e-05, + "loss": 0.9147, + "step": 390 + }, + { + "ce_ib": 12.584757804870605, + "ce_orig": 0.6829859018325806, + "epoch": 0.056074766355140186, + "kl_loss": 0.15448611974716187, + "loss_ib": 0.014016685076057911, + "step": 390 + }, + { + "ce_ib": 14.888505935668945, + "ce_orig": 0.8695336580276489, + "epoch": 0.056218547807332854, + "kl_loss": 0.18638572096824646, + "loss_ib": 0.016763538122177124, + "step": 391 + }, + { + "ce_ib": 13.158818244934082, + "ce_orig": 0.724577009677887, + "epoch": 0.05636232925952552, + "kl_loss": 0.15236912667751312, + "loss_ib": 0.01419786550104618, + "step": 392 + }, + { + "ce_ib": 14.405329704284668, + "ce_orig": 0.4904825985431671, + "epoch": 0.05650611071171819, + "kl_loss": 0.21818403899669647, + "loss_ib": 0.01811186783015728, + "step": 393 + }, + { + "ce_ib": 10.370551109313965, + "ce_orig": 0.6885640621185303, + "epoch": 0.05664989216391086, + "kl_loss": 0.18041831254959106, + "loss_ib": 0.014206192456185818, + "step": 394 + }, + { + "epoch": 0.056793673616103525, + "grad_norm": 0.09922255575656891, + "learning_rate": 3.075079872204473e-05, + "loss": 0.8916, + "step": 395 + }, + { + "ce_ib": 15.597278594970703, + "ce_orig": 1.2381712198257446, + "epoch": 0.056793673616103525, + "kl_loss": 0.1798793077468872, + "loss_ib": 0.016792604699730873, + "step": 395 + }, + { + "ce_ib": 16.811328887939453, + "ce_orig": 1.2628995180130005, + "epoch": 0.05693745506829619, + "kl_loss": 0.171632319688797, + "loss_ib": 0.016987280920147896, + "step": 396 + }, + { + "ce_ib": 14.330126762390137, + "ce_orig": 0.842546284198761, + "epoch": 0.057081236520488854, + "kl_loss": 0.17398859560489655, + "loss_ib": 0.01586449332535267, + "step": 397 + }, + { + "ce_ib": 13.21159553527832, + "ce_orig": 0.8423411250114441, + "epoch": 0.05722501797268152, + "kl_loss": 0.13025188446044922, + "loss_ib": 0.013118392787873745, + "step": 398 + }, + { + "ce_ib": 12.565587043762207, + "ce_orig": 0.7189036011695862, + "epoch": 0.05736879942487419, + "kl_loss": 0.14162641763687134, + "loss_ib": 0.013364115729928017, + "step": 399 + }, + { + "epoch": 0.05751258087706686, + "grad_norm": 0.0966155007481575, + "learning_rate": 3.115015974440895e-05, + "loss": 0.9267, + "step": 400 + }, + { + "ce_ib": 11.058226585388184, + "ce_orig": 0.9342263340950012, + "epoch": 0.05751258087706686, + "kl_loss": 0.11544251441955566, + "loss_ib": 0.011301239021122456, + "step": 400 + }, + { + "ce_ib": 14.200401306152344, + "ce_orig": 1.0684270858764648, + "epoch": 0.057656362329259525, + "kl_loss": 0.14682269096374512, + "loss_ib": 0.014441335573792458, + "step": 401 + }, + { + "ce_ib": 14.785656929016113, + "ce_orig": 1.1560802459716797, + "epoch": 0.05780014378145219, + "kl_loss": 0.1572328507900238, + "loss_ib": 0.015254470519721508, + "step": 402 + }, + { + "ce_ib": 16.529001235961914, + "ce_orig": 1.4409286975860596, + "epoch": 0.05794392523364486, + "kl_loss": 0.1712377667427063, + "loss_ib": 0.016826389357447624, + "step": 403 + }, + { + "ce_ib": 8.062843322753906, + "ce_orig": 0.4845752716064453, + "epoch": 0.05808770668583753, + "kl_loss": 0.114130899310112, + "loss_ib": 0.009737967513501644, + "step": 404 + }, + { + "epoch": 0.058231488138030196, + "grad_norm": 0.10344849526882172, + "learning_rate": 3.154952076677317e-05, + "loss": 0.9143, + "step": 405 + }, + { + "ce_ib": 14.247758865356445, + "ce_orig": 0.8494449853897095, + "epoch": 0.058231488138030196, + "kl_loss": 0.14982560276985168, + "loss_ib": 0.014615160413086414, + "step": 405 + }, + { + "ce_ib": 15.090539932250977, + "ce_orig": 1.255419135093689, + "epoch": 0.058375269590222864, + "kl_loss": 0.2056526243686676, + "loss_ib": 0.017827901989221573, + "step": 406 + }, + { + "ce_ib": 13.584562301635742, + "ce_orig": 1.0737160444259644, + "epoch": 0.058519051042415525, + "kl_loss": 0.16253307461738586, + "loss_ib": 0.014918935485184193, + "step": 407 + }, + { + "ce_ib": 10.844743728637695, + "ce_orig": 0.7258655428886414, + "epoch": 0.05866283249460819, + "kl_loss": 0.2175343632698059, + "loss_ib": 0.016299089416861534, + "step": 408 + }, + { + "ce_ib": 9.838624000549316, + "ce_orig": 0.5341205596923828, + "epoch": 0.05880661394680086, + "kl_loss": 0.24159343540668488, + "loss_ib": 0.016998983919620514, + "step": 409 + }, + { + "epoch": 0.05895039539899353, + "grad_norm": 0.0828595831990242, + "learning_rate": 3.194888178913738e-05, + "loss": 0.8595, + "step": 410 + }, + { + "ce_ib": 11.394186019897461, + "ce_orig": 0.6941292881965637, + "epoch": 0.05895039539899353, + "kl_loss": 0.13403424620628357, + "loss_ib": 0.012398804537951946, + "step": 410 + }, + { + "ce_ib": 13.786474227905273, + "ce_orig": 1.0304478406906128, + "epoch": 0.059094176851186196, + "kl_loss": 0.24797815084457397, + "loss_ib": 0.01929214410483837, + "step": 411 + }, + { + "ce_ib": 12.280767440795898, + "ce_orig": 0.9082537889480591, + "epoch": 0.059237958303378864, + "kl_loss": 0.09719130396842957, + "loss_ib": 0.010999949648976326, + "step": 412 + }, + { + "ce_ib": 11.551681518554688, + "ce_orig": 0.9754782915115356, + "epoch": 0.05938173975557153, + "kl_loss": 0.1405172348022461, + "loss_ib": 0.0128017021343112, + "step": 413 + }, + { + "ce_ib": 13.329681396484375, + "ce_orig": 0.9015910625457764, + "epoch": 0.0595255212077642, + "kl_loss": 0.15253770351409912, + "loss_ib": 0.014291726052761078, + "step": 414 + }, + { + "epoch": 0.05966930265995687, + "grad_norm": 0.09882552921772003, + "learning_rate": 3.23482428115016e-05, + "loss": 0.8577, + "step": 415 + }, + { + "ce_ib": 12.369913101196289, + "ce_orig": 0.8101427555084229, + "epoch": 0.05966930265995687, + "kl_loss": 0.2113226056098938, + "loss_ib": 0.016751086339354515, + "step": 415 + }, + { + "ce_ib": 14.39426040649414, + "ce_orig": 1.3613587617874146, + "epoch": 0.059813084112149535, + "kl_loss": 0.1314837634563446, + "loss_ib": 0.013771317899227142, + "step": 416 + }, + { + "ce_ib": 12.081097602844238, + "ce_orig": 0.4347302317619324, + "epoch": 0.0599568655643422, + "kl_loss": 0.2995225489139557, + "loss_ib": 0.021016675978899002, + "step": 417 + }, + { + "ce_ib": 15.218514442443848, + "ce_orig": 1.2289142608642578, + "epoch": 0.060100647016534864, + "kl_loss": 0.26874852180480957, + "loss_ib": 0.021046683192253113, + "step": 418 + }, + { + "ce_ib": 13.258194923400879, + "ce_orig": 1.0039843320846558, + "epoch": 0.06024442846872753, + "kl_loss": 0.12897028028964996, + "loss_ib": 0.013077611103653908, + "step": 419 + }, + { + "epoch": 0.0603882099209202, + "grad_norm": 0.1349947154521942, + "learning_rate": 3.274760383386581e-05, + "loss": 0.899, + "step": 420 + }, + { + "ce_ib": 12.171891212463379, + "ce_orig": 0.9144300222396851, + "epoch": 0.0603882099209202, + "kl_loss": 0.21492531895637512, + "loss_ib": 0.01683221198618412, + "step": 420 + }, + { + "ce_ib": 10.995501518249512, + "ce_orig": 0.699188768863678, + "epoch": 0.06053199137311287, + "kl_loss": 0.12099233269691467, + "loss_ib": 0.011547367088496685, + "step": 421 + }, + { + "ce_ib": 12.991347312927246, + "ce_orig": 0.9281318187713623, + "epoch": 0.060675772825305535, + "kl_loss": 0.1598033308982849, + "loss_ib": 0.014485838823020458, + "step": 422 + }, + { + "ce_ib": 10.423280715942383, + "ce_orig": 0.9821050763130188, + "epoch": 0.0608195542774982, + "kl_loss": 0.10947795957326889, + "loss_ib": 0.010685537941753864, + "step": 423 + }, + { + "ce_ib": 11.12364387512207, + "ce_orig": 0.7817228436470032, + "epoch": 0.06096333572969087, + "kl_loss": 0.17594116926193237, + "loss_ib": 0.014358880929648876, + "step": 424 + }, + { + "epoch": 0.06110711718188354, + "grad_norm": 0.0929858386516571, + "learning_rate": 3.314696485623003e-05, + "loss": 0.7994, + "step": 425 + }, + { + "ce_ib": 12.660994529724121, + "ce_orig": 0.9210802912712097, + "epoch": 0.06110711718188354, + "kl_loss": 0.15979456901550293, + "loss_ib": 0.014320224523544312, + "step": 425 + }, + { + "ce_ib": 15.475061416625977, + "ce_orig": 1.6302592754364014, + "epoch": 0.061250898634076206, + "kl_loss": 0.1844199150800705, + "loss_ib": 0.01695852540433407, + "step": 426 + }, + { + "ce_ib": 9.331029891967773, + "ce_orig": 0.5564351081848145, + "epoch": 0.061394680086268874, + "kl_loss": 0.1388329267501831, + "loss_ib": 0.011607161723077297, + "step": 427 + }, + { + "ce_ib": 12.874106407165527, + "ce_orig": 0.9861687421798706, + "epoch": 0.06153846153846154, + "kl_loss": 0.12479162216186523, + "loss_ib": 0.012676633894443512, + "step": 428 + }, + { + "ce_ib": 7.756659507751465, + "ce_orig": 0.28384384512901306, + "epoch": 0.0616822429906542, + "kl_loss": 0.2630873918533325, + "loss_ib": 0.017032699659466743, + "step": 429 + }, + { + "epoch": 0.06182602444284687, + "grad_norm": 0.09535211324691772, + "learning_rate": 3.354632587859425e-05, + "loss": 0.9043, + "step": 430 + }, + { + "ce_ib": 10.622055053710938, + "ce_orig": 0.6629616022109985, + "epoch": 0.06182602444284687, + "kl_loss": 0.13496457040309906, + "loss_ib": 0.012059256434440613, + "step": 430 + }, + { + "ce_ib": 11.811662673950195, + "ce_orig": 0.7327677011489868, + "epoch": 0.06196980589503954, + "kl_loss": 0.14013449847698212, + "loss_ib": 0.012912556529045105, + "step": 431 + }, + { + "ce_ib": 8.620430946350098, + "ce_orig": 0.7203670144081116, + "epoch": 0.062113587347232206, + "kl_loss": 0.11925005167722702, + "loss_ib": 0.010272718034684658, + "step": 432 + }, + { + "ce_ib": 14.616909980773926, + "ce_orig": 1.4517083168029785, + "epoch": 0.062257368799424874, + "kl_loss": 0.15538114309310913, + "loss_ib": 0.01507751177996397, + "step": 433 + }, + { + "ce_ib": 9.763717651367188, + "ce_orig": 0.6260893940925598, + "epoch": 0.06240115025161754, + "kl_loss": 0.13390487432479858, + "loss_ib": 0.011577102355659008, + "step": 434 + }, + { + "epoch": 0.0625449317038102, + "grad_norm": 0.11180251836776733, + "learning_rate": 3.394568690095847e-05, + "loss": 0.9071, + "step": 435 + }, + { + "ce_ib": 12.881009101867676, + "ce_orig": 0.5546009540557861, + "epoch": 0.0625449317038102, + "kl_loss": 0.13927477598190308, + "loss_ib": 0.013404244557023048, + "step": 435 + }, + { + "ce_ib": 12.253645896911621, + "ce_orig": 0.7509746551513672, + "epoch": 0.06268871315600287, + "kl_loss": 0.16948378086090088, + "loss_ib": 0.014601011760532856, + "step": 436 + }, + { + "ce_ib": 13.29328441619873, + "ce_orig": 0.9583929777145386, + "epoch": 0.06283249460819554, + "kl_loss": 0.15764841437339783, + "loss_ib": 0.014529063366353512, + "step": 437 + }, + { + "ce_ib": 12.615095138549805, + "ce_orig": 1.1630975008010864, + "epoch": 0.0629762760603882, + "kl_loss": 0.12097503244876862, + "loss_ib": 0.012356298975646496, + "step": 438 + }, + { + "ce_ib": 11.95744514465332, + "ce_orig": 0.734953761100769, + "epoch": 0.06312005751258087, + "kl_loss": 0.13797758519649506, + "loss_ib": 0.012877601198852062, + "step": 439 + }, + { + "epoch": 0.06326383896477354, + "grad_norm": 0.09555409848690033, + "learning_rate": 3.434504792332269e-05, + "loss": 0.8284, + "step": 440 + }, + { + "ce_ib": 17.494842529296875, + "ce_orig": 1.2540117502212524, + "epoch": 0.06326383896477354, + "kl_loss": 0.20184318721294403, + "loss_ib": 0.018839580938220024, + "step": 440 + }, + { + "ce_ib": 13.520644187927246, + "ce_orig": 1.173345923423767, + "epoch": 0.06340762041696621, + "kl_loss": 0.224016010761261, + "loss_ib": 0.01796112395823002, + "step": 441 + }, + { + "ce_ib": 6.444005489349365, + "ce_orig": 0.33801600337028503, + "epoch": 0.06355140186915888, + "kl_loss": 0.24252083897590637, + "loss_ib": 0.015348044224083424, + "step": 442 + }, + { + "ce_ib": 11.194876670837402, + "ce_orig": 0.8596982359886169, + "epoch": 0.06369518332135155, + "kl_loss": 0.1475781947374344, + "loss_ib": 0.012976348400115967, + "step": 443 + }, + { + "ce_ib": 14.788161277770996, + "ce_orig": 0.9179244041442871, + "epoch": 0.06383896477354421, + "kl_loss": 0.23837195336818695, + "loss_ib": 0.019312677904963493, + "step": 444 + }, + { + "epoch": 0.06398274622573688, + "grad_norm": 0.10049393773078918, + "learning_rate": 3.47444089456869e-05, + "loss": 0.915, + "step": 445 + }, + { + "ce_ib": 12.728458404541016, + "ce_orig": 0.6808370351791382, + "epoch": 0.06398274622573688, + "kl_loss": 0.16207855939865112, + "loss_ib": 0.014468157663941383, + "step": 445 + }, + { + "ce_ib": 9.595919609069824, + "ce_orig": 0.48967745900154114, + "epoch": 0.06412652767792955, + "kl_loss": 0.12488089501857758, + "loss_ib": 0.011042005382478237, + "step": 446 + }, + { + "ce_ib": 15.164140701293945, + "ce_orig": 0.8277769684791565, + "epoch": 0.06427030913012222, + "kl_loss": 0.1784917414188385, + "loss_ib": 0.016506657004356384, + "step": 447 + }, + { + "ce_ib": 11.631290435791016, + "ce_orig": 0.786353588104248, + "epoch": 0.06441409058231488, + "kl_loss": 0.13490960001945496, + "loss_ib": 0.012561124749481678, + "step": 448 + }, + { + "ce_ib": 11.316841125488281, + "ce_orig": 0.6659090518951416, + "epoch": 0.06455787203450755, + "kl_loss": 0.1083206981420517, + "loss_ib": 0.011074455454945564, + "step": 449 + }, + { + "epoch": 0.06470165348670022, + "grad_norm": 0.10170278698205948, + "learning_rate": 3.514376996805112e-05, + "loss": 0.8419, + "step": 450 + }, + { + "ce_ib": 9.49474048614502, + "ce_orig": 0.7802785038948059, + "epoch": 0.06470165348670022, + "kl_loss": 0.1756356954574585, + "loss_ib": 0.013529154472053051, + "step": 450 + }, + { + "ce_ib": 11.215967178344727, + "ce_orig": 0.4214544892311096, + "epoch": 0.06484543493889289, + "kl_loss": 0.12542136013507843, + "loss_ib": 0.01187905203551054, + "step": 451 + }, + { + "ce_ib": 6.312502861022949, + "ce_orig": 0.3134852945804596, + "epoch": 0.06498921639108556, + "kl_loss": 0.2386016845703125, + "loss_ib": 0.015086335130035877, + "step": 452 + }, + { + "ce_ib": 13.910443305969238, + "ce_orig": 0.7964897155761719, + "epoch": 0.06513299784327822, + "kl_loss": 0.19260820746421814, + "loss_ib": 0.016585631296038628, + "step": 453 + }, + { + "ce_ib": 10.858504295349121, + "ce_orig": 0.8178758025169373, + "epoch": 0.06527677929547089, + "kl_loss": 0.1271795630455017, + "loss_ib": 0.011788229458034039, + "step": 454 + }, + { + "epoch": 0.06542056074766354, + "grad_norm": 0.1267521232366562, + "learning_rate": 3.5543130990415334e-05, + "loss": 0.8513, + "step": 455 + }, + { + "ce_ib": 11.637347221374512, + "ce_orig": 1.0193455219268799, + "epoch": 0.06542056074766354, + "kl_loss": 0.144621342420578, + "loss_ib": 0.01304974127560854, + "step": 455 + }, + { + "ce_ib": 10.80041217803955, + "ce_orig": 0.6328637599945068, + "epoch": 0.06556434219985621, + "kl_loss": 0.10495860129594803, + "loss_ib": 0.010648136027157307, + "step": 456 + }, + { + "ce_ib": 7.313602447509766, + "ce_orig": 0.42815887928009033, + "epoch": 0.06570812365204888, + "kl_loss": 0.17510683834552765, + "loss_ib": 0.01241214293986559, + "step": 457 + }, + { + "ce_ib": 13.05362606048584, + "ce_orig": 1.0646302700042725, + "epoch": 0.06585190510424155, + "kl_loss": 0.14800792932510376, + "loss_ib": 0.013927209191024303, + "step": 458 + }, + { + "ce_ib": 8.705698013305664, + "ce_orig": 0.5751362442970276, + "epoch": 0.06599568655643422, + "kl_loss": 0.19291532039642334, + "loss_ib": 0.013998615555465221, + "step": 459 + }, + { + "epoch": 0.06613946800862688, + "grad_norm": 0.14026452600955963, + "learning_rate": 3.5942492012779554e-05, + "loss": 0.8978, + "step": 460 + }, + { + "ce_ib": 12.629561424255371, + "ce_orig": 1.248939871788025, + "epoch": 0.06613946800862688, + "kl_loss": 0.1431877613067627, + "loss_ib": 0.013474169187247753, + "step": 460 + }, + { + "ce_ib": 13.466840744018555, + "ce_orig": 1.1314830780029297, + "epoch": 0.06628324946081955, + "kl_loss": 0.11893537640571594, + "loss_ib": 0.012680189684033394, + "step": 461 + }, + { + "ce_ib": 12.272945404052734, + "ce_orig": 0.5334405303001404, + "epoch": 0.06642703091301222, + "kl_loss": 0.19608467817306519, + "loss_ib": 0.015940707176923752, + "step": 462 + }, + { + "ce_ib": 11.584327697753906, + "ce_orig": 0.5882666707038879, + "epoch": 0.06657081236520489, + "kl_loss": 0.15428690612316132, + "loss_ib": 0.013506509363651276, + "step": 463 + }, + { + "ce_ib": 10.483445167541504, + "ce_orig": 0.5081559419631958, + "epoch": 0.06671459381739756, + "kl_loss": 0.23190432786941528, + "loss_ib": 0.01683693937957287, + "step": 464 + }, + { + "epoch": 0.06685837526959022, + "grad_norm": 0.11186351627111435, + "learning_rate": 3.6341853035143766e-05, + "loss": 0.977, + "step": 465 + }, + { + "ce_ib": 10.818644523620605, + "ce_orig": 0.8423200249671936, + "epoch": 0.06685837526959022, + "kl_loss": 0.12322719395160675, + "loss_ib": 0.011570681817829609, + "step": 465 + }, + { + "ce_ib": 13.477171897888184, + "ce_orig": 1.135223627090454, + "epoch": 0.06700215672178289, + "kl_loss": 0.1358594447374344, + "loss_ib": 0.013531558215618134, + "step": 466 + }, + { + "ce_ib": 12.029156684875488, + "ce_orig": 0.925537645816803, + "epoch": 0.06714593817397556, + "kl_loss": 0.1674036681652069, + "loss_ib": 0.014384761452674866, + "step": 467 + }, + { + "ce_ib": 8.591270446777344, + "ce_orig": 0.6351872086524963, + "epoch": 0.06728971962616823, + "kl_loss": 0.16200634837150574, + "loss_ib": 0.012395952828228474, + "step": 468 + }, + { + "ce_ib": 12.34648609161377, + "ce_orig": 0.8252216577529907, + "epoch": 0.0674335010783609, + "kl_loss": 0.1306806206703186, + "loss_ib": 0.012707273475825787, + "step": 469 + }, + { + "epoch": 0.06757728253055356, + "grad_norm": 0.11462409794330597, + "learning_rate": 3.6741214057507985e-05, + "loss": 0.8112, + "step": 470 + }, + { + "ce_ib": 12.145110130310059, + "ce_orig": 0.7569481730461121, + "epoch": 0.06757728253055356, + "kl_loss": 0.1333114206790924, + "loss_ib": 0.012738126330077648, + "step": 470 + }, + { + "ce_ib": 10.791728019714355, + "ce_orig": 0.8886812329292297, + "epoch": 0.06772106398274623, + "kl_loss": 0.14122334122657776, + "loss_ib": 0.012457030825316906, + "step": 471 + }, + { + "ce_ib": 11.75979232788086, + "ce_orig": 0.93720543384552, + "epoch": 0.0678648454349389, + "kl_loss": 0.08433859050273895, + "loss_ib": 0.010096825659275055, + "step": 472 + }, + { + "ce_ib": 7.816238880157471, + "ce_orig": 0.5898436903953552, + "epoch": 0.06800862688713157, + "kl_loss": 0.26155394315719604, + "loss_ib": 0.0169858168810606, + "step": 473 + }, + { + "ce_ib": 12.64213752746582, + "ce_orig": 1.2193433046340942, + "epoch": 0.06815240833932423, + "kl_loss": 0.1382063627243042, + "loss_ib": 0.013231388293206692, + "step": 474 + }, + { + "epoch": 0.0682961897915169, + "grad_norm": 0.12322834134101868, + "learning_rate": 3.714057507987221e-05, + "loss": 0.942, + "step": 475 + }, + { + "ce_ib": 12.208565711975098, + "ce_orig": 0.6283319592475891, + "epoch": 0.0682961897915169, + "kl_loss": 0.1437569409608841, + "loss_ib": 0.013292129151523113, + "step": 475 + }, + { + "ce_ib": 10.480301856994629, + "ce_orig": 0.6875295042991638, + "epoch": 0.06843997124370955, + "kl_loss": 0.171269491314888, + "loss_ib": 0.013803625479340553, + "step": 476 + }, + { + "ce_ib": 12.140584945678711, + "ce_orig": 0.686497151851654, + "epoch": 0.06858375269590222, + "kl_loss": 0.14714768528938293, + "loss_ib": 0.013427676633000374, + "step": 477 + }, + { + "ce_ib": 13.12353515625, + "ce_orig": 1.330522060394287, + "epoch": 0.06872753414809489, + "kl_loss": 0.18113256990909576, + "loss_ib": 0.015618395991623402, + "step": 478 + }, + { + "ce_ib": 12.710488319396973, + "ce_orig": 1.4100775718688965, + "epoch": 0.06887131560028756, + "kl_loss": 0.12010614573955536, + "loss_ib": 0.01236055139452219, + "step": 479 + }, + { + "epoch": 0.06901509705248023, + "grad_norm": 0.11526408791542053, + "learning_rate": 3.7539936102236424e-05, + "loss": 0.8998, + "step": 480 + }, + { + "ce_ib": 13.879955291748047, + "ce_orig": 1.1341381072998047, + "epoch": 0.06901509705248023, + "kl_loss": 0.1282053291797638, + "loss_ib": 0.013350243680179119, + "step": 480 + }, + { + "ce_ib": 8.764423370361328, + "ce_orig": 0.6200535893440247, + "epoch": 0.0691588785046729, + "kl_loss": 0.1320233792066574, + "loss_ib": 0.010983380489051342, + "step": 481 + }, + { + "ce_ib": 9.943157196044922, + "ce_orig": 0.6673835515975952, + "epoch": 0.06930265995686556, + "kl_loss": 0.10792216658592224, + "loss_ib": 0.010367686860263348, + "step": 482 + }, + { + "ce_ib": 10.92377758026123, + "ce_orig": 0.7028371095657349, + "epoch": 0.06944644140905823, + "kl_loss": 0.15012815594673157, + "loss_ib": 0.012968296185135841, + "step": 483 + }, + { + "ce_ib": 9.512238502502441, + "ce_orig": 0.5816277265548706, + "epoch": 0.0695902228612509, + "kl_loss": 0.13318368792533875, + "loss_ib": 0.011415303684771061, + "step": 484 + }, + { + "epoch": 0.06973400431344356, + "grad_norm": 0.12581641972064972, + "learning_rate": 3.793929712460064e-05, + "loss": 0.8585, + "step": 485 + }, + { + "ce_ib": 13.113508224487305, + "ce_orig": 1.0122153759002686, + "epoch": 0.06973400431344356, + "kl_loss": 0.17498700320720673, + "loss_ib": 0.01530610304325819, + "step": 485 + }, + { + "ce_ib": 11.20240592956543, + "ce_orig": 0.8718814253807068, + "epoch": 0.06987778576563623, + "kl_loss": 0.12174628674983978, + "loss_ib": 0.011688517406582832, + "step": 486 + }, + { + "ce_ib": 12.784674644470215, + "ce_orig": 0.8871896266937256, + "epoch": 0.0700215672178289, + "kl_loss": 0.16703587770462036, + "loss_ib": 0.014744131825864315, + "step": 487 + }, + { + "ce_ib": 10.580418586730957, + "ce_orig": 0.8577698469161987, + "epoch": 0.07016534867002157, + "kl_loss": 0.11169049143791199, + "loss_ib": 0.010874733328819275, + "step": 488 + }, + { + "ce_ib": 10.39923095703125, + "ce_orig": 0.6622049808502197, + "epoch": 0.07030913012221424, + "kl_loss": 0.17256517708301544, + "loss_ib": 0.013827874325215816, + "step": 489 + }, + { + "epoch": 0.0704529115744069, + "grad_norm": 0.11361895501613617, + "learning_rate": 3.8338658146964856e-05, + "loss": 0.901, + "step": 490 + }, + { + "ce_ib": 13.566216468811035, + "ce_orig": 0.9003996849060059, + "epoch": 0.0704529115744069, + "kl_loss": 0.18744704127311707, + "loss_ib": 0.016155460849404335, + "step": 490 + }, + { + "ce_ib": 11.603694915771484, + "ce_orig": 1.0972646474838257, + "epoch": 0.07059669302659957, + "kl_loss": 0.09514350444078445, + "loss_ib": 0.010559022426605225, + "step": 491 + }, + { + "ce_ib": 12.866926193237305, + "ce_orig": 1.1191866397857666, + "epoch": 0.07074047447879224, + "kl_loss": 0.12916871905326843, + "loss_ib": 0.012891898863017559, + "step": 492 + }, + { + "ce_ib": 11.685700416564941, + "ce_orig": 1.0439685583114624, + "epoch": 0.07088425593098491, + "kl_loss": 0.13916221261024475, + "loss_ib": 0.012800960801541805, + "step": 493 + }, + { + "ce_ib": 8.240974426269531, + "ce_orig": 0.6664552092552185, + "epoch": 0.07102803738317758, + "kl_loss": 0.0913599506020546, + "loss_ib": 0.008688484318554401, + "step": 494 + }, + { + "epoch": 0.07117181883537024, + "grad_norm": 0.11460109055042267, + "learning_rate": 3.8738019169329075e-05, + "loss": 0.8749, + "step": 495 + }, + { + "ce_ib": 15.161900520324707, + "ce_orig": 1.5418399572372437, + "epoch": 0.07117181883537024, + "kl_loss": 0.14499804377555847, + "loss_ib": 0.014830851927399635, + "step": 495 + }, + { + "ce_ib": 12.499137878417969, + "ce_orig": 0.7715917229652405, + "epoch": 0.07131560028756291, + "kl_loss": 0.13650982081890106, + "loss_ib": 0.013075060211122036, + "step": 496 + }, + { + "ce_ib": 10.516082763671875, + "ce_orig": 0.810151219367981, + "epoch": 0.07145938173975556, + "kl_loss": 0.1658136546611786, + "loss_ib": 0.013548724353313446, + "step": 497 + }, + { + "ce_ib": 12.26677131652832, + "ce_orig": 0.4871862828731537, + "epoch": 0.07160316319194823, + "kl_loss": 0.15727362036705017, + "loss_ib": 0.013997065834701061, + "step": 498 + }, + { + "ce_ib": 8.879415512084961, + "ce_orig": 0.5100986957550049, + "epoch": 0.0717469446441409, + "kl_loss": 0.10066086053848267, + "loss_ib": 0.009472750127315521, + "step": 499 + }, + { + "epoch": 0.07189072609633357, + "grad_norm": 0.1443518102169037, + "learning_rate": 3.913738019169329e-05, + "loss": 0.8769, + "step": 500 + }, + { + "ce_ib": 11.191156387329102, + "ce_orig": 0.7616560459136963, + "epoch": 0.07189072609633357, + "kl_loss": 0.13632240891456604, + "loss_ib": 0.012411698698997498, + "step": 500 + }, + { + "ce_ib": 11.555066108703613, + "ce_orig": 0.8566383123397827, + "epoch": 0.07203450754852624, + "kl_loss": 0.162722647190094, + "loss_ib": 0.013913665898144245, + "step": 501 + }, + { + "ce_ib": 10.90505599975586, + "ce_orig": 0.9519692063331604, + "epoch": 0.0721782890007189, + "kl_loss": 0.12059365957975388, + "loss_ib": 0.011482210829854012, + "step": 502 + }, + { + "ce_ib": 9.185860633850098, + "ce_orig": 0.5769492387771606, + "epoch": 0.07232207045291157, + "kl_loss": 0.14534525573253632, + "loss_ib": 0.011860193684697151, + "step": 503 + }, + { + "ce_ib": 12.747846603393555, + "ce_orig": 0.893709123134613, + "epoch": 0.07246585190510424, + "kl_loss": 0.12308457493782043, + "loss_ib": 0.012528151273727417, + "step": 504 + }, + { + "epoch": 0.07260963335729691, + "grad_norm": 0.11177890002727509, + "learning_rate": 3.953674121405751e-05, + "loss": 0.8203, + "step": 505 + }, + { + "ce_ib": 10.646434783935547, + "ce_orig": 1.0113996267318726, + "epoch": 0.07260963335729691, + "kl_loss": 0.12359193712472916, + "loss_ib": 0.01150281447917223, + "step": 505 + }, + { + "ce_ib": 10.180166244506836, + "ce_orig": 1.0264123678207397, + "epoch": 0.07275341480948957, + "kl_loss": 0.1544847935438156, + "loss_ib": 0.012814322486519814, + "step": 506 + }, + { + "ce_ib": 11.716635704040527, + "ce_orig": 0.9286133646965027, + "epoch": 0.07289719626168224, + "kl_loss": 0.1243153065443039, + "loss_ib": 0.012074083089828491, + "step": 507 + }, + { + "ce_ib": 9.650020599365234, + "ce_orig": 0.9953688383102417, + "epoch": 0.07304097771387491, + "kl_loss": 0.12280933558940887, + "loss_ib": 0.010965476743876934, + "step": 508 + }, + { + "ce_ib": 9.697562217712402, + "ce_orig": 0.4963395297527313, + "epoch": 0.07318475916606758, + "kl_loss": 0.11651341617107391, + "loss_ib": 0.010674451477825642, + "step": 509 + }, + { + "epoch": 0.07332854061826025, + "grad_norm": 0.10411624610424042, + "learning_rate": 3.9936102236421726e-05, + "loss": 0.8555, + "step": 510 + }, + { + "ce_ib": 11.558809280395508, + "ce_orig": 1.02761971950531, + "epoch": 0.07332854061826025, + "kl_loss": 0.15587545931339264, + "loss_ib": 0.01357317715883255, + "step": 510 + }, + { + "ce_ib": 12.694424629211426, + "ce_orig": 1.1861635446548462, + "epoch": 0.07347232207045291, + "kl_loss": 0.1220870316028595, + "loss_ib": 0.012451563961803913, + "step": 511 + }, + { + "ce_ib": 8.967998504638672, + "ce_orig": 0.8245717883110046, + "epoch": 0.07361610352264558, + "kl_loss": 0.11289598047733307, + "loss_ib": 0.010128798894584179, + "step": 512 + }, + { + "ce_ib": 10.940625190734863, + "ce_orig": 0.742675244808197, + "epoch": 0.07375988497483825, + "kl_loss": 0.11433502286672592, + "loss_ib": 0.011187063530087471, + "step": 513 + }, + { + "ce_ib": 11.216033935546875, + "ce_orig": 0.7051765322685242, + "epoch": 0.07390366642703092, + "kl_loss": 0.17939868569374084, + "loss_ib": 0.014577952213585377, + "step": 514 + }, + { + "epoch": 0.07404744787922359, + "grad_norm": 0.12375368177890778, + "learning_rate": 4.0335463258785946e-05, + "loss": 0.8018, + "step": 515 + }, + { + "ce_ib": 12.851394653320312, + "ce_orig": 0.959805965423584, + "epoch": 0.07404744787922359, + "kl_loss": 0.11663807928562164, + "loss_ib": 0.012257601134479046, + "step": 515 + }, + { + "ce_ib": 9.45693302154541, + "ce_orig": 0.6901520490646362, + "epoch": 0.07419122933141625, + "kl_loss": 0.16356007754802704, + "loss_ib": 0.012906471267342567, + "step": 516 + }, + { + "ce_ib": 13.640007972717285, + "ce_orig": 1.0324355363845825, + "epoch": 0.07433501078360892, + "kl_loss": 0.14768928289413452, + "loss_ib": 0.01420446764677763, + "step": 517 + }, + { + "ce_ib": 10.631475448608398, + "ce_orig": 0.7622382044792175, + "epoch": 0.07447879223580157, + "kl_loss": 0.1414967179298401, + "loss_ib": 0.012390573509037495, + "step": 518 + }, + { + "ce_ib": 13.440409660339355, + "ce_orig": 1.4887185096740723, + "epoch": 0.07462257368799424, + "kl_loss": 0.13175088167190552, + "loss_ib": 0.013307749293744564, + "step": 519 + }, + { + "epoch": 0.07476635514018691, + "grad_norm": 0.12905798852443695, + "learning_rate": 4.0734824281150165e-05, + "loss": 0.9488, + "step": 520 + }, + { + "ce_ib": 11.399759292602539, + "ce_orig": 0.9179264307022095, + "epoch": 0.07476635514018691, + "kl_loss": 0.1584293693304062, + "loss_ib": 0.013621347956359386, + "step": 520 + }, + { + "ce_ib": 9.124701499938965, + "ce_orig": 0.6920540928840637, + "epoch": 0.07491013659237958, + "kl_loss": 0.1023043617606163, + "loss_ib": 0.009677569381892681, + "step": 521 + }, + { + "ce_ib": 11.09079647064209, + "ce_orig": 1.124171257019043, + "epoch": 0.07505391804457225, + "kl_loss": 0.1274327039718628, + "loss_ib": 0.011917034164071083, + "step": 522 + }, + { + "ce_ib": 11.651871681213379, + "ce_orig": 1.1573556661605835, + "epoch": 0.07519769949676491, + "kl_loss": 0.1257188469171524, + "loss_ib": 0.012111878953874111, + "step": 523 + }, + { + "ce_ib": 10.919167518615723, + "ce_orig": 1.096139907836914, + "epoch": 0.07534148094895758, + "kl_loss": 0.14454081654548645, + "loss_ib": 0.012686625123023987, + "step": 524 + }, + { + "epoch": 0.07548526240115025, + "grad_norm": 0.09990247339010239, + "learning_rate": 4.113418530351438e-05, + "loss": 0.8695, + "step": 525 + }, + { + "ce_ib": 13.44006633758545, + "ce_orig": 1.1762773990631104, + "epoch": 0.07548526240115025, + "kl_loss": 0.1347535103559494, + "loss_ib": 0.013457709923386574, + "step": 525 + }, + { + "ce_ib": 10.870895385742188, + "ce_orig": 0.6810830235481262, + "epoch": 0.07562904385334292, + "kl_loss": 0.1084175780415535, + "loss_ib": 0.010856325738132, + "step": 526 + }, + { + "ce_ib": 11.001641273498535, + "ce_orig": 0.5774558782577515, + "epoch": 0.07577282530553558, + "kl_loss": 0.17218388617038727, + "loss_ib": 0.014110015705227852, + "step": 527 + }, + { + "ce_ib": 12.118193626403809, + "ce_orig": 1.1861246824264526, + "epoch": 0.07591660675772825, + "kl_loss": 0.12062694877386093, + "loss_ib": 0.012090444564819336, + "step": 528 + }, + { + "ce_ib": 10.118423461914062, + "ce_orig": 0.9129387736320496, + "epoch": 0.07606038820992092, + "kl_loss": 0.09990894794464111, + "loss_ib": 0.010054659098386765, + "step": 529 + }, + { + "epoch": 0.07620416966211359, + "grad_norm": 0.11624792218208313, + "learning_rate": 4.15335463258786e-05, + "loss": 0.8571, + "step": 530 + }, + { + "ce_ib": 10.449731826782227, + "ce_orig": 0.5511190891265869, + "epoch": 0.07620416966211359, + "kl_loss": 0.18196257948875427, + "loss_ib": 0.014322995208203793, + "step": 530 + }, + { + "ce_ib": 10.00999641418457, + "ce_orig": 0.7014174461364746, + "epoch": 0.07634795111430626, + "kl_loss": 0.14452239871025085, + "loss_ib": 0.01223111804574728, + "step": 531 + }, + { + "ce_ib": 12.196045875549316, + "ce_orig": 1.0053819417953491, + "epoch": 0.07649173256649892, + "kl_loss": 0.12312173843383789, + "loss_ib": 0.012254110537469387, + "step": 532 + }, + { + "ce_ib": 11.569887161254883, + "ce_orig": 0.9444661140441895, + "epoch": 0.07663551401869159, + "kl_loss": 0.12740397453308105, + "loss_ib": 0.012155142612755299, + "step": 533 + }, + { + "ce_ib": 9.093774795532227, + "ce_orig": 0.7246710658073425, + "epoch": 0.07677929547088426, + "kl_loss": 0.1298743486404419, + "loss_ib": 0.011040604673326015, + "step": 534 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 0.13377481698989868, + "learning_rate": 4.193290734824281e-05, + "loss": 0.8601, + "step": 535 + }, + { + "ce_ib": 13.371556282043457, + "ce_orig": 1.1632790565490723, + "epoch": 0.07692307692307693, + "kl_loss": 0.13051572442054749, + "loss_ib": 0.013211563229560852, + "step": 535 + }, + { + "ce_ib": 11.797460556030273, + "ce_orig": 0.9421883821487427, + "epoch": 0.0770668583752696, + "kl_loss": 0.13678061962127686, + "loss_ib": 0.012737761251628399, + "step": 536 + }, + { + "ce_ib": 12.24150562286377, + "ce_orig": 0.7441449761390686, + "epoch": 0.07721063982746226, + "kl_loss": 0.11821313202381134, + "loss_ib": 0.012031408958137035, + "step": 537 + }, + { + "ce_ib": 10.22864818572998, + "ce_orig": 0.9380773305892944, + "epoch": 0.07735442127965493, + "kl_loss": 0.10538578778505325, + "loss_ib": 0.010383613407611847, + "step": 538 + }, + { + "ce_ib": 12.875476837158203, + "ce_orig": 0.8242666721343994, + "epoch": 0.0774982027318476, + "kl_loss": 0.15831780433654785, + "loss_ib": 0.014353628270328045, + "step": 539 + }, + { + "epoch": 0.07764198418404025, + "grad_norm": 0.10647567361593246, + "learning_rate": 4.233226837060703e-05, + "loss": 0.8364, + "step": 540 + }, + { + "ce_ib": 11.523818016052246, + "ce_orig": 0.8004295229911804, + "epoch": 0.07764198418404025, + "kl_loss": 0.1284905970096588, + "loss_ib": 0.012186438776552677, + "step": 540 + }, + { + "ce_ib": 11.77370548248291, + "ce_orig": 0.6032363176345825, + "epoch": 0.07778576563623292, + "kl_loss": 0.17018523812294006, + "loss_ib": 0.014396115206182003, + "step": 541 + }, + { + "ce_ib": 10.745952606201172, + "ce_orig": 0.6720147132873535, + "epoch": 0.07792954708842559, + "kl_loss": 0.13083013892173767, + "loss_ib": 0.011914483271539211, + "step": 542 + }, + { + "ce_ib": 12.431722640991211, + "ce_orig": 1.328582525253296, + "epoch": 0.07807332854061826, + "kl_loss": 0.11531039327383041, + "loss_ib": 0.011981381103396416, + "step": 543 + }, + { + "ce_ib": 10.487591743469238, + "ce_orig": 0.8851913809776306, + "epoch": 0.07821710999281092, + "kl_loss": 0.1059553325176239, + "loss_ib": 0.010541562922298908, + "step": 544 + }, + { + "epoch": 0.07836089144500359, + "grad_norm": 0.12910747528076172, + "learning_rate": 4.273162939297125e-05, + "loss": 0.8664, + "step": 545 + }, + { + "ce_ib": 10.0007905960083, + "ce_orig": 0.9234699010848999, + "epoch": 0.07836089144500359, + "kl_loss": 0.1631077080965042, + "loss_ib": 0.013155780732631683, + "step": 545 + }, + { + "ce_ib": 10.215255737304688, + "ce_orig": 0.8365392088890076, + "epoch": 0.07850467289719626, + "kl_loss": 0.11404451727867126, + "loss_ib": 0.010809853672981262, + "step": 546 + }, + { + "ce_ib": 10.295679092407227, + "ce_orig": 0.6545149683952332, + "epoch": 0.07864845434938893, + "kl_loss": 0.12140201032161713, + "loss_ib": 0.011217939667403698, + "step": 547 + }, + { + "ce_ib": 9.238651275634766, + "ce_orig": 0.7438675761222839, + "epoch": 0.0787922358015816, + "kl_loss": 0.10375800728797913, + "loss_ib": 0.009807226248085499, + "step": 548 + }, + { + "ce_ib": 11.177849769592285, + "ce_orig": 0.9599023461341858, + "epoch": 0.07893601725377426, + "kl_loss": 0.15443462133407593, + "loss_ib": 0.013310655951499939, + "step": 549 + }, + { + "epoch": 0.07907979870596693, + "grad_norm": 0.11976125091314316, + "learning_rate": 4.313099041533547e-05, + "loss": 0.9199, + "step": 550 + }, + { + "ce_ib": 8.745708465576172, + "ce_orig": 0.745059072971344, + "epoch": 0.07907979870596693, + "kl_loss": 0.10965421795845032, + "loss_ib": 0.00985556561499834, + "step": 550 + }, + { + "ce_ib": 9.332277297973633, + "ce_orig": 0.7904736995697021, + "epoch": 0.0792235801581596, + "kl_loss": 0.14583170413970947, + "loss_ib": 0.011957723647356033, + "step": 551 + }, + { + "ce_ib": 10.741596221923828, + "ce_orig": 0.9755902290344238, + "epoch": 0.07936736161035227, + "kl_loss": 0.11897563934326172, + "loss_ib": 0.011319580487906933, + "step": 552 + }, + { + "ce_ib": 8.226643562316895, + "ce_orig": 0.6986103057861328, + "epoch": 0.07951114306254493, + "kl_loss": 0.11197762191295624, + "loss_ib": 0.009712203405797482, + "step": 553 + }, + { + "ce_ib": 11.091338157653809, + "ce_orig": 0.7560675740242004, + "epoch": 0.0796549245147376, + "kl_loss": 0.12657985091209412, + "loss_ib": 0.011874661780893803, + "step": 554 + }, + { + "epoch": 0.07979870596693027, + "grad_norm": 0.1107710599899292, + "learning_rate": 4.3530351437699686e-05, + "loss": 0.8873, + "step": 555 + }, + { + "ce_ib": 10.592079162597656, + "ce_orig": 1.1071833372116089, + "epoch": 0.07979870596693027, + "kl_loss": 0.13393369317054749, + "loss_ib": 0.011992724612355232, + "step": 555 + }, + { + "ce_ib": 10.618276596069336, + "ce_orig": 0.9089037775993347, + "epoch": 0.07994248741912294, + "kl_loss": 0.16187895834445953, + "loss_ib": 0.013403086923062801, + "step": 556 + }, + { + "ce_ib": 10.179542541503906, + "ce_orig": 0.7911267876625061, + "epoch": 0.0800862688713156, + "kl_loss": 0.19424620270729065, + "loss_ib": 0.01480208057910204, + "step": 557 + }, + { + "ce_ib": 9.025123596191406, + "ce_orig": 0.8696843385696411, + "epoch": 0.08023005032350827, + "kl_loss": 0.10537652671337128, + "loss_ib": 0.009781388565897942, + "step": 558 + }, + { + "ce_ib": 10.719731330871582, + "ce_orig": 1.1363403797149658, + "epoch": 0.08037383177570094, + "kl_loss": 0.12376905977725983, + "loss_ib": 0.011548318900167942, + "step": 559 + }, + { + "epoch": 0.08051761322789361, + "grad_norm": 0.11026783287525177, + "learning_rate": 4.39297124600639e-05, + "loss": 0.9008, + "step": 560 + }, + { + "ce_ib": 11.907670021057129, + "ce_orig": 1.1177096366882324, + "epoch": 0.08051761322789361, + "kl_loss": 0.13737264275550842, + "loss_ib": 0.012822466902434826, + "step": 560 + }, + { + "ce_ib": 8.7591552734375, + "ce_orig": 0.5712915658950806, + "epoch": 0.08066139468008626, + "kl_loss": 0.0947001650929451, + "loss_ib": 0.009114585816860199, + "step": 561 + }, + { + "ce_ib": 7.738790988922119, + "ce_orig": 0.688946545124054, + "epoch": 0.08080517613227893, + "kl_loss": 0.10011854022741318, + "loss_ib": 0.00887532252818346, + "step": 562 + }, + { + "ce_ib": 9.232172012329102, + "ce_orig": 0.485678106546402, + "epoch": 0.0809489575844716, + "kl_loss": 0.14091522991657257, + "loss_ib": 0.011661847122013569, + "step": 563 + }, + { + "ce_ib": 11.895172119140625, + "ce_orig": 1.1715614795684814, + "epoch": 0.08109273903666427, + "kl_loss": 0.1400681585073471, + "loss_ib": 0.012950994074344635, + "step": 564 + }, + { + "epoch": 0.08123652048885693, + "grad_norm": 0.12719914317131042, + "learning_rate": 4.432907348242812e-05, + "loss": 0.9266, + "step": 565 + }, + { + "ce_ib": 11.971985816955566, + "ce_orig": 1.0788679122924805, + "epoch": 0.08123652048885693, + "kl_loss": 0.1462956666946411, + "loss_ib": 0.013300776481628418, + "step": 565 + }, + { + "ce_ib": 10.301172256469727, + "ce_orig": 1.0739414691925049, + "epoch": 0.0813803019410496, + "kl_loss": 0.13510483503341675, + "loss_ib": 0.01190582849085331, + "step": 566 + }, + { + "ce_ib": 13.213077545166016, + "ce_orig": 1.0148154497146606, + "epoch": 0.08152408339324227, + "kl_loss": 0.14191558957099915, + "loss_ib": 0.01370231807231903, + "step": 567 + }, + { + "ce_ib": 8.01417350769043, + "ce_orig": 0.836399257183075, + "epoch": 0.08166786484543494, + "kl_loss": 0.11862446367740631, + "loss_ib": 0.009938309900462627, + "step": 568 + }, + { + "ce_ib": 9.659282684326172, + "ce_orig": 0.9532420039176941, + "epoch": 0.0818116462976276, + "kl_loss": 0.15856535732746124, + "loss_ib": 0.01275790948420763, + "step": 569 + }, + { + "epoch": 0.08195542774982027, + "grad_norm": 0.11036694049835205, + "learning_rate": 4.472843450479233e-05, + "loss": 0.8765, + "step": 570 + }, + { + "ce_ib": 6.825167655944824, + "ce_orig": 0.5682023763656616, + "epoch": 0.08195542774982027, + "kl_loss": 0.17675429582595825, + "loss_ib": 0.012250298634171486, + "step": 570 + }, + { + "ce_ib": 9.991971969604492, + "ce_orig": 0.8481072783470154, + "epoch": 0.08209920920201294, + "kl_loss": 0.17780299484729767, + "loss_ib": 0.013886136002838612, + "step": 571 + }, + { + "ce_ib": 11.411465644836426, + "ce_orig": 1.0281726121902466, + "epoch": 0.08224299065420561, + "kl_loss": 0.11431736499071121, + "loss_ib": 0.01142160128802061, + "step": 572 + }, + { + "ce_ib": 13.019828796386719, + "ce_orig": 1.1337321996688843, + "epoch": 0.08238677210639828, + "kl_loss": 0.2963365316390991, + "loss_ib": 0.021326741203665733, + "step": 573 + }, + { + "ce_ib": 8.582294464111328, + "ce_orig": 0.5892922878265381, + "epoch": 0.08253055355859094, + "kl_loss": 0.10176214575767517, + "loss_ib": 0.009379254654049873, + "step": 574 + }, + { + "epoch": 0.08267433501078361, + "grad_norm": 0.1296963393688202, + "learning_rate": 4.512779552715655e-05, + "loss": 0.9216, + "step": 575 + }, + { + "ce_ib": 8.016057014465332, + "ce_orig": 0.8886985778808594, + "epoch": 0.08267433501078361, + "kl_loss": 0.11895924806594849, + "loss_ib": 0.009955990128219128, + "step": 575 + }, + { + "ce_ib": 8.56618595123291, + "ce_orig": 0.9640144109725952, + "epoch": 0.08281811646297628, + "kl_loss": 0.10030095279216766, + "loss_ib": 0.009298141114413738, + "step": 576 + }, + { + "ce_ib": 11.222845077514648, + "ce_orig": 0.7690722346305847, + "epoch": 0.08296189791516895, + "kl_loss": 0.15022683143615723, + "loss_ib": 0.01312276441603899, + "step": 577 + }, + { + "ce_ib": 8.99817943572998, + "ce_orig": 0.8388349413871765, + "epoch": 0.08310567936736162, + "kl_loss": 0.1211586445569992, + "loss_ib": 0.01055702194571495, + "step": 578 + }, + { + "ce_ib": 8.70596694946289, + "ce_orig": 0.8935554623603821, + "epoch": 0.08324946081955428, + "kl_loss": 0.18791162967681885, + "loss_ib": 0.013748565688729286, + "step": 579 + }, + { + "epoch": 0.08339324227174695, + "grad_norm": 0.09863687306642532, + "learning_rate": 4.552715654952077e-05, + "loss": 0.8877, + "step": 580 + }, + { + "ce_ib": 9.075626373291016, + "ce_orig": 0.6573423743247986, + "epoch": 0.08339324227174695, + "kl_loss": 0.11877353489398956, + "loss_ib": 0.010476489551365376, + "step": 580 + }, + { + "ce_ib": 9.447694778442383, + "ce_orig": 0.8443095684051514, + "epoch": 0.08353702372393962, + "kl_loss": 0.10504357516765594, + "loss_ib": 0.009976026602089405, + "step": 581 + }, + { + "ce_ib": 10.347243309020996, + "ce_orig": 0.8741527795791626, + "epoch": 0.08368080517613227, + "kl_loss": 0.1390937864780426, + "loss_ib": 0.012128311209380627, + "step": 582 + }, + { + "ce_ib": 10.385614395141602, + "ce_orig": 0.9572657346725464, + "epoch": 0.08382458662832494, + "kl_loss": 0.11069104075431824, + "loss_ib": 0.010727359913289547, + "step": 583 + }, + { + "ce_ib": 9.45213508605957, + "ce_orig": 0.8818128705024719, + "epoch": 0.08396836808051761, + "kl_loss": 0.11259730160236359, + "loss_ib": 0.0103559335693717, + "step": 584 + }, + { + "epoch": 0.08411214953271028, + "grad_norm": 0.13552848994731903, + "learning_rate": 4.592651757188499e-05, + "loss": 0.8925, + "step": 585 + }, + { + "ce_ib": 11.096572875976562, + "ce_orig": 1.0407826900482178, + "epoch": 0.08411214953271028, + "kl_loss": 0.12498383969068527, + "loss_ib": 0.01179747935384512, + "step": 585 + }, + { + "ce_ib": 8.041762351989746, + "ce_orig": 0.5626028776168823, + "epoch": 0.08425593098490294, + "kl_loss": 0.10758772492408752, + "loss_ib": 0.009400267153978348, + "step": 586 + }, + { + "ce_ib": 9.080262184143066, + "ce_orig": 0.8330552577972412, + "epoch": 0.08439971243709561, + "kl_loss": 0.11845473945140839, + "loss_ib": 0.01046286802738905, + "step": 587 + }, + { + "ce_ib": 9.385934829711914, + "ce_orig": 0.9699596166610718, + "epoch": 0.08454349388928828, + "kl_loss": 0.11468707025051117, + "loss_ib": 0.010427321307361126, + "step": 588 + }, + { + "ce_ib": 8.70030689239502, + "ce_orig": 0.8794450163841248, + "epoch": 0.08468727534148095, + "kl_loss": 0.09058257937431335, + "loss_ib": 0.008879282511770725, + "step": 589 + }, + { + "epoch": 0.08483105679367361, + "grad_norm": 0.12226880341768265, + "learning_rate": 4.632587859424921e-05, + "loss": 0.8537, + "step": 590 + }, + { + "ce_ib": 11.954180717468262, + "ce_orig": 0.6831358075141907, + "epoch": 0.08483105679367361, + "kl_loss": 0.11714005470275879, + "loss_ib": 0.011834092438220978, + "step": 590 + }, + { + "ce_ib": 11.130764961242676, + "ce_orig": 1.1798628568649292, + "epoch": 0.08497483824586628, + "kl_loss": 0.11755703389644623, + "loss_ib": 0.01144323404878378, + "step": 591 + }, + { + "ce_ib": 9.00490665435791, + "ce_orig": 1.0365432500839233, + "epoch": 0.08511861969805895, + "kl_loss": 0.10278277099132538, + "loss_ib": 0.009641592390835285, + "step": 592 + }, + { + "ce_ib": 10.016988754272461, + "ce_orig": 0.7779159545898438, + "epoch": 0.08526240115025162, + "kl_loss": 0.1287814825773239, + "loss_ib": 0.01144756842404604, + "step": 593 + }, + { + "ce_ib": 8.835247039794922, + "ce_orig": 0.7125930190086365, + "epoch": 0.08540618260244429, + "kl_loss": 0.14099836349487305, + "loss_ib": 0.011467541567981243, + "step": 594 + }, + { + "epoch": 0.08554996405463695, + "grad_norm": 0.11100345104932785, + "learning_rate": 4.672523961661342e-05, + "loss": 0.8406, + "step": 595 + }, + { + "ce_ib": 11.02431583404541, + "ce_orig": 0.7663914561271667, + "epoch": 0.08554996405463695, + "kl_loss": 0.0822073221206665, + "loss_ib": 0.009622524492442608, + "step": 595 + }, + { + "ce_ib": 10.236846923828125, + "ce_orig": 0.5589779615402222, + "epoch": 0.08569374550682962, + "kl_loss": 0.1600915789604187, + "loss_ib": 0.013123002834618092, + "step": 596 + }, + { + "ce_ib": 8.800088882446289, + "ce_orig": 0.6275121569633484, + "epoch": 0.08583752695902229, + "kl_loss": 0.12892715632915497, + "loss_ib": 0.010846401564776897, + "step": 597 + }, + { + "ce_ib": 8.204318046569824, + "ce_orig": 0.6452017426490784, + "epoch": 0.08598130841121496, + "kl_loss": 0.10103371739387512, + "loss_ib": 0.00915384478867054, + "step": 598 + }, + { + "ce_ib": 8.391890525817871, + "ce_orig": 0.7417363524436951, + "epoch": 0.08612508986340763, + "kl_loss": 0.13140645623207092, + "loss_ib": 0.010766267776489258, + "step": 599 + }, + { + "epoch": 0.08626887131560029, + "grad_norm": 0.132738396525383, + "learning_rate": 4.712460063897764e-05, + "loss": 0.8082, + "step": 600 + }, + { + "ce_ib": 10.40978717803955, + "ce_orig": 0.8029245734214783, + "epoch": 0.08626887131560029, + "kl_loss": 0.09170211851596832, + "loss_ib": 0.009789999574422836, + "step": 600 + }, + { + "ce_ib": 8.616018295288086, + "ce_orig": 1.1102370023727417, + "epoch": 0.08641265276779296, + "kl_loss": 0.11172410845756531, + "loss_ib": 0.009894214570522308, + "step": 601 + }, + { + "ce_ib": 10.94422435760498, + "ce_orig": 0.9171683192253113, + "epoch": 0.08655643421998563, + "kl_loss": 0.13117164373397827, + "loss_ib": 0.01203069370239973, + "step": 602 + }, + { + "ce_ib": 11.655625343322754, + "ce_orig": 1.1917483806610107, + "epoch": 0.08670021567217828, + "kl_loss": 0.13371366262435913, + "loss_ib": 0.012513495981693268, + "step": 603 + }, + { + "ce_ib": 7.700047969818115, + "ce_orig": 0.6811072826385498, + "epoch": 0.08684399712437095, + "kl_loss": 0.13317114114761353, + "loss_ib": 0.010508581064641476, + "step": 604 + }, + { + "epoch": 0.08698777857656362, + "grad_norm": 0.10468501597642899, + "learning_rate": 4.752396166134185e-05, + "loss": 0.8558, + "step": 605 + }, + { + "ce_ib": 9.883849143981934, + "ce_orig": 0.9263736605644226, + "epoch": 0.08698777857656362, + "kl_loss": 0.21517115831375122, + "loss_ib": 0.015700481832027435, + "step": 605 + }, + { + "ce_ib": 9.480666160583496, + "ce_orig": 1.0162242650985718, + "epoch": 0.08713156002875629, + "kl_loss": 0.11819291114807129, + "loss_ib": 0.010649978183209896, + "step": 606 + }, + { + "ce_ib": 9.99695873260498, + "ce_orig": 0.7871977090835571, + "epoch": 0.08727534148094895, + "kl_loss": 0.21086569130420685, + "loss_ib": 0.015541763976216316, + "step": 607 + }, + { + "ce_ib": 10.787432670593262, + "ce_orig": 1.3771346807479858, + "epoch": 0.08741912293314162, + "kl_loss": 0.11459565162658691, + "loss_ib": 0.01112349983304739, + "step": 608 + }, + { + "ce_ib": 7.926780700683594, + "ce_orig": 0.6084997057914734, + "epoch": 0.08756290438533429, + "kl_loss": 0.11265780031681061, + "loss_ib": 0.009596280753612518, + "step": 609 + }, + { + "epoch": 0.08770668583752696, + "grad_norm": 0.1861116737127304, + "learning_rate": 4.792332268370607e-05, + "loss": 0.841, + "step": 610 + }, + { + "ce_ib": 11.350239753723145, + "ce_orig": 1.3010127544403076, + "epoch": 0.08770668583752696, + "kl_loss": 0.1271829605102539, + "loss_ib": 0.012034268118441105, + "step": 610 + }, + { + "ce_ib": 9.161566734313965, + "ce_orig": 1.089638590812683, + "epoch": 0.08785046728971962, + "kl_loss": 0.1308199018239975, + "loss_ib": 0.0111217787489295, + "step": 611 + }, + { + "ce_ib": 9.268600463867188, + "ce_orig": 0.6986385583877563, + "epoch": 0.08799424874191229, + "kl_loss": 0.1047590896487236, + "loss_ib": 0.00987225491553545, + "step": 612 + }, + { + "ce_ib": 9.51627254486084, + "ce_orig": 0.7158625721931458, + "epoch": 0.08813803019410496, + "kl_loss": 0.13369254767894745, + "loss_ib": 0.011442764662206173, + "step": 613 + }, + { + "ce_ib": 8.772662162780762, + "ce_orig": 1.0912283658981323, + "epoch": 0.08828181164629763, + "kl_loss": 0.12375178933143616, + "loss_ib": 0.010573920793831348, + "step": 614 + }, + { + "epoch": 0.0884255930984903, + "grad_norm": 0.11201420426368713, + "learning_rate": 4.832268370607029e-05, + "loss": 0.9532, + "step": 615 + }, + { + "ce_ib": 9.231523513793945, + "ce_orig": 0.9239461421966553, + "epoch": 0.0884255930984903, + "kl_loss": 0.11232542991638184, + "loss_ib": 0.010232033208012581, + "step": 615 + }, + { + "ce_ib": 10.984260559082031, + "ce_orig": 1.0705440044403076, + "epoch": 0.08856937455068296, + "kl_loss": 0.14025995135307312, + "loss_ib": 0.012505128048360348, + "step": 616 + }, + { + "ce_ib": 9.729615211486816, + "ce_orig": 0.7350847721099854, + "epoch": 0.08871315600287563, + "kl_loss": 0.1346513330936432, + "loss_ib": 0.011597374454140663, + "step": 617 + }, + { + "ce_ib": 10.69382381439209, + "ce_orig": 0.7845515608787537, + "epoch": 0.0888569374550683, + "kl_loss": 0.16862596571445465, + "loss_ib": 0.013778209686279297, + "step": 618 + }, + { + "ce_ib": 9.848870277404785, + "ce_orig": 0.9884905815124512, + "epoch": 0.08900071890726097, + "kl_loss": 0.13911129534244537, + "loss_ib": 0.011880000121891499, + "step": 619 + }, + { + "epoch": 0.08914450035945364, + "grad_norm": 0.12046127766370773, + "learning_rate": 4.872204472843451e-05, + "loss": 0.8906, + "step": 620 + }, + { + "ce_ib": 9.231974601745605, + "ce_orig": 0.6775012612342834, + "epoch": 0.08914450035945364, + "kl_loss": 0.17472121119499207, + "loss_ib": 0.013352048583328724, + "step": 620 + }, + { + "ce_ib": 11.804699897766113, + "ce_orig": 1.529233694076538, + "epoch": 0.0892882818116463, + "kl_loss": 0.12259548157453537, + "loss_ib": 0.01203212421387434, + "step": 621 + }, + { + "ce_ib": 7.200037002563477, + "ce_orig": 0.4418286383152008, + "epoch": 0.08943206326383897, + "kl_loss": 0.1409044861793518, + "loss_ib": 0.010645243339240551, + "step": 622 + }, + { + "ce_ib": 13.076546669006348, + "ce_orig": 1.375049114227295, + "epoch": 0.08957584471603164, + "kl_loss": 0.13703083992004395, + "loss_ib": 0.01338981557637453, + "step": 623 + }, + { + "ce_ib": 7.833815574645996, + "ce_orig": 0.5396220684051514, + "epoch": 0.08971962616822429, + "kl_loss": 0.11749826371669769, + "loss_ib": 0.009791821241378784, + "step": 624 + }, + { + "epoch": 0.08986340762041696, + "grad_norm": 0.1378038227558136, + "learning_rate": 4.912140575079873e-05, + "loss": 0.9042, + "step": 625 + }, + { + "ce_ib": 8.829384803771973, + "ce_orig": 0.9549583792686462, + "epoch": 0.08986340762041696, + "kl_loss": 0.08257201313972473, + "loss_ib": 0.008543292991816998, + "step": 625 + }, + { + "ce_ib": 10.215102195739746, + "ce_orig": 0.6707859039306641, + "epoch": 0.09000718907260963, + "kl_loss": 0.1320197582244873, + "loss_ib": 0.011708538979291916, + "step": 626 + }, + { + "ce_ib": 12.020796775817871, + "ce_orig": 1.0823876857757568, + "epoch": 0.0901509705248023, + "kl_loss": 0.12088725715875626, + "loss_ib": 0.012054760940372944, + "step": 627 + }, + { + "ce_ib": 6.340538501739502, + "ce_orig": 0.5447245836257935, + "epoch": 0.09029475197699496, + "kl_loss": 0.08977651596069336, + "loss_ib": 0.0076590958051383495, + "step": 628 + }, + { + "ce_ib": 10.423829078674316, + "ce_orig": 1.2043941020965576, + "epoch": 0.09043853342918763, + "kl_loss": 0.08985006809234619, + "loss_ib": 0.009704417549073696, + "step": 629 + }, + { + "epoch": 0.0905823148813803, + "grad_norm": 0.13364273309707642, + "learning_rate": 4.952076677316294e-05, + "loss": 0.9167, + "step": 630 + }, + { + "ce_ib": 10.545193672180176, + "ce_orig": 1.0608717203140259, + "epoch": 0.0905823148813803, + "kl_loss": 0.10969355702400208, + "loss_ib": 0.01075727492570877, + "step": 630 + }, + { + "ce_ib": 6.636918544769287, + "ce_orig": 0.5482816696166992, + "epoch": 0.09072609633357297, + "kl_loss": 0.14597171545028687, + "loss_ib": 0.010617044754326344, + "step": 631 + }, + { + "ce_ib": 7.407113075256348, + "ce_orig": 0.6304860711097717, + "epoch": 0.09086987778576563, + "kl_loss": 0.1305876523256302, + "loss_ib": 0.010232939384877682, + "step": 632 + }, + { + "ce_ib": 9.818191528320312, + "ce_orig": 0.7274070978164673, + "epoch": 0.0910136592379583, + "kl_loss": 0.10508064925670624, + "loss_ib": 0.01016312837600708, + "step": 633 + }, + { + "ce_ib": 7.220940589904785, + "ce_orig": 0.6228238940238953, + "epoch": 0.09115744069015097, + "kl_loss": 0.11822935938835144, + "loss_ib": 0.009521937929093838, + "step": 634 + }, + { + "epoch": 0.09130122214234364, + "grad_norm": 0.1587258279323578, + "learning_rate": 4.992012779552716e-05, + "loss": 0.8734, + "step": 635 + }, + { + "ce_ib": 11.307782173156738, + "ce_orig": 0.93541419506073, + "epoch": 0.09130122214234364, + "kl_loss": 0.10773300379514694, + "loss_ib": 0.011040541343390942, + "step": 635 + }, + { + "ce_ib": 12.55269718170166, + "ce_orig": 1.1095880270004272, + "epoch": 0.0914450035945363, + "kl_loss": 0.12884891033172607, + "loss_ib": 0.01271879393607378, + "step": 636 + }, + { + "ce_ib": 10.303481101989746, + "ce_orig": 1.0926016569137573, + "epoch": 0.09158878504672897, + "kl_loss": 0.09748281538486481, + "loss_ib": 0.010025881230831146, + "step": 637 + }, + { + "ce_ib": 10.083890914916992, + "ce_orig": 1.1573371887207031, + "epoch": 0.09173256649892164, + "kl_loss": 0.1186913400888443, + "loss_ib": 0.010976512916386127, + "step": 638 + }, + { + "ce_ib": 8.328096389770508, + "ce_orig": 0.8586428165435791, + "epoch": 0.09187634795111431, + "kl_loss": 0.11891971528530121, + "loss_ib": 0.010110034607350826, + "step": 639 + }, + { + "epoch": 0.09202012940330698, + "grad_norm": 0.1216014102101326, + "learning_rate": 4.999999518105881e-05, + "loss": 0.9069, + "step": 640 + }, + { + "ce_ib": 12.23937702178955, + "ce_orig": 1.0276381969451904, + "epoch": 0.09202012940330698, + "kl_loss": 0.1557500809431076, + "loss_ib": 0.013907193206250668, + "step": 640 + }, + { + "ce_ib": 10.701261520385742, + "ce_orig": 1.2324309349060059, + "epoch": 0.09216391085549965, + "kl_loss": 0.12038681656122208, + "loss_ib": 0.011369972489774227, + "step": 641 + }, + { + "ce_ib": 10.260677337646484, + "ce_orig": 0.7305514216423035, + "epoch": 0.09230769230769231, + "kl_loss": 0.1621103733778, + "loss_ib": 0.013235858641564846, + "step": 642 + }, + { + "ce_ib": 8.343496322631836, + "ce_orig": 0.9428196549415588, + "epoch": 0.09245147375988498, + "kl_loss": 0.1055351197719574, + "loss_ib": 0.009448505006730556, + "step": 643 + }, + { + "ce_ib": 11.247652053833008, + "ce_orig": 1.0904582738876343, + "epoch": 0.09259525521207765, + "kl_loss": 0.13140398263931274, + "loss_ib": 0.012194025330245495, + "step": 644 + }, + { + "epoch": 0.0927390366642703, + "grad_norm": 0.11910561472177505, + "learning_rate": 4.9999975604113406e-05, + "loss": 0.9693, + "step": 645 + }, + { + "ce_ib": 6.632207870483398, + "ce_orig": 0.6067308783531189, + "epoch": 0.0927390366642703, + "kl_loss": 0.09612976759672165, + "loss_ib": 0.008122592233121395, + "step": 645 + }, + { + "ce_ib": 8.443811416625977, + "ce_orig": 0.5988494157791138, + "epoch": 0.09288281811646297, + "kl_loss": 0.09652799367904663, + "loss_ib": 0.009048305451869965, + "step": 646 + }, + { + "ce_ib": 4.723007678985596, + "ce_orig": 0.2960648238658905, + "epoch": 0.09302659956865564, + "kl_loss": 0.17113137245178223, + "loss_ib": 0.010918072424829006, + "step": 647 + }, + { + "ce_ib": 9.759404182434082, + "ce_orig": 1.0581693649291992, + "epoch": 0.0931703810208483, + "kl_loss": 0.1296691596508026, + "loss_ib": 0.011363159865140915, + "step": 648 + }, + { + "ce_ib": 10.742829322814941, + "ce_orig": 1.3071558475494385, + "epoch": 0.09331416247304097, + "kl_loss": 0.1319178342819214, + "loss_ib": 0.011967306025326252, + "step": 649 + }, + { + "epoch": 0.09345794392523364, + "grad_norm": 0.1334671825170517, + "learning_rate": 4.999994096799175e-05, + "loss": 0.8334, + "step": 650 + }, + { + "ce_ib": 8.407774925231934, + "ce_orig": 1.0564472675323486, + "epoch": 0.09345794392523364, + "kl_loss": 0.10574951022863388, + "loss_ib": 0.009491363540291786, + "step": 650 + }, + { + "ce_ib": 8.83208179473877, + "ce_orig": 1.1277897357940674, + "epoch": 0.09360172537742631, + "kl_loss": 0.11387504637241364, + "loss_ib": 0.010109792463481426, + "step": 651 + }, + { + "ce_ib": 9.621241569519043, + "ce_orig": 1.163621187210083, + "epoch": 0.09374550682961898, + "kl_loss": 0.15881387889385223, + "loss_ib": 0.012751313857734203, + "step": 652 + }, + { + "ce_ib": 8.543654441833496, + "ce_orig": 1.220476508140564, + "epoch": 0.09388928828181164, + "kl_loss": 0.13807114958763123, + "loss_ib": 0.011175385676324368, + "step": 653 + }, + { + "ce_ib": 7.747812271118164, + "ce_orig": 0.603781521320343, + "epoch": 0.09403306973400431, + "kl_loss": 0.14266037940979004, + "loss_ib": 0.011006924323737621, + "step": 654 + }, + { + "epoch": 0.09417685118619698, + "grad_norm": 0.10387395322322845, + "learning_rate": 4.99998912727147e-05, + "loss": 0.8407, + "step": 655 + }, + { + "ce_ib": 8.64326000213623, + "ce_orig": 0.6705234050750732, + "epoch": 0.09417685118619698, + "kl_loss": 0.08966037631034851, + "loss_ib": 0.008804649114608765, + "step": 655 + }, + { + "ce_ib": 7.918277263641357, + "ce_orig": 0.4241083264350891, + "epoch": 0.09432063263838965, + "kl_loss": 0.10727507621049881, + "loss_ib": 0.00932289194315672, + "step": 656 + }, + { + "ce_ib": 9.530045509338379, + "ce_orig": 1.0509976148605347, + "epoch": 0.09446441409058232, + "kl_loss": 0.11631051450967789, + "loss_ib": 0.010580549016594887, + "step": 657 + }, + { + "ce_ib": 9.818882942199707, + "ce_orig": 1.0639723539352417, + "epoch": 0.09460819554277498, + "kl_loss": 0.09178834408521652, + "loss_ib": 0.009498858824372292, + "step": 658 + }, + { + "ce_ib": 10.010890007019043, + "ce_orig": 0.93562251329422, + "epoch": 0.09475197699496765, + "kl_loss": 0.11310561001300812, + "loss_ib": 0.010660725645720959, + "step": 659 + }, + { + "epoch": 0.09489575844716032, + "grad_norm": 0.11378470063209534, + "learning_rate": 4.9999826518312206e-05, + "loss": 1.0042, + "step": 660 + }, + { + "ce_ib": 6.337740898132324, + "ce_orig": 0.6393367052078247, + "epoch": 0.09489575844716032, + "kl_loss": 0.20546941459178925, + "loss_ib": 0.013442340306937695, + "step": 660 + }, + { + "ce_ib": 9.358152389526367, + "ce_orig": 0.6469663977622986, + "epoch": 0.09503953989935299, + "kl_loss": 0.16981008648872375, + "loss_ib": 0.013169581070542336, + "step": 661 + }, + { + "ce_ib": 12.261252403259277, + "ce_orig": 1.3894939422607422, + "epoch": 0.09518332135154565, + "kl_loss": 0.12019184231758118, + "loss_ib": 0.012140218168497086, + "step": 662 + }, + { + "ce_ib": 7.736466407775879, + "ce_orig": 0.7755306363105774, + "epoch": 0.09532710280373832, + "kl_loss": 0.12816421687602997, + "loss_ib": 0.010276444256305695, + "step": 663 + }, + { + "ce_ib": 7.187182426452637, + "ce_orig": 0.5523344874382019, + "epoch": 0.09547088425593099, + "kl_loss": 0.11785402148962021, + "loss_ib": 0.009486292488873005, + "step": 664 + }, + { + "epoch": 0.09561466570812366, + "grad_norm": 0.1056251972913742, + "learning_rate": 4.999974670482325e-05, + "loss": 0.9002, + "step": 665 + }, + { + "ce_ib": 8.833344459533691, + "ce_orig": 0.6083950400352478, + "epoch": 0.09561466570812366, + "kl_loss": 0.13750138878822327, + "loss_ib": 0.011291741393506527, + "step": 665 + }, + { + "ce_ib": 6.192565441131592, + "ce_orig": 0.44607073068618774, + "epoch": 0.09575844716031631, + "kl_loss": 0.17101138830184937, + "loss_ib": 0.011646851897239685, + "step": 666 + }, + { + "ce_ib": 8.367379188537598, + "ce_orig": 0.6870203018188477, + "epoch": 0.09590222861250898, + "kl_loss": 0.13014401495456696, + "loss_ib": 0.01069089025259018, + "step": 667 + }, + { + "ce_ib": 9.423290252685547, + "ce_orig": 0.8573846817016602, + "epoch": 0.09604601006470165, + "kl_loss": 0.09922461211681366, + "loss_ib": 0.009672875516116619, + "step": 668 + }, + { + "ce_ib": 7.007211208343506, + "ce_orig": 0.5984476208686829, + "epoch": 0.09618979151689432, + "kl_loss": 0.07789325714111328, + "loss_ib": 0.007398268673568964, + "step": 669 + }, + { + "epoch": 0.09633357296908698, + "grad_norm": 0.12372449785470963, + "learning_rate": 4.999965183229593e-05, + "loss": 0.7418, + "step": 670 + }, + { + "ce_ib": 9.530304908752441, + "ce_orig": 1.1927051544189453, + "epoch": 0.09633357296908698, + "kl_loss": 0.1058434247970581, + "loss_ib": 0.01005732361227274, + "step": 670 + }, + { + "ce_ib": 7.50916862487793, + "ce_orig": 0.706110954284668, + "epoch": 0.09647735442127965, + "kl_loss": 0.0839371532201767, + "loss_ib": 0.00795144122093916, + "step": 671 + }, + { + "ce_ib": 9.66496467590332, + "ce_orig": 1.1682391166687012, + "epoch": 0.09662113587347232, + "kl_loss": 0.26062270998954773, + "loss_ib": 0.017863618209958076, + "step": 672 + }, + { + "ce_ib": 13.103985786437988, + "ce_orig": 1.5685844421386719, + "epoch": 0.09676491732566499, + "kl_loss": 0.14245596528053284, + "loss_ib": 0.013674790970981121, + "step": 673 + }, + { + "ce_ib": 8.4186429977417, + "ce_orig": 0.8082234859466553, + "epoch": 0.09690869877785765, + "kl_loss": 0.15234991908073425, + "loss_ib": 0.011826817877590656, + "step": 674 + }, + { + "epoch": 0.09705248023005032, + "grad_norm": 0.11889223009347916, + "learning_rate": 4.9999541900787386e-05, + "loss": 0.9837, + "step": 675 + }, + { + "ce_ib": 10.21523666381836, + "ce_orig": 0.7224376797676086, + "epoch": 0.09705248023005032, + "kl_loss": 0.11821460723876953, + "loss_ib": 0.01101834885776043, + "step": 675 + }, + { + "ce_ib": 11.221392631530762, + "ce_orig": 1.364400863647461, + "epoch": 0.09719626168224299, + "kl_loss": 0.12399769574403763, + "loss_ib": 0.01181058119982481, + "step": 676 + }, + { + "ce_ib": 8.069270133972168, + "ce_orig": 0.6916780471801758, + "epoch": 0.09734004313443566, + "kl_loss": 0.09095580875873566, + "loss_ib": 0.00858242530375719, + "step": 677 + }, + { + "ce_ib": 8.115494728088379, + "ce_orig": 0.5971089601516724, + "epoch": 0.09748382458662833, + "kl_loss": 0.16152238845825195, + "loss_ib": 0.012133866548538208, + "step": 678 + }, + { + "ce_ib": 9.717530250549316, + "ce_orig": 0.9812992215156555, + "epoch": 0.097627606038821, + "kl_loss": 0.1159181147813797, + "loss_ib": 0.010654671117663383, + "step": 679 + }, + { + "epoch": 0.09777138749101366, + "grad_norm": 0.1076693907380104, + "learning_rate": 4.999941691036383e-05, + "loss": 0.7919, + "step": 680 + }, + { + "ce_ib": 6.14840030670166, + "ce_orig": 0.4094475209712982, + "epoch": 0.09777138749101366, + "kl_loss": 0.21727138757705688, + "loss_ib": 0.013937770389020443, + "step": 680 + }, + { + "ce_ib": 5.300361156463623, + "ce_orig": 0.40322771668434143, + "epoch": 0.09791516894320633, + "kl_loss": 0.10324863344430923, + "loss_ib": 0.007812611758708954, + "step": 681 + }, + { + "ce_ib": 10.288033485412598, + "ce_orig": 0.9181535840034485, + "epoch": 0.098058950395399, + "kl_loss": 0.13683810830116272, + "loss_ib": 0.01198592223227024, + "step": 682 + }, + { + "ce_ib": 6.534595489501953, + "ce_orig": 0.588758647441864, + "epoch": 0.09820273184759166, + "kl_loss": 0.12447066605091095, + "loss_ib": 0.00949083175510168, + "step": 683 + }, + { + "ce_ib": 6.564389705657959, + "ce_orig": 0.6715453863143921, + "epoch": 0.09834651329978433, + "kl_loss": 0.09237797558307648, + "loss_ib": 0.007901093922555447, + "step": 684 + }, + { + "epoch": 0.098490294751977, + "grad_norm": 0.13819435238838196, + "learning_rate": 4.999927686110056e-05, + "loss": 0.8023, + "step": 685 + }, + { + "ce_ib": 7.38820219039917, + "ce_orig": 1.00199294090271, + "epoch": 0.098490294751977, + "kl_loss": 0.09955717623233795, + "loss_ib": 0.008671960793435574, + "step": 685 + }, + { + "ce_ib": 8.343016624450684, + "ce_orig": 0.9671911001205444, + "epoch": 0.09863407620416967, + "kl_loss": 0.15917587280273438, + "loss_ib": 0.012130302377045155, + "step": 686 + }, + { + "ce_ib": 7.283801078796387, + "ce_orig": 0.8381296992301941, + "epoch": 0.09877785765636234, + "kl_loss": 0.08582788705825806, + "loss_ib": 0.007933295331895351, + "step": 687 + }, + { + "ce_ib": 7.987790107727051, + "ce_orig": 0.6048274636268616, + "epoch": 0.09892163910855499, + "kl_loss": 0.16408729553222656, + "loss_ib": 0.012198260053992271, + "step": 688 + }, + { + "ce_ib": 13.21379280090332, + "ce_orig": 1.2273222208023071, + "epoch": 0.09906542056074766, + "kl_loss": 0.13596788048744202, + "loss_ib": 0.013405290432274342, + "step": 689 + }, + { + "epoch": 0.09920920201294033, + "grad_norm": 0.10889366269111633, + "learning_rate": 4.999912175308195e-05, + "loss": 0.8662, + "step": 690 + }, + { + "ce_ib": 9.05671501159668, + "ce_orig": 1.0359764099121094, + "epoch": 0.09920920201294033, + "kl_loss": 0.09794212877750397, + "loss_ib": 0.00942546408623457, + "step": 690 + }, + { + "ce_ib": 6.449003219604492, + "ce_orig": 0.5917060971260071, + "epoch": 0.099352983465133, + "kl_loss": 0.1164952963590622, + "loss_ib": 0.009049266576766968, + "step": 691 + }, + { + "ce_ib": 8.728141784667969, + "ce_orig": 1.2859077453613281, + "epoch": 0.09949676491732566, + "kl_loss": 0.09028451889753342, + "loss_ib": 0.008878297172486782, + "step": 692 + }, + { + "ce_ib": 11.696812629699707, + "ce_orig": 1.4011276960372925, + "epoch": 0.09964054636951833, + "kl_loss": 0.0865880697965622, + "loss_ib": 0.010177809745073318, + "step": 693 + }, + { + "ce_ib": 9.487892150878906, + "ce_orig": 0.8238059282302856, + "epoch": 0.099784327821711, + "kl_loss": 0.09209860861301422, + "loss_ib": 0.009348876774311066, + "step": 694 + }, + { + "epoch": 0.09992810927390366, + "grad_norm": 0.1301174759864807, + "learning_rate": 4.999895158640141e-05, + "loss": 0.9068, + "step": 695 + }, + { + "ce_ib": 6.869251728057861, + "ce_orig": 0.7711926102638245, + "epoch": 0.09992810927390366, + "kl_loss": 0.17371296882629395, + "loss_ib": 0.012120273895561695, + "step": 695 + }, + { + "ce_ib": 9.267475128173828, + "ce_orig": 0.5941876173019409, + "epoch": 0.10007189072609633, + "kl_loss": 0.1114778146147728, + "loss_ib": 0.010207627899944782, + "step": 696 + }, + { + "ce_ib": 7.718648910522461, + "ce_orig": 0.9436888694763184, + "epoch": 0.100215672178289, + "kl_loss": 0.10313811898231506, + "loss_ib": 0.009016230702400208, + "step": 697 + }, + { + "ce_ib": 7.209829807281494, + "ce_orig": 0.6572908759117126, + "epoch": 0.10035945363048167, + "kl_loss": 0.11239567399024963, + "loss_ib": 0.00922469887882471, + "step": 698 + }, + { + "ce_ib": 8.946769714355469, + "ce_orig": 0.8208485841751099, + "epoch": 0.10050323508267434, + "kl_loss": 0.12747088074684143, + "loss_ib": 0.010846929624676704, + "step": 699 + }, + { + "epoch": 0.100647016534867, + "grad_norm": 0.11714436113834381, + "learning_rate": 4.999876636116145e-05, + "loss": 0.8387, + "step": 700 + }, + { + "ce_ib": 9.900582313537598, + "ce_orig": 0.894629180431366, + "epoch": 0.100647016534867, + "kl_loss": 0.13778752088546753, + "loss_ib": 0.011839667335152626, + "step": 700 + }, + { + "ce_ib": 8.739860534667969, + "ce_orig": 0.9482055902481079, + "epoch": 0.10079079798705967, + "kl_loss": 0.10394522547721863, + "loss_ib": 0.009567191824316978, + "step": 701 + }, + { + "ce_ib": 7.199070930480957, + "ce_orig": 0.8619567155838013, + "epoch": 0.10093457943925234, + "kl_loss": 0.13609068095684052, + "loss_ib": 0.010404069907963276, + "step": 702 + }, + { + "ce_ib": 7.6442036628723145, + "ce_orig": 0.8066157102584839, + "epoch": 0.10107836089144501, + "kl_loss": 0.09935668110847473, + "loss_ib": 0.00878993608057499, + "step": 703 + }, + { + "ce_ib": 7.810359001159668, + "ce_orig": 0.8702456951141357, + "epoch": 0.10122214234363767, + "kl_loss": 0.0937383696436882, + "loss_ib": 0.008592098020017147, + "step": 704 + }, + { + "epoch": 0.10136592379583034, + "grad_norm": 0.11369116604328156, + "learning_rate": 4.9998566077473645e-05, + "loss": 0.84, + "step": 705 + }, + { + "ce_ib": 8.6611909866333, + "ce_orig": 0.8015434145927429, + "epoch": 0.10136592379583034, + "kl_loss": 0.1313619613647461, + "loss_ib": 0.010898693464696407, + "step": 705 + }, + { + "ce_ib": 8.346274375915527, + "ce_orig": 0.8527460694313049, + "epoch": 0.10150970524802301, + "kl_loss": 0.09497249126434326, + "loss_ib": 0.008921761997044086, + "step": 706 + }, + { + "ce_ib": 11.167051315307617, + "ce_orig": 1.280219554901123, + "epoch": 0.10165348670021568, + "kl_loss": 0.13609477877616882, + "loss_ib": 0.012388264760375023, + "step": 707 + }, + { + "ce_ib": 9.243364334106445, + "ce_orig": 1.1910686492919922, + "epoch": 0.10179726815240835, + "kl_loss": 0.10804169625043869, + "loss_ib": 0.010023767128586769, + "step": 708 + }, + { + "ce_ib": 6.902951240539551, + "ce_orig": 0.6421184539794922, + "epoch": 0.101941049604601, + "kl_loss": 0.10724660754203796, + "loss_ib": 0.008813805878162384, + "step": 709 + }, + { + "epoch": 0.10208483105679367, + "grad_norm": 0.10316238552331924, + "learning_rate": 4.9998350735458646e-05, + "loss": 0.8424, + "step": 710 + }, + { + "ce_ib": 8.496487617492676, + "ce_orig": 0.8121188879013062, + "epoch": 0.10208483105679367, + "kl_loss": 0.1422605663537979, + "loss_ib": 0.01136127207428217, + "step": 710 + }, + { + "ce_ib": 7.606898784637451, + "ce_orig": 0.4156748056411743, + "epoch": 0.10222861250898634, + "kl_loss": 0.12221111357212067, + "loss_ib": 0.009914005175232887, + "step": 711 + }, + { + "ce_ib": 8.244635581970215, + "ce_orig": 0.9483067393302917, + "epoch": 0.102372393961179, + "kl_loss": 0.1223490834236145, + "loss_ib": 0.010239771567285061, + "step": 712 + }, + { + "ce_ib": 8.075944900512695, + "ce_orig": 0.9652693867683411, + "epoch": 0.10251617541337167, + "kl_loss": 0.11330495774745941, + "loss_ib": 0.009703220799565315, + "step": 713 + }, + { + "ce_ib": 7.7777581214904785, + "ce_orig": 0.6340931057929993, + "epoch": 0.10265995686556434, + "kl_loss": 0.11384841799736023, + "loss_ib": 0.009581300429999828, + "step": 714 + }, + { + "epoch": 0.102803738317757, + "grad_norm": 0.11466086655855179, + "learning_rate": 4.999812033524616e-05, + "loss": 0.8816, + "step": 715 + }, + { + "ce_ib": 7.194620609283447, + "ce_orig": 0.5225186944007874, + "epoch": 0.102803738317757, + "kl_loss": 0.16458478569984436, + "loss_ib": 0.011826549656689167, + "step": 715 + }, + { + "ce_ib": 7.576101303100586, + "ce_orig": 0.7184677124023438, + "epoch": 0.10294751976994967, + "kl_loss": 0.17702654004096985, + "loss_ib": 0.012639378197491169, + "step": 716 + }, + { + "ce_ib": 10.157630920410156, + "ce_orig": 0.9195736646652222, + "epoch": 0.10309130122214234, + "kl_loss": 0.11237962543964386, + "loss_ib": 0.010697796940803528, + "step": 717 + }, + { + "ce_ib": 6.4610395431518555, + "ce_orig": 0.2969205975532532, + "epoch": 0.10323508267433501, + "kl_loss": 0.3771587312221527, + "loss_ib": 0.02208845689892769, + "step": 718 + }, + { + "ce_ib": 10.73964786529541, + "ce_orig": 1.206357479095459, + "epoch": 0.10337886412652768, + "kl_loss": 0.11494225263595581, + "loss_ib": 0.011116936802864075, + "step": 719 + }, + { + "epoch": 0.10352264557872035, + "grad_norm": 0.13988007605075836, + "learning_rate": 4.9997874876974966e-05, + "loss": 0.9343, + "step": 720 + }, + { + "ce_ib": 8.02586841583252, + "ce_orig": 0.5530202984809875, + "epoch": 0.10352264557872035, + "kl_loss": 0.15824423730373383, + "loss_ib": 0.011925145983695984, + "step": 720 + }, + { + "ce_ib": 7.987276077270508, + "ce_orig": 0.8079979419708252, + "epoch": 0.10366642703091301, + "kl_loss": 0.09849292039871216, + "loss_ib": 0.008918284438550472, + "step": 721 + }, + { + "ce_ib": 7.2635416984558105, + "ce_orig": 1.058225154876709, + "epoch": 0.10381020848310568, + "kl_loss": 0.07580313086509705, + "loss_ib": 0.007421927060931921, + "step": 722 + }, + { + "ce_ib": 11.01719856262207, + "ce_orig": 1.5954816341400146, + "epoch": 0.10395398993529835, + "kl_loss": 0.1211317703127861, + "loss_ib": 0.011565187945961952, + "step": 723 + }, + { + "ce_ib": 7.661947727203369, + "ce_orig": 0.8771336078643799, + "epoch": 0.10409777138749102, + "kl_loss": 0.17039231956005096, + "loss_ib": 0.012350589968264103, + "step": 724 + }, + { + "epoch": 0.10424155283968368, + "grad_norm": 0.11809071898460388, + "learning_rate": 4.9997614360792934e-05, + "loss": 0.9575, + "step": 725 + }, + { + "ce_ib": 9.20604133605957, + "ce_orig": 0.8485362529754639, + "epoch": 0.10424155283968368, + "kl_loss": 0.10916964709758759, + "loss_ib": 0.010061502456665039, + "step": 725 + }, + { + "ce_ib": 7.615476608276367, + "ce_orig": 0.9565079808235168, + "epoch": 0.10438533429187635, + "kl_loss": 0.09149608016014099, + "loss_ib": 0.008382542990148067, + "step": 726 + }, + { + "ce_ib": 8.909895896911621, + "ce_orig": 1.0168468952178955, + "epoch": 0.10452911574406902, + "kl_loss": 0.10178525745868683, + "loss_ib": 0.009544211439788342, + "step": 727 + }, + { + "ce_ib": 9.962603569030762, + "ce_orig": 1.013784646987915, + "epoch": 0.10467289719626169, + "kl_loss": 0.0983659103512764, + "loss_ib": 0.009899596683681011, + "step": 728 + }, + { + "ce_ib": 7.675144195556641, + "ce_orig": 0.7497639060020447, + "epoch": 0.10481667864845436, + "kl_loss": 0.1465551257133484, + "loss_ib": 0.011165328323841095, + "step": 729 + }, + { + "epoch": 0.10496046010064701, + "grad_norm": 0.10833033919334412, + "learning_rate": 4.999733878685698e-05, + "loss": 0.8967, + "step": 730 + }, + { + "ce_ib": 9.022236824035645, + "ce_orig": 1.01668381690979, + "epoch": 0.10496046010064701, + "kl_loss": 0.12041808664798737, + "loss_ib": 0.010532023385167122, + "step": 730 + }, + { + "ce_ib": 8.386323928833008, + "ce_orig": 0.8513427376747131, + "epoch": 0.10510424155283968, + "kl_loss": 0.12366791814565659, + "loss_ib": 0.01037655770778656, + "step": 731 + }, + { + "ce_ib": 11.588960647583008, + "ce_orig": 1.5426373481750488, + "epoch": 0.10524802300503235, + "kl_loss": 0.13248568773269653, + "loss_ib": 0.012418764643371105, + "step": 732 + }, + { + "ce_ib": 8.538729667663574, + "ce_orig": 0.8282467126846313, + "epoch": 0.10539180445722501, + "kl_loss": 0.121379554271698, + "loss_ib": 0.010338342748582363, + "step": 733 + }, + { + "ce_ib": 8.161299705505371, + "ce_orig": 0.8536421060562134, + "epoch": 0.10553558590941768, + "kl_loss": 0.10607169568538666, + "loss_ib": 0.009384234435856342, + "step": 734 + }, + { + "epoch": 0.10567936736161035, + "grad_norm": 0.12110400199890137, + "learning_rate": 4.999704815533312e-05, + "loss": 0.8765, + "step": 735 + }, + { + "ce_ib": 8.390252113342285, + "ce_orig": 1.1886399984359741, + "epoch": 0.10567936736161035, + "kl_loss": 0.12097896635532379, + "loss_ib": 0.010244074277579784, + "step": 735 + }, + { + "ce_ib": 9.484691619873047, + "ce_orig": 1.0858343839645386, + "epoch": 0.10582314881380302, + "kl_loss": 0.11114107072353363, + "loss_ib": 0.010299399495124817, + "step": 736 + }, + { + "ce_ib": 7.962080478668213, + "ce_orig": 0.9100456237792969, + "epoch": 0.10596693026599568, + "kl_loss": 0.10763818770647049, + "loss_ib": 0.00936294998973608, + "step": 737 + }, + { + "ce_ib": 9.301447868347168, + "ce_orig": 1.0797451734542847, + "epoch": 0.10611071171818835, + "kl_loss": 0.08851733803749084, + "loss_ib": 0.009076590649783611, + "step": 738 + }, + { + "ce_ib": 5.509831428527832, + "ce_orig": 0.5435864925384521, + "epoch": 0.10625449317038102, + "kl_loss": 0.11892074346542358, + "loss_ib": 0.008700952865183353, + "step": 739 + }, + { + "epoch": 0.10639827462257369, + "grad_norm": 0.12525738775730133, + "learning_rate": 4.9996742466396395e-05, + "loss": 0.9647, + "step": 740 + }, + { + "ce_ib": 7.956106662750244, + "ce_orig": 0.5784890055656433, + "epoch": 0.10639827462257369, + "kl_loss": 0.0811728686094284, + "loss_ib": 0.008036697283387184, + "step": 740 + }, + { + "ce_ib": 4.005490779876709, + "ce_orig": 0.20808197557926178, + "epoch": 0.10654205607476636, + "kl_loss": 0.19454577565193176, + "loss_ib": 0.011730033904314041, + "step": 741 + }, + { + "ce_ib": 5.741650581359863, + "ce_orig": 0.5170265436172485, + "epoch": 0.10668583752695902, + "kl_loss": 0.15967172384262085, + "loss_ib": 0.01085441093891859, + "step": 742 + }, + { + "ce_ib": 4.918848514556885, + "ce_orig": 0.5805583596229553, + "epoch": 0.10682961897915169, + "kl_loss": 0.10687030851840973, + "loss_ib": 0.007802939508110285, + "step": 743 + }, + { + "ce_ib": 10.824296951293945, + "ce_orig": 1.5680608749389648, + "epoch": 0.10697340043134436, + "kl_loss": 0.0964423194527626, + "loss_ib": 0.01023426465690136, + "step": 744 + }, + { + "epoch": 0.10711718188353703, + "grad_norm": 0.12692318856716156, + "learning_rate": 4.9996421720230955e-05, + "loss": 0.8577, + "step": 745 + }, + { + "ce_ib": 11.27613639831543, + "ce_orig": 1.3031110763549805, + "epoch": 0.10711718188353703, + "kl_loss": 0.09824083745479584, + "loss_ib": 0.010550110600888729, + "step": 745 + }, + { + "ce_ib": 9.67626953125, + "ce_orig": 0.8898366689682007, + "epoch": 0.1072609633357297, + "kl_loss": 0.07953157275915146, + "loss_ib": 0.008814713917672634, + "step": 746 + }, + { + "ce_ib": 6.659996509552002, + "ce_orig": 0.6965008974075317, + "epoch": 0.10740474478792236, + "kl_loss": 0.15939068794250488, + "loss_ib": 0.01129953283816576, + "step": 747 + }, + { + "ce_ib": 9.864056587219238, + "ce_orig": 0.9512748122215271, + "epoch": 0.10754852624011503, + "kl_loss": 0.13909170031547546, + "loss_ib": 0.011886613443493843, + "step": 748 + }, + { + "ce_ib": 7.443923473358154, + "ce_orig": 0.8524958491325378, + "epoch": 0.1076923076923077, + "kl_loss": 0.11339938640594482, + "loss_ib": 0.009391930885612965, + "step": 749 + }, + { + "epoch": 0.10783608914450037, + "grad_norm": 0.12534423172473907, + "learning_rate": 4.999608591703001e-05, + "loss": 0.8728, + "step": 750 + }, + { + "ce_ib": 4.702718257904053, + "ce_orig": 0.37047120928764343, + "epoch": 0.10783608914450037, + "kl_loss": 0.14064157009124756, + "loss_ib": 0.009383438155055046, + "step": 750 + }, + { + "ce_ib": 6.858421802520752, + "ce_orig": 1.1404428482055664, + "epoch": 0.10797987059669302, + "kl_loss": 0.08100490272045135, + "loss_ib": 0.007479456253349781, + "step": 751 + }, + { + "ce_ib": 8.411442756652832, + "ce_orig": 0.8211296200752258, + "epoch": 0.10812365204888569, + "kl_loss": 0.11985696852207184, + "loss_ib": 0.010198569856584072, + "step": 752 + }, + { + "ce_ib": 6.272851943969727, + "ce_orig": 0.7544890642166138, + "epoch": 0.10826743350107836, + "kl_loss": 0.08774818480014801, + "loss_ib": 0.007523834705352783, + "step": 753 + }, + { + "ce_ib": 8.928715705871582, + "ce_orig": 0.9599756002426147, + "epoch": 0.10841121495327102, + "kl_loss": 0.11729443073272705, + "loss_ib": 0.010329079814255238, + "step": 754 + }, + { + "epoch": 0.10855499640546369, + "grad_norm": 0.11862904578447342, + "learning_rate": 4.9995735056995826e-05, + "loss": 0.8637, + "step": 755 + }, + { + "ce_ib": 10.275476455688477, + "ce_orig": 1.2619425058364868, + "epoch": 0.10855499640546369, + "kl_loss": 0.13122868537902832, + "loss_ib": 0.011699172668159008, + "step": 755 + }, + { + "ce_ib": 8.515475273132324, + "ce_orig": 0.8744072318077087, + "epoch": 0.10869877785765636, + "kl_loss": 0.14314191043376923, + "loss_ib": 0.01141483336687088, + "step": 756 + }, + { + "ce_ib": 8.099058151245117, + "ce_orig": 1.0449519157409668, + "epoch": 0.10884255930984903, + "kl_loss": 0.08819152414798737, + "loss_ib": 0.008459105156362057, + "step": 757 + }, + { + "ce_ib": 7.1664228439331055, + "ce_orig": 0.856492280960083, + "epoch": 0.1089863407620417, + "kl_loss": 0.07748554646968842, + "loss_ib": 0.0074574886821210384, + "step": 758 + }, + { + "ce_ib": 8.10939884185791, + "ce_orig": 0.7625494003295898, + "epoch": 0.10913012221423436, + "kl_loss": 0.12002036720514297, + "loss_ib": 0.010055718012154102, + "step": 759 + }, + { + "epoch": 0.10927390366642703, + "grad_norm": 0.12130390107631683, + "learning_rate": 4.999536914033977e-05, + "loss": 0.864, + "step": 760 + }, + { + "ce_ib": 6.7783203125, + "ce_orig": 0.6925073266029358, + "epoch": 0.10927390366642703, + "kl_loss": 0.14075952768325806, + "loss_ib": 0.010427136905491352, + "step": 760 + }, + { + "ce_ib": 6.035461902618408, + "ce_orig": 0.5443659424781799, + "epoch": 0.1094176851186197, + "kl_loss": 0.11947020888328552, + "loss_ib": 0.008991241455078125, + "step": 761 + }, + { + "ce_ib": 10.941012382507324, + "ce_orig": 1.4812273979187012, + "epoch": 0.10956146657081237, + "kl_loss": 0.09991130232810974, + "loss_ib": 0.010466071777045727, + "step": 762 + }, + { + "ce_ib": 10.077688217163086, + "ce_orig": 1.1687238216400146, + "epoch": 0.10970524802300503, + "kl_loss": 0.10050664842128754, + "loss_ib": 0.01006417628377676, + "step": 763 + }, + { + "ce_ib": 7.7856125831604, + "ce_orig": 0.8300952911376953, + "epoch": 0.1098490294751977, + "kl_loss": 0.11259394139051437, + "loss_ib": 0.009522504173219204, + "step": 764 + }, + { + "epoch": 0.10999281092739037, + "grad_norm": 0.09898856282234192, + "learning_rate": 4.999498816728223e-05, + "loss": 0.8381, + "step": 765 + }, + { + "ce_ib": 6.172881126403809, + "ce_orig": 0.695868968963623, + "epoch": 0.10999281092739037, + "kl_loss": 0.10089915245771408, + "loss_ib": 0.008131398819386959, + "step": 765 + }, + { + "ce_ib": 7.305203437805176, + "ce_orig": 0.7463586330413818, + "epoch": 0.11013659237958304, + "kl_loss": 0.09400838613510132, + "loss_ib": 0.008353020995855331, + "step": 766 + }, + { + "ce_ib": 4.747533798217773, + "ce_orig": 0.5122455954551697, + "epoch": 0.1102803738317757, + "kl_loss": 0.074123814702034, + "loss_ib": 0.0060799578204751015, + "step": 767 + }, + { + "ce_ib": 6.877668857574463, + "ce_orig": 0.5530845522880554, + "epoch": 0.11042415528396837, + "kl_loss": 0.11388804018497467, + "loss_ib": 0.009133236482739449, + "step": 768 + }, + { + "ce_ib": 11.079581260681152, + "ce_orig": 1.1505775451660156, + "epoch": 0.11056793673616104, + "kl_loss": 0.2215118706226349, + "loss_ib": 0.016615385189652443, + "step": 769 + }, + { + "epoch": 0.11071171818835371, + "grad_norm": 0.13919848203659058, + "learning_rate": 4.999459213805272e-05, + "loss": 0.8708, + "step": 770 + }, + { + "ce_ib": 8.570969581604004, + "ce_orig": 1.2589601278305054, + "epoch": 0.11071171818835371, + "kl_loss": 0.08060797303915024, + "loss_ib": 0.008315883576869965, + "step": 770 + }, + { + "ce_ib": 7.168095588684082, + "ce_orig": 0.8928415775299072, + "epoch": 0.11085549964054638, + "kl_loss": 0.10986876487731934, + "loss_ib": 0.009077486582100391, + "step": 771 + }, + { + "ce_ib": 8.855610847473145, + "ce_orig": 0.733345091342926, + "epoch": 0.11099928109273903, + "kl_loss": 0.12182736396789551, + "loss_ib": 0.010519173927605152, + "step": 772 + }, + { + "ce_ib": 3.5407509803771973, + "ce_orig": 0.3610730469226837, + "epoch": 0.1111430625449317, + "kl_loss": 0.14145609736442566, + "loss_ib": 0.00884318072348833, + "step": 773 + }, + { + "ce_ib": 8.146163940429688, + "ce_orig": 0.6188514828681946, + "epoch": 0.11128684399712437, + "kl_loss": 0.10034702718257904, + "loss_ib": 0.009090433828532696, + "step": 774 + }, + { + "epoch": 0.11143062544931703, + "grad_norm": 0.11237223446369171, + "learning_rate": 4.999418105288978e-05, + "loss": 0.8281, + "step": 775 + }, + { + "ce_ib": 6.921514987945557, + "ce_orig": 0.7439333200454712, + "epoch": 0.11143062544931703, + "kl_loss": 0.08917941153049469, + "loss_ib": 0.007919727824628353, + "step": 775 + }, + { + "ce_ib": 6.831997871398926, + "ce_orig": 0.7869361042976379, + "epoch": 0.1115744069015097, + "kl_loss": 0.10595273226499557, + "loss_ib": 0.008713635616004467, + "step": 776 + }, + { + "ce_ib": 7.639923095703125, + "ce_orig": 0.9338688254356384, + "epoch": 0.11171818835370237, + "kl_loss": 0.10657122731208801, + "loss_ib": 0.009148523211479187, + "step": 777 + }, + { + "ce_ib": 6.877979755401611, + "ce_orig": 0.7685055136680603, + "epoch": 0.11186196980589504, + "kl_loss": 0.09290573745965958, + "loss_ib": 0.00808427669107914, + "step": 778 + }, + { + "ce_ib": 10.934708595275879, + "ce_orig": 1.0183820724487305, + "epoch": 0.1120057512580877, + "kl_loss": 0.13018369674682617, + "loss_ib": 0.01197653915733099, + "step": 779 + }, + { + "epoch": 0.11214953271028037, + "grad_norm": 0.1425534039735794, + "learning_rate": 4.999375491204102e-05, + "loss": 0.9258, + "step": 780 + }, + { + "ce_ib": 8.499329566955566, + "ce_orig": 0.6566687822341919, + "epoch": 0.11214953271028037, + "kl_loss": 0.09959501028060913, + "loss_ib": 0.009229416027665138, + "step": 780 + }, + { + "ce_ib": 8.128525733947754, + "ce_orig": 0.7684027552604675, + "epoch": 0.11229331416247304, + "kl_loss": 0.0928315594792366, + "loss_ib": 0.00870584137737751, + "step": 781 + }, + { + "ce_ib": 6.759765148162842, + "ce_orig": 0.7490063309669495, + "epoch": 0.11243709561466571, + "kl_loss": 0.05619842931628227, + "loss_ib": 0.006189804058521986, + "step": 782 + }, + { + "ce_ib": 7.713232517242432, + "ce_orig": 0.9557498693466187, + "epoch": 0.11258087706685838, + "kl_loss": 0.10368062555789948, + "loss_ib": 0.009040648117661476, + "step": 783 + }, + { + "ce_ib": 7.60745906829834, + "ce_orig": 0.7716460227966309, + "epoch": 0.11272465851905104, + "kl_loss": 0.08124081790447235, + "loss_ib": 0.00786577071994543, + "step": 784 + }, + { + "epoch": 0.11286843997124371, + "grad_norm": 0.13931146264076233, + "learning_rate": 4.9993313715763166e-05, + "loss": 0.8111, + "step": 785 + }, + { + "ce_ib": 9.313846588134766, + "ce_orig": 1.1762357950210571, + "epoch": 0.11286843997124371, + "kl_loss": 0.12043385207653046, + "loss_ib": 0.010678616352379322, + "step": 785 + }, + { + "ce_ib": 9.520804405212402, + "ce_orig": 1.0862674713134766, + "epoch": 0.11301222142343638, + "kl_loss": 0.13194264471530914, + "loss_ib": 0.011357533745467663, + "step": 786 + }, + { + "ce_ib": 5.819194793701172, + "ce_orig": 0.5117652416229248, + "epoch": 0.11315600287562905, + "kl_loss": 0.08215292543172836, + "loss_ib": 0.007017243653535843, + "step": 787 + }, + { + "ce_ib": 8.07127857208252, + "ce_orig": 1.1504753828048706, + "epoch": 0.11329978432782171, + "kl_loss": 0.09762965887784958, + "loss_ib": 0.008917122147977352, + "step": 788 + }, + { + "ce_ib": 7.960126876831055, + "ce_orig": 0.7059429883956909, + "epoch": 0.11344356578001438, + "kl_loss": 0.14231815934181213, + "loss_ib": 0.011095970869064331, + "step": 789 + }, + { + "epoch": 0.11358734723220705, + "grad_norm": 0.12224866449832916, + "learning_rate": 4.9992857464321963e-05, + "loss": 0.862, + "step": 790 + }, + { + "ce_ib": 7.631105899810791, + "ce_orig": 0.8110905289649963, + "epoch": 0.11358734723220705, + "kl_loss": 0.08087232708930969, + "loss_ib": 0.007859169505536556, + "step": 790 + }, + { + "ce_ib": 7.410162448883057, + "ce_orig": 0.6299762725830078, + "epoch": 0.11373112868439972, + "kl_loss": 0.12838461995124817, + "loss_ib": 0.010124312713742256, + "step": 791 + }, + { + "ce_ib": 8.221912384033203, + "ce_orig": 0.9979586005210876, + "epoch": 0.11387491013659239, + "kl_loss": 0.1258949488401413, + "loss_ib": 0.010405703447759151, + "step": 792 + }, + { + "ce_ib": 6.832244873046875, + "ce_orig": 0.7152706384658813, + "epoch": 0.11401869158878504, + "kl_loss": 0.08785121142864227, + "loss_ib": 0.0078086829744279385, + "step": 793 + }, + { + "ce_ib": 7.743945598602295, + "ce_orig": 0.9861294627189636, + "epoch": 0.11416247304097771, + "kl_loss": 0.09959419816732407, + "loss_ib": 0.00885168369859457, + "step": 794 + }, + { + "epoch": 0.11430625449317038, + "grad_norm": 0.12471532076597214, + "learning_rate": 4.9992386157992246e-05, + "loss": 0.8318, + "step": 795 + }, + { + "ce_ib": 7.021468162536621, + "ce_orig": 0.7713713049888611, + "epoch": 0.11430625449317038, + "kl_loss": 0.07618668675422668, + "loss_ib": 0.007320068776607513, + "step": 795 + }, + { + "ce_ib": 7.368815898895264, + "ce_orig": 1.1165920495986938, + "epoch": 0.11445003594536304, + "kl_loss": 0.07658970355987549, + "loss_ib": 0.007513892836868763, + "step": 796 + }, + { + "ce_ib": 6.703334808349609, + "ce_orig": 0.5481064915657043, + "epoch": 0.11459381739755571, + "kl_loss": 0.09191010147333145, + "loss_ib": 0.007947172038257122, + "step": 797 + }, + { + "ce_ib": 9.726116180419922, + "ce_orig": 1.2227307558059692, + "epoch": 0.11473759884974838, + "kl_loss": 0.09769535809755325, + "loss_ib": 0.009747825562953949, + "step": 798 + }, + { + "ce_ib": 8.778748512268066, + "ce_orig": 0.9550623893737793, + "epoch": 0.11488138030194105, + "kl_loss": 0.08937544375658035, + "loss_ib": 0.008858147077262402, + "step": 799 + }, + { + "epoch": 0.11502516175413371, + "grad_norm": 0.12755703926086426, + "learning_rate": 4.9991899797057904e-05, + "loss": 0.9259, + "step": 800 + }, + { + "ce_ib": 10.388070106506348, + "ce_orig": 1.069626808166504, + "epoch": 0.11502516175413371, + "kl_loss": 0.12364666163921356, + "loss_ib": 0.011376367881894112, + "step": 800 + }, + { + "ce_ib": 6.369958400726318, + "ce_orig": 0.8895975947380066, + "epoch": 0.11516894320632638, + "kl_loss": 0.06259946525096893, + "loss_ib": 0.0063149528577923775, + "step": 801 + }, + { + "ce_ib": 9.586531639099121, + "ce_orig": 1.2012348175048828, + "epoch": 0.11531272465851905, + "kl_loss": 0.09361310303211212, + "loss_ib": 0.009473921731114388, + "step": 802 + }, + { + "ce_ib": 7.590814113616943, + "ce_orig": 1.0715209245681763, + "epoch": 0.11545650611071172, + "kl_loss": 0.10730178654193878, + "loss_ib": 0.009160496294498444, + "step": 803 + }, + { + "ce_ib": 8.784406661987305, + "ce_orig": 1.1157152652740479, + "epoch": 0.11560028756290439, + "kl_loss": 0.10003470629453659, + "loss_ib": 0.009393938817083836, + "step": 804 + }, + { + "epoch": 0.11574406901509705, + "grad_norm": 0.13564252853393555, + "learning_rate": 4.9991398381811924e-05, + "loss": 0.892, + "step": 805 + }, + { + "ce_ib": 6.886279106140137, + "ce_orig": 0.6827164888381958, + "epoch": 0.11574406901509705, + "kl_loss": 0.08341995626688004, + "loss_ib": 0.007614137139171362, + "step": 805 + }, + { + "ce_ib": 7.802131175994873, + "ce_orig": 1.107555627822876, + "epoch": 0.11588785046728972, + "kl_loss": 0.08331786096096039, + "loss_ib": 0.008066958747804165, + "step": 806 + }, + { + "ce_ib": 7.465454578399658, + "ce_orig": 0.8235023617744446, + "epoch": 0.11603163191948239, + "kl_loss": 0.0998779833316803, + "loss_ib": 0.008726626634597778, + "step": 807 + }, + { + "ce_ib": 5.956389427185059, + "ce_orig": 0.83247971534729, + "epoch": 0.11617541337167506, + "kl_loss": 0.07732568681240082, + "loss_ib": 0.006844479124993086, + "step": 808 + }, + { + "ce_ib": 5.556079864501953, + "ce_orig": 0.6450206637382507, + "epoch": 0.11631919482386772, + "kl_loss": 0.08965234458446503, + "loss_ib": 0.0072606573812663555, + "step": 809 + }, + { + "epoch": 0.11646297627606039, + "grad_norm": 0.11268489807844162, + "learning_rate": 4.999088191255632e-05, + "loss": 0.8579, + "step": 810 + }, + { + "ce_ib": 8.282671928405762, + "ce_orig": 0.7875933051109314, + "epoch": 0.11646297627606039, + "kl_loss": 0.11104097217321396, + "loss_ib": 0.009693384170532227, + "step": 810 + }, + { + "ce_ib": 7.059372901916504, + "ce_orig": 0.8186449408531189, + "epoch": 0.11660675772825306, + "kl_loss": 0.1103602796792984, + "loss_ib": 0.00904770102351904, + "step": 811 + }, + { + "ce_ib": 8.647476196289062, + "ce_orig": 1.0445913076400757, + "epoch": 0.11675053918044573, + "kl_loss": 0.10142830014228821, + "loss_ib": 0.009395153261721134, + "step": 812 + }, + { + "ce_ib": 7.812716007232666, + "ce_orig": 0.9563208222389221, + "epoch": 0.1168943206326384, + "kl_loss": 0.07602293789386749, + "loss_ib": 0.0077075050212442875, + "step": 813 + }, + { + "ce_ib": 7.100307941436768, + "ce_orig": 0.6869685649871826, + "epoch": 0.11703810208483105, + "kl_loss": 0.13484741747379303, + "loss_ib": 0.01029252540320158, + "step": 814 + }, + { + "epoch": 0.11718188353702372, + "grad_norm": 0.145658478140831, + "learning_rate": 4.9990350389602214e-05, + "loss": 0.8295, + "step": 815 + }, + { + "ce_ib": 7.6031060218811035, + "ce_orig": 0.7811675667762756, + "epoch": 0.11718188353702372, + "kl_loss": 0.14399005472660065, + "loss_ib": 0.01100105606019497, + "step": 815 + }, + { + "ce_ib": 7.256862640380859, + "ce_orig": 0.7083752751350403, + "epoch": 0.11732566498921639, + "kl_loss": 0.14461134374141693, + "loss_ib": 0.010858998633921146, + "step": 816 + }, + { + "ce_ib": 6.885143280029297, + "ce_orig": 0.8050568699836731, + "epoch": 0.11746944644140905, + "kl_loss": 0.11550889909267426, + "loss_ib": 0.009218016639351845, + "step": 817 + }, + { + "ce_ib": 6.232802391052246, + "ce_orig": 0.5001097321510315, + "epoch": 0.11761322789360172, + "kl_loss": 0.1038711816072464, + "loss_ib": 0.00830996036529541, + "step": 818 + }, + { + "ce_ib": 5.1833367347717285, + "ce_orig": 0.37858033180236816, + "epoch": 0.11775700934579439, + "kl_loss": 0.11628204584121704, + "loss_ib": 0.008405770175158978, + "step": 819 + }, + { + "epoch": 0.11790079079798706, + "grad_norm": 0.11669674515724182, + "learning_rate": 4.9989803813269775e-05, + "loss": 0.7666, + "step": 820 + }, + { + "ce_ib": 7.362185478210449, + "ce_orig": 0.9856128692626953, + "epoch": 0.11790079079798706, + "kl_loss": 0.09990614652633667, + "loss_ib": 0.008676400408148766, + "step": 820 + }, + { + "ce_ib": 6.022683620452881, + "ce_orig": 0.6719481348991394, + "epoch": 0.11804457225017972, + "kl_loss": 0.09421147406101227, + "loss_ib": 0.0077219158411026, + "step": 821 + }, + { + "ce_ib": 7.54342794418335, + "ce_orig": 0.9077051877975464, + "epoch": 0.11818835370237239, + "kl_loss": 0.09480626881122589, + "loss_ib": 0.008512027561664581, + "step": 822 + }, + { + "ce_ib": 7.5844597816467285, + "ce_orig": 0.8468344807624817, + "epoch": 0.11833213515456506, + "kl_loss": 0.09350645542144775, + "loss_ib": 0.008467552252113819, + "step": 823 + }, + { + "ce_ib": 9.546996116638184, + "ce_orig": 0.9645229578018188, + "epoch": 0.11847591660675773, + "kl_loss": 0.10462982207536697, + "loss_ib": 0.010004988871514797, + "step": 824 + }, + { + "epoch": 0.1186196980589504, + "grad_norm": 0.1411919742822647, + "learning_rate": 4.998924218388824e-05, + "loss": 0.8377, + "step": 825 + }, + { + "ce_ib": 6.855606555938721, + "ce_orig": 0.6925601363182068, + "epoch": 0.1186196980589504, + "kl_loss": 0.11507317423820496, + "loss_ib": 0.009181462228298187, + "step": 825 + }, + { + "ce_ib": 7.184199333190918, + "ce_orig": 0.7112440466880798, + "epoch": 0.11876347951114306, + "kl_loss": 0.11154375225305557, + "loss_ib": 0.009169287048280239, + "step": 826 + }, + { + "ce_ib": 6.221945762634277, + "ce_orig": 0.7550656199455261, + "epoch": 0.11890726096333573, + "kl_loss": 0.08264736086130142, + "loss_ib": 0.00724334130063653, + "step": 827 + }, + { + "ce_ib": 6.1294331550598145, + "ce_orig": 0.8655829429626465, + "epoch": 0.1190510424155284, + "kl_loss": 0.2529202103614807, + "loss_ib": 0.015710726380348206, + "step": 828 + }, + { + "ce_ib": 7.718718528747559, + "ce_orig": 0.9449477195739746, + "epoch": 0.11919482386772107, + "kl_loss": 0.1413031816482544, + "loss_ib": 0.01092451810836792, + "step": 829 + }, + { + "epoch": 0.11933860531991373, + "grad_norm": 0.13230349123477936, + "learning_rate": 4.998866550179591e-05, + "loss": 0.8497, + "step": 830 + }, + { + "ce_ib": 8.910257339477539, + "ce_orig": 1.0684385299682617, + "epoch": 0.11933860531991373, + "kl_loss": 0.08060315996408463, + "loss_ib": 0.008485286496579647, + "step": 830 + }, + { + "ce_ib": 6.161970138549805, + "ce_orig": 0.7749412059783936, + "epoch": 0.1194823867721064, + "kl_loss": 0.09046103060245514, + "loss_ib": 0.00760403648018837, + "step": 831 + }, + { + "ce_ib": 8.188762664794922, + "ce_orig": 1.1623848676681519, + "epoch": 0.11962616822429907, + "kl_loss": 0.1053881123661995, + "loss_ib": 0.00936378724873066, + "step": 832 + }, + { + "ce_ib": 8.579145431518555, + "ce_orig": 1.0423914194107056, + "epoch": 0.11976994967649174, + "kl_loss": 0.0925225242972374, + "loss_ib": 0.00891569908708334, + "step": 833 + }, + { + "ce_ib": 6.8061676025390625, + "ce_orig": 0.7699450850486755, + "epoch": 0.1199137311286844, + "kl_loss": 0.1753084808588028, + "loss_ib": 0.01216850709170103, + "step": 834 + }, + { + "epoch": 0.12005751258087707, + "grad_norm": 0.17299415171146393, + "learning_rate": 4.9988073767340174e-05, + "loss": 0.8972, + "step": 835 + }, + { + "ce_ib": 11.318516731262207, + "ce_orig": 1.7190030813217163, + "epoch": 0.12005751258087707, + "kl_loss": 0.0838870257139206, + "loss_ib": 0.009853609837591648, + "step": 835 + }, + { + "ce_ib": 6.024441242218018, + "ce_orig": 0.5168812274932861, + "epoch": 0.12020129403306973, + "kl_loss": 0.08710253238677979, + "loss_ib": 0.007367347367107868, + "step": 836 + }, + { + "ce_ib": 9.387094497680664, + "ce_orig": 0.5673519968986511, + "epoch": 0.1203450754852624, + "kl_loss": 0.08531000465154648, + "loss_ib": 0.008959047496318817, + "step": 837 + }, + { + "ce_ib": 3.3088152408599854, + "ce_orig": 0.40009650588035583, + "epoch": 0.12048885693745506, + "kl_loss": 0.14413371682167053, + "loss_ib": 0.008861093781888485, + "step": 838 + }, + { + "ce_ib": 5.766994953155518, + "ce_orig": 0.5750284790992737, + "epoch": 0.12063263838964773, + "kl_loss": 0.0825929120182991, + "loss_ib": 0.007013143040239811, + "step": 839 + }, + { + "epoch": 0.1207764198418404, + "grad_norm": 0.15587636828422546, + "learning_rate": 4.998746698087745e-05, + "loss": 0.9364, + "step": 840 + }, + { + "ce_ib": 8.854097366333008, + "ce_orig": 1.37924063205719, + "epoch": 0.1207764198418404, + "kl_loss": 0.12135547399520874, + "loss_ib": 0.010494822636246681, + "step": 840 + }, + { + "ce_ib": 5.431858062744141, + "ce_orig": 0.7637268304824829, + "epoch": 0.12092020129403307, + "kl_loss": 0.07992113381624222, + "loss_ib": 0.006711985915899277, + "step": 841 + }, + { + "ce_ib": 8.615471839904785, + "ce_orig": 1.1368523836135864, + "epoch": 0.12106398274622573, + "kl_loss": 0.10841356217861176, + "loss_ib": 0.009728414006531239, + "step": 842 + }, + { + "ce_ib": 6.431497573852539, + "ce_orig": 0.6517418622970581, + "epoch": 0.1212077641984184, + "kl_loss": 0.12224148958921432, + "loss_ib": 0.009327823296189308, + "step": 843 + }, + { + "ce_ib": 6.216892719268799, + "ce_orig": 0.9238556623458862, + "epoch": 0.12135154565061107, + "kl_loss": 0.07222751528024673, + "loss_ib": 0.006719821598380804, + "step": 844 + }, + { + "epoch": 0.12149532710280374, + "grad_norm": 0.11728943139314651, + "learning_rate": 4.9986845142773275e-05, + "loss": 0.97, + "step": 845 + }, + { + "ce_ib": 7.411965847015381, + "ce_orig": 0.9339240193367004, + "epoch": 0.12149532710280374, + "kl_loss": 0.1781131625175476, + "loss_ib": 0.01261164154857397, + "step": 845 + }, + { + "ce_ib": 6.76970100402832, + "ce_orig": 0.8812845945358276, + "epoch": 0.1216391085549964, + "kl_loss": 0.1000911146402359, + "loss_ib": 0.008389405906200409, + "step": 846 + }, + { + "ce_ib": 4.417732238769531, + "ce_orig": 0.5549830198287964, + "epoch": 0.12178289000718907, + "kl_loss": 0.06278645992279053, + "loss_ib": 0.005348189268261194, + "step": 847 + }, + { + "ce_ib": 7.800521373748779, + "ce_orig": 0.6102645993232727, + "epoch": 0.12192667145938174, + "kl_loss": 0.12678933143615723, + "loss_ib": 0.01023972686380148, + "step": 848 + }, + { + "ce_ib": 9.005143165588379, + "ce_orig": 0.9449028968811035, + "epoch": 0.12207045291157441, + "kl_loss": 0.10566647350788116, + "loss_ib": 0.009785895235836506, + "step": 849 + }, + { + "epoch": 0.12221423436376708, + "grad_norm": 0.12012098729610443, + "learning_rate": 4.998620825340221e-05, + "loss": 0.7973, + "step": 850 + }, + { + "ce_ib": 5.912694454193115, + "ce_orig": 0.6100848317146301, + "epoch": 0.12221423436376708, + "kl_loss": 0.12130458652973175, + "loss_ib": 0.0090215764939785, + "step": 850 + }, + { + "ce_ib": 6.602824687957764, + "ce_orig": 0.6862348318099976, + "epoch": 0.12235801581595974, + "kl_loss": 0.10095177590847015, + "loss_ib": 0.008349001407623291, + "step": 851 + }, + { + "ce_ib": 8.233159065246582, + "ce_orig": 0.880978524684906, + "epoch": 0.12250179726815241, + "kl_loss": 0.074901282787323, + "loss_ib": 0.00786164402961731, + "step": 852 + }, + { + "ce_ib": 7.939795017242432, + "ce_orig": 0.9793850779533386, + "epoch": 0.12264557872034508, + "kl_loss": 0.10613537579774857, + "loss_ib": 0.00927666574716568, + "step": 853 + }, + { + "ce_ib": 6.5190935134887695, + "ce_orig": 0.9610397815704346, + "epoch": 0.12278936017253775, + "kl_loss": 0.16304026544094086, + "loss_ib": 0.011411559768021107, + "step": 854 + }, + { + "epoch": 0.12293314162473042, + "grad_norm": 0.11487976461648941, + "learning_rate": 4.9985556313147895e-05, + "loss": 0.9548, + "step": 855 + }, + { + "ce_ib": 7.369608402252197, + "ce_orig": 0.6912807822227478, + "epoch": 0.12293314162473042, + "kl_loss": 0.12601375579833984, + "loss_ib": 0.009985491633415222, + "step": 855 + }, + { + "ce_ib": 8.943307876586914, + "ce_orig": 1.071799397468567, + "epoch": 0.12307692307692308, + "kl_loss": 0.1637842357158661, + "loss_ib": 0.012660865671932697, + "step": 856 + }, + { + "ce_ib": 7.475729465484619, + "ce_orig": 0.9372177124023438, + "epoch": 0.12322070452911574, + "kl_loss": 0.08353350311517715, + "loss_ib": 0.007914540357887745, + "step": 857 + }, + { + "ce_ib": 4.192291736602783, + "ce_orig": 0.4509224593639374, + "epoch": 0.1233644859813084, + "kl_loss": 0.08783835917711258, + "loss_ib": 0.0064880638383328915, + "step": 858 + }, + { + "ce_ib": 6.574756145477295, + "ce_orig": 0.7229125499725342, + "epoch": 0.12350826743350107, + "kl_loss": 0.12410986423492432, + "loss_ib": 0.009492871351540089, + "step": 859 + }, + { + "epoch": 0.12365204888569374, + "grad_norm": 0.10187579691410065, + "learning_rate": 4.998488932240305e-05, + "loss": 0.896, + "step": 860 + }, + { + "ce_ib": 6.708831310272217, + "ce_orig": 0.624555766582489, + "epoch": 0.12365204888569374, + "kl_loss": 0.10983145236968994, + "loss_ib": 0.008845987729728222, + "step": 860 + }, + { + "ce_ib": 11.562445640563965, + "ce_orig": 1.5727458000183105, + "epoch": 0.12379583033788641, + "kl_loss": 0.11283396929502487, + "loss_ib": 0.011422920972108841, + "step": 861 + }, + { + "ce_ib": 6.905985355377197, + "ce_orig": 0.5196636319160461, + "epoch": 0.12393961179007908, + "kl_loss": 0.08311201632022858, + "loss_ib": 0.007608593441545963, + "step": 862 + }, + { + "ce_ib": 6.673101425170898, + "ce_orig": 0.7954714298248291, + "epoch": 0.12408339324227174, + "kl_loss": 0.12156697362661362, + "loss_ib": 0.009414899162948132, + "step": 863 + }, + { + "ce_ib": 9.510821342468262, + "ce_orig": 1.0977836847305298, + "epoch": 0.12422717469446441, + "kl_loss": 0.11292783170938492, + "loss_ib": 0.010401802137494087, + "step": 864 + }, + { + "epoch": 0.12437095614665708, + "grad_norm": 0.10984697192907333, + "learning_rate": 4.9984207281569426e-05, + "loss": 0.8947, + "step": 865 + }, + { + "ce_ib": 6.371394634246826, + "ce_orig": 0.8775607347488403, + "epoch": 0.12437095614665708, + "kl_loss": 0.10108557343482971, + "loss_ib": 0.00823997613042593, + "step": 865 + }, + { + "ce_ib": 8.300228118896484, + "ce_orig": 0.8505659103393555, + "epoch": 0.12451473759884975, + "kl_loss": 0.13992467522621155, + "loss_ib": 0.011146347038447857, + "step": 866 + }, + { + "ce_ib": 7.113746166229248, + "ce_orig": 0.6279032826423645, + "epoch": 0.12465851905104242, + "kl_loss": 0.07766470313072205, + "loss_ib": 0.007440108340233564, + "step": 867 + }, + { + "ce_ib": 9.46139907836914, + "ce_orig": 0.982869029045105, + "epoch": 0.12480230050323508, + "kl_loss": 0.13695910573005676, + "loss_ib": 0.011578655801713467, + "step": 868 + }, + { + "ce_ib": 6.4365973472595215, + "ce_orig": 0.7115389108657837, + "epoch": 0.12494608195542775, + "kl_loss": 0.08151569217443466, + "loss_ib": 0.00729408347979188, + "step": 869 + }, + { + "epoch": 0.1250898634076204, + "grad_norm": 0.14856663346290588, + "learning_rate": 4.998351019105789e-05, + "loss": 0.8044, + "step": 870 + }, + { + "ce_ib": 7.635079383850098, + "ce_orig": 0.9380092620849609, + "epoch": 0.1250898634076204, + "kl_loss": 0.08349855244159698, + "loss_ib": 0.00799246784299612, + "step": 870 + }, + { + "ce_ib": 6.534551620483398, + "ce_orig": 0.5008100867271423, + "epoch": 0.1252336448598131, + "kl_loss": 0.14466875791549683, + "loss_ib": 0.010500714182853699, + "step": 871 + }, + { + "ce_ib": 7.542534828186035, + "ce_orig": 1.1307094097137451, + "epoch": 0.12537742631200574, + "kl_loss": 0.10606865584850311, + "loss_ib": 0.009074700064957142, + "step": 872 + }, + { + "ce_ib": 5.353913307189941, + "ce_orig": 0.5838197469711304, + "epoch": 0.12552120776419842, + "kl_loss": 0.21842791140079498, + "loss_ib": 0.013598352670669556, + "step": 873 + }, + { + "ce_ib": 6.852460861206055, + "ce_orig": 0.898078978061676, + "epoch": 0.12566498921639108, + "kl_loss": 0.19777730107307434, + "loss_ib": 0.013315095566213131, + "step": 874 + }, + { + "epoch": 0.12580877066858376, + "grad_norm": 0.16305744647979736, + "learning_rate": 4.9982798051288326e-05, + "loss": 0.8503, + "step": 875 + }, + { + "ce_ib": 5.873923301696777, + "ce_orig": 0.4752177894115448, + "epoch": 0.12580877066858376, + "kl_loss": 0.06173687055706978, + "loss_ib": 0.006023805122822523, + "step": 875 + }, + { + "ce_ib": 6.247898101806641, + "ce_orig": 0.8174537420272827, + "epoch": 0.1259525521207764, + "kl_loss": 0.0691542774438858, + "loss_ib": 0.006581662688404322, + "step": 876 + }, + { + "ce_ib": 3.595930814743042, + "ce_orig": 0.4554266929626465, + "epoch": 0.1260963335729691, + "kl_loss": 0.1204795241355896, + "loss_ib": 0.007821941748261452, + "step": 877 + }, + { + "ce_ib": 5.832032203674316, + "ce_orig": 0.7023007273674011, + "epoch": 0.12624011502516175, + "kl_loss": 0.12059217691421509, + "loss_ib": 0.008945624344050884, + "step": 878 + }, + { + "ce_ib": 5.493231773376465, + "ce_orig": 0.74711012840271, + "epoch": 0.12638389647735443, + "kl_loss": 0.13393926620483398, + "loss_ib": 0.009443579241633415, + "step": 879 + }, + { + "epoch": 0.12652767792954708, + "grad_norm": 0.11151118576526642, + "learning_rate": 4.998207086268971e-05, + "loss": 0.7946, + "step": 880 + }, + { + "ce_ib": 7.743311882019043, + "ce_orig": 1.0406618118286133, + "epoch": 0.12652767792954708, + "kl_loss": 0.07510361075401306, + "loss_ib": 0.007626836188137531, + "step": 880 + }, + { + "ce_ib": 6.917181015014648, + "ce_orig": 0.9735674262046814, + "epoch": 0.12667145938173976, + "kl_loss": 0.0983152836561203, + "loss_ib": 0.008374354802072048, + "step": 881 + }, + { + "ce_ib": 8.613320350646973, + "ce_orig": 1.0136125087738037, + "epoch": 0.12681524083393242, + "kl_loss": 0.12226949632167816, + "loss_ib": 0.010420135222375393, + "step": 882 + }, + { + "ce_ib": 10.181398391723633, + "ce_orig": 1.487206220626831, + "epoch": 0.1269590222861251, + "kl_loss": 0.08668573200702667, + "loss_ib": 0.009424986317753792, + "step": 883 + }, + { + "ce_ib": 6.982693672180176, + "ce_orig": 0.819129228591919, + "epoch": 0.12710280373831775, + "kl_loss": 0.11231732368469238, + "loss_ib": 0.009107212536036968, + "step": 884 + }, + { + "epoch": 0.12724658519051044, + "grad_norm": 0.12548822164535522, + "learning_rate": 4.998132862570007e-05, + "loss": 0.9055, + "step": 885 + }, + { + "ce_ib": 10.360311508178711, + "ce_orig": 1.447431206703186, + "epoch": 0.12724658519051044, + "kl_loss": 0.10747027397155762, + "loss_ib": 0.010553669184446335, + "step": 885 + }, + { + "ce_ib": 7.586730003356934, + "ce_orig": 0.9365792870521545, + "epoch": 0.1273903666427031, + "kl_loss": 0.10849650204181671, + "loss_ib": 0.009218189865350723, + "step": 886 + }, + { + "ce_ib": 8.302633285522461, + "ce_orig": 0.9514833688735962, + "epoch": 0.12753414809489577, + "kl_loss": 0.07761082053184509, + "loss_ib": 0.008031858131289482, + "step": 887 + }, + { + "ce_ib": 6.314914703369141, + "ce_orig": 0.6305254101753235, + "epoch": 0.12767792954708843, + "kl_loss": 0.11676283180713654, + "loss_ib": 0.00899559911340475, + "step": 888 + }, + { + "ce_ib": 8.043998718261719, + "ce_orig": 0.9012079238891602, + "epoch": 0.12782171099928108, + "kl_loss": 0.07340681552886963, + "loss_ib": 0.007692340295761824, + "step": 889 + }, + { + "epoch": 0.12796549245147376, + "grad_norm": 0.10787968337535858, + "learning_rate": 4.9980571340766526e-05, + "loss": 0.9241, + "step": 890 + }, + { + "ce_ib": 7.90130090713501, + "ce_orig": 1.0600311756134033, + "epoch": 0.12796549245147376, + "kl_loss": 0.12333562225103378, + "loss_ib": 0.010117431171238422, + "step": 890 + }, + { + "ce_ib": 6.691804885864258, + "ce_orig": 0.9607767462730408, + "epoch": 0.12810927390366642, + "kl_loss": 0.07735402882099152, + "loss_ib": 0.00721360370516777, + "step": 891 + }, + { + "ce_ib": 7.60068416595459, + "ce_orig": 0.7663901448249817, + "epoch": 0.1282530553558591, + "kl_loss": 0.08263403922319412, + "loss_ib": 0.007932043634355068, + "step": 892 + }, + { + "ce_ib": 7.247678756713867, + "ce_orig": 1.0724464654922485, + "epoch": 0.12839683680805175, + "kl_loss": 0.11332201212644577, + "loss_ib": 0.009289939887821674, + "step": 893 + }, + { + "ce_ib": 5.542264938354492, + "ce_orig": 0.664827823638916, + "epoch": 0.12854061826024443, + "kl_loss": 0.08512848615646362, + "loss_ib": 0.007027556654065847, + "step": 894 + }, + { + "epoch": 0.1286843997124371, + "grad_norm": 0.12153299897909164, + "learning_rate": 4.9979799008345215e-05, + "loss": 0.9543, + "step": 895 + }, + { + "ce_ib": 5.504077911376953, + "ce_orig": 0.6800283193588257, + "epoch": 0.1286843997124371, + "kl_loss": 0.07652309536933899, + "loss_ib": 0.006578193511813879, + "step": 895 + }, + { + "ce_ib": 4.984541416168213, + "ce_orig": 0.6408447623252869, + "epoch": 0.12882818116462977, + "kl_loss": 0.0679374560713768, + "loss_ib": 0.005889143329113722, + "step": 896 + }, + { + "ce_ib": 7.2878217697143555, + "ce_orig": 0.8095806837081909, + "epoch": 0.12897196261682242, + "kl_loss": 0.08664879202842712, + "loss_ib": 0.007976350374519825, + "step": 897 + }, + { + "ce_ib": 4.948668003082275, + "ce_orig": 0.4147416055202484, + "epoch": 0.1291157440690151, + "kl_loss": 0.17131486535072327, + "loss_ib": 0.011040077544748783, + "step": 898 + }, + { + "ce_ib": 6.665126323699951, + "ce_orig": 0.6478716731071472, + "epoch": 0.12925952552120776, + "kl_loss": 0.07731156051158905, + "loss_ib": 0.007198141422122717, + "step": 899 + }, + { + "epoch": 0.12940330697340044, + "grad_norm": 0.12399930506944656, + "learning_rate": 4.997901162890139e-05, + "loss": 0.762, + "step": 900 + }, + { + "ce_ib": 7.272383689880371, + "ce_orig": 0.6885168552398682, + "epoch": 0.12940330697340044, + "kl_loss": 0.08325660228729248, + "loss_ib": 0.007799021899700165, + "step": 900 + }, + { + "ce_ib": 5.927865028381348, + "ce_orig": 0.7656009793281555, + "epoch": 0.1295470884255931, + "kl_loss": 0.07506805658340454, + "loss_ib": 0.006717335432767868, + "step": 901 + }, + { + "ce_ib": 5.351132392883301, + "ce_orig": 0.6215174794197083, + "epoch": 0.12969086987778577, + "kl_loss": 0.08371435105800629, + "loss_ib": 0.0068612839095294476, + "step": 902 + }, + { + "ce_ib": 7.883289337158203, + "ce_orig": 0.797773540019989, + "epoch": 0.12983465132997843, + "kl_loss": 0.10347917675971985, + "loss_ib": 0.009115603752434254, + "step": 903 + }, + { + "ce_ib": 5.427731513977051, + "ce_orig": 0.5187583565711975, + "epoch": 0.1299784327821711, + "kl_loss": 0.09261107444763184, + "loss_ib": 0.007344419602304697, + "step": 904 + }, + { + "epoch": 0.13012221423436376, + "grad_norm": 0.1253208965063095, + "learning_rate": 4.997820920290933e-05, + "loss": 0.8364, + "step": 905 + }, + { + "ce_ib": 4.651042938232422, + "ce_orig": 0.5111949443817139, + "epoch": 0.13012221423436376, + "kl_loss": 0.06503793597221375, + "loss_ib": 0.005577418487519026, + "step": 905 + }, + { + "ce_ib": 6.314610481262207, + "ce_orig": 0.7297623753547668, + "epoch": 0.13026599568655645, + "kl_loss": 0.058927081525325775, + "loss_ib": 0.006103659514337778, + "step": 906 + }, + { + "ce_ib": 6.989099502563477, + "ce_orig": 0.9666232466697693, + "epoch": 0.1304097771387491, + "kl_loss": 0.05747806280851364, + "loss_ib": 0.006368452217429876, + "step": 907 + }, + { + "ce_ib": 6.891256332397461, + "ce_orig": 0.6751134395599365, + "epoch": 0.13055355859094178, + "kl_loss": 0.09852255135774612, + "loss_ib": 0.008371755480766296, + "step": 908 + }, + { + "ce_ib": 6.981194496154785, + "ce_orig": 0.9139746427536011, + "epoch": 0.13069734004313444, + "kl_loss": 0.08173255622386932, + "loss_ib": 0.007577225100249052, + "step": 909 + }, + { + "epoch": 0.1308411214953271, + "grad_norm": 0.12957040965557098, + "learning_rate": 4.9977391730852386e-05, + "loss": 0.8999, + "step": 910 + }, + { + "ce_ib": 5.080508232116699, + "ce_orig": 0.7197664380073547, + "epoch": 0.1308411214953271, + "kl_loss": 0.06574570387601852, + "loss_ib": 0.005827539600431919, + "step": 910 + }, + { + "ce_ib": 5.3453497886657715, + "ce_orig": 0.5241774320602417, + "epoch": 0.13098490294751977, + "kl_loss": 0.08800401538610458, + "loss_ib": 0.007072875741869211, + "step": 911 + }, + { + "ce_ib": 6.245478630065918, + "ce_orig": 0.6996411085128784, + "epoch": 0.13112868439971243, + "kl_loss": 0.09634403884410858, + "loss_ib": 0.007939941249787807, + "step": 912 + }, + { + "ce_ib": 7.945882797241211, + "ce_orig": 1.143088698387146, + "epoch": 0.1312724658519051, + "kl_loss": 0.08361925929784775, + "loss_ib": 0.008153904229402542, + "step": 913 + }, + { + "ce_ib": 8.524740219116211, + "ce_orig": 1.103076696395874, + "epoch": 0.13141624730409776, + "kl_loss": 0.07524412125349045, + "loss_ib": 0.008024576120078564, + "step": 914 + }, + { + "epoch": 0.13156002875629044, + "grad_norm": 0.11736583709716797, + "learning_rate": 4.997655921322299e-05, + "loss": 0.7881, + "step": 915 + }, + { + "ce_ib": 7.337964057922363, + "ce_orig": 0.7371410131454468, + "epoch": 0.13156002875629044, + "kl_loss": 0.08646431565284729, + "loss_ib": 0.007992197759449482, + "step": 915 + }, + { + "ce_ib": 8.713349342346191, + "ce_orig": 0.9880151152610779, + "epoch": 0.1317038102084831, + "kl_loss": 0.07716777920722961, + "loss_ib": 0.00821506418287754, + "step": 916 + }, + { + "ce_ib": 6.380884647369385, + "ce_orig": 0.6128709316253662, + "epoch": 0.13184759166067578, + "kl_loss": 0.09613852202892303, + "loss_ib": 0.007997368462383747, + "step": 917 + }, + { + "ce_ib": 5.234755992889404, + "ce_orig": 0.6296612024307251, + "epoch": 0.13199137311286843, + "kl_loss": 0.10441483557224274, + "loss_ib": 0.007838119752705097, + "step": 918 + }, + { + "ce_ib": 5.749408721923828, + "ce_orig": 0.6389755606651306, + "epoch": 0.1321351545650611, + "kl_loss": 0.09272737056016922, + "loss_ib": 0.007511072792112827, + "step": 919 + }, + { + "epoch": 0.13227893601725377, + "grad_norm": 0.10284168273210526, + "learning_rate": 4.997571165052262e-05, + "loss": 0.8021, + "step": 920 + }, + { + "ce_ib": 7.687596797943115, + "ce_orig": 0.9395468235015869, + "epoch": 0.13227893601725377, + "kl_loss": 0.10860046744346619, + "loss_ib": 0.009273822419345379, + "step": 920 + }, + { + "ce_ib": 5.143918991088867, + "ce_orig": 0.6956912279129028, + "epoch": 0.13242271746944645, + "kl_loss": 0.05788666009902954, + "loss_ib": 0.00546629261225462, + "step": 921 + }, + { + "ce_ib": 4.9963812828063965, + "ce_orig": 0.641473114490509, + "epoch": 0.1325664989216391, + "kl_loss": 0.076176717877388, + "loss_ib": 0.0063070268370211124, + "step": 922 + }, + { + "ce_ib": 5.956387996673584, + "ce_orig": 0.6581974625587463, + "epoch": 0.13271028037383178, + "kl_loss": 0.08382155001163483, + "loss_ib": 0.007169271353632212, + "step": 923 + }, + { + "ce_ib": 6.518821716308594, + "ce_orig": 0.8877462148666382, + "epoch": 0.13285406182602444, + "kl_loss": 0.07050419598817825, + "loss_ib": 0.0067846211604774, + "step": 924 + }, + { + "epoch": 0.13299784327821712, + "grad_norm": 0.1254836916923523, + "learning_rate": 4.99748490432618e-05, + "loss": 0.7649, + "step": 925 + }, + { + "ce_ib": 9.03060531616211, + "ce_orig": 1.0601074695587158, + "epoch": 0.13299784327821712, + "kl_loss": 0.1262633204460144, + "loss_ib": 0.010828468017280102, + "step": 925 + }, + { + "ce_ib": 5.390571594238281, + "ce_orig": 0.6605896353721619, + "epoch": 0.13314162473040977, + "kl_loss": 0.08747322857379913, + "loss_ib": 0.007068946957588196, + "step": 926 + }, + { + "ce_ib": 9.145002365112305, + "ce_orig": 0.7847701907157898, + "epoch": 0.13328540618260246, + "kl_loss": 0.08090417087078094, + "loss_ib": 0.008617709390819073, + "step": 927 + }, + { + "ce_ib": 5.643058776855469, + "ce_orig": 0.7314335107803345, + "epoch": 0.1334291876347951, + "kl_loss": 0.05843006446957588, + "loss_ib": 0.0057430327869951725, + "step": 928 + }, + { + "ce_ib": 6.753859996795654, + "ce_orig": 1.0451164245605469, + "epoch": 0.1335729690869878, + "kl_loss": 0.07054602354764938, + "loss_ib": 0.006904230918735266, + "step": 929 + }, + { + "epoch": 0.13371675053918045, + "grad_norm": 0.1414111852645874, + "learning_rate": 4.9973971391960167e-05, + "loss": 0.8961, + "step": 930 + }, + { + "ce_ib": 8.576930046081543, + "ce_orig": 1.1311094760894775, + "epoch": 0.13371675053918045, + "kl_loss": 0.09142087399959564, + "loss_ib": 0.00885950867086649, + "step": 930 + }, + { + "ce_ib": 7.471253871917725, + "ce_orig": 1.0194929838180542, + "epoch": 0.1338605319913731, + "kl_loss": 0.11889418959617615, + "loss_ib": 0.009680337272584438, + "step": 931 + }, + { + "ce_ib": 6.724924564361572, + "ce_orig": 0.864520788192749, + "epoch": 0.13400431344356578, + "kl_loss": 0.09439219534397125, + "loss_ib": 0.008082072250545025, + "step": 932 + }, + { + "ce_ib": 7.768470287322998, + "ce_orig": 1.0322949886322021, + "epoch": 0.13414809489575844, + "kl_loss": 0.16739840805530548, + "loss_ib": 0.012254155240952969, + "step": 933 + }, + { + "ce_ib": 8.237820625305176, + "ce_orig": 1.0064351558685303, + "epoch": 0.13429187634795112, + "kl_loss": 0.10145040601491928, + "loss_ib": 0.009191430173814297, + "step": 934 + }, + { + "epoch": 0.13443565780014377, + "grad_norm": 0.1539893001317978, + "learning_rate": 4.997307869714637e-05, + "loss": 0.8736, + "step": 935 + }, + { + "ce_ib": 6.874648571014404, + "ce_orig": 1.1211917400360107, + "epoch": 0.13443565780014377, + "kl_loss": 0.10167817026376724, + "loss_ib": 0.00852123275399208, + "step": 935 + }, + { + "ce_ib": 5.751309871673584, + "ce_orig": 0.8845812082290649, + "epoch": 0.13457943925233645, + "kl_loss": 0.09366403520107269, + "loss_ib": 0.007558857090771198, + "step": 936 + }, + { + "ce_ib": 8.573022842407227, + "ce_orig": 1.0504612922668457, + "epoch": 0.1347232207045291, + "kl_loss": 0.07060796767473221, + "loss_ib": 0.007816909812390804, + "step": 937 + }, + { + "ce_ib": 8.377857208251953, + "ce_orig": 0.9093595147132874, + "epoch": 0.1348670021567218, + "kl_loss": 0.11797395348548889, + "loss_ib": 0.010087626054883003, + "step": 938 + }, + { + "ce_ib": 7.620189189910889, + "ce_orig": 0.9195590019226074, + "epoch": 0.13501078360891444, + "kl_loss": 0.11536361277103424, + "loss_ib": 0.009578275494277477, + "step": 939 + }, + { + "epoch": 0.13515456506110712, + "grad_norm": 0.12349986284971237, + "learning_rate": 4.9972170959358156e-05, + "loss": 0.8263, + "step": 940 + }, + { + "ce_ib": 4.433289527893066, + "ce_orig": 0.5269767642021179, + "epoch": 0.13515456506110712, + "kl_loss": 0.06309656798839569, + "loss_ib": 0.005371473263949156, + "step": 940 + }, + { + "ce_ib": 6.865793228149414, + "ce_orig": 0.8600917458534241, + "epoch": 0.13529834651329978, + "kl_loss": 0.268341064453125, + "loss_ib": 0.016849949955940247, + "step": 941 + }, + { + "ce_ib": 5.769837856292725, + "ce_orig": 0.9609159827232361, + "epoch": 0.13544212796549246, + "kl_loss": 0.073099285364151, + "loss_ib": 0.006539882626384497, + "step": 942 + }, + { + "ce_ib": 6.068971157073975, + "ce_orig": 0.8975405097007751, + "epoch": 0.1355859094176851, + "kl_loss": 0.07797665894031525, + "loss_ib": 0.006933317985385656, + "step": 943 + }, + { + "ce_ib": 4.143486976623535, + "ce_orig": 0.5329647660255432, + "epoch": 0.1357296908698778, + "kl_loss": 0.1322351098060608, + "loss_ib": 0.008683498948812485, + "step": 944 + }, + { + "epoch": 0.13587347232207045, + "grad_norm": 0.12345600128173828, + "learning_rate": 4.9971248179142296e-05, + "loss": 0.8698, + "step": 945 + }, + { + "ce_ib": 8.5775146484375, + "ce_orig": 1.4092092514038086, + "epoch": 0.13587347232207045, + "kl_loss": 0.07750563323497772, + "loss_ib": 0.008164039812982082, + "step": 945 + }, + { + "ce_ib": 6.79196310043335, + "ce_orig": 1.1046801805496216, + "epoch": 0.13601725377426313, + "kl_loss": 0.08599655330181122, + "loss_ib": 0.007695809006690979, + "step": 946 + }, + { + "ce_ib": 6.365146636962891, + "ce_orig": 0.7971789836883545, + "epoch": 0.13616103522645578, + "kl_loss": 0.08945336192846298, + "loss_ib": 0.007655241526663303, + "step": 947 + }, + { + "ce_ib": 6.125009059906006, + "ce_orig": 0.9030478000640869, + "epoch": 0.13630481667864847, + "kl_loss": 0.08069920539855957, + "loss_ib": 0.007097464986145496, + "step": 948 + }, + { + "ce_ib": 7.449437618255615, + "ce_orig": 0.9372292757034302, + "epoch": 0.13644859813084112, + "kl_loss": 0.07520076632499695, + "loss_ib": 0.007484757341444492, + "step": 949 + }, + { + "epoch": 0.1365923795830338, + "grad_norm": 0.1214538961648941, + "learning_rate": 4.997031035705466e-05, + "loss": 0.8814, + "step": 950 + }, + { + "ce_ib": 6.2350687980651855, + "ce_orig": 0.8483865857124329, + "epoch": 0.1365923795830338, + "kl_loss": 0.08866537362337112, + "loss_ib": 0.007550803013145924, + "step": 950 + }, + { + "ce_ib": 6.066655158996582, + "ce_orig": 0.6078613996505737, + "epoch": 0.13673616103522646, + "kl_loss": 0.06268885731697083, + "loss_ib": 0.006167770363390446, + "step": 951 + }, + { + "ce_ib": 7.969608783721924, + "ce_orig": 1.1507118940353394, + "epoch": 0.1368799424874191, + "kl_loss": 0.07303150743246078, + "loss_ib": 0.007636380381882191, + "step": 952 + }, + { + "ce_ib": 6.006031036376953, + "ce_orig": 0.7712607383728027, + "epoch": 0.1370237239396118, + "kl_loss": 0.1005963534116745, + "loss_ib": 0.008032833226025105, + "step": 953 + }, + { + "ce_ib": 6.047878265380859, + "ce_orig": 0.8621774315834045, + "epoch": 0.13716750539180445, + "kl_loss": 0.06825101375579834, + "loss_ib": 0.006436489522457123, + "step": 954 + }, + { + "epoch": 0.13731128684399713, + "grad_norm": 0.1393401324748993, + "learning_rate": 4.996935749366015e-05, + "loss": 0.9195, + "step": 955 + }, + { + "ce_ib": 5.52618408203125, + "ce_orig": 0.7416598200798035, + "epoch": 0.13731128684399713, + "kl_loss": 0.07697509229183197, + "loss_ib": 0.006611846387386322, + "step": 955 + }, + { + "ce_ib": 5.450565338134766, + "ce_orig": 0.7644197940826416, + "epoch": 0.13745506829618978, + "kl_loss": 0.06023329496383667, + "loss_ib": 0.005736947525292635, + "step": 956 + }, + { + "ce_ib": 7.305458068847656, + "ce_orig": 1.0978727340698242, + "epoch": 0.13759884974838246, + "kl_loss": 0.11561448872089386, + "loss_ib": 0.009433453902602196, + "step": 957 + }, + { + "ce_ib": 6.976890563964844, + "ce_orig": 0.7256874442100525, + "epoch": 0.13774263120057512, + "kl_loss": 0.09455625712871552, + "loss_ib": 0.008216258138418198, + "step": 958 + }, + { + "ce_ib": 6.9282097816467285, + "ce_orig": 0.7956810593605042, + "epoch": 0.1378864126527678, + "kl_loss": 0.087988942861557, + "loss_ib": 0.007863552309572697, + "step": 959 + }, + { + "epoch": 0.13803019410496045, + "grad_norm": 0.12086982280015945, + "learning_rate": 4.996838958953275e-05, + "loss": 0.8213, + "step": 960 + }, + { + "ce_ib": 6.342545032501221, + "ce_orig": 0.7129842638969421, + "epoch": 0.13803019410496045, + "kl_loss": 0.13139081001281738, + "loss_ib": 0.009740813635289669, + "step": 960 + }, + { + "ce_ib": 6.692712306976318, + "ce_orig": 0.8974847197532654, + "epoch": 0.13817397555715313, + "kl_loss": 0.08609960973262787, + "loss_ib": 0.007651336491107941, + "step": 961 + }, + { + "ce_ib": 8.433094024658203, + "ce_orig": 1.1379026174545288, + "epoch": 0.1383177570093458, + "kl_loss": 0.10287706553936005, + "loss_ib": 0.009360400028526783, + "step": 962 + }, + { + "ce_ib": 8.236790657043457, + "ce_orig": 1.2443400621414185, + "epoch": 0.13846153846153847, + "kl_loss": 0.08817592263221741, + "loss_ib": 0.00852719135582447, + "step": 963 + }, + { + "ce_ib": 5.165767669677734, + "ce_orig": 0.5761904716491699, + "epoch": 0.13860531991373112, + "kl_loss": 0.0794210359454155, + "loss_ib": 0.006553936284035444, + "step": 964 + }, + { + "epoch": 0.1387491013659238, + "grad_norm": 0.1046949103474617, + "learning_rate": 4.996740664525549e-05, + "loss": 0.9412, + "step": 965 + }, + { + "ce_ib": 7.795332908630371, + "ce_orig": 1.0906161069869995, + "epoch": 0.1387491013659238, + "kl_loss": 0.07911582291126251, + "loss_ib": 0.00785345770418644, + "step": 965 + }, + { + "ce_ib": 5.823874473571777, + "ce_orig": 0.6993345022201538, + "epoch": 0.13889288281811646, + "kl_loss": 0.08959372341632843, + "loss_ib": 0.0073916236869990826, + "step": 966 + }, + { + "ce_ib": 7.680074214935303, + "ce_orig": 1.1049128770828247, + "epoch": 0.13903666427030914, + "kl_loss": 0.08348916471004486, + "loss_ib": 0.008014495484530926, + "step": 967 + }, + { + "ce_ib": 6.938556671142578, + "ce_orig": 0.6378891468048096, + "epoch": 0.1391804457225018, + "kl_loss": 0.11031496524810791, + "loss_ib": 0.008985026739537716, + "step": 968 + }, + { + "ce_ib": 8.571457862854004, + "ce_orig": 1.1795148849487305, + "epoch": 0.13932422717469448, + "kl_loss": 0.08533471077680588, + "loss_ib": 0.00855246465653181, + "step": 969 + }, + { + "epoch": 0.13946800862688713, + "grad_norm": 0.1300869584083557, + "learning_rate": 4.996640866142046e-05, + "loss": 0.9504, + "step": 970 + }, + { + "ce_ib": 8.15340805053711, + "ce_orig": 1.0846816301345825, + "epoch": 0.13946800862688713, + "kl_loss": 0.08402827382087708, + "loss_ib": 0.008278118446469307, + "step": 970 + }, + { + "ce_ib": 4.409031391143799, + "ce_orig": 0.46880093216896057, + "epoch": 0.1396117900790798, + "kl_loss": 0.11544293165206909, + "loss_ib": 0.007976662367582321, + "step": 971 + }, + { + "ce_ib": 8.286463737487793, + "ce_orig": 0.7137627601623535, + "epoch": 0.13975557153127247, + "kl_loss": 0.07826922088861465, + "loss_ib": 0.008056692779064178, + "step": 972 + }, + { + "ce_ib": 7.249392986297607, + "ce_orig": 1.0557063817977905, + "epoch": 0.13989935298346512, + "kl_loss": 0.08211711049079895, + "loss_ib": 0.007730551995337009, + "step": 973 + }, + { + "ce_ib": 6.922652721405029, + "ce_orig": 0.7434417009353638, + "epoch": 0.1400431344356578, + "kl_loss": 0.09765265882015228, + "loss_ib": 0.008343959227204323, + "step": 974 + }, + { + "epoch": 0.14018691588785046, + "grad_norm": 0.10048684477806091, + "learning_rate": 4.996539563862881e-05, + "loss": 0.9914, + "step": 975 + }, + { + "ce_ib": 5.8394904136657715, + "ce_orig": 0.9871096611022949, + "epoch": 0.14018691588785046, + "kl_loss": 0.07624734193086624, + "loss_ib": 0.0067321122623980045, + "step": 975 + }, + { + "ce_ib": 4.4480671882629395, + "ce_orig": 0.5778042078018188, + "epoch": 0.14033069734004314, + "kl_loss": 0.12898705899715424, + "loss_ib": 0.00867338664829731, + "step": 976 + }, + { + "ce_ib": 4.4938836097717285, + "ce_orig": 0.5107190012931824, + "epoch": 0.1404744787922358, + "kl_loss": 0.060333944857120514, + "loss_ib": 0.005263639148324728, + "step": 977 + }, + { + "ce_ib": 5.551029682159424, + "ce_orig": 0.7901322841644287, + "epoch": 0.14061826024442847, + "kl_loss": 0.1715565174818039, + "loss_ib": 0.011353340931236744, + "step": 978 + }, + { + "ce_ib": 6.682663917541504, + "ce_orig": 0.8593956828117371, + "epoch": 0.14076204169662113, + "kl_loss": 0.08176585286855698, + "loss_ib": 0.007429624442011118, + "step": 979 + }, + { + "epoch": 0.1409058231488138, + "grad_norm": 0.1225329264998436, + "learning_rate": 4.996436757749077e-05, + "loss": 0.8339, + "step": 980 + }, + { + "ce_ib": 5.105162620544434, + "ce_orig": 0.6780040860176086, + "epoch": 0.1409058231488138, + "kl_loss": 0.06629600375890732, + "loss_ib": 0.005867381580173969, + "step": 980 + }, + { + "ce_ib": 6.971573352813721, + "ce_orig": 1.1731876134872437, + "epoch": 0.14104960460100646, + "kl_loss": 0.09537965059280396, + "loss_ib": 0.008254769258201122, + "step": 981 + }, + { + "ce_ib": 7.419947624206543, + "ce_orig": 1.0379914045333862, + "epoch": 0.14119338605319914, + "kl_loss": 0.09539618343114853, + "loss_ib": 0.008479783311486244, + "step": 982 + }, + { + "ce_ib": 8.027047157287598, + "ce_orig": 1.0013635158538818, + "epoch": 0.1413371675053918, + "kl_loss": 0.11693254113197327, + "loss_ib": 0.009860150516033173, + "step": 983 + }, + { + "ce_ib": 5.866478443145752, + "ce_orig": 0.8373990058898926, + "epoch": 0.14148094895758448, + "kl_loss": 0.056306250393390656, + "loss_ib": 0.005748551804572344, + "step": 984 + }, + { + "epoch": 0.14162473040977713, + "grad_norm": 0.12583786249160767, + "learning_rate": 4.99633244786256e-05, + "loss": 0.9591, + "step": 985 + }, + { + "ce_ib": 9.714642524719238, + "ce_orig": 1.50913405418396, + "epoch": 0.14162473040977713, + "kl_loss": 0.11189399659633636, + "loss_ib": 0.010452020913362503, + "step": 985 + }, + { + "ce_ib": 5.668364524841309, + "ce_orig": 0.510465681552887, + "epoch": 0.14176851186196981, + "kl_loss": 0.10138311982154846, + "loss_ib": 0.00790333840996027, + "step": 986 + }, + { + "ce_ib": 6.4766764640808105, + "ce_orig": 0.9878252744674683, + "epoch": 0.14191229331416247, + "kl_loss": 0.05886990204453468, + "loss_ib": 0.0061818333342671394, + "step": 987 + }, + { + "ce_ib": 4.52413272857666, + "ce_orig": 0.6275699734687805, + "epoch": 0.14205607476635515, + "kl_loss": 0.07127900421619415, + "loss_ib": 0.0058260164223611355, + "step": 988 + }, + { + "ce_ib": 7.104588031768799, + "ce_orig": 1.0290782451629639, + "epoch": 0.1421998562185478, + "kl_loss": 0.09330768138170242, + "loss_ib": 0.008217677474021912, + "step": 989 + }, + { + "epoch": 0.14234363767074049, + "grad_norm": 0.15013280510902405, + "learning_rate": 4.9962266342661624e-05, + "loss": 0.8154, + "step": 990 + }, + { + "ce_ib": 6.073655128479004, + "ce_orig": 0.7287778258323669, + "epoch": 0.14234363767074049, + "kl_loss": 0.08072468638420105, + "loss_ib": 0.007073062006384134, + "step": 990 + }, + { + "ce_ib": 7.8407883644104, + "ce_orig": 0.9141483902931213, + "epoch": 0.14248741912293314, + "kl_loss": 0.07278452068567276, + "loss_ib": 0.007559619843959808, + "step": 991 + }, + { + "ce_ib": 7.545557975769043, + "ce_orig": 1.0704678297042847, + "epoch": 0.14263120057512582, + "kl_loss": 0.10130893439054489, + "loss_ib": 0.008838226087391376, + "step": 992 + }, + { + "ce_ib": 9.583335876464844, + "ce_orig": 1.1344443559646606, + "epoch": 0.14277498202731848, + "kl_loss": 0.0773579329252243, + "loss_ib": 0.008659563958644867, + "step": 993 + }, + { + "ce_ib": 6.852370738983154, + "ce_orig": 1.0198527574539185, + "epoch": 0.14291876347951113, + "kl_loss": 0.09604521840810776, + "loss_ib": 0.00822844635695219, + "step": 994 + }, + { + "epoch": 0.1430625449317038, + "grad_norm": 0.140967458486557, + "learning_rate": 4.9961193170236234e-05, + "loss": 0.9539, + "step": 995 + }, + { + "ce_ib": 7.311800956726074, + "ce_orig": 0.8755852580070496, + "epoch": 0.1430625449317038, + "kl_loss": 0.1115192174911499, + "loss_ib": 0.009231860749423504, + "step": 995 + }, + { + "ce_ib": 7.3969621658325195, + "ce_orig": 0.7122746706008911, + "epoch": 0.14320632638389647, + "kl_loss": 0.16122053563594818, + "loss_ib": 0.011759507469832897, + "step": 996 + }, + { + "ce_ib": 6.429070949554443, + "ce_orig": 0.7772039175033569, + "epoch": 0.14335010783608915, + "kl_loss": 0.06662163138389587, + "loss_ib": 0.006545616779476404, + "step": 997 + }, + { + "ce_ib": 6.746508598327637, + "ce_orig": 1.0281816720962524, + "epoch": 0.1434938892882818, + "kl_loss": 0.11056394129991531, + "loss_ib": 0.008901451714336872, + "step": 998 + }, + { + "ce_ib": 5.576045989990234, + "ce_orig": 0.8467837572097778, + "epoch": 0.14363767074047448, + "kl_loss": 0.08954350650310516, + "loss_ib": 0.007265198510140181, + "step": 999 + }, + { + "epoch": 0.14378145219266714, + "grad_norm": 0.1209883913397789, + "learning_rate": 4.996010496199587e-05, + "loss": 0.8696, + "step": 1000 + }, + { + "ce_ib": 7.336287021636963, + "ce_orig": 0.8564727306365967, + "epoch": 0.14378145219266714, + "kl_loss": 0.11579255759716034, + "loss_ib": 0.00945777166634798, + "step": 1000 + }, + { + "ce_ib": 5.5097270011901855, + "ce_orig": 0.7456053495407104, + "epoch": 0.14392523364485982, + "kl_loss": 0.08972552418708801, + "loss_ib": 0.007241139654070139, + "step": 1001 + }, + { + "ce_ib": 4.607260704040527, + "ce_orig": 0.6881207823753357, + "epoch": 0.14406901509705247, + "kl_loss": 0.07507991790771484, + "loss_ib": 0.006057626102119684, + "step": 1002 + }, + { + "ce_ib": 7.471873760223389, + "ce_orig": 1.116858720779419, + "epoch": 0.14421279654924515, + "kl_loss": 0.08241486549377441, + "loss_ib": 0.007856681011617184, + "step": 1003 + }, + { + "ce_ib": 6.235586166381836, + "ce_orig": 0.7015225887298584, + "epoch": 0.1443565780014378, + "kl_loss": 0.1668388545513153, + "loss_ib": 0.01145973615348339, + "step": 1004 + }, + { + "epoch": 0.1445003594536305, + "grad_norm": 0.11427145451307297, + "learning_rate": 4.995900171859604e-05, + "loss": 0.7978, + "step": 1005 + }, + { + "ce_ib": 6.651737689971924, + "ce_orig": 0.7342057228088379, + "epoch": 0.1445003594536305, + "kl_loss": 0.11493319272994995, + "loss_ib": 0.009072529152035713, + "step": 1005 + }, + { + "ce_ib": 7.091629981994629, + "ce_orig": 0.5985906720161438, + "epoch": 0.14464414090582314, + "kl_loss": 0.10984715819358826, + "loss_ib": 0.009038172662258148, + "step": 1006 + }, + { + "ce_ib": 5.66042947769165, + "ce_orig": 0.87228924036026, + "epoch": 0.14478792235801582, + "kl_loss": 0.05440814793109894, + "loss_ib": 0.005550622474402189, + "step": 1007 + }, + { + "ce_ib": 8.690061569213867, + "ce_orig": 1.4457069635391235, + "epoch": 0.14493170381020848, + "kl_loss": 0.07694371044635773, + "loss_ib": 0.008192216046154499, + "step": 1008 + }, + { + "ce_ib": 6.1012163162231445, + "ce_orig": 0.745335578918457, + "epoch": 0.14507548526240116, + "kl_loss": 0.09648852795362473, + "loss_ib": 0.007875034585595131, + "step": 1009 + }, + { + "epoch": 0.14521926671459381, + "grad_norm": 0.1259513795375824, + "learning_rate": 4.995788344070129e-05, + "loss": 1.0045, + "step": 1010 + }, + { + "ce_ib": 6.894099712371826, + "ce_orig": 0.932343065738678, + "epoch": 0.14521926671459381, + "kl_loss": 0.07711976766586304, + "loss_ib": 0.0073030381463468075, + "step": 1010 + }, + { + "ce_ib": 5.7034807205200195, + "ce_orig": 0.43693673610687256, + "epoch": 0.1453630481667865, + "kl_loss": 0.08185969293117523, + "loss_ib": 0.006944724824279547, + "step": 1011 + }, + { + "ce_ib": 6.72266149520874, + "ce_orig": 0.7800190448760986, + "epoch": 0.14550682961897915, + "kl_loss": 0.11143870651721954, + "loss_ib": 0.008933265693485737, + "step": 1012 + }, + { + "ce_ib": 6.198060035705566, + "ce_orig": 0.6116705536842346, + "epoch": 0.14565061107117183, + "kl_loss": 0.061753034591674805, + "loss_ib": 0.006186681799590588, + "step": 1013 + }, + { + "ce_ib": 9.215808868408203, + "ce_orig": 1.5545287132263184, + "epoch": 0.14579439252336449, + "kl_loss": 0.15811239182949066, + "loss_ib": 0.012513524852693081, + "step": 1014 + }, + { + "epoch": 0.14593817397555714, + "grad_norm": 0.11410394310951233, + "learning_rate": 4.9956750128985255e-05, + "loss": 0.8119, + "step": 1015 + }, + { + "ce_ib": 6.611013889312744, + "ce_orig": 0.6692886352539062, + "epoch": 0.14593817397555714, + "kl_loss": 0.20762485265731812, + "loss_ib": 0.013686749152839184, + "step": 1015 + }, + { + "ce_ib": 8.01150894165039, + "ce_orig": 1.0256410837173462, + "epoch": 0.14608195542774982, + "kl_loss": 0.07970279455184937, + "loss_ib": 0.00799089390784502, + "step": 1016 + }, + { + "ce_ib": 7.623814105987549, + "ce_orig": 1.099854588508606, + "epoch": 0.14622573687994248, + "kl_loss": 0.12635360658168793, + "loss_ib": 0.010129587724804878, + "step": 1017 + }, + { + "ce_ib": 8.276596069335938, + "ce_orig": 1.2237489223480225, + "epoch": 0.14636951833213516, + "kl_loss": 0.0816061943769455, + "loss_ib": 0.008218607865273952, + "step": 1018 + }, + { + "ce_ib": 5.7402520179748535, + "ce_orig": 0.7188358902931213, + "epoch": 0.1465132997843278, + "kl_loss": 0.06798361241817474, + "loss_ib": 0.006269306875765324, + "step": 1019 + }, + { + "epoch": 0.1466570812365205, + "grad_norm": 0.14134688675403595, + "learning_rate": 4.995560178413059e-05, + "loss": 0.8605, + "step": 1020 + }, + { + "ce_ib": 6.9819464683532715, + "ce_orig": 0.9869663715362549, + "epoch": 0.1466570812365205, + "kl_loss": 0.08391620218753815, + "loss_ib": 0.00768678355962038, + "step": 1020 + }, + { + "ce_ib": 8.797165870666504, + "ce_orig": 1.0456606149673462, + "epoch": 0.14680086268871315, + "kl_loss": 0.0919669046998024, + "loss_ib": 0.008996928110718727, + "step": 1021 + }, + { + "ce_ib": 7.765720367431641, + "ce_orig": 1.1654472351074219, + "epoch": 0.14694464414090583, + "kl_loss": 0.07526499032974243, + "loss_ib": 0.007646109443157911, + "step": 1022 + }, + { + "ce_ib": 7.337822914123535, + "ce_orig": 0.8869658708572388, + "epoch": 0.14708842559309848, + "kl_loss": 0.15245258808135986, + "loss_ib": 0.011291541159152985, + "step": 1023 + }, + { + "ce_ib": 6.735894203186035, + "ce_orig": 0.9667019844055176, + "epoch": 0.14723220704529116, + "kl_loss": 0.09678266942501068, + "loss_ib": 0.008207080885767937, + "step": 1024 + }, + { + "epoch": 0.14737598849748382, + "grad_norm": 0.12001588195562363, + "learning_rate": 4.995443840682903e-05, + "loss": 0.8643, + "step": 1025 + }, + { + "ce_ib": 7.526907920837402, + "ce_orig": 0.9696366190910339, + "epoch": 0.14737598849748382, + "kl_loss": 0.08580797165632248, + "loss_ib": 0.008053853176534176, + "step": 1025 + }, + { + "ce_ib": 7.954714298248291, + "ce_orig": 1.1440271139144897, + "epoch": 0.1475197699496765, + "kl_loss": 0.07150311022996902, + "loss_ib": 0.007552512921392918, + "step": 1026 + }, + { + "ce_ib": 4.705499649047852, + "ce_orig": 0.5527094006538391, + "epoch": 0.14766355140186915, + "kl_loss": 0.08073259890079498, + "loss_ib": 0.0063893795013427734, + "step": 1027 + }, + { + "ce_ib": 9.533058166503906, + "ce_orig": 1.3014625310897827, + "epoch": 0.14780733285406183, + "kl_loss": 0.12401114404201508, + "loss_ib": 0.01096708606928587, + "step": 1028 + }, + { + "ce_ib": 6.954570770263672, + "ce_orig": 1.1259472370147705, + "epoch": 0.1479511143062545, + "kl_loss": 0.09523732960224152, + "loss_ib": 0.008239151909947395, + "step": 1029 + }, + { + "epoch": 0.14809489575844717, + "grad_norm": 0.12638209760189056, + "learning_rate": 4.995325999778134e-05, + "loss": 0.8628, + "step": 1030 + }, + { + "ce_ib": 6.894320011138916, + "ce_orig": 1.1501129865646362, + "epoch": 0.14809489575844717, + "kl_loss": 0.09805652499198914, + "loss_ib": 0.008349986746907234, + "step": 1030 + }, + { + "ce_ib": 7.414271831512451, + "ce_orig": 1.1060221195220947, + "epoch": 0.14823867721063982, + "kl_loss": 0.07762119174003601, + "loss_ib": 0.007588196080178022, + "step": 1031 + }, + { + "ce_ib": 7.322919845581055, + "ce_orig": 0.9979360103607178, + "epoch": 0.1483824586628325, + "kl_loss": 0.07417953759431839, + "loss_ib": 0.00737043609842658, + "step": 1032 + }, + { + "ce_ib": 8.12809944152832, + "ce_orig": 1.1163820028305054, + "epoch": 0.14852624011502516, + "kl_loss": 0.08834036439657211, + "loss_ib": 0.008481067605316639, + "step": 1033 + }, + { + "ce_ib": 4.993091106414795, + "ce_orig": 0.7060537934303284, + "epoch": 0.14867002156721784, + "kl_loss": 0.09418373554944992, + "loss_ib": 0.00720573216676712, + "step": 1034 + }, + { + "epoch": 0.1488138030194105, + "grad_norm": 0.11499213427305222, + "learning_rate": 4.9952066557697374e-05, + "loss": 0.872, + "step": 1035 + }, + { + "ce_ib": 6.807557106018066, + "ce_orig": 0.9086964130401611, + "epoch": 0.1488138030194105, + "kl_loss": 0.07040740549564362, + "loss_ib": 0.00692414864897728, + "step": 1035 + }, + { + "ce_ib": 7.440614223480225, + "ce_orig": 0.9355759024620056, + "epoch": 0.14895758447160315, + "kl_loss": 0.10693731904029846, + "loss_ib": 0.0090671731159091, + "step": 1036 + }, + { + "ce_ib": 6.433082580566406, + "ce_orig": 0.8045613765716553, + "epoch": 0.14910136592379583, + "kl_loss": 0.09572334587574005, + "loss_ib": 0.008002708666026592, + "step": 1037 + }, + { + "ce_ib": 5.557839870452881, + "ce_orig": 0.7418414354324341, + "epoch": 0.14924514737598849, + "kl_loss": 0.0761575698852539, + "loss_ib": 0.006586798466742039, + "step": 1038 + }, + { + "ce_ib": 7.438146114349365, + "ce_orig": 1.055992841720581, + "epoch": 0.14938892882818117, + "kl_loss": 0.0997689738869667, + "loss_ib": 0.008707522414624691, + "step": 1039 + }, + { + "epoch": 0.14953271028037382, + "grad_norm": 0.24093793332576752, + "learning_rate": 4.995085808729602e-05, + "loss": 0.8729, + "step": 1040 + }, + { + "ce_ib": 7.951437950134277, + "ce_orig": 1.4896743297576904, + "epoch": 0.14953271028037382, + "kl_loss": 0.0694376528263092, + "loss_ib": 0.0074476017616689205, + "step": 1040 + }, + { + "ce_ib": 7.103911876678467, + "ce_orig": 0.8137007355690002, + "epoch": 0.1496764917325665, + "kl_loss": 0.09416785836219788, + "loss_ib": 0.008260348811745644, + "step": 1041 + }, + { + "ce_ib": 5.186192512512207, + "ce_orig": 0.8730520606040955, + "epoch": 0.14982027318475916, + "kl_loss": 0.06942439079284668, + "loss_ib": 0.006064315792173147, + "step": 1042 + }, + { + "ce_ib": 4.51282262802124, + "ce_orig": 0.7197614908218384, + "epoch": 0.14996405463695184, + "kl_loss": 0.0622485987842083, + "loss_ib": 0.0053688413463532925, + "step": 1043 + }, + { + "ce_ib": 4.631485939025879, + "ce_orig": 0.6428878307342529, + "epoch": 0.1501078360891445, + "kl_loss": 0.07299801707267761, + "loss_ib": 0.0059656440280377865, + "step": 1044 + }, + { + "epoch": 0.15025161754133717, + "grad_norm": 0.11844494193792343, + "learning_rate": 4.994963458730521e-05, + "loss": 0.8952, + "step": 1045 + }, + { + "ce_ib": 7.407634258270264, + "ce_orig": 1.1897940635681152, + "epoch": 0.15025161754133717, + "kl_loss": 0.07235468924045563, + "loss_ib": 0.007321551442146301, + "step": 1045 + }, + { + "ce_ib": 7.862279891967773, + "ce_orig": 1.3289810419082642, + "epoch": 0.15039539899352983, + "kl_loss": 0.0632917582988739, + "loss_ib": 0.007095727603882551, + "step": 1046 + }, + { + "ce_ib": 5.383222579956055, + "ce_orig": 0.7905131578445435, + "epoch": 0.1505391804457225, + "kl_loss": 0.08215737342834473, + "loss_ib": 0.00679948041215539, + "step": 1047 + }, + { + "ce_ib": 5.386118412017822, + "ce_orig": 0.7516582608222961, + "epoch": 0.15068296189791516, + "kl_loss": 0.08441755175590515, + "loss_ib": 0.006913937162607908, + "step": 1048 + }, + { + "ce_ib": 5.995546340942383, + "ce_orig": 0.657211422920227, + "epoch": 0.15082674335010784, + "kl_loss": 0.10880130529403687, + "loss_ib": 0.008437838405370712, + "step": 1049 + }, + { + "epoch": 0.1509705248023005, + "grad_norm": 0.13365881145000458, + "learning_rate": 4.994839605846193e-05, + "loss": 0.9522, + "step": 1050 + }, + { + "ce_ib": 9.530364990234375, + "ce_orig": 1.4248408079147339, + "epoch": 0.1509705248023005, + "kl_loss": 0.0971805527806282, + "loss_ib": 0.009624210186302662, + "step": 1050 + }, + { + "ce_ib": 5.94097375869751, + "ce_orig": 0.8278445601463318, + "epoch": 0.15111430625449318, + "kl_loss": 0.05822158232331276, + "loss_ib": 0.00588156608864665, + "step": 1051 + }, + { + "ce_ib": 6.744023323059082, + "ce_orig": 0.7395092844963074, + "epoch": 0.15125808770668583, + "kl_loss": 0.059324055910110474, + "loss_ib": 0.006338214967399836, + "step": 1052 + }, + { + "ce_ib": 5.886780261993408, + "ce_orig": 0.8318650722503662, + "epoch": 0.15140186915887852, + "kl_loss": 0.07253434509038925, + "loss_ib": 0.006570107769221067, + "step": 1053 + }, + { + "ce_ib": 5.713149070739746, + "ce_orig": 0.7542092800140381, + "epoch": 0.15154565061107117, + "kl_loss": 0.07283122837543488, + "loss_ib": 0.0064981356263160706, + "step": 1054 + }, + { + "epoch": 0.15168943206326385, + "grad_norm": 0.11253370344638824, + "learning_rate": 4.994714250151226e-05, + "loss": 0.9113, + "step": 1055 + }, + { + "ce_ib": 5.154370307922363, + "ce_orig": 0.9435141682624817, + "epoch": 0.15168943206326385, + "kl_loss": 0.0916782021522522, + "loss_ib": 0.007161095272749662, + "step": 1055 + }, + { + "ce_ib": 6.36662483215332, + "ce_orig": 0.9607053399085999, + "epoch": 0.1518332135154565, + "kl_loss": 0.09349434077739716, + "loss_ib": 0.007858029566705227, + "step": 1056 + }, + { + "ce_ib": 8.031435012817383, + "ce_orig": 0.6474438309669495, + "epoch": 0.15197699496764916, + "kl_loss": 0.2374052107334137, + "loss_ib": 0.0158859770745039, + "step": 1057 + }, + { + "ce_ib": 3.7962605953216553, + "ce_orig": 0.3701985776424408, + "epoch": 0.15212077641984184, + "kl_loss": 0.11971500515937805, + "loss_ib": 0.007883880287408829, + "step": 1058 + }, + { + "ce_ib": 6.266521453857422, + "ce_orig": 0.9449588656425476, + "epoch": 0.1522645578720345, + "kl_loss": 0.06766834855079651, + "loss_ib": 0.006516678724437952, + "step": 1059 + }, + { + "epoch": 0.15240833932422718, + "grad_norm": 0.13617506623268127, + "learning_rate": 4.9945873917211286e-05, + "loss": 0.8128, + "step": 1060 + }, + { + "ce_ib": 8.680150985717773, + "ce_orig": 1.2594325542449951, + "epoch": 0.15240833932422718, + "kl_loss": 0.09351365268230438, + "loss_ib": 0.009015758521854877, + "step": 1060 + }, + { + "ce_ib": 6.909031867980957, + "ce_orig": 0.9241549968719482, + "epoch": 0.15255212077641983, + "kl_loss": 0.09920968115329742, + "loss_ib": 0.008415000513195992, + "step": 1061 + }, + { + "ce_ib": 7.288214206695557, + "ce_orig": 1.096737027168274, + "epoch": 0.1526959022286125, + "kl_loss": 0.07453425228595734, + "loss_ib": 0.007370819337666035, + "step": 1062 + }, + { + "ce_ib": 2.7316372394561768, + "ce_orig": 0.26801544427871704, + "epoch": 0.15283968368080517, + "kl_loss": 0.1300431787967682, + "loss_ib": 0.00786797795444727, + "step": 1063 + }, + { + "ce_ib": 6.3758440017700195, + "ce_orig": 0.9348464012145996, + "epoch": 0.15298346513299785, + "kl_loss": 0.12439551949501038, + "loss_ib": 0.009407698176801205, + "step": 1064 + }, + { + "epoch": 0.1531272465851905, + "grad_norm": 0.10688955336809158, + "learning_rate": 4.994459030632317e-05, + "loss": 0.8912, + "step": 1065 + }, + { + "ce_ib": 7.558090686798096, + "ce_orig": 1.1949886083602905, + "epoch": 0.1531272465851905, + "kl_loss": 0.10483516752719879, + "loss_ib": 0.009020804427564144, + "step": 1065 + }, + { + "ce_ib": 4.913568496704102, + "ce_orig": 0.6736735105514526, + "epoch": 0.15327102803738318, + "kl_loss": 0.07856868952512741, + "loss_ib": 0.006385219283401966, + "step": 1066 + }, + { + "ce_ib": 6.382769584655762, + "ce_orig": 0.8610799312591553, + "epoch": 0.15341480948957584, + "kl_loss": 0.09864749014377594, + "loss_ib": 0.008123759180307388, + "step": 1067 + }, + { + "ce_ib": 5.849618911743164, + "ce_orig": 0.857602059841156, + "epoch": 0.15355859094176852, + "kl_loss": 0.09797794371843338, + "loss_ib": 0.007823706604540348, + "step": 1068 + }, + { + "ce_ib": 5.8813581466674805, + "ce_orig": 0.8178615570068359, + "epoch": 0.15370237239396117, + "kl_loss": 0.10369985550642014, + "loss_ib": 0.008125672116875648, + "step": 1069 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 0.13013261556625366, + "learning_rate": 4.99432916696211e-05, + "loss": 0.8572, + "step": 1070 + }, + { + "ce_ib": 7.825472354888916, + "ce_orig": 0.9892460107803345, + "epoch": 0.15384615384615385, + "kl_loss": 0.10009782016277313, + "loss_ib": 0.008917626924812794, + "step": 1070 + }, + { + "ce_ib": 6.338406562805176, + "ce_orig": 0.8518531918525696, + "epoch": 0.1539899352983465, + "kl_loss": 0.10175366699695587, + "loss_ib": 0.008256887085735798, + "step": 1071 + }, + { + "ce_ib": 5.73522424697876, + "ce_orig": 1.0242947340011597, + "epoch": 0.1541337167505392, + "kl_loss": 0.06288909912109375, + "loss_ib": 0.006012067198753357, + "step": 1072 + }, + { + "ce_ib": 5.708252429962158, + "ce_orig": 0.7614584565162659, + "epoch": 0.15427749820273184, + "kl_loss": 0.06935782730579376, + "loss_ib": 0.006322017405182123, + "step": 1073 + }, + { + "ce_ib": 7.683177471160889, + "ce_orig": 1.1224682331085205, + "epoch": 0.15442127965492453, + "kl_loss": 0.10193294286727905, + "loss_ib": 0.00893823616206646, + "step": 1074 + }, + { + "epoch": 0.15456506110711718, + "grad_norm": 0.12729093432426453, + "learning_rate": 4.994197800788735e-05, + "loss": 0.9258, + "step": 1075 + }, + { + "ce_ib": 8.33924674987793, + "ce_orig": 0.9208922982215881, + "epoch": 0.15456506110711718, + "kl_loss": 0.10732685029506683, + "loss_ib": 0.009535965509712696, + "step": 1075 + }, + { + "ce_ib": 7.906072616577148, + "ce_orig": 1.3306244611740112, + "epoch": 0.15470884255930986, + "kl_loss": 0.09973481297492981, + "loss_ib": 0.008939777500927448, + "step": 1076 + }, + { + "ce_ib": 4.723592281341553, + "ce_orig": 0.7697830200195312, + "epoch": 0.15485262401150252, + "kl_loss": 0.050751619040966034, + "loss_ib": 0.004899377003312111, + "step": 1077 + }, + { + "ce_ib": 6.142649173736572, + "ce_orig": 0.6455022692680359, + "epoch": 0.1549964054636952, + "kl_loss": 0.04849715158343315, + "loss_ib": 0.005496182478964329, + "step": 1078 + }, + { + "ce_ib": 6.342912673950195, + "ce_orig": 0.6407663822174072, + "epoch": 0.15514018691588785, + "kl_loss": 0.09040147811174393, + "loss_ib": 0.007691530045121908, + "step": 1079 + }, + { + "epoch": 0.1552839683680805, + "grad_norm": 0.11468973010778427, + "learning_rate": 4.9940649321913227e-05, + "loss": 0.8779, + "step": 1080 + }, + { + "ce_ib": 7.3335747718811035, + "ce_orig": 1.1533524990081787, + "epoch": 0.1552839683680805, + "kl_loss": 0.07371225953102112, + "loss_ib": 0.007352400105446577, + "step": 1080 + }, + { + "ce_ib": 7.188230514526367, + "ce_orig": 0.8834959268569946, + "epoch": 0.1554277498202732, + "kl_loss": 0.07604563236236572, + "loss_ib": 0.007396397180855274, + "step": 1081 + }, + { + "ce_ib": 5.953127861022949, + "ce_orig": 0.8870158195495605, + "epoch": 0.15557153127246584, + "kl_loss": 0.10391905158758163, + "loss_ib": 0.008172516711056232, + "step": 1082 + }, + { + "ce_ib": 7.811586856842041, + "ce_orig": 1.019079327583313, + "epoch": 0.15571531272465852, + "kl_loss": 0.10792341828346252, + "loss_ib": 0.009301964193582535, + "step": 1083 + }, + { + "ce_ib": 6.001402854919434, + "ce_orig": 0.824356198310852, + "epoch": 0.15585909417685118, + "kl_loss": 0.08555098623037338, + "loss_ib": 0.0072782509960234165, + "step": 1084 + }, + { + "epoch": 0.15600287562904386, + "grad_norm": 0.10831110924482346, + "learning_rate": 4.993930561249907e-05, + "loss": 0.9178, + "step": 1085 + }, + { + "ce_ib": 5.684700012207031, + "ce_orig": 0.6716042160987854, + "epoch": 0.15600287562904386, + "kl_loss": 0.0965367779135704, + "loss_ib": 0.007669189013540745, + "step": 1085 + }, + { + "ce_ib": 4.981112480163574, + "ce_orig": 0.791946530342102, + "epoch": 0.1561466570812365, + "kl_loss": 0.06735576689243317, + "loss_ib": 0.005858344491571188, + "step": 1086 + }, + { + "ce_ib": 6.52700138092041, + "ce_orig": 0.7721990346908569, + "epoch": 0.1562904385334292, + "kl_loss": 0.08555860817432404, + "loss_ib": 0.007541431579738855, + "step": 1087 + }, + { + "ce_ib": 5.841210842132568, + "ce_orig": 0.5673677325248718, + "epoch": 0.15643421998562185, + "kl_loss": 0.0819430947303772, + "loss_ib": 0.007017760071903467, + "step": 1088 + }, + { + "ce_ib": 6.370672702789307, + "ce_orig": 1.1397329568862915, + "epoch": 0.15657800143781453, + "kl_loss": 0.07329718768596649, + "loss_ib": 0.006850195582956076, + "step": 1089 + }, + { + "epoch": 0.15672178289000718, + "grad_norm": 0.14988858997821808, + "learning_rate": 4.9937946880454314e-05, + "loss": 0.8548, + "step": 1090 + }, + { + "ce_ib": 5.216709136962891, + "ce_orig": 0.6942836046218872, + "epoch": 0.15672178289000718, + "kl_loss": 0.07146431505680084, + "loss_ib": 0.0061815702356398106, + "step": 1090 + }, + { + "ce_ib": 7.10355806350708, + "ce_orig": 1.088200330734253, + "epoch": 0.15686556434219986, + "kl_loss": 0.10119420289993286, + "loss_ib": 0.008611489087343216, + "step": 1091 + }, + { + "ce_ib": 6.328213214874268, + "ce_orig": 0.9673717617988586, + "epoch": 0.15700934579439252, + "kl_loss": 0.07570363581180573, + "loss_ib": 0.006949288304895163, + "step": 1092 + }, + { + "ce_ib": 6.858358860015869, + "ce_orig": 0.7697678804397583, + "epoch": 0.1571531272465852, + "kl_loss": 0.08041620999574661, + "loss_ib": 0.007449989672750235, + "step": 1093 + }, + { + "ce_ib": 5.214935779571533, + "ce_orig": 0.8213125467300415, + "epoch": 0.15729690869877785, + "kl_loss": 0.07450224459171295, + "loss_ib": 0.006332580000162125, + "step": 1094 + }, + { + "epoch": 0.15744069015097054, + "grad_norm": 0.1458931714296341, + "learning_rate": 4.993657312659739e-05, + "loss": 0.8511, + "step": 1095 + }, + { + "ce_ib": 7.4719743728637695, + "ce_orig": 1.2657586336135864, + "epoch": 0.15744069015097054, + "kl_loss": 0.09175530076026917, + "loss_ib": 0.00832375232130289, + "step": 1095 + }, + { + "ce_ib": 7.025816440582275, + "ce_orig": 0.8322177529335022, + "epoch": 0.1575844716031632, + "kl_loss": 0.06736986339092255, + "loss_ib": 0.006881400942802429, + "step": 1096 + }, + { + "ce_ib": 5.072585105895996, + "ce_orig": 0.5746660232543945, + "epoch": 0.15772825305535587, + "kl_loss": 0.06974708288908005, + "loss_ib": 0.0060236467979848385, + "step": 1097 + }, + { + "ce_ib": 7.593356609344482, + "ce_orig": 0.923200249671936, + "epoch": 0.15787203450754853, + "kl_loss": 0.07396435737609863, + "loss_ib": 0.007494896184653044, + "step": 1098 + }, + { + "ce_ib": 6.740930557250977, + "ce_orig": 1.1538021564483643, + "epoch": 0.1580158159597412, + "kl_loss": 0.10030011087656021, + "loss_ib": 0.008385471068322659, + "step": 1099 + }, + { + "epoch": 0.15815959741193386, + "grad_norm": 0.11454831063747406, + "learning_rate": 4.9935184351755814e-05, + "loss": 0.8447, + "step": 1100 + }, + { + "ce_ib": 7.097838401794434, + "ce_orig": 1.1281813383102417, + "epoch": 0.15815959741193386, + "kl_loss": 0.06990428268909454, + "loss_ib": 0.007044133730232716, + "step": 1100 + }, + { + "ce_ib": 4.723097801208496, + "ce_orig": 0.6973043084144592, + "epoch": 0.15830337886412651, + "kl_loss": 0.07121522724628448, + "loss_ib": 0.005922310054302216, + "step": 1101 + }, + { + "ce_ib": 6.346230983734131, + "ce_orig": 0.7637441158294678, + "epoch": 0.1584471603163192, + "kl_loss": 0.1985844075679779, + "loss_ib": 0.013102336786687374, + "step": 1102 + }, + { + "ce_ib": 5.727991580963135, + "ce_orig": 0.8385193943977356, + "epoch": 0.15859094176851185, + "kl_loss": 0.0787828341126442, + "loss_ib": 0.0068031372502446175, + "step": 1103 + }, + { + "ce_ib": 6.202123165130615, + "ce_orig": 0.7559351325035095, + "epoch": 0.15873472322070453, + "kl_loss": 0.06039196252822876, + "loss_ib": 0.006120659410953522, + "step": 1104 + }, + { + "epoch": 0.1588785046728972, + "grad_norm": 0.14600904285907745, + "learning_rate": 4.993378055676614e-05, + "loss": 0.9258, + "step": 1105 + }, + { + "ce_ib": 7.2104668617248535, + "ce_orig": 1.1173107624053955, + "epoch": 0.1588785046728972, + "kl_loss": 0.10114659368991852, + "loss_ib": 0.008662563748657703, + "step": 1105 + }, + { + "ce_ib": 6.067015171051025, + "ce_orig": 0.8034613728523254, + "epoch": 0.15902228612508987, + "kl_loss": 0.11653576791286469, + "loss_ib": 0.008860296569764614, + "step": 1106 + }, + { + "ce_ib": 10.289580345153809, + "ce_orig": 1.5018421411514282, + "epoch": 0.15916606757728252, + "kl_loss": 0.11772806942462921, + "loss_ib": 0.0110311945900321, + "step": 1107 + }, + { + "ce_ib": 5.450833320617676, + "ce_orig": 0.7801606059074402, + "epoch": 0.1593098490294752, + "kl_loss": 0.05637833848595619, + "loss_ib": 0.005544333718717098, + "step": 1108 + }, + { + "ce_ib": 8.64270305633545, + "ce_orig": 1.049017071723938, + "epoch": 0.15945363048166786, + "kl_loss": 0.09192685782909393, + "loss_ib": 0.008917693980038166, + "step": 1109 + }, + { + "epoch": 0.15959741193386054, + "grad_norm": 0.1408187747001648, + "learning_rate": 4.9932361742473963e-05, + "loss": 0.9482, + "step": 1110 + }, + { + "ce_ib": 7.72374963760376, + "ce_orig": 0.8551508784294128, + "epoch": 0.15959741193386054, + "kl_loss": 0.07729790359735489, + "loss_ib": 0.007726769894361496, + "step": 1110 + }, + { + "ce_ib": 5.627543926239014, + "ce_orig": 0.5555012226104736, + "epoch": 0.1597411933860532, + "kl_loss": 0.057770341634750366, + "loss_ib": 0.0057022892870008945, + "step": 1111 + }, + { + "ce_ib": 8.985616683959961, + "ce_orig": 1.3651847839355469, + "epoch": 0.15988497483824587, + "kl_loss": 0.08362238109111786, + "loss_ib": 0.008673927746713161, + "step": 1112 + }, + { + "ce_ib": 5.870021343231201, + "ce_orig": 0.8901808261871338, + "epoch": 0.16002875629043853, + "kl_loss": 0.09243616461753845, + "loss_ib": 0.007556818425655365, + "step": 1113 + }, + { + "ce_ib": 3.3123323917388916, + "ce_orig": 0.35519129037857056, + "epoch": 0.1601725377426312, + "kl_loss": 0.14710363745689392, + "loss_ib": 0.009011347778141499, + "step": 1114 + }, + { + "epoch": 0.16031631919482386, + "grad_norm": 0.10004870593547821, + "learning_rate": 4.993092790973394e-05, + "loss": 0.818, + "step": 1115 + }, + { + "ce_ib": 6.521468162536621, + "ce_orig": 0.8767159581184387, + "epoch": 0.16031631919482386, + "kl_loss": 0.07688285410404205, + "loss_ib": 0.007104876916855574, + "step": 1115 + }, + { + "ce_ib": 5.883849143981934, + "ce_orig": 0.779852032661438, + "epoch": 0.16046010064701655, + "kl_loss": 0.0841967910528183, + "loss_ib": 0.007151763886213303, + "step": 1116 + }, + { + "ce_ib": 5.886414051055908, + "ce_orig": 0.8126896023750305, + "epoch": 0.1606038820992092, + "kl_loss": 0.05374331772327423, + "loss_ib": 0.005630373023450375, + "step": 1117 + }, + { + "ce_ib": 7.834891319274902, + "ce_orig": 1.0611050128936768, + "epoch": 0.16074766355140188, + "kl_loss": 0.08578921854496002, + "loss_ib": 0.008206906728446484, + "step": 1118 + }, + { + "ce_ib": 7.489284515380859, + "ce_orig": 1.0754971504211426, + "epoch": 0.16089144500359454, + "kl_loss": 0.09790926426649094, + "loss_ib": 0.008640105836093426, + "step": 1119 + }, + { + "epoch": 0.16103522645578722, + "grad_norm": 0.1413450539112091, + "learning_rate": 4.992947905940976e-05, + "loss": 0.8758, + "step": 1120 + }, + { + "ce_ib": 5.093112945556641, + "ce_orig": 0.5349186658859253, + "epoch": 0.16103522645578722, + "kl_loss": 0.07928375899791718, + "loss_ib": 0.006510744336992502, + "step": 1120 + }, + { + "ce_ib": 7.093912601470947, + "ce_orig": 0.5596253871917725, + "epoch": 0.16117900790797987, + "kl_loss": 0.12861564755439758, + "loss_ib": 0.009977738372981548, + "step": 1121 + }, + { + "ce_ib": 6.040073871612549, + "ce_orig": 0.9027337431907654, + "epoch": 0.16132278936017252, + "kl_loss": 0.0890701562166214, + "loss_ib": 0.007473544683307409, + "step": 1122 + }, + { + "ce_ib": 5.580602645874023, + "ce_orig": 0.6340703964233398, + "epoch": 0.1614665708123652, + "kl_loss": 0.08454744517803192, + "loss_ib": 0.007017673458904028, + "step": 1123 + }, + { + "ce_ib": 8.226970672607422, + "ce_orig": 1.0298020839691162, + "epoch": 0.16161035226455786, + "kl_loss": 0.09220044314861298, + "loss_ib": 0.008723507635295391, + "step": 1124 + }, + { + "epoch": 0.16175413371675054, + "grad_norm": 0.10399471968412399, + "learning_rate": 4.9928015192374154e-05, + "loss": 0.778, + "step": 1125 + }, + { + "ce_ib": 5.939601421356201, + "ce_orig": 0.7973865866661072, + "epoch": 0.16175413371675054, + "kl_loss": 0.08219926059246063, + "loss_ib": 0.007079763803631067, + "step": 1125 + }, + { + "ce_ib": 4.681643962860107, + "ce_orig": 0.5626142621040344, + "epoch": 0.1618979151689432, + "kl_loss": 0.07760648429393768, + "loss_ib": 0.006221146322786808, + "step": 1126 + }, + { + "ce_ib": 5.186760902404785, + "ce_orig": 0.7491214871406555, + "epoch": 0.16204169662113588, + "kl_loss": 0.07720279693603516, + "loss_ib": 0.006453520152717829, + "step": 1127 + }, + { + "ce_ib": 6.080879211425781, + "ce_orig": 0.6856286525726318, + "epoch": 0.16218547807332853, + "kl_loss": 0.08897431939840317, + "loss_ib": 0.007489155512303114, + "step": 1128 + }, + { + "ce_ib": 5.474771022796631, + "ce_orig": 0.83914715051651, + "epoch": 0.1623292595255212, + "kl_loss": 0.06586287915706635, + "loss_ib": 0.006030529271811247, + "step": 1129 + }, + { + "epoch": 0.16247304097771387, + "grad_norm": 0.11729539930820465, + "learning_rate": 4.992653630950893e-05, + "loss": 0.8157, + "step": 1130 + }, + { + "ce_ib": 6.560685634613037, + "ce_orig": 0.8619605302810669, + "epoch": 0.16247304097771387, + "kl_loss": 0.08206107467412949, + "loss_ib": 0.007383396383374929, + "step": 1130 + }, + { + "ce_ib": 5.883871555328369, + "ce_orig": 0.751981258392334, + "epoch": 0.16261682242990655, + "kl_loss": 0.0961388349533081, + "loss_ib": 0.007748877163976431, + "step": 1131 + }, + { + "ce_ib": 8.889966011047363, + "ce_orig": 1.2208887338638306, + "epoch": 0.1627606038820992, + "kl_loss": 0.07606694102287292, + "loss_ib": 0.00824833009392023, + "step": 1132 + }, + { + "ce_ib": 4.468330383300781, + "ce_orig": 0.5547013282775879, + "epoch": 0.16290438533429188, + "kl_loss": 0.05819021165370941, + "loss_ib": 0.0051436759531497955, + "step": 1133 + }, + { + "ce_ib": 6.565499782562256, + "ce_orig": 0.8970333337783813, + "epoch": 0.16304816678648454, + "kl_loss": 0.0689244270324707, + "loss_ib": 0.0067289709113538265, + "step": 1134 + }, + { + "epoch": 0.16319194823867722, + "grad_norm": 0.11034829914569855, + "learning_rate": 4.9925042411704906e-05, + "loss": 0.8462, + "step": 1135 + }, + { + "ce_ib": 5.932969093322754, + "ce_orig": 0.6410640478134155, + "epoch": 0.16319194823867722, + "kl_loss": 0.05918009579181671, + "loss_ib": 0.0059254891239106655, + "step": 1135 + }, + { + "ce_ib": 6.155702114105225, + "ce_orig": 1.011331558227539, + "epoch": 0.16333572969086987, + "kl_loss": 0.07035033404827118, + "loss_ib": 0.006595368031412363, + "step": 1136 + }, + { + "ce_ib": 4.861450672149658, + "ce_orig": 0.6031864285469055, + "epoch": 0.16347951114306256, + "kl_loss": 0.10412517189979553, + "loss_ib": 0.0076369838789105415, + "step": 1137 + }, + { + "ce_ib": 5.104621887207031, + "ce_orig": 0.7848092913627625, + "epoch": 0.1636232925952552, + "kl_loss": 0.05974641069769859, + "loss_ib": 0.005539631471037865, + "step": 1138 + }, + { + "ce_ib": 5.568374156951904, + "ce_orig": 0.6854334473609924, + "epoch": 0.1637670740474479, + "kl_loss": 0.06520682573318481, + "loss_ib": 0.0060445284470915794, + "step": 1139 + }, + { + "epoch": 0.16391085549964055, + "grad_norm": 0.12411113828420639, + "learning_rate": 4.992353349986195e-05, + "loss": 0.8369, + "step": 1140 + }, + { + "ce_ib": 5.672303199768066, + "ce_orig": 0.6218313574790955, + "epoch": 0.16391085549964055, + "kl_loss": 0.07955818623304367, + "loss_ib": 0.00681406119838357, + "step": 1140 + }, + { + "ce_ib": 8.039485931396484, + "ce_orig": 0.8602107167243958, + "epoch": 0.16405463695183323, + "kl_loss": 0.14670434594154358, + "loss_ib": 0.011354960501194, + "step": 1141 + }, + { + "ce_ib": 7.664279937744141, + "ce_orig": 1.1456722021102905, + "epoch": 0.16419841840402588, + "kl_loss": 0.10382938385009766, + "loss_ib": 0.009023609571158886, + "step": 1142 + }, + { + "ce_ib": 4.566431999206543, + "ce_orig": 0.6573471426963806, + "epoch": 0.16434219985621853, + "kl_loss": 0.0659567192196846, + "loss_ib": 0.005581052042543888, + "step": 1143 + }, + { + "ce_ib": 7.095448970794678, + "ce_orig": 0.5521078705787659, + "epoch": 0.16448598130841122, + "kl_loss": 0.1796271800994873, + "loss_ib": 0.012529084458947182, + "step": 1144 + }, + { + "epoch": 0.16462976276060387, + "grad_norm": 0.10555960237979889, + "learning_rate": 4.992200957488899e-05, + "loss": 0.8855, + "step": 1145 + }, + { + "ce_ib": 6.304956912994385, + "ce_orig": 0.9338970184326172, + "epoch": 0.16462976276060387, + "kl_loss": 0.05889878794550896, + "loss_ib": 0.0060974182561039925, + "step": 1145 + }, + { + "ce_ib": 7.517076015472412, + "ce_orig": 1.3388317823410034, + "epoch": 0.16477354421279655, + "kl_loss": 0.07668067514896393, + "loss_ib": 0.007592571433633566, + "step": 1146 + }, + { + "ce_ib": 5.553647041320801, + "ce_orig": 0.7843688726425171, + "epoch": 0.1649173256649892, + "kl_loss": 0.0665152296423912, + "loss_ib": 0.006102585233747959, + "step": 1147 + }, + { + "ce_ib": 4.305169105529785, + "ce_orig": 0.6364474296569824, + "epoch": 0.1650611071171819, + "kl_loss": 0.07642898708581924, + "loss_ib": 0.00597403384745121, + "step": 1148 + }, + { + "ce_ib": 6.075594902038574, + "ce_orig": 0.7118659019470215, + "epoch": 0.16520488856937454, + "kl_loss": 0.08226151764392853, + "loss_ib": 0.00715087354183197, + "step": 1149 + }, + { + "epoch": 0.16534867002156722, + "grad_norm": 0.10492754727602005, + "learning_rate": 4.9920470637703996e-05, + "loss": 0.8326, + "step": 1150 + }, + { + "ce_ib": 5.802603721618652, + "ce_orig": 0.7407565712928772, + "epoch": 0.16534867002156722, + "kl_loss": 0.0757661759853363, + "loss_ib": 0.0066896104253828526, + "step": 1150 + }, + { + "ce_ib": 6.076198101043701, + "ce_orig": 0.8765375018119812, + "epoch": 0.16549245147375988, + "kl_loss": 0.10455723851919174, + "loss_ib": 0.008265960961580276, + "step": 1151 + }, + { + "ce_ib": 6.5738115310668945, + "ce_orig": 0.8711956739425659, + "epoch": 0.16563623292595256, + "kl_loss": 0.08836830407381058, + "loss_ib": 0.007705320604145527, + "step": 1152 + }, + { + "ce_ib": 3.4032890796661377, + "ce_orig": 0.3757346570491791, + "epoch": 0.1657800143781452, + "kl_loss": 0.09860767424106598, + "loss_ib": 0.006632028613239527, + "step": 1153 + }, + { + "ce_ib": 3.8292911052703857, + "ce_orig": 0.6358639597892761, + "epoch": 0.1659237958303379, + "kl_loss": 0.08545638620853424, + "loss_ib": 0.006187465041875839, + "step": 1154 + }, + { + "epoch": 0.16606757728253055, + "grad_norm": 0.13825102150440216, + "learning_rate": 4.9918916689233966e-05, + "loss": 0.8312, + "step": 1155 + }, + { + "ce_ib": 5.534228324890137, + "ce_orig": 0.6804826259613037, + "epoch": 0.16606757728253055, + "kl_loss": 0.06395240128040314, + "loss_ib": 0.0059647345915436745, + "step": 1155 + }, + { + "ce_ib": 6.463276386260986, + "ce_orig": 1.0032761096954346, + "epoch": 0.16621135873472323, + "kl_loss": 0.07692591845989227, + "loss_ib": 0.007077933754771948, + "step": 1156 + }, + { + "ce_ib": 5.867524147033691, + "ce_orig": 0.6822445392608643, + "epoch": 0.16635514018691588, + "kl_loss": 0.07831412553787231, + "loss_ib": 0.006849468685686588, + "step": 1157 + }, + { + "ce_ib": 7.46630859375, + "ce_orig": 1.2471692562103271, + "epoch": 0.16649892163910857, + "kl_loss": 0.08687464147806168, + "loss_ib": 0.008076886646449566, + "step": 1158 + }, + { + "ce_ib": 8.434159278869629, + "ce_orig": 1.1131393909454346, + "epoch": 0.16664270309130122, + "kl_loss": 0.10605284571647644, + "loss_ib": 0.009519721381366253, + "step": 1159 + }, + { + "epoch": 0.1667864845434939, + "grad_norm": 0.1169283539056778, + "learning_rate": 4.9917347730414955e-05, + "loss": 0.9089, + "step": 1160 + }, + { + "ce_ib": 6.45564079284668, + "ce_orig": 0.8749749660491943, + "epoch": 0.1667864845434939, + "kl_loss": 0.08321629464626312, + "loss_ib": 0.0073886350728571415, + "step": 1160 + }, + { + "ce_ib": 7.384956359863281, + "ce_orig": 0.655971348285675, + "epoch": 0.16693026599568656, + "kl_loss": 0.12570273876190186, + "loss_ib": 0.009977615438401699, + "step": 1161 + }, + { + "ce_ib": 6.389439582824707, + "ce_orig": 0.9102007150650024, + "epoch": 0.16707404744787924, + "kl_loss": 0.07830990850925446, + "loss_ib": 0.00711021525785327, + "step": 1162 + }, + { + "ce_ib": 5.798282623291016, + "ce_orig": 1.0349807739257812, + "epoch": 0.1672178289000719, + "kl_loss": 0.06607086956501007, + "loss_ib": 0.006202684715390205, + "step": 1163 + }, + { + "ce_ib": 4.520916938781738, + "ce_orig": 0.7277075052261353, + "epoch": 0.16736161035226454, + "kl_loss": 0.07415127754211426, + "loss_ib": 0.005968022625893354, + "step": 1164 + }, + { + "epoch": 0.16750539180445723, + "grad_norm": 0.12926308810710907, + "learning_rate": 4.991576376219203e-05, + "loss": 0.9313, + "step": 1165 + }, + { + "ce_ib": 5.858853340148926, + "ce_orig": 0.7173748016357422, + "epoch": 0.16750539180445723, + "kl_loss": 0.057576555758714676, + "loss_ib": 0.005808254238218069, + "step": 1165 + }, + { + "ce_ib": 4.105903625488281, + "ce_orig": 0.5997424125671387, + "epoch": 0.16764917325664988, + "kl_loss": 0.06417367607355118, + "loss_ib": 0.00526163587346673, + "step": 1166 + }, + { + "ce_ib": 4.893276691436768, + "ce_orig": 0.6570137739181519, + "epoch": 0.16779295470884256, + "kl_loss": 0.09996452927589417, + "loss_ib": 0.007444864604622126, + "step": 1167 + }, + { + "ce_ib": 6.648251533508301, + "ce_orig": 0.9961689710617065, + "epoch": 0.16793673616103522, + "kl_loss": 0.07535562664270401, + "loss_ib": 0.007091906853020191, + "step": 1168 + }, + { + "ce_ib": 5.548269271850586, + "ce_orig": 0.8265610337257385, + "epoch": 0.1680805176132279, + "kl_loss": 0.08578751981258392, + "loss_ib": 0.0070635112933814526, + "step": 1169 + }, + { + "epoch": 0.16822429906542055, + "grad_norm": 0.11066246032714844, + "learning_rate": 4.991416478551936e-05, + "loss": 0.8065, + "step": 1170 + }, + { + "ce_ib": 4.687781810760498, + "ce_orig": 0.6614780426025391, + "epoch": 0.16822429906542055, + "kl_loss": 0.10225407779216766, + "loss_ib": 0.007456594612449408, + "step": 1170 + }, + { + "ce_ib": 6.558977127075195, + "ce_orig": 0.9530431032180786, + "epoch": 0.16836808051761323, + "kl_loss": 0.08521037548780441, + "loss_ib": 0.007540007121860981, + "step": 1171 + }, + { + "ce_ib": 5.19743537902832, + "ce_orig": 0.6982885003089905, + "epoch": 0.1685118619698059, + "kl_loss": 0.07881344854831696, + "loss_ib": 0.006539390422403812, + "step": 1172 + }, + { + "ce_ib": 5.995883941650391, + "ce_orig": 0.7644237875938416, + "epoch": 0.16865564342199857, + "kl_loss": 0.04498795047402382, + "loss_ib": 0.005247339606285095, + "step": 1173 + }, + { + "ce_ib": 5.333321571350098, + "ce_orig": 0.6872938871383667, + "epoch": 0.16879942487419122, + "kl_loss": 0.059417806565761566, + "loss_ib": 0.005637551192194223, + "step": 1174 + }, + { + "epoch": 0.1689432063263839, + "grad_norm": 0.12378540635108948, + "learning_rate": 4.9912550801360094e-05, + "loss": 0.7657, + "step": 1175 + }, + { + "ce_ib": 8.82595443725586, + "ce_orig": 1.3515900373458862, + "epoch": 0.1689432063263839, + "kl_loss": 0.09049460291862488, + "loss_ib": 0.008937708102166653, + "step": 1175 + }, + { + "ce_ib": 6.9534173011779785, + "ce_orig": 1.021569848060608, + "epoch": 0.16908698777857656, + "kl_loss": 0.06033141538500786, + "loss_ib": 0.006493279244750738, + "step": 1176 + }, + { + "ce_ib": 5.025914669036865, + "ce_orig": 0.9376049637794495, + "epoch": 0.16923076923076924, + "kl_loss": 0.05850953981280327, + "loss_ib": 0.005438434425741434, + "step": 1177 + }, + { + "ce_ib": 5.084593296051025, + "ce_orig": 0.8552833199501038, + "epoch": 0.1693745506829619, + "kl_loss": 0.05605684593319893, + "loss_ib": 0.0053451391868293285, + "step": 1178 + }, + { + "ce_ib": 5.717614650726318, + "ce_orig": 0.8763951659202576, + "epoch": 0.16951833213515458, + "kl_loss": 0.08483847975730896, + "loss_ib": 0.007100731134414673, + "step": 1179 + }, + { + "epoch": 0.16966211358734723, + "grad_norm": 0.12536191940307617, + "learning_rate": 4.991092181068644e-05, + "loss": 0.9222, + "step": 1180 + }, + { + "ce_ib": 5.29203987121582, + "ce_orig": 0.7156727313995361, + "epoch": 0.16966211358734723, + "kl_loss": 0.08051944524049759, + "loss_ib": 0.006671992130577564, + "step": 1180 + }, + { + "ce_ib": 4.629685401916504, + "ce_orig": 0.8237522840499878, + "epoch": 0.1698058950395399, + "kl_loss": 0.06042894721031189, + "loss_ib": 0.005336289759725332, + "step": 1181 + }, + { + "ce_ib": 6.4098005294799805, + "ce_orig": 1.1176815032958984, + "epoch": 0.16994967649173257, + "kl_loss": 0.0590413399040699, + "loss_ib": 0.006156967021524906, + "step": 1182 + }, + { + "ce_ib": 6.5210089683532715, + "ce_orig": 1.03764808177948, + "epoch": 0.17009345794392525, + "kl_loss": 0.06617483496665955, + "loss_ib": 0.006569246295839548, + "step": 1183 + }, + { + "ce_ib": 5.111931800842285, + "ce_orig": 0.9622370004653931, + "epoch": 0.1702372393961179, + "kl_loss": 0.06135128438472748, + "loss_ib": 0.005623530130833387, + "step": 1184 + }, + { + "epoch": 0.17038102084831055, + "grad_norm": 0.13301178812980652, + "learning_rate": 4.990927781447965e-05, + "loss": 0.795, + "step": 1185 + }, + { + "ce_ib": 5.9920573234558105, + "ce_orig": 0.9049957990646362, + "epoch": 0.17038102084831055, + "kl_loss": 0.07264045625925064, + "loss_ib": 0.006628051400184631, + "step": 1185 + }, + { + "ce_ib": 6.230532646179199, + "ce_orig": 1.0566587448120117, + "epoch": 0.17052480230050324, + "kl_loss": 0.07908719033002853, + "loss_ib": 0.0070696258917450905, + "step": 1186 + }, + { + "ce_ib": 5.547403335571289, + "ce_orig": 0.7412517666816711, + "epoch": 0.1706685837526959, + "kl_loss": 0.06213938817381859, + "loss_ib": 0.005880671087652445, + "step": 1187 + }, + { + "ce_ib": 6.3378777503967285, + "ce_orig": 0.9786301851272583, + "epoch": 0.17081236520488857, + "kl_loss": 0.052673254162073135, + "loss_ib": 0.00580260157585144, + "step": 1188 + }, + { + "ce_ib": 5.9338483810424805, + "ce_orig": 0.5440419912338257, + "epoch": 0.17095614665708123, + "kl_loss": 0.0711212083697319, + "loss_ib": 0.0065229847095906734, + "step": 1189 + }, + { + "epoch": 0.1710999281092739, + "grad_norm": 0.10571207106113434, + "learning_rate": 4.990761881373003e-05, + "loss": 0.8404, + "step": 1190 + }, + { + "ce_ib": 4.170894622802734, + "ce_orig": 0.5145422220230103, + "epoch": 0.1710999281092739, + "kl_loss": 0.11988720297813416, + "loss_ib": 0.00807980727404356, + "step": 1190 + }, + { + "ce_ib": 5.607202529907227, + "ce_orig": 0.5342667698860168, + "epoch": 0.17124370956146656, + "kl_loss": 0.10903531312942505, + "loss_ib": 0.008255367167294025, + "step": 1191 + }, + { + "ce_ib": 7.607751846313477, + "ce_orig": 1.0716842412948608, + "epoch": 0.17138749101365924, + "kl_loss": 0.07368045300245285, + "loss_ib": 0.007487898226827383, + "step": 1192 + }, + { + "ce_ib": 4.4001359939575195, + "ce_orig": 0.5072839856147766, + "epoch": 0.1715312724658519, + "kl_loss": 0.09861178696155548, + "loss_ib": 0.007130657322704792, + "step": 1193 + }, + { + "ce_ib": 6.831094741821289, + "ce_orig": 0.7226266860961914, + "epoch": 0.17167505391804458, + "kl_loss": 0.09814979135990143, + "loss_ib": 0.008323037065565586, + "step": 1194 + }, + { + "epoch": 0.17181883537023723, + "grad_norm": 0.13412193953990936, + "learning_rate": 4.990594480943689e-05, + "loss": 0.8231, + "step": 1195 + }, + { + "ce_ib": 5.302410125732422, + "ce_orig": 0.8520780801773071, + "epoch": 0.17181883537023723, + "kl_loss": 0.10553916543722153, + "loss_ib": 0.007928162813186646, + "step": 1195 + }, + { + "ce_ib": 7.1728105545043945, + "ce_orig": 1.304303765296936, + "epoch": 0.17196261682242991, + "kl_loss": 0.08810941874980927, + "loss_ib": 0.007991875521838665, + "step": 1196 + }, + { + "ce_ib": 4.987504959106445, + "ce_orig": 0.7122842073440552, + "epoch": 0.17210639827462257, + "kl_loss": 0.07356394827365875, + "loss_ib": 0.00617195013910532, + "step": 1197 + }, + { + "ce_ib": 5.243117809295654, + "ce_orig": 0.9049599170684814, + "epoch": 0.17225017972681525, + "kl_loss": 0.06261973083019257, + "loss_ib": 0.005752545781433582, + "step": 1198 + }, + { + "ce_ib": 6.555300712585449, + "ce_orig": 1.0679662227630615, + "epoch": 0.1723939611790079, + "kl_loss": 0.07326988875865936, + "loss_ib": 0.0069411443546414375, + "step": 1199 + }, + { + "epoch": 0.17253774263120059, + "grad_norm": 0.120902881026268, + "learning_rate": 4.990425580260861e-05, + "loss": 0.785, + "step": 1200 + }, + { + "ce_ib": 6.999645709991455, + "ce_orig": 1.1089929342269897, + "epoch": 0.17253774263120059, + "kl_loss": 0.0474993959069252, + "loss_ib": 0.005874792579561472, + "step": 1200 + }, + { + "ce_ib": 3.891652822494507, + "ce_orig": 0.3429041802883148, + "epoch": 0.17268152408339324, + "kl_loss": 0.1296495497226715, + "loss_ib": 0.008428304456174374, + "step": 1201 + }, + { + "ce_ib": 6.550596237182617, + "ce_orig": 1.068969488143921, + "epoch": 0.17282530553558592, + "kl_loss": 0.07385049760341644, + "loss_ib": 0.00696782348677516, + "step": 1202 + }, + { + "ce_ib": 4.990350246429443, + "ce_orig": 0.7823203206062317, + "epoch": 0.17296908698777858, + "kl_loss": 0.080950066447258, + "loss_ib": 0.006542678456753492, + "step": 1203 + }, + { + "ce_ib": 6.351553916931152, + "ce_orig": 0.5572970509529114, + "epoch": 0.17311286843997126, + "kl_loss": 0.08578500151634216, + "loss_ib": 0.007465027272701263, + "step": 1204 + }, + { + "epoch": 0.1732566498921639, + "grad_norm": 0.11740091443061829, + "learning_rate": 4.990255179426259e-05, + "loss": 0.7717, + "step": 1205 + }, + { + "ce_ib": 4.087786674499512, + "ce_orig": 0.6453249454498291, + "epoch": 0.1732566498921639, + "kl_loss": 0.05839306116104126, + "loss_ib": 0.004963546525686979, + "step": 1205 + }, + { + "ce_ib": 5.859645366668701, + "ce_orig": 0.819754958152771, + "epoch": 0.17340043134435656, + "kl_loss": 0.07804655283689499, + "loss_ib": 0.006832150276750326, + "step": 1206 + }, + { + "ce_ib": 8.538081169128418, + "ce_orig": 1.053422212600708, + "epoch": 0.17354421279654925, + "kl_loss": 0.08176679909229279, + "loss_ib": 0.008357380516827106, + "step": 1207 + }, + { + "ce_ib": 8.998468399047852, + "ce_orig": 1.3600265979766846, + "epoch": 0.1736879942487419, + "kl_loss": 0.11819840967655182, + "loss_ib": 0.010409154929220676, + "step": 1208 + }, + { + "ce_ib": 5.05460262298584, + "ce_orig": 0.6827840805053711, + "epoch": 0.17383177570093458, + "kl_loss": 0.0864296555519104, + "loss_ib": 0.006848783697932959, + "step": 1209 + }, + { + "epoch": 0.17397555715312724, + "grad_norm": 0.11643915623426437, + "learning_rate": 4.9900832785425255e-05, + "loss": 0.9545, + "step": 1210 + }, + { + "ce_ib": 6.801517963409424, + "ce_orig": 0.9832918047904968, + "epoch": 0.17397555715312724, + "kl_loss": 0.07543274015188217, + "loss_ib": 0.007172395940870047, + "step": 1210 + }, + { + "ce_ib": 6.775679588317871, + "ce_orig": 1.140964150428772, + "epoch": 0.17411933860531992, + "kl_loss": 0.087261863052845, + "loss_ib": 0.0077509330585598946, + "step": 1211 + }, + { + "ce_ib": 7.438926696777344, + "ce_orig": 1.3403379917144775, + "epoch": 0.17426312005751257, + "kl_loss": 0.0789506807923317, + "loss_ib": 0.0076669976115226746, + "step": 1212 + }, + { + "ce_ib": 5.44244909286499, + "ce_orig": 1.1251771450042725, + "epoch": 0.17440690150970525, + "kl_loss": 0.06914874166250229, + "loss_ib": 0.006178661715239286, + "step": 1213 + }, + { + "ce_ib": 6.185060977935791, + "ce_orig": 0.8939319252967834, + "epoch": 0.1745506829618979, + "kl_loss": 0.09630735218524933, + "loss_ib": 0.007907898165285587, + "step": 1214 + }, + { + "epoch": 0.1746944644140906, + "grad_norm": 0.10256122052669525, + "learning_rate": 4.98990987771321e-05, + "loss": 0.9393, + "step": 1215 + }, + { + "ce_ib": 5.719100475311279, + "ce_orig": 0.7967482209205627, + "epoch": 0.1746944644140906, + "kl_loss": 0.09343187510967255, + "loss_ib": 0.0075311437249183655, + "step": 1215 + }, + { + "ce_ib": 8.084674835205078, + "ce_orig": 0.9587056040763855, + "epoch": 0.17483824586628324, + "kl_loss": 0.15284928679466248, + "loss_ib": 0.011684802360832691, + "step": 1216 + }, + { + "ce_ib": 3.7043116092681885, + "ce_orig": 0.7526902556419373, + "epoch": 0.17498202731847592, + "kl_loss": 0.0783972516655922, + "loss_ib": 0.00577201833948493, + "step": 1217 + }, + { + "ce_ib": 9.836902618408203, + "ce_orig": 1.3606029748916626, + "epoch": 0.17512580877066858, + "kl_loss": 0.15290556848049164, + "loss_ib": 0.012563729658722878, + "step": 1218 + }, + { + "ce_ib": 6.844699859619141, + "ce_orig": 0.8876489400863647, + "epoch": 0.17526959022286126, + "kl_loss": 0.08287541568279266, + "loss_ib": 0.007566120475530624, + "step": 1219 + }, + { + "epoch": 0.17541337167505391, + "grad_norm": 0.13088709115982056, + "learning_rate": 4.9897349770427635e-05, + "loss": 0.9004, + "step": 1220 + }, + { + "ce_ib": 6.765967845916748, + "ce_orig": 1.0248358249664307, + "epoch": 0.17541337167505391, + "kl_loss": 0.06924465298652649, + "loss_ib": 0.006845216732472181, + "step": 1220 + }, + { + "ce_ib": 5.917281627655029, + "ce_orig": 0.820586621761322, + "epoch": 0.1755571531272466, + "kl_loss": 0.0708693414926529, + "loss_ib": 0.006502108182758093, + "step": 1221 + }, + { + "ce_ib": 6.330194473266602, + "ce_orig": 1.0256482362747192, + "epoch": 0.17570093457943925, + "kl_loss": 0.06842375546693802, + "loss_ib": 0.006586284842342138, + "step": 1222 + }, + { + "ce_ib": 5.407456398010254, + "ce_orig": 0.7433579564094543, + "epoch": 0.17584471603163193, + "kl_loss": 0.08904191106557846, + "loss_ib": 0.007155823986977339, + "step": 1223 + }, + { + "ce_ib": 5.031669616699219, + "ce_orig": 0.7584470510482788, + "epoch": 0.17598849748382459, + "kl_loss": 0.06316515803337097, + "loss_ib": 0.005674092564731836, + "step": 1224 + }, + { + "epoch": 0.17613227893601727, + "grad_norm": 0.1254931092262268, + "learning_rate": 4.989558576636538e-05, + "loss": 0.8734, + "step": 1225 + }, + { + "ce_ib": 4.3486480712890625, + "ce_orig": 0.674187421798706, + "epoch": 0.17613227893601727, + "kl_loss": 0.0859442949295044, + "loss_ib": 0.006471538450568914, + "step": 1225 + }, + { + "ce_ib": 6.384706974029541, + "ce_orig": 0.987047553062439, + "epoch": 0.17627606038820992, + "kl_loss": 0.07326386868953705, + "loss_ib": 0.006855546962469816, + "step": 1226 + }, + { + "ce_ib": 4.738792896270752, + "ce_orig": 0.6714680790901184, + "epoch": 0.17641984184040257, + "kl_loss": 0.11897613853216171, + "loss_ib": 0.008318203501403332, + "step": 1227 + }, + { + "ce_ib": 4.5174360275268555, + "ce_orig": 0.5024546384811401, + "epoch": 0.17656362329259526, + "kl_loss": 0.07360441237688065, + "loss_ib": 0.005938938818871975, + "step": 1228 + }, + { + "ce_ib": 4.433870792388916, + "ce_orig": 0.562360405921936, + "epoch": 0.1767074047447879, + "kl_loss": 0.059870630502700806, + "loss_ib": 0.005210466682910919, + "step": 1229 + }, + { + "epoch": 0.1768511861969806, + "grad_norm": 0.13647156953811646, + "learning_rate": 4.989380676600795e-05, + "loss": 0.7922, + "step": 1230 + }, + { + "ce_ib": 6.828713417053223, + "ce_orig": 0.899097740650177, + "epoch": 0.1768511861969806, + "kl_loss": 0.06694717705249786, + "loss_ib": 0.0067617157474160194, + "step": 1230 + }, + { + "ce_ib": 4.22491979598999, + "ce_orig": 0.616287350654602, + "epoch": 0.17699496764917325, + "kl_loss": 0.06532734632492065, + "loss_ib": 0.00537882698699832, + "step": 1231 + }, + { + "ce_ib": 6.296413898468018, + "ce_orig": 0.7972742319107056, + "epoch": 0.17713874910136593, + "kl_loss": 0.1285838782787323, + "loss_ib": 0.009577400982379913, + "step": 1232 + }, + { + "ce_ib": 4.380596160888672, + "ce_orig": 0.6534774899482727, + "epoch": 0.17728253055355858, + "kl_loss": 0.07830242812633514, + "loss_ib": 0.006105419248342514, + "step": 1233 + }, + { + "ce_ib": 4.298580169677734, + "ce_orig": 0.5774059891700745, + "epoch": 0.17742631200575126, + "kl_loss": 0.0666581392288208, + "loss_ib": 0.005482196807861328, + "step": 1234 + }, + { + "epoch": 0.17757009345794392, + "grad_norm": 0.10755088925361633, + "learning_rate": 4.9892012770426924e-05, + "loss": 0.777, + "step": 1235 + }, + { + "ce_ib": 6.432723522186279, + "ce_orig": 1.0068868398666382, + "epoch": 0.17757009345794392, + "kl_loss": 0.07314655929803848, + "loss_ib": 0.006873689591884613, + "step": 1235 + }, + { + "ce_ib": 6.71904182434082, + "ce_orig": 1.2869292497634888, + "epoch": 0.1777138749101366, + "kl_loss": 0.06186103820800781, + "loss_ib": 0.0064525725319981575, + "step": 1236 + }, + { + "ce_ib": 5.607357025146484, + "ce_orig": 0.8650424480438232, + "epoch": 0.17785765636232925, + "kl_loss": 0.07406433671712875, + "loss_ib": 0.0065068951807916164, + "step": 1237 + }, + { + "ce_ib": 5.314675331115723, + "ce_orig": 0.5908254384994507, + "epoch": 0.17800143781452193, + "kl_loss": 0.07040902227163315, + "loss_ib": 0.006177789065986872, + "step": 1238 + }, + { + "ce_ib": 5.304821014404297, + "ce_orig": 0.9293766617774963, + "epoch": 0.1781452192667146, + "kl_loss": 0.06702286005020142, + "loss_ib": 0.0060035535134375095, + "step": 1239 + }, + { + "epoch": 0.17828900071890727, + "grad_norm": 0.1296769231557846, + "learning_rate": 4.989020378070297e-05, + "loss": 0.8665, + "step": 1240 + }, + { + "ce_ib": 3.641101360321045, + "ce_orig": 0.5473067164421082, + "epoch": 0.17828900071890727, + "kl_loss": 0.06358035653829575, + "loss_ib": 0.004999568220227957, + "step": 1240 + }, + { + "ce_ib": 7.719653129577637, + "ce_orig": 1.387372612953186, + "epoch": 0.17843278217109992, + "kl_loss": 0.07892841845750809, + "loss_ib": 0.007806247565895319, + "step": 1241 + }, + { + "ce_ib": 6.006373405456543, + "ce_orig": 0.8500123620033264, + "epoch": 0.1785765636232926, + "kl_loss": 0.0664682537317276, + "loss_ib": 0.006326599512249231, + "step": 1242 + }, + { + "ce_ib": 7.288457870483398, + "ce_orig": 1.2919074296951294, + "epoch": 0.17872034507548526, + "kl_loss": 0.08672478795051575, + "loss_ib": 0.007980468682944775, + "step": 1243 + }, + { + "ce_ib": 5.790271759033203, + "ce_orig": 0.6914542317390442, + "epoch": 0.17886412652767794, + "kl_loss": 0.07108654826879501, + "loss_ib": 0.006449463311582804, + "step": 1244 + }, + { + "epoch": 0.1790079079798706, + "grad_norm": 0.1391477733850479, + "learning_rate": 4.9888379797925756e-05, + "loss": 0.8775, + "step": 1245 + }, + { + "ce_ib": 6.970901966094971, + "ce_orig": 0.9524872899055481, + "epoch": 0.1790079079798706, + "kl_loss": 0.06376741081476212, + "loss_ib": 0.006673821713775396, + "step": 1245 + }, + { + "ce_ib": 5.183405876159668, + "ce_orig": 0.7919313907623291, + "epoch": 0.17915168943206328, + "kl_loss": 0.08737723529338837, + "loss_ib": 0.006960564758628607, + "step": 1246 + }, + { + "ce_ib": 5.038588047027588, + "ce_orig": 0.702634334564209, + "epoch": 0.17929547088425593, + "kl_loss": 0.06210719794034958, + "loss_ib": 0.005624653771519661, + "step": 1247 + }, + { + "ce_ib": 5.748839378356934, + "ce_orig": 0.9377890825271606, + "epoch": 0.17943925233644858, + "kl_loss": 0.0736536979675293, + "loss_ib": 0.006557104643434286, + "step": 1248 + }, + { + "ce_ib": 7.30675745010376, + "ce_orig": 1.1550698280334473, + "epoch": 0.17958303378864127, + "kl_loss": 0.07683317363262177, + "loss_ib": 0.007495037745684385, + "step": 1249 + }, + { + "epoch": 0.17972681524083392, + "grad_norm": 0.1211005225777626, + "learning_rate": 4.9886540823193984e-05, + "loss": 0.9011, + "step": 1250 + }, + { + "ce_ib": 3.9555113315582275, + "ce_orig": 0.6502870917320251, + "epoch": 0.17972681524083392, + "kl_loss": 0.07374510914087296, + "loss_ib": 0.005665011238306761, + "step": 1250 + }, + { + "ce_ib": 4.92016077041626, + "ce_orig": 0.7719355225563049, + "epoch": 0.1798705966930266, + "kl_loss": 0.08895428478717804, + "loss_ib": 0.006907794624567032, + "step": 1251 + }, + { + "ce_ib": 8.089271545410156, + "ce_orig": 1.3001760244369507, + "epoch": 0.18001437814521926, + "kl_loss": 0.08735226094722748, + "loss_ib": 0.008412248454988003, + "step": 1252 + }, + { + "ce_ib": 4.87213659286499, + "ce_orig": 0.5505903959274292, + "epoch": 0.18015815959741194, + "kl_loss": 0.07251042127609253, + "loss_ib": 0.00606158934533596, + "step": 1253 + }, + { + "ce_ib": 5.427359580993652, + "ce_orig": 0.8354029655456543, + "epoch": 0.1803019410496046, + "kl_loss": 0.07384747266769409, + "loss_ib": 0.006406053435057402, + "step": 1254 + }, + { + "epoch": 0.18044572250179727, + "grad_norm": 0.1342553049325943, + "learning_rate": 4.9884686857615406e-05, + "loss": 0.8675, + "step": 1255 + }, + { + "ce_ib": 5.144403457641602, + "ce_orig": 0.7595053315162659, + "epoch": 0.18044572250179727, + "kl_loss": 0.06468242406845093, + "loss_ib": 0.005806323140859604, + "step": 1255 + }, + { + "ce_ib": 6.6486945152282715, + "ce_orig": 1.1814298629760742, + "epoch": 0.18058950395398993, + "kl_loss": 0.07554161548614502, + "loss_ib": 0.00710142869502306, + "step": 1256 + }, + { + "ce_ib": 7.030621528625488, + "ce_orig": 1.0686991214752197, + "epoch": 0.1807332854061826, + "kl_loss": 0.10883104801177979, + "loss_ib": 0.008956863544881344, + "step": 1257 + }, + { + "ce_ib": 4.138352394104004, + "ce_orig": 0.6883175373077393, + "epoch": 0.18087706685837526, + "kl_loss": 0.056147072464227676, + "loss_ib": 0.004876529797911644, + "step": 1258 + }, + { + "ce_ib": 8.369958877563477, + "ce_orig": 1.3614051342010498, + "epoch": 0.18102084831056794, + "kl_loss": 0.09674566984176636, + "loss_ib": 0.009022262878715992, + "step": 1259 + }, + { + "epoch": 0.1811646297627606, + "grad_norm": 0.12647263705730438, + "learning_rate": 4.9882817902306776e-05, + "loss": 0.9193, + "step": 1260 + }, + { + "ce_ib": 6.464113712310791, + "ce_orig": 0.9669797420501709, + "epoch": 0.1811646297627606, + "kl_loss": 0.08194872736930847, + "loss_ib": 0.007329493295401335, + "step": 1260 + }, + { + "ce_ib": 6.782562732696533, + "ce_orig": 1.2130578756332397, + "epoch": 0.18130841121495328, + "kl_loss": 0.06134729087352753, + "loss_ib": 0.006458646152168512, + "step": 1261 + }, + { + "ce_ib": 5.289116382598877, + "ce_orig": 0.7766171097755432, + "epoch": 0.18145219266714593, + "kl_loss": 0.07611186802387238, + "loss_ib": 0.0064501515589654446, + "step": 1262 + }, + { + "ce_ib": 3.832165002822876, + "ce_orig": 0.5674840807914734, + "epoch": 0.18159597411933862, + "kl_loss": 0.05480628088116646, + "loss_ib": 0.0046563963405787945, + "step": 1263 + }, + { + "ce_ib": 7.87674617767334, + "ce_orig": 1.2013108730316162, + "epoch": 0.18173975557153127, + "kl_loss": 0.061254192143678665, + "loss_ib": 0.007001082878559828, + "step": 1264 + }, + { + "epoch": 0.18188353702372395, + "grad_norm": 0.1493852436542511, + "learning_rate": 4.988093395839391e-05, + "loss": 0.8975, + "step": 1265 + }, + { + "ce_ib": 8.470536231994629, + "ce_orig": 1.435308814048767, + "epoch": 0.18188353702372395, + "kl_loss": 0.07152620702981949, + "loss_ib": 0.007811578456312418, + "step": 1265 + }, + { + "ce_ib": 5.668684482574463, + "ce_orig": 1.0418508052825928, + "epoch": 0.1820273184759166, + "kl_loss": 0.05794573947787285, + "loss_ib": 0.005731629207730293, + "step": 1266 + }, + { + "ce_ib": 5.763350486755371, + "ce_orig": 0.9967442154884338, + "epoch": 0.1821710999281093, + "kl_loss": 0.07836513221263885, + "loss_ib": 0.006799932103604078, + "step": 1267 + }, + { + "ce_ib": 5.015903949737549, + "ce_orig": 0.8127515912055969, + "epoch": 0.18231488138030194, + "kl_loss": 0.07750500738620758, + "loss_ib": 0.006383202504366636, + "step": 1268 + }, + { + "ce_ib": 5.03570556640625, + "ce_orig": 0.7906293869018555, + "epoch": 0.1824586628324946, + "kl_loss": 0.07191342115402222, + "loss_ib": 0.006113524083048105, + "step": 1269 + }, + { + "epoch": 0.18260244428468728, + "grad_norm": 0.1055343821644783, + "learning_rate": 4.987903502701162e-05, + "loss": 0.8825, + "step": 1270 + }, + { + "ce_ib": 6.143768310546875, + "ce_orig": 1.0385949611663818, + "epoch": 0.18260244428468728, + "kl_loss": 0.0859479010105133, + "loss_ib": 0.007369279395788908, + "step": 1270 + }, + { + "ce_ib": 4.371922492980957, + "ce_orig": 0.6059293150901794, + "epoch": 0.18274622573687993, + "kl_loss": 0.05015269294381142, + "loss_ib": 0.0046935961581766605, + "step": 1271 + }, + { + "ce_ib": 5.447070598602295, + "ce_orig": 0.8176404237747192, + "epoch": 0.1828900071890726, + "kl_loss": 0.06905705481767654, + "loss_ib": 0.006176387891173363, + "step": 1272 + }, + { + "ce_ib": 5.226484775543213, + "ce_orig": 0.7134627103805542, + "epoch": 0.18303378864126527, + "kl_loss": 0.07531459629535675, + "loss_ib": 0.006378972437232733, + "step": 1273 + }, + { + "ce_ib": 5.871837615966797, + "ce_orig": 1.1023709774017334, + "epoch": 0.18317757009345795, + "kl_loss": 0.06532324850559235, + "loss_ib": 0.0062020812183618546, + "step": 1274 + }, + { + "epoch": 0.1833213515456506, + "grad_norm": 0.12195762991905212, + "learning_rate": 4.987712110930377e-05, + "loss": 0.8923, + "step": 1275 + }, + { + "ce_ib": 3.5683703422546387, + "ce_orig": 0.6101716756820679, + "epoch": 0.1833213515456506, + "kl_loss": 0.08677677810192108, + "loss_ib": 0.006123024504631758, + "step": 1275 + }, + { + "ce_ib": 3.6534624099731445, + "ce_orig": 0.4296231269836426, + "epoch": 0.18346513299784328, + "kl_loss": 0.06283899396657944, + "loss_ib": 0.004968680907040834, + "step": 1276 + }, + { + "ce_ib": 7.290037155151367, + "ce_orig": 1.2038967609405518, + "epoch": 0.18360891445003594, + "kl_loss": 0.07194894552230835, + "loss_ib": 0.007242465857416391, + "step": 1277 + }, + { + "ce_ib": 3.1599771976470947, + "ce_orig": 0.370847225189209, + "epoch": 0.18375269590222862, + "kl_loss": 0.10093646496534348, + "loss_ib": 0.006626811809837818, + "step": 1278 + }, + { + "ce_ib": 4.3129448890686035, + "ce_orig": 0.6996619701385498, + "epoch": 0.18389647735442127, + "kl_loss": 0.15407751500606537, + "loss_ib": 0.009860347956418991, + "step": 1279 + }, + { + "epoch": 0.18404025880661395, + "grad_norm": 0.15581533312797546, + "learning_rate": 4.9875192206423235e-05, + "loss": 0.8297, + "step": 1280 + }, + { + "ce_ib": 6.469317436218262, + "ce_orig": 0.8039449453353882, + "epoch": 0.18404025880661395, + "kl_loss": 0.06512141227722168, + "loss_ib": 0.006490729749202728, + "step": 1280 + }, + { + "ce_ib": 5.138251781463623, + "ce_orig": 0.7558371424674988, + "epoch": 0.1841840402588066, + "kl_loss": 0.07659327238798141, + "loss_ib": 0.006398789584636688, + "step": 1281 + }, + { + "ce_ib": 6.566182613372803, + "ce_orig": 1.0166020393371582, + "epoch": 0.1843278217109993, + "kl_loss": 0.07558292150497437, + "loss_ib": 0.007062237244099379, + "step": 1282 + }, + { + "ce_ib": 6.4338178634643555, + "ce_orig": 0.7807300090789795, + "epoch": 0.18447160316319194, + "kl_loss": 0.07789655774831772, + "loss_ib": 0.007111737038940191, + "step": 1283 + }, + { + "ce_ib": 5.977124214172363, + "ce_orig": 0.8951448798179626, + "epoch": 0.18461538461538463, + "kl_loss": 0.0591699443757534, + "loss_ib": 0.005947059486061335, + "step": 1284 + }, + { + "epoch": 0.18475916606757728, + "grad_norm": 0.13328978419303894, + "learning_rate": 4.987324831953194e-05, + "loss": 0.9289, + "step": 1285 + }, + { + "ce_ib": 5.127737998962402, + "ce_orig": 0.7508841753005981, + "epoch": 0.18475916606757728, + "kl_loss": 0.054981742054224014, + "loss_ib": 0.005312956403940916, + "step": 1285 + }, + { + "ce_ib": 4.990250110626221, + "ce_orig": 0.7245780825614929, + "epoch": 0.18490294751976996, + "kl_loss": 0.06779134273529053, + "loss_ib": 0.005884692072868347, + "step": 1286 + }, + { + "ce_ib": 5.296464920043945, + "ce_orig": 0.9019365906715393, + "epoch": 0.18504672897196262, + "kl_loss": 0.08331352472305298, + "loss_ib": 0.006813908461481333, + "step": 1287 + }, + { + "ce_ib": 4.657319068908691, + "ce_orig": 0.7216635942459106, + "epoch": 0.1851905104241553, + "kl_loss": 0.056679096072912216, + "loss_ib": 0.005162614397704601, + "step": 1288 + }, + { + "ce_ib": 5.621036529541016, + "ce_orig": 1.060532808303833, + "epoch": 0.18533429187634795, + "kl_loss": 0.09513445943593979, + "loss_ib": 0.007567241322249174, + "step": 1289 + }, + { + "epoch": 0.1854780733285406, + "grad_norm": 0.1216038316488266, + "learning_rate": 4.987128944980079e-05, + "loss": 0.8664, + "step": 1290 + }, + { + "ce_ib": 8.308136940002441, + "ce_orig": 0.7171756029129028, + "epoch": 0.1854780733285406, + "kl_loss": 0.10778230428695679, + "loss_ib": 0.009543183259665966, + "step": 1290 + }, + { + "ce_ib": 5.605352401733398, + "ce_orig": 1.0017091035842896, + "epoch": 0.1856218547807333, + "kl_loss": 0.05229116976261139, + "loss_ib": 0.005417234729975462, + "step": 1291 + }, + { + "ce_ib": 6.048659324645996, + "ce_orig": 0.8577781915664673, + "epoch": 0.18576563623292594, + "kl_loss": 0.09568467736244202, + "loss_ib": 0.0078085632994771, + "step": 1292 + }, + { + "ce_ib": 5.150838851928711, + "ce_orig": 0.792262077331543, + "epoch": 0.18590941768511862, + "kl_loss": 0.04230610281229019, + "loss_ib": 0.004690724890679121, + "step": 1293 + }, + { + "ce_ib": 6.009768009185791, + "ce_orig": 1.0261638164520264, + "epoch": 0.18605319913731128, + "kl_loss": 0.07403481751680374, + "loss_ib": 0.00670662522315979, + "step": 1294 + }, + { + "epoch": 0.18619698058950396, + "grad_norm": 0.1265409141778946, + "learning_rate": 4.9869315598409765e-05, + "loss": 0.8246, + "step": 1295 + }, + { + "ce_ib": 4.652333736419678, + "ce_orig": 0.6744408011436462, + "epoch": 0.18619698058950396, + "kl_loss": 0.06151320040225983, + "loss_ib": 0.0054018269293010235, + "step": 1295 + }, + { + "ce_ib": 3.8486576080322266, + "ce_orig": 0.6293001770973206, + "epoch": 0.1863407620416966, + "kl_loss": 0.0954466164112091, + "loss_ib": 0.006696659605950117, + "step": 1296 + }, + { + "ce_ib": 5.1713409423828125, + "ce_orig": 0.8571648001670837, + "epoch": 0.1864845434938893, + "kl_loss": 0.06734561920166016, + "loss_ib": 0.0059529514983296394, + "step": 1297 + }, + { + "ce_ib": 5.222567081451416, + "ce_orig": 1.017317295074463, + "epoch": 0.18662832494608195, + "kl_loss": 0.0596066378057003, + "loss_ib": 0.005591615568846464, + "step": 1298 + }, + { + "ce_ib": 4.391148567199707, + "ce_orig": 0.7634097933769226, + "epoch": 0.18677210639827463, + "kl_loss": 0.07027769088745117, + "loss_ib": 0.005709459073841572, + "step": 1299 + }, + { + "epoch": 0.18691588785046728, + "grad_norm": 0.11799900978803635, + "learning_rate": 4.986732676654785e-05, + "loss": 0.8538, + "step": 1300 + }, + { + "ce_ib": 4.019314765930176, + "ce_orig": 0.5227394700050354, + "epoch": 0.18691588785046728, + "kl_loss": 0.12590855360031128, + "loss_ib": 0.008305084891617298, + "step": 1300 + }, + { + "ce_ib": 5.0924391746521, + "ce_orig": 0.7489091157913208, + "epoch": 0.18705966930265996, + "kl_loss": 0.09419790655374527, + "loss_ib": 0.007256114389747381, + "step": 1301 + }, + { + "ce_ib": 5.216235160827637, + "ce_orig": 0.46432167291641235, + "epoch": 0.18720345075485262, + "kl_loss": 0.07039137184619904, + "loss_ib": 0.0061276862397789955, + "step": 1302 + }, + { + "ce_ib": 6.321829319000244, + "ce_orig": 1.1543270349502563, + "epoch": 0.1873472322070453, + "kl_loss": 0.09047479927539825, + "loss_ib": 0.007684654090553522, + "step": 1303 + }, + { + "ce_ib": 6.893261432647705, + "ce_orig": 1.0330978631973267, + "epoch": 0.18749101365923795, + "kl_loss": 0.09423843026161194, + "loss_ib": 0.008158552460372448, + "step": 1304 + }, + { + "epoch": 0.18763479511143064, + "grad_norm": 0.14226000010967255, + "learning_rate": 4.986532295541304e-05, + "loss": 0.8484, + "step": 1305 + }, + { + "ce_ib": 6.995725154876709, + "ce_orig": 1.227460265159607, + "epoch": 0.18763479511143064, + "kl_loss": 0.06483827531337738, + "loss_ib": 0.006739776115864515, + "step": 1305 + }, + { + "ce_ib": 4.532535076141357, + "ce_orig": 0.7172744870185852, + "epoch": 0.1877785765636233, + "kl_loss": 0.07356537878513336, + "loss_ib": 0.005944536533206701, + "step": 1306 + }, + { + "ce_ib": 6.620168685913086, + "ce_orig": 0.9778208136558533, + "epoch": 0.18792235801581597, + "kl_loss": 0.0553460493683815, + "loss_ib": 0.0060773869045078754, + "step": 1307 + }, + { + "ce_ib": 5.705600738525391, + "ce_orig": 1.096453309059143, + "epoch": 0.18806613946800863, + "kl_loss": 0.06891658902168274, + "loss_ib": 0.006298630032688379, + "step": 1308 + }, + { + "ce_ib": 5.646377086639404, + "ce_orig": 0.7780153751373291, + "epoch": 0.1882099209202013, + "kl_loss": 0.049502164125442505, + "loss_ib": 0.005298296920955181, + "step": 1309 + }, + { + "epoch": 0.18835370237239396, + "grad_norm": 0.1416177600622177, + "learning_rate": 4.9863304166212386e-05, + "loss": 0.9146, + "step": 1310 + }, + { + "ce_ib": 5.660383701324463, + "ce_orig": 0.5649182796478271, + "epoch": 0.18835370237239396, + "kl_loss": 0.09952712059020996, + "loss_ib": 0.0078065479174256325, + "step": 1310 + }, + { + "ce_ib": 5.526841640472412, + "ce_orig": 0.9878823757171631, + "epoch": 0.18849748382458661, + "kl_loss": 0.0728154331445694, + "loss_ib": 0.00640419265255332, + "step": 1311 + }, + { + "ce_ib": 6.063813209533691, + "ce_orig": 1.0294197797775269, + "epoch": 0.1886412652767793, + "kl_loss": 0.06455469876527786, + "loss_ib": 0.006259641144424677, + "step": 1312 + }, + { + "ce_ib": 5.626165866851807, + "ce_orig": 1.0900945663452148, + "epoch": 0.18878504672897195, + "kl_loss": 0.0658339336514473, + "loss_ib": 0.0061047798953950405, + "step": 1313 + }, + { + "ce_ib": 6.964661121368408, + "ce_orig": 0.9383156299591064, + "epoch": 0.18892882818116463, + "kl_loss": 0.0714457705616951, + "loss_ib": 0.007054619491100311, + "step": 1314 + }, + { + "epoch": 0.18907260963335729, + "grad_norm": 0.13234072923660278, + "learning_rate": 4.986127040016192e-05, + "loss": 0.8824, + "step": 1315 + }, + { + "ce_ib": 5.2854695320129395, + "ce_orig": 0.6770355701446533, + "epoch": 0.18907260963335729, + "kl_loss": 0.06618240475654602, + "loss_ib": 0.00595185486599803, + "step": 1315 + }, + { + "ce_ib": 6.135135173797607, + "ce_orig": 0.9822208881378174, + "epoch": 0.18921639108554997, + "kl_loss": 0.07969026267528534, + "loss_ib": 0.007052080240100622, + "step": 1316 + }, + { + "ce_ib": 6.86212158203125, + "ce_orig": 1.2484806776046753, + "epoch": 0.18936017253774262, + "kl_loss": 0.0599304735660553, + "loss_ib": 0.006427584681659937, + "step": 1317 + }, + { + "ce_ib": 5.172977924346924, + "ce_orig": 1.0631622076034546, + "epoch": 0.1895039539899353, + "kl_loss": 0.07144273817539215, + "loss_ib": 0.006158626172691584, + "step": 1318 + }, + { + "ce_ib": 7.243893623352051, + "ce_orig": 1.4096187353134155, + "epoch": 0.18964773544212796, + "kl_loss": 0.07435773313045502, + "loss_ib": 0.00733983376994729, + "step": 1319 + }, + { + "epoch": 0.18979151689432064, + "grad_norm": 0.12008816003799438, + "learning_rate": 4.985922165848673e-05, + "loss": 0.8539, + "step": 1320 + }, + { + "ce_ib": 5.233951091766357, + "ce_orig": 0.8850072026252747, + "epoch": 0.18979151689432064, + "kl_loss": 0.053318850696086884, + "loss_ib": 0.005282917991280556, + "step": 1320 + }, + { + "ce_ib": 5.289816856384277, + "ce_orig": 0.888075053691864, + "epoch": 0.1899352983465133, + "kl_loss": 0.0667005255818367, + "loss_ib": 0.005979934707283974, + "step": 1321 + }, + { + "ce_ib": 5.563723087310791, + "ce_orig": 0.8860766291618347, + "epoch": 0.19007907979870597, + "kl_loss": 0.0715082660317421, + "loss_ib": 0.006357274949550629, + "step": 1322 + }, + { + "ce_ib": 5.824801445007324, + "ce_orig": 0.8612881302833557, + "epoch": 0.19022286125089863, + "kl_loss": 0.05713614821434021, + "loss_ib": 0.005769208073616028, + "step": 1323 + }, + { + "ce_ib": 5.4752702713012695, + "ce_orig": 0.9038432836532593, + "epoch": 0.1903666427030913, + "kl_loss": 0.07209070026874542, + "loss_ib": 0.00634216982871294, + "step": 1324 + }, + { + "epoch": 0.19051042415528396, + "grad_norm": 0.11129456013441086, + "learning_rate": 4.9857157942420915e-05, + "loss": 0.8688, + "step": 1325 + }, + { + "ce_ib": 6.194485187530518, + "ce_orig": 0.8805114030838013, + "epoch": 0.19051042415528396, + "kl_loss": 0.07600448280572891, + "loss_ib": 0.0068974667228758335, + "step": 1325 + }, + { + "ce_ib": 6.337533950805664, + "ce_orig": 0.9046097993850708, + "epoch": 0.19065420560747665, + "kl_loss": 0.0880471020936966, + "loss_ib": 0.007571122143417597, + "step": 1326 + }, + { + "ce_ib": 7.4663591384887695, + "ce_orig": 1.4075063467025757, + "epoch": 0.1907979870596693, + "kl_loss": 0.10501080751419067, + "loss_ib": 0.008983719162642956, + "step": 1327 + }, + { + "ce_ib": 4.563877582550049, + "ce_orig": 0.6890296339988708, + "epoch": 0.19094176851186198, + "kl_loss": 0.05470540374517441, + "loss_ib": 0.005017209332436323, + "step": 1328 + }, + { + "ce_ib": 5.0459489822387695, + "ce_orig": 0.686896026134491, + "epoch": 0.19108554996405464, + "kl_loss": 0.04170939326286316, + "loss_ib": 0.004608444403856993, + "step": 1329 + }, + { + "epoch": 0.19122933141624732, + "grad_norm": 0.1648108959197998, + "learning_rate": 4.985507925320757e-05, + "loss": 0.8948, + "step": 1330 + }, + { + "ce_ib": 3.4446299076080322, + "ce_orig": 0.5561388731002808, + "epoch": 0.19122933141624732, + "kl_loss": 0.04271192103624344, + "loss_ib": 0.0038579110987484455, + "step": 1330 + }, + { + "ce_ib": 5.135603427886963, + "ce_orig": 0.9304662346839905, + "epoch": 0.19137311286843997, + "kl_loss": 0.0660550445318222, + "loss_ib": 0.005870554130524397, + "step": 1331 + }, + { + "ce_ib": 7.067209720611572, + "ce_orig": 1.0317341089248657, + "epoch": 0.19151689432063262, + "kl_loss": 0.059279002249240875, + "loss_ib": 0.0064975544810295105, + "step": 1332 + }, + { + "ce_ib": 6.808300018310547, + "ce_orig": 1.139445185661316, + "epoch": 0.1916606757728253, + "kl_loss": 0.07774926722049713, + "loss_ib": 0.007291613612323999, + "step": 1333 + }, + { + "ce_ib": 5.8968939781188965, + "ce_orig": 1.1360212564468384, + "epoch": 0.19180445722501796, + "kl_loss": 0.10399194806814194, + "loss_ib": 0.008148044347763062, + "step": 1334 + }, + { + "epoch": 0.19194823867721064, + "grad_norm": 0.10446657240390778, + "learning_rate": 4.985298559209886e-05, + "loss": 0.8143, + "step": 1335 + }, + { + "ce_ib": 7.768556118011475, + "ce_orig": 1.5995025634765625, + "epoch": 0.19194823867721064, + "kl_loss": 0.0673665702342987, + "loss_ib": 0.007252607028931379, + "step": 1335 + }, + { + "ce_ib": 5.141104698181152, + "ce_orig": 0.9242339134216309, + "epoch": 0.1920920201294033, + "kl_loss": 0.053758151829242706, + "loss_ib": 0.005258460063487291, + "step": 1336 + }, + { + "ce_ib": 5.2834792137146, + "ce_orig": 0.6842889785766602, + "epoch": 0.19223580158159598, + "kl_loss": 0.08340615779161453, + "loss_ib": 0.006812047213315964, + "step": 1337 + }, + { + "ce_ib": 4.150651931762695, + "ce_orig": 0.5668556094169617, + "epoch": 0.19237958303378863, + "kl_loss": 0.05208231508731842, + "loss_ib": 0.004679441452026367, + "step": 1338 + }, + { + "ce_ib": 4.186277866363525, + "ce_orig": 0.7017203569412231, + "epoch": 0.1925233644859813, + "kl_loss": 0.08236505836248398, + "loss_ib": 0.006211392115801573, + "step": 1339 + }, + { + "epoch": 0.19266714593817397, + "grad_norm": 0.10543952137231827, + "learning_rate": 4.985087696035591e-05, + "loss": 0.8096, + "step": 1340 + }, + { + "ce_ib": 3.9252965450286865, + "ce_orig": 0.609882116317749, + "epoch": 0.19266714593817397, + "kl_loss": 0.07309718430042267, + "loss_ib": 0.005617507733404636, + "step": 1340 + }, + { + "ce_ib": 4.523309707641602, + "ce_orig": 0.6241291165351868, + "epoch": 0.19281092739036665, + "kl_loss": 0.08059638738632202, + "loss_ib": 0.0062914746813476086, + "step": 1341 + }, + { + "ce_ib": 6.225103378295898, + "ce_orig": 0.9400013089179993, + "epoch": 0.1929547088425593, + "kl_loss": 0.06948919594287872, + "loss_ib": 0.006587011273950338, + "step": 1342 + }, + { + "ce_ib": 6.541982650756836, + "ce_orig": 0.8905702829360962, + "epoch": 0.19309849029475198, + "kl_loss": 0.05320864915847778, + "loss_ib": 0.005931423511356115, + "step": 1343 + }, + { + "ce_ib": 5.0491743087768555, + "ce_orig": 1.0454604625701904, + "epoch": 0.19324227174694464, + "kl_loss": 0.07858042418956757, + "loss_ib": 0.00645360816270113, + "step": 1344 + }, + { + "epoch": 0.19338605319913732, + "grad_norm": 0.115959033370018, + "learning_rate": 4.9848753359248914e-05, + "loss": 0.8567, + "step": 1345 + }, + { + "ce_ib": 4.882938385009766, + "ce_orig": 0.6454526782035828, + "epoch": 0.19338605319913732, + "kl_loss": 0.09313278645277023, + "loss_ib": 0.007098108530044556, + "step": 1345 + }, + { + "ce_ib": 4.675527095794678, + "ce_orig": 0.7172112464904785, + "epoch": 0.19352983465132997, + "kl_loss": 0.07279226183891296, + "loss_ib": 0.005977376829832792, + "step": 1346 + }, + { + "ce_ib": 6.803047180175781, + "ce_orig": 0.9909824728965759, + "epoch": 0.19367361610352266, + "kl_loss": 0.16327355802059174, + "loss_ib": 0.011565200984477997, + "step": 1347 + }, + { + "ce_ib": 4.686718463897705, + "ce_orig": 0.5485274791717529, + "epoch": 0.1938173975557153, + "kl_loss": 0.07588573545217514, + "loss_ib": 0.006137646269053221, + "step": 1348 + }, + { + "ce_ib": 5.5454583168029785, + "ce_orig": 0.7043033242225647, + "epoch": 0.193961179007908, + "kl_loss": 0.06774990260601044, + "loss_ib": 0.006160224322229624, + "step": 1349 + }, + { + "epoch": 0.19410496046010065, + "grad_norm": 0.10718260705471039, + "learning_rate": 4.984661479005704e-05, + "loss": 0.7852, + "step": 1350 + }, + { + "ce_ib": 5.474034309387207, + "ce_orig": 0.7613490223884583, + "epoch": 0.19410496046010065, + "kl_loss": 0.1212555468082428, + "loss_ib": 0.008799794130027294, + "step": 1350 + }, + { + "ce_ib": 5.272262096405029, + "ce_orig": 0.7977457046508789, + "epoch": 0.19424874191229333, + "kl_loss": 0.06805640459060669, + "loss_ib": 0.006038951221853495, + "step": 1351 + }, + { + "ce_ib": 4.755674362182617, + "ce_orig": 0.6557518243789673, + "epoch": 0.19439252336448598, + "kl_loss": 0.04519767314195633, + "loss_ib": 0.004637720994651318, + "step": 1352 + }, + { + "ce_ib": 4.685527801513672, + "ce_orig": 0.7481217980384827, + "epoch": 0.19453630481667863, + "kl_loss": 0.06416691839694977, + "loss_ib": 0.0055511100217700005, + "step": 1353 + }, + { + "ce_ib": 5.590147495269775, + "ce_orig": 0.8784522414207458, + "epoch": 0.19468008626887132, + "kl_loss": 0.05754717439413071, + "loss_ib": 0.005672432482242584, + "step": 1354 + }, + { + "epoch": 0.19482386772106397, + "grad_norm": 0.12686419486999512, + "learning_rate": 4.984446125406851e-05, + "loss": 0.8276, + "step": 1355 + }, + { + "ce_ib": 5.616003036499023, + "ce_orig": 0.9167315363883972, + "epoch": 0.19482386772106397, + "kl_loss": 0.09981972724199295, + "loss_ib": 0.007798987906426191, + "step": 1355 + }, + { + "ce_ib": 6.558966159820557, + "ce_orig": 1.3429272174835205, + "epoch": 0.19496764917325665, + "kl_loss": 0.09527970105409622, + "loss_ib": 0.008043467998504639, + "step": 1356 + }, + { + "ce_ib": 5.295566558837891, + "ce_orig": 0.7045623064041138, + "epoch": 0.1951114306254493, + "kl_loss": 0.07721194624900818, + "loss_ib": 0.006508380174636841, + "step": 1357 + }, + { + "ce_ib": 2.879274368286133, + "ce_orig": 0.4031473398208618, + "epoch": 0.195255212077642, + "kl_loss": 0.13180555403232574, + "loss_ib": 0.00802991446107626, + "step": 1358 + }, + { + "ce_ib": 5.730945587158203, + "ce_orig": 0.9784193634986877, + "epoch": 0.19539899352983464, + "kl_loss": 0.06422359496355057, + "loss_ib": 0.006076652556657791, + "step": 1359 + }, + { + "epoch": 0.19554277498202732, + "grad_norm": 0.12580661475658417, + "learning_rate": 4.984229275258054e-05, + "loss": 0.8353, + "step": 1360 + }, + { + "ce_ib": 3.765678644180298, + "ce_orig": 0.5086674690246582, + "epoch": 0.19554277498202732, + "kl_loss": 0.08472566306591034, + "loss_ib": 0.006119122263044119, + "step": 1360 + }, + { + "ce_ib": 4.736020088195801, + "ce_orig": 0.4839497208595276, + "epoch": 0.19568655643421998, + "kl_loss": 0.0478425994515419, + "loss_ib": 0.004760140087455511, + "step": 1361 + }, + { + "ce_ib": 6.7575249671936035, + "ce_orig": 1.1090867519378662, + "epoch": 0.19583033788641266, + "kl_loss": 0.07463841885328293, + "loss_ib": 0.007110683713108301, + "step": 1362 + }, + { + "ce_ib": 4.888116359710693, + "ce_orig": 0.7908861637115479, + "epoch": 0.1959741193386053, + "kl_loss": 0.05212397128343582, + "loss_ib": 0.005050256848335266, + "step": 1363 + }, + { + "ce_ib": 5.1172966957092285, + "ce_orig": 0.5396475791931152, + "epoch": 0.196117900790798, + "kl_loss": 0.04441031441092491, + "loss_ib": 0.004779163748025894, + "step": 1364 + }, + { + "epoch": 0.19626168224299065, + "grad_norm": 0.12461841851472855, + "learning_rate": 4.984010928689936e-05, + "loss": 0.7943, + "step": 1365 + }, + { + "ce_ib": 5.743456840515137, + "ce_orig": 0.9942952990531921, + "epoch": 0.19626168224299065, + "kl_loss": 0.06097549945116043, + "loss_ib": 0.0059205032885074615, + "step": 1365 + }, + { + "ce_ib": 6.305510520935059, + "ce_orig": 0.9737942218780518, + "epoch": 0.19640546369518333, + "kl_loss": 0.17247089743614197, + "loss_ib": 0.011776300147175789, + "step": 1366 + }, + { + "ce_ib": 5.222244739532471, + "ce_orig": 0.6830456256866455, + "epoch": 0.19654924514737598, + "kl_loss": 0.08971808850765228, + "loss_ib": 0.00709702679887414, + "step": 1367 + }, + { + "ce_ib": 6.454967975616455, + "ce_orig": 0.7466832399368286, + "epoch": 0.19669302659956867, + "kl_loss": 0.10465604811906815, + "loss_ib": 0.00846028607338667, + "step": 1368 + }, + { + "ce_ib": 4.010262489318848, + "ce_orig": 0.5327467322349548, + "epoch": 0.19683680805176132, + "kl_loss": 0.07438971102237701, + "loss_ib": 0.005724616814404726, + "step": 1369 + }, + { + "epoch": 0.196980589503954, + "grad_norm": 0.15052177011966705, + "learning_rate": 4.983791085834022e-05, + "loss": 0.8876, + "step": 1370 + }, + { + "ce_ib": 4.99446439743042, + "ce_orig": 1.031826376914978, + "epoch": 0.196980589503954, + "kl_loss": 0.05401468649506569, + "loss_ib": 0.00519796647131443, + "step": 1370 + }, + { + "ce_ib": 6.3165507316589355, + "ce_orig": 1.1690744161605835, + "epoch": 0.19712437095614666, + "kl_loss": 0.07904824614524841, + "loss_ib": 0.007110688369721174, + "step": 1371 + }, + { + "ce_ib": 4.413170337677002, + "ce_orig": 0.793796718120575, + "epoch": 0.19726815240833934, + "kl_loss": 0.05433220416307449, + "loss_ib": 0.004923195578157902, + "step": 1372 + }, + { + "ce_ib": 4.517746448516846, + "ce_orig": 0.6368403434753418, + "epoch": 0.197411933860532, + "kl_loss": 0.050136927515268326, + "loss_ib": 0.004765719641000032, + "step": 1373 + }, + { + "ce_ib": 4.015274524688721, + "ce_orig": 0.6775529980659485, + "epoch": 0.19755571531272467, + "kl_loss": 0.044358380138874054, + "loss_ib": 0.004225556273013353, + "step": 1374 + }, + { + "epoch": 0.19769949676491733, + "grad_norm": 0.12334876507520676, + "learning_rate": 4.983569746822739e-05, + "loss": 0.8314, + "step": 1375 + }, + { + "ce_ib": 5.306783676147461, + "ce_orig": 0.9006129503250122, + "epoch": 0.19769949676491733, + "kl_loss": 0.08385761082172394, + "loss_ib": 0.006846272386610508, + "step": 1375 + }, + { + "ce_ib": 6.2950568199157715, + "ce_orig": 0.7801182866096497, + "epoch": 0.19784327821710998, + "kl_loss": 0.088884636759758, + "loss_ib": 0.007591760251671076, + "step": 1376 + }, + { + "ce_ib": 5.0033698081970215, + "ce_orig": 0.5632942318916321, + "epoch": 0.19798705966930266, + "kl_loss": 0.06423759460449219, + "loss_ib": 0.005713564809411764, + "step": 1377 + }, + { + "ce_ib": 5.550046920776367, + "ce_orig": 0.8530508875846863, + "epoch": 0.19813084112149532, + "kl_loss": 0.08627203106880188, + "loss_ib": 0.007088624872267246, + "step": 1378 + }, + { + "ce_ib": 4.105034351348877, + "ce_orig": 0.4530830681324005, + "epoch": 0.198274622573688, + "kl_loss": 0.08440049737691879, + "loss_ib": 0.0062725418247282505, + "step": 1379 + }, + { + "epoch": 0.19841840402588065, + "grad_norm": 0.13778747618198395, + "learning_rate": 4.983346911789414e-05, + "loss": 0.8036, + "step": 1380 + }, + { + "ce_ib": 4.411414623260498, + "ce_orig": 0.8279843330383301, + "epoch": 0.19841840402588065, + "kl_loss": 0.08065389096736908, + "loss_ib": 0.006238401867449284, + "step": 1380 + }, + { + "ce_ib": 5.244317531585693, + "ce_orig": 1.1526085138320923, + "epoch": 0.19856218547807333, + "kl_loss": 0.05347134917974472, + "loss_ib": 0.005295726004987955, + "step": 1381 + }, + { + "ce_ib": 5.454021453857422, + "ce_orig": 0.56093430519104, + "epoch": 0.198705966930266, + "kl_loss": 0.08189015090465546, + "loss_ib": 0.006821518298238516, + "step": 1382 + }, + { + "ce_ib": 6.119863986968994, + "ce_orig": 0.9518615007400513, + "epoch": 0.19884974838245867, + "kl_loss": 0.07461654394865036, + "loss_ib": 0.006790758576244116, + "step": 1383 + }, + { + "ce_ib": 6.555818557739258, + "ce_orig": 1.2137396335601807, + "epoch": 0.19899352983465132, + "kl_loss": 0.06121841073036194, + "loss_ib": 0.006338830571621656, + "step": 1384 + }, + { + "epoch": 0.199137311286844, + "grad_norm": 0.1511547863483429, + "learning_rate": 4.983122580868275e-05, + "loss": 0.9237, + "step": 1385 + }, + { + "ce_ib": 6.947441577911377, + "ce_orig": 1.1190574169158936, + "epoch": 0.199137311286844, + "kl_loss": 0.08730851858854294, + "loss_ib": 0.007839146070182323, + "step": 1385 + }, + { + "ce_ib": 7.337833881378174, + "ce_orig": 1.5090909004211426, + "epoch": 0.19928109273903666, + "kl_loss": 0.06059400737285614, + "loss_ib": 0.006698617246001959, + "step": 1386 + }, + { + "ce_ib": 5.586292743682861, + "ce_orig": 0.9534856081008911, + "epoch": 0.19942487419122934, + "kl_loss": 0.049000680446624756, + "loss_ib": 0.0052431803196668625, + "step": 1387 + }, + { + "ce_ib": 4.483768463134766, + "ce_orig": 0.5571135878562927, + "epoch": 0.199568655643422, + "kl_loss": 0.06389360129833221, + "loss_ib": 0.005436564330011606, + "step": 1388 + }, + { + "ce_ib": 4.807966232299805, + "ce_orig": 0.788553774356842, + "epoch": 0.19971243709561468, + "kl_loss": 0.05055851489305496, + "loss_ib": 0.004931909032166004, + "step": 1389 + }, + { + "epoch": 0.19985621854780733, + "grad_norm": 0.14505986869335175, + "learning_rate": 4.9828967541944524e-05, + "loss": 0.8757, + "step": 1390 + }, + { + "ce_ib": 6.073001861572266, + "ce_orig": 0.983887255191803, + "epoch": 0.19985621854780733, + "kl_loss": 0.09870649874210358, + "loss_ib": 0.007971826009452343, + "step": 1390 + }, + { + "ce_ib": 7.64382791519165, + "ce_orig": 0.999534547328949, + "epoch": 0.2, + "kl_loss": 0.06764799356460571, + "loss_ib": 0.007204313762485981, + "step": 1391 + }, + { + "ce_ib": 5.345618724822998, + "ce_orig": 0.5965936779975891, + "epoch": 0.20014378145219267, + "kl_loss": 0.07192911207675934, + "loss_ib": 0.006269264966249466, + "step": 1392 + }, + { + "ce_ib": 4.161827564239502, + "ce_orig": 0.593263566493988, + "epoch": 0.20028756290438535, + "kl_loss": 0.058980681002140045, + "loss_ib": 0.005029947962611914, + "step": 1393 + }, + { + "ce_ib": 6.008248329162598, + "ce_orig": 1.027796745300293, + "epoch": 0.200431344356578, + "kl_loss": 0.0661287009716034, + "loss_ib": 0.006310559343546629, + "step": 1394 + }, + { + "epoch": 0.20057512580877068, + "grad_norm": 0.12259102612733841, + "learning_rate": 4.982669431903977e-05, + "loss": 0.808, + "step": 1395 + }, + { + "ce_ib": 5.8604044914245605, + "ce_orig": 1.0801239013671875, + "epoch": 0.20057512580877068, + "kl_loss": 0.08428308367729187, + "loss_ib": 0.007144356612116098, + "step": 1395 + }, + { + "ce_ib": 5.270923614501953, + "ce_orig": 0.8836106657981873, + "epoch": 0.20071890726096334, + "kl_loss": 0.06545711308717728, + "loss_ib": 0.005908317398279905, + "step": 1396 + }, + { + "ce_ib": 6.3282270431518555, + "ce_orig": 0.5579991340637207, + "epoch": 0.200862688713156, + "kl_loss": 0.11529186367988586, + "loss_ib": 0.008928706869482994, + "step": 1397 + }, + { + "ce_ib": 5.8910441398620605, + "ce_orig": 0.7428718209266663, + "epoch": 0.20100647016534867, + "kl_loss": 0.08392181992530823, + "loss_ib": 0.0071416134014725685, + "step": 1398 + }, + { + "ce_ib": 5.221210956573486, + "ce_orig": 0.9338892102241516, + "epoch": 0.20115025161754133, + "kl_loss": 0.05993063002824783, + "loss_ib": 0.005607136990875006, + "step": 1399 + }, + { + "epoch": 0.201294033069734, + "grad_norm": 0.1601199358701706, + "learning_rate": 4.98244061413378e-05, + "loss": 0.8131, + "step": 1400 + }, + { + "ce_ib": 3.716848611831665, + "ce_orig": 0.580088198184967, + "epoch": 0.201294033069734, + "kl_loss": 0.049116991460323334, + "loss_ib": 0.0043142735958099365, + "step": 1400 + }, + { + "ce_ib": 5.9390130043029785, + "ce_orig": 1.029770851135254, + "epoch": 0.20143781452192666, + "kl_loss": 0.1097383052110672, + "loss_ib": 0.00845642201602459, + "step": 1401 + }, + { + "ce_ib": 4.419147491455078, + "ce_orig": 0.8180220723152161, + "epoch": 0.20158159597411934, + "kl_loss": 0.05362674593925476, + "loss_ib": 0.004890910815447569, + "step": 1402 + }, + { + "ce_ib": 5.547792911529541, + "ce_orig": 1.1538474559783936, + "epoch": 0.201725377426312, + "kl_loss": 0.06782442331314087, + "loss_ib": 0.006165117956697941, + "step": 1403 + }, + { + "ce_ib": 6.935868740081787, + "ce_orig": 1.0117311477661133, + "epoch": 0.20186915887850468, + "kl_loss": 0.0784970298409462, + "loss_ib": 0.007392785977572203, + "step": 1404 + }, + { + "epoch": 0.20201294033069733, + "grad_norm": 0.13027290999889374, + "learning_rate": 4.982210301021694e-05, + "loss": 0.9138, + "step": 1405 + }, + { + "ce_ib": 5.055147171020508, + "ce_orig": 0.6767200231552124, + "epoch": 0.20201294033069733, + "kl_loss": 0.08463143557310104, + "loss_ib": 0.006759145297110081, + "step": 1405 + }, + { + "ce_ib": 6.904526233673096, + "ce_orig": 1.1851789951324463, + "epoch": 0.20215672178289001, + "kl_loss": 0.09897538274526596, + "loss_ib": 0.008401031605899334, + "step": 1406 + }, + { + "ce_ib": 5.40473747253418, + "ce_orig": 0.9029414057731628, + "epoch": 0.20230050323508267, + "kl_loss": 0.0825282633304596, + "loss_ib": 0.006828782148659229, + "step": 1407 + }, + { + "ce_ib": 6.4676513671875, + "ce_orig": 1.0907763242721558, + "epoch": 0.20244428468727535, + "kl_loss": 0.07482163608074188, + "loss_ib": 0.006974907126277685, + "step": 1408 + }, + { + "ce_ib": 3.7740111351013184, + "ce_orig": 0.5375373363494873, + "epoch": 0.202588066139468, + "kl_loss": 0.04807378351688385, + "loss_ib": 0.004290694836527109, + "step": 1409 + }, + { + "epoch": 0.20273184759166069, + "grad_norm": 0.12365573644638062, + "learning_rate": 4.981978492706452e-05, + "loss": 0.8518, + "step": 1410 + }, + { + "ce_ib": 4.376240253448486, + "ce_orig": 0.7760727405548096, + "epoch": 0.20273184759166069, + "kl_loss": 0.11358863115310669, + "loss_ib": 0.007867551408708096, + "step": 1410 + }, + { + "ce_ib": 5.7131853103637695, + "ce_orig": 0.8759678602218628, + "epoch": 0.20287562904385334, + "kl_loss": 0.07917153090238571, + "loss_ib": 0.006815169006586075, + "step": 1411 + }, + { + "ce_ib": 8.131108283996582, + "ce_orig": 0.8118664026260376, + "epoch": 0.20301941049604602, + "kl_loss": 0.06217491626739502, + "loss_ib": 0.007174299564212561, + "step": 1412 + }, + { + "ce_ib": 6.700382232666016, + "ce_orig": 1.1185380220413208, + "epoch": 0.20316319194823867, + "kl_loss": 0.06485722213983536, + "loss_ib": 0.006593052297830582, + "step": 1413 + }, + { + "ce_ib": 7.007409572601318, + "ce_orig": 1.0472086668014526, + "epoch": 0.20330697340043136, + "kl_loss": 0.06941254436969757, + "loss_ib": 0.00697433203458786, + "step": 1414 + }, + { + "epoch": 0.203450754852624, + "grad_norm": 0.12549830973148346, + "learning_rate": 4.9817451893276875e-05, + "loss": 0.9038, + "step": 1415 + }, + { + "ce_ib": 6.704219341278076, + "ce_orig": 1.1752408742904663, + "epoch": 0.203450754852624, + "kl_loss": 0.07044374197721481, + "loss_ib": 0.006874296814203262, + "step": 1415 + }, + { + "ce_ib": 4.809553146362305, + "ce_orig": 0.8325048685073853, + "epoch": 0.2035945363048167, + "kl_loss": 0.05817954242229462, + "loss_ib": 0.005313753616064787, + "step": 1416 + }, + { + "ce_ib": 4.333613872528076, + "ce_orig": 0.6595436930656433, + "epoch": 0.20373831775700935, + "kl_loss": 0.06892372667789459, + "loss_ib": 0.005612993147224188, + "step": 1417 + }, + { + "ce_ib": 8.648900032043457, + "ce_orig": 1.5878820419311523, + "epoch": 0.203882099209202, + "kl_loss": 0.0689108818769455, + "loss_ib": 0.0077699944376945496, + "step": 1418 + }, + { + "ce_ib": 5.39652156829834, + "ce_orig": 0.9428832530975342, + "epoch": 0.20402588066139468, + "kl_loss": 0.07233736664056778, + "loss_ib": 0.006315129343420267, + "step": 1419 + }, + { + "epoch": 0.20416966211358734, + "grad_norm": 0.11503734439611435, + "learning_rate": 4.9815103910259354e-05, + "loss": 0.8548, + "step": 1420 + }, + { + "ce_ib": 6.764338493347168, + "ce_orig": 1.0282443761825562, + "epoch": 0.20416966211358734, + "kl_loss": 0.07213668525218964, + "loss_ib": 0.006989003624767065, + "step": 1420 + }, + { + "ce_ib": 3.8117384910583496, + "ce_orig": 0.639327347278595, + "epoch": 0.20431344356578002, + "kl_loss": 0.06720799207687378, + "loss_ib": 0.005266268737614155, + "step": 1421 + }, + { + "ce_ib": 3.3015217781066895, + "ce_orig": 0.5186365842819214, + "epoch": 0.20445722501797267, + "kl_loss": 0.054001692682504654, + "loss_ib": 0.004350845701992512, + "step": 1422 + }, + { + "ce_ib": 5.977197647094727, + "ce_orig": 1.1004377603530884, + "epoch": 0.20460100647016535, + "kl_loss": 0.06124700605869293, + "loss_ib": 0.0060509489849209785, + "step": 1423 + }, + { + "ce_ib": 5.054637432098389, + "ce_orig": 0.7079076170921326, + "epoch": 0.204744787922358, + "kl_loss": 0.07295742630958557, + "loss_ib": 0.006175190210342407, + "step": 1424 + }, + { + "epoch": 0.2048885693745507, + "grad_norm": 0.11248598992824554, + "learning_rate": 4.981274097942631e-05, + "loss": 0.8585, + "step": 1425 + }, + { + "ce_ib": 4.233011722564697, + "ce_orig": 0.8790975213050842, + "epoch": 0.2048885693745507, + "kl_loss": 0.04312340170145035, + "loss_ib": 0.004272676073014736, + "step": 1425 + }, + { + "ce_ib": 4.183152198791504, + "ce_orig": 0.695999801158905, + "epoch": 0.20503235082674334, + "kl_loss": 0.05767499655485153, + "loss_ib": 0.004975325893610716, + "step": 1426 + }, + { + "ce_ib": 5.656023025512695, + "ce_orig": 0.9146031737327576, + "epoch": 0.20517613227893602, + "kl_loss": 0.15879961848258972, + "loss_ib": 0.010767992585897446, + "step": 1427 + }, + { + "ce_ib": 6.515829086303711, + "ce_orig": 1.235692024230957, + "epoch": 0.20531991373112868, + "kl_loss": 0.05390184372663498, + "loss_ib": 0.005953006912022829, + "step": 1428 + }, + { + "ce_ib": 4.633606910705566, + "ce_orig": 0.7983617186546326, + "epoch": 0.20546369518332136, + "kl_loss": 0.0672098845243454, + "loss_ib": 0.0056772977113723755, + "step": 1429 + }, + { + "epoch": 0.205607476635514, + "grad_norm": 0.1175980418920517, + "learning_rate": 4.981036310220108e-05, + "loss": 0.8431, + "step": 1430 + }, + { + "ce_ib": 4.554419040679932, + "ce_orig": 0.8392414450645447, + "epoch": 0.205607476635514, + "kl_loss": 0.060240574181079865, + "loss_ib": 0.005289237946271896, + "step": 1430 + }, + { + "ce_ib": 6.2082977294921875, + "ce_orig": 0.7209426760673523, + "epoch": 0.2057512580877067, + "kl_loss": 0.10012741386890411, + "loss_ib": 0.008110519498586655, + "step": 1431 + }, + { + "ce_ib": 6.682995796203613, + "ce_orig": 0.8945934772491455, + "epoch": 0.20589503953989935, + "kl_loss": 0.08409430831670761, + "loss_ib": 0.0075462134554982185, + "step": 1432 + }, + { + "ce_ib": 6.7147088050842285, + "ce_orig": 1.2028496265411377, + "epoch": 0.20603882099209203, + "kl_loss": 0.08167049288749695, + "loss_ib": 0.007440878544002771, + "step": 1433 + }, + { + "ce_ib": 5.5593156814575195, + "ce_orig": 0.8950801491737366, + "epoch": 0.20618260244428468, + "kl_loss": 0.09822431206703186, + "loss_ib": 0.007690873928368092, + "step": 1434 + }, + { + "epoch": 0.20632638389647737, + "grad_norm": 0.14298565685749054, + "learning_rate": 4.980797028001605e-05, + "loss": 0.8001, + "step": 1435 + }, + { + "ce_ib": 6.609431266784668, + "ce_orig": 1.0981193780899048, + "epoch": 0.20632638389647737, + "kl_loss": 0.09056198596954346, + "loss_ib": 0.007832814939320087, + "step": 1435 + }, + { + "ce_ib": 5.954436302185059, + "ce_orig": 0.9232958555221558, + "epoch": 0.20647016534867002, + "kl_loss": 0.07938443124294281, + "loss_ib": 0.0069464403204619884, + "step": 1436 + }, + { + "ce_ib": 6.491918563842773, + "ce_orig": 0.9802238345146179, + "epoch": 0.2066139468008627, + "kl_loss": 0.09788002073764801, + "loss_ib": 0.0081399604678154, + "step": 1437 + }, + { + "ce_ib": 7.028619766235352, + "ce_orig": 1.122996211051941, + "epoch": 0.20675772825305536, + "kl_loss": 0.054605305194854736, + "loss_ib": 0.006244575139135122, + "step": 1438 + }, + { + "ce_ib": 5.062302112579346, + "ce_orig": 0.6410950422286987, + "epoch": 0.206901509705248, + "kl_loss": 0.05597269907593727, + "loss_ib": 0.005329785868525505, + "step": 1439 + }, + { + "epoch": 0.2070452911574407, + "grad_norm": 0.10494133085012436, + "learning_rate": 4.980556251431254e-05, + "loss": 0.9148, + "step": 1440 + }, + { + "ce_ib": 4.733548164367676, + "ce_orig": 1.0352236032485962, + "epoch": 0.2070452911574407, + "kl_loss": 0.05532206594944, + "loss_ib": 0.005132877267897129, + "step": 1440 + }, + { + "ce_ib": 5.002044677734375, + "ce_orig": 0.8640213012695312, + "epoch": 0.20718907260963335, + "kl_loss": 0.06451234966516495, + "loss_ib": 0.005726640112698078, + "step": 1441 + }, + { + "ce_ib": 5.413660526275635, + "ce_orig": 0.9514120221138, + "epoch": 0.20733285406182603, + "kl_loss": 0.08851265162229538, + "loss_ib": 0.0071324631571769714, + "step": 1442 + }, + { + "ce_ib": 5.5871710777282715, + "ce_orig": 1.2692968845367432, + "epoch": 0.20747663551401868, + "kl_loss": 0.066330187022686, + "loss_ib": 0.006110094953328371, + "step": 1443 + }, + { + "ce_ib": 4.385312557220459, + "ce_orig": 0.5806764960289001, + "epoch": 0.20762041696621136, + "kl_loss": 0.07349787652492523, + "loss_ib": 0.005867549683898687, + "step": 1444 + }, + { + "epoch": 0.20776419841840402, + "grad_norm": 0.12775132060050964, + "learning_rate": 4.980313980654094e-05, + "loss": 0.9138, + "step": 1445 + }, + { + "ce_ib": 5.83311128616333, + "ce_orig": 1.148367166519165, + "epoch": 0.20776419841840402, + "kl_loss": 0.04666166752576828, + "loss_ib": 0.005249639041721821, + "step": 1445 + }, + { + "ce_ib": 5.259827136993408, + "ce_orig": 0.7694585919380188, + "epoch": 0.2079079798705967, + "kl_loss": 0.0764630138874054, + "loss_ib": 0.0064530642703175545, + "step": 1446 + }, + { + "ce_ib": 5.33991813659668, + "ce_orig": 0.909938633441925, + "epoch": 0.20805176132278935, + "kl_loss": 0.08604171127080917, + "loss_ib": 0.0069720447063446045, + "step": 1447 + }, + { + "ce_ib": 5.412893772125244, + "ce_orig": 0.5353676080703735, + "epoch": 0.20819554277498203, + "kl_loss": 0.18382969498634338, + "loss_ib": 0.011897931806743145, + "step": 1448 + }, + { + "ce_ib": 2.7452378273010254, + "ce_orig": 0.3200097978115082, + "epoch": 0.2083393242271747, + "kl_loss": 0.11197317391633987, + "loss_ib": 0.006971277296543121, + "step": 1449 + }, + { + "epoch": 0.20848310567936737, + "grad_norm": 0.1159176230430603, + "learning_rate": 4.9800702158160605e-05, + "loss": 0.8825, + "step": 1450 + }, + { + "ce_ib": 5.469526767730713, + "ce_orig": 0.9460609555244446, + "epoch": 0.20848310567936737, + "kl_loss": 0.0640353113412857, + "loss_ib": 0.005936529021710157, + "step": 1450 + }, + { + "ce_ib": 4.701572895050049, + "ce_orig": 0.8877841830253601, + "epoch": 0.20862688713156002, + "kl_loss": 0.0617247000336647, + "loss_ib": 0.005437021609395742, + "step": 1451 + }, + { + "ce_ib": 4.481944561004639, + "ce_orig": 0.7527979612350464, + "epoch": 0.2087706685837527, + "kl_loss": 0.06673047691583633, + "loss_ib": 0.005577496252954006, + "step": 1452 + }, + { + "ce_ib": 6.46573543548584, + "ce_orig": 1.1999479532241821, + "epoch": 0.20891445003594536, + "kl_loss": 0.08012167364358902, + "loss_ib": 0.00723895151168108, + "step": 1453 + }, + { + "ce_ib": 4.899156093597412, + "ce_orig": 0.7474861741065979, + "epoch": 0.20905823148813804, + "kl_loss": 0.0811930000782013, + "loss_ib": 0.006509228143841028, + "step": 1454 + }, + { + "epoch": 0.2092020129403307, + "grad_norm": 0.12713485956192017, + "learning_rate": 4.979824957063989e-05, + "loss": 0.8784, + "step": 1455 + }, + { + "ce_ib": 7.523890018463135, + "ce_orig": 1.0977692604064941, + "epoch": 0.2092020129403307, + "kl_loss": 0.055691562592983246, + "loss_ib": 0.006546522956341505, + "step": 1455 + }, + { + "ce_ib": 5.109311103820801, + "ce_orig": 0.6441048383712769, + "epoch": 0.20934579439252338, + "kl_loss": 0.04580700397491455, + "loss_ib": 0.004845005925744772, + "step": 1456 + }, + { + "ce_ib": 6.434846878051758, + "ce_orig": 1.0908502340316772, + "epoch": 0.20948957584471603, + "kl_loss": 0.08040127158164978, + "loss_ib": 0.00723748654127121, + "step": 1457 + }, + { + "ce_ib": 4.771369934082031, + "ce_orig": 0.6678093075752258, + "epoch": 0.2096333572969087, + "kl_loss": 0.07948639988899231, + "loss_ib": 0.006360005121678114, + "step": 1458 + }, + { + "ce_ib": 4.858242988586426, + "ce_orig": 0.5852605700492859, + "epoch": 0.20977713874910137, + "kl_loss": 0.09732958674430847, + "loss_ib": 0.007295601069927216, + "step": 1459 + }, + { + "epoch": 0.20992092020129402, + "grad_norm": 0.10284240543842316, + "learning_rate": 4.9795782045456144e-05, + "loss": 0.8425, + "step": 1460 + }, + { + "ce_ib": 5.734887599945068, + "ce_orig": 0.9484885931015015, + "epoch": 0.20992092020129402, + "kl_loss": 0.053582172840833664, + "loss_ib": 0.00554655259475112, + "step": 1460 + }, + { + "ce_ib": 5.57645845413208, + "ce_orig": 0.8518679141998291, + "epoch": 0.2100647016534867, + "kl_loss": 0.0746665894985199, + "loss_ib": 0.006521558854728937, + "step": 1461 + }, + { + "ce_ib": 5.58461332321167, + "ce_orig": 0.7684972286224365, + "epoch": 0.21020848310567936, + "kl_loss": 0.07737333327531815, + "loss_ib": 0.006660973187536001, + "step": 1462 + }, + { + "ce_ib": 5.090893745422363, + "ce_orig": 0.8380985856056213, + "epoch": 0.21035226455787204, + "kl_loss": 0.06075616925954819, + "loss_ib": 0.005583255551755428, + "step": 1463 + }, + { + "ce_ib": 4.113478183746338, + "ce_orig": 0.6833850145339966, + "epoch": 0.2104960460100647, + "kl_loss": 0.04907282441854477, + "loss_ib": 0.004510380327701569, + "step": 1464 + }, + { + "epoch": 0.21063982746225737, + "grad_norm": 0.13784626126289368, + "learning_rate": 4.979329958409575e-05, + "loss": 0.8449, + "step": 1465 + }, + { + "ce_ib": 5.734272003173828, + "ce_orig": 0.9160619974136353, + "epoch": 0.21063982746225737, + "kl_loss": 0.059984222054481506, + "loss_ib": 0.005866347346454859, + "step": 1465 + }, + { + "ce_ib": 5.554203510284424, + "ce_orig": 0.7350531816482544, + "epoch": 0.21078360891445003, + "kl_loss": 0.07819847762584686, + "loss_ib": 0.006687026005238295, + "step": 1466 + }, + { + "ce_ib": 7.594614505767822, + "ce_orig": 1.3538212776184082, + "epoch": 0.2109273903666427, + "kl_loss": 0.11122459173202515, + "loss_ib": 0.009358537383377552, + "step": 1467 + }, + { + "ce_ib": 5.303101539611816, + "ce_orig": 0.8379545211791992, + "epoch": 0.21107117181883536, + "kl_loss": 0.08008278906345367, + "loss_ib": 0.006655690260231495, + "step": 1468 + }, + { + "ce_ib": 6.422070503234863, + "ce_orig": 0.9334965348243713, + "epoch": 0.21121495327102804, + "kl_loss": 0.06777994334697723, + "loss_ib": 0.006600033026188612, + "step": 1469 + }, + { + "epoch": 0.2113587347232207, + "grad_norm": 0.10584981739521027, + "learning_rate": 4.979080218805404e-05, + "loss": 0.8907, + "step": 1470 + }, + { + "ce_ib": 3.5332536697387695, + "ce_orig": 0.4448845386505127, + "epoch": 0.2113587347232207, + "kl_loss": 0.07799813896417618, + "loss_ib": 0.005666533950716257, + "step": 1470 + }, + { + "ce_ib": 5.854316711425781, + "ce_orig": 0.9122508764266968, + "epoch": 0.21150251617541338, + "kl_loss": 0.06359520554542542, + "loss_ib": 0.006106918212026358, + "step": 1471 + }, + { + "ce_ib": 5.000598907470703, + "ce_orig": 0.8846215605735779, + "epoch": 0.21164629762760603, + "kl_loss": 0.0447930172085762, + "loss_ib": 0.0047399504110217094, + "step": 1472 + }, + { + "ce_ib": 7.332060813903809, + "ce_orig": 1.5303081274032593, + "epoch": 0.21179007907979872, + "kl_loss": 0.06140690669417381, + "loss_ib": 0.006736375391483307, + "step": 1473 + }, + { + "ce_ib": 4.4685235023498535, + "ce_orig": 0.8994055986404419, + "epoch": 0.21193386053199137, + "kl_loss": 0.0497230589389801, + "loss_ib": 0.004720414523035288, + "step": 1474 + }, + { + "epoch": 0.21207764198418405, + "grad_norm": 0.13744258880615234, + "learning_rate": 4.978828985883538e-05, + "loss": 0.8791, + "step": 1475 + }, + { + "ce_ib": 6.150574207305908, + "ce_orig": 1.1865558624267578, + "epoch": 0.21207764198418405, + "kl_loss": 0.07172469794750214, + "loss_ib": 0.006661522667855024, + "step": 1475 + }, + { + "ce_ib": 5.569920539855957, + "ce_orig": 0.7519057989120483, + "epoch": 0.2122214234363767, + "kl_loss": 0.0842377245426178, + "loss_ib": 0.006996846292167902, + "step": 1476 + }, + { + "ce_ib": 4.689029216766357, + "ce_orig": 0.9762951135635376, + "epoch": 0.2123652048885694, + "kl_loss": 0.03252793475985527, + "loss_ib": 0.003970911260694265, + "step": 1477 + }, + { + "ce_ib": 2.7434751987457275, + "ce_orig": 0.5294933915138245, + "epoch": 0.21250898634076204, + "kl_loss": 0.07534334063529968, + "loss_ib": 0.0051389047876000404, + "step": 1478 + }, + { + "ce_ib": 4.046224117279053, + "ce_orig": 0.8334237933158875, + "epoch": 0.21265276779295472, + "kl_loss": 0.052661050111055374, + "loss_ib": 0.004656164441257715, + "step": 1479 + }, + { + "epoch": 0.21279654924514738, + "grad_norm": 0.10680610686540604, + "learning_rate": 4.97857625979531e-05, + "loss": 0.8204, + "step": 1480 + }, + { + "ce_ib": 5.4969072341918945, + "ce_orig": 0.8406203389167786, + "epoch": 0.21279654924514738, + "kl_loss": 0.05738233029842377, + "loss_ib": 0.005617570132017136, + "step": 1480 + }, + { + "ce_ib": 3.677432060241699, + "ce_orig": 0.5892288088798523, + "epoch": 0.21294033069734003, + "kl_loss": 0.05613876134157181, + "loss_ib": 0.004645654000341892, + "step": 1481 + }, + { + "ce_ib": 4.960013389587402, + "ce_orig": 0.5945039987564087, + "epoch": 0.2130841121495327, + "kl_loss": 0.09380084276199341, + "loss_ib": 0.007170048542320728, + "step": 1482 + }, + { + "ce_ib": 5.3266921043396, + "ce_orig": 0.7922897338867188, + "epoch": 0.21322789360172537, + "kl_loss": 0.07225018739700317, + "loss_ib": 0.006275855004787445, + "step": 1483 + }, + { + "ce_ib": 7.143407821655273, + "ce_orig": 1.251823902130127, + "epoch": 0.21337167505391805, + "kl_loss": 0.06974963843822479, + "loss_ib": 0.007059185300022364, + "step": 1484 + }, + { + "epoch": 0.2135154565061107, + "grad_norm": 0.13681012392044067, + "learning_rate": 4.978322040692955e-05, + "loss": 0.877, + "step": 1485 + }, + { + "ce_ib": 5.060217380523682, + "ce_orig": 0.8638373017311096, + "epoch": 0.2135154565061107, + "kl_loss": 0.06189016252756119, + "loss_ib": 0.005624616984277964, + "step": 1485 + }, + { + "ce_ib": 4.532433032989502, + "ce_orig": 0.5628700852394104, + "epoch": 0.21365923795830338, + "kl_loss": 0.07572378218173981, + "loss_ib": 0.006052405573427677, + "step": 1486 + }, + { + "ce_ib": 4.463733196258545, + "ce_orig": 0.6085463762283325, + "epoch": 0.21380301941049604, + "kl_loss": 0.06972453743219376, + "loss_ib": 0.005718093365430832, + "step": 1487 + }, + { + "ce_ib": 5.135135650634766, + "ce_orig": 0.8072947859764099, + "epoch": 0.21394680086268872, + "kl_loss": 0.07542696595191956, + "loss_ib": 0.006338916253298521, + "step": 1488 + }, + { + "ce_ib": 5.989941120147705, + "ce_orig": 0.9643988609313965, + "epoch": 0.21409058231488137, + "kl_loss": 0.0718611627817154, + "loss_ib": 0.0065880282782018185, + "step": 1489 + }, + { + "epoch": 0.21423436376707405, + "grad_norm": 0.1143512949347496, + "learning_rate": 4.978066328729607e-05, + "loss": 0.7277, + "step": 1490 + }, + { + "ce_ib": 6.156676292419434, + "ce_orig": 0.6449971795082092, + "epoch": 0.21423436376707405, + "kl_loss": 0.07167178392410278, + "loss_ib": 0.006661927793174982, + "step": 1490 + }, + { + "ce_ib": 5.7858195304870605, + "ce_orig": 1.0668694972991943, + "epoch": 0.2143781452192667, + "kl_loss": 0.06922002136707306, + "loss_ib": 0.0063539110124111176, + "step": 1491 + }, + { + "ce_ib": 5.9640212059021, + "ce_orig": 0.6456109285354614, + "epoch": 0.2145219266714594, + "kl_loss": 0.06728428602218628, + "loss_ib": 0.006346225272864103, + "step": 1492 + }, + { + "ce_ib": 3.687690019607544, + "ce_orig": 0.7481469511985779, + "epoch": 0.21466570812365204, + "kl_loss": 0.06135900691151619, + "loss_ib": 0.004911795258522034, + "step": 1493 + }, + { + "ce_ib": 6.0639214515686035, + "ce_orig": 1.100325107574463, + "epoch": 0.21480948957584473, + "kl_loss": 0.06355655193328857, + "loss_ib": 0.006209788378328085, + "step": 1494 + }, + { + "epoch": 0.21495327102803738, + "grad_norm": 0.11802612245082855, + "learning_rate": 4.977809124059298e-05, + "loss": 0.8809, + "step": 1495 + }, + { + "ce_ib": 5.384814262390137, + "ce_orig": 1.034774899482727, + "epoch": 0.21495327102803738, + "kl_loss": 0.06506302952766418, + "loss_ib": 0.005945558659732342, + "step": 1495 + }, + { + "ce_ib": 5.679831504821777, + "ce_orig": 0.9156610369682312, + "epoch": 0.21509705248023006, + "kl_loss": 0.040716562420129776, + "loss_ib": 0.0048757437616586685, + "step": 1496 + }, + { + "ce_ib": 6.579013347625732, + "ce_orig": 1.3346660137176514, + "epoch": 0.21524083393242271, + "kl_loss": 0.06767557561397552, + "loss_ib": 0.006673285271972418, + "step": 1497 + }, + { + "ce_ib": 4.874380111694336, + "ce_orig": 0.7892958521842957, + "epoch": 0.2153846153846154, + "kl_loss": 0.06907147169113159, + "loss_ib": 0.005890763830393553, + "step": 1498 + }, + { + "ce_ib": 4.703640460968018, + "ce_orig": 0.9356851577758789, + "epoch": 0.21552839683680805, + "kl_loss": 0.0733482837677002, + "loss_ib": 0.00601923419162631, + "step": 1499 + }, + { + "epoch": 0.21567217828900073, + "grad_norm": 0.11023815721273422, + "learning_rate": 4.977550426836959e-05, + "loss": 0.9166, + "step": 1500 + } + ], + "logging_steps": 5, + "max_steps": 20865, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}